# 01_data_pull.ipynb
## Snowflake Data Extraction with Unified Event Stream

This notebook handles all data extraction from Snowflake and saves checkpoints for analysis.

### Workflow:
1. Connect to Snowflake
2. Sample users deterministically using hash-based bucketing
3. Extract raw event data (auctions, impressions, clicks, purchases)
4. Create unified event stream for sequential modeling
5. Extract catalog data
6. Save all data as timestamped checkpoint files

In [39]:
# --- IMPORTS ---
import os
import textwrap
from datetime import date, timedelta, datetime
from pathlib import Path
import warnings
import json

import pandas as pd
import numpy as np
from dotenv import load_dotenv
import snowflake.connector
from tqdm import tqdm

# Suppress the specific pandas UserWarning for non-SQLAlchemy connections
warnings.filterwarnings(
    'ignore',
    category=UserWarning,
    message='pandas only supports SQLAlchemy connectable.*'
)

In [40]:
# --- CONFIGURATION & HYPERPARAMETERS ---

# Load environment variables from the .env file for secure credential management
load_dotenv()

# ANALYSIS_END_DATE: The last date to be included in our analysis window
ANALYSIS_END_DATE = date(2025, 9, 7)

# DAYS_WINDOW: Number of days of data to pull, counting back from ANALYSIS_END_DATE
# Using 2 days as a balance between data volume and speed
DAYS_WINDOW = 2  # 2 days - middle ground

# SAMPLING_FRACTION: Percentage of users to include in the analysis
# 5% to get ~5000 users - enough for meaningful model training
SAMPLING_FRACTION = 0.0002  # 5% of users (~5000 users)

# JOURNEY_WINDOW_HOURS: Duration that defines a single user journey/session
JOURNEY_WINDOW_HOURS = 168  # 7 days

# SESSION_GAP_HOURS: Hours of inactivity that define a new session within a journey
SESSION_GAP_HOURS = 24  # 24 hours for sessionization

# HISTORICAL_DAYS: Days of historical data to pull for feature engineering
# Reduced to speed up extraction
HISTORICAL_DAYS = 3  # 3 days of history

# Calculate date ranges
ANALYSIS_START_DATE = ANALYSIS_END_DATE - timedelta(days=DAYS_WINDOW)
HISTORICAL_START_DATE = ANALYSIS_START_DATE - timedelta(days=HISTORICAL_DAYS)

print("Configuration (SCALED UP):")
print(f"  Analysis period: {ANALYSIS_START_DATE} to {ANALYSIS_END_DATE}")
print(f"  Historical period: {HISTORICAL_START_DATE} to {ANALYSIS_START_DATE}")
print(f"  Sampling fraction: {SAMPLING_FRACTION:.1%}")
print(f"  Expected users: ~5000")
print(f"  Journey window: {JOURNEY_WINDOW_HOURS} hours")
print(f"  Session gap: {SESSION_GAP_HOURS} hours")
print(f"  SCALED UP: {DAYS_WINDOW} days, {SAMPLING_FRACTION:.1%} of users")

Configuration (SCALED UP):
  Analysis period: 2025-09-05 to 2025-09-07
  Historical period: 2025-09-02 to 2025-09-05
  Sampling fraction: 0.0%
  Expected users: ~5000
  Journey window: 168 hours
  Session gap: 24 hours
  SCALED UP: 2 days, 0.0% of users


In [41]:
# --- SNOWFLAKE CONNECTION ---
try:
    conn = snowflake.connector.connect(
        user=os.getenv('SNOWFLAKE_USER'),
        password=os.getenv('SNOWFLAKE_PASSWORD'),
        account=os.getenv('SNOWFLAKE_ACCOUNT'),
        warehouse=os.getenv('SNOWFLAKE_WAREHOUSE', 'COMPUTE_WH'),
        database='INCREMENTALITY',
        schema='INCREMENTALITY_RESEARCH'
    )
    print("[SUCCESS] Snowflake connection established.")
except Exception as e:
    print(f"[FAILURE] Could not connect to Snowflake: {e}")
    conn = None

[SUCCESS] Snowflake connection established.


## Core Data Fetching Functions

In [42]:
def build_sampling_cte(start_date: str, end_date: str, sampling_fraction: float) -> str:
    """
    Build CTE for deterministic user sampling using hash-based bucketing.
    This ensures reproducible sampling across runs.
    """
    total_buckets = 10000
    selection_threshold = int(total_buckets * sampling_fraction)
    
    return textwrap.dedent(f"""
        WITH SAMPLED_USER_IDS AS (
            WITH REPEAT_PURCHASERS AS (
                SELECT USER_ID
                FROM PURCHASES
                WHERE PURCHASED_AT BETWEEN '{start_date}'
                  AND '{end_date}'
                GROUP BY USER_ID
                HAVING COUNT(DISTINCT PURCHASE_ID) >= 2
            ),
            BUCKETED_USERS AS (
                SELECT
                    USER_ID,
                    MOD(ABS(HASH(USER_ID)), {total_buckets}) AS bucket
                FROM REPEAT_PURCHASERS
            )
            SELECT USER_ID
            FROM BUCKETED_USERS
            WHERE bucket < {selection_threshold}
        )
    """)

In [43]:
def extract_auctions_users(conn, start_date: str, end_date: str, sampling_fraction: float) -> pd.DataFrame:
    """Extract AUCTIONS_USERS table with proper ID standardization."""
    print("\nExtracting AUCTIONS_USERS...")
    
    sampling_cte = build_sampling_cte(HISTORICAL_START_DATE, end_date, sampling_fraction)
    
    query = sampling_cte + textwrap.dedent(f"""
        SELECT
            LOWER(TO_VARCHAR(au.AUCTION_ID, 'HEX')) AS AUCTION_ID,
            au.OPAQUE_USER_ID AS USER_ID,
            au.CREATED_AT
        FROM AUCTIONS_USERS au
        JOIN SAMPLED_USER_IDS s ON au.OPAQUE_USER_ID = s.USER_ID
        WHERE au.CREATED_AT BETWEEN '{start_date}' AND '{end_date}'
    """)
    
    with tqdm(desc="AUCTIONS_USERS") as pbar:
        df = pd.read_sql(query, conn)
        pbar.update(len(df))
    
    print(f"  Extracted {len(df):,} auction records")
    return df

In [44]:
def extract_auctions_results(conn, start_date: str, end_date: str, sampling_fraction: float) -> pd.DataFrame:
    """Extract AUCTIONS_RESULTS table with proper ID standardization."""
    print("\nExtracting AUCTIONS_RESULTS...")
    
    sampling_cte = build_sampling_cte(HISTORICAL_START_DATE, end_date, sampling_fraction)
    
    query = sampling_cte + textwrap.dedent(f"""
        SELECT
            LOWER(TO_VARCHAR(ar.AUCTION_ID, 'HEX')) AS AUCTION_ID,
            LOWER(TO_VARCHAR(ar.VENDOR_ID, 'HEX')) AS VENDOR_ID,
            LOWER(TO_VARCHAR(ar.CAMPAIGN_ID, 'HEX')) AS CAMPAIGN_ID,
            LOWER(TRIM(ar.PRODUCT_ID)) AS PRODUCT_ID,
            ar.RANKING,
            ar.IS_WINNER,
            ar.CREATED_AT
        FROM AUCTIONS_RESULTS ar
        JOIN AUCTIONS_USERS au ON ar.AUCTION_ID = au.AUCTION_ID
        JOIN SAMPLED_USER_IDS s ON au.OPAQUE_USER_ID = s.USER_ID
        WHERE ar.CREATED_AT BETWEEN '{start_date}' AND '{end_date}'
    """)
    
    with tqdm(desc="AUCTIONS_RESULTS") as pbar:
        df = pd.read_sql(query, conn)
        pbar.update(len(df))
    
    print(f"  Extracted {len(df):,} bid records")
    return df

In [45]:
def extract_impressions(conn, start_date: str, end_date: str, sampling_fraction: float) -> pd.DataFrame:
    """Extract IMPRESSIONS table with proper ID standardization."""
    print("\nExtracting IMPRESSIONS...")
    
    sampling_cte = build_sampling_cte(HISTORICAL_START_DATE, end_date, sampling_fraction)
    
    query = sampling_cte + textwrap.dedent(f"""
        SELECT
            i.INTERACTION_ID,
            LOWER(REPLACE(i.AUCTION_ID, '-', '')) AS AUCTION_ID,
            LOWER(TRIM(i.PRODUCT_ID)) AS PRODUCT_ID,
            i.USER_ID,
            LOWER(REPLACE(i.CAMPAIGN_ID, '-', '')) AS CAMPAIGN_ID,
            LOWER(REPLACE(i.VENDOR_ID, '-', '')) AS VENDOR_ID,
            i.OCCURRED_AT
        FROM IMPRESSIONS i
        JOIN SAMPLED_USER_IDS s ON i.USER_ID = s.USER_ID
        WHERE i.OCCURRED_AT BETWEEN '{start_date}' AND '{end_date}'
    """)
    
    with tqdm(desc="IMPRESSIONS") as pbar:
        df = pd.read_sql(query, conn)
        pbar.update(len(df))
    
    print(f"  Extracted {len(df):,} impression records")
    return df

In [46]:
def extract_clicks(conn, start_date: str, end_date: str, sampling_fraction: float) -> pd.DataFrame:
    """Extract CLICKS table with proper ID standardization."""
    print("\nExtracting CLICKS...")
    
    sampling_cte = build_sampling_cte(HISTORICAL_START_DATE, end_date, sampling_fraction)
    
    query = sampling_cte + textwrap.dedent(f"""
        SELECT
            c.INTERACTION_ID,
            LOWER(REPLACE(c.AUCTION_ID, '-', '')) AS AUCTION_ID,
            LOWER(TRIM(c.PRODUCT_ID)) AS PRODUCT_ID,
            c.USER_ID,
            LOWER(REPLACE(c.CAMPAIGN_ID, '-', '')) AS CAMPAIGN_ID,
            LOWER(REPLACE(c.VENDOR_ID, '-', '')) AS VENDOR_ID,
            c.OCCURRED_AT
        FROM CLICKS c
        JOIN SAMPLED_USER_IDS s ON c.USER_ID = s.USER_ID
        WHERE c.OCCURRED_AT BETWEEN '{start_date}' AND '{end_date}'
    """)
    
    with tqdm(desc="CLICKS") as pbar:
        df = pd.read_sql(query, conn)
        pbar.update(len(df))
    
    print(f"  Extracted {len(df):,} click records")
    return df

In [47]:
def extract_purchases(conn, start_date: str, end_date: str, sampling_fraction: float) -> pd.DataFrame:
    """Extract PURCHASES table with proper ID standardization."""
    print("\nExtracting PURCHASES...")
    
    sampling_cte = build_sampling_cte(HISTORICAL_START_DATE, end_date, sampling_fraction)
    
    query = sampling_cte + textwrap.dedent(f"""
        SELECT
            p.PURCHASE_ID,
            p.PURCHASED_AT,
            LOWER(TRIM(p.PRODUCT_ID)) AS PRODUCT_ID,
            p.QUANTITY,
            p.UNIT_PRICE,
            p.USER_ID,
            p.PURCHASE_LINE
        FROM PURCHASES p
        JOIN SAMPLED_USER_IDS s ON p.USER_ID = s.USER_ID
        WHERE p.PURCHASED_AT BETWEEN '{start_date}' AND '{end_date}'
    """)
    
    with tqdm(desc="PURCHASES") as pbar:
        df = pd.read_sql(query, conn)
        pbar.update(len(df))
    
    print(f"  Extracted {len(df):,} purchase records")
    return df

In [48]:
def extract_catalog_with_products(conn, start_date: str, end_date: str, sampling_fraction: float) -> pd.DataFrame:
    """
    Extract catalog data for products that sampled users interacted with.
    Uses a complete CTE chain: SAMPLED_USER_IDS -> ALL_PRODUCT_IDS -> CATALOG
    """
    print("\nExtracting CATALOG with full CTE chain...")
    
    # Build the base sampling CTE
    sampling_cte = build_sampling_cte(start_date, end_date, sampling_fraction)
    
    # Build the complete query with proper CTE chaining
    # Note the comma after the first CTE and no WITH for subsequent CTEs
    query = sampling_cte + f""",
        ALL_PRODUCT_IDS AS (
            -- Get products from impressions
            SELECT DISTINCT LOWER(TRIM(i.PRODUCT_ID)) AS PRODUCT_ID
            FROM IMPRESSIONS i
            JOIN SAMPLED_USER_IDS s ON i.USER_ID = s.USER_ID
            WHERE i.OCCURRED_AT BETWEEN '{start_date}' AND '{end_date}'
              AND i.PRODUCT_ID IS NOT NULL
            
            UNION
            
            -- Get products from clicks
            SELECT DISTINCT LOWER(TRIM(c.PRODUCT_ID)) AS PRODUCT_ID
            FROM CLICKS c
            JOIN SAMPLED_USER_IDS s ON c.USER_ID = s.USER_ID
            WHERE c.OCCURRED_AT BETWEEN '{start_date}' AND '{end_date}'
              AND c.PRODUCT_ID IS NOT NULL
            
            UNION
            
            -- Get products from purchases
            SELECT DISTINCT LOWER(TRIM(p.PRODUCT_ID)) AS PRODUCT_ID
            FROM PURCHASES p
            JOIN SAMPLED_USER_IDS s ON p.USER_ID = s.USER_ID
            WHERE p.PURCHASED_AT BETWEEN '{start_date}' AND '{end_date}'
              AND p.PRODUCT_ID IS NOT NULL
        )
        -- Now fetch catalog for those products
        SELECT
            LOWER(TRIM(c.PRODUCT_ID)) as PRODUCT_ID,
            c.NAME,
            c.PRICE,
            c.ACTIVE,
            c.IS_DELETED,
            c.DESCRIPTION,
            SPLIT_PART(ARRAY_TO_STRING(FILTER(c.CATEGORIES, x -> x LIKE 'brand%%'), ''), '#', 2) AS BRAND,
            SPLIT_PART(ARRAY_TO_STRING(FILTER(c.CATEGORIES, x -> x LIKE 'department%%'), ''), '#', 2) AS DEPARTMENT_ID,
            SPLIT_PART(ARRAY_TO_STRING(FILTER(c.CATEGORIES, x -> x LIKE 'category%%'), ''), '#', 2) AS CATEGORY_ID,
            SPLIT_PART(ARRAY_TO_STRING(FILTER(c.CATEGORIES, x -> x LIKE 'color%%'), ''), '#', 2) AS PRIMARY_COLOR
        FROM CATALOG c
        JOIN ALL_PRODUCT_IDS ap ON LOWER(TRIM(c.PRODUCT_ID)) = ap.PRODUCT_ID
    """
    
    try:
        with tqdm(desc="CATALOG (Full CTE)") as pbar:
            df = pd.read_sql(query, conn)
            pbar.update(len(df))
        
        print(f"  Extracted {len(df):,} catalog records")
        print(f"  Products with descriptions: {df['DESCRIPTION'].notna().sum():,}")
        print(f"  Products with price data: {df['PRICE'].notna().sum():,}")
        
        # Create price buckets for categorical encoding
        if 'PRICE' in df.columns and df['PRICE'].notna().any():
            # Use quantiles for price bucketing, handling missing values
            df['PRICE_BUCKET'] = pd.qcut(df['PRICE'].fillna(df['PRICE'].median()), 
                                         q=10, labels=False, duplicates='drop')
        else:
            df['PRICE_BUCKET'] = 0
            
        return df
        
    except Exception as e:
        print(f"  Error extracting catalog: {e}")
        print("  Returning empty catalog dataframe")
        return pd.DataFrame(columns=[
            'PRODUCT_ID', 'NAME', 'PRICE', 'ACTIVE', 'IS_DELETED', 
            'DESCRIPTION', 'BRAND', 'DEPARTMENT_ID', 'CATEGORY_ID', 
            'PRIMARY_COLOR', 'PRICE_BUCKET'
        ])

In [49]:
def create_unified_event_stream(
    auctions: pd.DataFrame,
    impressions: pd.DataFrame,
    clicks: pd.DataFrame,
    purchases: pd.DataFrame
) -> pd.DataFrame:
    """
    Combines individual event DataFrames into a single, chronologically sorted stream.
    This is critical for sequential modeling with the Causal Transformer.
    """
    print("\n--- Creating Unified Event Stream from pulled data ---")
    
    # 1. Standardize AUCTIONS_USERS to our event format
    df_auctions = auctions[['USER_ID', 'CREATED_AT']].copy()
    df_auctions.rename(columns={'CREATED_AT': 'event_timestamp'}, inplace=True)
    df_auctions['event_type'] = 'auction'
    df_auctions['product_id'] = None  # No specific product for auction event
    print(f"  Processed {len(df_auctions):,} auction events.")

    # 2. Standardize IMPRESSIONS
    df_impressions = impressions[['USER_ID', 'OCCURRED_AT', 'PRODUCT_ID']].copy()
    df_impressions.rename(columns={'OCCURRED_AT': 'event_timestamp', 'PRODUCT_ID': 'product_id'}, inplace=True)
    df_impressions['event_type'] = 'impression'
    print(f"  Processed {len(df_impressions):,} impression events.")

    # 3. Standardize CLICKS
    df_clicks = clicks[['USER_ID', 'OCCURRED_AT', 'PRODUCT_ID']].copy()
    df_clicks.rename(columns={'OCCURRED_AT': 'event_timestamp', 'PRODUCT_ID': 'product_id'}, inplace=True)
    df_clicks['event_type'] = 'click'
    print(f"  Processed {len(df_clicks):,} click events.")
    
    # 4. Standardize PURCHASES
    df_purchases = purchases[['USER_ID', 'PURCHASED_AT', 'PRODUCT_ID']].copy()
    df_purchases.rename(columns={'PURCHASED_AT': 'event_timestamp', 'PRODUCT_ID': 'product_id'}, inplace=True)
    df_purchases['event_type'] = 'purchase'
    print(f"  Processed {len(df_purchases):,} purchase events.")

    # 5. Concatenate all standardized DataFrames
    print("\n  Concatenating all event types...")
    unified_df = pd.concat([df_auctions, df_impressions, df_clicks, df_purchases], ignore_index=True)
    
    # 6. Sort the entire stream by user and time (CRITICAL for sequential modeling)
    print("  Sorting the unified stream by user and timestamp...")
    unified_df.sort_values(by=['USER_ID', 'event_timestamp'], inplace=True)
    unified_df.reset_index(drop=True, inplace=True)
    
    # 7. Final type casting for memory efficiency
    unified_df['event_type'] = unified_df['event_type'].astype('category')
    
    print(f"\n[SUCCESS] Created unified event stream with {len(unified_df):,} total events.")
    return unified_df[['USER_ID', 'event_timestamp', 'event_type', 'product_id']]

## Main Data Extraction Pipeline

In [50]:
# --- MAIN DATA EXTRACTION PIPELINE ---

if conn:
    print("="*80)
    print("STARTING DATA EXTRACTION PIPELINE FOR CAUSAL TRANSFORMER")
    print("="*80)
    
    # NO TIMESTAMP - use fixed filenames that overwrite
    # timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    
    # Convert dates to strings for SQL
    start_date_str = ANALYSIS_START_DATE.strftime('%Y-%m-%d')
    end_date_str = ANALYSIS_END_DATE.strftime('%Y-%m-%d')
    hist_start_str = HISTORICAL_START_DATE.strftime('%Y-%m-%d')
    
    # --- Extract ONLY what we need for unified events ---
    print("\n--- Extracting main analysis period data ---")
    auctions_users = extract_auctions_users(conn, start_date_str, end_date_str, SAMPLING_FRACTION)
    # auctions_results = extract_auctions_results(conn, start_date_str, end_date_str, SAMPLING_FRACTION)  # SKIP - too large
    impressions = extract_impressions(conn, start_date_str, end_date_str, SAMPLING_FRACTION)
    clicks = extract_clicks(conn, start_date_str, end_date_str, SAMPLING_FRACTION)
    purchases = extract_purchases(conn, start_date_str, end_date_str, SAMPLING_FRACTION)
    
    # --- Create unified event stream for sequential modeling ---
    unified_events = create_unified_event_stream(
        auctions=auctions_users,
        impressions=impressions,
        clicks=clicks,
        purchases=purchases
    )
    
    # --- Extract catalog using full CTE chain ---
    catalog = extract_catalog_with_products(conn, start_date_str, end_date_str, SAMPLING_FRACTION)
    
    # --- Enrich unified events with product features ---
    print("\n--- Enriching events with product features ---")
    if len(catalog) > 0:
        # Merge catalog features with unified events
        unified_events = unified_events.merge(
            catalog[['PRODUCT_ID', 'PRICE', 'PRICE_BUCKET', 'CATEGORY_ID', 'NAME', 'DESCRIPTION']], 
            left_on='product_id', 
            right_on='PRODUCT_ID', 
            how='left'
        )
        # Drop duplicate PRODUCT_ID column
        unified_events = unified_events.drop('PRODUCT_ID', axis=1)
        
        # Fill missing values
        unified_events['PRICE'] = unified_events['PRICE'].fillna(0)
        unified_events['PRICE_BUCKET'] = unified_events['PRICE_BUCKET'].fillna(0).astype(int)
        unified_events['CATEGORY_ID'] = unified_events['CATEGORY_ID'].fillna('unknown')
        unified_events['NAME'] = unified_events['NAME'].fillna('')
        unified_events['DESCRIPTION'] = unified_events['DESCRIPTION'].fillna('')
        
        print(f"  Events with price data: {(unified_events['PRICE'] > 0).sum():,}")
        print(f"  Events with category data: {(unified_events['CATEGORY_ID'] != 'unknown').sum():,}")
        print(f"  Events with descriptions: {(unified_events['DESCRIPTION'] != '').sum():,}")
    else:
        print("  Warning: No catalog data extracted, proceeding without product features")
        # Add empty columns to maintain consistency
        unified_events['PRICE'] = 0
        unified_events['PRICE_BUCKET'] = 0
        unified_events['CATEGORY_ID'] = 'unknown'
        unified_events['NAME'] = ''
        unified_events['DESCRIPTION'] = ''
    
    # Close connection
    conn.close()
    print("\n[SUCCESS] Snowflake connection closed")
    
    # --- Save ONLY essential data (no timestamps, overwrite) ---
    print("\n--- Saving data ---")
    output_dir = Path("./data")
    output_dir.mkdir(parents=True, exist_ok=True)
    
    # Save enriched unified_events
    unified_path = output_dir / "unified_events.parquet"
    unified_events.to_parquet(unified_path, index=False)
    print(f"  Saved enriched unified_events: {len(unified_events):,} rows to {unified_path.name}")
    
    # Save catalog for later reference
    if len(catalog) > 0:
        catalog_path = output_dir / "catalog.parquet"
        catalog.to_parquet(catalog_path, index=False)
        print(f"  Saved catalog: {len(catalog):,} rows to {catalog_path.name}")
    
    # Save metadata for reference (also fixed filename)
    # Convert all numpy types to native Python types for JSON serialization
    metadata = {
        'analysis_start_date': start_date_str,
        'analysis_end_date': end_date_str,
        'historical_start_date': hist_start_str,
        'sampling_fraction': float(SAMPLING_FRACTION),
        'journey_window_hours': int(JOURNEY_WINDOW_HOURS),
        'session_gap_hours': int(SESSION_GAP_HOURS),
        'total_events': int(len(unified_events)),
        'unique_users': int(unified_events['USER_ID'].nunique()),
        'unique_products': int(unified_events['product_id'].nunique()),
        'unique_categories': int(unified_events['CATEGORY_ID'].nunique()) if 'CATEGORY_ID' in unified_events.columns else 0,
        'events_with_features': int((unified_events['PRICE'] > 0).sum()) if 'PRICE' in unified_events.columns else 0,
        'catalog_products': int(len(catalog)) if len(catalog) > 0 else 0
    }
    
    metadata_path = output_dir / "metadata.json"
    with open(metadata_path, 'w') as f:
        json.dump(metadata, f, indent=2)
    print(f"  Saved metadata to {metadata_path.name}")
    
    print("\n" + "="*80)
    print("DATA EXTRACTION COMPLETE - ENRICHED WITH FEATURES!")
    print(f"Extracted {len(unified_events):,} events with product features")
    print("="*80)
    
else:
    print("[ERROR] No Snowflake connection available. Please check your credentials.")

STARTING DATA EXTRACTION PIPELINE FOR CAUSAL TRANSFORMER

--- Extracting main analysis period data ---

Extracting AUCTIONS_USERS...


AUCTIONS_USERS: 1115it [00:07, 159.20it/s]


  Extracted 1,115 auction records

Extracting IMPRESSIONS...


IMPRESSIONS: 4147it [00:07, 575.10it/s]


  Extracted 4,147 impression records

Extracting CLICKS...


CLICKS: 97it [00:02, 34.35it/s]


  Extracted 97 click records

Extracting PURCHASES...


PURCHASES: 49it [00:03, 15.51it/s]


  Extracted 49 purchase records

--- Creating Unified Event Stream from pulled data ---
  Processed 1,115 auction events.
  Processed 4,147 impression events.
  Processed 97 click events.
  Processed 49 purchase events.

  Concatenating all event types...
  Sorting the unified stream by user and timestamp...

[SUCCESS] Created unified event stream with 5,408 total events.

Extracting CATALOG with full CTE chain...


CATALOG (Full CTE): 1310it [00:52, 25.13it/s]


  Extracted 1,310 catalog records
  Products with descriptions: 1,310
  Products with price data: 1,310

--- Enriching events with product features ---
  Events with price data: 1,742
  Events with category data: 1,742
  Events with descriptions: 1,742

[SUCCESS] Snowflake connection closed

--- Saving data ---
  Saved enriched unified_events: 5,408 rows to unified_events.parquet
  Saved catalog: 1,310 rows to catalog.parquet
  Saved metadata to metadata.json

DATA EXTRACTION COMPLETE - ENRICHED WITH FEATURES!
Extracted 5,408 events with product features


In [53]:
# Data extraction summary and quality checks
if 'unified_events' in locals():
    print("\n" + "="*80)
    print("DATA EXTRACTION SUMMARY FOR SEQUENTIAL MODELING")
    print("="*80)
    
    print("\nUnified Event Stream:")
    print(f"  Total events: {len(unified_events):,}")
    print(f"  Unique users: {unified_events['USER_ID'].nunique():,}")
    print(f"  Date range: {unified_events['event_timestamp'].min()} to {unified_events['event_timestamp'].max()}")
    
    print("\nEvent Type Distribution:")
    event_counts = unified_events['event_type'].value_counts()
    for event_type, count in event_counts.items():
        print(f"  {event_type}: {count:,} ({count/len(unified_events)*100:.1f}%)")
    
    print("\nMain Analysis Period:")
    print(f"  Total auctions: {len(auctions_users):,}")
    # print(f"  Total bids: {len(auctions_results):,}")  # Skipped - too large
    print(f"  Total impressions: {len(impressions):,}")
    print(f"  Total clicks: {len(clicks):,}")
    print(f"  Total purchases: {len(purchases):,}")
    print(f"  Catalog products: {len(catalog):,}")
    
    # Basic data quality checks
    print("\nData Quality Checks:")
    
    # Check for null values in critical columns
    null_users = unified_events['USER_ID'].isnull().sum()
    null_timestamps = unified_events['event_timestamp'].isnull().sum()
    print(f"  Null USER_IDs: {null_users} ({null_users/len(unified_events)*100:.4f}%)")
    print(f"  Null timestamps: {null_timestamps} ({null_timestamps/len(unified_events)*100:.4f}%)")
    
    # Verify sort order
    is_sorted = True
    for user_id in unified_events['USER_ID'].unique()[:10]:  # Check first 10 users
        user_events = unified_events[unified_events['USER_ID'] == user_id]['event_timestamp']
        if not user_events.is_monotonic_increasing:
            is_sorted = False
            break
    print(f"  Events correctly sorted by user and time: {is_sorted}")
    
    # Check product coverage in catalog
    products_in_events = set(unified_events[unified_events['product_id'].notna()]['product_id'].unique())
    products_in_catalog = set(catalog['PRODUCT_ID'].unique())
    coverage = len(products_in_catalog & products_in_events) / len(products_in_events) if products_in_events else 0
    print(f"  Product catalog coverage: {coverage:.1%}")
    
    # Check enrichment success
    print("\nFeature Enrichment:")
    print(f"  Events with product features: {(unified_events['PRICE'] > 0).sum():,} / {len(unified_events):,}")
    print(f"  Events with categories: {(unified_events['CATEGORY_ID'] != 'unknown').sum():,} / {len(unified_events):,}")
    print(f"  Events with descriptions: {(unified_events['DESCRIPTION'] != '').sum():,} / {len(unified_events):,}")
    
    print("\n" + "="*80)
    print("Ready for sessionization and sequential modeling!")
    print("="*80)


DATA EXTRACTION SUMMARY FOR SEQUENTIAL MODELING

Unified Event Stream:
  Total events: 5,408
  Unique users: 24
  Date range: 2025-09-05 00:05:17.255000 to 2025-09-06 23:59:37

Event Type Distribution:
  impression: 4,147 (76.7%)
  auction: 1,115 (20.6%)
  click: 97 (1.8%)
  purchase: 49 (0.9%)

Main Analysis Period:
  Total auctions: 1,115
  Total impressions: 4,147
  Total clicks: 97
  Total purchases: 49
  Catalog products: 1,310

Data Quality Checks:
  Null USER_IDs: 0 (0.0000%)
  Null timestamps: 0 (0.0000%)
  Events correctly sorted by user and time: True
  Product catalog coverage: 44.3%

Feature Enrichment:
  Events with product features: 1,742 / 5,408
  Events with categories: 1,742 / 5,408
  Events with descriptions: 1,742 / 5,408

Ready for sessionization and sequential modeling!
