# 01_data_pull.ipynb
## Snowflake Data Extraction with Checkpointing

This notebook handles all data extraction from Snowflake and saves checkpoints for analysis.

### Workflow:
1. Connect to Snowflake
2. Sample users deterministically using hash-based bucketing
3. Extract raw event data (auctions, impressions, clicks, purchases)
4. Extract catalog data with enhanced parsing
5. Save all data as timestamped checkpoint files

In [1]:
# --- IMPORTS ---
import os
import textwrap
from datetime import date, timedelta, datetime
from pathlib import Path
import warnings

import pandas as pd
from dotenv import load_dotenv
import snowflake.connector
from tqdm import tqdm

# Suppress the specific pandas UserWarning for non-SQLAlchemy connections
warnings.filterwarnings(
    'ignore',
    category=UserWarning,
    message='pandas only supports SQLAlchemy connectable.*'
)

In [2]:
# --- CONFIGURATION & HYPERPARAMETERS ---

# Load environment variables from the .env file for secure credential management
load_dotenv()

# ANALYSIS_END_DATE: The last date to be included in our analysis window
ANALYSIS_END_DATE = date(2025, 9, 7)

# DAYS_WINDOW: Number of days of data to pull, counting back from ANALYSIS_END_DATE
DAYS_WINDOW = 14  # Two weeks of data

# SAMPLING_FRACTION: Percentage of users to include in the analysis
# 0.001 = 0.1% of users for development, can increase for production
SAMPLING_FRACTION = 0.001

# JOURNEY_WINDOW_HOURS: Duration that defines a single user journey/session
JOURNEY_WINDOW_HOURS = 168  # 7 days

# SESSION_GAP_HOURS: Hours of inactivity that define a new session within a journey
SESSION_GAP_HOURS = 2

# HISTORICAL_DAYS: Days of historical data to pull for feature engineering
HISTORICAL_DAYS = 90  # 3 months of history

# Calculate date ranges
ANALYSIS_START_DATE = ANALYSIS_END_DATE - timedelta(days=DAYS_WINDOW)
HISTORICAL_START_DATE = ANALYSIS_START_DATE - timedelta(days=HISTORICAL_DAYS)

print("Configuration:")
print(f"  Analysis period: {ANALYSIS_START_DATE} to {ANALYSIS_END_DATE}")
print(f"  Historical period: {HISTORICAL_START_DATE} to {ANALYSIS_START_DATE}")
print(f"  Sampling fraction: {SAMPLING_FRACTION:.2%}")
print(f"  Journey window: {JOURNEY_WINDOW_HOURS} hours")

Configuration:
  Analysis period: 2025-08-24 to 2025-09-07
  Historical period: 2025-05-26 to 2025-08-24
  Sampling fraction: 0.10%
  Journey window: 168 hours


In [3]:
# --- SNOWFLAKE CONNECTION ---
try:
    conn = snowflake.connector.connect(
        user=os.getenv('SNOWFLAKE_USER'),
        password=os.getenv('SNOWFLAKE_PASSWORD'),
        account=os.getenv('SNOWFLAKE_ACCOUNT'),
        warehouse=os.getenv('SNOWFLAKE_WAREHOUSE', 'COMPUTE_WH'),
        database='INCREMENTALITY',
        schema='INCREMENTALITY_RESEARCH'
    )
    print("[SUCCESS] Snowflake connection established.")
except Exception as e:
    print(f"[FAILURE] Could not connect to Snowflake: {e}")
    conn = None

[SUCCESS] Snowflake connection established.


## Core Data Fetching Functions

In [4]:
def build_sampling_cte(start_date: str, end_date: str, sampling_fraction: float) -> str:
    """
    Build CTE for deterministic user sampling using hash-based bucketing.
    This ensures reproducible sampling across runs.
    """
    total_buckets = 10000
    selection_threshold = int(total_buckets * sampling_fraction)
    
    return textwrap.dedent(f"""
        WITH SAMPLED_USER_IDS AS (
            WITH REPEAT_PURCHASERS AS (
                SELECT USER_ID
                FROM PURCHASES
                WHERE PURCHASED_AT BETWEEN '{start_date}'
                  AND '{end_date}'
                GROUP BY USER_ID
                HAVING COUNT(DISTINCT PURCHASE_ID) >= 2
            ),
            BUCKETED_USERS AS (
                SELECT
                    USER_ID,
                    MOD(ABS(HASH(USER_ID)), {total_buckets}) AS bucket
                FROM REPEAT_PURCHASERS
            )
            SELECT USER_ID
            FROM BUCKETED_USERS
            WHERE bucket < {selection_threshold}
        )
    """)

In [5]:
def extract_auctions_users(conn, start_date: str, end_date: str, sampling_fraction: float) -> pd.DataFrame:
    """Extract AUCTIONS_USERS table with proper ID standardization."""
    print("\nExtracting AUCTIONS_USERS...")
    
    sampling_cte = build_sampling_cte(HISTORICAL_START_DATE, end_date, sampling_fraction)
    
    query = sampling_cte + textwrap.dedent(f"""
        SELECT
            LOWER(TO_VARCHAR(au.AUCTION_ID, 'HEX')) AS AUCTION_ID,
            au.OPAQUE_USER_ID AS USER_ID,
            au.CREATED_AT
        FROM AUCTIONS_USERS au
        JOIN SAMPLED_USER_IDS s ON au.OPAQUE_USER_ID = s.USER_ID
        WHERE au.CREATED_AT BETWEEN '{start_date}' AND '{end_date}'
    """)
    
    with tqdm(desc="AUCTIONS_USERS") as pbar:
        df = pd.read_sql(query, conn)
        pbar.update(len(df))
    
    print(f"  Extracted {len(df):,} auction records")
    return df

In [6]:
def extract_auctions_results(conn, start_date: str, end_date: str, sampling_fraction: float) -> pd.DataFrame:
    """Extract AUCTIONS_RESULTS table with proper ID standardization."""
    print("\nExtracting AUCTIONS_RESULTS...")
    
    sampling_cte = build_sampling_cte(HISTORICAL_START_DATE, end_date, sampling_fraction)
    
    query = sampling_cte + textwrap.dedent(f"""
        SELECT
            LOWER(TO_VARCHAR(ar.AUCTION_ID, 'HEX')) AS AUCTION_ID,
            LOWER(TO_VARCHAR(ar.VENDOR_ID, 'HEX')) AS VENDOR_ID,
            LOWER(TO_VARCHAR(ar.CAMPAIGN_ID, 'HEX')) AS CAMPAIGN_ID,
            LOWER(TRIM(ar.PRODUCT_ID)) AS PRODUCT_ID,
            ar.RANKING,
            ar.IS_WINNER,
            ar.CREATED_AT
        FROM AUCTIONS_RESULTS ar
        JOIN AUCTIONS_USERS au ON ar.AUCTION_ID = au.AUCTION_ID
        JOIN SAMPLED_USER_IDS s ON au.OPAQUE_USER_ID = s.USER_ID
        WHERE ar.CREATED_AT BETWEEN '{start_date}' AND '{end_date}'
    """)
    
    with tqdm(desc="AUCTIONS_RESULTS") as pbar:
        df = pd.read_sql(query, conn)
        pbar.update(len(df))
    
    print(f"  Extracted {len(df):,} bid records")
    return df

In [7]:
def extract_impressions(conn, start_date: str, end_date: str, sampling_fraction: float) -> pd.DataFrame:
    """Extract IMPRESSIONS table with proper ID standardization."""
    print("\nExtracting IMPRESSIONS...")
    
    sampling_cte = build_sampling_cte(HISTORICAL_START_DATE, end_date, sampling_fraction)
    
    query = sampling_cte + textwrap.dedent(f"""
        SELECT
            i.INTERACTION_ID,
            LOWER(REPLACE(i.AUCTION_ID, '-', '')) AS AUCTION_ID,
            LOWER(TRIM(i.PRODUCT_ID)) AS PRODUCT_ID,
            i.USER_ID,
            LOWER(REPLACE(i.CAMPAIGN_ID, '-', '')) AS CAMPAIGN_ID,
            LOWER(REPLACE(i.VENDOR_ID, '-', '')) AS VENDOR_ID,
            i.OCCURRED_AT
        FROM IMPRESSIONS i
        JOIN SAMPLED_USER_IDS s ON i.USER_ID = s.USER_ID
        WHERE i.OCCURRED_AT BETWEEN '{start_date}' AND '{end_date}'
    """)
    
    with tqdm(desc="IMPRESSIONS") as pbar:
        df = pd.read_sql(query, conn)
        pbar.update(len(df))
    
    print(f"  Extracted {len(df):,} impression records")
    return df

In [8]:
def extract_clicks(conn, start_date: str, end_date: str, sampling_fraction: float) -> pd.DataFrame:
    """Extract CLICKS table with proper ID standardization."""
    print("\nExtracting CLICKS...")
    
    sampling_cte = build_sampling_cte(HISTORICAL_START_DATE, end_date, sampling_fraction)
    
    query = sampling_cte + textwrap.dedent(f"""
        SELECT
            c.INTERACTION_ID,
            LOWER(REPLACE(c.AUCTION_ID, '-', '')) AS AUCTION_ID,
            LOWER(TRIM(c.PRODUCT_ID)) AS PRODUCT_ID,
            c.USER_ID,
            LOWER(REPLACE(c.CAMPAIGN_ID, '-', '')) AS CAMPAIGN_ID,
            LOWER(REPLACE(c.VENDOR_ID, '-', '')) AS VENDOR_ID,
            c.OCCURRED_AT
        FROM CLICKS c
        JOIN SAMPLED_USER_IDS s ON c.USER_ID = s.USER_ID
        WHERE c.OCCURRED_AT BETWEEN '{start_date}' AND '{end_date}'
    """)
    
    with tqdm(desc="CLICKS") as pbar:
        df = pd.read_sql(query, conn)
        pbar.update(len(df))
    
    print(f"  Extracted {len(df):,} click records")
    return df

In [9]:
def extract_purchases(conn, start_date: str, end_date: str, sampling_fraction: float) -> pd.DataFrame:
    """Extract PURCHASES table with proper ID standardization."""
    print("\nExtracting PURCHASES...")
    
    sampling_cte = build_sampling_cte(HISTORICAL_START_DATE, end_date, sampling_fraction)
    
    query = sampling_cte + textwrap.dedent(f"""
        SELECT
            p.PURCHASE_ID,
            p.PURCHASED_AT,
            LOWER(TRIM(p.PRODUCT_ID)) AS PRODUCT_ID,
            p.QUANTITY,
            p.UNIT_PRICE,
            p.USER_ID,
            p.PURCHASE_LINE
        FROM PURCHASES p
        JOIN SAMPLED_USER_IDS s ON p.USER_ID = s.USER_ID
        WHERE p.PURCHASED_AT BETWEEN '{start_date}' AND '{end_date}'
    """)
    
    with tqdm(desc="PURCHASES") as pbar:
        df = pd.read_sql(query, conn)
        pbar.update(len(df))
    
    print(f"  Extracted {len(df):,} purchase records")
    return df

In [10]:
def extract_catalog(conn, product_ids: set) -> pd.DataFrame:
    """Extract CATALOG data for relevant products with enhanced category parsing."""
    print("\nExtracting CATALOG...")
    print(f"  Found {len(product_ids):,} unique products to fetch")
    
    # Handle the edge case where there are no product IDs to fetch
    if not product_ids:
        print("  No product IDs found, creating an empty catalog DataFrame.")
        return pd.DataFrame(columns=[
            'PRODUCT_ID', 'NAME', 'PRICE', 'ACTIVE', 'IS_DELETED',
            'BRAND', 'DEPARTMENT_ID', 'CATEGORY_ID', 'PRIMARY_COLOR', 'STYLE_TAGS'
        ])
    
    # Convert the set to a list for the parameters
    params_list = list(product_ids)
    
    # Create the correct number of placeholders for parameterized query
    placeholders = ', '.join(['%s'] * len(params_list))
    
    # Build the query using placeholders, NOT the actual values
    query = f"""
        SELECT
            LOWER(TRIM(PRODUCT_ID)) as PRODUCT_ID,
            NAME,
            PRICE,
            ACTIVE,
            IS_DELETED,
            SPLIT_PART(ARRAY_TO_STRING(FILTER(CATEGORIES, x -> x LIKE 'brand#%'), ''), '#', 2) AS BRAND,
            SPLIT_PART(ARRAY_TO_STRING(FILTER(CATEGORIES, x -> x LIKE 'department#%'), ''), '#', 2) AS DEPARTMENT_ID,
            SPLIT_PART(ARRAY_TO_STRING(FILTER(CATEGORIES, x -> x LIKE 'category#%'), ''), '#', 2) AS CATEGORY_ID,
            SPLIT_PART(ARRAY_TO_STRING(FILTER(CATEGORIES, x -> x LIKE 'color#%'), ''), '#', 2) AS PRIMARY_COLOR,
            REPLACE(
                ARRAY_TO_STRING(FILTER(CATEGORIES, x -> x LIKE 'style_tag#%'), ', '),
                'style_tag#', ''
            ) AS STYLE_TAGS
        FROM CATALOG
        WHERE LOWER(TRIM(PRODUCT_ID)) IN ({placeholders})
    """
    
    with tqdm(desc="CATALOG") as pbar:
        # Pass the query and the list of parameters separately to pandas
        # Pandas will pass them to the database driver, which handles binding them safely
        df = pd.read_sql(query, conn, params=params_list)
        pbar.update(len(df))
    
    print(f"  Extracted {len(df):,} catalog records")
    return df

In [18]:
def extract_catalog_and_get_product_ids(conn, hist_start_date: str, end_date: str, sampling_fraction: float) -> tuple[pd.DataFrame, set]:
    """
    Combines product ID collection and catalog extraction into a single, efficient query.
    
    This function uses a "mono-CTE" approach to:
    1. Identify all sampled users.
    2. Collect all unique product IDs associated with them from all event tables.
    3. Fetch the catalog entries for only those products.
    
    This is far more efficient than collecting IDs in Python and sending them back to the DB.
    """
    print("\nExtracting CATALOG and collecting all PRODUCT_IDs using a mono-CTE...")
    
    sampling_cte = build_sampling_cte(hist_start_date, end_date, sampling_fraction)
    
    # --- FIX IS HERE ---
    # We add a comma after the first CTE (sampling_cte) and remove the redundant 'WITH'
    # from the start of the second CTE definition.
    query = sampling_cte + textwrap.dedent(f""",
        ALL_PRODUCT_IDS AS (
            -- Collect all product IDs from all relevant tables for our sampled users
            SELECT DISTINCT LOWER(TRIM(ar.PRODUCT_ID)) AS PRODUCT_ID
            FROM AUCTIONS_RESULTS ar
            JOIN AUCTIONS_USERS au ON ar.AUCTION_ID = au.AUCTION_ID
            JOIN SAMPLED_USER_IDS s ON au.OPAQUE_USER_ID = s.USER_ID
            WHERE ar.CREATED_AT BETWEEN '{hist_start_date}' AND '{end_date}'
              AND ar.PRODUCT_ID IS NOT NULL
            
            UNION
            
            SELECT DISTINCT LOWER(TRIM(i.PRODUCT_ID)) AS PRODUCT_ID
            FROM IMPRESSIONS i
            JOIN SAMPLED_USER_IDS s ON i.USER_ID = s.USER_ID
            WHERE i.OCCURRED_AT BETWEEN '{hist_start_date}' AND '{end_date}'
              AND i.PRODUCT_ID IS NOT NULL

            UNION

            SELECT DISTINCT LOWER(TRIM(c.PRODUCT_ID)) AS PRODUCT_ID
            FROM CLICKS c
            JOIN SAMPLED_USER_IDS s ON c.USER_ID = s.USER_ID
            WHERE c.OCCURRED_AT BETWEEN '{hist_start_date}' AND '{end_date}'
              AND c.PRODUCT_ID IS NOT NULL

            UNION

            SELECT DISTINCT LOWER(TRIM(p.PRODUCT_ID)) AS PRODUCT_ID
            FROM PURCHASES p
            JOIN SAMPLED_USER_IDS s ON p.USER_ID = s.USER_ID
            WHERE p.PURCHASED_AT BETWEEN '{hist_start_date}' AND '{end_date}'
              AND p.PRODUCT_ID IS NOT NULL
        )
        -- Now, fetch the catalog data for exactly those products
        SELECT
            c.PRODUCT_ID,
            c.NAME,
            c.PRICE,
            c.ACTIVE,
            c.IS_DELETED,
            SPLIT_PART(ARRAY_TO_STRING(FILTER(c.CATEGORIES, x -> x LIKE 'brand#%%'), ''), '#', 2) AS BRAND,
            SPLIT_PART(ARRAY_TO_STRING(FILTER(c.CATEGORIES, x -> x LIKE 'department#%%'), ''), '#', 2) AS DEPARTMENT_ID,
            SPLIT_PART(ARRAY_TO_STRING(FILTER(c.CATEGORIES, x -> x LIKE 'category#%%'), ''), '#', 2) AS CATEGORY_ID,
            SPLIT_PART(ARRAY_TO_STRING(FILTER(c.CATEGORIES, x -> x LIKE 'color#%%'), ''), '#', 2) AS PRIMARY_COLOR,
            REPLACE(
                ARRAY_TO_STRING(FILTER(c.CATEGORIES, x -> x LIKE 'style_tag#%%'), ', '),
                'style_tag#', ''
            ) AS STYLE_TAGS
        FROM CATALOG c
        JOIN ALL_PRODUCT_IDS ap ON c.PRODUCT_ID = ap.PRODUCT_ID
    """)
    
    with tqdm(desc="CATALOG (Mono-CTE)") as pbar:
        df = pd.read_sql(query, conn)
        pbar.update(len(df))

    # Get the set of product IDs from the resulting dataframe
    product_ids = set(df['PRODUCT_ID'].unique())
    print(f"  Extracted {len(df):,} catalog records for {len(product_ids):,} unique products.")
    
    return df, product_ids

## Main Data Extraction Pipeline

In [19]:
# %% --- MAIN DATA EXTRACTION PIPELINE (REVISED) ---

if conn:
    print("="*80)
    print("STARTING REVISED DATA EXTRACTION PIPELINE")
    print("="*80)
    
    # Create timestamp for this extraction run
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    
    # Convert dates to strings for SQL
    start_date_str = ANALYSIS_START_DATE.strftime('%Y-%m-%d')
    end_date_str = ANALYSIS_END_DATE.strftime('%Y-%m-%d')
    hist_start_str = HISTORICAL_START_DATE.strftime('%Y-%m-%d')
    
    # --- Extract main event tables (as before) ---
    print("\n--- Extracting main analysis period data ---")
    #auctions_users = extract_auctions_users(conn, start_date_str, end_date_str, SAMPLING_FRACTION)
    #auctions_results = extract_auctions_results(conn, start_date_str, end_date_str, SAMPLING_FRACTION)
    #impressions = extract_impressions(conn, start_date_str, end_date_str, SAMPLING_FRACTION)
    #clicks = extract_clicks(conn, start_date_str, end_date_str, SAMPLING_FRACTION)
    #purchases = extract_purchases(conn, start_date_str, end_date_str, SAMPLING_FRACTION)
    
    print("\n--- Extracting historical data for feature engineering ---")
    #hist_purchases = extract_purchases(conn, hist_start_str, start_date_str, SAMPLING_FRACTION)
    #hist_impressions = extract_impressions(conn, hist_start_str, start_date_str, SAMPLING_FRACTION)
    #hist_clicks = extract_clicks(conn, hist_start_str, start_date_str, SAMPLING_FRACTION)
    
    # --- NEW: EFFICIENT CATALOG & PRODUCT ID EXTRACTION ---
    # This single call replaces the slow Python-side ID collection and the failing catalog pull.
    catalog, product_ids = extract_catalog_and_get_product_ids(conn, hist_start_str, end_date_str, SAMPLING_FRACTION)
    
    # Close connection
    #conn.close()
    print("\n[SUCCESS] Snowflake connection closed")
    
    # --- Save all data as checkpoint (no changes needed below this line) ---
    print("\n--- Saving data checkpoint ---")
    output_dir = Path("./data/raw")
    output_dir.mkdir(parents=True, exist_ok=True)
    
    datasets = [
        ("auctions_users", auctions_users),
        ("auctions_results", auctions_results),
        ("impressions", impressions),
        ("clicks", clicks),
        ("purchases", purchases),
        ("catalog", catalog),
        ("hist_purchases", hist_purchases),
        ("hist_impressions", hist_impressions),
        ("hist_clicks", hist_clicks)
    ]
    
    for name, df in datasets:
        path = output_dir / f"{name}_{timestamp}.parquet"
        df.to_parquet(path, index=False)
        print(f"  Saved {name}: {len(df):,} rows to {path.name}")
    
    # Create a metadata file
    metadata = {
        'timestamp': timestamp,
        'analysis_start_date': start_date_str,
        'analysis_end_date': end_date_str,
        'historical_start_date': hist_start_str,
        'sampling_fraction': SAMPLING_FRACTION,
        'journey_window_hours': JOURNEY_WINDOW_HOURS,
        'total_products': len(product_ids),
        'row_counts': {name: len(df) for name, df in datasets}
    }
    
    import json
    metadata_path = output_dir / f"metadata_{timestamp}.json"
    with open(metadata_path, 'w') as f:
        json.dump(metadata, f, indent=2)
    print(f"\n  Saved metadata to {metadata_path.name}")
    
    print("\n" + "="*80)
    print("DATA EXTRACTION COMPLETE")
    print("="*80)
    
else:
    print("[ERROR] No Snowflake connection available. Please check your credentials.")

STARTING REVISED DATA EXTRACTION PIPELINE

--- Extracting main analysis period data ---

--- Extracting historical data for feature engineering ---

Extracting CATALOG and collecting all PRODUCT_IDs using a mono-CTE...


CATALOG (Mono-CTE): 4961480it [17:43, 4664.24it/s]


  Extracted 4,961,480 catalog records for 4,961,480 unique products.

[SUCCESS] Snowflake connection closed

--- Saving data checkpoint ---
  Saved auctions_users: 88,690 rows to auctions_users_20250923_043038.parquet
  Saved auctions_results: 3,410,770 rows to auctions_results_20250923_043038.parquet
  Saved impressions: 347,741 rows to impressions_20250923_043038.parquet
  Saved clicks: 11,215 rows to clicks_20250923_043038.parquet
  Saved purchases: 1,859 rows to purchases_20250923_043038.parquet
  Saved catalog: 4,961,480 rows to catalog_20250923_043038.parquet
  Saved hist_purchases: 10,332 rows to hist_purchases_20250923_043038.parquet
  Saved hist_impressions: 1,685,675 rows to hist_impressions_20250923_043038.parquet
  Saved hist_clicks: 59,691 rows to hist_clicks_20250923_043038.parquet

  Saved metadata to metadata_20250923_043038.json

DATA EXTRACTION COMPLETE


In [20]:
# Data extraction summary
if 'auctions_users' in locals():
    print("\n" + "="*80)
    print("DATA EXTRACTION SUMMARY")
    print("="*80)
    
    print("\nMain Analysis Period:")
    print(f"  Unique users: {auctions_users['USER_ID'].nunique():,}")
    print(f"  Total auctions: {len(auctions_users):,}")
    print(f"  Total bids: {len(auctions_results):,}")
    print(f"  Total impressions: {len(impressions):,}")
    print(f"  Total clicks: {len(clicks):,}")
    print(f"  Total purchases: {len(purchases):,}")
    print(f"  Catalog products: {len(catalog):,}")
    
    print("\nHistorical Period (for features):")
    print(f"  Historical purchases: {len(hist_purchases):,}")
    print(f"  Historical impressions: {len(hist_impressions):,}")
    print(f"  Historical clicks: {len(hist_clicks):,}")
    
    # Basic data quality checks
    print("\nData Quality Checks:")
    
    # Check auction ID consistency
    auction_ids_users = set(auctions_users['AUCTION_ID'].unique())
    auction_ids_results = set(auctions_results['AUCTION_ID'].unique())
    overlap = len(auction_ids_users & auction_ids_results)
    print(f"  Auction ID overlap (users ∩ results): {overlap:,} / {len(auction_ids_users):,}")
    
    # Check user ID consistency
    users_in_auctions = set(auctions_users['USER_ID'].unique())
    users_in_purchases = set(purchases['USER_ID'].unique())
    print(f"  Users with purchases: {len(users_in_purchases):,} / {len(users_in_auctions):,}")
    
    # Check product coverage in catalog
    products_in_events = product_ids
    products_in_catalog = set(catalog['PRODUCT_ID'].unique())
    coverage = len(products_in_catalog) / len(products_in_events) if products_in_events else 0
    print(f"  Product catalog coverage: {coverage:.1%}")


DATA EXTRACTION SUMMARY

Main Analysis Period:
  Unique users: 1,185
  Total auctions: 88,690
  Total bids: 3,410,770
  Total impressions: 347,741
  Total clicks: 11,215
  Total purchases: 1,859
  Catalog products: 4,961,480

Historical Period (for features):
  Historical purchases: 10,332
  Historical impressions: 1,685,675
  Historical clicks: 59,691

Data Quality Checks:
  Auction ID overlap (users ∩ results): 88,019 / 88,690
  Users with purchases: 619 / 1,185
  Product catalog coverage: 100.0%


In [15]:
purchases.head()

Unnamed: 0,PURCHASE_ID,PURCHASED_AT,PRODUCT_ID,QUANTITY,UNIT_PRICE,USER_ID,PURCHASE_LINE
0,68bcbedad4b9431a669592df,2025-09-06 23:08:25,68bcb71021b8010caf01c85a,1,2000,ext1:a1a0b122-a736-4356-b2bb-50173b4c173f,1
1,68bafb0f7d56cdcb0fb8f926,2025-09-05 15:00:35,67e4d2e2d90d21f8fe59231a,1,5000,ext1:3511adb0-f429-4ec2-a4a9-a1a405ceccc9,1
2,68ba60ac21a0acdc46219cdd,2025-09-05 04:01:52,66f187f1d90d21033b1222ea,1,2500,ext1:ed4f9c63-747f-4e4f-b8ab-f8541ef7f07c,1
3,68b2812f3bc447eb226e5456,2025-08-30 04:42:27,68649c9305deb3739a26ad67,1,2000,ext1:5bbf83a2-724a-49e9-99a6-99b663e23ff1,4
4,68b5e6d7a84d7eeb906a8da8,2025-09-01 18:33:02,687ad2ef88849a368daf78ac,1,2200,ext1:cfae9c0f-b056-4b3a-8590-e565fde82ff7,1
