# Data Pull - Full 365 Day Window for Fixed Effects Analysis

This notebook pulls a continuous 365-day window of data for macro-session-based fixed effects analysis.

Key features:
- Single continuous 365-day data window
- CTE-based sampling strategy for efficiency
- All filtering happens in Snowflake (no large IN clauses)
- Proper ID standardization across all tables

In [1]:
import os
import textwrap
from datetime import date, timedelta, datetime
from pathlib import Path
import warnings

import pandas as pd
from dotenv import load_dotenv
import snowflake.connector
from tqdm import tqdm
import json
import gc

# Suppress pandas SQLAlchemy warning
warnings.filterwarnings(
    'ignore',
    category=UserWarning,
    message='pandas only supports SQLAlchemy connectable.*'
)

## 1. Configuration

In [None]:
# Environment: the .env file lives at the project root, one level above this notebook
load_dotenv('../.env')

# Analysis window: TOTAL_PULL_DAYS days ending on ANALYSIS_END_DATE
TOTAL_PULL_DAYS = 365
ANALYSIS_END_DATE = date(2025, 9, 2)
ANALYSIS_START_DATE = ANALYSIS_END_DATE - timedelta(days=TOTAL_PULL_DAYS)

# Share of users kept by the sampler (0.0005 == 0.05%)
SAMPLING_FRACTION = 0.0005

# Local output folder, created if it does not exist yet
DATA_DIR = Path('data')
DATA_DIR.mkdir(exist_ok=True)

# One print call with a newline separator emits the same four lines
print(
    "Configuration:",
    f"  Analysis period: {ANALYSIS_START_DATE} to {ANALYSIS_END_DATE}",
    f"  Total days: {TOTAL_PULL_DAYS}",
    f"  Sampling fraction: {SAMPLING_FRACTION:.2%}",
    sep="\n",
)

## 2. Snowflake Connection

In [3]:
# Open the Snowflake session used by every extraction cell. Credentials are
# read from environment variables; on any failure `conn` is set to None so
# the pipeline cell can detect the missing connection.
try:
    connection_settings = {
        'user': os.getenv('SNOWFLAKE_USER'),
        'password': os.getenv('SNOWFLAKE_PASSWORD'),
        'account': os.getenv('SNOWFLAKE_ACCOUNT'),
        'warehouse': os.getenv('SNOWFLAKE_WAREHOUSE', 'COMPUTE_WH'),
        'database': 'INCREMENTALITY',
        'schema': 'INCREMENTALITY_RESEARCH',
    }
    conn = snowflake.connector.connect(**connection_settings)
    print("[SUCCESS] Snowflake connection established.")
except Exception as exc:
    print(f"[FAILURE] Could not connect to Snowflake: {exc}")
    conn = None

[SUCCESS] Snowflake connection established.


## 3. Core Data Fetching Functions with CTE Strategy

In [4]:
def build_sampling_cte(start_date: str, end_date: str, sampling_fraction: float) -> str:
    """
    Build the SAMPLED_USER_IDS CTE for deterministic, hash-based user sampling.

    Users seen in AUCTIONS_USERS inside the window are assigned to one of
    10,000 buckets via MOD(ABS(HASH(USER_ID)), 10000); users whose bucket is
    below ``10000 * sampling_fraction`` are kept. The same inputs always
    produce the same SQL, so the sample is reproducible across runs.

    Args:
        start_date: Lower bound of the window, interpolated as a SQL string literal.
        end_date: Upper bound of the window, interpolated as a SQL string literal.
        sampling_fraction: Share of buckets to keep, between 0 and 1 inclusive.
            Resolution is 1/10,000; finer fractions are floored, so a value
            below 0.0001 selects no users.

    Returns:
        SQL text starting with ``WITH SAMPLED_USER_IDS AS (...)``, meant to be
        followed directly by a SELECT or by ``, <more CTEs>``.

    Raises:
        ValueError: If ``sampling_fraction`` is not within [0, 1] (NaN included).
    """
    if not 0 <= sampling_fraction <= 1:
        raise ValueError(
            f"sampling_fraction must be between 0 and 1, got {sampling_fraction!r}"
        )

    total_buckets = 10000
    # The float product can land a hair below the intended integer (binary
    # representation error); a bare int() would then drop a whole bucket.
    # The small epsilon absorbs that error while keeping floor semantics.
    selection_threshold = int(total_buckets * sampling_fraction + 1e-9)
    
    return textwrap.dedent(f"""
        WITH SAMPLED_USER_IDS AS (
            WITH ALL_USERS AS (
                -- Get all unique users from auctions in the time window
                SELECT DISTINCT OPAQUE_USER_ID AS USER_ID
                FROM AUCTIONS_USERS
                WHERE CREATED_AT BETWEEN '{start_date}' AND '{end_date}'
                  AND OPAQUE_USER_ID IS NOT NULL
            ),
            BUCKETED_USERS AS (
                SELECT
                    USER_ID,
                    MOD(ABS(HASH(USER_ID)), {total_buckets}) AS bucket
                FROM ALL_USERS
            )
            SELECT USER_ID
            FROM BUCKETED_USERS
            WHERE bucket < {selection_threshold}
        )
    """)

In [5]:
def extract_auctions_users(conn, start_date: str, end_date: str, sampling_fraction: float) -> pd.DataFrame:
    """
    Pull the AUCTIONS_USERS rows of the sampled users for the given window.

    AUCTION_ID is rendered as a lowercase hex string, OPAQUE_USER_ID is
    exposed as USER_ID, and date/hour/day-of-week/week columns are derived
    from CREATED_AT. Rows come back ordered by user and creation time.

    Args:
        conn: Open connection passed straight to ``pd.read_sql``.
        start_date: Lower window bound, interpolated into the SQL.
        end_date: Upper window bound, interpolated into the SQL.
        sampling_fraction: Forwarded to ``build_sampling_cte``.

    Returns:
        DataFrame of auction records with lowercase column names.
    """
    print("\nExtracting AUCTIONS_USERS...")

    cte_sql = build_sampling_cte(start_date, end_date, sampling_fraction)
    select_sql = textwrap.dedent(f"""
        SELECT
            LOWER(TO_VARCHAR(au.AUCTION_ID, 'HEX')) AS AUCTION_ID,
            au.OPAQUE_USER_ID AS USER_ID,
            au.CREATED_AT,
            DATE(au.CREATED_AT) AS auction_date,
            HOUR(au.CREATED_AT) AS auction_hour,
            DAYOFWEEK(au.CREATED_AT) AS auction_dow,
            WEEKOFYEAR(au.CREATED_AT) AS auction_week
        FROM AUCTIONS_USERS au
        JOIN SAMPLED_USER_IDS s ON au.OPAQUE_USER_ID = s.USER_ID
        WHERE au.CREATED_AT BETWEEN '{start_date}' AND '{end_date}'
        ORDER BY au.OPAQUE_USER_ID, au.CREATED_AT
    """)

    auctions = pd.read_sql(cte_sql + select_sql, conn)

    # Lowercase every column name so downstream lookups never hit a KeyError
    auctions.columns = list(map(str.lower, auctions.columns))

    print(f"  Extracted {len(auctions):,} auction records")
    return auctions

In [6]:
def extract_auctions_results(conn, start_date: str, end_date: str, sampling_fraction: float) -> pd.DataFrame:
    """
    Pull the AUCTIONS_RESULTS (bid) rows belonging to the sampled users' auctions.

    Bids are tied to users by joining AUCTIONS_RESULTS to AUCTIONS_USERS on
    AUCTION_ID. AUCTION_ID, VENDOR_ID and CAMPAIGN_ID are rendered as
    lowercase hex strings; PRODUCT_ID is trimmed and lowercased.

    Args:
        conn: Open connection passed straight to ``pd.read_sql``.
        start_date: Lower window bound applied to the bid's CREATED_AT.
        end_date: Upper window bound applied to the bid's CREATED_AT.
        sampling_fraction: Forwarded to ``build_sampling_cte``.

    Returns:
        DataFrame of bid records with lowercase column names.
    """
    print("\nExtracting AUCTIONS_RESULTS...")

    cte_sql = build_sampling_cte(start_date, end_date, sampling_fraction)
    select_sql = textwrap.dedent(f"""
        SELECT
            LOWER(TO_VARCHAR(ar.AUCTION_ID, 'HEX')) AS AUCTION_ID,
            LOWER(TO_VARCHAR(ar.VENDOR_ID, 'HEX')) AS VENDOR_ID,
            LOWER(TO_VARCHAR(ar.CAMPAIGN_ID, 'HEX')) AS CAMPAIGN_ID,
            LOWER(TRIM(ar.PRODUCT_ID)) AS PRODUCT_ID,
            ar.RANKING AS bid_rank,
            ar.IS_WINNER,
            ar.CREATED_AT AS bid_time
        FROM AUCTIONS_RESULTS ar
        JOIN AUCTIONS_USERS au ON ar.AUCTION_ID = au.AUCTION_ID
        JOIN SAMPLED_USER_IDS s ON au.OPAQUE_USER_ID = s.USER_ID
        WHERE ar.CREATED_AT BETWEEN '{start_date}' AND '{end_date}'
    """)

    bids = pd.read_sql(cte_sql + select_sql, conn)

    # Lowercase every column name so downstream lookups never hit a KeyError
    bids.columns = list(map(str.lower, bids.columns))

    print(f"  Extracted {len(bids):,} bid records")
    return bids

In [7]:
def extract_impressions(conn, start_date: str, end_date: str, sampling_fraction: float) -> pd.DataFrame:
    """
    Pull the IMPRESSIONS rows of the sampled users for the given window.

    AUCTION_ID, CAMPAIGN_ID and VENDOR_ID have their dashes removed and are
    lowercased; PRODUCT_ID is trimmed and lowercased. INTERACTION_ID and
    OCCURRED_AT are exposed as impression_id and impression_time.

    Args:
        conn: Open connection passed straight to ``pd.read_sql``.
        start_date: Lower window bound applied to OCCURRED_AT.
        end_date: Upper window bound applied to OCCURRED_AT.
        sampling_fraction: Forwarded to ``build_sampling_cte``.

    Returns:
        DataFrame of impression records with lowercase column names.
    """
    print("\nExtracting IMPRESSIONS...")

    cte_sql = build_sampling_cte(start_date, end_date, sampling_fraction)
    select_sql = textwrap.dedent(f"""
        SELECT
            i.INTERACTION_ID AS impression_id,
            LOWER(REPLACE(i.AUCTION_ID, '-', '')) AS AUCTION_ID,
            LOWER(TRIM(i.PRODUCT_ID)) AS PRODUCT_ID,
            i.USER_ID,
            LOWER(REPLACE(i.CAMPAIGN_ID, '-', '')) AS CAMPAIGN_ID,
            LOWER(REPLACE(i.VENDOR_ID, '-', '')) AS VENDOR_ID,
            i.OCCURRED_AT AS impression_time
        FROM IMPRESSIONS i
        JOIN SAMPLED_USER_IDS s ON i.USER_ID = s.USER_ID
        WHERE i.OCCURRED_AT BETWEEN '{start_date}' AND '{end_date}'
    """)

    impressions_frame = pd.read_sql(cte_sql + select_sql, conn)

    # Lowercase every column name so downstream lookups never hit a KeyError
    impressions_frame.columns = list(map(str.lower, impressions_frame.columns))

    print(f"  Extracted {len(impressions_frame):,} impression records")
    return impressions_frame

In [8]:
def extract_clicks(conn, start_date: str, end_date: str, sampling_fraction: float) -> pd.DataFrame:
    """
    Pull the CLICKS rows of the sampled users for the given window.

    AUCTION_ID, CAMPAIGN_ID and VENDOR_ID have their dashes removed and are
    lowercased; PRODUCT_ID is trimmed and lowercased. INTERACTION_ID and
    OCCURRED_AT are exposed as click_id and click_time.

    Args:
        conn: Open connection passed straight to ``pd.read_sql``.
        start_date: Lower window bound applied to OCCURRED_AT.
        end_date: Upper window bound applied to OCCURRED_AT.
        sampling_fraction: Forwarded to ``build_sampling_cte``.

    Returns:
        DataFrame of click records with lowercase column names.
    """
    print("\nExtracting CLICKS...")

    cte_sql = build_sampling_cte(start_date, end_date, sampling_fraction)
    select_sql = textwrap.dedent(f"""
        SELECT
            c.INTERACTION_ID AS click_id,
            LOWER(REPLACE(c.AUCTION_ID, '-', '')) AS AUCTION_ID,
            LOWER(TRIM(c.PRODUCT_ID)) AS PRODUCT_ID,
            c.USER_ID,
            LOWER(REPLACE(c.CAMPAIGN_ID, '-', '')) AS CAMPAIGN_ID,
            LOWER(REPLACE(c.VENDOR_ID, '-', '')) AS VENDOR_ID,
            c.OCCURRED_AT AS click_time
        FROM CLICKS c
        JOIN SAMPLED_USER_IDS s ON c.USER_ID = s.USER_ID
        WHERE c.OCCURRED_AT BETWEEN '{start_date}' AND '{end_date}'
    """)

    clicks_frame = pd.read_sql(cte_sql + select_sql, conn)

    # Lowercase every column name so downstream lookups never hit a KeyError
    clicks_frame.columns = list(map(str.lower, clicks_frame.columns))

    print(f"  Extracted {len(clicks_frame):,} click records")
    return clicks_frame

In [9]:
def extract_purchases(conn, start_date: str, end_date: str, sampling_fraction: float) -> pd.DataFrame:
    """
    Pull the PURCHASES rows of the sampled users for the given window.

    PRODUCT_ID is trimmed and lowercased, PURCHASED_AT is exposed as
    purchase_time, and a revenue column is computed in SQL as
    QUANTITY * UNIT_PRICE.

    Args:
        conn: Open connection passed straight to ``pd.read_sql``.
        start_date: Lower window bound applied to PURCHASED_AT.
        end_date: Upper window bound applied to PURCHASED_AT.
        sampling_fraction: Forwarded to ``build_sampling_cte``.

    Returns:
        DataFrame of purchase records with lowercase column names.
    """
    print("\nExtracting PURCHASES...")

    cte_sql = build_sampling_cte(start_date, end_date, sampling_fraction)
    select_sql = textwrap.dedent(f"""
        SELECT
            p.PURCHASE_ID,
            p.PURCHASED_AT AS purchase_time,
            LOWER(TRIM(p.PRODUCT_ID)) AS PRODUCT_ID,
            p.QUANTITY,
            p.UNIT_PRICE,
            p.USER_ID,
            p.PURCHASE_LINE,
            (p.QUANTITY * p.UNIT_PRICE) AS revenue
        FROM PURCHASES p
        JOIN SAMPLED_USER_IDS s ON p.USER_ID = s.USER_ID
        WHERE p.PURCHASED_AT BETWEEN '{start_date}' AND '{end_date}'
    """)

    purchases_frame = pd.read_sql(cte_sql + select_sql, conn)

    # Lowercase every column name so downstream lookups never hit a KeyError
    purchases_frame.columns = list(map(str.lower, purchases_frame.columns))

    print(f"  Extracted {len(purchases_frame):,} purchase records")
    return purchases_frame

In [10]:
def extract_catalog_and_get_product_ids(conn, start_date: str, end_date: str, sampling_fraction: float) -> tuple[pd.DataFrame, set]:
    """
    Fetch CATALOG rows for every product the sampled users touched, in one query.

    An ALL_PRODUCT_IDS CTE is chained after the sampling CTE; it unions the
    trimmed, lowercased PRODUCT_IDs found in AUCTIONS_RESULTS, IMPRESSIONS,
    CLICKS and PURCHASES for the sampled users inside the window. CATALOG is
    then joined against that set, and brand, department, category, color and
    style tags are parsed out of the CATEGORIES array.

    Args:
        conn: Open connection passed straight to ``pd.read_sql``.
        start_date: Lower window bound applied to each event table.
        end_date: Upper window bound applied to each event table.
        sampling_fraction: Forwarded to ``build_sampling_cte``.

    Returns:
        A ``(catalog, product_ids)`` tuple: the catalog DataFrame with
        lowercase column names, and the set of distinct ``product_id`` values
        present in that DataFrame.
    """
    print("\nExtracting CATALOG and collecting all PRODUCT_IDs using CTE...")

    cte_sql = build_sampling_cte(start_date, end_date, sampling_fraction)

    # The leading comma continues the WITH clause opened by the sampling CTE
    catalog_sql = textwrap.dedent(f""",
        ALL_PRODUCT_IDS AS (
            -- Collect all product IDs from all relevant tables for our sampled users
            SELECT DISTINCT LOWER(TRIM(ar.PRODUCT_ID)) AS PRODUCT_ID
            FROM AUCTIONS_RESULTS ar
            JOIN AUCTIONS_USERS au ON ar.AUCTION_ID = au.AUCTION_ID
            JOIN SAMPLED_USER_IDS s ON au.OPAQUE_USER_ID = s.USER_ID
            WHERE ar.CREATED_AT BETWEEN '{start_date}' AND '{end_date}'
              AND ar.PRODUCT_ID IS NOT NULL

            UNION

            SELECT DISTINCT LOWER(TRIM(i.PRODUCT_ID)) AS PRODUCT_ID
            FROM IMPRESSIONS i
            JOIN SAMPLED_USER_IDS s ON i.USER_ID = s.USER_ID
            WHERE i.OCCURRED_AT BETWEEN '{start_date}' AND '{end_date}'
              AND i.PRODUCT_ID IS NOT NULL

            UNION

            SELECT DISTINCT LOWER(TRIM(c.PRODUCT_ID)) AS PRODUCT_ID
            FROM CLICKS c
            JOIN SAMPLED_USER_IDS s ON c.USER_ID = s.USER_ID
            WHERE c.OCCURRED_AT BETWEEN '{start_date}' AND '{end_date}'
              AND c.PRODUCT_ID IS NOT NULL

            UNION

            SELECT DISTINCT LOWER(TRIM(p.PRODUCT_ID)) AS PRODUCT_ID
            FROM PURCHASES p
            JOIN SAMPLED_USER_IDS s ON p.USER_ID = s.USER_ID
            WHERE p.PURCHASED_AT BETWEEN '{start_date}' AND '{end_date}'
              AND p.PRODUCT_ID IS NOT NULL
        )
        -- Now, fetch the catalog data for exactly those products
        SELECT
            LOWER(TRIM(c.PRODUCT_ID)) AS PRODUCT_ID,
            c.NAME AS product_name,
            c.PRICE AS catalog_price,
            c.ACTIVE AS is_active,
            c.IS_DELETED,
            c.DESCRIPTION,
            c.VENDORS,
            c.CATEGORIES,
            SPLIT_PART(ARRAY_TO_STRING(FILTER(c.CATEGORIES, x -> x LIKE 'brand#%%'), ''), '#', 2) AS BRAND,
            SPLIT_PART(ARRAY_TO_STRING(FILTER(c.CATEGORIES, x -> x LIKE 'department#%%'), ''), '#', 2) AS DEPARTMENT_ID,
            SPLIT_PART(ARRAY_TO_STRING(FILTER(c.CATEGORIES, x -> x LIKE 'category#%%'), ''), '#', 2) AS CATEGORY_ID,
            SPLIT_PART(ARRAY_TO_STRING(FILTER(c.CATEGORIES, x -> x LIKE 'color#%%'), ''), '#', 2) AS PRIMARY_COLOR,
            REPLACE(
                ARRAY_TO_STRING(FILTER(c.CATEGORIES, x -> x LIKE 'style_tag#%%'), ', '),
                'style_tag#', ''
            ) AS STYLE_TAGS
        FROM CATALOG c
        JOIN ALL_PRODUCT_IDS ap ON LOWER(TRIM(c.PRODUCT_ID)) = ap.PRODUCT_ID
    """)

    catalog_frame = pd.read_sql(cte_sql + catalog_sql, conn)

    # Lowercase every column name so downstream lookups never hit a KeyError
    catalog_frame.columns = list(map(str.lower, catalog_frame.columns))

    # The product-ID set is derived from the catalog rows that actually matched
    distinct_ids = catalog_frame['product_id'].unique()
    product_ids = set(distinct_ids)
    print(f"  Extracted {len(catalog_frame):,} catalog records for {len(product_ids):,} unique products")

    return catalog_frame, product_ids

## 4. Main Data Extraction Pipeline

In [11]:
# Run every extraction against the open Snowflake connection. The resulting
# DataFrames (and timestamp / date strings) stay in the notebook namespace
# for the validation and save cells below.
if conn:
    print("="*80)
    print("STARTING DATA EXTRACTION PIPELINE")
    print("="*80)
    
    # Create timestamp for this extraction run
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    
    # Convert dates to strings for SQL
    start_date_str = ANALYSIS_START_DATE.strftime('%Y-%m-%d')
    end_date_str = ANALYSIS_END_DATE.strftime('%Y-%m-%d')
    
    try:
        # Extract main event tables
        print("\n--- Extracting event data for analysis period ---")
        auctions_users = extract_auctions_users(conn, start_date_str, end_date_str, SAMPLING_FRACTION)
        auctions_results = extract_auctions_results(conn, start_date_str, end_date_str, SAMPLING_FRACTION)
        impressions = extract_impressions(conn, start_date_str, end_date_str, SAMPLING_FRACTION)
        clicks = extract_clicks(conn, start_date_str, end_date_str, SAMPLING_FRACTION)
        purchases = extract_purchases(conn, start_date_str, end_date_str, SAMPLING_FRACTION)
        
        # Extract catalog with product IDs
        catalog, product_ids = extract_catalog_and_get_product_ids(conn, start_date_str, end_date_str, SAMPLING_FRACTION)
    finally:
        # Release the Snowflake session even when a query fails part-way
        # through; the original exception still propagates after this block.
        conn.close()
        print("\n[SUCCESS] Snowflake connection closed")
    
else:
    print("[ERROR] No Snowflake connection available. Please check your credentials.")

STARTING DATA EXTRACTION PIPELINE

--- Extracting event data for analysis period ---

Extracting AUCTIONS_USERS...
  Extracted 657,597 auction records

Extracting AUCTIONS_RESULTS...
  Extracted 26,362,985 bid records

Extracting IMPRESSIONS...
  Extracted 2,459,435 impression records

Extracting CLICKS...
  Extracted 79,031 click records

Extracting PURCHASES...
  Extracted 11,215 purchase records

Extracting CATALOG and collecting all PRODUCT_IDs using CTE...
  Extracted 6,842,400 catalog records for 6,842,400 unique products

[SUCCESS] Snowflake connection closed


## 5. Data Validation and Summary

In [12]:
def validate_data(auctions_users, auctions_results, impressions, clicks, purchases, catalog, product_ids) -> None:
    """
    Print a validation summary for the extracted datasets.

    The report covers record counts, unique users/vendors/campaigns, the
    min/max timestamp of each event table, a simple conversion funnel and,
    per DataFrame, the worst per-column share of missing values.

    Args:
        auctions_users: Auction frame; needs ``user_id`` and ``created_at``.
        auctions_results: Bid frame; needs ``vendor_id``, ``campaign_id``,
            ``is_winner`` and ``bid_time``.
        impressions: Impression frame; needs ``impression_time``.
        clicks: Click frame; needs ``click_time``.
        purchases: Purchase frame; needs ``purchase_time``.
        catalog: Catalog frame (only its length and null counts are used).
        product_ids: Sized collection of product IDs (only its length is used).

    Returns:
        None. Everything is written to stdout.
    """
    
    print("\n" + "="*50)
    print("DATA VALIDATION SUMMARY")
    print("="*50)
    
    # Basic counts
    print("\nRecord Counts:")
    print(f"  Auctions:    {len(auctions_users):,}")
    print(f"  Bids:        {len(auctions_results):,}")
    print(f"  Impressions: {len(impressions):,}")
    print(f"  Clicks:      {len(clicks):,}")
    print(f"  Purchases:   {len(purchases):,}")
    print(f"  Products (Catalog): {len(catalog):,}")
    
    # Unique counts
    print("\nUnique Entities:")
    print(f"  Users:       {auctions_users['user_id'].nunique():,}")
    print(f"  Vendors:     {auctions_results['vendor_id'].nunique():,}")
    print(f"  Campaigns:   {auctions_results['campaign_id'].nunique():,}")
    print(f"  Products (from Events): {len(product_ids):,}")
    
    # Date ranges (using standardized lowercase column names)
    print("\nDate Ranges:")
    try:
        print(f"  Auctions:    {auctions_users['created_at'].min()} to {auctions_users['created_at'].max()}")
        print(f"  Bids:        {auctions_results['bid_time'].min()} to {auctions_results['bid_time'].max()}")
        print(f"  Impressions: {impressions['impression_time'].min()} to {impressions['impression_time'].max()}")
        print(f"  Clicks:      {clicks['click_time'].min()} to {clicks['click_time'].max()}")
        print(f"  Purchases:   {purchases['purchase_time'].min()} to {purchases['purchase_time'].max()}")
    except KeyError as e:
        print(f"  ERROR generating date ranges: A timestamp column was not found -> {e}")

    # Conversion funnel
    print("\nConversion Funnel:")
    # Count winners by summing the boolean mask; filtering the frame first
    # would copy every winning row of a very large table just to take len().
    winning_bids = int(auctions_results['is_winner'].eq(True).sum())
    print(f"  Winning Bids:     {winning_bids:,}")
    if winning_bids > 0 and len(impressions) > 0:
        print(f"  Impressions:      {len(impressions):,} ({len(impressions)/winning_bids*100:.1f}% of winning bids)")
    if len(impressions) > 0 and len(clicks) > 0:
        print(f"  Clicks:           {len(clicks):,} ({len(clicks)/len(impressions)*100:.1f}% CTR)")
    print(f"  Purchase Events:  {len(purchases):,}")
    
    # Missing data check: report the column with the most nulls in each frame
    print("\nMissing Data Check:")
    for df_name, df in [("Auctions", auctions_users), ("Bids", auctions_results), 
                        ("Impressions", impressions), ("Clicks", clicks),
                        ("Purchases", purchases), ("Catalog", catalog)]:
        if not df.empty:
            max_missing_pct = (df.isnull().sum().max() / len(df)) * 100
            if max_missing_pct > 0:
                print(f"  {df_name}: {max_missing_pct:.2f}% max missing")
            else:
                print(f"  {df_name}: No missing values")
        else:
            print(f"  {df_name}: Empty DataFrame")

# ------------------------------------------------------------------------------
# Validate the extracted data, but only once the pipeline cell has produced a
# non-empty auctions frame; otherwise tell the user to run the extraction.
# ------------------------------------------------------------------------------
if 'auctions_users' not in locals() or auctions_users.empty:
    print("DataFrames not loaded. Please re-run the main data extraction pipeline.")
else:
    validate_data(auctions_users, auctions_results, impressions, clicks, purchases, catalog, product_ids)


DATA VALIDATION SUMMARY

Record Counts:
  Auctions:    657,597
  Bids:        26,362,985
  Impressions: 2,459,435
  Clicks:      79,031
  Purchases:   11,215
  Products (Catalog): 6,842,400

Unique Entities:
  Users:       8,350
  Vendors:     143,268
  Campaigns:   787,356
  Products (from Events): 6,842,400

Date Ranges:
  Auctions:    2025-03-14 00:00:15.367000 to 2025-09-01 23:59:31.293000
  Bids:        2025-03-14 00:00:15.407000 to 2025-09-01 23:59:31.302000
  Impressions: 2025-03-14 00:00:18 to 2025-09-01 23:59:59
  Clicks:      2025-03-14 00:02:16 to 2025-09-01 23:57:08
  Purchases:   2025-03-14 00:04:05 to 2025-09-01 23:53:49

Conversion Funnel:
  Winning Bids:     20,191,987
  Impressions:      2,459,435 (12.2% of winning bids)
  Clicks:           79,031 (3.2% CTR)
  Purchase Events:  11,215

Missing Data Check:
  Auctions: No missing values
  Bids: No missing values
  Impressions: No missing values
  Clicks: No missing values
  Purchases: No missing values
  Catalog: No missing values

In [15]:
from pathlib import Path
# Import the class, not the module: a bare `import datetime` here would rebind
# the notebook-level `datetime` name to the module and break `datetime.now()`
# in the extraction pipeline cell when that cell is re-run afterwards.
from datetime import datetime

# 1. Define the full path to your new directory on the external drive
output_dir = Path("/Volumes/rawat/data/marketplace-data")

# 2. Create the directory. 
#    - `parents=True` creates any missing parent folders (like 'data').
#    - `exist_ok=True` means it won't crash if the folder already exists.
try:
    print(f"Attempting to create directory: {output_dir}")
    output_dir.mkdir(parents=True, exist_ok=True)
    print("[SUCCESS] Directory exists and is ready.")

    # 3. Create a small test file to confirm write access.
    test_file_path = output_dir / "write_test.txt"
    timestamp_message = f"Successfully wrote this file at: {datetime.now()}"
    
    test_file_path.write_text(timestamp_message)
    
    print(f"[SUCCESS] Wrote a test file to: {test_file_path}")
    print("\nThe path is valid and writable. You can now run the main data-saving cell.")

except OSError as e:
    # Filesystem failures (missing volume, permissions, read-only drive) all
    # surface as OSError subclasses; anything else is a genuine bug and should
    # not be hidden behind this message.
    print("[FAILURE] An error occurred. Please check the path and drive permissions.")
    print(f"Error details: {e}")

Attempting to create directory: /Volumes/rawat/data/marketplace-data
[SUCCESS] Directory exists and is ready.
[SUCCESS] Wrote a test file to: /Volumes/rawat/data/marketplace-data/write_test.txt

The path is valid and writable. You can now run the main data-saving cell.


## 6. Save Data

In [17]:
# Persist every extracted DataFrame as Parquet plus a JSON metadata file that
# records the run parameters and the row count of each dataset.
if 'auctions_users' in locals():
    print("\n--- Saving data checkpoint ---")
    # Write to the output folder defined in the configuration cell rather than
    # repeating the literal path here, so the two cannot drift apart.
    output_dir = DATA_DIR
    output_dir.mkdir(parents=True, exist_ok=True)
    
    datasets = [
        ("auctions_users", auctions_users),
        ("auctions_results", auctions_results),
        ("impressions", impressions),
        ("clicks", clicks),
        ("purchases", purchases),
        ("catalog", catalog)
    ]
    
    # Using tqdm without an inner print statement for a cleaner progress bar
    for name, df in tqdm(datasets, desc="Saving Parquet files"):
        path = output_dir / f"{name}_365d.parquet"
        df.to_parquet(path, index=False)
    
    print("\nAll data files saved successfully.")

    # Create metadata file; row_counts is what the verification cell compares
    # against the Parquet files on disk.
    metadata = {
        'timestamp': timestamp,
        'analysis_start_date': start_date_str,
        'analysis_end_date': end_date_str,
        'total_days': TOTAL_PULL_DAYS,
        'sampling_fraction': SAMPLING_FRACTION,
        'total_products': len(product_ids),
        'row_counts': {name: len(df) for name, df in datasets}
    }
    
    metadata_path = output_dir / "metadata_365d.json"
    # Explicit encoding keeps the file identical across platforms
    with open(metadata_path, 'w', encoding='utf-8') as f:
        json.dump(metadata, f, indent=2)
    print(f"Saved metadata to {metadata_path.name}")
    
    print("\n" + "="*80)
    print("DATA EXTRACTION COMPLETE")
    print("="*80)
    print(f"All data saved to {output_dir}/")
    print("Ready for macro-session processing and fixed effects analysis")


--- Saving data checkpoint ---


Saving Parquet files: 100%|██████████| 6/6 [04:53<00:00, 48.90s/it]


All data files saved successfully.
Saved metadata to metadata_365d.json

DATA EXTRACTION COMPLETE
All data saved to data/
Ready for macro-session processing and fixed effects analysis





In [1]:
import polars as pl
from pathlib import Path
import json
import sys

def verify_data_checkpoint_polars(output_dir="./data", file_suffix="_365d"):
    """
    Check the integrity of a saved data checkpoint using Polars.

    Loads ``metadata<file_suffix>.json`` from ``output_dir`` and, for every
    dataset listed under its ``row_counts`` key, confirms that
    ``<name><file_suffix>.parquet`` exists, can be read by Polars, and has
    the recorded number of rows. Progress and results are printed.

    Args:
        output_dir: Directory holding the checkpoint (str or Path).
            Defaults to ``"./data"``.
        file_suffix: Suffix shared by the metadata and Parquet file names.
            Defaults to ``"_365d"``.

    Returns:
        True when every listed file exists and matches its expected row
        count, False otherwise.

    Raises:
        SystemExit: With code 1 when the directory, the metadata file, or a
            non-empty ``row_counts`` entry is missing, or the metadata cannot
            be parsed.
    """
    output_dir = Path(output_dir)
    print(f"--- Verifying data checkpoint in '{output_dir}' (using Polars) ---\n")

    # 1. Check if the output directory exists
    if not output_dir.is_dir():
        print(f"❌ ERROR: Directory '{output_dir}' not found. Did the script run correctly?")
        sys.exit(1)
    print(f"✅ Directory '{output_dir}' found.")

    # 2. Check for and load the metadata file
    metadata_path = output_dir / f"metadata{file_suffix}.json"
    if not metadata_path.is_file():
        print(f"❌ ERROR: Metadata file '{metadata_path.name}' not found.")
        sys.exit(1)
    
    try:
        with open(metadata_path, 'r', encoding='utf-8') as f:
            metadata = json.load(f)
        print(f"✅ Metadata file '{metadata_path.name}' loaded successfully.")
    except Exception as e:
        print(f"❌ ERROR: Could not read or parse metadata file: {e}")
        sys.exit(1)

    print("\n--- Verifying Parquet files against metadata ---")
    all_checks_passed = True
    
    # Get the expected row counts from the metadata file
    expected_row_counts = metadata.get('row_counts')
    if not expected_row_counts:
        print("❌ ERROR: 'row_counts' key not found in metadata file.")
        sys.exit(1)

    # 3. Iterate through datasets listed in metadata and check each one
    for name, expected_count in expected_row_counts.items():
        file_path = output_dir / f"{name}{file_suffix}.parquet"
        print(f"\nChecking: {file_path.name}")

        # Check file existence
        if not file_path.is_file():
            print("  ❌ ERROR: File not found.")
            all_checks_passed = False
            continue

        # Read the file in full: this doubles as the readability check
        try:
            df = pl.read_parquet(file_path)
            
            # In Polars, .height gives the number of rows
            actual_count = df.height
            print("  - File is readable: ✅")
            
            # Compare row counts
            if actual_count == expected_count:
                print(f"  - Row count matches metadata: ✅ ({actual_count:,} rows)")
            else:
                print(f"  - Row count MISMATCH: ❌ (Actual: {actual_count:,}, Expected: {expected_count:,})")
                all_checks_passed = False
        except Exception as e:
            print(f"  ❌ ERROR: Failed to read Parquet file with Polars: {e}")
            all_checks_passed = False

    # 4. Final summary
    print("\n" + "="*50)
    if all_checks_passed:
        print("✅ VERIFICATION COMPLETE: All checks passed!")
        print("   All expected files exist and their row counts match the metadata.")
    else:
        print("❌ VERIFICATION FAILED: Issues were found. Please review the errors above.")
    print("="*50)

    return all_checks_passed


if __name__ == "__main__":
    # Make sure you have polars and a parquet engine (like pyarrow) installed
    # pip install polars pyarrow
    verify_data_checkpoint_polars()

--- Verifying data checkpoint in 'data' (using Polars) ---

✅ Directory 'data' found.
✅ Metadata file 'metadata_365d.json' loaded successfully.

--- Verifying Parquet files against metadata ---

Checking: auctions_users_365d.parquet
  - File is readable: ✅
  - Row count matches metadata: ✅ (657,597 rows)

Checking: auctions_results_365d.parquet
  - File is readable: ✅
  - Row count matches metadata: ✅ (26,362,985 rows)

Checking: impressions_365d.parquet
  - File is readable: ✅
  - Row count matches metadata: ✅ (2,459,435 rows)

Checking: clicks_365d.parquet
  - File is readable: ✅
  - Row count matches metadata: ✅ (79,031 rows)

Checking: purchases_365d.parquet
  - File is readable: ✅
  - Row count matches metadata: ✅ (11,215 rows)

Checking: catalog_365d.parquet
  - File is readable: ✅
  - Row count matches metadata: ✅ (6,842,400 rows)

✅ VERIFICATION COMPLETE: All checks passed!
   All expected files exist and their row counts match the metadata.


In [2]:
pip install polars pyarrow


Note: you may need to restart the kernel to use updated packages.


## 7. Memory Cleanup

In [None]:
# Clean up memory
if 'auctions_users' in globals():
    # Deleting only the six DataFrame names frees nothing while other names
    # still point at the same objects: the save cell's `datasets` list holds
    # every frame, its loop variable `df` holds the last one, and
    # `product_ids` is a large set of its own. Drop all of them, tolerating
    # names that were never created.
    for _stale_name in (
        'auctions_users', 'auctions_results', 'impressions', 'clicks',
        'purchases', 'catalog', 'product_ids', 'datasets', 'df',
    ):
        globals().pop(_stale_name, None)
    del _stale_name
    gc.collect()
    print("✓ Memory cleared")