In [1]:
import os
import pandas as pd
from dotenv import load_dotenv
import snowflake.connector

load_dotenv()

conn = snowflake.connector.connect(
    user=os.getenv('SNOWFLAKE_USER'),
    password=os.getenv('SNOWFLAKE_PASSWORD'),
    account=os.getenv('SNOWFLAKE_ACCOUNT'),
    warehouse=os.getenv('SNOWFLAKE_WAREHOUSE', 'COMPUTE_WH'),
    database='INCREMENTALITY',
    schema='INCREMENTALITY_RESEARCH'
)

print("Connected")

Connected


In [None]:

import pandas as pd
from pathlib import Path
from datetime import date, timedelta
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings('ignore')

# PHASE 1: ONE-TIME SETUP
print("=== PHASE 1: SETUP ===")

# Define paths
BASE_PATH = Path("/Users/pranjal/Code/marketplace-incrementality/daily_summaries/data")
AUCTIONS_DIR = BASE_PATH / "product_daily_auctions_dataset"
IMPRESSIONS_DIR = BASE_PATH / "product_daily_impressions_dataset"
CLICKS_DIR = BASE_PATH / "product_daily_clicks_dataset"
PURCHASES_DIR = BASE_PATH / "product_daily_purchases_dataset"

# Create panels output directory
PANELS_DIR = BASE_PATH / "panels"
PANELS_DIR.mkdir(exist_ok=True)

# Define final output paths
PRODUCT_PANEL_PATH = PANELS_DIR / "product_day_panel.parquet"
VENDOR_PANEL_PATH = PANELS_DIR / "vendor_day_panel.parquet"

# Analysis period
ANALYSIS_START = date(2025, 3, 14)
ANALYSIS_END = date(2025, 9, 7)
date_list = pd.date_range(start=ANALYSIS_START, end=ANALYSIS_END, freq='D')

print(f"Analysis Period: {ANALYSIS_START} to {ANALYSIS_END}")
print(f"Total days to process: {len(date_list)}")
print(f"Output directory: {PANELS_DIR}")

# Create master catalog from impressions data
print("\nCreating master catalog (Product -> Campaign -> Vendor mapping)...")
master_catalog_rows = []

for current_date in tqdm(date_list[:10], desc="Building Master Catalog"):  # Sample from first 10 days
    date_str = current_date.strftime('%Y-%m-%d')
    impressions_file = IMPRESSIONS_DIR / f"data_{date_str}.parquet"

    if impressions_file.exists():
        df_imp = pd.read_parquet(impressions_file)
        catalog_day = df_imp[['PRODUCT_ID', 'VENDOR_ID', 'CAMPAIGN_ID']].drop_duplicates()
        master_catalog_rows.append(catalog_day)

if master_catalog_rows:
    master_catalog = pd.concat(master_catalog_rows, ignore_index=True).drop_duplicates()
    master_catalog.to_parquet(PANELS_DIR / "master_catalog.parquet", index=False)
    print(f"Master catalog created: {len(master_catalog):,} unique product-vendor-campaign mappings")
else:
    print("No impressions data found for master catalog")
    master_catalog = pd.DataFrame(columns=['PRODUCT_ID', 'VENDOR_ID', 'CAMPAIGN_ID'])

# PHASE 2: DAY-BY-DAY ASSEMBLY OF PRODUCT-DAY PANEL
print("\n=== PHASE 2: PRODUCT-DAY PANEL ASSEMBLY ===")

product_day_dfs = []

for current_date in tqdm(date_list, desc="Processing Product-Day Data"):
    date_str = current_date.strftime('%Y-%m-%d')

    # Load daily files (create empty if missing)
    def load_or_empty(file_path, expected_columns):
        if file_path.exists():
            return pd.read_parquet(file_path)
        else:
            empty_df = pd.DataFrame(columns=expected_columns)
            return empty_df

    # Load all four datasets for this day
    auctions_file = AUCTIONS_DIR / f"data_{date_str}.parquet"
    impressions_file = IMPRESSIONS_DIR / f"data_{date_str}.parquet"
    clicks_file = CLICKS_DIR / f"data_{date_str}.parquet"
    purchases_file = PURCHASES_DIR / f"data_{date_str}.parquet"

    # Expected columns for empty DataFrames
    auctions_cols = ['PRODUCT_ID', 'DATE', 'VENDOR_ID', 'CAMPAIGN_ID', 'PRODUCT_AUCTIONS_COUNT',
                    'TOTAL_BIDS_FOR_PRODUCT', 'TOTAL_WINS_FOR_PRODUCT', 'AVG_BID_RANK_FOR_PRODUCT',
                    'DISTINCT_BIDDERS_FOR_PRODUCT', 'BEST_RANK_FOR_PRODUCT', 'WORST_RANK_FOR_PRODUCT']

    impressions_cols = ['PRODUCT_ID', 'DATE', 'VENDOR_ID', 'CAMPAIGN_ID', 'TOTAL_IMPRESSIONS',
                        'IMPRESSIONS', 'DISTINCT_USERS_IMPRESSED']

    clicks_cols = ['PRODUCT_ID', 'DATE', 'VENDOR_ID', 'CAMPAIGN_ID', 'TOTAL_CLICKS',
                    'CLICKS', 'DISTINCT_USERS_CLICKED']

    purchases_cols = ['PRODUCT_ID', 'DATE', 'PURCHASES', 'LINES_SOLD', 'UNITS_SOLD',
                    'REVENUE_CENTS', 'AVG_UNIT_PRICE_CENTS', 'MIN_UNIT_PRICE_CENTS',
                    'MAX_UNIT_PRICE_CENTS', 'STDDEV_UNIT_PRICE_CENTS', 'DISTINCT_USERS_PURCHASED']

    df_auctions = load_or_empty(auctions_file, auctions_cols)
    df_impressions = load_or_empty(impressions_file, impressions_cols)
    df_clicks = load_or_empty(clicks_file, clicks_cols)
    df_purchases = load_or_empty(purchases_file, purchases_cols)

    # Start with auctions as base (has vendor_id, campaign_id)
    if not df_auctions.empty:
        df_daily = df_auctions.copy()
    else:
        # If no auctions, create minimal structure with date
        df_daily = pd.DataFrame({'PRODUCT_ID': [], 'DATE': pd.to_datetime(date_str)})
        continue  # Skip days with no auction data

    # Outer join with impressions
    if not df_impressions.empty:
        df_daily = df_daily.merge(
            df_impressions[['PRODUCT_ID', 'TOTAL_IMPRESSIONS', 'IMPRESSIONS', 'DISTINCT_USERS_IMPRESSED']],
            on='PRODUCT_ID', how='outer'
        )
    else:
        df_daily[['TOTAL_IMPRESSIONS', 'IMPRESSIONS', 'DISTINCT_USERS_IMPRESSED']] = 0

    # Outer join with clicks
    if not df_clicks.empty:
        df_daily = df_daily.merge(
            df_clicks[['PRODUCT_ID', 'TOTAL_CLICKS', 'CLICKS', 'DISTINCT_USERS_CLICKED']],
            on='PRODUCT_ID', how='outer'
        )
    else:
        df_daily[['TOTAL_CLICKS', 'CLICKS', 'DISTINCT_USERS_CLICKED']] = 0

    # Outer join with purchases
    if not df_purchases.empty:
        df_daily = df_daily.merge(df_purchases, on='PRODUCT_ID', how='outer')
    else:
        for col in purchases_cols[2:]:  # Skip PRODUCT_ID and DATE
            df_daily[col] = 0

    # Clean merged DataFrame
    # Fill nulls with 0 for all metric columns
    metric_columns = [col for col in df_daily.columns if col not in ['PRODUCT_ID', 'DATE', 'VENDOR_ID', 'CAMPAIGN_ID']]
    df_daily[metric_columns] = df_daily[metric_columns].fillna(0)

    # Ensure DATE column is properly set
    df_daily['DATE'] = pd.to_datetime(date_str)

    # Append to list
    if not df_daily.empty:
        product_day_dfs.append(df_daily)

# Concatenate and save product-day panel
if product_day_dfs:
    df_product_day_final = pd.concat(product_day_dfs, ignore_index=True)
    df_product_day_final.to_parquet(PRODUCT_PANEL_PATH, index=False, engine='pyarrow', compression='snappy')

    print(f"✅ Product-Day Panel Created:")
    print(f"   Shape: {df_product_day_final.shape}")
    print(f"   Date range: {df_product_day_final['DATE'].min().date()} to {df_product_day_final['DATE'].max().date()}")
    print(f"   Unique products: {df_product_day_final['PRODUCT_ID'].nunique():,}")
    print(f"   Saved to: {PRODUCT_PANEL_PATH}")
else:
    print("❌ No product-day data to save")

# PHASE 3: DAY-BY-DAY ASSEMBLY OF VENDOR-DAY PANEL
print("\n=== PHASE 3: VENDOR-DAY PANEL ASSEMBLY ===")

vendor_day_dfs = []

# Process each day from the product-day data
for current_date in tqdm(date_list, desc="Processing Vendor-Day Data"):
    date_str = current_date.strftime('%Y-%m-%d')

    # Filter product-day data for this specific date
    if 'df_product_day_final' in locals():
        df_daily_products = df_product_day_final[df_product_day_final['DATE'] == pd.to_datetime(date_str)]

        if not df_daily_products.empty and 'VENDOR_ID' in df_daily_products.columns:
            # Group by vendor_id and aggregate
            vendor_agg = df_daily_products.groupby('VENDOR_ID').agg({
                'PRODUCT_ID': 'nunique',  # distinct_products_active
                'PRODUCT_AUCTIONS_COUNT': 'sum',
                'TOTAL_BIDS_FOR_PRODUCT': 'sum',
                'TOTAL_WINS_FOR_PRODUCT': 'sum',
                'TOTAL_IMPRESSIONS': 'sum',
                'IMPRESSIONS': 'sum',
                'CLICKS': 'sum',
                'PURCHASES': 'sum',
                'UNITS_SOLD': 'sum',
                'REVENUE_CENTS': 'sum',
                'DISTINCT_USERS_IMPRESSED': 'sum',
                'DISTINCT_USERS_CLICKED': 'sum',
                'DISTINCT_USERS_PURCHASED': 'sum'
            }).reset_index()

            # Rename columns
            vendor_agg.rename(columns={'PRODUCT_ID': 'DISTINCT_PRODUCTS_ACTIVE'}, inplace=True)

            # Add date
            vendor_agg['DATE'] = pd.to_datetime(date_str)

            # Calculate ratios
            vendor_agg['WIN_RATE'] = vendor_agg['TOTAL_WINS_FOR_PRODUCT'] / vendor_agg['TOTAL_BIDS_FOR_PRODUCT'].replace(0, 1)
            vendor_agg['REVENUE_DOLLARS'] = vendor_agg['REVENUE_CENTS'] / 100.0

            vendor_day_dfs.append(vendor_agg)

# Concatenate and save vendor-day panel
if vendor_day_dfs:
    df_vendor_day_final = pd.concat(vendor_day_dfs, ignore_index=True)
    df_vendor_day_final.to_parquet(VENDOR_PANEL_PATH, index=False, engine='pyarrow', compression='snappy')

    print(f"✅ Vendor-Day Panel Created:")
    print(f"   Shape: {df_vendor_day_final.shape}")
    print(f"   Date range: {df_vendor_day_final['DATE'].min().date()} to {df_vendor_day_final['DATE'].max().date()}")
    print(f"   Unique vendors: {df_vendor_day_final['VENDOR_ID'].nunique():,}")
    print(f"   Saved to: {VENDOR_PANEL_PATH}")

    # Show sample
    print(f"\nSample of vendor-day data:")
    print(df_vendor_day_final.head().to_markdown(index=False))
else:
    print("❌ No vendor-day data to save")

print(f"\n=== PANEL CREATION COMPLETE ===")
print(f"Output directory: {PANELS_DIR}")
print(f"Files created:")
print(f"  - {PRODUCT_PANEL_PATH}")
print(f"  - {VENDOR_PANEL_PATH}")
print(f"  - master_catalog.parquet")


=== PHASE 1: SETUP ===
Analysis Period: 2025-03-14 to 2025-09-07
Total days to process: 178
Output directory: /Users/pranjal/Code/marketplace-incrementality/daily_summaries/data/panels

Creating master catalog (Product -> Campaign -> Vendor mapping)...


Building Master Catalog:   0%|          | 0/10 [00:00<?, ?it/s]

Master catalog created: 8,490,131 unique product-vendor-campaign mappings

=== PHASE 2: PRODUCT-DAY PANEL ASSEMBLY ===


Processing Product-Day Data:   0%|          | 0/178 [00:00<?, ?it/s]

❌ No product-day data to save

=== PHASE 3: VENDOR-DAY PANEL ASSEMBLY ===


Processing Vendor-Day Data:   0%|          | 0/178 [00:00<?, ?it/s]

❌ No vendor-day data to save

=== PANEL CREATION COMPLETE ===
Output directory: /Users/pranjal/Code/marketplace-incrementality/daily_summaries/data/panels
Files created:
  - /Users/pranjal/Code/marketplace-incrementality/daily_summaries/data/panels/product_day_panel.parquet
  - /Users/pranjal/Code/marketplace-incrementality/daily_summaries/data/panels/vendor_day_panel.parquet
  - master_catalog.parquet


In [None]:

import pandas as pd
from pathlib import Path
from datetime import date, timedelta
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings('ignore')

# PHASE 1: SETUP (same as before)
print("=== PHASE 1: SETUP ===")
BASE_PATH = Path("/Users/pranjal/Code/marketplace-incrementality/daily_summaries/data")
AUCTIONS_DIR = BASE_PATH / "product_daily_auctions_dataset"
IMPRESSIONS_DIR = BASE_PATH / "product_daily_impressions_dataset"
CLICKS_DIR = BASE_PATH / "product_daily_clicks_dataset"
PURCHASES_DIR = BASE_PATH / "product_daily_purchases_dataset"
PANELS_DIR = BASE_PATH / "panels"
PANELS_DIR.mkdir(exist_ok=True)

ANALYSIS_START = date(2025, 3, 14)
ANALYSIS_END = date(2025, 9, 7)
date_list = pd.date_range(start=ANALYSIS_START, end=ANALYSIS_END, freq='D')

print(f"Analysis Period: {ANALYSIS_START} to {ANALYSIS_END}")
print(f"Total days to process: {len(date_list)}")

# PHASE 2: MODIFIED PRODUCT-DAY PANEL ASSEMBLY
print("\n=== PHASE 2: PRODUCT-DAY PANEL ASSEMBLY (FIXED) ===")

product_day_dfs = []
debug_info = {"auctions": 0, "impressions": 0, "clicks": 0, "purchases": 0, "processed": 0}

for current_date in tqdm(date_list, desc="Processing Product-Day Data"):
    date_str = current_date.strftime('%Y-%m-%d')

    # Check which files exist for this date
    auctions_file = AUCTIONS_DIR / f"data_{date_str}.parquet"
    impressions_file = IMPRESSIONS_DIR / f"data_{date_str}.parquet"
    clicks_file = CLICKS_DIR / f"data_{date_str}.parquet"
    purchases_file = PURCHASES_DIR / f"data_{date_str}.parquet"

    files_exist = {
        'auctions': auctions_file.exists(),
        'impressions': impressions_file.exists(),
        'clicks': clicks_file.exists(),
        'purchases': purchases_file.exists()
    }

    # Update debug counters
    for key, exists in files_exist.items():
        if exists:
            debug_info[key] += 1

    # Strategy: Use impressions as base if auctions not available (impressions has vendor_id)
    df_daily = None
    base_dataset = None

    # Try to establish a base dataset with PRODUCT_ID and VENDOR_ID
    if files_exist['auctions']:
        df_daily = pd.read_parquet(auctions_file)
        base_dataset = "auctions"
    elif files_exist['impressions']:
        df_daily = pd.read_parquet(impressions_file)
        base_dataset = "impressions"
    elif files_exist['clicks']:
        df_daily = pd.read_parquet(clicks_file)
        base_dataset = "clicks"
    elif files_exist['purchases']:
        df_daily = pd.read_parquet(purchases_file)
        base_dataset = "purchases"
        # Purchases doesn't have vendor_id, we'll need to add it later

    if df_daily is None or df_daily.empty:
        continue

    # Ensure we have the basic columns
    if 'PRODUCT_ID' not in df_daily.columns:
        continue

    # Add DATE column
    df_daily['DATE'] = pd.to_datetime(date_str)

    # Merge other datasets
    merge_datasets = [
        (impressions_file, 'impressions', files_exist['impressions']),
        (clicks_file, 'clicks', files_exist['clicks']),
        (purchases_file, 'purchases', files_exist['purchases']),
        (auctions_file, 'auctions', files_exist['auctions'])
    ]

    for file_path, dataset_name, exists in merge_datasets:
        if exists and dataset_name != base_dataset:
            try:
                df_merge = pd.read_parquet(file_path)
                if not df_merge.empty and 'PRODUCT_ID' in df_merge.columns:

                    # Get columns to merge (exclude overlapping metadata columns)
                    merge_cols = [col for col in df_merge.columns
                                if col not in ['DATE', 'VENDOR_ID', 'CAMPAIGN_ID'] or col == 'PRODUCT_ID']

                    df_daily = df_daily.merge(
                        df_merge[merge_cols],
                        on='PRODUCT_ID',
                        how='outer',
                        suffixes=('', f'_{dataset_name}')
                    )
            except Exception as e:
                print(f"Error merging {dataset_name} for {date_str}: {e}")
                continue

    # Fill nulls with 0 for metric columns
    numeric_columns = df_daily.select_dtypes(include=['number']).columns
    df_daily[numeric_columns] = df_daily[numeric_columns].fillna(0)

    # Add to list
    if not df_daily.empty:
        product_day_dfs.append(df_daily)
        debug_info['processed'] += 1

print(f"\nDebug Info:")
print(f"  Days with auctions: {debug_info['auctions']}")
print(f"  Days with impressions: {debug_info['impressions']}")
print(f"  Days with clicks: {debug_info['clicks']}")
print(f"  Days with purchases: {debug_info['purchases']}")
print(f"  Days actually processed: {debug_info['processed']}")

# Save product-day panel
if product_day_dfs:
    df_product_day_final = pd.concat(product_day_dfs, ignore_index=True)

    # Save to parquet
    PRODUCT_PANEL_PATH = PANELS_DIR / "product_day_panel.parquet"
    df_product_day_final.to_parquet(PRODUCT_PANEL_PATH, index=False)

    print(f"\n✅ Product-Day Panel Created:")
    print(f"   Shape: {df_product_day_final.shape}")
    print(f"   Columns: {list(df_product_day_final.columns)}")
    print(f"   Date range: {df_product_day_final['DATE'].min().date()} to {df_product_day_final['DATE'].max().date()}")
    print(f"   Unique products: {df_product_day_final['PRODUCT_ID'].nunique():,}")
    print(f"   Saved to: {PRODUCT_PANEL_PATH}")
3
    # Show sample
    print("\nSample data:")
    print(df_product_day_final.head(3).to_string())

else:
    print("❌ No product-day data to save")

print("\n=== DIAGNOSIS COMPLETE ===")

=== PHASE 1: SETUP ===
Analysis Period: 2025-03-14 to 2025-09-07
Total days to process: 178

=== PHASE 2: PRODUCT-DAY PANEL ASSEMBLY (FIXED) ===


Processing Product-Day Data:   0%|          | 0/178 [00:00<?, ?it/s]

: 