# Vendor-Week Panel Data Extraction: Auctions

This notebook orchestrates the extraction of vendor auction data from Snowflake, processing it one day at a time and then aggregating to weekly panels.

## 1. Setup and Configuration

In [1]:
import os
import sys
import pandas as pd
import numpy as np
from pathlib import Path
from datetime import datetime, timedelta
from tqdm import tqdm
from dotenv import load_dotenv
import snowflake.connector
import matplotlib.pyplot as plt
import seaborn as sns

# Load environment variables
load_dotenv()

# Set up plotting style
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

print("✅ Environment loaded successfully")

✅ Environment loaded successfully


## 2. Define Analysis Period and Create Date Ranges

In [3]:
# Define the analysis period
ANALYSIS_START_DATE = '2025-03-14'
ANALYSIS_END_DATE = '2025-09-07'

# Generate daily date range
start_date = pd.to_datetime(ANALYSIS_START_DATE)
end_date = pd.to_datetime(ANALYSIS_END_DATE)
date_list = pd.date_range(start=start_date, end=end_date, freq='D')

# Generate weekly date range
week_list = pd.date_range(start=start_date, end=end_date, freq='W-MON')

print(f"📅 Analysis Period: {ANALYSIS_START_DATE} to {ANALYSIS_END_DATE}")
print(f"📊 Total days: {len(date_list)}")
print(f"📈 Total weeks: {len(week_list)}")
print(f"\n🗓️ First 5 weeks:")
for week in week_list[:5]:
    week_end = week + timedelta(days=6)
    print(f"  Week of {week.strftime('%Y-%m-%d')} to {week_end.strftime('%Y-%m-%d')}")

📅 Analysis Period: 2025-03-14 to 2025-09-07
📊 Total days: 178
📈 Total weeks: 25

🗓️ First 5 weeks:
  Week of 2025-03-17 to 2025-03-23
  Week of 2025-03-24 to 2025-03-30
  Week of 2025-03-31 to 2025-04-06
  Week of 2025-04-07 to 2025-04-13
  Week of 2025-04-14 to 2025-04-20


## 3. Extract Daily Auction Data

Run the extraction script to pull auction data one day at a time. This process is resumable - if it fails, you can run it again and it will skip already processed days.

In [4]:
# Run the daily extraction script
!python extract_auctions_daily.py

DAILY AUCTION DATA EXTRACTION
Analysis period: 2025-03-14 to 2025-09-07
Output directory: data/vendor_daily_pulls/auctions

✅ Connected to Snowflake
📊 Processing 178 days of data...

Extracting daily auction data:   1%|        | 1/178 [04:08<12:13:00, 248.48s/it]^C

Force exit
Extracting daily auction data:   1%|        | 1/178 [06:32<19:18:06, 392.58s/it]


## 4. Verify Daily Data Extraction

In [None]:
# Check extracted files
daily_data_dir = Path('data/vendor_daily_pulls/auctions')
parquet_files = sorted(daily_data_dir.glob('auctions_*.parquet'))

print(f"📁 Found {len(parquet_files)} daily files")
print(f"📂 Expected {len(date_list)} files")

if parquet_files:
    # Check coverage
    missing_dates = []
    for date in date_list:
        date_str = date.strftime('%Y-%m-%d')
        file_path = daily_data_dir / f"auctions_{date_str}.parquet"
        if not file_path.exists():
            missing_dates.append(date_str)
    
    if missing_dates:
        print(f"\n⚠️ Missing {len(missing_dates)} files:")
        for date in missing_dates[:10]:  # Show first 10
            print(f"  - {date}")
        if len(missing_dates) > 10:
            print(f"  ... and {len(missing_dates) - 10} more")
    else:
        print("\n✅ All expected files present")
    
    # Sample a file to check structure
    sample_file = parquet_files[len(parquet_files)//2]  # Middle file
    df_sample = pd.read_parquet(sample_file)
    
    print(f"\n📋 Sample file: {sample_file.name}")
    print(f"Shape: {df_sample.shape}")
    print(f"\nColumns: {', '.join(df_sample.columns)}")
    print(f"\nFirst 5 rows:")
    display(df_sample.head())
    
    print(f"\nSummary statistics:")
    display(df_sample.describe().round(2))

📁 Found 0 daily files
📂 Expected 178 files


: 

## 5. Aggregate to Weekly Panels

In [None]:
# Run the weekly aggregation script
!python aggregate_to_weekly.py

## 6. Load and Explore Weekly Panel

In [None]:
# Load the aggregated weekly panel
weekly_panel_file = Path('data/vendor_weekly_panels/vendor_weekly_auctions_panel.parquet')

if weekly_panel_file.exists():
    df_panel = pd.read_parquet(weekly_panel_file)
    print(f"✅ Loaded panel with {len(df_panel):,} observations")
    print(f"\n📊 Panel structure:")
    print(f"  - Vendors: {df_panel['vendor_id'].nunique():,}")
    print(f"  - Weeks: {df_panel['week'].nunique():,}")
    print(f"  - Date range: {df_panel['week'].min()} to {df_panel['week'].max()}")
    
    # Display sample
    print(f"\n📋 Sample data:")
    display(df_panel.head(10))
    
    # Check for balance
    expected_obs = df_panel['vendor_id'].nunique() * df_panel['week'].nunique()
    print(f"\n🔄 Panel balance:")
    print(f"  - Expected observations (balanced): {expected_obs:,}")
    print(f"  - Actual observations: {len(df_panel):,}")
    print(f"  - Panel is {'balanced ✅' if len(df_panel) == expected_obs else 'unbalanced ⚠️'}")
else:
    print("❌ Weekly panel file not found. Please run the aggregation script first.")

## 7. Visualize Auction Activity Patterns

In [None]:
if 'df_panel' in locals() and not df_panel.empty:
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    
    # 1. Total auctions over time
    weekly_totals = df_panel.groupby('week')['auctions'].sum()
    axes[0, 0].plot(weekly_totals.index, weekly_totals.values, linewidth=2)
    axes[0, 0].set_title('Total Auctions Over Time')
    axes[0, 0].set_xlabel('Week')
    axes[0, 0].set_ylabel('Total Auctions')
    axes[0, 0].grid(True, alpha=0.3)
    
    # 2. Average win rate over time
    weekly_winrate = df_panel[df_panel['bids'] > 0].groupby('week')['win_rate'].mean()
    axes[0, 1].plot(weekly_winrate.index, weekly_winrate.values, linewidth=2, color='green')
    axes[0, 1].set_title('Average Win Rate Over Time')
    axes[0, 1].set_xlabel('Week')
    axes[0, 1].set_ylabel('Win Rate')
    axes[0, 1].grid(True, alpha=0.3)
    
    # 3. Distribution of auctions per vendor
    vendor_activity = df_panel.groupby('vendor_id')['auctions'].sum()
    axes[1, 0].hist(vendor_activity[vendor_activity > 0], bins=50, edgecolor='black', alpha=0.7)
    axes[1, 0].set_title('Distribution of Total Auctions per Vendor')
    axes[1, 0].set_xlabel('Total Auctions')
    axes[1, 0].set_ylabel('Number of Vendors')
    axes[1, 0].set_yscale('log')
    
    # 4. Active vendors over time
    active_vendors = df_panel[df_panel['auctions'] > 0].groupby('week')['vendor_id'].nunique()
    axes[1, 1].plot(active_vendors.index, active_vendors.values, linewidth=2, color='orange')
    axes[1, 1].set_title('Number of Active Vendors Over Time')
    axes[1, 1].set_xlabel('Week')
    axes[1, 1].set_ylabel('Active Vendors')
    axes[1, 1].grid(True, alpha=0.3)
    
    plt.tight_layout()
    plt.show()
    
    # Summary statistics
    print("\n📊 Key Metrics Summary:")
    print(f"  - Average auctions per vendor-week: {df_panel['auctions'].mean():.1f}")
    print(f"  - Median auctions per vendor-week: {df_panel['auctions'].median():.1f}")
    print(f"  - Average win rate (when bidding): {df_panel[df_panel['bids'] > 0]['win_rate'].mean():.3f}")
    print(f"  - Vendors with at least one auction: {(vendor_activity > 0).sum():,} ({(vendor_activity > 0).mean():.1%})")

## 8. Export for Further Analysis

In [None]:
if 'df_panel' in locals() and not df_panel.empty:
    # Create different export formats
    export_dir = Path('data/exports')
    export_dir.mkdir(parents=True, exist_ok=True)
    
    # 1. Full panel for modeling
    df_panel.to_parquet(export_dir / 'vendor_weekly_panel_full.parquet', index=False)
    print("✅ Exported full panel to Parquet")
    
    # 2. Active vendors only (at least one auction in the period)
    active_vendors = df_panel.groupby('vendor_id')['auctions'].sum()
    active_vendor_ids = active_vendors[active_vendors > 0].index
    df_active = df_panel[df_panel['vendor_id'].isin(active_vendor_ids)]
    df_active.to_parquet(export_dir / 'vendor_weekly_panel_active_only.parquet', index=False)
    print(f"✅ Exported active vendors panel ({len(active_vendor_ids):,} vendors)")
    
    # 3. Summary statistics CSV
    summary_stats = df_panel.groupby('vendor_id').agg({
        'auctions': ['sum', 'mean', 'std'],
        'bids': ['sum', 'mean'],
        'wins': ['sum', 'mean'],
        'win_rate': 'mean',
        'avg_rank': 'mean'
    }).round(2)
    summary_stats.columns = ['_'.join(col).strip() for col in summary_stats.columns.values]
    summary_stats.to_csv(export_dir / 'vendor_summary_statistics.csv')
    print("✅ Exported summary statistics to CSV")
    
    print(f"\n📁 All exports saved to: {export_dir.absolute()}")

## 9. Data Quality Checks

In [None]:
if 'df_panel' in locals() and not df_panel.empty:
    print("🔍 Running data quality checks...\n")
    
    # Check 1: Win rate bounds
    invalid_winrate = df_panel[(df_panel['win_rate'] < 0) | (df_panel['win_rate'] > 1)]
    status = "✅" if len(invalid_winrate) == 0 else "❌"
    print(f"{status} Win rate in [0, 1]: {len(invalid_winrate)} invalid rows")
    
    # Check 2: Wins <= Bids
    invalid_wins = df_panel[df_panel['wins'] > df_panel['bids']]
    status = "✅" if len(invalid_wins) == 0 else "❌"
    print(f"{status} Wins ≤ Bids: {len(invalid_wins)} invalid rows")
    
    # Check 3: Bids <= Auctions (a vendor can bid multiple times per auction)
    # This is actually okay, so we check if bids are reasonable
    bids_per_auction = df_panel[df_panel['auctions'] > 0]['bids'] / df_panel[df_panel['auctions'] > 0]['auctions']
    unreasonable_bids = (bids_per_auction > 100).sum()  # More than 100 bids per auction seems unreasonable
    status = "✅" if unreasonable_bids == 0 else "⚠️"
    print(f"{status} Reasonable bids per auction: {unreasonable_bids} vendors with >100 bids/auction")
    
    # Check 4: No negative values
    numeric_cols = ['auctions', 'bids', 'wins', 'distinct_campaigns_bid', 'distinct_products_bid']
    negative_values = (df_panel[numeric_cols] < 0).sum().sum()
    status = "✅" if negative_values == 0 else "❌"
    print(f"{status} No negative counts: {negative_values} negative values found")
    
    # Check 5: Panel completeness
    missing_values = df_panel[['vendor_id', 'week', 'auctions', 'bids', 'wins']].isna().sum()
    total_missing = missing_values.sum()
    status = "✅" if total_missing == 0 else "❌"
    print(f"{status} Core columns complete: {total_missing} missing values")
    
    # Check 6: Date consistency
    df_panel['week'] = pd.to_datetime(df_panel['week'])
    week_gaps = df_panel['week'].sort_values().unique()
    expected_weeks = pd.date_range(start=week_gaps.min(), end=week_gaps.max(), freq='W-MON')
    missing_weeks = set(expected_weeks) - set(week_gaps)
    status = "✅" if len(missing_weeks) == 0 else "⚠️"
    print(f"{status} Continuous weekly coverage: {len(missing_weeks)} missing weeks")
    
    print("\n✅ Data quality checks complete!")