# 03 - Panel Construction for Funnel Econometric Analysis

This notebook creates three specialized panel datasets for econometric analysis:
1. **Model 1**: Ad Effectiveness Panel (impression-level)
2. **Model 2**: Journey Continuation Panel (browsing session-level)
3. **Model 3**: Final Conversion Panel (shopping session-level)

Each panel is optimized for memory efficiency and saved separately for use in `04_fixed_effects.ipynb`.

## Setup and Configuration

In [1]:
import polars as pl
import numpy as np
from pathlib import Path
from datetime import datetime, timedelta
import gc
import warnings
from tqdm import tqdm

warnings.filterwarnings('ignore')

# Configuration
DATA_DIR = Path('./data')
PANEL_DIR = Path('./data/panels')
PANEL_DIR.mkdir(exist_ok=True, parents=True)

# Sampling parameters for memory efficiency
AUCTION_SAMPLE_FRACTION = 1  # 10% of auctions for Model 1
USER_SAMPLE_FRACTION = 1     # 5% of users for Models 2 & 3
RANDOM_SEED = 42

print("="*80)
print("PANEL CONSTRUCTION FOR FUNNEL ECONOMETRIC ANALYSIS")
print("="*80)
print(f"\nConfiguration:")
print(f"  Data directory: {DATA_DIR}")
print(f"  Panel directory: {PANEL_DIR}")
print(f"  Auction sample: {AUCTION_SAMPLE_FRACTION:.0%}")
print(f"  User sample: {USER_SAMPLE_FRACTION:.0%}")
print(f"  Random seed: {RANDOM_SEED}")

PANEL CONSTRUCTION FOR FUNNEL ECONOMETRIC ANALYSIS

Configuration:
  Data directory: data
  Panel directory: data/panels
  Auction sample: 100%
  User sample: 100%
  Random seed: 42


## Model 1: Ad Effectiveness Panel (Impression-Level)

Unit of analysis: Individual ad impression
Key outcome: WasClicked (binary)

In [2]:
print("\n" + "="*80)
print("MODEL 1: AD EFFECTIVENESS PANEL")
print("="*80)

# Load required data
print("\nLoading data...")
df_impressions = pl.read_parquet(DATA_DIR / 'raw_sample_impressions.parquet')
df_bids = pl.read_parquet(DATA_DIR / 'raw_sample_auctions_results.parquet')
df_clicks = pl.read_parquet(DATA_DIR / 'raw_sample_clicks.parquet')
df_catalog = pl.read_parquet(DATA_DIR / 'processed_sample_catalog.parquet')

print(f"  Impressions: {len(df_impressions):,} rows")
print(f"  Bids: {len(df_bids):,} rows")
print(f"  Clicks: {len(df_clicks):,} rows")
print(f"  Catalog: {len(df_catalog):,} rows")


MODEL 1: AD EFFECTIVENESS PANEL

Loading data...
  Impressions: 1,118,310 rows
  Bids: 11,254,106 rows
  Clicks: 34,260 rows
  Catalog: 3,981,005 rows


In [3]:
# Join impressions with bids to get ranking
print("\nJoining impressions with bid rankings...")
join_keys = ['AUCTION_ID', 'PRODUCT_ID', 'VENDOR_ID', 'CAMPAIGN_ID']

model1_data = df_impressions.join(
    df_bids.select(join_keys + ['RANKING', 'IS_WINNER']),
    on=join_keys,
    how='left'
)

# Add click indicator
print("Adding click indicators...")
clicks_dedup = df_clicks.select(join_keys).unique().with_columns(
    pl.lit(1).alias('WasClicked')
)

model1_data = model1_data.join(
    clicks_dedup,
    on=join_keys,
    how='left'
).with_columns(
    pl.col('WasClicked').fill_null(0).cast(pl.Int8)
)

print(f"\nBasic statistics:")
print(f"  Total impressions: {len(model1_data):,}")
print(f"  Click rate: {model1_data['WasClicked'].mean():.2%}")
print(f"  Average rank: {model1_data['RANKING'].mean():.2f}")


Joining impressions with bid rankings...
Adding click indicators...

Basic statistics:
  Total impressions: 1,121,199
  Click rate: 2.66%
  Average rank: 12.62


In [4]:
# Calculate auction-level aggregates
print("\nCalculating auction-level controls...")

# Get auction aggregates
auction_aggs = df_bids.filter(pl.col('IS_WINNER') == True).group_by('AUCTION_ID').agg([
    pl.count().alias('NumWinningBids_a'),
    pl.col('RANKING').filter(pl.col('RANKING') <= 5).count().alias('NumTop5_a')
])

# Get product prices for top 5 items
top5_bids = df_bids.filter((pl.col('IS_WINNER') == True) & (pl.col('RANKING') <= 5))
top5_with_price = top5_bids.join(
    df_catalog.select(['PRODUCT_ID', 'PRICE']),
    on='PRODUCT_ID',
    how='left'
)

# Calculate average price of top 5
price_aggs = top5_with_price.group_by('AUCTION_ID').agg([
    pl.col('PRICE').mean().alias('AvgPriceTop5_a'),
    pl.col('PRICE').std().alias('StdPriceTop5_a')
]).with_columns([
    pl.col('AvgPriceTop5_a').fill_null(50.0),
    pl.col('StdPriceTop5_a').fill_null(10.0)
])

# Calculate brand concentration (simplified HHI)
brand_conc = df_bids.filter(pl.col('IS_WINNER') == True).group_by(['AUCTION_ID', 'VENDOR_ID']).agg(
    pl.count().alias('vendor_count')
).group_by('AUCTION_ID').agg(
    (pl.col('vendor_count').pow(2).sum() / pl.col('vendor_count').sum().pow(2)).alias('BrandConcentration_a')
)

# Join all auction-level controls
auction_controls = auction_aggs.join(price_aggs, on='AUCTION_ID', how='left')
auction_controls = auction_controls.join(brand_conc, on='AUCTION_ID', how='left')

# Add to main data
model1_data = model1_data.join(auction_controls, on='AUCTION_ID', how='left')

# Fill nulls with sensible defaults
model1_data = model1_data.with_columns([
    pl.col('NumWinningBids_a').fill_null(10),
    pl.col('AvgPriceTop5_a').fill_null(50.0),
    pl.col('BrandConcentration_a').fill_null(0.5)
])

print(f"  Added NumWinningBids_a: mean={model1_data['NumWinningBids_a'].mean():.1f}")
print(f"  Added AvgPriceTop5_a: mean=${model1_data['AvgPriceTop5_a'].mean():.2f}")
print(f"  Added BrandConcentration_a: mean={model1_data['BrandConcentration_a'].mean():.3f}")


Calculating auction-level controls...
  Added NumWinningBids_a: mean=38.1
  Added AvgPriceTop5_a: mean=$716388.69
  Added BrandConcentration_a: mean=0.078


In [5]:
# Sample auctions for memory efficiency
print(f"\nSampling {AUCTION_SAMPLE_FRACTION:.0%} of auctions...")
unique_auctions = model1_data['AUCTION_ID'].unique()
sampled_auctions = pl.DataFrame({'AUCTION_ID': unique_auctions}).sample(
    fraction=AUCTION_SAMPLE_FRACTION,
    with_replacement=False,
    shuffle=True,
    seed=RANDOM_SEED
)

model1_panel = model1_data.join(
    sampled_auctions,
    on='AUCTION_ID',
    how='inner'
)

# Add polynomial and interaction terms
model1_panel = model1_panel.with_columns([
    pl.col('RANKING').pow(2).alias('RankSquared'),
    (pl.col('RANKING') * pl.col('AvgPriceTop5_a')).alias('Rank_x_AvgPrice')
])

print(f"\nFinal Model 1 Panel:")
print(f"  Observations: {len(model1_panel):,}")
print(f"  Unique auctions: {model1_panel['AUCTION_ID'].n_unique():,}")
print(f"  Unique vendors: {model1_panel['VENDOR_ID'].n_unique():,}")
print(f"  Click rate: {model1_panel['WasClicked'].mean():.2%}")

# Save panel
model1_path = PANEL_DIR / 'panel_model1_ad_effectiveness.parquet'
model1_panel.write_parquet(model1_path)
print(f"\n✓ Saved to {model1_path}")
print(f"  File size: {model1_path.stat().st_size / (1024**2):.2f} MB")

# Clear memory
del df_impressions, df_bids, df_clicks, df_catalog
del model1_data, model1_panel
gc.collect()
print("\n✓ Memory cleared")


Sampling 100% of auctions...

Final Model 1 Panel:
  Observations: 1,121,199
  Unique auctions: 152,959
  Unique vendors: 100,306
  Click rate: 2.66%

✓ Saved to data/panels/panel_model1_ad_effectiveness.parquet
  File size: 79.00 MB

✓ Memory cleared


## Model 2: Journey Continuation Panel (Session-Level)

Unit of analysis: Browsing session
Key outcome: ReturnedForNextSession (binary)

In [6]:
print("\n" + "="*80)
print("MODEL 2: JOURNEY CONTINUATION PANEL")
print("="*80)

# Load browsing sessions
print("\nLoading browsing sessions...")
df_browsing = pl.read_parquet(DATA_DIR / 'browsing_sessions.parquet')
print(f"  Total sessions: {len(df_browsing):,}")
print(f"  Unique users: {df_browsing['user_id'].n_unique():,}")

# Sort by user and time
model2_data = df_browsing.sort(['user_id', 'session_start'])

# Create outcome variable using window functions
print("\nCreating outcome variable (ReturnedForNextSession)...")

# Get next session start time within same shopping session
model2_data = model2_data.with_columns(
    pl.col('session_start').shift(-1).over(['user_id', 'shopping_session_id']).alias('next_session_start')
)

# Create binary outcome
model2_data = model2_data.with_columns(
    pl.col('next_session_start').is_not_null().cast(pl.Int8).alias('ReturnedForNextSession')
)

print(f"  Return rate: {model2_data['ReturnedForNextSession'].mean():.2%}")


MODEL 2: JOURNEY CONTINUATION PANEL

Loading browsing sessions...
  Total sessions: 56,324
  Unique users: 3,396

Creating outcome variable (ReturnedForNextSession)...
  Return rate: 83.64%


In [7]:
# Add session-level features
print("\nAdding session-level features...")

# Basic features
model2_data = model2_data.with_columns([
    # Engagement metrics
    pl.col('num_clicks').alias('NumClicks'),
    pl.col('num_impressions').alias('NumImpressions'),
    pl.col('unique_products').alias('VarietyProductsClicked'),
    
    # Purchase indicator
    (pl.col('session_revenue_usd') > 0).cast(pl.Int8).alias('MadePurchase'),
    
    # Session duration in minutes
    pl.col('duration_minutes').alias('SessionDuration'),
    
    # Session order within user
    pl.col('user_id').cum_count().over('user_id').alias('session_number')
])

# Add first session indicator
model2_data = model2_data.with_columns(
    (pl.col('session_number') == 1).cast(pl.Int8).alias('IsFirstSession')
)

# Calculate time since last session
model2_data = model2_data.with_columns(
    pl.col('session_end').shift(1).over('user_id').alias('prev_session_end')
)

# Convert to datetime if needed
if model2_data['session_start'].dtype == pl.Utf8:
    model2_data = model2_data.with_columns([
        pl.col('session_start').str.to_datetime(format='%Y-%m-%dT%H:%M:%S%.f%z', strict=False),
        pl.col('session_end').str.to_datetime(format='%Y-%m-%dT%H:%M:%S%.f%z', strict=False),
        pl.col('prev_session_end').str.to_datetime(format='%Y-%m-%dT%H:%M:%S%.f%z', strict=False)
    ])

# Calculate time since last session in hours
model2_data = model2_data.with_columns(
    ((pl.col('session_start') - pl.col('prev_session_end')).dt.total_seconds() / 3600)
    .fill_null(0.0)
    .alias('TimeSinceLastSession')
)

# Add day of week
model2_data = model2_data.with_columns(
    pl.col('session_start').dt.weekday().alias('dayofweek')
)

print(f"  Average clicks per session: {model2_data['NumClicks'].mean():.2f}")
print(f"  Purchase rate: {model2_data['MadePurchase'].mean():.2%}")
print(f"  Average session duration: {model2_data['SessionDuration'].mean():.1f} minutes")


Adding session-level features...
  Average clicks per session: 0.61
  Purchase rate: 6.72%
  Average session duration: 8.1 minutes


In [8]:
# Sample users for memory efficiency
print(f"\nSampling {USER_SAMPLE_FRACTION:.0%} of users...")
unique_users = model2_data['user_id'].unique()
sampled_users = pl.DataFrame({'user_id': unique_users}).sample(
    fraction=USER_SAMPLE_FRACTION,
    with_replacement=False,
    shuffle=True,
    seed=RANDOM_SEED
)

model2_panel = model2_data.join(
    sampled_users,
    on='user_id',
    how='inner'
)

print(f"\nFinal Model 2 Panel:")
print(f"  Observations: {len(model2_panel):,}")
print(f"  Unique users: {model2_panel['user_id'].n_unique():,}")
print(f"  Return rate: {model2_panel['ReturnedForNextSession'].mean():.2%}")

# Save panel
model2_path = PANEL_DIR / 'panel_model2_continuation.parquet'
model2_panel.write_parquet(model2_path)
print(f"\n✓ Saved to {model2_path}")
print(f"  File size: {model2_path.stat().st_size / (1024**2):.2f} MB")

# Clear memory
del df_browsing, model2_data, model2_panel
gc.collect()
print("\n✓ Memory cleared")


Sampling 100% of users...

Final Model 2 Panel:
  Observations: 56,324
  Unique users: 3,396
  Return rate: 83.64%

✓ Saved to data/panels/panel_model2_continuation.parquet
  File size: 2.81 MB

✓ Memory cleared


## Model 3: Final Conversion Panel (Shopping Session Level)

Unit of analysis: Shopping session (complete journey)
Key outcome: DidPurchase (binary)

In [9]:
print("\n" + "="*80)
print("MODEL 3: FINAL CONVERSION PANEL")
print("="*80)

# Load shopping sessions
print("\nLoading shopping sessions...")
df_shopping = pl.read_parquet(DATA_DIR / 'shopping_sessions.parquet')
print(f"  Total shopping sessions: {len(df_shopping):,}")
print(f"  Unique users: {df_shopping['user_id'].n_unique():,}")


MODEL 3: FINAL CONVERSION PANEL

Loading shopping sessions...
  Total shopping sessions: 9,214
  Unique users: 3,396


In [10]:
# Prepare features
print("\nPreparing shopping session features...")

model3_data = df_shopping.with_columns([
    # Outcome
    pl.col('did_purchase').cast(pl.Int8).alias('DidPurchase'),
    
    # Journey engagement metrics
    pl.col('num_browsing_sessions').alias('NumBrowsingSessions'),
    pl.col('total_clicks').alias('TotalClicks'),
    pl.col('total_impressions').alias('TotalImpressions'),
    
    # Journey duration
    pl.col('shopping_duration_days').alias('TotalDurationDays'),
    
    # Variety metrics - using correct column names
    pl.col('total_unique_products').alias('UniqueProductsViewed'),
    pl.col('total_unique_auctions').alias('UniqueAuctionsEngaged'),
    
    # Revenue metrics
    pl.col('total_revenue_usd').alias('TotalRevenue')
])

# Add derived features
model3_data = model3_data.with_columns([
    # Session density (sessions per day)
    (pl.col('NumBrowsingSessions') / (pl.col('TotalDurationDays') + 0.01)).alias('SessionDensity'),
    
    # Click-to-impression ratio
    (pl.col('TotalClicks') / (pl.col('TotalImpressions') + 1)).alias('ClickThroughRate'),
    
    # Interaction term
    (pl.col('NumBrowsingSessions') * pl.col('TotalClicks')).alias('Sessions_x_Clicks'),
    
    # Variety vendor proxy (using unique auctions as proxy)
    pl.col('UniqueAuctionsEngaged').alias('VarietyVendorsClicked')
])

print(f"  Conversion rate: {model3_data['DidPurchase'].mean():.2%}")
print(f"  Average browsing sessions: {model3_data['NumBrowsingSessions'].mean():.2f}")
print(f"  Average total clicks: {model3_data['TotalClicks'].mean():.2f}")


Preparing shopping session features...
  Conversion rate: 16.21%
  Average browsing sessions: 6.11
  Average total clicks: 3.72


In [11]:
# Add time identifiers
print("\nAdding time identifiers...")

# Convert to datetime if needed
if model3_data['shopping_start'].dtype == pl.Utf8:
    model3_data = model3_data.with_columns(
        pl.col('shopping_start').str.to_datetime(format='%Y-%m-%dT%H:%M:%S%.f%z', strict=False)
    )

# Extract week and year
model3_data = model3_data.with_columns([
    pl.col('shopping_start').dt.week().alias('week_of_year'),
    pl.col('shopping_start').dt.year().alias('year')
])

# Create week-year identifier
model3_data = model3_data.with_columns(
    (pl.col('year').cast(pl.Utf8) + '_' + pl.col('week_of_year').cast(pl.Utf8)).alias('week_year')
)

# Sample users and filter to multi-session users
print(f"\nSampling {USER_SAMPLE_FRACTION:.0%} of users...")
unique_users = model3_data['user_id'].unique()
sampled_users = pl.DataFrame({'user_id': unique_users}).sample(
    fraction=USER_SAMPLE_FRACTION,
    with_replacement=False,
    shuffle=True,
    seed=RANDOM_SEED
)

model3_sampled = model3_data.join(
    sampled_users,
    on='user_id',
    how='inner'
)

# Filter to users with multiple shopping sessions for panel variation
print("Filtering to multi-session users for panel analysis...")
user_counts = model3_sampled.group_by('user_id').agg(
    pl.count().alias('n_shopping_sessions')
)
multi_users = user_counts.filter(pl.col('n_shopping_sessions') > 1)['user_id']

model3_panel = model3_sampled.join(
    pl.DataFrame({'user_id': multi_users}),
    on='user_id',
    how='inner'
)

print(f"\nFinal Model 3 Panel:")
print(f"  Observations: {len(model3_panel):,}")
print(f"  Unique users: {model3_panel['user_id'].n_unique():,}")
print(f"  Conversion rate: {model3_panel['DidPurchase'].mean():.2%}")
print(f"  Average sessions per user: {len(model3_panel) / model3_panel['user_id'].n_unique():.2f}")

# Panel balance checks
print(f"\nPanel Balance:")
print(f"  Avg browsing sessions per shopping journey: {model3_panel['NumBrowsingSessions'].mean():.2f}")
print(f"  Avg total clicks per journey: {model3_panel['TotalClicks'].mean():.2f}")
print(f"  Avg journey duration: {model3_panel['TotalDurationDays'].mean():.2f} days")

# Check user distribution
user_shopping_counts = model3_panel.group_by('user_id').agg([
    pl.count().alias('shopping_sessions'),
    pl.col('DidPurchase').sum().alias('purchases')
])

print(f"\n  User distribution:")
print(f"    Users with 2+ shopping sessions: {user_shopping_counts.filter(pl.col('shopping_sessions') >= 2).height:,}")
print(f"    Users with 3+ shopping sessions: {user_shopping_counts.filter(pl.col('shopping_sessions') >= 3).height:,}")
print(f"    Users with 5+ shopping sessions: {user_shopping_counts.filter(pl.col('shopping_sessions') >= 5).height:,}")
print(f"    Max shopping sessions per user: {user_shopping_counts['shopping_sessions'].max()}")

print(f"\n  Purchase distribution:")
print(f"    Users with 0 purchases: {user_shopping_counts.filter(pl.col('purchases') == 0).height:,}")
print(f"    Users with 1+ purchases: {user_shopping_counts.filter(pl.col('purchases') >= 1).height:,}")
print(f"    Users with 2+ purchases: {user_shopping_counts.filter(pl.col('purchases') >= 2).height:,}")
print(f"    Max purchases per user: {user_shopping_counts['purchases'].max()}")

# Check temporal balance
week_counts = model3_panel.group_by('week_year').agg(pl.count().alias('count'))
print(f"\n  Temporal distribution:")
print(f"    Unique week-years: {week_counts.height}")
print(f"    Min sessions per week: {week_counts['count'].min()}")
print(f"    Max sessions per week: {week_counts['count'].max()}")
print(f"    Avg sessions per week: {week_counts['count'].mean():.1f}")

# Save panel
model3_path = PANEL_DIR / 'panel_model3_conversion.parquet'
model3_panel.write_parquet(model3_path)
print(f"\n✓ Saved to {model3_path}")
print(f"  File size: {model3_path.stat().st_size / (1024**2):.2f} MB")

# Clear memory
del df_shopping, model3_data, model3_sampled, model3_panel
gc.collect()
print("\n✓ Memory cleared")


Adding time identifiers...

Sampling 100% of users...
Filtering to multi-session users for panel analysis...

Final Model 3 Panel:
  Observations: 7,675
  Unique users: 1,857
  Conversion rate: 17.43%
  Average sessions per user: 4.13

Panel Balance:
  Avg browsing sessions per shopping journey: 5.25
  Avg total clicks per journey: 3.15
  Avg journey duration: 5.26 days

  User distribution:
    Users with 2+ shopping sessions: 1,857
    Users with 3+ shopping sessions: 1,239
    Users with 5+ shopping sessions: 674
    Max shopping sessions per user: 14

  Purchase distribution:
    Users with 0 purchases: 1,082
    Users with 1+ purchases: 775
    Users with 2+ purchases: 330
    Max purchases per user: 7

  Temporal distribution:
    Unique week-years: 26
    Min sessions per week: 223
    Max sessions per week: 455
    Avg sessions per week: 295.2

✓ Saved to data/panels/panel_model3_conversion.parquet
  File size: 0.57 MB

✓ Memory cleared


## Summary

In [12]:
print("\n" + "="*80)
print("PANEL CONSTRUCTION COMPLETE")
print("="*80)
print("\nCreated panels:")

for filename in ['panel_model1_ad_effectiveness.parquet', 
                 'panel_model2_continuation.parquet',
                 'panel_model3_conversion.parquet']:
    filepath = PANEL_DIR / filename
    if filepath.exists():
        size_mb = filepath.stat().st_size / (1024**2)
        df_temp = pl.scan_parquet(filepath)
        n_rows = df_temp.select(pl.count()).collect().item()
        print(f"  ✓ {filename}: {n_rows:,} rows, {size_mb:.2f} MB")
    else:
        print(f"  ✗ {filename}: not created")

print("\n✓ All panels ready for econometric analysis in 04_fixed_effects.ipynb")


PANEL CONSTRUCTION COMPLETE

Created panels:
  ✓ panel_model1_ad_effectiveness.parquet: 1,121,199 rows, 79.00 MB
  ✓ panel_model2_continuation.parquet: 56,324 rows, 2.81 MB
  ✓ panel_model3_conversion.parquet: 7,675 rows, 0.57 MB

✓ All panels ready for econometric analysis in 04_fixed_effects.ipynb
