# 03 Panel Construction

**Purpose:** Build estimation panels for regression analysis.

**Panels:**
1. **Panel A: (u, t, v)** - User × Week × Vendor
2. **Panel B: (s, t, v)** - Session-Week × Vendor (for each gap threshold)

**Variables:**
- `C` = sponsored click count
- `Y` = spend (promoted-linked only)
- `I` = impression count
- Controls: avg_rank, share_rank1, avg_pacing, avg_quality, avg_final_bid

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Folder the notebook reads its canonical tables from and writes its panels to.
# The path is relative, so echo the resolved location to make the working
# directory visible in the output.
DATA_DIR = Path("data")
print(f"Data directory: {DATA_DIR.resolve()}")

## 1. Load Canonical Tables

In [None]:
# Read the three canonical parquet tables from DATA_DIR and report their row counts.
print("Loading canonical tables...")

promoted_events, purchases_mapped, events_with_sessions = (
    pd.read_parquet(DATA_DIR / table_file)
    for table_file in (
        'promoted_events.parquet',
        'purchases_mapped.parquet',
        'events_with_sessions.parquet',
    )
)

print(f"Promoted events: {len(promoted_events):,}")
print(f"Purchases mapped: {len(purchases_mapped):,}")
print(f"Events with sessions: {len(events_with_sessions):,}")

In [None]:
# Add week info to promoted_events.
# The year_week label pairs the ISO week number with the ISO year. Pairing it
# with the calendar year mislabels days around New Year: 2024-12-30 belongs to
# ISO week 1 of 2025, so a calendar-year label would read 2024_W01.
# NOTE(review): any table joined to these on year_week must build its key the
# same way (ISO year + ISO week) -- confirm for upstream/downstream tables.
click_iso = pd.to_datetime(promoted_events['click_time']).dt.isocalendar()
promoted_events['week'] = click_iso['week']
promoted_events['year'] = click_iso['year']
promoted_events['year_week'] = promoted_events['year'].astype(str) + '_W' + promoted_events['week'].astype(str).str.zfill(2)

# Add week info to purchases (only rows flagged is_post_click are kept).
purchases_valid = purchases_mapped[purchases_mapped['is_post_click']].copy()
purchase_iso = pd.to_datetime(purchases_valid['purchase_time']).dt.isocalendar()
purchases_valid['week'] = purchase_iso['week']
purchases_valid['year'] = purchase_iso['year']
purchases_valid['year_week'] = purchases_valid['year'].astype(str) + '_W' + purchases_valid['week'].astype(str).str.zfill(2)

print(f"Valid purchases: {len(purchases_valid):,}")
print(f"Total valid spend: ${purchases_valid['spend'].sum():,.2f}")

## 2. Panel A: User × Week × Vendor (u, t, v)

In [None]:
banner = "=" * 80
print(banner)
print("PANEL A: USER × WEEK × VENDOR")
print(banner)

# Grouping keys shared by every (u, t, v) aggregate in this cell.
utv_keys = ['user_id', 'year_week', 'vendor_id']

# One row per (user, week, vendor): click count plus per-cell averages of the
# auction fields. Named aggregation yields the flat column names directly.
clicks_utv = (
    promoted_events
    .groupby(utv_keys)
    .agg(
        C=('click_id', 'count'),
        avg_rank=('ranking', 'mean'),
        min_rank=('ranking', 'min'),
        share_winner=('is_winner', 'mean'),
        avg_final_bid=('final_bid', 'mean'),
        avg_quality=('quality', 'mean'),
        avg_pacing=('pacing', 'mean'),
        avg_conversion_rate=('conversion_rate', 'mean'),
        avg_price=('price', 'mean'),
    )
    .reset_index()
)

# Share of clicks that came from rank 1: count the rank-1 rows per cell, attach
# them (cells without any rank-1 click get 0), then divide by the click count.
is_top_rank = promoted_events['ranking'] == 1
rank1_counts = (
    promoted_events.loc[is_top_rank]
    .groupby(utv_keys)
    .size()
    .reset_index(name='rank1_clicks')
)

clicks_utv = pd.merge(clicks_utv, rank1_counts, how='left', on=utv_keys)
clicks_utv['rank1_clicks'] = clicks_utv['rank1_clicks'].fillna(0)
clicks_utv['share_rank1'] = clicks_utv['rank1_clicks'].div(clicks_utv['C'])

print(f"Click aggregates: {len(clicks_utv):,} (u,t,v) observations")

In [None]:
# Total spend (Y) and purchase count per (user, week, vendor). The vendor is the
# one recorded in click_vendor_id, renamed to vendor_id to match the click table.
spend_utv = (
    purchases_valid
    .groupby(['user_id', 'year_week', 'click_vendor_id'])
    .agg(Y=('spend', 'sum'), n_purchases=('purchase_id', 'count'))
    .reset_index()
    .rename(columns={'click_vendor_id': 'vendor_id'})
)

print(f"Spend aggregates: {len(spend_utv):,} (u,t,v) observations")

In [None]:
# Combine click and spend aggregates. The outer join keeps cells that appear in
# only one of the two tables.
panel_utv = pd.merge(
    clicks_utv,
    spend_utv,
    how='outer',
    on=['user_id', 'year_week', 'vendor_id'],
)

# Cells missing from one side get zero clicks / spend / purchases; the counts
# are cast back to integers after the fill.
panel_utv = panel_utv.fillna({'C': 0, 'Y': 0, 'n_purchases': 0})
panel_utv = panel_utv.astype({'C': int, 'n_purchases': int})

# Derived outcomes: conversion indicator (any spend) and log(1 + spend).
panel_utv['D'] = panel_utv['Y'].gt(0).astype(int)
panel_utv['log_Y'] = np.log1p(panel_utv['Y'])

print(f"\nPanel A dimensions: {len(panel_utv):,} observations")
print(f"Unique users: {panel_utv['user_id'].nunique():,}")
print(f"Unique weeks: {panel_utv['year_week'].nunique()}")
print(f"Unique vendors: {panel_utv['vendor_id'].nunique():,}")

In [None]:
# Descriptive statistics for Panel A: distribution of clicks and spend, the
# share of zero cells for each, and the overall conversion rate.
print("\n--- Panel A Summary Statistics ---")

print(f"\nClick distribution (C):")
print(panel_utv['C'].describe())
print(f"Zero clicks: {panel_utv['C'].eq(0).mean()*100:.1f}%")

print(f"\nSpend distribution (Y):")
print(panel_utv['Y'].describe())
print(f"Zero spend: {panel_utv['Y'].eq(0).mean()*100:.1f}%")

print(f"\nConversion rate: {panel_utv['D'].mean()*100:.2f}%")

In [None]:
# Persist Panel A to DATA_DIR without the DataFrame index.
panel_a_path = DATA_DIR / 'panel_utv.parquet'
panel_utv.to_parquet(panel_a_path, index=False)
print(f"\nSaved Panel A to {panel_a_path}")

## 3. Panel B: Session-Week × Vendor (s, t, v)

Build one panel for each session gap threshold (1, 2, 3, 5, and 7 days).

In [None]:
banner = "=" * 80
print(banner)
print("PANEL B: SESSION-WEEK × VENDOR")
print(banner)

# Session gap thresholds (in days) to build a panel for, and the container that
# will hold each resulting panel keyed by its threshold.
SESSION_GAPS = [1, 2, 3, 5, 7]
panels_stv = dict()

In [None]:
# Build one (session, week, vendor) panel per session-gap threshold and store
# it in panels_stv under the gap (in days).

# The click / purchase splits do not depend on the gap threshold, so take them
# once instead of re-filtering and copying the event table on every iteration.
# Nothing below mutates these frames, so no defensive copy is needed.
clicks_events = events_with_sessions[events_with_sessions['event_type'] == 'click']
purchase_events = events_with_sessions[events_with_sessions['event_type'] == 'purchase']

for gap_days in tqdm(SESSION_GAPS, desc="Building session panels"):
    session_col = f'session_id_{gap_days}d'
    stv_keys = [session_col, 'year_week', 'vendor_id']

    # Aggregate clicks to (session, week, vendor)
    clicks_stv = clicks_events.groupby(stv_keys).agg({
        'user_id': 'first',  # session belongs to one user
        'product_id': 'count'  # count as clicks (non-null product_id values)
    }).reset_index()
    clicks_stv.columns = ['session_id', 'year_week', 'vendor_id', 'user_id', 'C']

    # Aggregate spend to (session, week, vendor)
    spend_stv = purchase_events.groupby(stv_keys).agg({
        'spend': 'sum'
    }).reset_index()
    spend_stv.columns = ['session_id', 'year_week', 'vendor_id', 'Y']

    # Merge; the outer join keeps click-only and spend-only cells
    panel_stv = clicks_stv.merge(
        spend_stv,
        on=['session_id', 'year_week', 'vendor_id'],
        how='outer'
    )

    # Fill missing
    panel_stv['C'] = panel_stv['C'].fillna(0).astype(int)
    panel_stv['Y'] = panel_stv['Y'].fillna(0)
    panel_stv['D'] = (panel_stv['Y'] > 0).astype(int)
    panel_stv['log_Y'] = np.log1p(panel_stv['Y'])

    # Fill user_id for spend-only rows. Look the user up from the events of the
    # same session rather than parsing the session_id string: this keeps the
    # original user_id dtype and does not rely on the session_id format.
    if panel_stv['user_id'].isna().any():
        session_owner = events_with_sessions.groupby(session_col)['user_id'].first()
        panel_stv['user_id'] = panel_stv['user_id'].fillna(
            panel_stv['session_id'].map(session_owner)
        )

    # Store
    panels_stv[gap_days] = panel_stv

    print(f"\n{gap_days}-day gap panel: {len(panel_stv):,} (s,t,v) observations")
    print(f"  Sessions: {panel_stv['session_id'].nunique():,}")
    print(f"  Users: {panel_stv['user_id'].nunique():,}")
    print(f"  Zero spend: {(panel_stv['Y'] == 0).mean()*100:.1f}%")

In [None]:
# Write every session panel to DATA_DIR, one parquet file per gap threshold.
for gap_days in panels_stv:
    panel = panels_stv[gap_days]
    filename = f'panel_stv_{gap_days}d.parquet'
    panel.to_parquet(DATA_DIR / filename, index=False)
    print(f"Saved {filename}: {len(panel):,} rows")

## 4. Add Impressions (Optional Control)

In [None]:
print("=" * 80)
print("ADDING IMPRESSIONS")
print("=" * 80)

# Load impressions
impressions = pd.read_parquet(Path('../eda/data/impressions_365d.parquet'))
impressions['impression_time'] = pd.to_datetime(impressions['OCCURRED_AT'])

# The year_week label pairs the ISO week number with the ISO year. Pairing it
# with the calendar year mislabels days around New Year: 2024-12-30 belongs to
# ISO week 1 of 2025, so a calendar-year label would read 2024_W01.
# NOTE(review): the panel this table is merged into on year_week must build
# its key the same way (ISO year + ISO week) -- confirm before relying on I.
impression_iso = impressions['impression_time'].dt.isocalendar()
impressions['week'] = impression_iso['week']
impressions['year'] = impression_iso['year']
impressions['year_week'] = impressions['year'].astype(str) + '_W' + impressions['week'].astype(str).str.zfill(2)

print(f"Total impressions: {len(impressions):,}")

In [None]:
# Aggregate impressions to (user, week, vendor); I is the number of impression rows
impressions_utv = impressions.groupby(['USER_ID', 'year_week', 'VENDOR_ID']).size().reset_index(name='I')
impressions_utv.columns = ['user_id', 'year_week', 'vendor_id', 'I']

print(f"Impression aggregates: {len(impressions_utv):,} (u,t,v) observations")

# Merge to Panel A
panel_utv = pd.read_parquet(DATA_DIR / 'panel_utv.parquet')

# Drop an I column left by an earlier run of this cell. Without this, the merge
# would produce I_x / I_y and the fillna below would raise KeyError, so the
# cell could not be re-run against an already-updated panel file.
panel_utv = panel_utv.drop(columns=['I'], errors='ignore')

# Left join: only cells already in Panel A are kept; cells with no impression
# rows get I = 0.
panel_utv = panel_utv.merge(
    impressions_utv,
    on=['user_id', 'year_week', 'vendor_id'],
    how='left'
)
panel_utv['I'] = panel_utv['I'].fillna(0).astype(int)

# Save updated Panel A
panel_utv.to_parquet(DATA_DIR / 'panel_utv.parquet', index=False)
print(f"Updated Panel A with impressions: {len(panel_utv):,} rows")

## 5. Create Fixed Effect Indices

In [None]:
banner = "=" * 80
print(banner)
print("CREATING FIXED EFFECT INDICES")
print(banner)

# Panel A: reload the saved panel, attach integer category codes for each
# fixed-effect dimension, and write it back to the same file.
panel_a_path = DATA_DIR / 'panel_utv.parquet'
panel_utv = pd.read_parquet(panel_a_path)

# Fixed-effect column -> identifier column it encodes.
for fe_col, id_col in (
    ('user_fe', 'user_id'),
    ('week_fe', 'year_week'),
    ('vendor_fe', 'vendor_id'),
):
    panel_utv[fe_col] = panel_utv[id_col].astype('category').cat.codes

print(f"User FE levels: {panel_utv['user_fe'].nunique()}")
print(f"Week FE levels: {panel_utv['week_fe'].nunique()}")
print(f"Vendor FE levels: {panel_utv['vendor_fe'].nunique()}")

panel_utv.to_parquet(panel_a_path, index=False)

In [None]:
# Panel B: for every gap threshold, reload the saved session panel, attach
# integer category codes for each fixed-effect dimension, and overwrite the file.
fe_sources = {
    'session_fe': 'session_id',
    'week_fe': 'year_week',
    'vendor_fe': 'vendor_id',
    'user_fe': 'user_id',
}

for gap_days in SESSION_GAPS:
    filename = f'panel_stv_{gap_days}d.parquet'
    panel_path = DATA_DIR / filename
    panel = pd.read_parquet(panel_path)

    for fe_col, id_col in fe_sources.items():
        panel[fe_col] = panel[id_col].astype('category').cat.codes

    panel.to_parquet(panel_path, index=False)
    print(f"{filename}: session_fe={panel['session_fe'].nunique()}, week_fe={panel['week_fe'].nunique()}, vendor_fe={panel['vendor_fe'].nunique()}")

## 6. Panel Summary

In [None]:
banner = "=" * 80
print(banner)
print("PANEL CONSTRUCTION COMPLETE")
print(banner)

# Panel A: reload from disk and report size, columns, and the range / mean of
# clicks (C), spend (Y), and impressions (I).
print("\n--- Panel A: (u, t, v) ---")
panel_utv = pd.read_parquet(DATA_DIR / 'panel_utv.parquet')
clicks_a, spend_a, impressions_a = panel_utv['C'], panel_utv['Y'], panel_utv['I']
print(f"Observations: {len(panel_utv):,}")
print(f"Columns: {list(panel_utv.columns)}")
print(f"C range: [{clicks_a.min()}, {clicks_a.max()}], mean={clicks_a.mean():.2f}")
print(f"Y range: [${spend_a.min():.2f}, ${spend_a.max():.2f}], mean=${spend_a.mean():.2f}")
print(f"I range: [{impressions_a.min()}, {impressions_a.max()}], mean={impressions_a.mean():.2f}")

# Panel B: one short report per gap threshold, read back from the saved files.
print("\n--- Panel B: (s, t, v) by gap threshold ---")
for gap_days in SESSION_GAPS:
    panel = pd.read_parquet(DATA_DIR / f'panel_stv_{gap_days}d.parquet')
    print(f"\n{gap_days}-day gap:")
    print(f"  Observations: {len(panel):,}")
    print(f"  Sessions: {panel['session_id'].nunique():,}")
    print(f"  Conversion rate: {panel['D'].mean()*100:.2f}%")

In [None]:
# List every panel parquet file in DATA_DIR (sorted by path) with its size in MB.
print("\n--- Output Files ---")
panel_files = sorted(DATA_DIR.glob('panel_*.parquet'))
for f in panel_files:
    size_mb = f.stat().st_size / 1e6
    print(f"  {f.name}: {size_mb:.1f} MB")

print("\nReady for 04_main_regressions.ipynb")