# 04 Main Regressions

**Purpose:** Estimate primary ad-click → vendor spend models.

**Models:**
1. **Model 2:** Y_utv = α_u + λ_t + φ_v + β·C_utv + ε
2. **Model 2.5 (Intent-Controlled):** Y_utv = α_ut + φ_v + β·C_utv + ε (User×Week FE)
3. **Model 3:** Y_stv = α_s + λ_t + φ_v + β·C_stv + ε
4. **Two-Part:** Conversion (D) + Conditional spend (log(1 + Y) | Y > 0)

**Interpretation:** β = dollars of vendor spend per additional sponsored click

**Note on Model 2.5:** The User×Week FE absorbs each user's weekly purchasing intent, so β is identified only from within-(user, week) reallocation of spend across vendors. This specification is more conservative because it controls for "I was going to buy anyway this week."

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Fixed effects regression
# pyfixest provides the feols() estimator that every model in this notebook calls.
try:
    import pyfixest as pf
    print(f"pyfixest version: {pf.__version__}")
except ImportError:
    # Package missing: install it quietly, then retry the import.
    print("Installing pyfixest...")
    # NOTE: "!" is an IPython shell escape, so this line only works inside a notebook/IPython session.
    !pip install pyfixest -q
    import pyfixest as pf

# Folder the input panels are read from and the results JSON is written to.
# It is a relative path, so it resolves against the current working directory.
DATA_DIR = Path('data')
print(f"Data directory: {DATA_DIR.resolve()}")

## 1. Load Panels

In [None]:
# Read the user-week-vendor panel, then one session-level panel per session-gap length.
print("Loading panels...")

panel_utv = pd.read_parquet(DATA_DIR.joinpath('panel_utv.parquet'))
print(f"Panel A (u,t,v): {panel_utv.shape[0]:,} rows")

# Session-gap lengths (in days) for which a session panel file exists on disk.
SESSION_GAPS = [1, 2, 3, 5, 7]
panels_stv = {}
for gap in SESSION_GAPS:
    stv_path = DATA_DIR / f'panel_stv_{gap}d.parquet'
    stv_panel = pd.read_parquet(stv_path)
    panels_stv[gap] = stv_panel
    print(f"Panel B ({gap}d gap): {stv_panel.shape[0]:,} rows")

In [None]:
# Quick look at Panel A: first rows, then the column names.
print("\n--- Panel A Preview ---")
panel_a_head = panel_utv.head()
print(panel_a_head)
print(f"\nColumns: {panel_utv.columns.tolist()}")

## 2. Model 2: User × Week × Vendor Panel

$$Y_{utv} = \alpha_u + \lambda_t + \phi_v + \beta \cdot C_{utv} + \varepsilon_{utv}$$

- **Unit of analysis:** (user, week, vendor)
- **Dependent variable:** Y = vendor spend ($)
- **Independent variable:** C = sponsored click count
- **Fixed effects:** user (α_u), week (λ_t), vendor (φ_v)
- **Interpretation of β:** dollars of additional spend per additional click
- **Standard errors:** clustered by user in the main specification (2.4); two-way clustering by (user, vendor) is reported as a check (2.5)

In [None]:
print("=" * 80)
print("MODEL 2: Y_utv = α_u + λ_t + φ_v + β·C_utv + ε")
print("=" * 80)

# Work on a copy: a later cell fills NaNs in the control columns of `df`,
# and that must not modify panel_utv itself.
df = panel_utv.copy()

# 2.1 Pooled OLS baseline: spend on clicks with no fixed effects at all.
print("\n--- 2.1 OLS (no fixed effects) ---")
model_ols = pf.feols("Y ~ C", data=df)
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line after the table.
model_ols.summary()

In [None]:
# 2.2 Add user fixed effects only (absorbs time-invariant user heterogeneity).
print("\n--- 2.2 User FE only ---")
model_user = pf.feols("Y ~ C | user_id", data=df)
# summary() prints the table itself and returns None; print() around it
# only emitted an extra "None".
model_user.summary()

In [None]:
# 2.3 Add week fixed effects on top of user fixed effects.
print("\n--- 2.3 User + Week FE ---")
model_user_week = pf.feols("Y ~ C | user_id + year_week", data=df)
# summary() prints the table itself and returns None; print() around it
# only emitted an extra "None".
model_user_week.summary()

In [None]:
# 2.4 Full Model 2: user + week + vendor fixed effects, standard errors
# clustered by user. `model2` is the headline fit reused by later cells.
print("\n--- 2.4 FULL MODEL 2: User + Week + Vendor FE ---")
model2 = pf.feols("Y ~ C | user_id + year_week + vendor_id", data=df, vcov={'CRV1': 'user_id'})
# summary() prints the table itself and returns None; print() around it
# only emitted an extra "None".
model2.summary()

In [None]:
# Model 2 with two-way clustering
print("\n--- 2.5 Model 2 with two-way clustering (user, vendor) ---")
try:
    # pyfixest expects several cluster variables as ONE "+"-joined string.
    # The previous list value was rejected, so this branch always failed.
    model2_twoway = pf.feols("Y ~ C | user_id + year_week + vendor_id",
                             data=df,
                             vcov={'CRV1': 'user_id + vendor_id'})
    # summary() prints the table itself and returns None.
    model2_twoway.summary()
except Exception as e:
    print(f"Two-way clustering error: {e}")
    print("Falling back to user clustering")
    # Actually perform the fallback the message announces, so that
    # `model2_twoway` is always defined after this cell.
    model2_twoway = pf.feols("Y ~ C | user_id + year_week + vendor_id",
                             data=df,
                             vcov={'CRV1': 'user_id'})
    model2_twoway.summary()

In [None]:
# Model 2 with controls
print("\n--- 2.6 Model 2 with auction controls ---")
control_cols = ['avg_rank', 'share_rank1', 'avg_quality', 'avg_pacing']
# Keep only controls that exist in the panel and have at least one non-missing value.
available_controls = [c for c in control_cols if c in df.columns and df[c].notna().any()]

if available_controls:
    # Fill NaN with 0 for controls (done on `df`, the working copy, in one step).
    # NOTE(review): zero-filling treats a missing value as a real 0 (e.g. rank 0);
    # confirm this is intended, or consider adding a missing-value indicator.
    df[available_controls] = df[available_controls].fillna(0)

    formula = f"Y ~ C + {' + '.join(available_controls)} | user_id + year_week + vendor_id"
    print(f"Formula: {formula}")
    model2_controls = pf.feols(formula, data=df, vcov={'CRV1': 'user_id'})
    # summary() prints the table itself and returns None; print() around it
    # only emitted an extra "None".
    model2_controls.summary()
else:
    print("No controls available")

In [None]:
# Model 2 with impressions: adds the impression count I next to clicks C,
# so exposure without a click can load on its own coefficient.
print("\n--- 2.7 Model 2 with impressions (view-through) ---")
if 'I' in df.columns:
    model2_impressions = pf.feols("Y ~ C + I | user_id + year_week + vendor_id",
                                   data=df, vcov={'CRV1': 'user_id'})
    # summary() prints the table itself and returns None; print() around it
    # only emitted an extra "None".
    model2_impressions.summary()
else:
    print("Impressions not available")

## 2.5 Model 2.5: User×Week FE (Intent-Controlled)

$$Y_{utv} = \alpha_{ut} + \phi_v + \beta \cdot C_{utv} + \varepsilon_{utv}$$

- **User×Week FE (α_ut):** Absorbs weekly purchasing intent (controls for "I was going to buy anyway this week")
- **Identification:** β identified from within-(user,week) reallocation across vendors
- **Interpretation:** Given a user's total weekly budget, does clicking on vendor v shift spend toward v?
- **Caution:** More conservative estimate - requires variation across vendors within same user-week

In [None]:
print("=" * 80)
print("MODEL 2.5: Y_utv = α_ut + φ_v + β·C_utv + ε (User×Week FE)")
print("=" * 80)
print("\nThis model absorbs user-week purchasing intent.")
print("β is identified from within-(user,week) reallocation across vendors.\n")

# Fresh copy: the helper column added below must not leak into panel_utv.
df = panel_utv.copy()

# One label per (user, week) cell, used as a single fixed-effect dimension.
df['user_week'] = df['user_id'].astype(str) + '_' + df['year_week'].astype(str)

# β is only identified from user-weeks that contain more than one vendor,
# so report how much of the panel provides that variation.
n_user_weeks = df['user_week'].nunique()
vendors_per_uw = df.groupby('user_week')['vendor_id'].nunique()
multi_vendor_uw = (vendors_per_uw > 1).sum()

print(f"User-week cells: {n_user_weeks:,}")
print(f"User-weeks with >1 vendor: {multi_vendor_uw:,} ({multi_vendor_uw/n_user_weeks*100:.1f}%)")
print(f"Mean vendors per user-week: {vendors_per_uw.mean():.2f}")

# Estimate Model 2.5: User×Week FE + Vendor FE
print("\n--- Model 2.5: User×Week FE + Vendor FE ---")
# Only the estimation and coefficient extraction are guarded. Previously the
# reporting code sat in the same try, so any reporting error was mislabelled
# as an estimation failure and a successfully fitted model was thrown away.
try:
    model2_5 = pf.feols("Y ~ C | user_week + vendor_id", data=df, vcov={'CRV1': 'user_id'})
    beta_2_5 = model2_5.coef()['C']
    se_2_5 = model2_5.se()['C']
except Exception as e:
    print(f"Error estimating Model 2.5: {e}")
    print("User×Week FE may be too fine-grained for this data.")
    model2_5 = None
    beta_2_5 = None
else:
    # summary() prints the table itself and returns None.
    model2_5.summary()

    print(f"\nβ (Model 2.5) = {beta_2_5:.4f} (SE = {se_2_5:.4f})")

    # Compare against the additive-FE Model 2 fitted in an earlier cell.
    beta_2 = model2.coef()['C']
    print(f"\nComparison with Model 2:")
    print(f"  Model 2 (α_u + λ_t + φ_v): β = {beta_2:.4f}")
    print(f"  Model 2.5 (α_ut + φ_v):   β = {beta_2_5:.4f}")

    change = (beta_2_5 - beta_2) / beta_2 * 100
    print(f"\n  Change: {change:+.1f}%")

    if abs(beta_2_5) < abs(beta_2):
        print("  → Model 2.5 is more conservative (controls for weekly intent)")
    else:
        print("  → Model 2.5 estimate is larger (within-week vendor switching matters)")

## 3. Model 3: Session × Week × Vendor Panel

$$Y_{stv} = \alpha_s + \lambda_t + \phi_v + \beta \cdot C_{stv} + \varepsilon_{stv}$$

Session FE absorbs browsing intent within shopping episode.

In [None]:
print("=" * 80)
print("MODEL 3: Y_stv = α_s + λ_t + φ_v + β·C_stv + ε")
print("=" * 80)

# One result row per session-gap definition; gaps whose fit fails are left out.
results_model3 = []

for gap_days in tqdm(SESSION_GAPS, desc="Estimating Model 3"):
    df_stv = panels_stv[gap_days].copy()

    print(f"\n--- {gap_days}-day session gap ---")
    n_rows = len(df_stv)
    print(f"Observations: {n_rows:,}")
    n_sessions = df_stv['session_id'].nunique()
    print(f"Sessions: {n_sessions:,}")

    try:
        # Session + week + vendor fixed effects, SEs clustered by user.
        model3 = pf.feols(
            "Y ~ C | session_id + year_week + vendor_id",
            data=df_stv,
            vcov={'CRV1': 'user_id'},
        )

        coef = model3.coef()['C']
        se = model3.se()['C']

        print(f"β = {coef:.4f} (SE = {se:.4f})")

        results_model3.append(dict(
            gap_days=gap_days,
            n_obs=n_rows,
            n_sessions=n_sessions,
            beta=coef,
            se=se,
            t_stat=coef / se,
        ))
    except Exception as e:
        # Report the failure and carry on with the next gap length.
        print(f"Error: {e}")

In [None]:
# Tabulate the per-gap Model 3 estimates collected in results_model3.
print("\n--- Model 3 Summary Across Session Gaps ---")
results_df = pd.DataFrame(data=results_model3)
model3_table = results_df.to_string(index=False)
print(model3_table)

## 4. Two-Part Model

**Part 1 (Conversion):** $D_{utv} = \alpha_u + \lambda_t + \phi_v + \beta^D \cdot C_{utv} + \eta_{utv}$, where $D_{utv} = \mathbf{1}\{Y_{utv} > 0\}$ (linear probability model)

**Part 2 (Conditional spend):** $\log(1 + Y_{utv}) = \alpha_u + \lambda_t + \phi_v + \beta^Y \cdot C_{utv} + \nu_{utv}$, estimated only on observations with $Y_{utv} > 0$

In [None]:
print("=" * 80)
print("TWO-PART MODEL")
print("=" * 80)

# Fresh copy of the user-week-vendor panel for both parts of the model.
df = panel_utv.copy()

# Part 1: Conversion (binary). D is regressed linearly on clicks, so β^D is
# a change in conversion probability per additional click.
print("\n--- Part 1: Conversion (D = 1{Y > 0}) ---")
print(f"Conversion rate: {df['D'].mean()*100:.2f}%")

model_part1 = pf.feols("D ~ C | user_id + year_week + vendor_id",
                        data=df, vcov={'CRV1': 'user_id'})
# summary() prints the table itself and returns None; print() around it
# only emitted an extra "None".
model_part1.summary()

beta_D = model_part1.coef()['C']
print(f"\nβ^D = {beta_D:.6f}")
# Multiply by 100 to express the probability change in percentage points.
print(f"Interpretation: 1 additional click → {beta_D*100:.2f} percentage point increase in conversion probability")

In [None]:
# Part 2: Conditional spend (among converters)
print("\n--- Part 2: Conditional Spend (log(1+Y) | Y > 0) ---")
# Restrict to rows flagged as converted (D == 1).
df_converters = df[df['D'] == 1].copy()
print(f"Converters: {len(df_converters):,} ({len(df_converters)/len(df)*100:.1f}%)")

# NOTE(review): log_Y is read as an existing panel column; presumably it holds
# log(1 + Y) as the heading says. Verify against the panel-building step.
model_part2 = pf.feols("log_Y ~ C | user_id + year_week + vendor_id",
                        data=df_converters, vcov={'CRV1': 'user_id'})
# summary() prints the table itself and returns None; print() around it
# only emitted an extra "None".
model_part2.summary()

beta_Y = model_part2.coef()['C']
print(f"\nβ^Y = {beta_Y:.4f}")
# exp(β) - 1 converts the log-point coefficient into a percentage change.
print(f"Interpretation: 1 additional click → {(np.exp(beta_Y)-1)*100:.1f}% increase in conditional spend")

## 5. Vendor-Specific ROI (Optional)

In [None]:
print("=" * 80)
print("VENDOR-SPECIFIC ROI")
print("=" * 80)

# Get top vendors by click volume
vendor_clicks = panel_utv.groupby('vendor_id')['C'].sum().sort_values(ascending=False)
top_vendors = vendor_clicks.head(20).index.tolist()

print(f"Estimating β for top {len(top_vendors)} vendors by click volume...")

vendor_results = []
for vendor in tqdm(top_vendors, desc="Vendors"):
    df_vendor = panel_utv[panel_utv['vendor_id'] == vendor].copy()

    # Too few rows for a within-user, within-week estimate; skip this vendor.
    if len(df_vendor) < 100:
        continue

    try:
        # Vendor is fixed within this subsample, so only user and week FE remain.
        # Clustering is stated explicitly so these SEs use the same user-level
        # clustering as the main models rather than whatever the library defaults to.
        model = pf.feols("Y ~ C | user_id + year_week", data=df_vendor,
                         vcov={'CRV1': 'user_id'})
        vendor_beta = model.coef()['C']
        vendor_se = model.se()['C']
    except Exception as e:
        # Previously a bare `except: pass` dropped failed vendors without a trace.
        print(f"  Skipping vendor {vendor}: {e}")
        continue

    vendor_results.append({
        # str() first so non-string vendor ids can be truncated as well.
        'vendor_id': str(vendor)[:20],
        'n_obs': len(df_vendor),
        'total_clicks': df_vendor['C'].sum(),
        'total_spend': df_vendor['Y'].sum(),
        'beta': vendor_beta,
        'se': vendor_se
    })

vendor_df = pd.DataFrame(vendor_results)
print("\n--- Vendor-Specific β Estimates ---")
print(vendor_df.to_string(index=False))

## 6. Results Summary

In [None]:
# Console recap of every headline coefficient estimated in this notebook.
rule = "=" * 80
print(rule)
print("MAIN REGRESSION RESULTS SUMMARY")
print(rule)

print("\n--- Model 2: Y_utv = α_u + λ_t + φ_v + β·C_utv + ε ---")
m2_beta = model2.coef()['C']
m2_se = model2.se()['C']
print(f"β = {m2_beta:.4f} (SE = {m2_se:.4f})")
print(f"Interpretation: 1 additional sponsored click → ${m2_beta:.2f} additional vendor spend")

print("\n--- Model 2.5: Y_utv = α_ut + φ_v + β·C_utv + ε (Intent-Controlled) ---")
# Model 2.5 is missing when its cell was skipped, and None when estimation failed.
if globals().get('model2_5') is None:
    print("Model 2.5 not estimated (insufficient within-user-week variation)")
else:
    m25_beta = model2_5.coef()['C']
    m25_se = model2_5.se()['C']
    print(f"β = {m25_beta:.4f} (SE = {m25_se:.4f})")
    print("(User×Week FE absorbs weekly purchasing intent)")

print("\n--- Model 3: Y_stv = α_s + λ_t + φ_v + β·C_stv + ε ---")
print("(Session FE absorbs browsing intent)")
for row in results_model3:
    row_gap, row_beta, row_se = row['gap_days'], row['beta'], row['se']
    print(f"  {row_gap}d gap: β = {row_beta:.4f} (SE = {row_se:.4f})")

print("\n--- Two-Part Model ---")
print(f"Part 1 (Conversion): β^D = {beta_D:.6f}")
print(f"Part 2 (Cond. Spend): β^Y = {beta_Y:.4f}")

In [None]:
# Save results
import json


def _json_default(obj):
    """Convert values the json module cannot encode on its own.

    Numpy scalars become native Python numbers and arrays become lists, so
    counts such as total_clicks stay numeric in the output file. With the
    previous `default=str`, numpy integers were written as quoted strings.
    Any other unknown type is still stringified, as before.
    """
    if isinstance(obj, np.generic):
        return obj.item()
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    return str(obj)


# Model 2.5 is absent when its cell was skipped, and None when estimation failed.
has_model2_5 = 'model2_5' in dir() and model2_5 is not None

results_summary = {
    'model2_beta': model2.coef()['C'],
    'model2_se': model2.se()['C'],
    'model2_5_beta': model2_5.coef()['C'] if has_model2_5 else None,
    'model2_5_se': model2_5.se()['C'] if has_model2_5 else None,
    'model3_results': results_model3,
    'twopart_beta_D': beta_D,
    'twopart_beta_Y': beta_Y,
    # The vendor section is optional, so tolerate it not having been run.
    'vendor_results': vendor_results if 'vendor_results' in dir() else []
}

with open(DATA_DIR / 'regression_results.json', 'w', encoding='utf-8') as f:
    json.dump(results_summary, f, indent=2, default=_json_default)

print(f"\nResults saved to {DATA_DIR / 'regression_results.json'}")
print("\nReady for 05_robustness_suite.ipynb")