# 06 Umbrella Regression (Mechanism-Nesting Model)

**Purpose:** Estimate a comprehensive model that nests multiple mechanism tests.

**Unit of Analysis:** (s, t, v) = Session × Week × Vendor

**Fixed Effects:** User (α_u) + Week (λ_t) + Vendor (φ_v) — NOT session FE

**Full Specification:**
```
Y_stv = α_u + λ_t + φ_v
      + β_0·C_stv                               # short-run activation
      + Σ_{ℓ=1}^{4} β_ℓ·C_{s,t-ℓ,v}             # lag/awareness/adstock
      + η_0·I_stv + Σ_{ℓ=1}^{4} η_ℓ·I_{s,t-ℓ,v}  # view-through
      + δ_1·(C_stv·Short_st)                    # search-cost/friction
      + δ_2·(C_stv·Top_stv)                     # position bias
      + ρ·C^{(-v)}_st                           # competition/substitution
      + κ·A_st                                  # anchoring
      + ε_stv
```

**Coefficient Tests → Theory:**
- Short-run activation: H0: β_0 = 0
- Awareness/adstock: H0: β_ℓ = 0 ∀ℓ≥1
- View-through: H0: η_ℓ = 0 ∀ℓ≥0
- Search-cost/position: H0: δ_1 = δ_2 = 0
- Competition/substitution: H0: ρ = 0 (expect ρ < 0)
- Anchoring: H0: κ = 0

**NOTE:** Auction controls are EXCLUDED from the main model because they are post-treatment variables; conditioning on them would introduce post-treatment bias.

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

import pyfixest as pf

# Directory holding the parquet inputs read by this notebook; the JSON results
# file is written to the same place.
DATA_DIR = Path('data')
# Echo the absolute path so a wrong working directory is obvious up front.
print(f"Data directory: {DATA_DIR.resolve()}")

## 1. Load Session-Level Panel

In [None]:
print("Loading data...")

# Primary specification: the session panel built with a 3-day gap. Changing
# GAP_DAYS selects the panel file (and session-id column) for another gap.
GAP_DAYS = 3

# Inputs: session-week-vendor panel, user-week-vendor panel, promoted
# (sponsored) click events, and raw events tagged with session ids.
panel_stv = pd.read_parquet(DATA_DIR / f'panel_stv_{GAP_DAYS}d.parquet')
panel_utv = pd.read_parquet(DATA_DIR / 'panel_utv.parquet')
promoted_events = pd.read_parquet(DATA_DIR / 'promoted_events.parquet')
events_with_sessions = pd.read_parquet(DATA_DIR / 'events_with_sessions.parquet')

# Report the row count of every input in load order.
for label, frame in (
    (f"Panel (s,t,v) {GAP_DAYS}d", panel_stv),
    ("Panel (u,t,v)", panel_utv),
    ("Promoted events", promoted_events),
    ("Events with sessions", events_with_sessions),
):
    print(f"{label}: {len(frame):,} rows")

## 2. Build Umbrella Variables

Variables needed:
- C_stv: clicks (already have)
- C_{s,t-ℓ,v}: lagged clicks (ℓ = 1,2,3,4)
- I_stv: impressions
- I_{s,t-ℓ,v}: lagged impressions
- Short_st: friction proxy (short session)
- Top_stv: share of clicks at rank 1
- C^{(-v)}_st: competition clicks (other vendors in session-week)
- A_st: anchor price

In [None]:
print("=" * 80, "BUILDING UMBRELLA VARIABLES", "=" * 80, sep="\n")

# Work on a copy so the loaded panel itself is never modified.
df = panel_stv.copy()

# Integer week index used to build lags by shifting the merge key.
# The index is the rank of each week label among the labels present in the
# panel, so a step of one index equals one calendar week only if no week is
# missing from the data and the labels sort chronologically.
weeks_ordered = sorted(df['year_week'].unique())
week_to_idx = dict(zip(weeks_ordered, range(len(weeks_ordered))))
df['week_idx'] = df['year_week'].map(week_to_idx)

print(f"Weeks available: {len(weeks_ordered)}")
for label, column in (("Sessions", 'session_id'), ("Users", 'user_id')):
    print(f"{label}: {df[column].nunique():,}")

In [None]:
# 2.1 Lagged clicks C_{s,t-ℓ,v} for ℓ = 1..MAX_LAG
print("\n--- Building Lagged Clicks ---")

MAX_LAG = 4
click_lag_keys = ['session_id', 'vendor_id', 'week_idx']
click_by_week = df[click_lag_keys + ['C']].copy()

for lag in tqdm(range(1, MAX_LAG + 1), desc="Building click lags"):
    lag_col = f'C_lag{lag}'
    # Moving the source rows `lag` weeks forward means that, after merging on
    # week_idx, each row receives the click count observed `lag` weeks earlier.
    lagged = (
        click_by_week
        .assign(week_idx=click_by_week['week_idx'] + lag)
        .rename(columns={'C': lag_col})
    )
    df = df.merge(lagged[click_lag_keys + [lag_col]], on=click_lag_keys, how='left')
    # No matching earlier row is treated as zero clicks.
    df[lag_col] = df[lag_col].fillna(0)

for lag in range(1, MAX_LAG + 1):
    print(f"C_lag{lag}: mean = {df[f'C_lag{lag}'].mean():.3f}")

In [None]:
# 2.2 Impressions I and lagged impressions I_lag1..I_lag{MAX_LAG}
print("\n--- Building Impressions ---")

if 'I' not in panel_utv.columns:
    # No impression counts in the user-week-vendor panel: fill with zeros so
    # the columns exist for later cells.
    print("Impressions not available - skipping")
    df['I'] = 0
    for lag in range(1, MAX_LAG + 1):
        df[f'I_lag{lag}'] = 0
else:
    # Impressions are measured per (user, week, vendor) and attached to the
    # session rows through the session's user_id.
    impressions_utv = panel_utv[['user_id', 'year_week', 'vendor_id', 'I']].copy()

    df = df.merge(impressions_utv, on=['user_id', 'year_week', 'vendor_id'], how='left')
    df['I'] = df['I'].fillna(0)
    print(f"Impressions (I): mean = {df['I'].mean():.2f}")

    # week_idx is added only AFTER the contemporaneous merge above; adding it
    # earlier would make that merge carry a second, conflicting week_idx column.
    impressions_utv['week_idx'] = impressions_utv['year_week'].map(week_to_idx)
    imp_lag_keys = ['user_id', 'vendor_id', 'week_idx']

    for lag in tqdm(range(1, MAX_LAG + 1), desc="Building impression lags"):
        lag_col = f'I_lag{lag}'
        # Same key-shift trick as for clicks: push rows `lag` weeks forward.
        lagged = (
            impressions_utv
            .assign(week_idx=impressions_utv['week_idx'] + lag)
            .rename(columns={'I': lag_col})
        )
        df = df.merge(lagged[imp_lag_keys + [lag_col]], on=imp_lag_keys, how='left')
        df[lag_col] = df[lag_col].fillna(0)

In [None]:
# 2.3 Short_st: friction proxy flagging sessions with few events
print("\n--- Building Friction Proxy (Short_st) ---")

# Session-id column of the raw events that matches the chosen gap.
session_col = f'session_id_{GAP_DAYS}d'

# Session "length" = number of events with a non-null product_id.
session_stats = (
    events_with_sessions
    .groupby(session_col)['product_id']
    .count()
    .rename('n_events')
    .rename_axis('session_id')
    .reset_index()
)

# A session is "short" when it has strictly fewer events than the median
# taken over all sessions in the events table.
median_events = session_stats['n_events'].median()
session_stats['is_short'] = session_stats['n_events'].lt(median_events).astype(int)

df = df.merge(
    session_stats[['session_id', 'n_events', 'is_short']],
    on='session_id',
    how='left',
)
# Panel sessions absent from the events table are treated as not short
# (n_events itself stays missing for them).
df['is_short'] = df['is_short'].fillna(0)

print(f"Median session events: {median_events}")
print(f"Short sessions: {df['is_short'].mean()*100:.1f}%")

In [None]:
# 2.4 Top_stv: position proxy built from promoted-click rankings
print("\n--- Building Position Proxy (Top_stv) ---")

# Derive the week label of every promoted click (adds/overwrites columns on
# promoted_events in place).
# NOTE(review): the label combines the calendar year with the ISO week number;
# around New Year these can disagree — confirm this matches how the panel's
# year_week key was built before changing it.
promoted_events['click_time'] = pd.to_datetime(promoted_events['click_time'])
click_ts = promoted_events['click_time']
promoted_events['year_week'] = (
    click_ts.dt.year.astype(str)
    + '_W'
    + click_ts.dt.isocalendar().week.astype(str).str.zfill(2)
)

# Promoted events carry no session id here, so (user, week, vendor) is used as
# the join key to the session panel.
utv_keys = ['user_id', 'year_week', 'vendor_id']
rank_by_utv = promoted_events.groupby(utv_keys, as_index=False).agg(
    avg_rank=('ranking', 'mean'),
    share_rank1=('is_winner', 'mean'),  # presumably is_winner marks rank 1 — verify
)

df = df.merge(rank_by_utv, on=utv_keys, how='left')
# Rows without promoted events get 0 for both measures.
for col in ('share_rank1', 'avg_rank'):
    df[col] = df[col].fillna(0)

# Top indicator: more than half of the promoted events are winners.
df['is_top'] = df['share_rank1'].gt(0.5).astype(int)

# Mean rank is reported over rows that actually have rank information.
print(f"Mean rank: {df.loc[df['avg_rank'] > 0, 'avg_rank'].mean():.2f}")
print(f"Share at top rank: {df['is_top'].mean()*100:.1f}%")

In [None]:
# 2.5 C^{(-v)}_st: clicks on all OTHER vendors within the same session-week
print("\n--- Building Competition Clicks (C^{(-v)}_st) ---")

st_keys = ['session_id', 'year_week']

# Session-week click totals across vendors, attached back to every row.
total_clicks_st = (
    df.groupby(st_keys, as_index=False)['C']
    .sum()
    .rename(columns={'C': 'C_total'})
)
df = df.merge(total_clicks_st, on=st_keys, how='left')

# Leave-one-out count: total minus the row's own vendor clicks, floored at 0.
df['C_competition'] = (df['C_total'] - df['C']).clip(lower=0)

print(f"Mean competition clicks: {df['C_competition'].mean():.2f}")

In [None]:
# 2.6 A_st: anchor price = mean price over a user's promoted events in the week
print("\n--- Building Anchor Price (A_st) ---")

if 'price' not in promoted_events.columns:
    print("Price not available - setting anchor to 0")
    df['anchor_price'] = 0
else:
    # Relies on promoted_events already carrying a year_week column.
    uw_keys = ['user_id', 'year_week']
    anchor_prices = promoted_events.groupby(uw_keys, as_index=False).agg(
        anchor_price=('price', 'mean'),
    )

    df = df.merge(anchor_prices, on=uw_keys, how='left')
    # NOTE(review): user-weeks without promoted events get an anchor of 0,
    # which enters the regressions as a real price of zero — confirm intended.
    df['anchor_price'] = df['anchor_price'].fillna(0)

    # Report the mean over rows that have a positive anchor only.
    observed_anchor = df.loc[df['anchor_price'] > 0, 'anchor_price']
    print(f"Mean anchor price: ${observed_anchor.mean():.2f}")

In [None]:
# 2.7 Interaction regressors for the friction and position mechanisms
print("\n--- Building Interaction Terms ---")

# δ_1 regressor: clicks × short-session flag.
df['C_x_short'] = df['C'].mul(df['is_short'])

# δ_2 regressor: clicks × top-position flag.
df['C_x_top'] = df['C'].mul(df['is_top'])

for label, column in (("C × Short", 'C_x_short'), ("C × Top", 'C_x_top')):
    print(f"{label}: mean = {df[column].mean():.4f}")

In [None]:
# Overview of the assembled regression frame
print(f"\n{'=' * 80}")
print("UMBRELLA VARIABLE SUMMARY")
print("=" * 80)

print(f"\nObservations: {len(df):,}")
for label, column in (
    ("Sessions", 'session_id'),
    ("Users", 'user_id'),
    ("Vendors", 'vendor_id'),
):
    print(f"{label}: {df[column].nunique():,}")

print("\n--- Variable Statistics ---")
vars_to_show = [
    'Y', 'C',
    'C_lag1', 'C_lag2', 'C_lag3', 'C_lag4',
    'I', 'I_lag1',
    'is_short', 'is_top', 'C_competition', 'anchor_price',
]
# Only report the variables that were actually built.
available_vars = [name for name in vars_to_show if name in df.columns]
for v in available_vars:
    series = df[v]
    print(f"{v}: mean = {series.mean():.4f}, std = {series.std():.4f}")

## 3. Baseline Model (No Mechanisms)

In [None]:
print("=" * 80)
print("MODEL 0: BASELINE (User + Week + Vendor FE)")
print("=" * 80)

# Baseline: outcome Y on contemporaneous clicks only, absorbing user, week and
# vendor fixed effects; standard errors are clustered by user (CRV1).
model0 = pf.feols("Y ~ C | user_id + year_week + vendor_id",
                  data=df, vcov={'CRV1': 'user_id'})
# summary() prints the table itself and returns None, so it is called directly;
# wrapping it in print() would add a stray "None" line to the output.
model0.summary()

print(f"\nβ_0 (contemporaneous click effect) = {model0.coef()['C']:.4f}")

## 4. Awareness/Adstock Model (Lagged Clicks)

In [None]:
print("=" * 80)
print("MODEL 1: AWARENESS/ADSTOCK (Contemporaneous + Lagged Clicks)")
print("=" * 80)
print("\nH0: β_ℓ = 0 for ℓ ≥ 1 (no delayed/adstock effect)")

# Formula fragment with all click lags; the formulas of the later models in
# this file reference lag_vars as well.
lag_vars = ' + '.join([f'C_lag{i}' for i in range(1, MAX_LAG + 1)])
formula = f"Y ~ C + {lag_vars} | user_id + year_week + vendor_id"
print(f"\nFormula: {formula}")

model1 = pf.feols(formula, data=df, vcov={'CRV1': 'user_id'})
# summary() prints the table itself and returns None, so it is called directly;
# wrapping it in print() would add a stray "None" line to the output.
model1.summary()

# Fetch estimates and standard errors once instead of on every loop pass.
m1_coefs = model1.coef()
m1_ses = model1.se()

print("\n--- Lag Structure ---")
print(f"β_0 (t=0): {m1_coefs['C']:.4f}")
for lag in range(1, MAX_LAG + 1):
    coef = m1_coefs[f'C_lag{lag}']
    se = m1_ses[f'C_lag{lag}']
    print(f"β_{lag} (t-{lag}): {coef:.4f} (SE = {se:.4f}, t = {coef/se:.2f})")

## 5. View-Through Model (Impressions)

In [None]:
print("=" * 80)
print("MODEL 2: VIEW-THROUGH (Clicks + Impressions with Lags)")
print("=" * 80)
print("\nH0: η_ℓ = 0 for all ℓ (no view-through effect)")

# The model is only estimated when impressions carry information; an all-zero
# I column means impressions were unavailable (or never observed).
if df['I'].sum() > 0:
    # Contemporaneous impressions plus their lags.
    imp_vars = 'I + ' + ' + '.join([f'I_lag{i}' for i in range(1, MAX_LAG + 1)])
    formula = f"Y ~ C + {lag_vars} + {imp_vars} | user_id + year_week + vendor_id"
    print(f"\nFormula: {formula}")

    model2 = pf.feols(formula, data=df, vcov={'CRV1': 'user_id'})
    # summary() prints the table itself and returns None, so it is called
    # directly; wrapping it in print() would add a stray "None" line.
    model2.summary()

    m2_coefs = model2.coef()
    print("\n--- View-Through Coefficients ---")
    print(f"η_0 (I, t=0): {m2_coefs['I']:.6f}")
    for lag in range(1, MAX_LAG + 1):
        coef = m2_coefs[f'I_lag{lag}']
        print(f"η_{lag} (I, t-{lag}): {coef:.6f}")
else:
    print("Impressions not available - skipping view-through model")
    # Later cells check for None to know the model was skipped.
    model2 = None

## 6. Search-Cost/Position Model (Interactions)

In [None]:
print("=" * 80)
print("MODEL 3: SEARCH-COST/POSITION (Friction & Position Interactions)")
print("=" * 80)
print("\nH0: δ_1 = δ_2 = 0 (no friction/position moderation)")

# Clicks interacted with the short-session and top-position flags, on top of
# the lagged-click terms.
# NOTE(review): the flags enter only through the interactions (no main
# effects), matching the written specification — confirm this is intended.
formula = f"Y ~ C + C_x_short + C_x_top + {lag_vars} | user_id + year_week + vendor_id"
print(f"\nFormula: {formula}")

model3 = pf.feols(formula, data=df, vcov={'CRV1': 'user_id'})
# summary() prints the table itself and returns None, so it is called directly;
# wrapping it in print() would add a stray "None" line to the output.
model3.summary()

m3_coefs = model3.coef()
print("\n--- Interaction Effects ---")
print(f"β (base click effect): {m3_coefs['C']:.4f}")
print(f"δ_1 (C × Short): {m3_coefs['C_x_short']:.4f}")
print(f"δ_2 (C × Top): {m3_coefs['C_x_top']:.4f}")
print("\nInterpretation:")
print("  δ_1 > 0: Clicks more effective in short sessions (friction reduction)")
print("  δ_2 > 0: Top-rank clicks more effective (position matters)")

## 7. Competition/Substitution Model

In [None]:
print("=" * 80)
print("MODEL 4: COMPETITION/SUBSTITUTION")
print("=" * 80)
print("\nH0: ρ = 0 (no competition effect)")
print("Expected: ρ < 0 if vendors are substitutes")

# Adds clicks on other vendors in the same session-week to the lag model.
formula = f"Y ~ C + C_competition + {lag_vars} | user_id + year_week + vendor_id"
print(f"\nFormula: {formula}")

model4 = pf.feols(formula, data=df, vcov={'CRV1': 'user_id'})
# summary() prints the table itself and returns None, so it is called directly;
# wrapping it in print() would add a stray "None" line to the output.
model4.summary()

rho = model4.coef()['C_competition']
se_rho = model4.se()['C_competition']
print(f"\nρ (competition effect) = {rho:.4f} (SE = {se_rho:.4f})")
# The message reflects the sign of the point estimate only, not significance.
if rho < 0:
    print("→ Negative ρ: Clicks on other vendors REDUCE own vendor spend (substitution)")
else:
    print("→ Positive ρ: Clicks on other vendors INCREASE own vendor spend (complementarity)")

## 8. Anchoring Model

In [None]:
print("=" * 80)
print("MODEL 5: ANCHORING")
print("=" * 80)
print("\nH0: κ = 0 (no anchoring effect)")

# An all-zero anchor column means no price information was available, in which
# case the model is skipped.
if df['anchor_price'].sum() > 0:
    # Adds the anchor price to the lag model.
    formula = f"Y ~ C + anchor_price + {lag_vars} | user_id + year_week + vendor_id"
    print(f"\nFormula: {formula}")

    model5 = pf.feols(formula, data=df, vcov={'CRV1': 'user_id'})
    # summary() prints the table itself and returns None, so it is called
    # directly; wrapping it in print() would add a stray "None" line.
    model5.summary()

    kappa = model5.coef()['anchor_price']
    print(f"\nκ (anchoring effect) = {kappa:.6f}")
    print("Interpretation: How initial exposure price affects subsequent spend")
else:
    print("Anchor price not available - skipping")
    # Later cells check for None to know the model was skipped.
    model5 = None

## 9. Full Umbrella Model

In [None]:
print("=" * 80)
print("FULL UMBRELLA MODEL (All Mechanisms)")
print("=" * 80)

# Assemble the regressor list mechanism by mechanism; optional blocks are
# included only when their variables carry information (non-zero sum).
terms = ['C']

# Lagged clicks (awareness/adstock)
terms.extend([f'C_lag{i}' for i in range(1, MAX_LAG + 1)])

# Impressions and their lags (view-through), if available
if df['I'].sum() > 0:
    terms.append('I')
    terms.extend([f'I_lag{i}' for i in range(1, MAX_LAG + 1)])

# Friction and position interactions
terms.extend(['C_x_short', 'C_x_top'])

# Competition
terms.append('C_competition')

# Anchoring, if available
if df['anchor_price'].sum() > 0:
    terms.append('anchor_price')

formula = f"Y ~ {' + '.join(terms)} | user_id + year_week + vendor_id"
# The formula is echoed across several lines purely for readability.
print(f"\nFormula: Y ~ {' + '.join(terms[:5])} + ...")
print(f"         ... + {' + '.join(terms[5:])}")
print(f"         | user_id + year_week + vendor_id")

model_full = pf.feols(formula, data=df, vcov={'CRV1': 'user_id'})
# summary() prints the table itself and returns None, so it is called directly;
# wrapping it in print() would add a stray "None" line to the output.
model_full.summary()

In [None]:
# Coefficient-by-coefficient tests on the full umbrella model
print(f"\n{'=' * 80}")
print("COEFFICIENT TESTS (Full Umbrella Model)")
print("=" * 80)

# Estimates and standard errors of the full model, read by test_coef below.
coefs, ses = model_full.coef(), model_full.se()

def test_coef(name, expected_sign=None):
    """Print one coefficient of the full model with its t-ratio and stars.

    Reads the module-level ``coefs`` and ``ses`` containers.

    Args:
        name: Regressor name to look up.
        expected_sign: Optional '+' or '-'. When given, the line is tagged
            ``[expected]`` if the estimate has that strict sign and
            ``[unexpected]`` otherwise.
    """
    if name not in coefs:
        print(f"{name:20s}: not in model")
        return

    estimate, std_err = coefs[name], ses[name]
    t_stat = estimate / std_err

    # Stars follow the usual two-sided normal cut-offs (10% / 5% / 1%).
    magnitude = abs(t_stat)
    if magnitude > 2.58:
        stars = '***'
    elif magnitude > 1.96:
        stars = '**'
    elif magnitude > 1.65:
        stars = '*'
    else:
        stars = ''

    if not expected_sign:
        sign_note = ''
    elif (expected_sign == '+' and estimate > 0) or (expected_sign == '-' and estimate < 0):
        sign_note = ' [expected]'
    else:
        sign_note = ' [unexpected]'

    print(f"{name:20s}: {estimate:10.4f} (t={t_stat:6.2f}){stars}{sign_note}")

# One entry per mechanism: section title and the (regressor, expected sign)
# pairs to report. A sign of None means no directional prediction.
click_lag_tests = [(f'C_lag{k}', None) for k in range(1, MAX_LAG + 1)]
impression_tests = [('I', None)] + [(f'I_lag{k}', None) for k in range(1, MAX_LAG + 1)]

mechanism_sections = [
    ("Short-Run Activation", [('C', '+')]),
    ("Awareness/Adstock (Lagged Effects)", click_lag_tests),
    ("View-Through (Impressions)", impression_tests),
    ("Search-Cost/Position", [('C_x_short', None),   # friction
                              ('C_x_top', None)]),   # position
    ("Competition/Substitution", [('C_competition', '-')]),
    ("Anchoring", [('anchor_price', None)]),
]

for section_title, section_tests in mechanism_sections:
    print(f"\n--- {section_title} ---")
    for regressor, sign in section_tests:
        test_coef(regressor, sign)

## 10. Model Comparison

In [None]:
print("=" * 80)
print("MODEL COMPARISON")
print("=" * 80)


def _model_stat(model, *attr_names):
    """Return the first attribute in attr_names that is set on model, else None."""
    for attr in attr_names:
        value = getattr(model, attr, None)
        if value is not None:
            return value
    return None


# Models in numeric order; M2 and M5 are None when their inputs were
# unavailable and are left out of the table.
candidate_models = [
    ('M0: Baseline', model0),
    ('M1: + Lags', model1),
    ('M2: + Impressions', model2),
    ('M3: + Interactions', model3),
    ('M4: + Competition', model4),
    ('M5: + Anchoring', model5),
    ('Full', model_full),
]
models = {label: fitted for label, fitted in candidate_models if fitted is not None}

print(f"\n{'Model':<25s} {'β_C':<10s} {'R²':<10s} {'N':<10s}")
print("-" * 55)

for name, m in models.items():
    beta_c = m.coef()['C']
    # Try the public attribute names first, then the underscore-prefixed ones.
    # NOTE(review): pyfixest appears to store these as `_r2` / `_N` — confirm
    # against the installed version.
    r2 = _model_stat(m, 'r2', '_r2')
    n = _model_stat(m, 'nobs', '_N')
    r2_text = f"{float(r2):.4f}" if r2 is not None else 'N/A'
    n_text = f"{int(n):,}" if n is not None else 'N/A'
    print(f"{name:<25s} {beta_c:<10.4f} {r2_text:<10s} {n_text:<10s}")

## 11. Summary

In [None]:
print("=" * 80)
print("UMBRELLA REGRESSION SUMMARY")
print("=" * 80)

print("\n--- Key Findings ---")

# Estimates and standard errors of the full model, fetched once.
full_coefs = model_full.coef()
full_ses = model_full.se()

# Short-run: contemporaneous click effect, two-sided test at |t| > 1.96.
beta0 = full_coefs['C']
se0 = full_ses['C']
print(f"\n1. SHORT-RUN ACTIVATION")
print(f"   β_0 = {beta0:.4f} (t = {beta0/se0:.2f})")
if abs(beta0/se0) > 1.96:
    print(f"   → REJECT H0: Significant contemporaneous click effect")
else:
    print("   → FAIL TO REJECT H0: No significant contemporaneous click effect")

# Awareness: list every lag that is individually significant.
print(f"\n2. AWARENESS/ADSTOCK")
any_lag_sig = False
for lag in range(1, MAX_LAG + 1):
    name = f'C_lag{lag}'
    if name in full_coefs:
        c, s = full_coefs[name], full_ses[name]
        if abs(c/s) > 1.96:
            any_lag_sig = True
            print(f"   β_{lag} = {c:.4f} (t = {c/s:.2f}) - SIGNIFICANT")
if not any_lag_sig:
    print("   → No significant lagged effects (limited adstock)")

# Competition: the sign decides substitution vs complementarity, but only when
# the estimate is significant.
if 'C_competition' in full_coefs:
    rho = full_coefs['C_competition']
    se_rho = full_ses['C_competition']
    print(f"\n3. COMPETITION/SUBSTITUTION")
    print(f"   ρ = {rho:.4f} (t = {rho/se_rho:.2f})")
    if rho < 0 and abs(rho/se_rho) > 1.96:
        print("   → SUBSTITUTION: Clicks on other vendors reduce own vendor spend")
    elif rho > 0 and abs(rho/se_rho) > 1.96:
        print("   → COMPLEMENTARITY: Clicks on other vendors increase own vendor spend")
    else:
        print("   → No significant competition effect (cannot reject ρ = 0)")

print("\n--- Mechanism Summary ---")
print("Supported mechanisms (|t| > 1.96):")
any_supported = False
for name, c in full_coefs.items():
    s = full_ses[name]
    if abs(c/s) > 1.96:
        any_supported = True
        print(f"  ✓ {name}: {c:.4f}")
if not any_supported:
    print("  (none)")

In [None]:
# Persist the headline numbers as JSON next to the input data
import json

results_path = DATA_DIR / 'umbrella_results.json'

# Coefficients and standard errors are converted to plain floats so the JSON
# encoder receives built-in types.
umbrella_results = dict(
    gap_days=GAP_DAYS,
    n_obs=len(df),
    n_sessions=df['session_id'].nunique(),
    n_users=df['user_id'].nunique(),
    baseline_beta=model0.coef()['C'],
    full_model_coefs={k: float(v) for k, v in model_full.coef().items()},
    full_model_ses={k: float(v) for k, v in model_full.se().items()},
)

results_path.write_text(json.dumps(umbrella_results, indent=2))

print(f"\nResults saved to {results_path}")

In [None]:
# Closing banner: blank line, rule, message, rule.
print("", "=" * 80, "UMBRELLA REGRESSION COMPLETE", "=" * 80, sep="\n")