# 05 Robustness Suite

**Purpose:** Validate main regression results through multiple robustness checks.

**Tests:**
1. **Promoted-Halo Lower Bound:** Clicked-item vs other-vendor-items spend decomposition
2. **Delayed Conversion:** Window sweep (L = 0, 1, 2, 4 weeks)
3. **Position Bias:** Auction controls + rank stratification (SEPARATE from main model)
4. **View-Through:** Impressions effect
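5. **Placebo Test:** Regress past spend on future clicks
6. **New-to-Vendor Subsample:** Re-estimate the model on each (user, vendor) pair's first click week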

**Important Caveats:**
- Halo effect is a LOWER BOUND because organic purchases cannot be attributed to vendor
- Position/rank controls are potentially post-treatment/endogenous - treat as decomposition only

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

import pyfixest as pf

# Folder (relative to the working directory) that holds the parquet inputs
# and receives the JSON results file.
DATA_DIR = Path('data')
resolved_data_dir = DATA_DIR.resolve()
print(f"Data directory: {resolved_data_dir}")

## 1. Load Data

In [None]:
# Read the three parquet inputs used by every check in this notebook.
print("Loading data...")

panel_utv = pd.read_parquet(DATA_DIR.joinpath('panel_utv.parquet'))
promoted_events = pd.read_parquet(DATA_DIR.joinpath('promoted_events.parquet'))
purchases_mapped = pd.read_parquet(DATA_DIR.joinpath('purchases_mapped.parquet'))

# Report the row count of each input in a fixed order.
for label, frame in (
    ("Panel (u,t,v)", panel_utv),
    ("Promoted events", promoted_events),
    ("Purchases mapped", purchases_mapped),
):
    print(f"{label}: {len(frame):,} rows")

## 2. Promoted-Halo Lower Bound

Decompose Y into:
- Y_clicked: spend on clicked items
- Y_other: spend on other items from same vendor

**CRITICAL CAVEAT:** This analysis provides only a **LOWER BOUND** on the true halo effect because:
- Y_other only captures vendor purchases that are linkable to promoted events
- Organic vendor purchases (no prior click/impression) are UNOBSERVED
- True halo = Y_other (observed) + organic_halo (unobserved)

In [None]:
banner = "=" * 80
print(banner)
print("2. PROMOTED-HALO LOWER BOUND")
print(banner)
print("\nCAVEAT: This is a LOWER BOUND - organic vendor purchases are unobserved.")
print("Only purchases linkable to promoted events can be attributed to vendor.\n")

# Label every click with "<calendar year>_W<zero-padded ISO week>".
# NOTE(review): the ISO week is paired with the calendar year, so dates near a
# year boundary can get a label such as "2024_W01" for late December 2024.
# Presumably this matches how panel_utv['year_week'] was built upstream -
# confirm before changing either side.
click_ts = pd.to_datetime(promoted_events['click_time'])
iso_week_label = click_ts.dt.isocalendar().week.astype(str).str.zfill(2)
click_year = click_ts.dt.year

promoted_events['click_time'] = click_ts
promoted_events['year_week'] = click_year.astype(str) + '_W' + iso_week_label
promoted_events['year'] = click_year

# One row per distinct (user, vendor, product) appearing in the promoted
# events, flagged so a later left merge can tell matched rows from unmatched.
clicked_products = (
    promoted_events[['user_id', 'vendor_id', 'product_id']]
    .drop_duplicates()
    .assign(clicked=True)
)

print(f"Unique (user, vendor, product) clicked: {len(clicked_products):,}")

In [None]:
# Split post-click purchases into spend on the clicked item itself vs spend on
# other items attributed to the same clicked vendor, then aggregate each part
# to (user, week, vendor).
purchases_valid = purchases_mapped[purchases_mapped['is_post_click']].copy()
purchases_valid['purchase_time'] = pd.to_datetime(purchases_valid['purchase_time'])

# Week label "<calendar year>_W<zero-padded ISO week>".
# NOTE(review): ISO week is combined with the calendar year, which mislabels
# dates around a year boundary; presumably this mirrors the key used in
# panel_utv - confirm before changing.
purchases_valid['year_week'] = purchases_valid['purchase_time'].dt.isocalendar().week.astype(str).str.zfill(2)
purchases_valid['year'] = purchases_valid['purchase_time'].dt.year
purchases_valid['year_week'] = purchases_valid['year'].astype(str) + '_W' + purchases_valid['year_week']

# Flag purchases whose (user, clicked vendor, product) appears in
# clicked_products. That table carries no timestamp, so the flag means
# "this user clicked this product at some point", not "before this purchase".
purchases_valid = purchases_valid.merge(
    clicked_products,
    left_on=['user_id', 'click_vendor_id', 'product_id'],
    right_on=['user_id', 'vendor_id', 'product_id'],
    how='left'
)
# After the left merge the flag is True or NaN in an object column. Cast to a
# real boolean dtype: on an object column `~` would invert the Python bools
# bitwise (-2 / -1) instead of logically and break the "other items" mask.
purchases_valid['clicked'] = purchases_valid['clicked'].fillna(False).astype(bool)

# Aggregate to (u, t, v)
group_keys = ['user_id', 'year_week', 'click_vendor_id']
is_clicked_item = purchases_valid['clicked']

spend_clicked = purchases_valid[is_clicked_item].groupby(group_keys)['spend'].sum().reset_index()
spend_clicked.columns = ['user_id', 'year_week', 'vendor_id', 'Y_clicked']

spend_other = purchases_valid[~is_clicked_item].groupby(group_keys)['spend'].sum().reset_index()
spend_other.columns = ['user_id', 'year_week', 'vendor_id', 'Y_other']

print(f"Spend on clicked items: ${spend_clicked['Y_clicked'].sum():,.2f}")
print(f"Spend on other items: ${spend_other['Y_other'].sum():,.2f}")

In [None]:
# Attach the two spend components to the panel; panel rows with no matching
# purchases get zero spend for that component.
halo_keys = ['user_id', 'year_week', 'vendor_id']
panel_halo = (
    panel_utv
    .merge(spend_clicked, on=halo_keys, how='left')
    .merge(spend_other, on=halo_keys, how='left')
    .fillna({'Y_clicked': 0, 'Y_other': 0})
)

# Clicked-item spend on clicks, with user, week and vendor fixed effects and
# standard errors clustered by user.
print("\n--- Effect on Clicked-Item Spend ---")
model_clicked = pf.feols(
    "Y_clicked ~ C | user_id + year_week + vendor_id",
    data=panel_halo,
    vcov={'CRV1': 'user_id'},
)
coef_clicked = model_clicked.coef()['C']
se_clicked = model_clicked.se()['C']
print(f"β (clicked items) = {coef_clicked:.4f} (SE = {se_clicked:.4f})")

# Same specification with other-item spend as the outcome.
print("\n--- Effect on Other-Item Spend (Promoted-Halo Lower Bound) ---")
model_other = pf.feols(
    "Y_other ~ C | user_id + year_week + vendor_id",
    data=panel_halo,
    vcov={'CRV1': 'user_id'},
)
coef_other = model_other.coef()['C']
se_other = model_other.se()['C']
print(f"β (other items) = {coef_other:.4f} (SE = {se_other:.4f})")

print("\n*** INTERPRETATION ***")
print("β_other > 0 indicates promoted-halo effect (click leads to other vendor purchases)")
print("This is a LOWER BOUND: organic vendor purchases after ad exposure are unobserved.")

## 3. Delayed Conversion (Window Sweep)

$$Y^{(L)}_{utv} = \sum_{k=0}^{L} Y_{u,t+k,v}$$

for L ∈ {0, 1, 2, 4} weeks

In [None]:
banner = "=" * 80
print(banner)
print("3. DELAYED CONVERSION (WINDOW SWEEP)")
print(banner)

# Give every week label an integer position so "t + k" can be computed by
# arithmetic. Labels are sorted as strings.
# NOTE(review): positions are consecutive over the weeks PRESENT in the panel;
# if a calendar week is absent everywhere, idx + 1 skips over it - confirm the
# panel has no such gap.
weeks_ordered = sorted(panel_utv['year_week'].unique())
idx_to_week = dict(enumerate(weeks_ordered))
week_to_idx = {label: position for position, label in idx_to_week.items()}

panel_utv['week_idx'] = panel_utv['year_week'].map(week_to_idx)

first_week, last_week = weeks_ordered[0], weeks_ordered[-1]
print(f"Weeks available: {len(weeks_ordered)}")
print(f"Range: {first_week} to {last_week}")

In [None]:
# Window sweep: for each horizon L, build Y^(L) = sum of spend over weeks
# t..t+L for every (user, week, vendor) row and re-estimate the click effect.
WINDOWS = [0, 1, 2, 4]  # weeks
window_results = []

# Total spend per (user, vendor, week_idx). It does not depend on L, so it is
# computed once rather than on every iteration.
spend_by_week = panel_utv.groupby(['user_id', 'vendor_id', 'week_idx'])['Y'].sum().reset_index()

for L in tqdm(WINDOWS, desc="Window sweep"):
    panel_window = panel_utv.copy()

    # Bring in spend from week t+k by shifting the lookup table's week index
    # back by k and merging on the shifted key. (user, vendor, week) cells with
    # no row in the panel contribute zero.
    # NOTE(review): rows within L weeks of the last panel week also get zeros
    # for the weeks that fall past the end of the data, so their windows are
    # truncated; consider restricting the sample when comparing β across L.
    for week_offset in range(L + 1):
        offset_col = f'Y_offset_{week_offset}'
        spend_offset = spend_by_week.rename(columns={'Y': offset_col})
        spend_offset['week_idx'] = spend_offset['week_idx'] - week_offset

        panel_window = panel_window.merge(
            spend_offset,
            on=['user_id', 'vendor_id', 'week_idx'],
            how='left'
        )
        panel_window[offset_col] = panel_window[offset_col].fillna(0)

    # Sum across offsets
    offset_cols = [f'Y_offset_{i}' for i in range(L + 1)]
    panel_window[f'Y_L{L}'] = panel_window[offset_cols].sum(axis=1)
    mean_window_spend = panel_window[f'Y_L{L}'].mean()

    # Regression. A failure for one horizon is reported and skipped so the
    # remaining horizons still run.
    try:
        model = pf.feols(f"Y_L{L} ~ C | user_id + year_week + vendor_id",
                         data=panel_window, vcov={'CRV1': 'user_id'})
        beta = model.coef()['C']
        se = model.se()['C']

        window_results.append({
            'L_weeks': L,
            'beta': beta,
            'se': se,
            'mean_Y': mean_window_spend
        })

        print(f"L={L} weeks: β = {beta:.4f} (SE = {se:.4f}), mean Y^(L) = ${mean_window_spend:.2f}")
    except Exception as e:
        print(f"L={L} weeks: Error - {e}")

In [None]:
# Tabulate the per-horizon estimates collected in window_results.
print("\n--- Window Sweep Summary ---")
window_df = pd.DataFrame(window_results)
window_table = window_df.to_string(index=False)
print(window_table)

print("\nInterpretation: β increasing with L indicates delayed conversion effect")

## 4. Position Bias (Rank Stratification)

**IMPORTANT CAVEAT - POST-TREATMENT BIAS:**
- Auction controls (rank, pacing, quality, bid) are determined AFTER the decision to run ads
- These are potentially endogenous/post-treatment variables
- Coefficients on controls should be interpreted as **decomposition/association only**, NOT causal effects
- This section is SEPARATE from the main model for this reason

In [None]:
print("=" * 80)
print("4. POSITION BIAS (RANK STRATIFICATION)")
print("=" * 80)
print("\nCAVEAT: Auction controls are POST-TREATMENT/ENDOGENOUS.")
print("Coefficients are ASSOCIATIONS, not causal effects.\n")

# Work on a copy so the zero-filled controls do not leak into panel_utv.
df = panel_utv.copy()

# Fill missing controls with 0 for whichever control columns exist.
# NOTE(review): 0 is also used for a missing avg_rank, where it may read as
# "better than rank 1" rather than "no auction data" - confirm this is intended.
control_cols = ['avg_rank', 'share_rank1', 'avg_quality', 'avg_pacing', 'avg_final_bid']
available_controls = [c for c in control_cols if c in df.columns]
for col in available_controls:
    df[col] = df[col].fillna(0)

# Model with rank controls
print("\n--- Decomposition Model with Auction Controls (NOT CAUSAL) ---")

if available_controls:
    formula = f"Y ~ C + {' + '.join(available_controls)} | user_id + year_week + vendor_id"
    print(f"Formula: {formula}")
    model_controls = pf.feols(formula, data=df, vcov={'CRV1': 'user_id'})
    # summary() prints the table itself and returns None, so it is called
    # directly; wrapping it in print() would add a stray "None" line.
    model_controls.summary()

In [None]:
# Re-estimate the baseline model separately on rows dominated by rank-1
# placements and on rows with some, but at most half, rank-1 placements.
print("\n--- Stratification by Rank ---")

if 'share_rank1' in df.columns:
    rank1_share = df['share_rank1']
    top_mask = rank1_share > 0.5
    # Rows with share_rank1 == 0 are excluded from both strata.
    low_mask = (rank1_share > 0) & (rank1_share <= 0.5)

    df_top_rank = df[top_mask].copy()
    df_low_rank = df[low_mask].copy()

    print(f"Top-rank subsample (share_rank1 > 0.5): {len(df_top_rank):,} obs")
    print(f"Lower-rank subsample (0 < share_rank1 <= 0.5): {len(df_low_rank):,} obs")

    # Each stratum is only estimated when it has more than 100 rows.
    if len(df_top_rank) > 100:
        model_top = pf.feols(
            "Y ~ C | user_id + year_week + vendor_id",
            data=df_top_rank,
            vcov={'CRV1': 'user_id'},
        )
        top_beta, top_se = model_top.coef()['C'], model_top.se()['C']
        print(f"\nTop-rank β = {top_beta:.4f} (SE = {top_se:.4f})")

    if len(df_low_rank) > 100:
        model_low = pf.feols(
            "Y ~ C | user_id + year_week + vendor_id",
            data=df_low_rank,
            vcov={'CRV1': 'user_id'},
        )
        low_beta, low_se = model_low.coef()['C'], model_low.se()['C']
        print(f"Lower-rank β = {low_beta:.4f} (SE = {low_se:.4f})")

## 5. View-Through Effect (Impressions)

In [None]:
print("=" * 80)
print("5. VIEW-THROUGH EFFECT (IMPRESSIONS)")
print("=" * 80)

df = panel_utv.copy()

# The impressions column 'I' is optional; the check is skipped when absent.
if 'I' in df.columns:
    print(f"\nImpressions available: mean = {df['I'].mean():.1f}, max = {df['I'].max()}")

    # Baseline specification with impressions added next to clicks.
    model_viewthrough = pf.feols("Y ~ C + I | user_id + year_week + vendor_id",
                                  data=df, vcov={'CRV1': 'user_id'})
    print("\n--- Model with Clicks + Impressions ---")
    # summary() prints the table itself and returns None, so it is called
    # directly; wrapping it in print() would add a stray "None" line.
    model_viewthrough.summary()

    beta_C = model_viewthrough.coef()['C']
    beta_I = model_viewthrough.coef()['I']
    print(f"\nβ_C (click effect) = {beta_C:.4f}")
    print(f"β_I (view-through) = {beta_I:.4f}")
else:
    print("Impressions not available in panel")

## 6. Placebo Test

Regress **past** spend on **future** clicks:
$$Y_{u,t-1,v} = \alpha_u + \lambda_t + \phi_v + \beta^{pl} \cdot C_{utv} + \varepsilon$$

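A large, statistically significant $\beta^{pl}$ indicates selection or anticipation: clicks in week $t$ cannot causally affect spend that already happened in week $t-1$.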

In [None]:
print("=" * 80)
print("6. PLACEBO TEST (Past Spend ~ Future Clicks)")
print("=" * 80)

# Create lagged spend (Y from t-1)
df = panel_utv.copy()

# Sort by user, vendor, week so shift(1) steps back in time within each pair.
df = df.sort_values(['user_id', 'vendor_id', 'week_idx'])

# Lag Y within (user, vendor). shift(1) returns the previous ROW of the pair,
# which is week t-1 only when the pair has no missing weeks. Keep the lag only
# when the previous row is exactly one week earlier; otherwise Y_lag1 would
# silently hold spend from an older week.
# NOTE(review): rows whose previous week is absent are dropped here rather than
# treated as zero spend - confirm which convention matches the panel build.
by_pair = df.groupby(['user_id', 'vendor_id'])
prev_spend = by_pair['Y'].shift(1)
prev_week_idx = by_pair['week_idx'].shift(1)
df['Y_lag1'] = prev_spend.where(df['week_idx'] - prev_week_idx == 1)

# Drop rows without a valid one-week lag (first row of each pair, or a gap).
df_placebo = df[df['Y_lag1'].notna()].copy()

print(f"Placebo sample: {len(df_placebo):,} observations")

In [None]:
# Placebo regression: past spend on current clicks, same fixed effects and
# user-clustered standard errors as the other models in this notebook.
print("\n--- Placebo: Y_{t-1} ~ C_t ---")
model_placebo = pf.feols("Y_lag1 ~ C | user_id + year_week + vendor_id",
                          data=df_placebo, vcov={'CRV1': 'user_id'})
# summary() prints the table itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
model_placebo.summary()

beta_placebo = model_placebo.coef()['C']
se_placebo = model_placebo.se()['C']
t_placebo = beta_placebo / se_placebo

print(f"\nβ^pl = {beta_placebo:.4f} (SE = {se_placebo:.4f})")
print(f"t-stat = {t_placebo:.2f}")

# Two-sided check of the t-statistic against 1.96.
if abs(t_placebo) < 1.96:
    print("\n✓ Placebo test PASSED: No significant relationship between past spend and future clicks")
else:
    print("\n⚠ Placebo test WARNING: Significant relationship detected - possible selection/anticipation")

## 7. New-to-Vendor Subsample

In [None]:
banner = "=" * 80
print(banner)
print("7. NEW-TO-VENDOR SUBSAMPLE")
print(banner)

# Earliest week index with at least one click, per (user, vendor).
# NOTE(review): "first" means first within this panel; clicks or purchases
# before the panel starts are not visible here.
clicked_rows = panel_utv[panel_utv['C'] > 0]
first_click = (
    clicked_rows
    .groupby(['user_id', 'vendor_id'], as_index=False)['week_idx']
    .min()
    .rename(columns={'week_idx': 'first_click_week'})
)

# Attach the first-click week to every panel row; pairs with no click get NaN.
df = panel_utv.merge(first_click, on=['user_id', 'vendor_id'], how='left')

# Keep only the row of the first-click week itself.
is_first_click_week = df['week_idx'] == df['first_click_week']
df_new = df[is_first_click_week].copy()

share_of_panel = len(df_new)/len(panel_utv)*100
print(f"New-to-vendor observations: {len(df_new):,}")
print(f"Share of total: {share_of_panel:.1f}%")

In [None]:
# Baseline specification re-estimated on the new-to-vendor rows; skipped when
# the subsample has 100 rows or fewer.
print("\n--- Model on New-to-Vendor Subsample ---")
if len(df_new) > 100:
    model_new = pf.feols("Y ~ C | user_id + year_week + vendor_id",
                          data=df_new, vcov={'CRV1': 'user_id'})
    # summary() prints the table itself and returns None, so it is called
    # directly; wrapping it in print() would add a stray "None" line.
    model_new.summary()

    print(f"\nβ (new-to-vendor) = {model_new.coef()['C']:.4f}")
    print("Interpretation: Effect for users with no prior vendor relationship")
else:
    print("Insufficient observations for new-to-vendor analysis")

## 8. Robustness Summary

In [None]:
# Recap of the headline coefficient from every check. Optional models are
# looked up by name because their cells may have been skipped.
# NOTE(review): in a live notebook a model left over from an earlier run would
# still be found by these name checks.
banner = "=" * 80
print(banner)
print("ROBUSTNESS SUITE SUMMARY")
print(banner)

print("\n1. PROMOTED-HALO LOWER BOUND")
print("   (Organic vendor purchases are unobserved - true halo may be higher)")
halo_beta_clicked = model_clicked.coef()['C']
halo_beta_other = model_other.coef()['C']
print(f"   β (clicked items): {halo_beta_clicked:.4f}")
print(f"   β (other items):   {halo_beta_other:.4f}")

print("\n2. WINDOW SWEEP")
for sweep_row in window_results:
    print(f"   L={sweep_row['L_weeks']} weeks: β = {sweep_row['beta']:.4f}")

print("\n3. POSITION BIAS (Decomposition Only - NOT Causal)")
defined_names = dir()
if 'model_top' in defined_names:
    print(f"   Top-rank β: {model_top.coef()['C']:.4f}")
if 'model_low' in defined_names:
    print(f"   Lower-rank β: {model_low.coef()['C']:.4f}")

print("\n4. VIEW-THROUGH")
if 'model_viewthrough' in defined_names:
    print(f"   β_C (clicks): {model_viewthrough.coef()['C']:.4f}")
    print(f"   β_I (impressions): {model_viewthrough.coef()['I']:.4f}")

print("\n5. PLACEBO TEST")
placebo_t = beta_placebo/se_placebo
placebo_status = 'PASSED' if abs(placebo_t) < 1.96 else 'WARNING'
print(f"   β^pl = {beta_placebo:.4f} (t = {placebo_t:.2f})")
print(f"   Status: {placebo_status}")

print("\n6. NEW-TO-VENDOR")
if 'model_new' in defined_names:
    print(f"   β = {model_new.coef()['C']:.4f}")

In [None]:
# Save robustness results
import json

# Values coming out of numpy/pandas are converted to native Python types
# first. Otherwise the numpy boolean from the placebo check (and any numpy
# scalar that is not a float subclass) would fall through to `default=str`
# and be written as a quoted string instead of a JSON boolean/number.
robustness_results = {
    'promoted_halo_lower_bound': {
        'beta_clicked': float(model_clicked.coef()['C']),
        'beta_other': float(model_other.coef()['C']),
        'caveat': 'LOWER BOUND - organic vendor purchases are unobserved'
    },
    'window_sweep': [
        {
            key: (int(value) if key == 'L_weeks' else float(value))
            for key, value in row.items()
        }
        for row in window_results
    ],
    'placebo': {
        'beta': float(beta_placebo),
        'se': float(se_placebo),
        'passed': bool(abs(beta_placebo / se_placebo) < 1.96)
    },
    'position_bias_caveat': 'Auction controls are post-treatment/endogenous - associations only'
}

results_path = DATA_DIR / 'robustness_results.json'
with open(results_path, 'w') as f:
    # default=str stays as a last-resort fallback so an unexpected value type
    # does not abort the save.
    json.dump(robustness_results, f, indent=2, default=str)

print(f"\nResults saved to {results_path}")

In [None]:
# Closing banner marking the end of the notebook run.
banner = "=" * 80
print("\n" + banner)
print("ROBUSTNESS SUITE COMPLETE")
print(banner)