# Funnel Analysis: Auction → Impression → Click

## Objective
Analyze conversion funnel from bids through winners to impressions to clicks.

## Data Period
14 days (2025-09-02 to 2025-09-08), 0.1% user sample

## Join Strategy
Composite key: AUCTION_ID + PRODUCT_ID + CAMPAIGN_ID + VENDOR_ID

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import warnings
from scipy.stats import spearmanr, chi2_contingency
warnings.filterwarnings('ignore')

print("="*80)
print("FUNNEL ANALYSIS: AUCTION → IMPRESSION → CLICK")
print("="*80)
print()

print("Loading data...")
print("-" * 80)
df_bids = pd.read_parquet('data/raw_auctions_results_20251011.parquet')
print(f"Loaded AUCTIONS_RESULTS (bids): {len(df_bids):,} rows")

df_auctions = pd.read_parquet('data/raw_auctions_users_20251011.parquet')
print(f"Loaded AUCTIONS_USERS: {len(df_auctions):,} rows")

df_impressions = pd.read_parquet('data/raw_impressions_20251011.parquet')
print(f"Loaded IMPRESSIONS: {len(df_impressions):,} rows")

df_clicks = pd.read_parquet('data/raw_clicks_20251011.parquet')
print(f"Loaded CLICKS: {len(df_clicks):,} rows")

df_catalog = pd.read_parquet('data/catalog_20251011.parquet')
print(f"Loaded CATALOG: {len(df_catalog):,} rows")

print("\nMerging auction metadata...")
df_bids = df_bids.drop(columns=['CREATED_AT'])
df_bids = pd.merge(df_bids,
                   df_auctions[['AUCTION_ID', 'CREATED_AT', 'PLACEMENT', 'OPAQUE_USER_ID']],
                   on='AUCTION_ID', how='left')

df_bids['datetime'] = pd.to_datetime(df_bids['CREATED_AT'])
df_bids['date'] = df_bids['datetime'].dt.date
df_bids['hour'] = df_bids['datetime'].dt.hour
df_bids['day_of_week'] = df_bids['datetime'].dt.dayofweek
df_bids['week'] = df_bids['datetime'].dt.isocalendar().week

print(f"Total bids: {len(df_bids):,}")
print(f"Total winners: {df_bids['IS_WINNER'].sum():,} ({df_bids['IS_WINNER'].mean()*100:.2f}%)")
print()

FUNNEL ANALYSIS: AUCTION → IMPRESSION → CLICK

Loading data...
--------------------------------------------------------------------------------
Loaded AUCTIONS_RESULTS (bids): 18,838,670 rows
Loaded AUCTIONS_USERS: 413,457 rows
Loaded IMPRESSIONS: 533,146 rows
Loaded CLICKS: 16,706 rows
Loaded CATALOG: 2,007,695 rows

Merging auction metadata...
Total bids: 18,840,598
Total winners: 15,510,672 (82.33%)



## Section 1: Funnel Construction

In [2]:
print("="*80)
print("SECTION 1: FUNNEL CONSTRUCTION")
print("="*80)
print()

print("1.1 STAGE VOLUMES")
print("-" * 80)
n_bids = len(df_bids)
n_winners = df_bids['IS_WINNER'].sum()
n_impressions = len(df_impressions)
n_clicks = len(df_clicks)

print(f"Stage 1 - Bids: {n_bids:,}")
print(f"Stage 2 - Winners: {n_winners:,} ({n_winners/n_bids*100:.2f}% of bids)")
print(f"Stage 3 - Impressions: {n_impressions:,} ({n_impressions/n_winners*100:.2f}% of winners)")
print(f"Stage 4 - Clicks: {n_clicks:,} ({n_clicks/n_impressions*100:.2f}% of impressions)")

print("\n1.2 COMPOSITE KEY JOINS")
print("-" * 80)
print("Join keys: AUCTION_ID + PRODUCT_ID + CAMPAIGN_ID + VENDOR_ID")

winners = df_bids[df_bids['IS_WINNER'] == True].copy()
print(f"\nWinners: {len(winners):,}")

winners_imp = pd.merge(winners,
                       df_impressions[['AUCTION_ID', 'PRODUCT_ID', 'CAMPAIGN_ID', 'VENDOR_ID', 'INTERACTION_ID', 'OCCURRED_AT']],
                       on=['AUCTION_ID', 'PRODUCT_ID', 'CAMPAIGN_ID', 'VENDOR_ID'],
                       how='left',
                       indicator='_merge_imp')
winners_imp['has_impression'] = (winners_imp['_merge_imp'] == 'both').astype(int)
print(f"Winners matched to impressions: {winners_imp['has_impression'].sum():,} ({winners_imp['has_impression'].mean()*100:.2f}%)")
print(f"Winners without impressions: {(~winners_imp['has_impression'].astype(bool)).sum():,}")

impressions_clicks = pd.merge(df_impressions,
                              df_clicks[['AUCTION_ID', 'PRODUCT_ID', 'CAMPAIGN_ID', 'VENDOR_ID', 'INTERACTION_ID']].rename(columns={'INTERACTION_ID': 'CLICK_ID'}),
                              on=['AUCTION_ID', 'PRODUCT_ID', 'CAMPAIGN_ID', 'VENDOR_ID'],
                              how='left',
                              indicator='_merge_click')
impressions_clicks['has_click'] = (impressions_clicks['_merge_click'] == 'both').astype(int)
print(f"\nImpressions matched to clicks: {impressions_clicks['has_click'].sum():,} ({impressions_clicks['has_click'].mean()*100:.2f}%)")
print(f"Impressions without clicks: {(~impressions_clicks['has_click'].astype(bool)).sum():,}")

print("\n1.3 OVERALL CONVERSION RATES")
print("-" * 80)
bid_to_win = n_winners / n_bids
win_to_imp = winners_imp['has_impression'].sum() / len(winners_imp)
imp_to_click = impressions_clicks['has_click'].sum() / len(impressions_clicks)
bid_to_click = (winners_imp['has_impression'].sum() * impressions_clicks['has_click'].sum() / len(impressions_clicks)) / n_bids

print(f"Bid → Win: {bid_to_win:.4f} ({bid_to_win*100:.2f}%)")
print(f"Win → Impression: {win_to_imp:.4f} ({win_to_imp*100:.2f}%)")
print(f"Impression → Click: {imp_to_click:.4f} ({imp_to_click*100:.2f}%)")
print(f"Bid → Click (end-to-end): {bid_to_click:.6f} ({bid_to_click*100:.4f}%)")

print("\n1.4 FUNNEL BY PLACEMENT")
print("-" * 80)
placement_funnel = df_bids.groupby('PLACEMENT').agg({
    'AUCTION_ID': 'count',
    'IS_WINNER': 'sum'
}).reset_index()
placement_funnel.columns = ['PLACEMENT', 'n_bids', 'n_wins']
placement_funnel['win_rate'] = placement_funnel['n_wins'] / placement_funnel['n_bids']

winners_imp_placement = winners_imp.groupby('PLACEMENT')['has_impression'].agg(['sum', 'count']).reset_index()
winners_imp_placement.columns = ['PLACEMENT', 'n_impressions', 'n_winners']
winners_imp_placement['imp_rate'] = winners_imp_placement['n_impressions'] / winners_imp_placement['n_winners']

placement_funnel = pd.merge(placement_funnel, winners_imp_placement, on='PLACEMENT')

print("Placement funnel:")
for _, row in placement_funnel.iterrows():
    print(f"\nPlacement {row['PLACEMENT']}:")
    print(f"  Bids: {row['n_bids']:,}")
    print(f"  Wins: {row['n_wins']:,} (rate: {row['win_rate']:.4f})")
    print(f"  Impressions: {row['n_impressions']:,} (rate: {row['imp_rate']:.4f})")

print("\n1.5 FUNNEL BY TIME")
print("-" * 80)
hourly_funnel = df_bids.groupby('hour').agg({
    'AUCTION_ID': 'count',
    'IS_WINNER': 'sum'
}).reset_index()
hourly_funnel.columns = ['hour', 'n_bids', 'n_wins']
hourly_funnel['win_rate'] = hourly_funnel['n_wins'] / hourly_funnel['n_bids']

print("Hourly win rates:")
for _, row in hourly_funnel.iterrows():
    print(f"  Hour {row['hour']:2d}: {row['n_bids']:7,} bids, {row['n_wins']:7,} wins, rate: {row['win_rate']:.4f}")

daily_funnel = df_bids.groupby('day_of_week').agg({
    'AUCTION_ID': 'count',
    'IS_WINNER': 'sum'
}).reset_index()
daily_funnel.columns = ['day_of_week', 'n_bids', 'n_wins']
daily_funnel['win_rate'] = daily_funnel['n_wins'] / daily_funnel['n_bids']

print("\nDaily win rates:")
days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
for _, row in daily_funnel.iterrows():
    day_name = days[int(row['day_of_week'])] if int(row['day_of_week']) < 7 else f"Day{int(row['day_of_week'])}"
    print(f"  {day_name}: {row['n_bids']:,} bids, {row['n_wins']:,} wins, rate: {row['win_rate']:.4f}")

print("\n1.6 PRODUCTS IN MULTIPLE CAMPAIGNS")
print("-" * 80)
product_campaigns = df_bids.groupby('PRODUCT_ID')['CAMPAIGN_ID'].nunique()
print(f"Total unique products: {len(product_campaigns):,}")
print(f"Products in 1 campaign: {(product_campaigns == 1).sum():,} ({(product_campaigns == 1).mean()*100:.2f}%)")
print(f"Products in 2+ campaigns: {(product_campaigns >= 2).sum():,} ({(product_campaigns >= 2).mean()*100:.2f}%)")
print(f"Products in 5+ campaigns: {(product_campaigns >= 5).sum():,} ({(product_campaigns >= 5).mean()*100:.2f}%)")
print(f"Max campaigns for single product: {product_campaigns.max()}")

print()

SECTION 1: FUNNEL CONSTRUCTION

1.1 STAGE VOLUMES
--------------------------------------------------------------------------------
Stage 1 - Bids: 18,840,598
Stage 2 - Winners: 15,510,672 (82.33% of bids)
Stage 3 - Impressions: 533,146 (3.44% of winners)
Stage 4 - Clicks: 16,706 (3.13% of impressions)

1.2 COMPOSITE KEY JOINS
--------------------------------------------------------------------------------
Join keys: AUCTION_ID + PRODUCT_ID + CAMPAIGN_ID + VENDOR_ID

Winners: 15,510,672
Winners matched to impressions: 533,225 (3.44%)
Winners without impressions: 14,981,408

Impressions matched to clicks: 16,171 (3.03%)
Impressions without clicks: 518,199

1.3 OVERALL CONVERSION RATES
--------------------------------------------------------------------------------
Bid → Win: 0.8233 (82.33%)
Win → Impression: 0.0344 (3.44%)
Impression → Click: 0.0303 (3.03%)
Bid → Click (end-to-end): 0.000856 (0.0856%)

1.4 FUNNEL BY PLACEMENT
--------------------------------------------------------------

ValueError: Unknown format code 'd' for object of type 'float'

## Section 2: Stage Transitions

In [3]:
print("="*80)
print("SECTION 2: STAGE TRANSITIONS")
print("="*80)
print()

print("2.1 WINNERS → IMPRESSIONS")
print("-" * 80)
print(f"Winners with impressions: {winners_imp['has_impression'].sum():,}")
print(f"Winners without impressions: {(~winners_imp['has_impression'].astype(bool)).sum():,}")

with_imp = winners_imp[winners_imp['has_impression'] == 1]
without_imp = winners_imp[winners_imp['has_impression'] == 0]

print("\nComparison: Winners with vs without impressions")
print(f"  With impressions:")
print(f"    Mean FINAL_BID: {with_imp['FINAL_BID'].mean():.2f}")
print(f"    Mean QUALITY: {with_imp['QUALITY'].mean():.6f}")
print(f"    Mean RANKING: {with_imp['RANKING'].mean():.2f}")
print(f"    Mean PACING: {with_imp['PACING'].mean():.4f}")
print(f"  Without impressions:")
print(f"    Mean FINAL_BID: {without_imp['FINAL_BID'].mean():.2f}")
print(f"    Mean QUALITY: {without_imp['QUALITY'].mean():.6f}")
print(f"    Mean RANKING: {without_imp['RANKING'].mean():.2f}")
print(f"    Mean PACING: {without_imp['PACING'].mean():.4f}")

print("\nImpression rate by placement:")
imp_by_placement = winners_imp.groupby('PLACEMENT')['has_impression'].mean()
for placement, rate in imp_by_placement.items():
    print(f"  Placement {placement}: {rate:.4f}")

print("\nImpression rate by vendor (top 20):")
imp_by_vendor = winners_imp.groupby('VENDOR_ID').agg({
    'has_impression': ['sum', 'count', 'mean']
}).reset_index()
imp_by_vendor.columns = ['VENDOR_ID', 'n_impressions', 'n_winners', 'imp_rate']
imp_by_vendor = imp_by_vendor[imp_by_vendor['n_winners'] >= 100].sort_values('n_impressions', ascending=False).head(20)
for _, row in imp_by_vendor.iterrows():
    print(f"  Vendor {row['VENDOR_ID'][:20]}...: {row['n_impressions']:,} impressions / {row['n_winners']:,} winners = {row['imp_rate']:.4f}")

print("\n2.2 IMPRESSIONS → CLICKS")
print("-" * 80)
# Merge impressions with winners to get bid/quality data
impressions_full = pd.merge(impressions_clicks,
                            winners[['AUCTION_ID', 'PRODUCT_ID', 'CAMPAIGN_ID', 'VENDOR_ID', 'FINAL_BID', 'QUALITY', 'RANKING', 'PLACEMENT', 'hour', 'day_of_week']],
                            on=['AUCTION_ID', 'PRODUCT_ID', 'CAMPAIGN_ID', 'VENDOR_ID'],
                            how='left')

print(f"Impressions with clicks: {impressions_full['has_click'].sum():,}")
print(f"Impressions without clicks: {(~impressions_full['has_click'].astype(bool)).sum():,}")
print(f"Overall CTR: {impressions_full['has_click'].mean():.4f}")

with_click = impressions_full[impressions_full['has_click'] == 1]
without_click = impressions_full[impressions_full['has_click'] == 0]

print("\nComparison: Impressions with vs without clicks")
print(f"  With clicks:")
print(f"    Mean FINAL_BID: {with_click['FINAL_BID'].mean():.2f}")
print(f"    Mean QUALITY: {with_click['QUALITY'].mean():.6f}")
print(f"    Mean RANKING: {with_click['RANKING'].mean():.2f}")
print(f"  Without clicks:")
print(f"    Mean FINAL_BID: {without_click['FINAL_BID'].mean():.2f}")
print(f"    Mean QUALITY: {without_click['QUALITY'].mean():.6f}")
print(f"    Mean RANKING: {without_click['RANKING'].mean():.2f}")

print("\nCTR by placement:")
ctr_by_placement = impressions_full.groupby('PLACEMENT')['has_click'].agg(['sum', 'count', 'mean'])
ctr_by_placement.columns = ['n_clicks', 'n_impressions', 'ctr']
for placement, row in ctr_by_placement.iterrows():
    print(f"  Placement {placement}: {row['n_clicks']:,} / {row['n_impressions']:,} = {row['ctr']:.4f}")

print("\nCTR by ranking:")
ctr_by_ranking = impressions_full.groupby('RANKING')['has_click'].agg(['sum', 'count', 'mean']).reset_index()
ctr_by_ranking.columns = ['RANKING', 'n_clicks', 'n_impressions', 'ctr']
ctr_by_ranking = ctr_by_ranking[ctr_by_ranking['n_impressions'] >= 10].head(20)
for _, row in ctr_by_ranking.iterrows():
    print(f"  Ranking {row['RANKING']:.0f}: {row['n_clicks']:.0f} / {row['n_impressions']:.0f} = {row['ctr']:.4f}")

print("\nCTR by hour of day:")
ctr_by_hour = impressions_full.groupby('hour')['has_click'].agg(['sum', 'count', 'mean'])
ctr_by_hour.columns = ['n_clicks', 'n_impressions', 'ctr']
for hour, row in ctr_by_hour.iterrows():
    print(f"  Hour {hour:2d}: {row['n_clicks']:5.0f} / {row['n_impressions']:6.0f} = {row['ctr']:.4f}")

print("\nCTR by day of week:")
ctr_by_day = impressions_full.groupby('day_of_week')['has_click'].agg(['sum', 'count', 'mean'])
ctr_by_day.columns = ['n_clicks', 'n_impressions', 'ctr']
days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
for day_num, row in ctr_by_day.iterrows():
    day_name = days[int(day_num)] if int(day_num) < 7 else f"Day{int(day_num)}"
    print(f"  {day_name}: {row['n_clicks']:5.0f} / {row['n_impressions']:6.0f} = {row['ctr']:.4f}")

print()

SECTION 2: STAGE TRANSITIONS

2.1 WINNERS → IMPRESSIONS
--------------------------------------------------------------------------------
Winners with impressions: 533,225
Winners without impressions: 14,981,408

Comparison: Winners with vs without impressions
  With impressions:
    Mean FINAL_BID: 11.76
    Mean QUALITY: 0.027733
    Mean RANKING: 12.37
    Mean PACING: 0.9112
  Without impressions:
    Mean FINAL_BID: 12.57
    Mean QUALITY: 0.037216
    Mean RANKING: 22.24
    Mean PACING: 0.9210

Impression rate by placement:
  Placement 1: 0.2465
  Placement 2: 0.0342
  Placement 3: 0.0361
  Placement 4: 0.0203
  Placement 5: 0.0021

Impression rate by vendor (top 20):
  Vendor 018f84c8555a76c896d6...: 970 impressions / 143,835 winners = 0.0067
  Vendor 064baf18d3cb7bdb8224...: 866 impressions / 3,262 winners = 0.2655
  Vendor 01909527cac5703f8206...: 738 impressions / 8,103 winners = 0.0911
  Vendor 064b323954177b548224...: 494 impressions / 7,294 winners = 0.0677
  Vendor 0191d7

ValueError: Unknown format code 'd' for object of type 'float'

## Section 3: Hypothesis Testing

In [4]:
print("="*80)
print("SECTION 3: HYPOTHESIS TESTING")
print("="*80)
print()

results = []

print("AUCTION DESIGN HYPOTHESES (H1-H5)")
print("-" * 80)

# H1: Quality score → impression rate
corr_quality_imp, p_quality_imp = spearmanr(winners_imp['QUALITY'], winners_imp['has_impression'])
print(f"H1: Quality → Impression rate")
print(f"  Spearman correlation: {corr_quality_imp:.4f}, p-value: {p_quality_imp:.6f}")
results.append({'hypothesis': 'H1', 'correlation': corr_quality_imp, 'p_value': p_quality_imp})

# H2: Final bid → impression rate
corr_bid_imp, p_bid_imp = spearmanr(winners_imp['FINAL_BID'], winners_imp['has_impression'])
print(f"\nH2: Final bid → Impression rate")
print(f"  Spearman correlation: {corr_bid_imp:.4f}, p-value: {p_bid_imp:.6f}")
results.append({'hypothesis': 'H2', 'correlation': corr_bid_imp, 'p_value': p_bid_imp})

# H3: Ranking → impression rate
corr_rank_imp, p_rank_imp = spearmanr(winners_imp['RANKING'], winners_imp['has_impression'])
print(f"\nH3: Ranking → Impression rate")
print(f"  Spearman correlation: {corr_rank_imp:.4f}, p-value: {p_rank_imp:.6f}")
rank_1 = winners_imp[winners_imp['RANKING'] == 1]['has_impression'].mean()
rank_5 = winners_imp[winners_imp['RANKING'] == 5]['has_impression'].mean()
rank_10 = winners_imp[winners_imp['RANKING'] == 10]['has_impression'].mean()
print(f"  Rank 1 impression rate: {rank_1:.4f}")
print(f"  Rank 5 impression rate: {rank_5:.4f}")
print(f"  Rank 10 impression rate: {rank_10:.4f}")
results.append({'hypothesis': 'H3', 'correlation': corr_rank_imp, 'p_value': p_rank_imp})

# H4: First-price vs second-price winners
winners_imp['bid_price_diff'] = winners_imp['FINAL_BID'] - winners_imp['PRICE']
winners_imp['auction_type'] = 'unknown'
winners_imp.loc[winners_imp['bid_price_diff'].abs() <= 0.5, 'auction_type'] = 'first_price'
winners_imp.loc[winners_imp['bid_price_diff'] > 0.5, 'auction_type'] = 'second_price'
first_price_imp = winners_imp[winners_imp['auction_type'] == 'first_price']['has_impression'].mean()
second_price_imp = winners_imp[winners_imp['auction_type'] == 'second_price']['has_impression'].mean()
print(f"\nH4: First-price vs second-price winners")
print(f"  First-price impression rate: {first_price_imp:.4f} (n={(winners_imp['auction_type'] == 'first_price').sum():,})")
print(f"  Second-price impression rate: {second_price_imp:.4f} (n={(winners_imp['auction_type'] == 'second_price').sum():,})")
print(f"  Difference: {first_price_imp - second_price_imp:.4f}")
results.append({'hypothesis': 'H4', 'first_price_rate': first_price_imp, 'second_price_rate': second_price_imp})

# H5: Pacing → impression rate (controlling for win)
corr_pacing_imp, p_pacing_imp = spearmanr(winners_imp['PACING'], winners_imp['has_impression'])
print(f"\nH5: Pacing → Impression rate (for winners)")
print(f"  Spearman correlation: {corr_pacing_imp:.4f}, p-value: {p_pacing_imp:.6f}")
results.append({'hypothesis': 'H5', 'correlation': corr_pacing_imp, 'p_value': p_pacing_imp})

print("\nPOSITION EFFECTS HYPOTHESES (H6-H10)")
print("-" * 80)

# H6: Placement CTR differences
print("H6: Placement CTR differences")
placement_ctr = impressions_full.groupby('PLACEMENT')['has_click'].mean()
for p in sorted(placement_ctr.index):
    print(f"  Placement {p}: CTR = {placement_ctr[p]:.4f}")

# H7: Ranking within placement → CTR
sample_imps = impressions_full.sample(min(10000, len(impressions_full)), random_state=42)
corr_rank_ctr, p_rank_ctr = spearmanr(sample_imps['RANKING'], sample_imps['has_click'])
print(f"\nH7: Ranking → CTR")
print(f"  Spearman correlation: {corr_rank_ctr:.4f}, p-value: {p_rank_ctr:.6f}")
results.append({'hypothesis': 'H7', 'correlation': corr_rank_ctr, 'p_value': p_rank_ctr})

# H8: Number of winners per auction → impression rate
auction_winners = winners_imp.groupby('AUCTION_ID').agg({
    'has_impression': ['mean', 'count']
}).reset_index()
auction_winners.columns = ['AUCTION_ID', 'avg_imp_rate', 'n_winners']
corr_nwinners_imp, p_nwinners_imp = spearmanr(auction_winners['n_winners'], auction_winners['avg_imp_rate'])
print(f"\nH8: N winners per auction → impression rate")
print(f"  Spearman correlation: {corr_nwinners_imp:.4f}, p-value: {p_nwinners_imp:.6f}")
results.append({'hypothesis': 'H8', 'correlation': corr_nwinners_imp, 'p_value': p_nwinners_imp})

# H9: Auction competition → impression rate
auction_competition = df_bids.groupby('AUCTION_ID').size().reset_index()
auction_competition.columns = ['AUCTION_ID', 'n_bids']
winners_comp = pd.merge(winners_imp, auction_competition, on='AUCTION_ID')
sample_comp = winners_comp.sample(min(50000, len(winners_comp)), random_state=42)
corr_comp_imp, p_comp_imp = spearmanr(sample_comp['n_bids'], sample_comp['has_impression'])
print(f"\nH9: Auction competition (bids/auction) → impression rate")
print(f"  Spearman correlation: {corr_comp_imp:.4f}, p-value: {p_comp_imp:.6f}")
results.append({'hypothesis': 'H9', 'correlation': corr_comp_imp, 'p_value': p_comp_imp})

# H10: Placement 5 vs others
placement_5_imp = winners_imp[winners_imp['PLACEMENT'] == '5']['has_impression'].mean()
placement_other_imp = winners_imp[winners_imp['PLACEMENT'] != '5']['has_impression'].mean()
print(f"\nH10: Placement 5 vs others")
print(f"  Placement 5 impression rate: {placement_5_imp:.4f}")
print(f"  Other placements impression rate: {placement_other_imp:.4f}")
print(f"  Difference: {placement_5_imp - placement_other_imp:.4f}")

print("\nVENDOR/CAMPAIGN HYPOTHESES (H11-H15)")
print("-" * 80)

# H11: Vendor size → impression rate
vendor_size = winners_imp.groupby('VENDOR_ID')['CAMPAIGN_ID'].nunique().reset_index()
vendor_size.columns = ['VENDOR_ID', 'n_campaigns']
vendor_imp_rate = winners_imp.groupby('VENDOR_ID')['has_impression'].mean().reset_index()
vendor_imp_rate.columns = ['VENDOR_ID', 'imp_rate']
vendor_analysis = pd.merge(vendor_size, vendor_imp_rate, on='VENDOR_ID')
corr_vendor_size_imp, p_vendor_size_imp = spearmanr(vendor_analysis['n_campaigns'], vendor_analysis['imp_rate'])
print(f"H11: Vendor size (n_campaigns) → impression rate")
print(f"  Spearman correlation: {corr_vendor_size_imp:.4f}, p-value: {p_vendor_size_imp:.6f}")
results.append({'hypothesis': 'H11', 'correlation': corr_vendor_size_imp, 'p_value': p_vendor_size_imp})

# H12: Campaign n_bids → impression rate
campaign_size = winners_imp.groupby('CAMPAIGN_ID').size().reset_index()
campaign_size.columns = ['CAMPAIGN_ID', 'n_bids']
campaign_imp_rate = winners_imp.groupby('CAMPAIGN_ID')['has_impression'].mean().reset_index()
campaign_imp_rate.columns = ['CAMPAIGN_ID', 'imp_rate']
campaign_analysis = pd.merge(campaign_size, campaign_imp_rate, on='CAMPAIGN_ID')
campaign_analysis = campaign_analysis[campaign_analysis['n_bids'] >= 10]
corr_campaign_size_imp, p_campaign_size_imp = spearmanr(campaign_analysis['n_bids'], campaign_analysis['imp_rate'])
print(f"\nH12: Campaign size (n_bids) → impression rate")
print(f"  Spearman correlation: {corr_campaign_size_imp:.4f}, p-value: {p_campaign_size_imp:.6f}")
results.append({'hypothesis': 'H12', 'correlation': corr_campaign_size_imp, 'p_value': p_campaign_size_imp})

# H13: Campaign n_products → impression rate
campaign_products = winners_imp.groupby('CAMPAIGN_ID')['PRODUCT_ID'].nunique().reset_index()
campaign_products.columns = ['CAMPAIGN_ID', 'n_products']
campaign_analysis2 = pd.merge(campaign_products, campaign_imp_rate, on='CAMPAIGN_ID')
corr_campaign_products_imp, p_campaign_products_imp = spearmanr(campaign_analysis2['n_products'], campaign_analysis2['imp_rate'])
print(f"\nH13: Campaign n_products → impression rate")
print(f"  Spearman correlation: {corr_campaign_products_imp:.4f}, p-value: {p_campaign_products_imp:.6f}")
results.append({'hypothesis': 'H13', 'correlation': corr_campaign_products_imp, 'p_value': p_campaign_products_imp})

# H14: Campaign pacing → impression rate
campaign_pacing = winners_imp.groupby('CAMPAIGN_ID')['PACING'].mean().reset_index()
campaign_pacing.columns = ['CAMPAIGN_ID', 'avg_pacing']
campaign_analysis3 = pd.merge(campaign_pacing, campaign_imp_rate, on='CAMPAIGN_ID')
corr_campaign_pacing_imp, p_campaign_pacing_imp = spearmanr(campaign_analysis3['avg_pacing'], campaign_analysis3['imp_rate'])
print(f"\nH14: Campaign avg_pacing → impression rate")
print(f"  Spearman correlation: {corr_campaign_pacing_imp:.4f}, p-value: {p_campaign_pacing_imp:.6f}")
results.append({'hypothesis': 'H14', 'correlation': corr_campaign_pacing_imp, 'p_value': p_campaign_pacing_imp})

# H15: Vendor win_rate → impression_rate
vendor_win_rate = df_bids.groupby('VENDOR_ID')['IS_WINNER'].mean().reset_index()
vendor_win_rate.columns = ['VENDOR_ID', 'win_rate']
vendor_analysis2 = pd.merge(vendor_win_rate, vendor_imp_rate, on='VENDOR_ID')
corr_vendor_win_imp, p_vendor_win_imp = spearmanr(vendor_analysis2['win_rate'], vendor_analysis2['imp_rate'])
print(f"\nH15: Vendor win_rate → impression_rate")
print(f"  Spearman correlation: {corr_vendor_win_imp:.4f}, p-value: {p_vendor_win_imp:.6f}")
results.append({'hypothesis': 'H15', 'correlation': corr_vendor_win_imp, 'p_value': p_vendor_win_imp})

print()

SECTION 3: HYPOTHESIS TESTING

AUCTION DESIGN HYPOTHESES (H1-H5)
--------------------------------------------------------------------------------
H1: Quality → Impression rate
  Spearman correlation: -0.0670, p-value: 0.000000

H2: Final bid → Impression rate
  Spearman correlation: 0.0159, p-value: 0.000000

H3: Ranking → Impression rate
  Spearman correlation: -0.1431, p-value: 0.000000
  Rank 1 impression rate: 0.1415
  Rank 5 impression rate: 0.0695
  Rank 10 impression rate: 0.0475

H4: First-price vs second-price winners
  First-price impression rate: 0.0310 (n=12,780,430)
  Second-price impression rate: 0.0502 (n=2,732,762)
  Difference: -0.0192

H5: Pacing → Impression rate (for winners)
  Spearman correlation: -0.0032, p-value: 0.000000

POSITION EFFECTS HYPOTHESES (H6-H10)
--------------------------------------------------------------------------------
H6: Placement CTR differences
  Placement 1: CTR = 0.0249
  Placement 2: CTR = 0.0383
  Placement 3: CTR = 0.0363
  Placement

In [5]:
print("PRODUCT HYPOTHESES (H16-H20)")
print("-" * 80)

# Merge catalog price into impressions
impressions_catalog = pd.merge(impressions_full,
                               df_catalog[['PRODUCT_ID', 'PRICE']].rename(columns={'PRICE': 'CATALOG_PRICE'}),
                               on='PRODUCT_ID',
                               how='left')

# H16: Product price → CTR
price_sample = impressions_catalog[impressions_catalog['CATALOG_PRICE'].notna()].sample(min(10000, len(impressions_catalog[impressions_catalog['CATALOG_PRICE'].notna()])), random_state=42)
if len(price_sample) > 10:
    corr_price_ctr, p_price_ctr = spearmanr(price_sample['CATALOG_PRICE'], price_sample['has_click'])
    print(f"H16: Product catalog price → CTR")
    print(f"  Spearman correlation: {corr_price_ctr:.4f}, p-value: {p_price_ctr:.6f}")
    print(f"  Sample size: {len(price_sample):,}")
    results.append({'hypothesis': 'H16', 'correlation': corr_price_ctr, 'p_value': p_price_ctr})
else:
    print(f"H16: Insufficient data (n={len(price_sample)})")

# H17: Multi-campaign products vs single-campaign
product_n_campaigns = df_bids.groupby('PRODUCT_ID')['CAMPAIGN_ID'].nunique()
impressions_full['product_n_campaigns'] = impressions_full['PRODUCT_ID'].map(product_n_campaigns)
multi_campaign_ctr = impressions_full[impressions_full['product_n_campaigns'] > 1]['has_click'].mean()
single_campaign_ctr = impressions_full[impressions_full['product_n_campaigns'] == 1]['has_click'].mean()
print(f"\nH17: Multi-campaign vs single-campaign products")
print(f"  Multi-campaign product CTR: {multi_campaign_ctr:.4f} (n={(impressions_full['product_n_campaigns'] > 1).sum():,})")
print(f"  Single-campaign product CTR: {single_campaign_ctr:.4f} (n={(impressions_full['product_n_campaigns'] == 1).sum():,})")
print(f"  Difference: {multi_campaign_ctr - single_campaign_ctr:.4f}")

# H18: Product quality → CTR
corr_quality_ctr, p_quality_ctr = spearmanr(impressions_full['QUALITY'], impressions_full['has_click'])
print(f"\nH18: Product quality score → CTR")
print(f"  Spearman correlation: {corr_quality_ctr:.4f}, p-value: {p_quality_ctr:.6f}")
results.append({'hypothesis': 'H18', 'correlation': corr_quality_ctr, 'p_value': p_quality_ctr})

# H19: New products (first week) vs later
product_first_week = df_bids.groupby('PRODUCT_ID')['week'].min()
impressions_full['product_first_week'] = impressions_full['PRODUCT_ID'].map(product_first_week)
impressions_full['product_week'] = impressions_full['PRODUCT_ID'].map(df_bids.groupby('PRODUCT_ID')['week'].first())
first_week_impressions = impressions_full[impressions_full['product_first_week'] == impressions_full['week']]
later_week_impressions = impressions_full[impressions_full['product_first_week'] < impressions_full['week']]
if len(first_week_impressions) > 0 and len(later_week_impressions) > 0:
    first_week_ctr = first_week_impressions['has_click'].mean()
    later_week_ctr = later_week_impressions['has_click'].mean()
    print(f"\nH19: New products (first week) vs later")
    print(f"  First week CTR: {first_week_ctr:.4f} (n={len(first_week_impressions):,})")
    print(f"  Later weeks CTR: {later_week_ctr:.4f} (n={len(later_week_impressions):,})")
    print(f"  Difference: {first_week_ctr - later_week_ctr:.4f}")
else:
    print(f"\nH19: Insufficient data for week comparison")

# H20: Product bid frequency → CTR
product_bid_freq = df_bids.groupby('PRODUCT_ID').size()
impressions_full['product_bid_freq'] = impressions_full['PRODUCT_ID'].map(product_bid_freq)
sample_freq = impressions_full.sample(min(10000, len(impressions_full)), random_state=42)
corr_freq_ctr, p_freq_ctr = spearmanr(sample_freq['product_bid_freq'], sample_freq['has_click'])
print(f"\nH20: Product bid frequency → CTR")
print(f"  Spearman correlation: {corr_freq_ctr:.4f}, p-value: {p_freq_ctr:.6f}")
results.append({'hypothesis': 'H20', 'correlation': corr_freq_ctr, 'p_value': p_freq_ctr})

print("\nTEMPORAL HYPOTHESES (H21-H25)")
print("-" * 80)

# H21: Weekend vs weekday impression rate
winners_imp['is_weekend'] = winners_imp['day_of_week'].isin([5, 6])
weekend_imp = winners_imp[winners_imp['is_weekend']]['has_impression'].mean()
weekday_imp = winners_imp[~winners_imp['is_weekend']]['has_impression'].mean()
print(f"H21: Weekend vs weekday impression rate")
print(f"  Weekend: {weekend_imp:.4f} (n={winners_imp['is_weekend'].sum():,})")
print(f"  Weekday: {weekday_imp:.4f} (n={(~winners_imp['is_weekend']).sum():,})")
print(f"  Difference: {weekend_imp - weekday_imp:.4f}")

# H22: Weekend vs weekday CTR
impressions_full['is_weekend'] = impressions_full['day_of_week'].isin([5, 6])
weekend_ctr = impressions_full[impressions_full['is_weekend']]['has_click'].mean()
weekday_ctr = impressions_full[~impressions_full['is_weekend']]['has_click'].mean()
print(f"\nH22: Weekend vs weekday CTR")
print(f"  Weekend: {weekend_ctr:.4f} (n={impressions_full['is_weekend'].sum():,})")
print(f"  Weekday: {weekday_ctr:.4f} (n={(~impressions_full['is_weekend']).sum():,})")
print(f"  Difference: {weekend_ctr - weekday_ctr:.4f}")

# H23: Hour of day → CTR
impressions_full['hour_bin'] = pd.cut(impressions_full['hour'], bins=[0, 6, 12, 18, 24], labels=['0-5', '6-11', '12-17', '18-23'], include_lowest=True)
print(f"\nH23: Hour of day → CTR")
for hour_bin in ['0-5', '6-11', '12-17', '18-23']:
    bin_data = impressions_full[impressions_full['hour_bin'] == hour_bin]
    if len(bin_data) > 0:
        bin_ctr = bin_data['has_click'].mean()
        print(f"  Hours {hour_bin}: CTR = {bin_ctr:.4f} (n={len(bin_data):,})")

# H24: Week-over-week funnel changes
weekly_imp_rate = winners_imp.groupby('week')['has_impression'].mean()
weekly_ctr = impressions_full.groupby('week')['has_click'].mean()
print(f"\nH24: Week-over-week funnel rates")
for week in sorted(weekly_imp_rate.index):
    imp_rate = weekly_imp_rate.get(week, 0)
    ctr = weekly_ctr.get(week, 0)
    print(f"  Week {week}: Impression rate = {imp_rate:.4f}, CTR = {ctr:.4f}")

# H25: First impression of day vs later
impressions_full['impression_time'] = pd.to_datetime(impressions_full['OCCURRED_AT'])
impressions_full['date'] = impressions_full['impression_time'].dt.date
impressions_full = impressions_full.sort_values(['OPAQUE_USER_ID', 'date', 'impression_time'])
impressions_full['is_first_of_day'] = ~impressions_full.duplicated(['OPAQUE_USER_ID', 'date'], keep='first')
first_ctr = impressions_full[impressions_full['is_first_of_day']]['has_click'].mean()
later_ctr = impressions_full[~impressions_full['is_first_of_day']]['has_click'].mean()
print(f"\nH25: First impression of day vs later")
print(f"  First: CTR = {first_ctr:.4f} (n={impressions_full['is_first_of_day'].sum():,})")
print(f"  Later: CTR = {later_ctr:.4f} (n={(~impressions_full['is_first_of_day']).sum():,})")
print(f"  Difference: {first_ctr - later_ctr:.4f}")

print()

PRODUCT HYPOTHESES (H16-H20)
--------------------------------------------------------------------------------
H16: Product catalog price → CTR
  Spearman correlation: 0.0232, p-value: 0.020353
  Sample size: 10,000

H17: Multi-campaign vs single-campaign products
  Multi-campaign product CTR: 0.0305 (n=270,264)
  Single-campaign product CTR: 0.0300 (n=264,214)
  Difference: 0.0005

H18: Product quality score → CTR
  Spearman correlation: nan, p-value: nan


KeyError: 'week'

In [None]:
print("CROSS-STAGE HYPOTHESES (H26-H30)")
print("-" * 80)

# H26: Bid amount → CTR
corr_bid_ctr, p_bid_ctr = spearmanr(impressions_full['FINAL_BID'], impressions_full['has_click'])
print(f"H26: Bid amount → CTR (for impressions)")
print(f"  Spearman correlation: {corr_bid_ctr:.4f}, p-value: {p_bid_ctr:.6f}")
results.append({'hypothesis': 'H26', 'correlation': corr_bid_ctr, 'p_value': p_bid_ctr})

# H27: Auction n_bids → click probability
impressions_comp = pd.merge(impressions_full, auction_competition, on='AUCTION_ID')
sample_comp2 = impressions_comp.sample(min(10000, len(impressions_comp)), random_state=42)
corr_comp_click, p_comp_click = spearmanr(sample_comp2['n_bids'], sample_comp2['has_click'])
print(f"\nH27: Auction competition (n_bids) → click probability")
print(f"  Spearman correlation: {corr_comp_click:.4f}, p-value: {p_comp_click:.6f}")
results.append({'hypothesis': 'H27', 'correlation': corr_comp_click, 'p_value': p_comp_click})

# H28: Auction n_winners → impression rate per winner
auction_n_winners = winners_imp.groupby('AUCTION_ID').size().reset_index()
auction_n_winners.columns = ['AUCTION_ID', 'n_winners_auction']
winners_nwin = pd.merge(winners_imp, auction_n_winners, on='AUCTION_ID')
sample_nwin = winners_nwin.sample(min(50000, len(winners_nwin)), random_state=42)
corr_nwin_imp, p_nwin_imp = spearmanr(sample_nwin['n_winners_auction'], sample_nwin['has_impression'])
print(f"\nH28: Auction n_winners → impression rate per winner")
print(f"  Spearman correlation: {corr_nwin_imp:.4f}, p-value: {p_nwin_imp:.6f}")
results.append({'hypothesis': 'H28', 'correlation': corr_nwin_imp, 'p_value': p_nwin_imp})

# H29: (Quality × Bid) score → bid→click conversion
winners_imp['quality_bid_score'] = winners_imp['QUALITY'] * winners_imp['FINAL_BID']
# Create bid→click indicator
winners_full = pd.merge(winners_imp,
                       impressions_clicks[['AUCTION_ID', 'PRODUCT_ID', 'CAMPAIGN_ID', 'VENDOR_ID', 'has_click']],
                       on=['AUCTION_ID', 'PRODUCT_ID', 'CAMPAIGN_ID', 'VENDOR_ID'],
                       how='left')
winners_full['has_click'] = winners_full['has_click'].fillna(0)
sample_qb = winners_full.sample(min(50000, len(winners_full)), random_state=42)
corr_qb_click, p_qb_click = spearmanr(sample_qb['quality_bid_score'], sample_qb['has_click'])
print(f"\nH29: (Quality × Bid) score → bid→click conversion")
print(f"  Spearman correlation: {corr_qb_click:.4f}, p-value: {p_qb_click:.6f}")
results.append({'hypothesis': 'H29', 'correlation': corr_qb_click, 'p_value': p_qb_click})

# H30: Variance decomposition for CTR
print(f"\nH30: Variance decomposition for CTR")
overall_ctr = impressions_full['has_click'].mean()
total_var = impressions_full['has_click'].var()

vendor_ctr = impressions_full.groupby('VENDOR_ID')['has_click'].mean()
vendor_var = ((vendor_ctr - overall_ctr) ** 2).mean()

campaign_ctr = impressions_full.groupby('CAMPAIGN_ID')['has_click'].mean()
campaign_var = ((campaign_ctr - overall_ctr) ** 2).mean()

product_ctr = impressions_full.groupby('PRODUCT_ID')['has_click'].mean()
product_var = ((product_ctr - overall_ctr) ** 2).mean()

print(f"  Total CTR variance: {total_var:.6f}")
print(f"  Vendor-level variance: {vendor_var:.6f} ({vendor_var/total_var*100:.2f}%)")
print(f"  Campaign-level variance: {campaign_var:.6f} ({campaign_var/total_var*100:.2f}%)")
print(f"  Product-level variance: {product_var:.6f} ({product_var/total_var*100:.2f}%)")

print("\nDATA/ANOMALY HYPOTHESES (H31-H35)")
print("-" * 80)

# H31: Already covered in Section 2
print(f"H31: Winners without impressions characteristics")
print(f"  Covered in Section 2.1")

# H32: Already covered in Section 2
print(f"\nH32: Impressions without clicks characteristics")
print(f"  Covered in Section 2.2")

# H33: Same (product, campaign) CTR variance
product_campaign_imps = impressions_full.groupby(['PRODUCT_ID', 'CAMPAIGN_ID']).agg({
    'has_click': ['sum', 'count', 'mean'],
    'AUCTION_ID': 'nunique'
}).reset_index()
product_campaign_imps.columns = ['PRODUCT_ID', 'CAMPAIGN_ID', 'n_clicks', 'n_impressions', 'ctr', 'n_auctions']
multi_auction_pc = product_campaign_imps[product_campaign_imps['n_auctions'] > 1]
print(f"\nH33: Same (product, campaign) in multiple auctions")
print(f"  (Product, campaign) pairs in 1 auction: {(product_campaign_imps['n_auctions'] == 1).sum():,}")
print(f"  (Product, campaign) pairs in 2+ auctions: {(product_campaign_imps['n_auctions'] > 1).sum():,}")
if len(multi_auction_pc) > 0:
    print(f"  Mean CTR variance for multi-auction pairs: {multi_auction_pc['ctr'].std():.6f}")
    print(f"  Mean CTR for multi-auction pairs: {multi_auction_pc['ctr'].mean():.4f}")

# H34: Auction→impression time gap
winners_imp['auction_time'] = winners_imp['datetime']
winners_imp['impression_time'] = pd.to_datetime(winners_imp['OCCURRED_AT'])
winners_with_imp_time = winners_imp[winners_imp['has_impression'] == 1]
winners_with_imp_time['time_gap_seconds'] = (winners_with_imp_time['impression_time'] - winners_with_imp_time['auction_time']).dt.total_seconds()
print(f"\nH34: Auction→impression time gap")
if len(winners_with_imp_time) > 0:
    print(f"  Mean time gap: {winners_with_imp_time['time_gap_seconds'].mean():.2f} seconds")
    print(f"  Median time gap: {winners_with_imp_time['time_gap_seconds'].median():.2f} seconds")
    for q in [0.10, 0.25, 0.50, 0.75, 0.90, 0.95, 0.99]:
        print(f"  {q*100:5.1f}%: {winners_with_imp_time['time_gap_seconds'].quantile(q):.2f} seconds")

# H35: Impression rate by date
daily_imp_rate = winners_imp.groupby('date')['has_impression'].mean()
print(f"\nH35: Impression rate by data collection day")
for date, rate in daily_imp_rate.items():
    print(f"  {date}: {rate:.4f}")

print()

## Section 4: Conversion Rates

In [None]:
print("="*80)
print("SECTION 4: CONVERSION RATES")
print("="*80)
print()

print("4.1 OVERALL CONVERSION RATES")
print("-" * 80)
print(f"Bid → Win: {bid_to_win:.6f}")
print(f"Win → Impression: {win_to_imp:.6f}")
print(f"Impression → Click: {imp_to_click:.6f}")
print(f"Bid → Click: {bid_to_click:.8f}")

print("\n4.2 CONVERSION RATES BY PLACEMENT")
print("-" * 80)
for placement in sorted(df_bids['PLACEMENT'].unique()):
    placement_data = df_bids[df_bids['PLACEMENT'] == placement]
    p_bid_win = placement_data['IS_WINNER'].mean()
    
    placement_winners = winners_imp[winners_imp['PLACEMENT'] == placement]
    p_win_imp = placement_winners['has_impression'].mean() if len(placement_winners) > 0 else 0
    
    placement_imps = impressions_full[impressions_full['PLACEMENT'] == placement]
    p_imp_click = placement_imps['has_click'].mean() if len(placement_imps) > 0 else 0
    
    p_bid_click = p_bid_win * p_win_imp * p_imp_click
    
    print(f"\nPlacement {placement}:")
    print(f"  Bid → Win: {p_bid_win:.6f}")
    print(f"  Win → Imp: {p_win_imp:.6f}")
    print(f"  Imp → Click: {p_imp_click:.6f}")
    print(f"  Bid → Click: {p_bid_click:.8f}")

print("\n4.3 CONVERSION RATES BY VENDOR SIZE")
print("-" * 80)
vendor_size_all = df_bids.groupby('VENDOR_ID').size()
vendor_size_quartiles = pd.qcut(vendor_size_all, q=4, labels=['Q1', 'Q2', 'Q3', 'Q4'], duplicates='drop')
df_bids['vendor_size_q'] = df_bids['VENDOR_ID'].map(vendor_size_quartiles)
winners_imp['vendor_size_q'] = winners_imp['VENDOR_ID'].map(vendor_size_quartiles)
impressions_full['vendor_size_q'] = impressions_full['VENDOR_ID'].map(vendor_size_quartiles)

for q in ['Q1', 'Q2', 'Q3', 'Q4']:
    q_bids = df_bids[df_bids['vendor_size_q'] == q]
    q_bid_win = q_bids['IS_WINNER'].mean()
    
    q_winners = winners_imp[winners_imp['vendor_size_q'] == q]
    q_win_imp = q_winners['has_impression'].mean() if len(q_winners) > 0 else 0
    
    q_imps = impressions_full[impressions_full['vendor_size_q'] == q]
    q_imp_click = q_imps['has_click'].mean() if len(q_imps) > 0 else 0
    
    print(f"\nVendor size {q}:")
    print(f"  Bid → Win: {q_bid_win:.6f}")
    print(f"  Win → Imp: {q_win_imp:.6f}")
    print(f"  Imp → Click: {q_imp_click:.6f}")

print("\n4.4 CONVERSION RATES BY PRODUCT USAGE")
print("-" * 80)
product_n_campaigns_all = df_bids.groupby('PRODUCT_ID')['CAMPAIGN_ID'].nunique()
df_bids['product_usage'] = df_bids['PRODUCT_ID'].map(product_n_campaigns_all)
df_bids['product_usage_cat'] = 'single'
df_bids.loc[df_bids['product_usage'] > 1, 'product_usage_cat'] = 'multiple'

winners_imp['product_usage_cat'] = winners_imp['PRODUCT_ID'].map(product_n_campaigns_all).apply(lambda x: 'single' if x == 1 else 'multiple')
impressions_full['product_usage_cat'] = impressions_full['PRODUCT_ID'].map(product_n_campaigns_all).apply(lambda x: 'single' if x == 1 else 'multiple')

for cat in ['single', 'multiple']:
    cat_bids = df_bids[df_bids['product_usage_cat'] == cat]
    cat_bid_win = cat_bids['IS_WINNER'].mean()
    
    cat_winners = winners_imp[winners_imp['product_usage_cat'] == cat]
    cat_win_imp = cat_winners['has_impression'].mean() if len(cat_winners) > 0 else 0
    
    cat_imps = impressions_full[impressions_full['product_usage_cat'] == cat]
    cat_imp_click = cat_imps['has_click'].mean() if len(cat_imps) > 0 else 0
    
    print(f"\nProduct usage: {cat}:")
    print(f"  Bid → Win: {cat_bid_win:.6f}")
    print(f"  Win → Imp: {cat_win_imp:.6f}")
    print(f"  Imp → Click: {cat_imp_click:.6f}")

print()

## Section 5: Drop-off Analysis

In [None]:
print("="*80)
print("SECTION 5: DROP-OFF ANALYSIS")
print("="*80)
print()

print("5.1 WINNERS WITHOUT IMPRESSIONS")
print("-" * 80)
no_imp = winners_imp[winners_imp['has_impression'] == 0]
yes_imp = winners_imp[winners_imp['has_impression'] == 1]

print(f"N winners without impressions: {len(no_imp):,} ({len(no_imp)/len(winners_imp)*100:.2f}%)")
print(f"N winners with impressions: {len(yes_imp):,} ({len(yes_imp)/len(winners_imp)*100:.2f}%)")

print("\nCharacteristics comparison:")
print(f"  Mean FINAL_BID: no_imp={no_imp['FINAL_BID'].mean():.2f}, yes_imp={yes_imp['FINAL_BID'].mean():.2f}, diff={no_imp['FINAL_BID'].mean() - yes_imp['FINAL_BID'].mean():.2f}")
print(f"  Mean QUALITY: no_imp={no_imp['QUALITY'].mean():.6f}, yes_imp={yes_imp['QUALITY'].mean():.6f}, diff={no_imp['QUALITY'].mean() - yes_imp['QUALITY'].mean():.6f}")
print(f"  Mean RANKING: no_imp={no_imp['RANKING'].mean():.2f}, yes_imp={yes_imp['RANKING'].mean():.2f}, diff={no_imp['RANKING'].mean() - yes_imp['RANKING'].mean():.2f}")
print(f"  Mean PACING: no_imp={no_imp['PACING'].mean():.4f}, yes_imp={yes_imp['PACING'].mean():.4f}, diff={no_imp['PACING'].mean() - yes_imp['PACING'].mean():.4f}")

print("\nPlacement distribution:")
print("  No impression:")
for placement in sorted(no_imp['PLACEMENT'].unique()):
    pct = (no_imp['PLACEMENT'] == placement).mean()
    print(f"    Placement {placement}: {pct*100:.2f}%")
print("  With impression:")
for placement in sorted(yes_imp['PLACEMENT'].unique()):
    pct = (yes_imp['PLACEMENT'] == placement).mean()
    print(f"    Placement {placement}: {pct*100:.2f}%")

print("\nVendor distribution (top 10 by volume):")
no_imp_vendors = no_imp['VENDOR_ID'].value_counts().head(10)
yes_imp_vendors = yes_imp['VENDOR_ID'].value_counts().head(10)
print("  No impression top vendors:")
for vendor, count in no_imp_vendors.items():
    print(f"    {vendor[:20]}...: {count:,}")
print("  With impression top vendors:")
for vendor, count in yes_imp_vendors.items():
    print(f"    {vendor[:20]}...: {count:,}")

print("\n5.2 IMPRESSIONS WITHOUT CLICKS")
print("-" * 80)
no_click_imps = impressions_full[impressions_full['has_click'] == 0]
yes_click_imps = impressions_full[impressions_full['has_click'] == 1]

print(f"N impressions without clicks: {len(no_click_imps):,} ({len(no_click_imps)/len(impressions_full)*100:.2f}%)")
print(f"N impressions with clicks: {len(yes_click_imps):,} ({len(yes_click_imps)/len(impressions_full)*100:.2f}%)")

print("\nCharacteristics comparison:")
print(f"  Mean FINAL_BID: no_click={no_click_imps['FINAL_BID'].mean():.2f}, yes_click={yes_click_imps['FINAL_BID'].mean():.2f}, diff={no_click_imps['FINAL_BID'].mean() - yes_click_imps['FINAL_BID'].mean():.2f}")
print(f"  Mean QUALITY: no_click={no_click_imps['QUALITY'].mean():.6f}, yes_click={yes_click_imps['QUALITY'].mean():.6f}, diff={no_click_imps['QUALITY'].mean() - yes_click_imps['QUALITY'].mean():.6f}")
print(f"  Mean RANKING: no_click={no_click_imps['RANKING'].mean():.2f}, yes_click={yes_click_imps['RANKING'].mean():.2f}, diff={no_click_imps['RANKING'].mean() - yes_click_imps['RANKING'].mean():.2f}")

print("\nPlacement distribution:")
print("  No click:")
for placement in sorted(no_click_imps['PLACEMENT'].unique()):
    pct = (no_click_imps['PLACEMENT'] == placement).mean()
    print(f"    Placement {placement}: {pct*100:.2f}%")
print("  With click:")
for placement in sorted(yes_click_imps['PLACEMENT'].unique()):
    pct = (yes_click_imps['PLACEMENT'] == placement).mean()
    print(f"    Placement {placement}: {pct*100:.2f}%")

print()

## Section 6: Product Multi-Campaign Analysis

In [None]:
print("="*80)
print("SECTION 6: PRODUCT MULTI-CAMPAIGN ANALYSIS")
print("="*80)
print()

print("6.1 PRODUCT CAMPAIGN DISTRIBUTION")
print("-" * 80)
product_campaigns_all = df_bids.groupby('PRODUCT_ID')['CAMPAIGN_ID'].nunique()
print(f"Total unique products: {len(product_campaigns_all):,}")
print(f"Products in 1 campaign: {(product_campaigns_all == 1).sum():,} ({(product_campaigns_all == 1).mean()*100:.2f}%)")
print(f"Products in 2+ campaigns: {(product_campaigns_all >= 2).sum():,} ({(product_campaigns_all >= 2).mean()*100:.2f}%)")
print(f"Products in 5+ campaigns: {(product_campaigns_all >= 5).sum():,} ({(product_campaigns_all >= 5).mean()*100:.2f}%)")
print(f"Products in 10+ campaigns: {(product_campaigns_all >= 10).sum():,} ({(product_campaigns_all >= 10).mean()*100:.2f}%)")

print("\n6.2 MULTI-CAMPAIGN PRODUCT IMPRESSION RATE VARIANCE")
print("-" * 80)
multi_campaign_products = product_campaigns_all[product_campaigns_all >= 2].index
print(f"Analyzing {len(multi_campaign_products):,} multi-campaign products...")

product_campaign_imp_rates = winners_imp.groupby(['PRODUCT_ID', 'CAMPAIGN_ID'])['has_impression'].agg(['mean', 'count']).reset_index()
product_campaign_imp_rates.columns = ['PRODUCT_ID', 'CAMPAIGN_ID', 'imp_rate', 'n_winners']
product_campaign_imp_rates = product_campaign_imp_rates[product_campaign_imp_rates['n_winners'] >= 5]

multi_campaign_imp_variance = product_campaign_imp_rates.groupby('PRODUCT_ID')['imp_rate'].agg(['std', 'mean', 'count']).reset_index()
multi_campaign_imp_variance = multi_campaign_imp_variance[multi_campaign_imp_variance['count'] >= 2]
multi_campaign_imp_variance.columns = ['PRODUCT_ID', 'imp_rate_std', 'imp_rate_mean', 'n_campaigns']

print(f"Products with impression rate variance (2+ campaigns, 5+ winners each): {len(multi_campaign_imp_variance):,}")
print(f"Mean impression rate std across campaigns: {multi_campaign_imp_variance['imp_rate_std'].mean():.6f}")
print(f"Median impression rate std: {multi_campaign_imp_variance['imp_rate_std'].median():.6f}")

print("\n6.3 MULTI-CAMPAIGN PRODUCT CTR VARIANCE")
print("-" * 80)
product_campaign_ctr = impressions_full.groupby(['PRODUCT_ID', 'CAMPAIGN_ID'])['has_click'].agg(['mean', 'count']).reset_index()
product_campaign_ctr.columns = ['PRODUCT_ID', 'CAMPAIGN_ID', 'ctr', 'n_impressions']
product_campaign_ctr = product_campaign_ctr[product_campaign_ctr['n_impressions'] >= 3]

multi_campaign_ctr_variance = product_campaign_ctr.groupby('PRODUCT_ID')['ctr'].agg(['std', 'mean', 'count']).reset_index()
multi_campaign_ctr_variance = multi_campaign_ctr_variance[multi_campaign_ctr_variance['count'] >= 2]
multi_campaign_ctr_variance.columns = ['PRODUCT_ID', 'ctr_std', 'ctr_mean', 'n_campaigns']

print(f"Products with CTR variance (2+ campaigns, 3+ impressions each): {len(multi_campaign_ctr_variance):,}")
if len(multi_campaign_ctr_variance) > 0:
    print(f"Mean CTR std across campaigns: {multi_campaign_ctr_variance['ctr_std'].mean():.6f}")
    print(f"Median CTR std: {multi_campaign_ctr_variance['ctr_std'].median():.6f}")
else:
    print("Insufficient data for CTR variance analysis")

print("\n6.4 SAME PRODUCT DIFFERENT CAMPAIGNS: BID DIFFERENCES")
print("-" * 80)
product_campaign_bids = df_bids.groupby(['PRODUCT_ID', 'CAMPAIGN_ID'])['FINAL_BID'].mean().reset_index()
product_campaign_bids.columns = ['PRODUCT_ID', 'CAMPAIGN_ID', 'avg_bid']
product_bid_variance = product_campaign_bids.groupby('PRODUCT_ID')['avg_bid'].agg(['std', 'mean', 'count']).reset_index()
product_bid_variance = product_bid_variance[product_bid_variance['count'] >= 2]
product_bid_variance.columns = ['PRODUCT_ID', 'bid_std', 'bid_mean', 'n_campaigns']

print(f"Products with bid variance (2+ campaigns): {len(product_bid_variance):,}")
print(f"Mean bid std across campaigns: {product_bid_variance['bid_std'].mean():.2f}")
print(f"Median bid std: {product_bid_variance['bid_std'].median():.2f}")
print(f"Mean coefficient of variation: {(product_bid_variance['bid_std'] / product_bid_variance['bid_mean']).mean():.4f}")

print("\n6.5 MULTI-CAMPAIGN VS SINGLE-CAMPAIGN FUNNEL RATES")
print("-" * 80)
single_campaign_products = product_campaigns_all[product_campaigns_all == 1].index

single_wins = winners_imp[winners_imp['PRODUCT_ID'].isin(single_campaign_products)]
multi_wins = winners_imp[winners_imp['PRODUCT_ID'].isin(multi_campaign_products)]

single_imp_rate = single_wins['has_impression'].mean()
multi_imp_rate = multi_wins['has_impression'].mean()

single_imps = impressions_full[impressions_full['PRODUCT_ID'].isin(single_campaign_products)]
multi_imps = impressions_full[impressions_full['PRODUCT_ID'].isin(multi_campaign_products)]

single_ctr = single_imps['has_click'].mean() if len(single_imps) > 0 else 0
multi_ctr = multi_imps['has_click'].mean() if len(multi_imps) > 0 else 0

print(f"Single-campaign products:")
print(f"  Impression rate: {single_imp_rate:.6f} (n={len(single_wins):,})")
print(f"  CTR: {single_ctr:.6f} (n={len(single_imps):,})")
print(f"\nMulti-campaign products:")
print(f"  Impression rate: {multi_imp_rate:.6f} (n={len(multi_wins):,})")
print(f"  CTR: {multi_ctr:.6f} (n={len(multi_imps):,})")
print(f"\nDifferences:")
print(f"  Impression rate diff: {multi_imp_rate - single_imp_rate:.6f}")
print(f"  CTR diff: {multi_ctr - single_ctr:.6f}")

print()

## Section 7: Statistical Models

In [None]:
print("="*80)
print("SECTION 7: STATISTICAL MODELS")
print("="*80)
print()

from scipy.special import expit

print("MODEL 1: IMPRESSION DELIVERY (LOGISTIC REGRESSION)")
print("-" * 80)
print("Unit: Winner")
print("DV: has_impression (0/1)")
print("IV: log(final_bid), quality, ranking, pacing, placement dummies")
print()

# Sample for efficiency
sample_size = min(100000, len(winners_imp))
model1_data = winners_imp.sample(sample_size, random_state=42)
model1_data = model1_data[model1_data['FINAL_BID'] > 0].copy()

model1_data['log_bid'] = np.log(model1_data['FINAL_BID'])
model1_data = model1_data.dropna(subset=['log_bid', 'QUALITY', 'RANKING', 'PACING'])

# Create placement dummies
placement_dummies = pd.get_dummies(model1_data['PLACEMENT'], prefix='placement', drop_first=True)
X1 = pd.concat([model1_data[['log_bid', 'QUALITY', 'RANKING', 'PACING']], placement_dummies], axis=1)
X1 = np.column_stack([np.ones(len(X1)), X1.values])
y1 = model1_data['has_impression'].values.astype(float)

# Simple logistic regression (gradient descent)
beta1 = np.zeros(X1.shape[1])
lr = 0.001
for iteration in range(100):
    pred = expit(X1 @ beta1)
    gradient = X1.T @ (pred - y1) / len(y1)
    beta1 -= lr * gradient

pred_final = expit(X1 @ beta1)
accuracy = ((pred_final > 0.5) == y1).mean()

print(f"N observations: {len(model1_data):,}")
print(f"Accuracy: {accuracy:.4f}")
print(f"Baseline (always predict mean): {y1.mean():.4f}")
print("\nCoefficients:")
feature_names = ['Intercept', 'log(final_bid)', 'quality', 'ranking', 'pacing'] + list(placement_dummies.columns)
for i, name in enumerate(feature_names):
    odds_ratio = np.exp(beta1[i])
    print(f"  {name:20s}: coef={beta1[i]:10.6f}, OR={odds_ratio:10.6f}")

print("\n\nMODEL 2: CLICK MODEL (LOGISTIC REGRESSION)")
print("-" * 80)
print("Unit: Impression")
print("DV: has_click (0/1)")
print("IV: log(final_bid), quality, ranking, hour, day_of_week, placement dummies")
print()

sample_size2 = min(50000, len(impressions_full))
model2_data = impressions_full.sample(sample_size2, random_state=42)
model2_data = model2_data[model2_data['FINAL_BID'] > 0].copy()

model2_data['log_bid'] = np.log(model2_data['FINAL_BID'])
model2_data = model2_data.dropna(subset=['log_bid', 'QUALITY', 'RANKING', 'hour', 'day_of_week'])

placement_dummies2 = pd.get_dummies(model2_data['PLACEMENT'], prefix='placement', drop_first=True)
X2 = pd.concat([model2_data[['log_bid', 'QUALITY', 'RANKING', 'hour', 'day_of_week']], placement_dummies2], axis=1)
X2 = np.column_stack([np.ones(len(X2)), X2.values])
y2 = model2_data['has_click'].values.astype(float)

beta2 = np.zeros(X2.shape[1])
lr = 0.001
for iteration in range(100):
    pred = expit(X2 @ beta2)
    gradient = X2.T @ (pred - y2) / len(y2)
    beta2 -= lr * gradient

pred_final2 = expit(X2 @ beta2)
accuracy2 = ((pred_final2 > 0.5) == y2).mean()

print(f"N observations: {len(model2_data):,}")
print(f"Accuracy: {accuracy2:.4f}")
print(f"Baseline (always predict mean): {y2.mean():.4f}")
print("\nCoefficients:")
feature_names2 = ['Intercept', 'log(final_bid)', 'quality', 'ranking', 'hour', 'day_of_week'] + list(placement_dummies2.columns)
for i, name in enumerate(feature_names2):
    odds_ratio = np.exp(beta2[i])
    print(f"  {name:20s}: coef={beta2[i]:10.6f}, OR={odds_ratio:10.6f}")

print("\n\nMODEL 3: FULL FUNNEL (LOGISTIC REGRESSION)")
print("-" * 80)
print("Unit: Bid")
print("DV: reached_click (0/1)")
print("IV: log(final_bid), quality, pacing, n_bids_in_auction, placement dummies")
print()

# Create end-to-end indicator
bids_full = pd.merge(df_bids,
                     winners_full[['AUCTION_ID', 'PRODUCT_ID', 'CAMPAIGN_ID', 'VENDOR_ID', 'has_click']],
                     on=['AUCTION_ID', 'PRODUCT_ID', 'CAMPAIGN_ID', 'VENDOR_ID'],
                     how='left')
bids_full['reached_click'] = bids_full['has_click'].fillna(0)

sample_size3 = min(100000, len(bids_full))
model3_data = bids_full.sample(sample_size3, random_state=42)
model3_data = model3_data[model3_data['FINAL_BID'] > 0].copy()

model3_data['log_bid'] = np.log(model3_data['FINAL_BID'])
model3_data['n_bids_in_auction'] = model3_data['AUCTION_ID'].map(auction_competition.set_index('AUCTION_ID')['n_bids'])
model3_data = model3_data.dropna(subset=['log_bid', 'QUALITY', 'PACING', 'n_bids_in_auction'])

placement_dummies3 = pd.get_dummies(model3_data['PLACEMENT'], prefix='placement', drop_first=True)
X3 = pd.concat([model3_data[['log_bid', 'QUALITY', 'PACING', 'n_bids_in_auction']], placement_dummies3], axis=1)
X3 = np.column_stack([np.ones(len(X3)), X3.values])
y3 = model3_data['reached_click'].values.astype(float)

beta3 = np.zeros(X3.shape[1])
lr = 0.0001
for iteration in range(100):
    pred = expit(X3 @ beta3)
    gradient = X3.T @ (pred - y3) / len(y3)
    beta3 -= lr * gradient

pred_final3 = expit(X3 @ beta3)
accuracy3 = ((pred_final3 > 0.5) == y3).mean()

print(f"N observations: {len(model3_data):,}")
print(f"Accuracy: {accuracy3:.4f}")
print(f"Baseline (always predict mean): {y3.mean():.4f}")
print("\nCoefficients:")
feature_names3 = ['Intercept', 'log(final_bid)', 'quality', 'pacing', 'n_bids_in_auction'] + list(placement_dummies3.columns)
for i, name in enumerate(feature_names3):
    odds_ratio = np.exp(beta3[i])
    print(f"  {name:20s}: coef={beta3[i]:10.6f}, OR={odds_ratio:10.6f}")

print("\n\nMODEL 4: VARIANCE DECOMPOSITION")
print("-" * 80)
print("Covered in Section 3, H30")

print("\n" + "="*80)
print("ANALYSIS COMPLETE")
print("="*80)