# Funnel Econometric Analysis with Polars - Ad Platform Incrementality

This notebook implements three fixed-effects models to analyze the advertising funnel:
1. **Model 1: Ad Effectiveness Model** (Top-of-Funnel) - Individual ad impression analysis
2. **Model 2: Journey Continuation Model** (Mid-Funnel) - Browsing session continuation analysis
3. **Model 3: Final Conversion Model** (Bottom-of-Funnel) - Shopping session conversion analysis

- **Part 1**: Data processing with Polars and panel creation
- **Part 2**: Econometric modeling with the saved panels

**Output**: All results saved to `funnel_econometric_results.txt`

## PART 1: DATA PROCESSING WITH POLARS

In [None]:
# PART 1: DATA PROCESSING WITH POLARS
# --- SETUP & ENVIRONMENT ---
import polars as pl
import numpy as np
from pathlib import Path
from datetime import datetime, timedelta
import warnings
from tqdm import tqdm

# Suppress every Python warning raised from this point on.
warnings.filterwarnings("ignore")

# Sampling parameters
RANDOM_SEED = 42
USER_SAMPLE_FRACTION = 0.05     # Sample 5% of users for Models 2 & 3
AUCTION_SAMPLE_FRACTION = 0.10  # Sample 10% of auctions for Model 1

# Configuration: input folder and output folder for the funnel panel datasets.
DATA_DIR = Path("./data")
PANEL_DIR = Path("./data/funnel_panels")
PANEL_DIR.mkdir(parents=True, exist_ok=True)  # create the output folder if absent

# Echo the header followed by the active configuration, one print per line.
for banner_line in (
    "="*100,
    "FUNNEL ECONOMETRIC ANALYSIS - DATA PROCESSING WITH POLARS",
    "="*100,
    f"\nConfiguration:",
    f"  Data directory: {DATA_DIR}",
    f"  Panel output directory: {PANEL_DIR}",
    f"  Auction sample fraction: {AUCTION_SAMPLE_FRACTION:.1%}",
    f"  User sample fraction: {USER_SAMPLE_FRACTION:.1%}",
    f"  Random seed: {RANDOM_SEED}",
):
    print(banner_line)

In [None]:
# --- DATA LOADING WITH POLARS ---
print("\n" + "="*100)
print("SECTION 1: DATA LOADING WITH POLARS")
print("="*100)

# Define required files: logical dataset name -> parquet file inside DATA_DIR.
data_files = {
    'shopping_sessions': 'shopping_sessions.parquet',
    'browsing_sessions': 'browsing_sessions.parquet',
    'auctions_users': 'raw_sample_auctions_users.parquet',
    'auctions_results': 'raw_sample_auctions_results.parquet',
    'impressions': 'raw_sample_impressions.parquet',
    'clicks': 'raw_sample_clicks.parquet',
    'purchases': 'raw_sample_purchases.parquet',
    'catalog': 'processed_sample_catalog.parquet'
}

# Load datasets with Polars. A missing file is replaced by an empty DataFrame
# so the remaining cells can still execute, but its name is remembered so the
# closing status line does not claim success when inputs were absent.
datasets = {}
missing_datasets = []
print("\nLoading datasets:")
for name, filename in tqdm(data_files.items(), desc="Loading data"):
    filepath = DATA_DIR / filename
    if filepath.exists():
        datasets[name] = pl.read_parquet(filepath)
        shape = datasets[name].shape
        print(f"  - {name}: {shape[0]:,} rows, {shape[1]} columns")
    else:
        print(f"  - {name}: FILE NOT FOUND at {filepath}")
        datasets[name] = pl.DataFrame()
        missing_datasets.append(name)

# Extract dataframes under the names used by the model-construction cells.
df_shopping = datasets['shopping_sessions']
df_browsing = datasets['browsing_sessions']
df_auctions = datasets['auctions_users']
df_bids = datasets['auctions_results']
df_impressions = datasets['impressions']
df_clicks = datasets['clicks']
df_purchases = datasets['purchases']
df_catalog = datasets['catalog']

# Convert prices from cents to dollars (only when the frame and column exist).
if not df_purchases.is_empty() and 'UNIT_PRICE' in df_purchases.columns:
    df_purchases = df_purchases.with_columns(
        (pl.col('UNIT_PRICE') / 100).alias('UNIT_PRICE')
    )
    print("\n  Note: Purchase unit prices converted from cents to dollars")

if not df_catalog.is_empty() and 'PRICE' in df_catalog.columns:
    df_catalog = df_catalog.with_columns(
        (pl.col('PRICE') / 100).alias('PRICE')
    )
    print("  Note: Catalog prices converted from cents to dollars")

# Report success only when every expected file was actually found.
if missing_datasets:
    print(
        "\n[WARNING] Data loaded with missing datasets "
        f"(replaced by empty frames): {', '.join(missing_datasets)}"
    )
else:
    print("\n[SUCCESS] All data loaded and preprocessed")

In [None]:
# --- MODEL 1: AD EFFECTIVENESS MODEL DATA CONSTRUCTION ---
print("\n" + "="*100)
print("SECTION 2: MODEL 1 - AD EFFECTIVENESS MODEL (TOP-OF-FUNNEL)")
print("="*100)

try:
    # Columns used to match rows across the impressions, bids and clicks tables.
    join_keys = ['AUCTION_ID', 'PRODUCT_ID', 'VENDOR_ID', 'CAMPAIGN_ID']

    # Join impressions with bids for rank.
    # NOTE(review): assumes df_bids holds at most one row per key combination;
    # duplicates there would multiply impression rows - confirm upstream.
    model1_data = df_impressions.join(
        df_bids.select(join_keys + ['RANKING']),
        on=join_keys,
        how='left'
    )

    # Add click indicator: 1 when the key combination appears in df_clicks,
    # 0 otherwise (null from the left join is filled with 0).
    click_keys = df_clicks.select(join_keys).unique().with_columns(
        pl.lit(1).alias('WasClicked')
    )
    model1_data = model1_data.join(
        click_keys,
        on=join_keys,
        how='left'
    ).with_columns(
        pl.col('WasClicked').fill_null(0).cast(pl.Int8)
    )
    print(f"Click rate: {model1_data['WasClicked'].mean():.2%}")

    # Sample whole auctions. `unique()` does not guarantee an output order, so
    # the ids are sorted first and the seed is passed to `sample` directly;
    # together these make the drawn subset reproducible across runs.
    unique_auctions = model1_data['AUCTION_ID'].unique().sort()
    sampled_auctions = unique_auctions.to_frame().sample(
        fraction=AUCTION_SAMPLE_FRACTION,
        with_replacement=False,
        seed=RANDOM_SEED
    )
    model1_data_sampled = model1_data.join(
        sampled_auctions,
        on='AUCTION_ID',
        how='inner'
    )

    # Add controls.
    # NOTE: NumWinningBids_a, AvgPriceTop5_a and BrandConcentration_a are
    # constant placeholders (10.0, 50.0, 0.5), and Rank_x_AvgPrice multiplies
    # the rank by the same constant 50; none are computed from the auction data.
    model1_data_sampled = model1_data_sampled.with_columns([
        pl.col('RANKING').pow(2).alias('RankSquared'),
        pl.lit(10.0).alias('NumWinningBids_a'),
        pl.lit(50.0).alias('AvgPriceTop5_a'),
        pl.lit(0.5).alias('BrandConcentration_a'),
        (pl.col('RANKING') * 50).alias('Rank_x_AvgPrice')
    ])
    print(f"Sampled dataset: {len(model1_data_sampled):,} impressions")

    # Save
    model1_panel_path = PANEL_DIR / 'model1_ad_effectiveness.parquet'
    model1_data_sampled.write_parquet(model1_panel_path)
    print(f"Saved to: {model1_panel_path}")

except Exception as e:
    # Best-effort cell: report the failure and leave an empty panel behind.
    print(f"[ERROR] Model 1: {e}")
    model1_data_sampled = pl.DataFrame()

In [None]:
# --- MODEL 2: JOURNEY CONTINUATION MODEL DATA CONSTRUCTION ---
print("\n" + "="*100)
print("SECTION 3: MODEL 2 - JOURNEY CONTINUATION MODEL (MID-FUNNEL)")
print("="*100)

try:
    # Sort and create outcome: a session "returned" when another browsing
    # session follows it within the same user and shopping session.
    model2_data = df_browsing.sort(['user_id', 'session_start'])
    model2_data = model2_data.with_columns(
        pl.col('session_start').shift(-1).over(['user_id', 'shopping_session_id']).alias('next_session_start')
    ).with_columns(
        pl.col('next_session_start').is_not_null().cast(pl.Int8).alias('ReturnedForNextSession')
    )

    # Add variables. session_number is an explicit zero-based row index within
    # each user (rows are already sorted by session_start), so IsFirstSession
    # does not depend on the cumulative-count helper, whose name and starting
    # offset differ between Polars releases.
    model2_data = model2_data.with_columns([
        (pl.col('session_revenue_usd') > 0).cast(pl.Int8).alias('MadePurchase'),
        pl.col('unique_products').alias('VarietyProductsClicked'),
        pl.int_range(0, pl.count()).over('user_id').alias('session_number')
    ]).with_columns(
        (pl.col('session_number') == 0).cast(pl.Int8).alias('IsFirstSession')
    )

    # Add time since last session.
    # NOTE: prev_session_end is derived, but TimeSinceLastSession is still a
    # constant 0.0 placeholder ("Simplified"); it carries no information yet.
    model2_data = model2_data.with_columns(
        pl.col('session_end').shift(1).over('user_id').alias('prev_session_end')
    ).with_columns(
        pl.lit(0.0).alias('TimeSinceLastSession')  # Simplified
    )

    # Add day of week; string timestamps are parsed first (non-strict, so
    # values that do not match the format become null instead of raising).
    if model2_data['session_start'].dtype == pl.Utf8:
        model2_data = model2_data.with_columns(
            pl.col('session_start').str.strptime(pl.Datetime, format="%Y-%m-%dT%H:%M:%S%.f%z", strict=False)
        )
    model2_data = model2_data.with_columns(
        pl.col('session_start').dt.weekday().alias('dayofweek')
    )
    print(f"Return rate: {model2_data['ReturnedForNextSession'].mean():.2%}")

    # Sample whole users. `unique()` does not guarantee an output order, so
    # the ids are sorted first and the seed is passed to `sample` directly;
    # together these make the drawn subset reproducible across runs.
    unique_users = model2_data['user_id'].unique().sort()
    sampled_users = unique_users.to_frame().sample(
        fraction=USER_SAMPLE_FRACTION,
        with_replacement=False,
        seed=RANDOM_SEED
    )
    model2_data_sampled = model2_data.join(
        sampled_users,
        on='user_id',
        how='inner'
    )
    print(f"Sampled dataset: {len(model2_data_sampled):,} browsing sessions")

    # Save
    model2_panel_path = PANEL_DIR / 'model2_journey_continuation.parquet'
    model2_data_sampled.write_parquet(model2_panel_path)
    print(f"Saved to: {model2_panel_path}")

except Exception as e:
    # Best-effort cell: report the failure and leave an empty panel behind.
    print(f"[ERROR] Model 2: {e}")
    model2_data_sampled = pl.DataFrame()

In [None]:
# --- MODEL 3: FINAL CONVERSION MODEL DATA CONSTRUCTION ---
print("\n" + "="*100)
print("SECTION 4: MODEL 3 - FINAL CONVERSION MODEL (BOTTOM-OF-FUNNEL)")
print("="*100)

try:
    # Prepare variables: expose the shopping-session columns under model names.
    model3_data = df_shopping.with_columns([
        pl.col('did_purchase').cast(pl.Int8).alias('DidPurchase'),
        pl.col('num_browsing_sessions').alias('NumBrowsingSessions'),
        pl.col('total_clicks').alias('TotalClicks'),
        pl.col('shopping_duration_days').alias('TotalDurationDays')
    ])

    # Add derived variables.
    # NOTE: VarietyVendorsClicked (2) and AvgPriceClickedItems (50.0) are
    # constant placeholders ("Simplified"), not computed from the data.
    model3_data = model3_data.with_columns([
        # +0.01 keeps the denominator non-zero for zero-duration sessions.
        (pl.col('NumBrowsingSessions') / (pl.col('TotalDurationDays') + 0.01)).alias('SessionDensity'),
        pl.lit(2).alias('VarietyVendorsClicked'),  # Simplified
        pl.lit(50.0).alias('AvgPriceClickedItems'),  # Simplified
        (pl.col('NumBrowsingSessions') * pl.col('TotalClicks')).alias('Sessions_x_Clicks')
    ])

    # Add time identifiers; string timestamps are parsed first (non-strict, so
    # values that do not match the format become null instead of raising).
    if model3_data['shopping_start'].dtype == pl.Utf8:
        model3_data = model3_data.with_columns(
            pl.col('shopping_start').str.strptime(pl.Datetime, format="%Y-%m-%dT%H:%M:%S%.f%z", strict=False)
        )
    # `dt.week()` is the ISO week number, which around New Year can belong to
    # a different year than the calendar `dt.year()`. `iso_year` is therefore
    # added as the matching year for week-level grouping; `year` is kept
    # unchanged for existing consumers.
    model3_data = model3_data.with_columns([
        pl.col('shopping_start').dt.week().alias('week_of_year'),
        pl.col('shopping_start').dt.year().alias('year'),
        pl.col('shopping_start').dt.iso_year().alias('iso_year')
    ])
    print(f"Conversion rate: {model3_data['DidPurchase'].mean():.2%}")

    # Sample whole users. `unique()` does not guarantee an output order, so
    # the ids are sorted first and the seed is passed to `sample` directly;
    # together these make the drawn subset reproducible across runs.
    unique_users = model3_data['user_id'].unique().sort()
    sampled_users = unique_users.to_frame().sample(
        fraction=USER_SAMPLE_FRACTION,
        with_replacement=False,
        seed=RANDOM_SEED
    )
    model3_data_sampled = model3_data.join(
        sampled_users,
        on='user_id',
        how='inner'
    )

    # Filter to multi-session users: keep only users that contribute more
    # than one shopping session to the sampled data.
    user_counts = model3_data_sampled.group_by('user_id').agg(pl.count().alias('n'))
    multi_users = user_counts.filter(pl.col('n') > 1)['user_id']
    model3_data_multi = model3_data_sampled.join(
        multi_users.to_frame(),
        on='user_id',
        how='inner'
    )
    print(f"Final dataset: {len(model3_data_multi):,} shopping sessions")

    # Save
    model3_panel_path = PANEL_DIR / 'model3_final_conversion.parquet'
    model3_data_multi.write_parquet(model3_panel_path)
    print(f"Saved to: {model3_panel_path}")

except Exception as e:
    # Best-effort cell: report the failure and leave an empty panel behind.
    print(f"[ERROR] Model 3: {e}")
    model3_data_multi = pl.DataFrame()

In [None]:
# --- SUMMARY ---
import os

# Closing banner for the panel-construction part.
print("\n" + "="*100)
print("PANEL CREATION COMPLETE")
print("="*100)

# For each expected panel file, print its on-disk size in MB, or flag it as
# not created when the file is absent from PANEL_DIR.
expected_panels = (
    "model1_ad_effectiveness.parquet",
    "model2_journey_continuation.parquet",
    "model3_final_conversion.parquet",
)
for filename in expected_panels:
    panel_file = PANEL_DIR / filename
    if not panel_file.exists():
        print(f"✗ {filename}: not created")
        continue
    size_mb = panel_file.stat().st_size / (1024 * 1024)
    print(f"✓ {filename}: {size_mb:.2f} MB")

---

## PART 2: ECONOMETRIC MODELING

Load the saved panels and run econometric models.

In [None]:
# PART 2: ECONOMETRIC MODELS
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
from pathlib import Path
import warnings
# Suppress every Python warning raised from this point on.
warnings.filterwarnings("ignore")

# Folder the panel parquet files are read from.
PANEL_DIR = Path("./data/funnel_panels")

# Panel key -> parquet file expected under PANEL_DIR.
panel_files = {
    "model1": "model1_ad_effectiveness.parquet",
    "model2": "model2_journey_continuation.parquet",
    "model3": "model3_final_conversion.parquet",
}

# Load panels: every key always ends up in `panels`; a file that does not
# exist is represented by an empty DataFrame.
panels = {}
for name, parquet_name in panel_files.items():
    panel_path = PANEL_DIR / parquet_name
    if not panel_path.exists():
        panels[name] = pd.DataFrame()
        print(f"Missing {name}")
        continue
    panels[name] = pd.read_parquet(panel_path)
    print(f"Loaded {name}: {len(panels[name]):,} rows")

In [None]:
# Run simple models for testing
def _fit_and_report(panel, label, heading, formula, terms):
    """Fit a logit model on one panel and print the selected coefficients.

    Args:
        panel: pandas DataFrame holding the columns named in ``formula``.
        label: Short model name used in failure and warning messages.
        heading: Line printed before the coefficient report.
        formula: Formula string handed to ``smf.logit``.
        terms: Iterable of ``(description, column)`` pairs; for each pair the
            coefficient and p-value of ``column`` are printed.

    Returns:
        The fitted results object, or ``None`` when the panel is empty or
        when fitting/reporting raised an exception.
    """
    if panel.empty:
        return None
    try:
        fitted = smf.logit(formula, data=panel).fit(disp=False)
        print(heading)
        for description, column in terms:
            print(f"  {description}: {fitted.params[column]:.4f} (p={fitted.pvalues[column]:.4f})")
        # disp=False together with the global warnings filter hides optimizer
        # messages, so surface a non-converged fit explicitly.
        retvals = getattr(fitted, "mle_retvals", None) or {}
        if not retvals.get("converged", True):
            print(f"  [WARNING] {label} optimizer did not converge; estimates may be unreliable")
        return fitted
    except Exception as e:
        # Best-effort: one failing model must not stop the remaining ones.
        print(f"{label} failed: {e}")
        return None


results = {}

# Model 1
model1 = _fit_and_report(
    panels["model1"],
    "Model 1",
    "Model 1: Ad Effectiveness",
    "WasClicked ~ RANKING + RankSquared",
    [("Rank effect", "RANKING")],
)
if model1 is not None:
    results["model1"] = model1

# Model 2
model2 = _fit_and_report(
    panels["model2"],
    "Model 2",
    "\nModel 2: Journey Continuation",
    "ReturnedForNextSession ~ num_clicks + MadePurchase",
    [("Click effect", "num_clicks"), ("Purchase satiation", "MadePurchase")],
)
if model2 is not None:
    results["model2"] = model2

# Model 3
model3 = _fit_and_report(
    panels["model3"],
    "Model 3",
    "\nModel 3: Final Conversion",
    "DidPurchase ~ NumBrowsingSessions + TotalClicks",
    [("Sessions effect", "NumBrowsingSessions"), ("Clicks effect", "TotalClicks")],
)
if model3 is not None:
    results["model3"] = model3

print(f"\nSuccessfully estimated {len(results)} models")