# Funnel Econometric Analysis with Polars - Ad Platform Incrementality
#
# This notebook implements three fixed-effects models to analyze the advertising funnel:
#
# 1. **Model 1: Ad Effectiveness Model** (Top-of-Funnel) - Individual ad impression analysis
# 2. **Model 2: Journey Continuation Model** (Mid-Funnel) - Browsing session continuation analysis
# 3. **Model 3: Final Conversion Model** (Bottom-of-Funnel) - Shopping session conversion analysis
#
# **Part 1**: Data processing with Polars and panel creation
# **Part 2**: Econometric modeling with saved panels
# **Output**: All results saved to `reports/funnel_econometric_results.txt`

## PART 1: DATA PROCESSING WITH POLARS

# PART 1: DATA PROCESSING WITH POLARS
# --- SETUP & ENVIRONMENT ---
# Imports, output locations and sampling knobs shared by every Part 1 cell.

from datetime import datetime, timedelta
from pathlib import Path
import warnings

import numpy as np
import polars as pl
from tqdm import tqdm

# All warnings are silenced for the whole notebook run.
warnings.filterwarnings('ignore')

# Configuration
DATA_DIR = Path('./data')
PANEL_DIR = Path('./data/funnel_panels')  # Directory for funnel panel datasets
PANEL_DIR.mkdir(exist_ok=True, parents=True)

# Sampling parameters
AUCTION_SAMPLE_FRACTION = 0.10  # Sample 10% of auctions for Model 1
USER_SAMPLE_FRACTION = 0.05     # Sample 5% of users for Models 2 & 3
RANDOM_SEED = 42

# Emit the run banner and the effective configuration in one call; with
# sep="\n" the output matches one print() per line.
print(
    "="*100,
    "FUNNEL ECONOMETRIC ANALYSIS - DATA PROCESSING WITH POLARS",
    "="*100,
    f"\nConfiguration:",
    f"  Data directory: {DATA_DIR}",
    f"  Panel output directory: {PANEL_DIR}",
    f"  Auction sample fraction: {AUCTION_SAMPLE_FRACTION:.1%}",
    f"  User sample fraction: {USER_SAMPLE_FRACTION:.1%}",
    f"  Random seed: {RANDOM_SEED}",
    sep="\n",
)

# --- DATA LOADING WITH POLARS ---
# Reads every input parquet file into a Polars DataFrame, rescales prices from
# cents to dollars, and parses string timestamps into datetimes.
print("\n" + "="*100)
print("SECTION 1: DATA LOADING WITH POLARS")
print("="*100)

# Define required files
data_files = {
    'shopping_sessions': 'shopping_sessions.parquet',
    'browsing_sessions': 'browsing_sessions.parquet',
    'auctions_users': 'raw_sample_auctions_users.parquet',
    'auctions_results': 'raw_sample_auctions_results.parquet',
    'impressions': 'raw_sample_impressions.parquet',
    'clicks': 'raw_sample_clicks.parquet',
    'purchases': 'raw_sample_purchases.parquet',
    'catalog': 'processed_sample_catalog.parquet'
}

# Load datasets with Polars. A missing file yields an empty DataFrame so that
# downstream cells can test `.is_empty()` instead of crashing on a KeyError.
datasets = {}
print("\nLoading datasets:")
for name, filename in tqdm(data_files.items(), desc="Loading data"):
    filepath = DATA_DIR / filename
    if filepath.exists():
        datasets[name] = pl.read_parquet(filepath)
        shape = datasets[name].shape
        print(f"  - {name}: {shape[0]:,} rows, {shape[1]} columns")
    else:
        print(f"  - {name}: FILE NOT FOUND at {filepath}")
        datasets[name] = pl.DataFrame()

# Extract dataframes
df_shopping = datasets['shopping_sessions']
df_browsing = datasets['browsing_sessions']
df_auctions = datasets['auctions_users']
df_bids = datasets['auctions_results']
df_impressions = datasets['impressions']
df_clicks = datasets['clicks']
df_purchases = datasets['purchases']
df_catalog = datasets['catalog']

# Convert prices from cents to dollars
if not df_purchases.is_empty() and 'UNIT_PRICE' in df_purchases.columns:
    df_purchases = df_purchases.with_columns(
        (pl.col('UNIT_PRICE') / 100).alias('UNIT_PRICE')
    )
    print("\n  Note: Purchase unit prices converted from cents to dollars")
if not df_catalog.is_empty() and 'PRICE' in df_catalog.columns:
    df_catalog = df_catalog.with_columns(
        (pl.col('PRICE') / 100).alias('PRICE')
    )
    print("  Note: Catalog prices converted from cents to dollars")


def _parse_timestamp_column(df, column):
    """Return ``df`` with ``column`` parsed from a string into ``pl.Datetime``.

    The frame is returned unchanged when it is empty, lacks ``column``, or the
    column is not a string column (e.g. it was already stored as a datetime).
    Values that do not match the format become null (``strict=False``).
    """
    if df.is_empty() or column not in df.columns:
        return df
    if df[column].dtype != pl.Utf8:
        return df
    return df.with_columns(
        pl.col(column).str.strptime(pl.Datetime, format="%Y-%m-%dT%H:%M:%S%.f%z", strict=False)
    )


# Convert timestamps.
# Each frame is rebound explicitly: the previous loop located frames with
# list.index(), which relies on DataFrame `==` (element-wise, not a boolean),
# and it only wrote the converted frame back into `datasets`, so the df_*
# variables used by the later cells were never actually converted.
df_auctions = _parse_timestamp_column(df_auctions, 'CREATED_AT')
df_bids = _parse_timestamp_column(df_bids, 'CREATED_AT')
df_impressions = _parse_timestamp_column(df_impressions, 'OCCURRED_AT')
df_clicks = _parse_timestamp_column(df_clicks, 'OCCURRED_AT')
df_purchases = _parse_timestamp_column(df_purchases, 'PURCHASED_AT')

# Keep the lookup dict in sync with the preprocessed frames.
datasets.update({
    'auctions_users': df_auctions,
    'auctions_results': df_bids,
    'impressions': df_impressions,
    'clicks': df_clicks,
    'purchases': df_purchases,
    'catalog': df_catalog,
})

print("\n[SUCCESS] All data loaded and preprocessed")

# --- MODEL 1: AD EFFECTIVENESS MODEL DATA CONSTRUCTION ---
# Builds an impression-level panel: one row per impression with its auction
# rank, a clicked/not-clicked outcome and auction-level controls, then keeps a
# random sample of auctions and writes the result to PANEL_DIR.
print("\n" + "="*100)
print("SECTION 2: MODEL 1 - AD EFFECTIVENESS MODEL (TOP-OF-FUNNEL)")
print("="*100)
print("\nUnit of Analysis: Individual Ad Impression")
print("Equation: WasClicked_iva = δₐ + γ_v + β₁·Rank_iva + [Controls] + ε_iva")
print("\nData Construction Steps:")

try:
    # Step 1: Start with impressions as base table
    print("\n1. Starting with impressions table as base...")
    model1_data = df_impressions
    print(f"   Base impressions: {len(model1_data):,} rows")

    # Step 2: Add rank information from bids
    print("\n2. Adding rank information from bids table...")
    join_keys = ['AUCTION_ID', 'PRODUCT_ID', 'VENDOR_ID', 'CAMPAIGN_ID']
    # NOTE(review): if df_bids can hold several rows per join key, this left
    # join duplicates impressions — confirm the bids table is unique on join_keys.
    model1_data = model1_data.join(
        df_bids.select(join_keys + ['RANKING']),
        on=join_keys,
        how='left'
    )
    print(f"   Successfully joined rank data: {model1_data['RANKING'].is_not_null().sum():,} matches")
    print(f"   Missing ranks: {model1_data['RANKING'].is_null().sum():,} impressions")

    # Step 3: Add outcome variable (WasClicked)
    print("\n3. Creating outcome variable WasClicked...")
    # De-duplicated click keys so the join cannot multiply impression rows.
    click_keys = df_clicks.select(join_keys).unique().with_columns(
        pl.lit(1).alias('WasClicked')
    )
    model1_data = model1_data.join(
        click_keys,
        on=join_keys,
        how='left'
    ).with_columns(
        pl.col('WasClicked').fill_null(0).cast(pl.Int8)
    )
    print(f"   Click rate: {model1_data['WasClicked'].mean():.2%}")

    # Step 4: Engineer auction-level controls
    print("\n4. Engineering auction-level control variables...")

    # 4a. NumWinningBids per auction (0 for auctions with no winning bid rows)
    winning_bids_count = df_bids.filter(pl.col('IS_WINNER') == True).group_by('AUCTION_ID').agg(
        pl.count().alias('NumWinningBids_a')
    )
    model1_data = model1_data.join(winning_bids_count, on='AUCTION_ID', how='left').with_columns(
        pl.col('NumWinningBids_a').fill_null(0)
    )
    print(f"   Added NumWinningBids_a: mean={model1_data['NumWinningBids_a'].mean():.1f}")

    # 4b. Average price of top 5 products
    if not df_catalog.is_empty():
        top5_bids = df_bids.filter(pl.col('RANKING') <= 5)
        top5_with_price = top5_bids.join(
            df_catalog.select(['PRODUCT_ID', 'PRICE']),
            on='PRODUCT_ID',
            how='left'
        )
        avg_price_top5 = top5_with_price.group_by('AUCTION_ID').agg(
            pl.col('PRICE').mean().alias('AvgPriceTop5_a')
        )
        model1_data = model1_data.join(avg_price_top5, on='AUCTION_ID', how='left')
        # Auctions without any priced top-5 product get the sample median.
        median_price = model1_data['AvgPriceTop5_a'].median()
        model1_data = model1_data.with_columns(
            pl.col('AvgPriceTop5_a').fill_null(median_price)
        )
        print(f"   Added AvgPriceTop5_a: mean=${model1_data['AvgPriceTop5_a'].mean():.2f}")
    else:
        model1_data = model1_data.with_columns(
            pl.lit(50.0).alias('AvgPriceTop5_a')
        )
        print("   AvgPriceTop5_a: Using default value $50 (catalog not available)")

    # 4c. Brand concentration (Herfindahl-Hirschman Index)
    top5_vendors = df_bids.filter(pl.col('RANKING') <= 5)
    # HHI per auction = sum over vendors of (vendor's share of top-5 slots)^2.
    # Shares are computed with a window sum rather than list.eval, because
    # list.eval cannot reference another column such as the per-auction total.
    vendor_counts = top5_vendors.group_by(['AUCTION_ID', 'VENDOR_ID']).agg(
        pl.count().alias('vendor_count')
    )
    brand_concentration = vendor_counts.with_columns(
        (pl.col('vendor_count') / pl.col('vendor_count').sum().over('AUCTION_ID')).alias('vendor_share')
    ).group_by('AUCTION_ID').agg(
        (pl.col('vendor_share') ** 2).sum().alias('BrandConcentration_a')
    )
    # 0.2 is the HHI of five equally represented vendors; used when an auction
    # has no top-5 bid rows.
    model1_data = model1_data.join(brand_concentration, on='AUCTION_ID', how='left').with_columns(
        pl.col('BrandConcentration_a').fill_null(0.2)
    )
    print(f"   Added BrandConcentration_a: mean={model1_data['BrandConcentration_a'].mean():.3f}")

    # Step 5: Add non-linear and interaction terms
    print("\n5. Creating non-linear and interaction terms...")
    model1_data = model1_data.with_columns([
        pl.col('RANKING').pow(2).alias('RankSquared'),
        (pl.col('RANKING') * pl.col('AvgPriceTop5_a')).alias('Rank_x_AvgPrice')
    ])
    print("   Added RankSquared and Rank_x_AvgPrice interaction")

    # Step 6: Auction-based sampling
    print("\n6. Performing auction-based sampling...")
    pl.set_random_seed(RANDOM_SEED)
    # unique() does not guarantee an order; sorting first makes the seeded
    # sample reproducible across runs.
    unique_auctions = model1_data['AUCTION_ID'].unique().sort()
    sampled_auctions = unique_auctions.sample(
        fraction=AUCTION_SAMPLE_FRACTION, with_replacement=False, seed=RANDOM_SEED
    )
    # sampled_auctions is a Series, which has no .select(); convert it to a
    # one-column frame to use it as the right side of the join.
    model1_data_sampled = model1_data.join(
        sampled_auctions.to_frame(),
        on='AUCTION_ID',
        how='inner'
    )
    print(f"   Sampled {len(sampled_auctions):,} auctions ({AUCTION_SAMPLE_FRACTION:.1%})")
    print(f"   Final dataset: {len(model1_data_sampled):,} impressions")
    print(f"   Unique vendors: {model1_data_sampled['VENDOR_ID'].n_unique():,}")

    # Step 7: Final data validation and preparation
    print("\n7. Final data validation...")
    model1_data_sampled = model1_data_sampled.drop_nulls(subset=['RANKING', 'VENDOR_ID', 'AUCTION_ID'])
    print(f"   After removing missing values: {len(model1_data_sampled):,} rows")

    # Save Model 1 panel
    model1_panel_path = PANEL_DIR / 'model1_ad_effectiveness.parquet'
    model1_data_sampled.write_parquet(model1_panel_path)
    print(f"\n[SUCCESS] Model 1 panel saved to: {model1_panel_path}")

    # Summary statistics
    print(f"\nSummary Statistics for Model 1:")
    print(f"  Click rate: {model1_data_sampled['WasClicked'].mean():.2%}")
    print(f"  Average rank: {model1_data_sampled['RANKING'].mean():.1f}")
    print(f"  Average price (top 5): ${model1_data_sampled['AvgPriceTop5_a'].mean():.2f}")
    print(f"  Average winning bids per auction: {model1_data_sampled['NumWinningBids_a'].mean():.1f}")
    print(f"  Average brand concentration: {model1_data_sampled['BrandConcentration_a'].mean():.3f}")
except Exception as e:
    # Best-effort cell: report the failure and leave an empty panel so the
    # remaining cells can still run.
    print(f"\n[ERROR] Model 1 data construction failed: {str(e)}")
    model1_data_sampled = pl.DataFrame()

# --- MODEL 2: JOURNEY CONTINUATION MODEL DATA CONSTRUCTION ---
# Builds a browsing-session-level panel: whether the user came back for another
# browsing session within the same shopping session, plus session controls,
# then keeps a random sample of users and writes the result to PANEL_DIR.
print("\n" + "="*100)
print("SECTION 3: MODEL 2 - JOURNEY CONTINUATION MODEL (MID-FUNNEL)")
print("="*100)
print("\nUnit of Analysis: Browsing Session")
print("Equation: ReturnedForNextSession_is = αᵢ + τ_d + β₁·NumClicks_is + β₂·MadePurchase_is + [Controls] + ε_isd")
print("\nData Construction Steps:")

try:
    # Step 1: Start with browsing sessions as base table
    print("\n1. Starting with browsing sessions as base...")
    model2_data = df_browsing
    print(f"   Base browsing sessions: {len(model2_data):,} rows")

    # Step 2: Engineer outcome variable (ReturnedForNextSession)
    print("\n2. Creating outcome variable ReturnedForNextSession...")
    # Sort by user and session start time; every shift() below relies on it.
    model2_data = model2_data.sort(['user_id', 'session_start'])
    # Within each user and shopping session, check if there's a next browsing session
    model2_data = model2_data.with_columns(
        pl.col('session_start').shift(-1).over(['user_id', 'shopping_session_id']).alias('next_session_start')
    ).with_columns(
        pl.col('next_session_start').is_not_null().cast(pl.Int8).alias('ReturnedForNextSession')
    )
    print(f"   Return rate: {model2_data['ReturnedForNextSession'].mean():.2%}")

    # Step 3: Add key independent variables
    print("\n3. Adding key independent variables...")
    model2_data = model2_data.with_columns(
        (pl.col('session_revenue_usd') > 0).cast(pl.Int8).alias('MadePurchase')
    )
    print(f"   Purchase rate per session: {model2_data['MadePurchase'].mean():.2%}")
    print(f"   Average clicks per session: {model2_data['num_clicks'].mean():.2f}")

    # Step 4: Engineer control variables
    print("\n4. Engineering control variables...")

    # 4a. SessionDurationMinutes (already in data as duration_minutes)
    print(f"   SessionDurationMinutes: mean={model2_data['duration_minutes'].mean():.1f} min")

    # 4b. VarietyProductsClicked
    # Reuse the precomputed unique_products column when present; otherwise fall
    # back to a constant default of 3. (The clicks table was never consulted
    # here, so the former branch on df_clicks was redundant.)
    if 'unique_products' in model2_data.columns:
        variety_expr = pl.col('unique_products')
    else:
        variety_expr = pl.lit(3)
    model2_data = model2_data.with_columns(
        variety_expr.alias('VarietyProductsClicked')
    )
    print(f"   VarietyProductsClicked: mean={model2_data['VarietyProductsClicked'].mean():.2f} products")

    # 4c. NumImpressions (already in data)
    print(f"   NumImpressions: mean={model2_data['num_impressions'].mean():.1f}")

    # 4d. IsFirstSession
    # A 0-based row index within each user. int_range is used instead of
    # cumcount()/cum_count(), whose start value differs between Polars
    # versions and would silently break the `== 0` test below.
    model2_data = model2_data.with_columns(
        pl.int_range(0, pl.count()).over('user_id').alias('session_number')
    ).with_columns(
        (pl.col('session_number') == 0).cast(pl.Int8).alias('IsFirstSession')
    )
    print(f"   First session rate: {model2_data['IsFirstSession'].mean():.2%}")

    # 4e. TimeSinceLastSession
    model2_data = model2_data.with_columns(
        pl.col('session_end').shift(1).over('user_id').alias('prev_session_end')
    )
    # Convert to datetime if needed and calculate time difference
    if model2_data['session_start'].dtype == pl.Utf8:
        model2_data = model2_data.with_columns([
            pl.col('session_start').str.strptime(pl.Datetime, format="%Y-%m-%dT%H:%M:%S%.f%z", strict=False),
            pl.col('prev_session_end').str.strptime(pl.Datetime, format="%Y-%m-%dT%H:%M:%S%.f%z", strict=False)
        ])
    # Gap in hours; a user's first session has no predecessor and is set to 0.
    model2_data = model2_data.with_columns(
        ((pl.col('session_start') - pl.col('prev_session_end')).dt.total_seconds() / 3600)
        .fill_null(0)
        .alias('TimeSinceLastSession')
    )
    print(f"   TimeSinceLastSession: mean={model2_data.filter(pl.col('TimeSinceLastSession') > 0)['TimeSinceLastSession'].mean():.1f} hours")

    # Step 5: Add fixed effects identifiers
    print("\n5. Adding fixed effects identifiers...")
    model2_data = model2_data.with_columns([
        pl.col('session_start').dt.date().alias('session_date'),
        pl.col('session_start').dt.weekday().alias('dayofweek')
    ])
    print("   Added date and day-of-week for time fixed effects")

    # Step 6: User-based sampling
    print("\n6. Performing user-based sampling...")
    pl.set_random_seed(RANDOM_SEED)
    # unique() does not guarantee an order; sorting first makes the seeded
    # sample reproducible across runs.
    unique_users = model2_data['user_id'].unique().sort()
    sampled_users = unique_users.sample(
        fraction=USER_SAMPLE_FRACTION, with_replacement=False, seed=RANDOM_SEED
    )
    # sampled_users is a Series, which has no .select(); convert it to a
    # one-column frame to use it as the right side of the join.
    model2_data_sampled = model2_data.join(
        sampled_users.to_frame(),
        on='user_id',
        how='inner'
    )
    print(f"   Sampled {len(sampled_users):,} users ({USER_SAMPLE_FRACTION:.1%})")
    print(f"   Final dataset: {len(model2_data_sampled):,} browsing sessions")
    # max(..., 1) avoids a ZeroDivisionError when the sample is empty.
    print(f"   Average sessions per user: {len(model2_data_sampled) / max(len(sampled_users), 1):.2f}")

    # Step 7: Final data validation
    print("\n7. Final data validation...")
    required_cols = ['user_id', 'browsing_session_id', 'num_clicks', 'ReturnedForNextSession']
    model2_data_sampled = model2_data_sampled.drop_nulls(subset=required_cols)
    print(f"   After removing missing values: {len(model2_data_sampled):,} rows")

    # Save Model 2 panel
    model2_panel_path = PANEL_DIR / 'model2_journey_continuation.parquet'
    model2_data_sampled.write_parquet(model2_panel_path)
    print(f"\n[SUCCESS] Model 2 panel saved to: {model2_panel_path}")

    # Summary statistics
    print(f"\nSummary Statistics for Model 2:")
    print(f"  Return rate: {model2_data_sampled['ReturnedForNextSession'].mean():.2%}")
    print(f"  Purchase rate: {model2_data_sampled['MadePurchase'].mean():.2%}")
    print(f"  Average clicks: {model2_data_sampled['num_clicks'].mean():.2f}")
    print(f"  Average duration: {model2_data_sampled['duration_minutes'].mean():.1f} minutes")
    print(f"  Average products viewed: {model2_data_sampled['VarietyProductsClicked'].mean():.2f}")
except Exception as e:
    # Best-effort cell: report the failure and leave an empty panel so the
    # remaining cells can still run.
    print(f"\n[ERROR] Model 2 data construction failed: {str(e)}")
    model2_data_sampled = pl.DataFrame()

# --- MODEL 3: FINAL CONVERSION MODEL DATA CONSTRUCTION ---
# Builds a shopping-session-level panel: purchase outcome, engagement measures
# and controls, then keeps a random sample of users, restricts to users with
# more than one shopping session, and writes the result to PANEL_DIR.
print("\n" + "="*100)
print("SECTION 4: MODEL 3 - FINAL CONVERSION MODEL (BOTTOM-OF-FUNNEL)")
print("="*100)
print("\nUnit of Analysis: Shopping Session")
print("Equation: DidPurchase_is = αᵢ + τ_w + β₁·NumBrowsingSessions_is + β₂·TotalClicks_is + [Controls] + ε_isw")
print("\nData Construction Steps:")

try:
    # Step 1: Start with shopping sessions as base table
    print("\n1. Starting with shopping sessions as base...")
    model3_data = df_shopping
    print(f"   Base shopping sessions: {len(model3_data):,} rows")

    # Step 2: Outcome variable already exists
    print("\n2. Verifying outcome variable DidPurchase...")
    model3_data = model3_data.with_columns(
        pl.col('did_purchase').cast(pl.Int8).alias('DidPurchase')
    )
    print(f"   Conversion rate: {model3_data['DidPurchase'].mean():.2%}")

    # Step 3: Key independent variables
    print("\n3. Preparing key independent variables...")
    model3_data = model3_data.with_columns([
        pl.col('num_browsing_sessions').alias('NumBrowsingSessions'),
        pl.col('total_clicks').alias('TotalClicks')
    ])
    print(f"   Average browsing sessions per shopping session: {model3_data['NumBrowsingSessions'].mean():.2f}")
    print(f"   Average total clicks: {model3_data['TotalClicks'].mean():.2f}")

    # Step 4: Engineer control variables
    print("\n4. Engineering control variables...")

    # 4a. TotalDurationDays
    model3_data = model3_data.with_columns(
        pl.col('shopping_duration_days').alias('TotalDurationDays')
    )
    print(f"   TotalDurationDays: mean={model3_data['TotalDurationDays'].mean():.2f} days")

    # 4b. SessionDensity (+0.01 keeps zero-length sessions from dividing by zero)
    model3_data = model3_data.with_columns(
        (pl.col('NumBrowsingSessions') / (pl.col('TotalDurationDays') + 0.01)).alias('SessionDensity')
    )
    print(f"   SessionDensity: mean={model3_data['SessionDensity'].mean():.2f} sessions/day")

    # 4c. VarietyVendorsClicked
    # NOTE(review): this and AvgPriceClickedItems below are aggregated per
    # USER, not per shopping session, so every session of a user gets the same
    # value. With user fixed effects such columns are likely not separately
    # identified — confirm whether a per-session aggregate was intended.
    print("\n   Calculating VarietyVendorsClicked...")
    if not df_clicks.is_empty():
        # Aggregate vendor variety per user. The click timestamp is not used by
        # this aggregate, so it is no longer parsed here.
        vendor_variety = df_clicks.group_by('USER_ID').agg(
            pl.col('VENDOR_ID').n_unique().alias('VarietyVendorsClicked')
        )
        model3_data = model3_data.join(
            vendor_variety.rename({'USER_ID': 'user_id'}),
            on='user_id',
            how='left'
        ).with_columns(
            pl.col('VarietyVendorsClicked').fill_null(0)
        )
    else:
        model3_data = model3_data.with_columns(
            pl.lit(0).alias('VarietyVendorsClicked')
        )
    print(f"   VarietyVendorsClicked: mean={model3_data['VarietyVendorsClicked'].mean():.2f} vendors")

    # 4d. AvgPriceClickedItems
    print("\n   Calculating AvgPriceClickedItems...")
    if not df_clicks.is_empty() and not df_catalog.is_empty():
        # Join clicks with catalog to get prices
        clicks_with_price = df_clicks.join(
            df_catalog.select(['PRODUCT_ID', 'PRICE']),
            on='PRODUCT_ID',
            how='left'
        )
        # Aggregate average price per user; users without priced clicks get 50.
        avg_price_per_user = clicks_with_price.group_by('USER_ID').agg(
            pl.col('PRICE').mean().alias('AvgPriceClickedItems')
        )
        model3_data = model3_data.join(
            avg_price_per_user.rename({'USER_ID': 'user_id'}),
            on='user_id',
            how='left'
        ).with_columns(
            pl.col('AvgPriceClickedItems').fill_null(50)
        )
    else:
        model3_data = model3_data.with_columns(
            pl.lit(50.0).alias('AvgPriceClickedItems')
        )
    print(f"   AvgPriceClickedItems: mean=${model3_data['AvgPriceClickedItems'].mean():.2f}")

    # Step 5: Add interaction term
    print("\n5. Creating interaction terms...")
    model3_data = model3_data.with_columns(
        (pl.col('NumBrowsingSessions') * pl.col('TotalClicks')).alias('Sessions_x_Clicks')
    )
    print("   Added Sessions_x_Clicks interaction term")

    # Step 6: Add fixed effects identifiers
    print("\n6. Adding fixed effects identifiers...")
    # Convert shopping_start to datetime if needed
    if model3_data['shopping_start'].dtype == pl.Utf8:
        model3_data = model3_data.with_columns(
            pl.col('shopping_start').str.strptime(pl.Datetime, format="%Y-%m-%dT%H:%M:%S%.f%z", strict=False)
        )
    model3_data = model3_data.with_columns([
        pl.col('shopping_start').dt.week().alias('week_of_year'),
        pl.col('shopping_start').dt.year().alias('year')
    ])
    print("   Added week and year for time fixed effects")

    # Step 7: User-based sampling
    print("\n7. Performing user-based sampling...")
    pl.set_random_seed(RANDOM_SEED)
    # unique() does not guarantee an order; sorting first makes the seeded
    # sample reproducible across runs.
    unique_users = model3_data['user_id'].unique().sort()
    sampled_users = unique_users.sample(
        fraction=USER_SAMPLE_FRACTION, with_replacement=False, seed=RANDOM_SEED
    )
    # sampled_users is a Series, which has no .select(); convert it to a
    # one-column frame to use it as the right side of the join.
    model3_data_sampled = model3_data.join(
        sampled_users.to_frame(),
        on='user_id',
        how='inner'
    )
    print(f"   Sampled {len(sampled_users):,} users ({USER_SAMPLE_FRACTION:.1%})")
    print(f"   Final dataset: {len(model3_data_sampled):,} shopping sessions")
    # max(..., 1) avoids a ZeroDivisionError when the sample is empty.
    print(f"   Average sessions per user: {len(model3_data_sampled) / max(len(sampled_users), 1):.2f}")

    # Step 8: Final data validation
    print("\n8. Final data validation...")
    required_cols = ['user_id', 'shopping_session_id', 'DidPurchase', 'NumBrowsingSessions', 'TotalClicks']
    model3_data_sampled = model3_data_sampled.drop_nulls(subset=required_cols)
    print(f"   After removing missing values: {len(model3_data_sampled):,} rows")

    # Filter to users with multiple shopping sessions for fixed effects
    user_session_counts = model3_data_sampled.group_by('user_id').agg(pl.count().alias('session_count'))
    multi_session_users = user_session_counts.filter(pl.col('session_count') > 1)['user_id']
    model3_data_multi = model3_data_sampled.join(
        multi_session_users.to_frame(),
        on='user_id',
        how='inner'
    )
    print(f"   Users with multiple sessions: {len(multi_session_users):,}")
    print(f"   Observations for fixed effects model: {len(model3_data_multi):,}")

    # Save Model 3 panel (the multi-session subset is what gets persisted)
    model3_panel_path = PANEL_DIR / 'model3_final_conversion.parquet'
    model3_data_multi.write_parquet(model3_panel_path)
    print(f"\n[SUCCESS] Model 3 panel saved to: {model3_panel_path}")

    # Summary statistics (computed on the full sample, before the
    # multi-session restriction)
    print(f"\nSummary Statistics for Model 3:")
    print(f"  Conversion rate: {model3_data_sampled['DidPurchase'].mean():.2%}")
    print(f"  Average browsing sessions: {model3_data_sampled['NumBrowsingSessions'].mean():.2f}")
    print(f"  Average total clicks: {model3_data_sampled['TotalClicks'].mean():.2f}")
    print(f"  Average duration: {model3_data_sampled['TotalDurationDays'].mean():.2f} days")
    print(f"  Average vendors clicked: {model3_data_sampled['VarietyVendorsClicked'].mean():.2f}")
    print(f"  Average price of clicked items: ${model3_data_sampled['AvgPriceClickedItems'].mean():.2f}")
except Exception as e:
    # Best-effort cell: report the failure and leave empty panels so the
    # remaining cells can still run.
    print(f"\n[ERROR] Model 3 data construction failed: {str(e)}")
    model3_data_sampled = pl.DataFrame()
    model3_data_multi = pl.DataFrame()

# --- SUMMARY OF PANEL CREATION ---
# Lists each expected panel file with its on-disk size, or flags it as missing.
print("\n" + "="*100)
print("PANEL CREATION SUMMARY")
print("="*100)
print("\nFunnel panel datasets created and saved:")

panel_files = [
    ('Model 1 - Ad Effectiveness', 'model1_ad_effectiveness.parquet'),
    ('Model 2 - Journey Continuation', 'model2_journey_continuation.parquet'),
    ('Model 3 - Final Conversion', 'model3_final_conversion.parquet')
]

for panel_name, filename in panel_files:
    filepath = PANEL_DIR / filename
    if not filepath.exists():
        print(f"  ✗ {panel_name:30} {filename:40} (not created)")
        continue
    file_size = filepath.stat().st_size / (1024 * 1024)  # Size in MB
    print(f"  ✓ {panel_name:30} {filename:40} ({file_size:.2f} MB)")

print("\n" + "="*100)
print("POLARS DATA PROCESSING COMPLETE!")
print(f"Panels saved to: {PANEL_DIR}")
print("Ready for econometric modeling in Part 2")
print("="*100)

# ---
# PART 2: ECONOMETRIC MODELING
# Now we load the saved panels and run the econometric models.

# PART 2: ECONOMETRIC MODELS
# --- SETUP FOR MODELING ---
import pandas as pd
import numpy as np
from pathlib import Path
from datetime import datetime
import warnings
from tqdm import tqdm
import statsmodels.formula.api as smf
from statsmodels.iolib.summary2 import summary_col
from scipy import stats

# All warnings are silenced for the whole modeling run.
warnings.filterwarnings('ignore')

# Configuration
PANEL_DIR = Path('./data/funnel_panels')
REPORT_FILE = 'reports/funnel_econometric_results.txt'
Path('reports').mkdir(exist_ok=True)

# Model parameters
ALPHA = 0.05  # Significance level


# Report capturing class
class ReportLogger:
    """Echo report lines to the console and buffer them for a text report.

    Lines are accumulated in ``self.content`` (pre-seeded with a title and a
    generation timestamp) and written to ``self.filename`` by :meth:`save`.
    """

    def __init__(self, filename):
        """Start a new report buffer that will be written to ``filename``."""
        self.filename = filename
        self.content = []
        self.content.append("="*100)
        self.content.append("FUNNEL ECONOMETRIC ANALYSIS RESULTS")
        self.content.append(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        self.content.append("="*100)
        self.content.append("\n")

    def log(self, text):
        """Log text to both console and report buffer"""
        print(text)
        self.content.append(text)

    def save(self):
        """Save accumulated content to file"""
        # parents=True so a nested report path does not fail on a missing
        # intermediate directory.
        Path(self.filename).parent.mkdir(parents=True, exist_ok=True)
        # Explicit UTF-8: the report contains non-ASCII text such as "R²",
        # which a platform-default encoding may not be able to write.
        with open(self.filename, 'w', encoding='utf-8') as f:
            f.write('\n'.join(str(line) for line in self.content))
        print(f"\n[SUCCESS] Report saved to {self.filename}")


# Initialize report logger
report = ReportLogger(REPORT_FILE)
report.log("Starting Funnel Econometric Analysis...\n")

# --- LOAD PANEL DATASETS ---
# Reads the three panels written by Part 1 into pandas; a missing file becomes
# an empty DataFrame so each model cell can skip itself cleanly.
report.log("="*100)
report.log("LOADING PANEL DATASETS")
report.log("="*100)

panels = {}
panel_files = {
    'model1': 'model1_ad_effectiveness.parquet',
    'model2': 'model2_journey_continuation.parquet',
    'model3': 'model3_final_conversion.parquet'
}

report.log("\nLoading panels:")
for name, filename in tqdm(panel_files.items(), desc="Loading panels"):
    filepath = PANEL_DIR / filename
    if not filepath.exists():
        report.log(f"  - {name}: FILE NOT FOUND")
        panels[name] = pd.DataFrame()
        continue
    panels[name] = pd.read_parquet(filepath)
    report.log(f"  - {name}: {len(panels[name]):,} rows, {panels[name].shape[1]} columns")

# Extract dataframes
model1_data_sampled = panels.get('model1', pd.DataFrame())
model2_data_sampled = panels.get('model2', pd.DataFrame())
model3_data_multi = panels.get('model3', pd.DataFrame())

# --- MODEL 1: AD EFFECTIVENESS MODEL ---
# Fits a logit of WasClicked on rank with vendor and auction dummies, first
# without and then with the auction-level controls, and logs the key
# coefficients, fit statistics and a short interpretation.
report.log("\n" + "-"*80)
report.log("MODEL 1: AD EFFECTIVENESS MODEL")
report.log("-"*80)

if model1_data_sampled.empty:
    report.log("\n[SKIP] Model 1: No data available for estimation")
else:
    try:
        # Baseline model without controls
        report.log("\nEstimating baseline model (without controls)...")
        formula_baseline = "WasClicked ~ RANKING + C(VENDOR_ID) + C(AUCTION_ID)"
        model1_baseline = smf.logit(formula_baseline, data=model1_data_sampled).fit(disp=False)

        # Full model with controls
        # NOTE(review): NumWinningBids_a, AvgPriceTop5_a and BrandConcentration_a
        # are joined per AUCTION_ID in Part 1, so they are constant within an
        # auction and are probably not separately identified alongside
        # C(AUCTION_ID) — confirm the intended specification.
        report.log("Estimating full model (with controls)...")
        formula_full = """WasClicked ~ RANKING + RankSquared + NumWinningBids_a +
                          AvgPriceTop5_a + BrandConcentration_a + Rank_x_AvgPrice +
                          C(VENDOR_ID) + C(AUCTION_ID)"""
        model1_full = smf.logit(formula_full, data=model1_data_sampled).fit(disp=False)

        report.log("\n" + "="*60)
        report.log("MODEL 1 RESULTS: AD EFFECTIVENESS")
        report.log("="*60)
        n_obs = len(model1_data_sampled)
        n_auctions = model1_data_sampled['AUCTION_ID'].nunique()
        n_vendors = model1_data_sampled['VENDOR_ID'].nunique()
        report.log(f"\nNumber of observations: {n_obs:,}")
        report.log(f"Number of auctions: {n_auctions:,}")
        report.log(f"Number of vendors: {n_vendors:,}")

        # Report key coefficients: estimate, (standard error), significance stars
        report.log("\nKey Coefficients (Full Model):")
        report.log("-"*40)
        coef_names = ['RANKING', 'RankSquared', 'NumWinningBids_a', 'AvgPriceTop5_a',
                      'BrandConcentration_a', 'Rank_x_AvgPrice']
        for coef in coef_names:
            if coef not in model1_full.params.index:
                continue
            param = model1_full.params[coef]
            se = model1_full.bse[coef]
            pval = model1_full.pvalues[coef]
            if pval < 0.001:
                sig = '***'
            elif pval < 0.01:
                sig = '**'
            elif pval < 0.05:
                sig = '*'
            else:
                sig = ''
            report.log(f"{coef:25s}: {param:8.4f} ({se:.4f}) {sig}")

        # Model fit statistics
        report.log("\nModel Fit Statistics:")
        report.log("-"*40)
        report.log(f"Baseline Log-Likelihood: {model1_baseline.llf:.2f}")
        report.log(f"Full Model Log-Likelihood: {model1_full.llf:.2f}")
        report.log(f"Pseudo R²: {model1_full.prsquared:.4f}")
        report.log(f"AIC: {model1_full.aic:.2f}")
        report.log(f"BIC: {model1_full.bic:.2f}")

        # Interpretation: exp(beta) - 1 is the proportional change in odds.
        report.log("\nInterpretation:")
        report.log("-"*40)
        rank_effect = model1_full.params.get('RANKING', 0)
        rank_odds_change = (np.exp(rank_effect) - 1) * 100
        report.log(f"- Each position drop in rank changes click odds by {rank_odds_change:.2f}%")
        if 'RankSquared' in model1_full.params.index:
            nonlinear_flag = 'Yes' if model1_full.pvalues['RankSquared'] < 0.05 else 'No'
            report.log(f"- Non-linear effect detected: {nonlinear_flag}")
        if 'BrandConcentration_a' in model1_full.params.index:
            brand_effect = model1_full.params['BrandConcentration_a']
            brand_direction = 'Positive' if brand_effect > 0 else 'Negative'
            report.log(f"- Brand concentration effect: {brand_direction}")
    except Exception as e:
        # Best-effort cell: log the failure and let later models run.
        report.log(f"\n[ERROR] Model 1 estimation failed: {str(e)}")

# --- MODEL 2: JOURNEY CONTINUATION MODEL ---
# Fits a logit of ReturnedForNextSession on session activity with user and
# day-of-week dummies, restricted to users observed in more than one session,
# and logs coefficients, fit statistics and a short interpretation.
report.log("\n" + "-"*80)
report.log("MODEL 2: JOURNEY CONTINUATION MODEL")
report.log("-"*80)

if model2_data_sampled.empty:
    report.log("\n[SKIP] Model 2: No data available for estimation")
else:
    try:
        # Only keep users with multiple sessions for fixed effects
        user_counts = model2_data_sampled['user_id'].value_counts()
        multi_session_users = user_counts[user_counts > 1].index
        keep_mask = model2_data_sampled['user_id'].isin(multi_session_users)
        model2_fe_data = model2_data_sampled[keep_mask]

        if len(model2_fe_data) == 0:
            report.log("\n[WARNING] No users with multiple sessions for fixed effects estimation")
        else:
            report.log(f"\nUsing {len(model2_fe_data):,} observations from {len(multi_session_users):,} users with multiple sessions")

            # Estimate model
            formula = """ReturnedForNextSession ~ num_clicks + MadePurchase + duration_minutes +
                        VarietyProductsClicked + num_impressions + IsFirstSession +
                        TimeSinceLastSession + C(user_id) + C(dayofweek)"""
            model2 = smf.logit(formula, data=model2_fe_data).fit(disp=False)

            report.log("\n" + "="*60)
            report.log("MODEL 2 RESULTS: JOURNEY CONTINUATION")
            report.log("="*60)
            report.log(f"\nNumber of observations: {len(model2_fe_data):,}")
            report.log(f"Number of users: {model2_fe_data['user_id'].nunique():,}")

            # Report key coefficients: estimate, (standard error), significance stars
            report.log("\nKey Coefficients:")
            report.log("-"*40)
            coef_names = ['num_clicks', 'MadePurchase', 'duration_minutes', 'VarietyProductsClicked',
                         'num_impressions', 'IsFirstSession', 'TimeSinceLastSession']
            for coef in coef_names:
                if coef not in model2.params.index:
                    continue
                param = model2.params[coef]
                se = model2.bse[coef]
                pval = model2.pvalues[coef]
                if pval < 0.001:
                    sig = '***'
                elif pval < 0.01:
                    sig = '**'
                elif pval < 0.05:
                    sig = '*'
                else:
                    sig = ''
                report.log(f"{coef:25s}: {param:8.4f} ({se:.4f}) {sig}")

            # Model fit
            report.log("\nModel Fit Statistics:")
            report.log("-"*40)
            report.log(f"Log-Likelihood: {model2.llf:.2f}")
            report.log(f"Pseudo R²: {model2.prsquared:.4f}")

            # Interpretation: exp(beta) - 1 is the proportional change in odds.
            report.log("\nInterpretation:")
            report.log("-"*40)
            click_effect = model2.params.get('num_clicks', 0)
            click_odds_change = (np.exp(click_effect) - 1) * 100
            report.log(f"- Each additional click changes return odds by {click_odds_change:.2f}%")
            purchase_effect = model2.params.get('MadePurchase', 0)
            purchase_odds_change = (np.exp(purchase_effect) - 1) * 100
            report.log(f"- Making a purchase changes return odds by {purchase_odds_change:.2f}%")
            # A negative purchase coefficient is labelled satiation, otherwise momentum.
            purchase_reading = 'demand satiation' if purchase_effect < 0 else 'purchase momentum'
            report.log(f"  This indicates {purchase_reading}")
    except Exception as e:
        # Best-effort cell: log the failure and let later models run.
        report.log(f"\n[ERROR] Model 2 estimation failed: {str(e)}")

# --- MODEL 3: FINAL CONVERSION MODEL ---
# Fits a logit of DidPurchase on shopping-session covariates plus C(user_id)
# and C(week_of_year) dummies using model3_data_multi, and writes the results
# through report.log.
report.log("\n" + "-"*80)
report.log("MODEL 3: FINAL CONVERSION MODEL")
report.log("-"*80)

if not model3_data_multi.empty:
    try:
        # Estimate model
        formula = """DidPurchase ~ NumBrowsingSessions + TotalClicks + TotalDurationDays +
                    SessionDensity + VarietyVendorsClicked + AvgPriceClickedItems +
                    Sessions_x_Clicks + C(user_id) + C(week_of_year)"""
        model3 = smf.logit(formula, data=model3_data_multi).fit(disp=False)

        # disp=False hides the optimizer's own messages (and the setup cell
        # filters warnings), so a failed optimisation would otherwise go
        # unnoticed. Surface it explicitly in the report.
        model3_fit_info = getattr(model3, 'mle_retvals', None) or {}
        if not model3_fit_info.get('converged', True):
            report.log("\n[WARNING] Model 3 optimizer did not converge; estimates below may be unreliable")

        report.log("\n" + "="*60)
        report.log("MODEL 3 RESULTS: FINAL CONVERSION")
        report.log("="*60)
        # Report the rows the model actually used: the formula interface drops
        # rows with missing values, so this can be smaller than
        # len(model3_data_multi).
        report.log(f"\nNumber of observations: {int(model3.nobs):,}")
        report.log(f"Number of users: {model3_data_multi['user_id'].nunique():,}")
        report.log(f"Baseline conversion rate: {model3_data_multi['DidPurchase'].mean():.2%}")

        # Report key coefficients (the user/week dummies are omitted)
        report.log("\nKey Coefficients:")
        report.log("-"*40)
        coef_names = ['NumBrowsingSessions', 'TotalClicks', 'TotalDurationDays', 'SessionDensity',
                      'VarietyVendorsClicked', 'AvgPriceClickedItems', 'Sessions_x_Clicks']
        for coef in coef_names:
            if coef in model3.params.index:
                param = model3.params[coef]
                se = model3.bse[coef]
                pval = model3.pvalues[coef]
                sig = '***' if pval < 0.001 else '**' if pval < 0.01 else '*' if pval < 0.05 else ''
                report.log(f"{coef:25s}: {param:8.4f} ({se:.4f}) {sig}")

        # Model fit
        report.log("\nModel Fit Statistics:")
        report.log("-"*40)
        report.log(f"Log-Likelihood: {model3.llf:.2f}")
        report.log(f"Pseudo R²: {model3.prsquared:.4f}")

        # Interpretation: exp(beta) - 1 is the proportional change in odds
        report.log("\nInterpretation:")
        report.log("-"*40)
        sessions_effect = model3.params.get('NumBrowsingSessions', 0)
        report.log(f"- Each additional browsing session changes purchase odds by {(np.exp(sessions_effect) - 1) * 100:.2f}%")
        clicks_effect = model3.params.get('TotalClicks', 0)
        report.log(f"- Each additional click changes purchase odds by {(np.exp(clicks_effect) - 1) * 100:.2f}%")
        vendor_effect = model3.params.get('VarietyVendorsClicked', 0)
        if vendor_effect != 0:
            report.log(f"- Shopping around effect: Each additional vendor {'increases' if vendor_effect > 0 else 'decreases'} purchase odds")
        interaction_effect = model3.params.get('Sessions_x_Clicks', 0)
        # The interaction is only mentioned when it is significant at 5%
        if interaction_effect != 0 and model3.pvalues.get('Sessions_x_Clicks', 1) < 0.05:
            report.log(f"- Significant interaction between sessions and clicks detected")
    except Exception as e:
        # Deliberate best-effort: a failed model is logged and the notebook
        # continues to the summary.
        report.log(f"\n[ERROR] Model 3 estimation failed: {str(e)}")
else:
    report.log("\n[SKIP] Model 3: No data available for estimation")

# --- SAVE REPORT AND SUMMARY ---
# Close out the report, save it via report.save(), then print a short
# console summary for whichever models exist in the current namespace.
for closing_line in ("\n" + "="*100, "ANALYSIS COMPLETE", "="*100):
    report.log(closing_line)
report.log(f"\nAnalysis completed at: {datetime.now():%Y-%m-%d %H:%M:%S}")
report.save()

print("\n" + "="*60, "FUNNEL ECONOMETRIC ANALYSIS COMPLETE", "="*60, sep="\n")
print(f"\nResults saved to: {REPORT_FILE}")
print("\nKey Findings Summary:", "-"*40, sep="\n")

# Summarize key findings if models were estimated
if 'model1_full' in locals():
    print("Model 1 (Ad Effectiveness): Estimated successfully")
    if 'RANKING' in model1_full.params.index:
        if model1_full.pvalues['RANKING'] < 0.05:
            rank_verdict = 'Significant'
        else:
            rank_verdict = 'Not significant'
        print(f"  - Rank effect on clicks: {rank_verdict}")

if 'model2' in locals():
    print("Model 2 (Journey Continuation): Estimated successfully")
    if 'num_clicks' in model2.params.index:
        if model2.params['num_clicks'] > 0:
            click_direction = 'Positive'
        else:
            click_direction = 'Negative'
        print(f"  - Click effect on return: {click_direction}")
    if 'MadePurchase' in model2.params.index:
        if model2.params['MadePurchase'] < 0:
            purchase_direction = 'Negative (satiation)'
        else:
            purchase_direction = 'Positive'
        print(f"  - Purchase effect on return: {purchase_direction}")

if 'model3' in locals():
    print("Model 3 (Final Conversion): Estimated successfully")
    if 'NumBrowsingSessions' in model3.params.index:
        if model3.params['NumBrowsingSessions'] > 0:
            sessions_direction = 'Positive'
        else:
            sessions_direction = 'Negative'
        print(f"  - Sessions effect on purchase: {sessions_direction}")
    if 'Sessions_x_Clicks' in model3.params.index:
        if model3.pvalues['Sessions_x_Clicks'] < 0.05:
            interaction_verdict = 'Significant'
        else:
            interaction_verdict = 'Not significant'
        print(f"  - Session-Click interaction: {interaction_verdict}")

print("\n" + "="*60)