# 02_analysis.ipynb
## Journey Processing and Causal Analysis

This notebook loads checkpointed data and performs comprehensive causal analysis.

### Workflow:
1. **Load Checkpoint** - Load data from 01_data_pull.ipynb
2. **Journey Processing** - Create user journeys and sessions
3. **Feature Engineering** - Create advanced features
4. **EDA** - Comprehensive exploratory analysis
5. **Causal Analysis** - Regression models with robust standard errors

In [3]:
# --- IMPORTS ---
import os
import sys
import json
import warnings
from datetime import datetime, timedelta
from pathlib import Path
from typing import Dict, Any, Tuple

import pandas as pd
import numpy as np
from tqdm import tqdm

# Statistical modeling imports
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy import stats
from statsmodels.stats.diagnostic import het_breuschpagan

warnings.filterwarnings('ignore')

# Initialize logging
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
output_log = []

def log(message: str):
    """Add message to output log"""
    ts = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    log_entry = f"[{ts}] {message}"
    output_log.append(log_entry)
    print(log_entry)

## Section 1: Load Checkpoint Data

In [4]:
# Check for existing data files
raw_data_dir = Path("./data/raw")
processed_data_path = Path("./data/user_journey_causal_dataset.parquet")

def load_checkpoint_data() -> Tuple[Dict[str, pd.DataFrame], Dict[str, Any]]:
    """Load the most recent checkpoint data from raw directory."""
    
    if not raw_data_dir.exists():
        raise FileNotFoundError(f"Data directory not found: {raw_data_dir}")
    
    # Find most recent metadata file
    metadata_files = list(raw_data_dir.glob("metadata_*.json"))
    if not metadata_files:
        raise FileNotFoundError("No metadata files found. Please run 01_data_pull.ipynb first.")
    
    latest_metadata = max(metadata_files, key=lambda x: x.stat().st_mtime)
    
    # Load metadata
    with open(latest_metadata, 'r') as f:
        metadata = json.load(f)
    
    extraction_timestamp = metadata['timestamp']
    log(f"Loading data from extraction: {extraction_timestamp}")
    
    # Load all data files
    data = {}
    required_files = [
        'auctions_users', 'auctions_results', 'impressions', 
        'clicks', 'purchases', 'catalog'
    ]
    optional_files = ['hist_purchases', 'hist_impressions', 'hist_clicks']
    
    for file_type in required_files + optional_files:
        pattern = f"{file_type}_{extraction_timestamp}.parquet"
        file_path = raw_data_dir / pattern
        
        if file_path.exists():
            data[file_type] = pd.read_parquet(file_path)
            log(f"  Loaded {file_type}: {len(data[file_type]):,} rows")
        elif file_type in required_files:
            raise FileNotFoundError(f"Required file not found: {file_path}")
        else:
            log(f"  Optional file not found: {file_type}")
    
    return data, metadata

# Load the data
try:
    data, metadata = load_checkpoint_data()
    
    # Extract individual dataframes
    auctions_users = data['auctions_users']
    auctions_results = data['auctions_results']
    impressions = data['impressions']
    clicks = data['clicks']
    purchases = data['purchases']
    catalog = data['catalog']
    
    # Historical data (optional)
    hist_purchases = data.get('hist_purchases', pd.DataFrame())
    hist_impressions = data.get('hist_impressions', pd.DataFrame())
    hist_clicks = data.get('hist_clicks', pd.DataFrame())
    
    log("\n✓ Data loaded successfully")
    log(f"  Analysis period: {metadata['analysis_start_date']} to {metadata['analysis_end_date']}")
    log(f"  Sampling fraction: {metadata['sampling_fraction']:.2%}")
    log(f"  Journey window: {metadata['journey_window_hours']} hours")
    
except Exception as e:
    log(f"ERROR: Failed to load checkpoint data: {e}")
    raise

[2025-09-23 05:02:51] Loading data from extraction: 20250923_043038
[2025-09-23 05:02:51]   Loaded auctions_users: 88,690 rows
[2025-09-23 05:02:52]   Loaded auctions_results: 3,410,770 rows
[2025-09-23 05:02:52]   Loaded impressions: 347,741 rows
[2025-09-23 05:02:52]   Loaded clicks: 11,215 rows
[2025-09-23 05:02:52]   Loaded purchases: 1,859 rows
[2025-09-23 05:02:55]   Loaded catalog: 4,961,480 rows
[2025-09-23 05:02:55]   Loaded hist_purchases: 10,332 rows
[2025-09-23 05:02:55]   Loaded hist_impressions: 1,685,675 rows
[2025-09-23 05:02:56]   Loaded hist_clicks: 59,691 rows
[2025-09-23 05:02:56] 
✓ Data loaded successfully
[2025-09-23 05:02:56]   Analysis period: 2025-08-24 to 2025-09-07
[2025-09-23 05:02:56]   Sampling fraction: 0.10%
[2025-09-23 05:02:56]   Journey window: 168 hours


## Section 2: Journey Processing

In [5]:
# Sessionize events
log("\n" + "="*80)
log("SECTION 2: JOURNEY PROCESSING")
log("="*80)

log("\nSessionizing events...")
events = []

# Add auctions
auctions = auctions_users.copy()
auctions['event_type'] = 'auction'
auctions['event_time'] = pd.to_datetime(auctions['CREATED_AT'])
auctions['PRODUCT_ID'] = None
auctions['VENDOR_ID'] = None
events.append(auctions[['USER_ID', 'AUCTION_ID', 'event_type', 'event_time', 'PRODUCT_ID', 'VENDOR_ID']])

# Add impressions
impressions_evt = impressions.copy()
impressions_evt['event_type'] = 'impression'
impressions_evt['event_time'] = pd.to_datetime(impressions_evt['OCCURRED_AT'])
events.append(impressions_evt[['USER_ID', 'AUCTION_ID', 'event_type', 'event_time', 'PRODUCT_ID', 'VENDOR_ID']])

# Add clicks
clicks_evt = clicks.copy()
clicks_evt['event_type'] = 'click'
clicks_evt['event_time'] = pd.to_datetime(clicks_evt['OCCURRED_AT'])
events.append(clicks_evt[['USER_ID', 'AUCTION_ID', 'event_type', 'event_time', 'PRODUCT_ID', 'VENDOR_ID']])

# Add purchases
purchases_evt = purchases.copy()
purchases_evt['event_type'] = 'purchase'
purchases_evt['event_time'] = pd.to_datetime(purchases_evt['PURCHASED_AT'])
purchases_evt['AUCTION_ID'] = None
purchases_evt['VENDOR_ID'] = None
events.append(purchases_evt[['USER_ID', 'AUCTION_ID', 'event_type', 'event_time', 'PRODUCT_ID', 'VENDOR_ID']])

# Combine and sort
all_events = pd.concat(events, ignore_index=True)
all_events = all_events.sort_values(['USER_ID', 'event_time'])
log(f"Total events: {len(all_events):,}")

[2025-09-23 05:02:57] 
[2025-09-23 05:02:57] SECTION 2: JOURNEY PROCESSING
[2025-09-23 05:02:57] 
Sessionizing events...
[2025-09-23 05:02:57] Total events: 449,505


In [6]:
# Create enhanced event stream with metadata
log("\nEnhancing event data with metadata...")

# Add ranking information from auctions_results
auction_ranks = auctions_results[['AUCTION_ID', 'PRODUCT_ID', 'RANKING']].copy()
auction_ranks = auction_ranks.dropna()
auction_ranks = auction_ranks.groupby(['AUCTION_ID', 'PRODUCT_ID'])['RANKING'].min().reset_index()
log(f"  Created ranking lookup with {len(auction_ranks):,} auction-product pairs")

# Merge rankings into events
all_events_enhanced = all_events.merge(
    auction_ranks, 
    on=['AUCTION_ID', 'PRODUCT_ID'], 
    how='left'
)

# Add catalog information
all_events_enhanced = all_events_enhanced.merge(
    catalog[['PRODUCT_ID', 'BRAND', 'DEPARTMENT_ID', 'PRICE']].drop_duplicates(),
    on='PRODUCT_ID',
    how='left'
)

all_events_enhanced = all_events_enhanced.sort_values(['USER_ID', 'event_time'])
log(f"Enhanced events created: {len(all_events_enhanced):,} total events with metadata")

[2025-09-23 05:03:01] 
Enhancing event data with metadata...
[2025-09-23 05:03:03]   Created ranking lookup with 3,410,466 auction-product pairs
[2025-09-23 05:03:08] Enhanced events created: 449,505 total events with metadata


In [7]:
# Identify session breaks (2+ hour gaps)
SESSION_GAP_HOURS = metadata.get('session_gap_hours', 2)

log(f"\nIdentifying session breaks (>{SESSION_GAP_HOURS} hour gaps)...")
all_events['prev_time'] = all_events.groupby('USER_ID')['event_time'].shift()
all_events['time_diff'] = (all_events['event_time'] - all_events['prev_time']).dt.total_seconds() / 3600
all_events['session_break'] = (all_events['time_diff'] >= SESSION_GAP_HOURS) | all_events['time_diff'].isna()

# Assign journey IDs
all_events['journey_id'] = all_events.groupby('USER_ID')['session_break'].cumsum()
all_events['journey_id'] = all_events['USER_ID'] + '_' + all_events['journey_id'].astype(str)

unique_journeys = all_events['journey_id'].nunique()
log(f"Created {unique_journeys:,} unique journeys")

# Also apply to enhanced events
all_events_enhanced['prev_time'] = all_events_enhanced.groupby('USER_ID')['event_time'].shift()
all_events_enhanced['time_diff'] = (all_events_enhanced['event_time'] - all_events_enhanced['prev_time']).dt.total_seconds() / 3600
all_events_enhanced['session_break'] = (all_events_enhanced['time_diff'] >= SESSION_GAP_HOURS) | all_events_enhanced['time_diff'].isna()
all_events_enhanced['journey_id'] = all_events_enhanced.groupby('USER_ID')['session_break'].cumsum()
all_events_enhanced['journey_id'] = all_events_enhanced['USER_ID'] + '_' + all_events_enhanced['journey_id'].astype(str)

[2025-09-23 05:03:09] 
Identifying session breaks (>2 hour gaps)...
[2025-09-23 05:03:10] Created 10,360 unique journeys


## Section 3: Feature Engineering

In [8]:
log("\n" + "="*80)
log("SECTION 3: FEATURE ENGINEERING")
log("="*80)

# Aggregate metrics at (journey_id, product_id) level
log("\nAggregating journey metrics...")

# Get products that received impressions
impression_events = all_events[all_events['event_type'] == 'impression']
journey_products = impression_events[['journey_id', 'PRODUCT_ID']].drop_duplicates()
log(f"Unique (journey, product) pairs: {len(journey_products):,}")

# Initialize metrics DataFrame
metrics = journey_products.copy()

# Add user_id
journey_users = all_events[['journey_id', 'USER_ID']].drop_duplicates()
metrics = metrics.merge(journey_users, on='journey_id', how='left')

# Calculate impression counts
imp_counts = impression_events.groupby(['journey_id', 'PRODUCT_ID']).size().reset_index(name='impressions_on_product')
metrics = metrics.merge(imp_counts, on=['journey_id', 'PRODUCT_ID'], how='left')

# Calculate click counts
click_events = all_events[all_events['event_type'] == 'click']
click_counts = click_events.groupby(['journey_id', 'PRODUCT_ID']).size().reset_index(name='clicks_on_product')
metrics = metrics.merge(click_counts, on=['journey_id', 'PRODUCT_ID'], how='left')
metrics['clicks_on_product'] = metrics['clicks_on_product'].fillna(0).astype(int)

# Calculate purchase counts
purchase_events = all_events[all_events['event_type'] == 'purchase']
purchase_counts = purchase_events.groupby(['journey_id', 'PRODUCT_ID']).size().reset_index(name='purchases_on_product')
metrics = metrics.merge(purchase_counts, on=['journey_id', 'PRODUCT_ID'], how='left')
metrics['purchases_on_product'] = metrics['purchases_on_product'].fillna(0).astype(int)
metrics['did_purchase_product'] = (metrics['purchases_on_product'] > 0).astype(int)

log(f"Base metrics shape: {metrics.shape}")

[2025-09-23 05:03:13] 
[2025-09-23 05:03:13] SECTION 3: FEATURE ENGINEERING
[2025-09-23 05:03:13] 
Aggregating journey metrics...
[2025-09-23 05:03:13] Unique (journey, product) pairs: 269,276
[2025-09-23 05:03:14] Base metrics shape: (269276, 7)


In [9]:
# HALO EFFECTS
log("\n" + "="*60)
log("FEATURE: HALO EFFECTS")
log("="*60)

# Get purchased brands/departments per journey
purchase_events_enhanced = all_events_enhanced[all_events_enhanced['event_type'] == 'purchase'].copy()

if len(purchase_events_enhanced) > 0:
    # Brands purchased in each journey
    purchased_brands = purchase_events_enhanced.dropna(subset=['BRAND']).groupby('journey_id')['BRAND'].apply(lambda x: x.unique().tolist()).reset_index()
    purchased_brands.columns = ['journey_id', 'purchased_brands']
    
    # Departments purchased in each journey
    purchased_depts = purchase_events_enhanced.dropna(subset=['DEPARTMENT_ID']).groupby('journey_id')['DEPARTMENT_ID'].apply(lambda x: x.unique().tolist()).reset_index()
    purchased_depts.columns = ['journey_id', 'purchased_departments']
    
    # Merge into metrics
    metrics = metrics.merge(purchased_brands, on='journey_id', how='left')
    metrics = metrics.merge(purchased_depts, on='journey_id', how='left')
    
    # Fill NaNs with empty lists
    metrics['purchased_brands'] = metrics['purchased_brands'].apply(lambda x: x if isinstance(x, list) else [])
    metrics['purchased_departments'] = metrics['purchased_departments'].apply(lambda x: x if isinstance(x, list) else [])
    
    # Add brand/department info to metrics
    metrics = metrics.merge(
        catalog[['PRODUCT_ID', 'BRAND', 'DEPARTMENT_ID']].drop_duplicates(),
        on='PRODUCT_ID',
        how='left'
    )
    
    # Create brand halo outcome
    metrics['did_purchase_brand_in_journey'] = metrics.apply(
        lambda row: 1 if pd.notna(row['BRAND']) and row['BRAND'] in row['purchased_brands'] else 0,
        axis=1
    )
    
    # Create department halo outcome
    metrics['did_purchase_department_in_journey'] = metrics.apply(
        lambda row: 1 if pd.notna(row['DEPARTMENT_ID']) and row['DEPARTMENT_ID'] in row['purchased_departments'] else 0,
        axis=1
    )
    
    log(f"  Products where same brand purchased: {metrics['did_purchase_brand_in_journey'].sum():,}")
    log(f"  Products where same department purchased: {metrics['did_purchase_department_in_journey'].sum():,}")
    log(f"  Brand purchase rate: {metrics['did_purchase_brand_in_journey'].mean():.4%}")
    log(f"  Department purchase rate: {metrics['did_purchase_department_in_journey'].mean():.4%}")
else:
    metrics['did_purchase_brand_in_journey'] = 0
    metrics['did_purchase_department_in_journey'] = 0
    log("  No purchases found - halo outcomes set to 0")

[2025-09-23 05:03:25] 
[2025-09-23 05:03:25] FEATURE: HALO EFFECTS
[2025-09-23 05:03:30]   Products where same brand purchased: 2,474
[2025-09-23 05:03:30]   Products where same department purchased: 11,153
[2025-09-23 05:03:30]   Brand purchase rate: 0.9188%
[2025-09-23 05:03:30]   Department purchase rate: 4.1418%


In [10]:
# COMPETITIVE CONTEXT
log("\n" + "="*60)
log("FEATURE: COMPETITIVE CONTEXT")
log("="*60)

# Auction competitiveness metrics
log("Calculating auction competitiveness...")
auction_metrics = auctions_results.groupby('AUCTION_ID').agg({
    'VENDOR_ID': 'nunique',
    'PRODUCT_ID': 'nunique',
    'RANKING': ['min', 'max', 'mean']
}).reset_index()

auction_metrics.columns = ['AUCTION_ID', 'num_bidders', 'num_products', 'min_rank', 'max_rank', 'avg_rank']
auction_metrics['auction_competitiveness'] = auction_metrics['num_bidders'] * np.log1p(auction_metrics['max_rank'])

# Map average winning rank to impressed products
impression_events_enhanced = all_events_enhanced[all_events_enhanced['event_type'] == 'impression']
avg_winning_rank = impression_events_enhanced.merge(
    auction_metrics[['AUCTION_ID', 'avg_rank']], 
    on='AUCTION_ID', 
    how='left'
)
avg_winning_rank = avg_winning_rank.groupby(['journey_id', 'PRODUCT_ID'])['avg_rank'].mean().reset_index()
avg_winning_rank.columns = ['journey_id', 'PRODUCT_ID', 'avg_winning_rank']

metrics = metrics.merge(avg_winning_rank, on=['journey_id', 'PRODUCT_ID'], how='left')

# Product win rates
product_wins = auctions_results[auctions_results['IS_WINNER'] == True].groupby('PRODUCT_ID').size().reset_index(name='wins')
product_bids = auctions_results.groupby('PRODUCT_ID').size().reset_index(name='bids')
product_win_rate = product_wins.merge(product_bids, on='PRODUCT_ID')
product_win_rate['product_win_rate'] = product_win_rate['wins'] / product_win_rate['bids']

metrics = metrics.merge(product_win_rate[['PRODUCT_ID', 'product_win_rate']], on='PRODUCT_ID', how='left')

log(f"  Avg Winning Rank - Mean: {metrics['avg_winning_rank'].mean():.2f}")
log(f"  Product Win Rate - Mean: {metrics['product_win_rate'].mean():.4f}")

[2025-09-23 05:03:30] 
[2025-09-23 05:03:30] FEATURE: COMPETITIVE CONTEXT
[2025-09-23 05:03:30] Calculating auction competitiveness...
[2025-09-23 05:03:35]   Avg Winning Rank - Mean: 24.99
[2025-09-23 05:03:35]   Product Win Rate - Mean: 0.8947


In [11]:
# TEMPORAL FEATURES
log("\n" + "="*60)
log("FEATURE: TEMPORAL DYNAMICS")
log("="*60)

# Click Order Features
log("Calculating click order features...")
click_events_enhanced = all_events_enhanced[all_events_enhanced['event_type'] == 'click'].copy()

if len(click_events_enhanced) > 0:
    # Sort clicks by time within each journey
    click_events_enhanced = click_events_enhanced.sort_values(['journey_id', 'event_time'])
    
    # Add click order within journey
    click_events_enhanced['click_order_in_journey'] = click_events_enhanced.groupby('journey_id').cumcount() + 1
    
    # Get first and last click positions for each product
    first_clicks = click_events_enhanced.groupby(['journey_id', 'PRODUCT_ID'])['click_order_in_journey'].min().reset_index()
    first_clicks.columns = ['journey_id', 'PRODUCT_ID', 'first_click_position']
    
    last_clicks = click_events_enhanced.groupby(['journey_id', 'PRODUCT_ID'])['click_order_in_journey'].max().reset_index()
    last_clicks.columns = ['journey_id', 'PRODUCT_ID', 'last_click_position']
    
    # Merge into metrics
    metrics = metrics.merge(first_clicks, on=['journey_id', 'PRODUCT_ID'], how='left')
    metrics = metrics.merge(last_clicks, on=['journey_id', 'PRODUCT_ID'], how='left')
    
    # Create binary flags
    metrics['is_first_click_in_journey'] = (metrics['first_click_position'] == 1).astype(int).fillna(0)
    
    # Find the last clicked product in each journey
    max_clicks_per_journey = metrics.groupby('journey_id')['last_click_position'].max().reset_index()
    max_clicks_per_journey.columns = ['journey_id', 'max_click_position']
    metrics = metrics.merge(max_clicks_per_journey, on='journey_id', how='left')
    metrics['is_last_click_product'] = (metrics['last_click_position'] == metrics['max_click_position']).astype(int).fillna(0)
    
    log(f"  Products that were first click: {metrics['is_first_click_in_journey'].sum():,}")
    log(f"  Products that were last click: {metrics['is_last_click_product'].sum():,}")
else:
    metrics['is_first_click_in_journey'] = 0
    metrics['is_last_click_product'] = 0

[2025-09-23 05:03:35] 
[2025-09-23 05:03:35] FEATURE: TEMPORAL DYNAMICS
[2025-09-23 05:03:35] Calculating click order features...
[2025-09-23 05:03:35]   Products that were first click: 3,029
[2025-09-23 05:03:35]   Products that were last click: 3,047


In [12]:
# Journey-level metrics
log("\nCalculating journey-level metrics...")

total_clicks = click_events.groupby('journey_id').size().reset_index(name='total_clicks')
metrics = metrics.merge(total_clicks, on='journey_id', how='left')
metrics['total_clicks'] = metrics['total_clicks'].fillna(0).astype(int)

# Journey duration
journey_times = all_events.groupby('journey_id')['event_time'].agg(['min', 'max']).reset_index()
journey_times['journey_duration_hours'] = (journey_times['max'] - journey_times['min']).dt.total_seconds() / 3600
metrics = metrics.merge(journey_times[['journey_id', 'journey_duration_hours']], on='journey_id', how='left')

# Distinct products viewed
distinct_products = impression_events.groupby('journey_id')['PRODUCT_ID'].nunique().reset_index(name='distinct_products')
metrics = metrics.merge(distinct_products, on='journey_id', how='left')

# Add catalog features
if 'PRICE' not in metrics.columns:
    catalog_features = catalog[['PRODUCT_ID', 'PRICE']].copy()
    metrics = metrics.merge(catalog_features, on='PRODUCT_ID', how='left')

log(f"Final dataset shape: {metrics.shape}")
log(f"Purchase rate: {metrics['did_purchase_product'].mean():.4%}")

[2025-09-23 05:03:35] 
Calculating journey-level metrics...
[2025-09-23 05:03:37] Final dataset shape: (269276, 24)
[2025-09-23 05:03:37] Purchase rate: 0.0171%


In [13]:
# Historical Features (if available)
log("\nEngineering historical features...")

if len(hist_purchases) > 0:
    # User-level historical features
    user_purchase_history = hist_purchases.groupby('USER_ID').agg({
        'PURCHASE_ID': 'count',
        'UNIT_PRICE': ['mean', 'sum']
    }).reset_index()
    user_purchase_history.columns = ['USER_ID', 'hist_purchase_count', 'hist_avg_price', 'hist_total_spend']
    metrics = metrics.merge(user_purchase_history, on='USER_ID', how='left')
    log(f"  Added user purchase history features")

if len(hist_impressions) > 0 and len(hist_clicks) > 0:
    # User CTR history
    user_imp_counts = hist_impressions.groupby('USER_ID').size().reset_index(name='hist_impressions')
    user_click_counts = hist_clicks.groupby('USER_ID').size().reset_index(name='hist_clicks')
    
    user_ctr = user_imp_counts.merge(user_click_counts, on='USER_ID', how='left')
    user_ctr['hist_clicks'] = user_ctr['hist_clicks'].fillna(0)
    user_ctr['hist_user_ctr'] = user_ctr['hist_clicks'] / user_ctr['hist_impressions']
    user_ctr['hist_user_ctr'] = user_ctr['hist_user_ctr'].fillna(0)
    
    metrics = metrics.merge(user_ctr[['USER_ID', 'hist_user_ctr', 'hist_impressions', 'hist_clicks']], 
                            on='USER_ID', how='left')
    log(f"  Added user CTR history features")
    
    # Vendor CTR history
    if 'VENDOR_ID' in hist_impressions.columns:
        vendor_imp_counts = hist_impressions.groupby('VENDOR_ID').size().reset_index(name='vendor_hist_imps')
        vendor_click_counts = hist_clicks.groupby('VENDOR_ID').size().reset_index(name='vendor_hist_clicks')
        
        vendor_ctr = vendor_imp_counts.merge(vendor_click_counts, on='VENDOR_ID', how='left')
        vendor_ctr['vendor_hist_clicks'] = vendor_ctr['vendor_hist_clicks'].fillna(0)
        vendor_ctr['vendor_hist_ctr'] = vendor_ctr['vendor_hist_clicks'] / vendor_ctr['vendor_hist_imps']
        vendor_ctr['vendor_hist_ctr'] = vendor_ctr['vendor_hist_ctr'].fillna(0)
        
        # Get vendor_id for each product from impressions
        product_vendor = impressions[['PRODUCT_ID', 'VENDOR_ID']].drop_duplicates()
        metrics = metrics.merge(product_vendor, on='PRODUCT_ID', how='left', suffixes=('', '_new'))
        if 'VENDOR_ID_new' in metrics.columns:
            metrics['VENDOR_ID'] = metrics['VENDOR_ID'].fillna(metrics['VENDOR_ID_new'])
            metrics = metrics.drop(columns=['VENDOR_ID_new'])
        metrics = metrics.merge(vendor_ctr[['VENDOR_ID', 'vendor_hist_ctr']], on='VENDOR_ID', how='left')
        log(f"  Added vendor CTR history features")

# Fill missing historical features with 0
hist_cols = [c for c in metrics.columns if 'hist_' in c or 'vendor_hist' in c]
for col in hist_cols:
    if col in metrics.columns:
        metrics[col] = metrics[col].fillna(0)

log(f"\nFinal dataset shape with all features: {metrics.shape}")
log(f"Purchase rate: {metrics['did_purchase_product'].mean():.4%}")

[2025-09-23 05:03:37] 
Engineering historical features...
[2025-09-23 05:03:37]   Added user purchase history features
[2025-09-23 05:03:37]   Added user CTR history features
[2025-09-23 05:03:37]   Added vendor CTR history features
[2025-09-23 05:03:37] 
Final dataset shape with all features: (269276, 32)
[2025-09-23 05:03:37] Purchase rate: 0.0171%


In [14]:
# Save processed dataset
dataset_path = Path("./data/user_journey_causal_dataset.parquet")
dataset_path.parent.mkdir(parents=True, exist_ok=True)
metrics.to_parquet(dataset_path, index=False)
log(f"\n✓ Processed dataset saved to: {dataset_path}")
log(f"  Shape: {metrics.shape}")
log(f"  Memory usage: {metrics.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

[2025-09-23 05:03:38] 
✓ Processed dataset saved to: data/user_journey_causal_dataset.parquet
[2025-09-23 05:03:38]   Shape: (269276, 32)
[2025-09-23 05:03:38]   Memory usage: 202.48 MB


## Section 4: Exploratory Data Analysis

In [15]:
log("\n" + "="*80)
log("SECTION 4: EXPLORATORY DATA ANALYSIS")
log("="*80)

# 1. JOURNEY OVERVIEW
log("\n1. JOURNEY OVERVIEW")
log("-" * 60)
log(f"Total journeys: {metrics['journey_id'].nunique():,}")
log(f"Total users: {metrics['USER_ID'].nunique():,}")
log(f"Total products: {metrics['PRODUCT_ID'].nunique():,}")
log(f"Total observations: {len(metrics):,}")

# Journey composition
journey_stats = all_events.groupby('journey_id')['event_type'].value_counts().unstack(fill_value=0)
log("\nAverage events per journey:")
for event_type in journey_stats.columns:
    log(f"  {event_type}: {journey_stats[event_type].mean():.2f}")

# Journey duration distribution
log("\nJourney duration (hours):")
log(f"  Mean: {metrics['journey_duration_hours'].mean():.2f}")
log(f"  Median: {metrics['journey_duration_hours'].median():.2f}")
log(f"  75th percentile: {metrics['journey_duration_hours'].quantile(0.75):.2f}")
log(f"  95th percentile: {metrics['journey_duration_hours'].quantile(0.95):.2f}")

[2025-09-23 05:03:38] 
[2025-09-23 05:03:38] SECTION 4: EXPLORATORY DATA ANALYSIS
[2025-09-23 05:03:38] 
1. JOURNEY OVERVIEW
[2025-09-23 05:03:38] ------------------------------------------------------------
[2025-09-23 05:03:38] Total journeys: 7,820
[2025-09-23 05:03:38] Total users: 1,124
[2025-09-23 05:03:38] Total products: 215,589
[2025-09-23 05:03:38] Total observations: 269,276
[2025-09-23 05:03:38] 
Average events per journey:
[2025-09-23 05:03:38]   auction: 8.56
[2025-09-23 05:03:38]   click: 1.08
[2025-09-23 05:03:38]   impression: 33.57
[2025-09-23 05:03:38]   purchase: 0.18
[2025-09-23 05:03:38] 
Journey duration (hours):
[2025-09-23 05:03:38]   Mean: 2.00
[2025-09-23 05:03:38]   Median: 0.89
[2025-09-23 05:03:38]   75th percentile: 2.36
[2025-09-23 05:03:38]   95th percentile: 8.15


In [16]:
# 2. CONVERSION FUNNEL
log("\n2. CONVERSION FUNNEL")
log("-" * 60)

# Journey-level funnel
journey_level = metrics.groupby('journey_id').agg({
    'impressions_on_product': 'sum',
    'clicks_on_product': 'sum',
    'purchases_on_product': 'sum'
}).reset_index()

journeys_with_impressions = (journey_level['impressions_on_product'] > 0).sum()
journeys_with_clicks = (journey_level['clicks_on_product'] > 0).sum()
journeys_with_purchases = (journey_level['purchases_on_product'] > 0).sum()

log("Journey-level funnel:")
log(f"  Journeys with impressions: {journeys_with_impressions:,} (100.00%)")
log(f"  Journeys with clicks: {journeys_with_clicks:,} ({journeys_with_clicks/journeys_with_impressions:.2%})")
log(f"  Journeys with purchases: {journeys_with_purchases:,} ({journeys_with_purchases/journeys_with_impressions:.2%})")

# Product-level funnel
log("\nProduct-level funnel (products that received impressions):")
products_with_clicks = (metrics['clicks_on_product'] > 0).sum()
products_purchased = (metrics['did_purchase_product'] > 0).sum()

log(f"  Products impressed: {len(metrics):,} (100.00%)")
log(f"  Products clicked: {products_with_clicks:,} ({products_with_clicks/len(metrics):.2%})")
log(f"  Products purchased: {products_purchased:,} ({products_purchased/len(metrics):.2%})")

[2025-09-23 05:03:38] 
2. CONVERSION FUNNEL
[2025-09-23 05:03:38] ------------------------------------------------------------
[2025-09-23 05:03:38] Journey-level funnel:
[2025-09-23 05:03:38]   Journeys with impressions: 7,820 (100.00%)
[2025-09-23 05:03:38]   Journeys with clicks: 3,047 (38.96%)
[2025-09-23 05:03:38]   Journeys with purchases: 43 (0.55%)
[2025-09-23 05:03:38] 
Product-level funnel (products that received impressions):
[2025-09-23 05:03:38]   Products impressed: 269,276 (100.00%)
[2025-09-23 05:03:38]   Products clicked: 9,768 (3.63%)
[2025-09-23 05:03:38]   Products purchased: 46 (0.02%)


In [17]:
# 3. ENGAGEMENT METRICS
log("\n3. ENGAGEMENT METRICS")
log("-" * 60)

# Overall CTR
total_impressions = metrics['impressions_on_product'].sum()
total_clicks = metrics['clicks_on_product'].sum()
ctr = total_clicks / total_impressions if total_impressions > 0 else 0
log(f"Overall CTR: {ctr:.2%} ({total_clicks:,} clicks / {total_impressions:,} impressions)")

# Conversion rate by click status
clicked_products = metrics[metrics['clicks_on_product'] > 0]
not_clicked_products = metrics[metrics['clicks_on_product'] == 0]

log("\nConversion rates:")
log(f"  Products with clicks: {clicked_products['did_purchase_product'].mean():.4%}")
log(f"  Products without clicks: {not_clicked_products['did_purchase_product'].mean():.4%}")
if not_clicked_products['did_purchase_product'].mean() > 0:
    lift = (clicked_products['did_purchase_product'].mean() / not_clicked_products['did_purchase_product'].mean() - 1)
    log(f"  Lift from clicking: {lift:.1f}x")

# Click distribution
log("\nClick distribution on impressed products:")
click_dist = metrics['clicks_on_product'].value_counts().sort_index().head(10)
for clicks, count in click_dist.items():
    log(f"  {clicks} clicks: {count:,} products ({count/len(metrics):.2%})")

[2025-09-23 05:03:38] 
3. ENGAGEMENT METRICS
[2025-09-23 05:03:38] ------------------------------------------------------------
[2025-09-23 05:03:38] Overall CTR: 3.19% (11,087 clicks / 347,741 impressions)
[2025-09-23 05:03:38] 
Conversion rates:
[2025-09-23 05:03:38]   Products with clicks: 0.4197%
[2025-09-23 05:03:38]   Products without clicks: 0.0019%
[2025-09-23 05:03:38]   Lift from clicking: 216.9x
[2025-09-23 05:03:38] 
Click distribution on impressed products:
[2025-09-23 05:03:38]   0 clicks: 259,508 products (96.37%)
[2025-09-23 05:03:38]   1 clicks: 8,637 products (3.21%)
[2025-09-23 05:03:38]   2 clicks: 977 products (0.36%)
[2025-09-23 05:03:38]   3 clicks: 130 products (0.05%)
[2025-09-23 05:03:38]   4 clicks: 17 products (0.01%)
[2025-09-23 05:03:38]   5 clicks: 4 products (0.00%)
[2025-09-23 05:03:38]   6 clicks: 3 products (0.00%)


In [18]:
# 4. PRODUCT CHARACTERISTICS
log("\n4. PRODUCT CHARACTERISTICS")
log("-" * 60)

# Price distribution
log("Price distribution (for products with price data):")
price_data = metrics[metrics['PRICE'].notna()]
if len(price_data) > 0:
    log(f"  Mean: ${price_data['PRICE'].mean():.2f}")
    log(f"  Median: ${price_data['PRICE'].median():.2f}")
    log(f"  25th percentile: ${price_data['PRICE'].quantile(0.25):.2f}")
    log(f"  75th percentile: ${price_data['PRICE'].quantile(0.75):.2f}")

    # Conversion by price quartile
    try:
        price_data['price_quartile'] = pd.qcut(price_data['PRICE'], q=4, labels=['Q1', 'Q2', 'Q3', 'Q4'], duplicates='drop')
        log("\nConversion by price quartile:")
        for q in ['Q1', 'Q2', 'Q3', 'Q4']:
            q_data = price_data[price_data['price_quartile'] == q]
            if len(q_data) > 0:
                conv_rate = q_data['did_purchase_product'].mean()
                price_range = f"${q_data['PRICE'].min():.0f}-${q_data['PRICE'].max():.0f}"
                log(f"  {q} {price_range}: {conv_rate:.4%}")
    except:
        log("  Could not create price quartiles - insufficient price variation")

[2025-09-23 05:03:38] 
4. PRODUCT CHARACTERISTICS
[2025-09-23 05:03:38] ------------------------------------------------------------
[2025-09-23 05:03:38] Price distribution (for products with price data):
[2025-09-23 05:03:38]   Mean: $166.34
[2025-09-23 05:03:38]   Median: $40.00
[2025-09-23 05:03:38]   25th percentile: $24.00
[2025-09-23 05:03:38]   75th percentile: $75.00
[2025-09-23 05:03:38] 
Conversion by price quartile:
[2025-09-23 05:03:38]   Q1 $3-$24: 0.0270%
[2025-09-23 05:03:38]   Q2 $25-$40: 0.0135%
[2025-09-23 05:03:38]   Q3 $41-$75: 0.0224%
[2025-09-23 05:03:38]   Q4 $76-$8008135: 0.0060%


## Section 5: Causal Analysis

In [19]:
log("\n" + "="*80)
log("SECTION 5: CAUSAL ANALYSIS")
log("="*80)

# Prepare features for regression
df = metrics.copy()

# Handle missing values
if 'PRICE' in df.columns:
    median_price = df['PRICE'].median()
    df['PRICE'].fillna(median_price, inplace=True)
    log(f"INFO: Filled missing PRICE with median value of ${median_price:.2f}")

# For other numerics, 0 is reasonable
numeric_cols = df.select_dtypes(include=[np.number]).columns
for col in numeric_cols:
    if col != 'PRICE':  # Already handled
        df[col] = df[col].fillna(0)

# Create log-transformed features
df['log_price'] = np.log1p(df['PRICE'])
df['log_journey_duration'] = np.log1p(df['journey_duration_hours'])
df['log_impressions'] = np.log1p(df['impressions_on_product'])

# Create interaction terms
df['clicks_x_price'] = df['clicks_on_product'] * df['log_price']
df['clicks_x_duration'] = df['total_clicks'] * df['log_journey_duration']

# Create revenue outcome
df['log_revenue'] = np.log1p(df['PRICE'] * df['did_purchase_product'])

log(f"Prepared {len(df):,} observations for analysis")
log(f"Features created including interactions and transformations")

[2025-09-23 05:03:38] 
[2025-09-23 05:03:38] SECTION 5: CAUSAL ANALYSIS
[2025-09-23 05:03:38] INFO: Filled missing PRICE with median value of $40.00
[2025-09-23 05:03:38] Prepared 269,276 observations for analysis
[2025-09-23 05:03:38] Features created including interactions and transformations


In [20]:
# DEFINE MODEL FORMULAS
log("\n" + "="*60)
log("MODEL SPECIFICATION")
log("="*60)

# Base controls
base_controls = ['log_price', 'log_journey_duration', 'distinct_products', 'log_impressions']

# Add historical controls if they exist
historical_controls = []
for col in ['hist_purchase_count', 'hist_user_ctr', 'vendor_hist_ctr']:
    if col in df.columns:
        historical_controls.append(col)

# Add competitive context if exists
competitive_controls = []
for col in ['avg_winning_rank', 'product_win_rate']:
    if col in df.columns:
        competitive_controls.append(col)

# Combine all controls
all_controls = base_controls + historical_controls + competitive_controls
control_str = " + ".join(all_controls)

# Treatment variables
treatment_var = "total_clicks"
product_clicks_var = "clicks_on_product"

# Model formulas
logit_formula = f"did_purchase_product ~ {treatment_var} + {product_clicks_var} + {control_str}"
ols_formula = f"log_revenue ~ {treatment_var} + {product_clicks_var} + {control_str}"

log("MODEL FORMULATIONS:")
log(f"  Treatment: {treatment_var} (journey-level clicks)")
log(f"  Product treatment: {product_clicks_var} (product-specific clicks)")
log(f"  Base controls: {', '.join(base_controls)}")
if historical_controls:
    log(f"  Historical controls: {', '.join(historical_controls)}")
if competitive_controls:
    log(f"  Competitive controls: {', '.join(competitive_controls)}")
log(f"\n  Logit: {logit_formula}")
log(f"  OLS:   {ols_formula}")

[2025-09-23 05:03:38] 
[2025-09-23 05:03:38] MODEL SPECIFICATION
[2025-09-23 05:03:38] MODEL FORMULATIONS:
[2025-09-23 05:03:38]   Treatment: total_clicks (journey-level clicks)
[2025-09-23 05:03:38]   Product treatment: clicks_on_product (product-specific clicks)
[2025-09-23 05:03:38]   Base controls: log_price, log_journey_duration, distinct_products, log_impressions
[2025-09-23 05:03:38]   Historical controls: hist_purchase_count, hist_user_ctr, vendor_hist_ctr
[2025-09-23 05:03:38]   Competitive controls: avg_winning_rank, product_win_rate
[2025-09-23 05:03:38] 
  Logit: did_purchase_product ~ total_clicks + clicks_on_product + log_price + log_journey_duration + distinct_products + log_impressions + hist_purchase_count + hist_user_ctr + vendor_hist_ctr + avg_winning_rank + product_win_rate
[2025-09-23 05:03:38]   OLS:   log_revenue ~ total_clicks + clicks_on_product + log_price + log_journey_duration + distinct_products + log_impressions + hist_purchase_count + hist_user_ctr + vend

In [21]:
# LOGISTIC REGRESSION
log("\n" + "="*60)
log("MODEL 1: LOGISTIC REGRESSION")
log("="*60)

try:
    # Fit model using formula API
    logit_model = smf.logit(formula=logit_formula, data=df)
    logit_results = logit_model.fit(disp=0, maxiter=100)
    
    log("\nLogistic Regression Results:")
    log("-" * 40)
    
    # Print summary
    log(str(logit_results.summary()))
    
    # Calculate Odds Ratios
    log("\n" + "="*40)
    log("ODDS RATIOS AND 95% CONFIDENCE INTERVALS")
    log("="*40)
    
    odds_ratios = pd.DataFrame({
        "Coefficient": logit_results.params,
        "Odds Ratio": np.exp(logit_results.params),
        "Std. Error": logit_results.bse,
        "z-statistic": logit_results.tvalues,
        "P>|z|": logit_results.pvalues,
        "OR CI Lower": np.exp(logit_results.conf_int()[0]),
        "OR CI Upper": np.exp(logit_results.conf_int()[1]),
    })
    
    log("\n" + odds_ratios.to_string())
    
    # Key interpretations
    log("\n" + "="*40)
    log("KEY INTERPRETATIONS")
    log("="*40)
    
    if treatment_var in odds_ratios.index:
        or_total = odds_ratios.loc[treatment_var, "Odds Ratio"]
        p_total = odds_ratios.loc[treatment_var, "P>|z|"]
        log(f"\n'{treatment_var}' (Journey-level clicks):")
        log(f"  Odds Ratio = {or_total:.4f}, p-value = {p_total:.4f}")
        log(f"  Interpretation: Each additional journey click multiplies the odds of purchase by {or_total:.4f}")
        if p_total < 0.05:
            log(f"  ✓ Statistically significant at 5% level")
        else:
            log(f"  ✗ Not statistically significant at 5% level")
    
    if product_clicks_var in odds_ratios.index:
        or_product = odds_ratios.loc[product_clicks_var, "Odds Ratio"]
        p_product = odds_ratios.loc[product_clicks_var, "P>|z|"]
        log(f"\n'{product_clicks_var}' (Product-specific clicks):")
        log(f"  Odds Ratio = {or_product:.4f}, p-value = {p_product:.4f}")
        log(f"  Interpretation: Each click on the specific product multiplies the odds of purchasing it by {or_product:.4f}")
        if p_product < 0.05:
            log(f"  ✓ Statistically significant at 5% level")
        else:
            log(f"  ✗ Not statistically significant at 5% level")
    
    # Model fit statistics
    log("\n" + "="*40)
    log("MODEL FIT STATISTICS")
    log("="*40)
    log(f"  Pseudo R-squared: {logit_results.prsquared:.4f}")
    log(f"  Log-Likelihood: {logit_results.llf:.4f}")
    log(f"  AIC: {logit_results.aic:.4f}")
    log(f"  BIC: {logit_results.bic:.4f}")
    
except Exception as e:
    log(f"ERROR in logistic regression: {e}")
    import traceback
    log(traceback.format_exc())

[2025-09-23 05:03:38] 
[2025-09-23 05:03:38] MODEL 1: LOGISTIC REGRESSION
[2025-09-23 05:03:39] 
Logistic Regression Results:
[2025-09-23 05:03:39] ----------------------------------------
[2025-09-23 05:03:39]                             Logit Regression Results                            
Dep. Variable:     did_purchase_product   No. Observations:               269276
Model:                            Logit   Df Residuals:                   269264
Method:                             MLE   Df Model:                           11
Date:                  Tue, 23 Sep 2025   Pseudo R-squ.:                  0.2462
Time:                          05:03:39   Log-Likelihood:                -335.47
converged:                         True   LL-Null:                       -445.04
Covariance Type:              nonrobust   LLR p-value:                 7.832e-41
                           coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------

In [22]:
# OLS REGRESSION WITH ROBUST STANDARD ERRORS
log("\n" + "="*60)
log("MODEL 2: OLS REGRESSION (WITH ROBUST SE)")
log("="*60)

try:
    # Fit model using formula API with robust standard errors
    ols_model = smf.ols(formula=ols_formula, data=df)
    # Use HC3 robust standard errors (most conservative)
    ols_results = ols_model.fit(cov_type='HC3')
    
    log("\nOLS Regression Results (with Heteroskedasticity-Robust Standard Errors):")
    log("-" * 40)
    
    # Print summary
    log(str(ols_results.summary()))
    
    # Extract key coefficients
    log("\n" + "="*40)
    log("KEY COEFFICIENTS WITH ROBUST STANDARD ERRORS")
    log("="*40)
    
    coef_summary = pd.DataFrame({
        "Coefficient": ols_results.params,
        "Robust SE": ols_results.bse,
        "t-statistic": ols_results.tvalues,
        "P>|t|": ols_results.pvalues,
        "CI Lower": ols_results.conf_int()[0],
        "CI Upper": ols_results.conf_int()[1],
    })
    
    log("\n" + coef_summary.to_string())
    
    # Key interpretations
    log("\n" + "="*40)
    log("KEY INTERPRETATIONS")
    log("="*40)
    
    if treatment_var in coef_summary.index:
        coef_total = coef_summary.loc[treatment_var, "Coefficient"]
        p_total = coef_summary.loc[treatment_var, "P>|t|"]
        pct_change_total = (np.exp(coef_total) - 1) * 100
        log(f"\n'{treatment_var}' (Journey-level clicks):")
        log(f"  Coefficient = {coef_total:.6f}, p-value = {p_total:.4f}")
        log(f"  Interpretation: Each additional journey click is associated with {pct_change_total:.3f}% change in revenue")
        if p_total < 0.05:
            log(f"  ✓ Statistically significant at 5% level (with robust SE)")
        else:
            log(f"  ✗ Not statistically significant at 5% level")
    
    if product_clicks_var in coef_summary.index:
        coef_product = coef_summary.loc[product_clicks_var, "Coefficient"]
        p_product = coef_summary.loc[product_clicks_var, "P>|t|"]
        pct_change_product = (np.exp(coef_product) - 1) * 100
        log(f"\n'{product_clicks_var}' (Product-specific clicks):")
        log(f"  Coefficient = {coef_product:.6f}, p-value = {p_product:.4f}")
        log(f"  Interpretation: Each click on the specific product is associated with {pct_change_product:.3f}% change in revenue")
        if p_product < 0.05:
            log(f"  ✓ Statistically significant at 5% level (with robust SE)")
        else:
            log(f"  ✗ Not statistically significant at 5% level")
    
    # Model fit statistics
    log("\n" + "="*40)
    log("MODEL FIT STATISTICS")
    log("="*40)
    log(f"  R-squared: {ols_results.rsquared:.4f}")
    log(f"  Adjusted R-squared: {ols_results.rsquared_adj:.4f}")
    log(f"  F-statistic: {ols_results.fvalue:.4f} (p={ols_results.f_pvalue:.6f})")
    log(f"  AIC: {ols_results.aic:.4f}")
    log(f"  BIC: {ols_results.bic:.4f}")
    
    # Heteroskedasticity test
    residuals = ols_results.resid
    exog = ols_model.exog
    bp_test = het_breuschpagan(residuals, exog)
    
    log("\n" + "="*40)
    log("DIAGNOSTIC TESTS")
    log("="*40)
    log(f"  Breusch-Pagan test for heteroskedasticity:")
    log(f"    LM statistic = {bp_test[0]:.4f}, p-value = {bp_test[1]:.4f}")
    if bp_test[1] < 0.05:
        log(f"    ✓ Evidence of heteroskedasticity - robust SEs are appropriate")
    else:
        log(f"    ✗ No strong evidence of heteroskedasticity")
    
except Exception as e:
    log(f"ERROR in OLS regression: {e}")
    import traceback
    log(traceback.format_exc())

[2025-09-23 05:03:39] 
[2025-09-23 05:03:39] MODEL 2: OLS REGRESSION (WITH ROBUST SE)
[2025-09-23 05:03:39] 
OLS Regression Results (with Heteroskedasticity-Robust Standard Errors):
[2025-09-23 05:03:39] ----------------------------------------
[2025-09-23 05:03:39]                             OLS Regression Results                            
Dep. Variable:            log_revenue   R-squared:                       0.006
Model:                            OLS   Adj. R-squared:                  0.006
Method:                 Least Squares   F-statistic:                     3.749
Date:                Tue, 23 Sep 2025   Prob (F-statistic):           2.19e-05
Time:                        05:03:39   Log-Likelihood:             4.4947e+05
No. Observations:              269276   AIC:                        -8.989e+05
Df Residuals:                  269264   BIC:                        -8.988e+05
Df Model:                          11                                         
Covariance Type:      

In [23]:
# ROBUSTNESS CHECKS
log("\n" + "="*60)
log("ROBUSTNESS CHECKS")
log("="*60)

# Simple mean comparison
log("\n1. Simple Mean Comparison:")
treated = df[df['total_clicks'] > 0]
control = df[df['total_clicks'] == 0]

treated_conv = treated['did_purchase_product'].mean()
control_conv = control['did_purchase_product'].mean()

log(f"  Treated (clicks>0) conversion: {treated_conv:.4%}")
log(f"  Control (clicks=0) conversion: {control_conv:.4%}")
log(f"  Raw difference: {(treated_conv - control_conv):.4%}")

# T-test
t_stat, p_value = stats.ttest_ind(treated['did_purchase_product'], control['did_purchase_product'])
log(f"  T-test: t={t_stat:.4f}, p={p_value:.4f}")

# Click intensity analysis
log("\n2. Click Intensity Analysis:")
click_bins = [0, 1, 2, 5, 10, float('inf')]
df['click_bin'] = pd.cut(df['total_clicks'], bins=click_bins, right=False)

bin_conv = df.groupby('click_bin', observed=True)['did_purchase_product'].agg(['mean', 'count'])
log("  Conversion by click bins:")
for idx, row in bin_conv.iterrows():
    log(f"    {idx}: {row['mean']:.4%} (n={row['count']:,})")

[2025-09-23 05:03:39] 
[2025-09-23 05:03:39] ROBUSTNESS CHECKS
[2025-09-23 05:03:39] 
1. Simple Mean Comparison:
[2025-09-23 05:03:39]   Treated (clicks>0) conversion: 0.0228%
[2025-09-23 05:03:39]   Control (clicks=0) conversion: 0.0037%
[2025-09-23 05:03:39]   Raw difference: 0.0191%
[2025-09-23 05:03:39]   T-test: t=3.4722, p=0.0005
[2025-09-23 05:03:39] 
2. Click Intensity Analysis:
[2025-09-23 05:03:39]   Conversion by click bins:
[2025-09-23 05:03:39]     [0.0, 1.0): 0.0037% (n=80,712.0)
[2025-09-23 05:03:39]     [1.0, 2.0): 0.0172% (n=40,697.0)
[2025-09-23 05:03:39]     [2.0, 5.0): 0.0214% (n=65,442.0)
[2025-09-23 05:03:39]     [5.0, 10.0): 0.0231% (n=43,296.0)
[2025-09-23 05:03:39]     [10.0, inf): 0.0307% (n=39,129.0)


In [24]:
# Save comprehensive results
log("\n" + "="*80)
log("ANALYSIS COMPLETE")
log("="*80)

output_path = Path("./data") / f"causal_analysis_results_{timestamp}.txt"
with open(output_path, 'w') as f:
    f.write('\n'.join(output_log))

log(f"\nResults saved to: {output_path}")
log(f"Total log entries: {len(output_log)}")

# Summary statistics
log("\n" + "="*60)
log("FINAL SUMMARY")
log("="*60)
log(f"  Total users analyzed: {df['USER_ID'].nunique():,}")
log(f"  Total journeys: {df['journey_id'].nunique():,}")
log(f"  Total products: {df['PRODUCT_ID'].nunique():,}")
log(f"  Overall purchase rate: {df['did_purchase_product'].mean():.4%}")
log(f"  CTR: {(df['clicks_on_product'].sum() / df['impressions_on_product'].sum()):.2%}")

[2025-09-23 05:03:39] 
[2025-09-23 05:03:39] ANALYSIS COMPLETE
[2025-09-23 05:03:39] 
Results saved to: data/causal_analysis_results_20250923_050249.txt
[2025-09-23 05:03:39] Total log entries: 216
[2025-09-23 05:03:39] 
[2025-09-23 05:03:39] FINAL SUMMARY
[2025-09-23 05:03:39]   Total users analyzed: 1,124
[2025-09-23 05:03:39]   Total journeys: 7,820
[2025-09-23 05:03:39]   Total products: 215,589
[2025-09-23 05:03:39]   Overall purchase rate: 0.0171%
[2025-09-23 05:03:39]   CTR: 3.19%
