# Data Pull with Updated Schema

Pull a small sample of recent data to explore the updated schema with new columns:
- AUCTIONS_RESULTS: QUALITY, FINAL_BID, PRICE, CONVERSION_RATE, PACING
- AUCTIONS_USERS: PLACEMENT

Configuration:
- Sample: 0.001% of users (~100-200 users)
- Window: 7-14 days (recent data)
- Purpose: Schema exploration and validation

In [1]:
# --- IMPORTS ---
import os
import textwrap
from datetime import date, timedelta
from pathlib import Path
import warnings

import pandas as pd
import numpy as np
from dotenv import load_dotenv
import snowflake.connector
from tqdm import tqdm

# Suppress warnings
warnings.filterwarnings(
    'ignore',
    category=UserWarning,
    message='pandas only supports SQLAlchemy connectable.*'
)

print("[INFO] Imports complete")

[INFO] Imports complete


In [2]:
# --- CONFIGURATION ---
load_dotenv('../.env')

# Date range: Last 14 days for quick testing
ANALYSIS_END_DATE = date.today()
DAYS_WINDOW = 14

# Sampling: 0.1% for quick exploration (10x more than shopping-sessions)
SAMPLING_FRACTION = 0.001  # 0.1% of users

print(f"[CONFIG] End date: {ANALYSIS_END_DATE}")
print(f"[CONFIG] Window: {DAYS_WINDOW} days")
print(f"[CONFIG] Sampling: {SAMPLING_FRACTION:.4%}")

# --- SNOWFLAKE CONNECTION ---
try:
    conn = snowflake.connector.connect(
        user=os.getenv('SNOWFLAKE_USER'),
        password=os.getenv('SNOWFLAKE_PASSWORD'),
        account=os.getenv('SNOWFLAKE_ACCOUNT'),
        warehouse=os.getenv('SNOWFLAKE_WAREHOUSE', 'COMPUTE_WH'),
        database='INCREMENTALITY',
        schema='INCREMENTALITY_RESEARCH'
    )
    print("\n[SUCCESS] Snowflake connection established.")
except Exception as e:
    print(f"\n[FAILURE] Could not connect to Snowflake: {e}")

[CONFIG] End date: 2025-10-11
[CONFIG] Window: 14 days
[CONFIG] Sampling: 0.1000%

[SUCCESS] Snowflake connection established.


In [3]:
# --- DATA FETCHING FUNCTION WITH UPDATED SCHEMA ---

def fetch_data_with_new_schema(conn, start_date: str, end_date: str, sampling_fraction: float) -> tuple:
    """
    Fetches data from Snowflake with UPDATED SCHEMA including new columns.
    
    New columns:
    - AUCTIONS_RESULTS: QUALITY, FINAL_BID, PRICE, CONVERSION_RATE, PACING
    - AUCTIONS_USERS: PLACEMENT
    
    Args:
        conn: Snowflake connection
        start_date (str): Start date 'YYYY-MM-DD'
        end_date (str): End date 'YYYY-MM-DD'
        sampling_fraction (float): Fraction of users to sample
    
    Returns:
        tuple: (journey_data dict, catalog_data DataFrame)
    """
    print(f"\n{'='*80}")
    print("FETCHING DATA WITH UPDATED SCHEMA")
    print(f"{'='*80}")
    print(f"  Date range: {start_date} to {end_date}")
    print(f"  Sampling: {sampling_fraction:.4%}")
    
    journey_data = {}
    
    # Build shared CTE for deterministic user sampling
    total_buckets = 10000
    selection_threshold = max(1, int(total_buckets * sampling_fraction))
    
    cte_sql = textwrap.dedent(f"""
    WITH SAMPLED_USER_IDS AS (
        SELECT OPAQUE_USER_ID FROM (
            SELECT OPAQUE_USER_ID, MOD(ABS(HASH(OPAQUE_USER_ID)), {total_buckets}) AS bucket
            FROM (SELECT DISTINCT OPAQUE_USER_ID FROM AUCTIONS_USERS 
                  WHERE CREATED_AT BETWEEN '{start_date}' AND '{end_date}')
        ) WHERE bucket < {selection_threshold}
    )
    """)
    
    print(f"\n  Sampling strategy: Hash-based, {selection_threshold} buckets out of {total_buckets}")
    
    # 1. AUCTIONS_USERS - WITH NEW PLACEMENT COLUMN
    query_users = cte_sql + textwrap.dedent(f"""
    SELECT 
        LOWER(TO_VARCHAR(au.AUCTION_ID, 'HEX')) AS AUCTION_ID,
        au.OPAQUE_USER_ID,
        au.CREATED_AT,
        au.PLACEMENT
    FROM AUCTIONS_USERS AS au
    JOIN SAMPLED_USER_IDS AS s ON au.OPAQUE_USER_ID = s.OPAQUE_USER_ID
    WHERE au.CREATED_AT BETWEEN '{start_date}' AND '{end_date}'
    """)
    
    print("\n  [1/6] Fetching AUCTIONS_USERS (with PLACEMENT)...")
    journey_data['AUCTIONS_USERS'] = pd.read_sql(query_users, conn)
    print(f"        Found {len(journey_data['AUCTIONS_USERS']):,} auctions from {journey_data['AUCTIONS_USERS']['OPAQUE_USER_ID'].nunique():,} users")
    
    # 2. AUCTIONS_RESULTS - WITH NEW COLUMNS
    query_bids = cte_sql + textwrap.dedent(f"""
    SELECT
        LOWER(TO_VARCHAR(ar.AUCTION_ID, 'HEX')) AS AUCTION_ID,
        LOWER(TO_VARCHAR(ar.VENDOR_ID, 'HEX')) AS VENDOR_ID,
        LOWER(TO_VARCHAR(ar.CAMPAIGN_ID, 'HEX')) AS CAMPAIGN_ID,
        LOWER(TRIM(ar.PRODUCT_ID)) AS PRODUCT_ID,
        ar.RANKING,
        ar.IS_WINNER,
        ar.CREATED_AT,
        ar.QUALITY,
        ar.FINAL_BID,
        ar.PRICE,
        ar.CONVERSION_RATE,
        ar.PACING
    FROM AUCTIONS_RESULTS ar
    JOIN AUCTIONS_USERS au ON ar.AUCTION_ID = au.AUCTION_ID
    WHERE au.OPAQUE_USER_ID IN (SELECT OPAQUE_USER_ID FROM SAMPLED_USER_IDS)
      AND ar.CREATED_AT BETWEEN '{start_date}' AND '{end_date}'
    """)
    
    print("  [2/6] Fetching AUCTIONS_RESULTS (with QUALITY, FINAL_BID, PRICE, CONVERSION_RATE, PACING)...")
    journey_data['AUCTIONS_RESULTS'] = pd.read_sql(query_bids, conn)
    print(f"        Found {len(journey_data['AUCTIONS_RESULTS']):,} bids")
    
    # 3. IMPRESSIONS
    query_impressions = cte_sql + textwrap.dedent(f"""
    SELECT
        i.INTERACTION_ID,
        LOWER(REPLACE(i.AUCTION_ID, '-', '')) AS AUCTION_ID,
        LOWER(TRIM(i.PRODUCT_ID)) AS PRODUCT_ID,
        i.USER_ID,
        LOWER(REPLACE(i.CAMPAIGN_ID, '-', '')) AS CAMPAIGN_ID,
        LOWER(REPLACE(i.VENDOR_ID, '-', '')) AS VENDOR_ID,
        i.OCCURRED_AT
    FROM IMPRESSIONS i
    JOIN SAMPLED_USER_IDS s ON i.USER_ID = s.OPAQUE_USER_ID
    WHERE i.OCCURRED_AT BETWEEN '{start_date}' AND '{end_date}'
    """)
    
    print("  [3/6] Fetching IMPRESSIONS...")
    journey_data['IMPRESSIONS'] = pd.read_sql(query_impressions, conn)
    print(f"        Found {len(journey_data['IMPRESSIONS']):,} impressions")
    
    # 4. CLICKS
    query_clicks = cte_sql + textwrap.dedent(f"""
    SELECT
        c.INTERACTION_ID,
        LOWER(REPLACE(c.AUCTION_ID, '-', '')) AS AUCTION_ID,
        LOWER(TRIM(c.PRODUCT_ID)) AS PRODUCT_ID,
        c.USER_ID,
        LOWER(REPLACE(c.CAMPAIGN_ID, '-', '')) AS CAMPAIGN_ID,
        LOWER(REPLACE(c.VENDOR_ID, '-', '')) AS VENDOR_ID,
        c.OCCURRED_AT
    FROM CLICKS c
    JOIN SAMPLED_USER_IDS s ON c.USER_ID = s.OPAQUE_USER_ID
    WHERE c.OCCURRED_AT BETWEEN '{start_date}' AND '{end_date}'
    """)
    
    print("  [4/6] Fetching CLICKS...")
    journey_data['CLICKS'] = pd.read_sql(query_clicks, conn)
    print(f"        Found {len(journey_data['CLICKS']):,} clicks")
    
    # 5. PURCHASES
    query_purchases = cte_sql + textwrap.dedent(f"""
    SELECT
        p.PURCHASE_ID,
        p.PURCHASED_AT,
        LOWER(TRIM(p.PRODUCT_ID)) AS PRODUCT_ID,
        p.QUANTITY,
        p.UNIT_PRICE,
        p.USER_ID,
        p.PURCHASE_LINE
    FROM PURCHASES p
    JOIN SAMPLED_USER_IDS s ON p.USER_ID = s.OPAQUE_USER_ID
    WHERE p.PURCHASED_AT BETWEEN '{start_date}' AND '{end_date}'
    """)
    
    print("  [5/6] Fetching PURCHASES...")
    journey_data['PURCHASES'] = pd.read_sql(query_purchases, conn)
    print(f"        Found {len(journey_data['PURCHASES']):,} purchases")
    
    # 6. CATALOG (simplified - no category parsing for now)
    catalog_query = cte_sql + textwrap.dedent(f""",
    RELEVANT_PRODUCT_IDS AS (
        SELECT DISTINCT LOWER(TRIM(ar.PRODUCT_ID)) AS product_id 
        FROM AUCTIONS_RESULTS ar 
        JOIN AUCTIONS_USERS au ON ar.AUCTION_ID = au.AUCTION_ID 
        JOIN SAMPLED_USER_IDS s ON au.OPAQUE_USER_ID = s.OPAQUE_USER_ID 
        WHERE ar.CREATED_AT BETWEEN '{start_date}' AND '{end_date}' 
          AND ar.PRODUCT_ID IS NOT NULL
        UNION
        SELECT DISTINCT LOWER(TRIM(i.PRODUCT_ID)) AS product_id 
        FROM IMPRESSIONS i 
        JOIN SAMPLED_USER_IDS s ON i.USER_ID = s.OPAQUE_USER_ID 
        WHERE i.OCCURRED_AT BETWEEN '{start_date}' AND '{end_date}' 
          AND i.PRODUCT_ID IS NOT NULL
        UNION
        SELECT DISTINCT LOWER(TRIM(c.PRODUCT_ID)) AS product_id 
        FROM CLICKS c 
        JOIN SAMPLED_USER_IDS s ON c.USER_ID = s.OPAQUE_USER_ID 
        WHERE c.OCCURRED_AT BETWEEN '{start_date}' AND '{end_date}' 
          AND c.PRODUCT_ID IS NOT NULL
        UNION
        SELECT DISTINCT LOWER(TRIM(p.PRODUCT_ID)) AS product_id 
        FROM PURCHASES p 
        JOIN SAMPLED_USER_IDS s ON p.USER_ID = s.OPAQUE_USER_ID 
        WHERE p.PURCHASED_AT BETWEEN '{start_date}' AND '{end_date}' 
          AND p.PRODUCT_ID IS NOT NULL
    )
    SELECT
        LOWER(TRIM(c.PRODUCT_ID)) AS PRODUCT_ID,
        c.NAME,
        c.PRICE,
        c.ACTIVE,
        c.IS_DELETED
    FROM CATALOG c
    JOIN RELEVANT_PRODUCT_IDS rpi ON rpi.product_id = LOWER(TRIM(c.PRODUCT_ID))
    """)
    
    print("  [6/6] Fetching CATALOG...")
    catalog_data = pd.read_sql(catalog_query, conn)
    print(f"        Found {len(catalog_data):,} products")
    
    print(f"\n  [SUCCESS] Data fetch complete!")
    print(f"  Total users: {journey_data['AUCTIONS_USERS']['OPAQUE_USER_ID'].nunique():,}")
    print(f"  Total events: {sum(len(df) for df in journey_data.values()):,}")
    
    return journey_data, catalog_data

In [4]:
# --- EXECUTE DATA PULL ---

if 'conn' in locals() and conn and not conn.is_closed():
    # Calculate date range
    start_date = ANALYSIS_END_DATE - timedelta(days=DAYS_WINDOW)
    start_date_str = start_date.strftime('%Y-%m-%d')
    end_date_str = ANALYSIS_END_DATE.strftime('%Y-%m-%d')
    
    # Fetch data
    journey_data, catalog_data = fetch_data_with_new_schema(
        conn, start_date_str, end_date_str, SAMPLING_FRACTION
    )
    
    print("\n[INFO] Data successfully loaded into memory.")
else:
    print("[ERROR] No active Snowflake connection. Please run the connection cell first.")


FETCHING DATA WITH UPDATED SCHEMA
  Date range: 2025-09-27 to 2025-10-11
  Sampling: 0.1000%

  Sampling strategy: Hash-based, 10 buckets out of 10000

  [1/6] Fetching AUCTIONS_USERS (with PLACEMENT)...
        Found 413,457 auctions from 4,671 users
  [2/6] Fetching AUCTIONS_RESULTS (with QUALITY, FINAL_BID, PRICE, CONVERSION_RATE, PACING)...
        Found 18,838,670 bids
  [3/6] Fetching IMPRESSIONS...
        Found 533,146 impressions
  [4/6] Fetching CLICKS...
        Found 16,706 clicks
  [5/6] Fetching PURCHASES...
        Found 2,188 purchases
  [6/6] Fetching CATALOG...
        Found 2,007,695 products

  [SUCCESS] Data fetch complete!
  Total users: 4,671
  Total events: 19,804,167

[INFO] Data successfully loaded into memory.


In [5]:
# --- SAVE RAW DATA ---

if 'journey_data' in locals() and 'catalog_data' in locals():
    output_dir = Path("./data")
    output_dir.mkdir(exist_ok=True)
    
    timestamp = ANALYSIS_END_DATE.strftime('%Y%m%d')
    
    print(f"\n{'='*80}")
    print("SAVING DATA ARTIFACTS")
    print(f"{'='*80}")
    
    # Save journey data
    for name, df in journey_data.items():
        path = output_dir / f"raw_{name.lower()}_{timestamp}.parquet"
        df.to_parquet(path, index=False)
        size_mb = path.stat().st_size / 1024 / 1024
        print(f"  Saved {path.name} ({size_mb:.2f} MB)")
    
    # Save catalog
    catalog_path = output_dir / f"catalog_{timestamp}.parquet"
    catalog_data.to_parquet(catalog_path, index=False)
    size_mb = catalog_path.stat().st_size / 1024 / 1024
    print(f"  Saved {catalog_path.name} ({size_mb:.2f} MB)")
    
    print(f"\n[SUCCESS] All data saved to '{output_dir}'")
else:
    print("[ERROR] Data not found. Please run the data pull cell first.")


SAVING DATA ARTIFACTS
  Saved raw_auctions_users_20251011.parquet (11.96 MB)
  Saved raw_auctions_results_20251011.parquet (981.55 MB)
  Saved raw_impressions_20251011.parquet (46.33 MB)
  Saved raw_clicks_20251011.parquet (1.92 MB)
  Saved raw_purchases_20251011.parquet (0.14 MB)
  Saved catalog_20251011.parquet (115.65 MB)

[SUCCESS] All data saved to 'data'


In [26]:
# --- LOAD SAVED DATA (if data pull already done) ---

import pickle

data_file = 'data/journey_data.pkl'

if os.path.exists(data_file):
    print(f"Loading data from {data_file}...")
    with open(data_file, 'rb') as f:
        journey_data = pickle.load(f)
    
    print(f"\nLoaded tables:")
    for table_name, df in journey_data.items():
        print(f"  {table_name}: {len(df):,} rows")
    
    print(f"\n✓ Data loaded successfully")
else:
    print(f"[WARNING] Data file not found: {data_file}")
    print(f"Run the data pull cells first to create the data")

Run the data pull cells first to create the data


---

# Exploratory Data Analysis

## Part 1: Schema Validation

In [None]:
# --- SCHEMA VALIDATION ---

if 'journey_data' in locals():
    print(f"\n{'='*80}")
    print("SCHEMA VALIDATION")
    print(f"{'='*80}")
    
    for table_name, df in journey_data.items():
        print(f"\n{table_name}:")
        print(f"  Shape: {df.shape[0]:,} rows × {df.shape[1]} columns")
        print(f"  Columns: {list(df.columns)}")
        print(f"  Dtypes:")
        for col in df.columns:
            missing_pct = (df[col].isna().sum() / len(df)) * 100
            print(f"    {col:20s} {str(df[col].dtype):15s} {missing_pct:5.1f}% missing")
    
    print(f"\nCATALOG:")
    print(f"  Shape: {catalog_data.shape[0]:,} rows × {catalog_data.shape[1]} columns")
    print(f"  Columns: {list(catalog_data.columns)}")
    
    print("\n[INFO] Schema validation complete.")
else:
    print("[ERROR] Data not loaded.")

In [None]:
# --- DISPLAY SAMPLE DATA ---

if 'journey_data' in locals():
    print(f"\n{'='*80}")
    print("SAMPLE DATA (First 5 rows)")
    print(f"{'='*80}")
    
    for table_name, df in journey_data.items():
        print(f"\n{table_name}:")
        print(df.head())
    
    print(f"\nCATALOG:")
    print(catalog_data.head())
else:
    print("[ERROR] Data not loaded.")


SAMPLE DATA (First 5 rows)

AUCTIONS_USERS:
                         AUCTION_ID  \
0  068d734899d07de0a1045a2118dc23ba   
1  068d73517944715093043563a3816287   
2  068d7344cb5b718780045f4e93c3ea28   
3  068d73595d8b716dae04f5cf93c3ea28   
4  068d735cd8067521bb0473d1bdf792a1   

                              OPAQUE_USER_ID              CREATED_AT PLACEMENT  
0  ext1:961fd468-21e3-424f-b0f8-0ece9f23ab6a 2025-09-27 00:49:13.650         3  
1  ext1:d2d65979-c9b6-45e7-a90d-0638d06aaefd 2025-09-27 00:51:35.619         2  
2  ext1:d2d65979-c9b6-45e7-a90d-0638d06aaefd 2025-09-27 00:48:12.760         2  
3  ext1:abbf41f4-1bed-44ba-8ac6-a73d0861f1c0 2025-09-27 00:53:41.900         3  
4  ext1:d2d65979-c9b6-45e7-a90d-0638d06aaefd 2025-09-27 00:54:37.536         2  

AUCTIONS_RESULTS:
                         AUCTION_ID                         VENDOR_ID  \
0  068d732029cf7560ba049dd6ee967f23  0196f3b92cd87403b96a612bd0bf28c4   
1  068d732029cf7560ba049dd6ee967f23  064b2efd93cf7cf9a624cd526419b66a

## Part 2: New Column Analysis

In [None]:
# --- ANALYZE NEW COLUMN: PLACEMENT (AUCTIONS_USERS) ---

if 'journey_data' in locals():
    print(f"\n{'='*80}")
    print("NEW COLUMN ANALYSIS: PLACEMENT (AUCTIONS_USERS)")
    print(f"{'='*80}")
    
    df = journey_data['AUCTIONS_USERS']
    
    print(f"\nColumn: PLACEMENT")
    print(f"  Data type: {df['PLACEMENT'].dtype}")
    print(f"  Missing: {df['PLACEMENT'].isna().sum():,} ({df['PLACEMENT'].isna().mean()*100:.1f}%)")
    print(f"  Unique values: {df['PLACEMENT'].nunique():,}")
    
    if df['PLACEMENT'].notna().any():
        print(f"\n  Value distribution:")
        value_counts = df['PLACEMENT'].value_counts(dropna=False)
        for val, count in value_counts.head(20).items():
            pct = (count / len(df)) * 100
            print(f"    {str(val):30s} {count:8,} ({pct:5.1f}%)")
        
        if len(value_counts) > 20:
            print(f"    ... and {len(value_counts) - 20} more values")
    else:
        print("  [WARNING] All values are NULL")
    
    # PLACEMENT-LEVEL BIDDING ANALYSIS
    print(f"\n{'='*80}")
    print("PLACEMENT-LEVEL BIDDING ANALYSIS")
    print(f"{'='*80}")
    
    auctions = journey_data['AUCTIONS_USERS']
    bids = journey_data['AUCTIONS_RESULTS']
    
    # Join auctions with bids to get placement info
    auction_bids = bids.merge(auctions[['AUCTION_ID', 'PLACEMENT']], on='AUCTION_ID', how='left')
    
    print(f"\nBidding statistics by PLACEMENT:")
    placement_stats = auction_bids.groupby('PLACEMENT').agg({
        'AUCTION_ID': 'count',  # total bids
        'IS_WINNER': ['sum', 'mean']  # wins and win rate
    }).round(4)
    
    placement_stats.columns = ['Total_Bids', 'Wins', 'Win_Rate']
    
    # Add auctions per placement
    auctions_per_placement = auctions['PLACEMENT'].value_counts().sort_index()
    placement_stats['Num_Auctions'] = auctions_per_placement
    placement_stats['Bids_Per_Auction'] = placement_stats['Total_Bids'] / placement_stats['Num_Auctions']
    
    # Reorder columns
    placement_stats = placement_stats[['Num_Auctions', 'Total_Bids', 'Bids_Per_Auction', 'Wins', 'Win_Rate']]
    
    print(placement_stats.to_string())
    
    print(f"\nKey insights:")
    print(f"  Most competitive placement (highest bids/auction): {placement_stats['Bids_Per_Auction'].idxmax()}")
    print(f"  Least competitive placement (lowest bids/auction): {placement_stats['Bids_Per_Auction'].idxmin()}")
    print(f"  Highest win rate placement: {placement_stats['Win_Rate'].idxmax()}")
    print(f"  Lowest win rate placement: {placement_stats['Win_Rate'].idxmin()}")
    
    print("\n[INFO] PLACEMENT analysis complete.")
else:
    print("[ERROR] Data not loaded.")


NEW COLUMN ANALYSIS: PLACEMENT (AUCTIONS_USERS)

Column: PLACEMENT
  Data type: object
  Missing: 0 (0.0%)
  Unique values: 5

  Value distribution:
    5                               160,276 ( 38.8%)
    2                               145,826 ( 35.3%)
    3                                61,217 ( 14.8%)
    1                                38,858 (  9.4%)
    4                                 7,280 (  1.8%)

PLACEMENT-LEVEL BIDDING ANALYSIS

Bidding statistics by PLACEMENT:
           Num_Auctions  Total_Bids  Bids_Per_Auction     Wins  Win_Rate
PLACEMENT                                                               
1                 38858     1502474         38.665757  1135277    0.7556
2                145826     5003519         34.311570  4285134    0.8564
3                 61217     3040245         49.663410  2428435    0.7988
4                  7280      178757         24.554533   142934    0.7996
5                160276     9115603         56.874410  7518892    0.8248

Key i

In [None]:
# --- ANALYZE NEW COLUMNS: AUCTIONS_RESULTS ---

if 'journey_data' in locals():
    print(f"\n{'='*80}")
    print("NEW COLUMNS ANALYSIS: AUCTIONS_RESULTS")
    print(f"{'='*80}")
    
    df = journey_data['AUCTIONS_RESULTS']
    new_cols = ['QUALITY', 'FINAL_BID', 'PRICE', 'CONVERSION_RATE', 'PACING']
    
    for col in new_cols:
        if col in df.columns:
            print(f"\n{col}:")
            print(f"  Data type: {df[col].dtype}")
            print(f"  Missing: {df[col].isna().sum():,} ({df[col].isna().mean()*100:.1f}%)")
            
            if df[col].notna().any() and pd.api.types.is_numeric_dtype(df[col]):
                print(f"  Statistics:")
                print(f"    Count:  {df[col].count():,}")
                print(f"    Mean:   {df[col].mean():.6f}")
                print(f"    Std:    {df[col].std():.6f}")
                print(f"    Min:    {df[col].min():.6f}")
                print(f"    25%:    {df[col].quantile(0.25):.6f}")
                print(f"    50%:    {df[col].quantile(0.50):.6f}")
                print(f"    75%:    {df[col].quantile(0.75):.6f}")
                print(f"    Max:    {df[col].max():.6f}")
                
                # Unique value counts for top 20
                print(f"  Unique values: {df[col].nunique():,}")
                if df[col].nunique() <= 100:
                    print(f"  Top 20 value counts:")
                    value_counts = df[col].value_counts().head(20)
                    for val, count in value_counts.items():
                        pct = (count / len(df)) * 100
                        print(f"    {val:.6f}: {count:,} ({pct:.2f}%)")
                else:
                    # For continuous variables, show deciles
                    print(f"  Decile distribution:")
                    deciles = df[col].quantile([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])
                    for q, val in deciles.items():
                        print(f"    {int(q*100)}th percentile: {val:.6f}")
                        
            elif df[col].notna().any():
                print(f"  Unique values: {df[col].nunique():,}")
                value_counts = df[col].value_counts().head(10)
                for val, count in value_counts.items():
                    print(f"    {val}: {count:,}")
            else:
                print(f"  [WARNING] All values are NULL")
        else:
            print(f"\n{col}: [ERROR] Column not found in data")
    
    print("\n[INFO] New columns analysis complete.")
else:
    print("[ERROR] Data not loaded.")


NEW COLUMNS ANALYSIS: AUCTIONS_RESULTS

QUALITY:
  Data type: float64
  Missing: 0 (0.0%)
  Statistics:
    Count:  18,838,670
    Mean:   0.036669
    Std:    0.028097
    Min:    0.000001
    25%:    0.013764
    50%:    0.032807
    75%:    0.053153
    Max:    0.847945
  Unique values: 6,159,823
  Decile distribution:
    10th percentile: 0.005403
    20th percentile: 0.011266
    30th percentile: 0.016306
    40th percentile: 0.023195
    50th percentile: 0.032807
    60th percentile: 0.040758
    70th percentile: 0.048722
    80th percentile: 0.058133
    90th percentile: 0.072841

FINAL_BID:
  Data type: int64
  Missing: 0 (0.0%)
  Statistics:
    Count:  18,838,670
    Mean:   11.801821
    Std:    14.444879
    Min:    0.000000
    25%:    3.000000
    50%:    6.000000
    75%:    16.000000
    Max:    100.000000
  Unique values: 101
  Decile distribution:
    10th percentile: 1.000000
    20th percentile: 2.000000
    30th percentile: 4.000000
    40th percentile: 5.000000
 

## Part 3: Auction Dynamics with New Fields

In [None]:
# --- QUALITY SCORE ANALYSIS ---

if 'journey_data' in locals():
    print(f"\n{'='*80}")
    print("QUALITY SCORE ANALYSIS")
    print(f"{'='*80}")

    df = journey_data['AUCTIONS_RESULTS'].copy()

    if 'QUALITY' in df.columns and df['QUALITY'].notna().any():
        # Win rate by quality quartile
        df['quality_quartile'] = pd.qcut(df['QUALITY'], q=4, labels=['Q1 (Low)', 'Q2', 'Q3', 'Q4 (High)'], duplicates='drop')

        win_by_quality = df.groupby('quality_quartile', observed=True)['IS_WINNER'].agg(['sum', 'count', 'mean'])
        win_by_quality.columns = ['Wins', 'Total Bids', 'Win Rate']

        print("\nWin Rate by Quality Quartile:")
        print(win_by_quality)

        # Correlation with ranking
        if df['QUALITY'].notna().any() and df['RANKING'].notna().any():
            corr_quality_rank = df[['QUALITY', 'RANKING']].corr().loc['QUALITY', 'RANKING']
            print(f"\nCorrelation (QUALITY vs RANKING): {corr_quality_rank:.4f}")
            print("  (Negative correlation expected: higher quality → lower rank number → better position)")

            # Add BID correlation with ranking
            if 'FINAL_BID' in df.columns and df['FINAL_BID'].notna().any():
                corr_bid_rank = df[['FINAL_BID', 'RANKING']].corr().loc['FINAL_BID', 'RANKING']
                print(f"\nCorrelation (FINAL_BID vs RANKING): {corr_bid_rank:.4f}")
                print("  (Negative correlation expected: higher bid → lower rank number → better position)")

                # Compare magnitudes
                print(f"\nRelative importance (correlation magnitude):")
                print(f"  QUALITY:   {abs(corr_quality_rank):.4f}")
                print(f"  FINAL_BID: {abs(corr_bid_rank):.4f}")
                print(f"  Bid is {abs(corr_bid_rank)/abs(corr_quality_rank):.1f}x more correlated with rank than quality")

        # LOG-LOG REGRESSION: DECOMPOSE RANK DRIVERS
        print(f"\n{'='*80}")
        print("RANK DECOMPOSITION: LOG-LOG REGRESSION")
        print(f"{'='*80}")

        print("\nUnit of Analysis: Individual bid (18.8M observations)")
        print("Model: log(RANKING) ~ log(QUALITY) + log(FINAL_BID) + log(PACING) + PLACEMENT FE")
        print("Purpose: Estimate elasticities - % change in rank for % change in inputs")
        print("Interpretation: Coefficients = elasticities in log-log specification")
        print("Error captures: Vendor effects, product effects, time effects, stochastic variation")

        # Prepare regression data
        auctions = journey_data['AUCTIONS_USERS']
        reg_df = df[['AUCTION_ID', 'RANKING', 'QUALITY', 'FINAL_BID', 'PACING']].merge(
            auctions[['AUCTION_ID', 'PLACEMENT']],
            on='AUCTION_ID',
            how='left'
        )
        reg_df = reg_df[(reg_df[['RANKING', 'QUALITY', 'FINAL_BID', 'PACING']] > 0).all(axis=1)]

        print(f"\nRegression sample: {len(reg_df):,} bids (after dropping zeros/nulls)")

        # Take log transforms
        reg_df['log_rank'] = np.log(reg_df['RANKING'])
        reg_df['log_quality'] = np.log(reg_df['QUALITY'])
        reg_df['log_bid'] = np.log(reg_df['FINAL_BID'])
        reg_df['log_pacing'] = np.log(reg_df['PACING'])

        # Create placement dummies
        placement_dummies = pd.get_dummies(reg_df['PLACEMENT'], prefix='placement', drop_first=True)
        reg_df = pd.concat([reg_df, placement_dummies], axis=1)

        # Run OLS regression
        import statsmodels.api as sm

        # Model 1: Just quality and bid
        X1 = reg_df[['log_quality', 'log_bid']]
        X1 = sm.add_constant(X1)
        y = reg_df['log_rank']

        model1 = sm.OLS(y, X1).fit()
        print(f"\n{'='*60}")
        print("MODEL 1: log(RANK) ~ log(QUALITY) + log(BID)")
        print(f"{'='*60}")
        print(f"N = {len(reg_df):,}, R² = {model1.rsquared:.4f}")
        print("\nCoefficients (elasticities):")
        print(f"  Intercept:    {model1.params['const']:8.4f}  (se: {model1.bse['const']:.4f}, t: {model1.tvalues['const']:6.2f})")
        print(f"  log(QUALITY): {model1.params['log_quality']:8.4f}  (se: {model1.bse['log_quality']:.4f}, t: {model1.tvalues['log_quality']:6.2f})")
        print(f"  log(BID):     {model1.params['log_bid']:8.4f}  (se: {model1.bse['log_bid']:.4f}, t: {model1.tvalues['log_bid']:6.2f})")
        print("\nInterpretation:")
        print(f"  1% increase in QUALITY → {model1.params['log_quality']:.2f}% change in RANK")
        print(f"  1% increase in BID     → {model1.params['log_bid']:.2f}% change in RANK")
        print(f"  BID elasticity is {abs(model1.params['log_bid'])/abs(model1.params['log_quality']):.1f}x larger than QUALITY elasticity")

        # Model 2: Add pacing
        X2 = reg_df[['log_quality', 'log_bid', 'log_pacing']]
        X2 = sm.add_constant(X2)

        model2 = sm.OLS(y, X2).fit()
        print(f"\n{'='*60}")
        print("MODEL 2: log(RANK) ~ log(QUALITY) + log(BID) + log(PACING)")
        print(f"{'='*60}")
        print(f"N = {len(reg_df):,}, R² = {model2.rsquared:.4f}, ΔR² = {model2.rsquared - model1.rsquared:.4f}")
        print("\nCoefficients (elasticities):")
        print(f"  Intercept:    {model2.params['const']:8.4f}  (se: {model2.bse['const']:.4f}, t: {model2.tvalues['const']:6.2f})")
        print(f"  log(QUALITY): {model2.params['log_quality']:8.4f}  (se: {model2.bse['log_quality']:.4f}, t: {model2.tvalues['log_quality']:6.2f})")
        print(f"  log(BID):     {model2.params['log_bid']:8.4f}  (se: {model2.bse['log_bid']:.4f}, t: {model2.tvalues['log_bid']:6.2f})")
        print(f"  log(PACING):  {model2.params['log_pacing']:8.4f}  (se: {model2.bse['log_pacing']:.4f}, t: {model2.tvalues['log_pacing']:6.2f})")

        # Model 3: Add placement fixed effects
        placement_cols = [col for col in reg_df.columns if col.startswith('placement_')]
        X3 = reg_df[['log_quality', 'log_bid', 'log_pacing'] + placement_cols]
        X3 = sm.add_constant(X3)

        model3 = sm.OLS(y, X3).fit()
        print(f"\n{'='*60}")
        print("MODEL 3: log(RANK) ~ log(QUALITY) + log(BID) + log(PACING) + PLACEMENT FE")
        print(f"{'='*60}")
        print(f"N = {len(reg_df):,}, R² = {model3.rsquared:.4f}, ΔR² = {model3.rsquared - model2.rsquared:.4f}")
        print("\nCoefficients (elasticities):")
        print(f"  Intercept:    {model3.params['const']:8.4f}  (se: {model3.bse['const']:.4f}, t: {model3.tvalues['const']:6.2f})")
        print(f"  log(QUALITY): {model3.params['log_quality']:8.4f}  (se: {model3.bse['log_quality']:.4f}, t: {model3.tvalues['log_quality']:6.2f})")
        print(f"  log(BID):     {model3.params['log_bid']:8.4f}  (se: {model3.bse['log_bid']:.4f}, t: {model3.tvalues['log_bid']:6.2f})")
        print(f"  log(PACING):  {model3.params['log_pacing']:8.4f}  (se: {model3.bse['log_pacing']:.4f}, t: {model3.tvalues['log_pacing']:6.2f})")
        print(f"\nPlacement fixed effects:")
        for col in placement_cols:
            print(f"  {col:15s}: {model3.params[col]:8.4f}  (se: {model3.bse[col]:.4f}, t: {model3.tvalues[col]:6.2f})")

        print(f"\n{'='*60}")
        print("SUMMARY: WHAT MOVES RANK?")
        print(f"{'='*60}")
        print(f"From Model 3 (full specification):")
        print(f"  1% ↑ BID     → {model3.params['log_bid']:.3f}% change in RANK")
        print(f"  1% ↑ QUALITY → {model3.params['log_quality']:.3f}% change in RANK")
        print(f"  1% ↑ PACING  → {model3.params['log_pacing']:.3f}% change in RANK")
        print(f"\nBID is {abs(model3.params['log_bid'])/abs(model3.params['log_quality']):.1f}x more important than QUALITY")
        print(f"R² = {model3.rsquared:.4f} → {(1-model3.rsquared)*100:.1f}% of rank variation unexplained")
        print(f"  (Unexplained variation due to: vendor effects, product effects, time effects, tie-breaking)")

        # QUALITY vs ACTUAL CTR ANALYSIS
        print(f"\n{'='*80}")
        print("QUALITY vs ACTUAL CTR ANALYSIS")
        print(f"{'='*80}")

        impressions = journey_data['IMPRESSIONS']
        clicks = journey_data['CLICKS']

        # Join bids with impressions
        bid_imp = df[df['IS_WINNER']==True].merge(
            impressions[['AUCTION_ID', 'PRODUCT_ID', 'INTERACTION_ID']], 
            on=['AUCTION_ID', 'PRODUCT_ID'], 
            how='left',
            suffixes=('', '_imp')
        )

        # Mark if impression led to click
        bid_imp['had_impression'] = bid_imp['INTERACTION_ID'].notna()
        bid_imp_with_imp = bid_imp[bid_imp['had_impression']].copy()

        # Join with clicks to see which impressions got clicked
        bid_imp_with_imp = bid_imp_with_imp.merge(
            clicks[['AUCTION_ID', 'PRODUCT_ID']],
            on=['AUCTION_ID', 'PRODUCT_ID'],
            how='left',
            indicator='clicked'
        )
        bid_imp_with_imp['was_clicked'] = (bid_imp_with_imp['clicked'] == 'both').astype(int)

        print(f"\nBids with impressions tracked: {len(bid_imp_with_imp):,}")
        print(f"Bids that led to clicks: {bid_imp_with_imp['was_clicked'].sum():,}")
        print(f"Overall observed CTR: {bid_imp_with_imp['was_clicked'].mean():.4f}")

        # Group by quality buckets and calculate actual CTR
        bid_imp_with_imp['quality_bucket'] = pd.qcut(bid_imp_with_imp['QUALITY'], q=10, labels=False, duplicates='drop') + 1

        quality_ctr = bid_imp_with_imp.groupby('quality_bucket').agg({
            'QUALITY': 'mean',
            'was_clicked': ['sum', 'count', 'mean']
        })
        quality_ctr.columns = ['Avg_Quality', 'Clicks', 'Impressions', 'Actual_CTR']

        print(f"\nActual CTR by Quality Decile:")
        print(quality_ctr.to_string())

        # Correlation between QUALITY and actual CTR
        corr_quality_ctr = bid_imp_with_imp[['QUALITY', 'was_clicked']].corr().loc['QUALITY', 'was_clicked']
        print(f"\nCorrelation (QUALITY score vs Actual CTR): {corr_quality_ctr:.4f}")

        # CONVERSION_RATE vs ACTUAL CONVERSION ANALYSIS
        print(f"\n{'='*80}")
        print("CONVERSION_RATE vs ACTUAL CONVERSION")
        print(f"{'='*80}")

        purchases = journey_data['PURCHASES']

        # For clicks, check if they led to purchase
        clicks_with_bids = clicks.merge(
            df[df['IS_WINNER']==True][['AUCTION_ID', 'PRODUCT_ID', 'CONVERSION_RATE']],
            on=['AUCTION_ID', 'PRODUCT_ID'],
            how='left'
        )

        # Join with purchases
        clicks_with_bids = clicks_with_bids.merge(
            purchases[['USER_ID', 'PRODUCT_ID']],
            on=['USER_ID', 'PRODUCT_ID'],
            how='left',
            indicator='converted'
        )
        clicks_with_bids['did_convert'] = (clicks_with_bids['converted'] == 'both').astype(int)

        print(f"\nClicks tracked: {len(clicks_with_bids):,}")
        print(f"Clicks that converted: {clicks_with_bids['did_convert'].sum():,}")
        print(f"Overall observed conversion rate: {clicks_with_bids['did_convert'].mean():.4f}")

        # Group by predicted conversion rate and see actual conversion
        clicks_with_bids['cr_bucket'] = pd.qcut(clicks_with_bids['CONVERSION_RATE'], q=10, labels=False, duplicates='drop') + 1

        cr_actual = clicks_with_bids.groupby('cr_bucket').agg({
            'CONVERSION_RATE': 'mean',
            'did_convert': ['sum', 'count', 'mean']
        })
        cr_actual.columns = ['Predicted_CR', 'Conversions', 'Clicks', 'Actual_CR']

        print(f"\nActual Conversion Rate by Predicted CR Decile:")
        print(cr_actual.to_string())

        # Correlation
        corr_cr = clicks_with_bids[['CONVERSION_RATE', 'did_convert']].corr().loc['CONVERSION_RATE', 'did_convert']
        print(f"\nCorrelation (Predicted CONVERSION_RATE vs Actual Conversion): {corr_cr:.4f}")

    else:
        print("\n[WARNING] QUALITY column has no data")

    print("\n[INFO] Quality score analysis complete.")
else:
    print("[ERROR] Data not loaded.")

In [None]:
# --- BIDDING DYNAMICS: FINAL_BID vs PRICE ---

if 'journey_data' in locals():
    print(f"\n{'='*80}")
    print("BIDDING DYNAMICS: FINAL_BID vs PRICE")
    print(f"{'='*80}")
    
    df = journey_data['AUCTIONS_RESULTS'].copy()
    
    if 'FINAL_BID' in df.columns and 'PRICE' in df.columns:
        # Compare winners vs losers
        print("\nWinning Bids:")
        winning_bids = df[df['IS_WINNER'] == True]
        if len(winning_bids) > 0 and winning_bids['FINAL_BID'].notna().any():
            print(f"  Count: {len(winning_bids):,}")
            print(f"  Mean FINAL_BID: {winning_bids['FINAL_BID'].mean():.2f}")
            print(f"  Mean PRICE: {winning_bids['PRICE'].mean():.2f}")
            if winning_bids['PRICE'].notna().any() and winning_bids['FINAL_BID'].notna().any():
                print(f"  Mean FINAL_BID/PRICE ratio: {(winning_bids['FINAL_BID'] / winning_bids['PRICE']).mean():.4f}")
        else:
            print("  [WARNING] No data for winning bids")
        
        print("\nLosing Bids:")
        losing_bids = df[df['IS_WINNER'] == False]
        if len(losing_bids) > 0 and losing_bids['FINAL_BID'].notna().any():
            print(f"  Count: {len(losing_bids):,}")
            print(f"  Mean FINAL_BID: {losing_bids['FINAL_BID'].mean():.2f}")
            print(f"  Mean PRICE: {losing_bids['PRICE'].mean():.2f}")
            if losing_bids['PRICE'].notna().any() and losing_bids['FINAL_BID'].notna().any():
                print(f"  Mean FINAL_BID/PRICE ratio: {(losing_bids['FINAL_BID'] / losing_bids['PRICE']).mean():.4f}")
        else:
            print("  [WARNING] No data for losing bids")
        
        # Correlation
        if df['FINAL_BID'].notna().any() and df['PRICE'].notna().any():
            corr = df[['FINAL_BID', 'PRICE']].corr().loc['FINAL_BID', 'PRICE']
            print(f"\nCorrelation (FINAL_BID vs PRICE): {corr:.4f}")
        
        # BID × QUALITY SCORE ANALYSIS (4×4 MATRIX)
        print(f"\n{'='*80}")
        print("BID × QUALITY SCORE ANALYSIS")
        print(f"{'='*80}")
        
        # Create quartiles
        df['bid_quartile'] = pd.qcut(df['FINAL_BID'], q=4, labels=['Bid_Q1_Low', 'Bid_Q2', 'Bid_Q3', 'Bid_Q4_High'], duplicates='drop')
        df['quality_quartile'] = pd.qcut(df['QUALITY'], q=4, labels=['Qual_Q1_Low', 'Qual_Q2', 'Qual_Q3', 'Qual_Q4_High'], duplicates='drop')
        
        # Create combined score
        df['score'] = df['FINAL_BID'] * df['QUALITY']
        
        # Build 4×4 matrix of win rates
        matrix = df.groupby(['bid_quartile', 'quality_quartile'], observed=True)['IS_WINNER'].agg(['sum', 'count', 'mean'])
        matrix.columns = ['Wins', 'Total', 'Win_Rate']
        
        print("\nWin Rate by Bid Quartile × Quality Quartile:")
        print(matrix.to_string())
        
        # Pivot for easier reading
        win_rate_pivot = df.pivot_table(
            values='IS_WINNER',
            index='bid_quartile',
            columns='quality_quartile',
            aggfunc='mean',
            observed=True
        )
        
        print("\nWin Rate Matrix (rows=bid, cols=quality):")
        print(win_rate_pivot.to_string())
        
        # Test if score predicts wins better than bid alone
        from scipy.stats import spearmanr
        corr_bid_win = df[['FINAL_BID', 'IS_WINNER']].corr(method='spearman').loc['FINAL_BID', 'IS_WINNER']
        corr_score_win = df[['score', 'IS_WINNER']].corr(method='spearman').loc['score', 'IS_WINNER']
        print(f"\nSpearman correlation (BID vs WIN): {corr_bid_win:.4f}")
        print(f"Spearman correlation (BID×QUALITY vs WIN): {corr_score_win:.4f}")
        
        # FIRST-PRICE vs SECOND-PRICE AUCTION DETECTION (EXPANDED)
        print(f"\n{'='*80}")
        print("AUCTION MECHANISM DETECTION (EXPANDED)")
        print(f"{'='*80}")
        
        winners_with_price = winning_bids[winning_bids['PRICE'].notna()].copy()
        
        # Detect auction types with finer granularity
        winners_with_price['price_diff'] = winners_with_price['FINAL_BID'] - winners_with_price['PRICE']
        winners_with_price['auction_type'] = 'Unknown'
        winners_with_price.loc[winners_with_price['price_diff'].abs() < 0.01, 'auction_type'] = 'First-Price'
        winners_with_price.loc[winners_with_price['price_diff'] > 0.01, 'auction_type'] = 'Second-Price'
        winners_with_price.loc[winners_with_price['price_diff'] < -0.01, 'auction_type'] = 'Anomaly (PRICE > BID)'
        
        auction_type_dist = winners_with_price['auction_type'].value_counts()
        print(f"\nAuction type distribution (winners with pricing, N={len(winners_with_price):,}):")
        for atype, count in auction_type_dist.items():
            pct = (count / len(winners_with_price)) * 100
            print(f"  {atype:25s}: {count:9,} ({pct:5.2f}%)")
        
        # Check for 3rd auction type: Hybrid by placement
        auctions = journey_data['AUCTIONS_USERS']
        winners_with_placement = winners_with_price.merge(
            auctions[['AUCTION_ID', 'PLACEMENT']],
            on='AUCTION_ID',
            how='left'
        )
        
        auction_by_placement = winners_with_placement.groupby(['PLACEMENT', 'auction_type']).size().unstack(fill_value=0)
        auction_by_placement_pct = auction_by_placement.div(auction_by_placement.sum(axis=1), axis=0) * 100
        
        print(f"\nAuction type distribution by PLACEMENT (%):")
        print(auction_by_placement_pct.to_string())
        
        # Check temporal patterns (hour of day)
        winners_with_placement['hour'] = pd.to_datetime(winners_with_placement['CREATED_AT']).dt.hour
        auction_by_hour = winners_with_placement.groupby(['hour', 'auction_type']).size().unstack(fill_value=0)
        auction_by_hour_pct = auction_by_hour.div(auction_by_hour.sum(axis=1), axis=0) * 100
        
        print(f"\nAuction type stability over time (sample hours):")
        sample_hours = [0, 6, 12, 18]
        for h in sample_hours:
            if h in auction_by_hour_pct.index:
                first_pct = auction_by_hour_pct.loc[h, 'First-Price'] if 'First-Price' in auction_by_hour_pct.columns else 0
                second_pct = auction_by_hour_pct.loc[h, 'Second-Price'] if 'Second-Price' in auction_by_hour_pct.columns else 0
                print(f"  Hour {h:02d}: First={first_pct:5.1f}%, Second={second_pct:5.1f}%")
        
        # Second-price analysis
        second_price = winners_with_price[winners_with_price['auction_type'] == 'Second-Price']
        if len(second_price) > 0:
            print(f"\nSecond-price auction statistics:")
            print(f"  Count: {len(second_price):,}")
            print(f"  Mean (FINAL_BID - PRICE): ${second_price['price_diff'].mean():.2f}")
            print(f"  Median discount: ${second_price['price_diff'].median():.2f}")
            print(f"  Max discount: ${second_price['price_diff'].max():.2f}")
            print(f"  Mean savings rate: {(second_price['price_diff'] / second_price['FINAL_BID']).mean():.2%}")
        
        # Anomaly investigation
        anomaly = winners_with_price[winners_with_price['auction_type'] == 'Anomaly (PRICE > BID)']
        if len(anomaly) > 0:
            print(f"\nAnomaly investigation (PRICE > BID):")
            print(f"  Count: {len(anomaly):,} ({len(anomaly)/len(winners_with_price)*100:.2f}%)")
            print(f"  Mean PRICE excess: ${abs(anomaly['price_diff'].mean()):.2f}")
            print(f"  Max PRICE excess: ${abs(anomaly['price_diff'].min()):.2f}")
            print(f"  Interpretation: May indicate reserve prices or minimum bid floors")
        
        print(f"\nConclusion:")
        if auction_type_dist.get('First-Price', 0) > auction_type_dist.get('Second-Price', 0):
            print(f"  Platform runs predominantly FIRST-PRICE auctions ({auction_type_dist.get('First-Price', 0)/len(winners_with_price)*100:.1f}%)")
        else:
            print(f"  Platform runs predominantly SECOND-PRICE auctions ({auction_type_dist.get('Second-Price', 0)/len(winners_with_price)*100:.1f}%)")
        
        if auction_by_placement_pct.std(axis=0).max() > 5:
            print(f"  ⚠ Auction type varies significantly by PLACEMENT (hybrid system)")
        else:
            print(f"  ✓ Auction type is consistent across placements")
        
        # CPC PRICING MODEL VALIDATION
        print(f"\n{'='*80}")
        print("CPC PRICING MODEL VALIDATION")
        print(f"{'='*80}")
        
        clicks = journey_data['CLICKS']
        
        # Check if PRICE is null for winners without clicks
        winners_no_click = winning_bids.merge(
            clicks[['AUCTION_ID', 'PRODUCT_ID']],
            on=['AUCTION_ID', 'PRODUCT_ID'],
            how='left',
            indicator='has_click'
        )
        winners_no_click['clicked'] = (winners_no_click['has_click'] == 'both')
        
        print(f"\nWinning bids analysis:")
        print(f"  Total winners: {len(winners_no_click):,}")
        print(f"  Winners with clicks: {winners_no_click['clicked'].sum():,}")
        print(f"  Winners without clicks: {(~winners_no_click['clicked']).sum():,}")
        
        # Check PRICE population by click status
        clicked_winners = winners_no_click[winners_no_click['clicked']]
        no_click_winners = winners_no_click[~winners_no_click['clicked']]
        
        print(f"\nPRICE populated for clicked winners: {clicked_winners['PRICE'].notna().sum():,} / {len(clicked_winners):,} ({clicked_winners['PRICE'].notna().mean():.2%})")
        print(f"PRICE populated for non-clicked winners: {no_click_winners['PRICE'].notna().sum():,} / {len(no_click_winners):,} ({no_click_winners['PRICE'].notna().mean():.2%})")
        
        print(f"\nConclusion:")
        if no_click_winners['PRICE'].notna().mean() < 0.1:
            print(f"  ✓ CPC model confirmed: PRICE is mostly null when there's no click")
        else:
            print(f"  ⚠ Unexpected: PRICE populated even without clicks")
            print(f"     → PRICE likely represents clearing price (not actual charge)")
            print(f"     → Actual billing happens separately in payment system")
        
        # PRICE DISAMBIGUATION: CLEARING PRICE vs CATALOG PRICE
        print(f"\n{'='*80}")
        print("PRICE FIELD DISAMBIGUATION")
        print(f"{'='*80}")
        
        print("\nQuestion: Does PRICE = ad clearing price OR commodity catalog price?")
        
        catalog = journey_data['CATALOG']
        
        # Join winners with catalog
        winners_with_catalog = winning_bids.merge(
            catalog[['PRODUCT_ID', 'PRICE']],
            on='PRODUCT_ID',
            how='left',
            suffixes=('_bid', '_catalog')
        )
        
        winners_with_both = winners_with_catalog[
            winners_with_catalog['PRICE_bid'].notna() & 
            winners_with_catalog['PRICE_catalog'].notna()
        ]
        
        print(f"\nSample size: {len(winners_with_both):,} winners with both PRICE fields populated")
        
        if len(winners_with_both) > 0:
            # Test 1: Are they identical?
            winners_with_both['price_match'] = (winners_with_both['PRICE_bid'] - winners_with_both['PRICE_catalog']).abs() < 0.01
            match_rate = winners_with_both['price_match'].mean()
            
            print(f"\nTest 1: Exact match rate (bid PRICE = catalog PRICE):")
            print(f"  Match rate: {match_rate:.2%}")
            
            if match_rate > 0.9:
                print(f"  → PRICE appears to be CATALOG PRICE (commodity price)")
            else:
                print(f"  → PRICE appears to be CLEARING PRICE (auction outcome)")
            
            # Test 2: Correlation
            corr_prices = winners_with_both[['PRICE_bid', 'PRICE_catalog']].corr().iloc[0,1]
            print(f"\nTest 2: Correlation between bid PRICE and catalog PRICE:")
            print(f"  Correlation: {corr_prices:.4f}")
            
            # Test 3: Variation within product
            # For same product across multiple auctions, does PRICE vary?
            product_price_var = winners_with_catalog.groupby('PRODUCT_ID')['PRICE_bid'].agg(['count', 'std', 'mean'])
            product_price_var = product_price_var[product_price_var['count'] >= 5]  # Products with 5+ wins
            
            if len(product_price_var) > 0:
                avg_cv = (product_price_var['std'] / product_price_var['mean']).mean()
                print(f"\nTest 3: Within-product PRICE variation (N={len(product_price_var):,} products with 5+ wins):")
                print(f"  Average coefficient of variation: {avg_cv:.4f}")
                
                if avg_cv < 0.05:
                    print(f"  → PRICE is nearly constant for same product → CATALOG PRICE")
                else:
                    print(f"  → PRICE varies significantly for same product → CLEARING PRICE")
            
            # Test 4: PRICE vs FINAL_BID relationship
            winners_with_both['bid_over_catalog'] = winners_with_both['FINAL_BID'] / winners_with_both['PRICE_catalog']
            winners_with_both['price_over_catalog'] = winners_with_both['PRICE_bid'] / winners_with_both['PRICE_catalog']
            
            print(f"\nTest 4: Relationship to catalog price:")
            print(f"  Mean FINAL_BID / catalog_PRICE: {winners_with_both['bid_over_catalog'].mean():.4f}")
            print(f"  Mean auction_PRICE / catalog_PRICE: {winners_with_both['price_over_catalog'].mean():.4f}")
            
            # Test 5: Extreme examples
            print(f"\nTest 5: Extreme mismatches (bid PRICE >> catalog PRICE):")
            big_diff = winners_with_both[winners_with_both['PRICE_bid'] > winners_with_both['PRICE_catalog'] * 2]
            if len(big_diff) > 0:
                print(f"  Cases where bid PRICE > 2x catalog PRICE: {len(big_diff):,} ({len(big_diff)/len(winners_with_both)*100:.2f}%)")
                print(f"  Mean ratio: {(big_diff['PRICE_bid'] / big_diff['PRICE_catalog']).mean():.2f}x")
                print(f"  → Suggests PRICE fields measure different things")
            else:
                print(f"  No extreme mismatches found")
                print(f"  → Suggests PRICE fields may be related")
        
        print(f"\nFINAL DETERMINATION:")
        if len(winners_with_both) > 0:
            if match_rate > 0.9:
                print(f"  PRICE in AUCTIONS_RESULTS = CATALOG PRICE (commodity price)")
                print(f"  Evidence: {match_rate:.1%} exact match rate")
            elif corr_prices > 0.95:
                print(f"  PRICE in AUCTIONS_RESULTS = CATALOG PRICE (with small markup/fees)")
                print(f"  Evidence: {corr_prices:.4f} correlation, match_rate={match_rate:.1%}")
            else:
                print(f"  PRICE in AUCTIONS_RESULTS = CLEARING PRICE (auction outcome)")
                print(f"  Evidence: Low match rate ({match_rate:.1%}), correlation={corr_prices:.4f}")
                print(f"  Note: PRICE = what advertiser pays for THIS auction")
        else:
            print(f"  INSUFFICIENT DATA: Need products in both AUCTIONS_RESULTS and CATALOG")
            
    else:
        print("\n[WARNING] FINAL_BID or PRICE columns not available")
    
    print("\n[INFO] Bidding dynamics analysis complete.")
else:
    print("[ERROR] Data not loaded.")

In [None]:
# --- PACING IMPACT ANALYSIS ---

if 'journey_data' in locals():
    print(f"\n{'='*80}")
    print("PACING IMPACT ON BIDS")
    print(f"{'='*80}")
    
    df = journey_data['AUCTIONS_RESULTS'].copy()
    
    if 'PACING' in df.columns and df['PACING'].notna().any():
        print(f"\nPacing distribution:")
        print(f"  Count: {df['PACING'].count():,}")
        print(f"  Mean: {df['PACING'].mean():.4f}")
        print(f"  Std: {df['PACING'].std():.4f}")
        print(f"  Min: {df['PACING'].min():.4f}")
        print(f"  Max: {df['PACING'].max():.4f}")
        
        # Pacing bins
        df['pacing_bin'] = pd.cut(df['PACING'], bins=[0, 0.5, 0.9, 1.0, np.inf], 
                                   labels=['Low (<0.5)', 'Medium (0.5-0.9)', 'High (0.9-1.0)', 'Over (>1.0)'])
        
        print("\nBids by pacing level:")
        pacing_dist = df['pacing_bin'].value_counts(sort=False)
        for bin_name, count in pacing_dist.items():
            pct = (count / len(df)) * 100
            print(f"  {bin_name:20s} {count:8,} ({pct:5.1f}%)")
        
        # Win rate by pacing
        win_by_pacing = df.groupby('pacing_bin', observed=True)['IS_WINNER'].agg(['sum', 'count', 'mean'])
        win_by_pacing.columns = ['Wins', 'Total', 'Win Rate']
        print("\nWin rate by pacing level:")
        print(win_by_pacing)
        
        # Correlation with final bid
        if 'FINAL_BID' in df.columns and df['FINAL_BID'].notna().any():
            corr = df[['PACING', 'FINAL_BID']].corr().loc['PACING', 'FINAL_BID']
            print(f"\nCorrelation (PACING vs FINAL_BID): {corr:.4f}")
    else:
        print("\n[WARNING] PACING column has no data")
    
    print("\n[INFO] Pacing analysis complete.")
else:
    print("[ERROR] Data not loaded.")


PACING IMPACT ON BIDS

Pacing distribution:
  Count: 18,838,670
  Mean: 0.8910
  Std: 0.2634
  Min: 0.0067
  Max: 1.0000

Bids by pacing level:
  Low (<0.5)           2,096,661 ( 11.1%)
  Medium (0.5-0.9)     1,169,643 (  6.2%)
  High (0.9-1.0)       15,572,366 ( 82.7%)
  Over (>1.0)                 0 (  0.0%)

Win rate by pacing level:
                      Wins     Total  Win Rate
pacing_bin                                    
Low (<0.5)         1224937   2096661  0.584232
Medium (0.5-0.9)    953453   1169643  0.815166
High (0.9-1.0)    13330714  15572366  0.856049

Correlation (PACING vs FINAL_BID): -0.0798

[INFO] Pacing analysis complete.


In [None]:
# --- CONVERSION_RATE ANALYSIS ---

if 'journey_data' in locals():
    print(f"\n{'='*80}")
    print("CONVERSION_RATE ANALYSIS")
    print(f"{'='*80}")
    
    df = journey_data['AUCTIONS_RESULTS'].copy()
    
    if 'CONVERSION_RATE' in df.columns and df['CONVERSION_RATE'].notna().any():
        print(f"\nConversion rate distribution:")
        print(f"  Count: {df['CONVERSION_RATE'].count():,}")
        print(f"  Mean: {df['CONVERSION_RATE'].mean():.6f}")
        print(f"  Std: {df['CONVERSION_RATE'].std():.6f}")
        print(f"  Min: {df['CONVERSION_RATE'].min():.6f}")
        print(f"  Median: {df['CONVERSION_RATE'].median():.6f}")
        print(f"  Max: {df['CONVERSION_RATE'].max():.6f}")
        
        # Relationship with quality
        if 'QUALITY' in df.columns and df['QUALITY'].notna().any():
            corr = df[['CONVERSION_RATE', 'QUALITY']].corr().loc['CONVERSION_RATE', 'QUALITY']
            print(f"\nCorrelation (CONVERSION_RATE vs QUALITY): {corr:.4f}")
        
        # Winners vs losers
        print("\nConversion rate by bid outcome:")
        print(f"  Winners:  Mean CR = {df[df['IS_WINNER']==True]['CONVERSION_RATE'].mean():.6f}")
        print(f"  Losers:   Mean CR = {df[df['IS_WINNER']==False]['CONVERSION_RATE'].mean():.6f}")
    else:
        print("\n[WARNING] CONVERSION_RATE column has no data")
    
    print("\n[INFO] Conversion rate analysis complete.")
else:
    print("[ERROR] Data not loaded.")


CONVERSION_RATE ANALYSIS

Conversion rate distribution:
  Count: 18,838,670
  Mean: 0.010004
  Std: 0.007716
  Min: 0.000001
  Median: 0.009010
  Max: 0.056500

Correlation (CONVERSION_RATE vs QUALITY): 0.1578

Conversion rate by bid outcome:
  Winners:  Mean CR = 0.010032
  Losers:   Mean CR = 0.009875

[INFO] Conversion rate analysis complete.


## Part 4: Funnel Analysis with Updated Schema

In [None]:
# --- FUNNEL ANALYSIS ---

if 'journey_data' in locals():
    print(f"\n{'='*80}")
    print("FUNNEL ANALYSIS")
    print(f"{'='*80}")
    
    auctions = journey_data['AUCTIONS_USERS']
    bids = journey_data['AUCTIONS_RESULTS']
    impressions = journey_data['IMPRESSIONS']
    clicks = journey_data['CLICKS']
    purchases = journey_data['PURCHASES']
    
    # Basic funnel metrics
    print("\nFunnel stages:")
    print(f"  Auctions:     {len(auctions):8,}")
    print(f"  Bids:         {len(bids):8,}")
    print(f"  Winners:      {bids['IS_WINNER'].sum():8,}")
    print(f"  Impressions:  {len(impressions):8,}")
    print(f"  Clicks:       {len(clicks):8,}")
    print(f"  Purchases:    {len(purchases):8,}")
    
    # Conversion rates
    if len(impressions) > 0:
        ctr = len(clicks) / len(impressions)
        print(f"\n  Click-through rate: {ctr:.4f} ({ctr*100:.2f}%)")
    
    if len(clicks) > 0:
        cvr = len(purchases) / len(clicks)
        print(f"  Click-to-purchase: {cvr:.4f} ({cvr*100:.2f}%)")
    
    # User-level metrics
    n_users = auctions['OPAQUE_USER_ID'].nunique()
    n_buyers = purchases['USER_ID'].nunique()
    print(f"\nUser-level:")
    print(f"  Total users: {n_users:,}")
    print(f"  Buyers: {n_buyers:,}")
    print(f"  Conversion rate: {n_buyers/n_users:.4f} ({n_buyers/n_users*100:.2f}%)")
    
    # Revenue
    purchases['revenue'] = (purchases['UNIT_PRICE'] * purchases['QUANTITY']) / 100
    total_revenue = purchases['revenue'].sum()
    print(f"\nRevenue:")
    print(f"  Total: ${total_revenue:,.2f}")
    print(f"  Per buyer: ${total_revenue/n_buyers:,.2f}")
    print(f"  Per user: ${total_revenue/n_users:,.2f}")
    
    # ENHANCED FUNNEL METRICS
    print(f"\n{'='*80}")
    print("ENHANCED FUNNEL METRICS WITH PACING & WIN RATES")
    print(f"{'='*80}")
    
    # Ad win rate overall
    total_bids = len(bids)
    total_winners = bids['IS_WINNER'].sum()
    print(f"\nAd Win Rate:")
    print(f"  Total bids: {total_bids:,}")
    print(f"  Winning bids: {total_winners:,}")
    print(f"  Overall win rate: {total_winners/total_bids:.4f} ({total_winners/total_bids*100:.2f}%)")
    
    # Win rate by placement
    auction_bids = bids.merge(auctions[['AUCTION_ID', 'PLACEMENT']], on='AUCTION_ID', how='left')
    win_by_placement = auction_bids.groupby('PLACEMENT')['IS_WINNER'].agg(['sum', 'count', 'mean'])
    win_by_placement.columns = ['Wins', 'Bids', 'Win_Rate']
    print(f"\nWin Rate by Placement:")
    print(win_by_placement.to_string())
    
    # Pacing distribution in funnel context
    print(f"\nPacing Distribution:")
    pacing_dist = bids['PACING'].describe()
    print(pacing_dist.to_string())
    
    # Average pacing for winners vs losers
    avg_pacing_winners = bids[bids['IS_WINNER']==True]['PACING'].mean()
    avg_pacing_losers = bids[bids['IS_WINNER']==False]['PACING'].mean()
    print(f"\nAverage Pacing:")
    print(f"  Winners: {avg_pacing_winners:.4f}")
    print(f"  Losers: {avg_pacing_losers:.4f}")
    print(f"  Difference: {avg_pacing_winners - avg_pacing_losers:.4f}")
    
    # Pacing level breakdown
    bids['pacing_level'] = pd.cut(bids['PACING'], bins=[0, 0.5, 0.9, 1.0, np.inf], 
                                   labels=['Low (<0.5)', 'Medium (0.5-0.9)', 'High (0.9-1.0)', 'Over (>1.0)'])
    pacing_funnel = bids.groupby('pacing_level', observed=True).agg({
        'AUCTION_ID': 'count',
        'IS_WINNER': ['sum', 'mean']
    })
    pacing_funnel.columns = ['Bids', 'Wins', 'Win_Rate']
    print(f"\nFunnel by Pacing Level:")
    print(pacing_funnel.to_string())
    
    # Impression delivery rate by placement
    print(f"\nImpression Delivery Rate by Placement:")
    winners_by_placement = auction_bids[auction_bids['IS_WINNER']==True].groupby('PLACEMENT').size()
    
    impressions_with_placement = impressions.merge(
        auctions[['AUCTION_ID', 'PLACEMENT']], 
        on='AUCTION_ID', 
        how='left'
    )
    impressions_by_placement = impressions_with_placement.groupby('PLACEMENT').size()
    
    delivery_rate = pd.DataFrame({
        'Winners': winners_by_placement,
        'Impressions': impressions_by_placement
    })
    delivery_rate['Delivery_Rate'] = delivery_rate['Impressions'] / delivery_rate['Winners']
    print(delivery_rate.to_string())
    
    # Full funnel breakdown
    print(f"\n{'='*80}")
    print("COMPLETE FUNNEL BREAKDOWN")
    print(f"{'='*80}")
    
    print(f"\nStage-by-stage conversion:")
    print(f"  Auctions → Bids: {total_bids/len(auctions):.2f}x (avg bids per auction)")
    print(f"  Bids → Winners: {total_winners/total_bids:.4f} ({total_winners/total_bids*100:.2f}%)")
    print(f"  Winners → Impressions: {len(impressions)/total_winners:.4f} ({len(impressions)/total_winners*100:.2f}%)")
    print(f"  Impressions → Clicks: {len(clicks)/len(impressions):.4f} ({len(clicks)/len(impressions)*100:.2f}%)")
    print(f"  Clicks → Purchases: {len(purchases)/len(clicks):.4f} ({len(purchases)/len(clicks)*100:.2f}%)")
    print(f"  Auctions → Purchases (end-to-end): {len(purchases)/len(auctions):.4f} ({len(purchases)/len(auctions)*100:.2f}%)")
    
    print("\n[INFO] Funnel analysis complete.")
else:
    print("[ERROR] Data not loaded.")


FUNNEL ANALYSIS

Funnel stages:
  Auctions:      413,457
  Bids:         18,838,670
  Winners:      15,509,104
  Impressions:   533,146
  Clicks:         16,706
  Purchases:       2,188

  Click-through rate: 0.0313 (3.13%)
  Click-to-purchase: 0.1310 (13.10%)

User-level:
  Total users: 4,671
  Buyers: 835
  Conversion rate: 0.1788 (17.88%)

Revenue:
  Total: $74,659.00
  Per buyer: $89.41
  Per user: $15.98

ENHANCED FUNNEL METRICS WITH PACING & WIN RATES

Ad Win Rate:
  Total bids: 18,838,670
  Winning bids: 15,509,104
  Overall win rate: 0.8233 (82.33%)

Win Rate by Placement:
              Wins     Bids  Win_Rate
PLACEMENT                            
1          1135277  1502474  0.755605
2          4285134  5003519  0.856424
3          2428435  3040245  0.798763
4           142934   178757  0.799599
5          7518892  9115603  0.824838

Pacing Distribution:
count    1.883867e+07
mean     8.909944e-01
std      2.633870e-01
min      6.737947e-03
25%      1.000000e+00
50%      1.00

## Part 5: Summary & Data Quality Assessment

In [None]:
# --- SUMMARY STATISTICS ---

if 'journey_data' in locals():
    print(f"\n{'='*80}")
    print("SUMMARY & DATA QUALITY ASSESSMENT")
    print(f"{'='*80}")
    
    print(f"\nData pull configuration:")
    print(f"  Date range: {start_date_str} to {end_date_str} ({DAYS_WINDOW} days)")
    print(f"  Sampling fraction: {SAMPLING_FRACTION:.4%}")
    print(f"  Users sampled: {journey_data['AUCTIONS_USERS']['OPAQUE_USER_ID'].nunique():,}")
    
    print(f"\nNew schema fields coverage:")
    
    # AUCTIONS_USERS: PLACEMENT
    placement_coverage = (1 - journey_data['AUCTIONS_USERS']['PLACEMENT'].isna().mean()) * 100
    print(f"  AUCTIONS_USERS.PLACEMENT: {placement_coverage:.1f}% populated")
    
    # AUCTIONS_RESULTS: new columns
    df_bids = journey_data['AUCTIONS_RESULTS']
    for col in ['QUALITY', 'FINAL_BID', 'PRICE', 'CONVERSION_RATE', 'PACING']:
        if col in df_bids.columns:
            coverage = (1 - df_bids[col].isna().mean()) * 100
            print(f"  AUCTIONS_RESULTS.{col}: {coverage:.1f}% populated")
        else:
            print(f"  AUCTIONS_RESULTS.{col}: NOT FOUND")
    
    print(f"\nData quality flags:")
    
    # Check for obvious issues
    issues = []
    
    # Check if new columns are mostly empty
    if 'PLACEMENT' in journey_data['AUCTIONS_USERS'].columns:
        if journey_data['AUCTIONS_USERS']['PLACEMENT'].isna().mean() > 0.9:
            issues.append("PLACEMENT column >90% null")
    
    for col in ['QUALITY', 'FINAL_BID', 'PRICE', 'CONVERSION_RATE', 'PACING']:
        if col in df_bids.columns:
            if df_bids[col].isna().mean() > 0.9:
                issues.append(f"{col} column >90% null")
    
    # Check funnel integrity
    winning_bids = df_bids['IS_WINNER'].sum()
    impressions = len(journey_data['IMPRESSIONS'])
    if impressions > 0:
        match_rate = impressions / winning_bids if winning_bids > 0 else 0
        if match_rate < 0.1:
            issues.append(f"Low impression-to-bid match rate: {match_rate:.2%}")
    
    if len(issues) == 0:
        print("  ✓ No major issues detected")
    else:
        print("  ⚠ Issues detected:")
        for issue in issues:
            print(f"    - {issue}")
    
    print("\n[SUCCESS] Analysis complete!")
else:
    print("[ERROR] Data not loaded.")


SUMMARY & DATA QUALITY ASSESSMENT

Data pull configuration:
  Date range: 2025-09-27 to 2025-10-11 (14 days)
  Sampling fraction: 0.1000%
  Users sampled: 4,671

New schema fields coverage:
  AUCTIONS_USERS.PLACEMENT: 100.0% populated
  AUCTIONS_RESULTS.QUALITY: 100.0% populated
  AUCTIONS_RESULTS.FINAL_BID: 100.0% populated
  AUCTIONS_RESULTS.PRICE: 82.3% populated
  AUCTIONS_RESULTS.CONVERSION_RATE: 100.0% populated
  AUCTIONS_RESULTS.PACING: 100.0% populated

Data quality flags:
  ⚠ Issues detected:
    - Low impression-to-bid match rate: 3.44%

[SUCCESS] Analysis complete!
