# Snowflake-Native EDA: "Aggregating the Truth"

## Strategy
1. **Unit of Analysis:** Vendor Ledger (not User Journey)
2. **Computation:** Push-down to Snowflake (aggregate in the warehouse rather than extracting raw rows into pandas)
3. **Data Structure:** Scaffolded panel with explicit zeros

## Two-Part Data Pull
- **Part 1:** Vendor-Week Panel (~3.7M rows) - Q1,Q2,Q3,Q4,Q5,Q8,Q9,Q10
- **Part 2:** Mechanism Sample (~50K rows) - Q6,Q7

In [None]:
# Cell 1: Configuration & Connection
import os
import pandas as pd
import numpy as np
import textwrap
import warnings
from datetime import datetime
from pathlib import Path
from dotenv import load_dotenv
from tqdm import tqdm
import snowflake.connector

# pd.read_sql is called with a raw Snowflake connection in later cells; mute
# the pandas warning about non-SQLAlchemy connections.
warnings.filterwarnings('ignore', message='pandas only supports SQLAlchemy.*')
# Load credentials from a local .env file into the process environment.
load_dotenv()

# ---- Analysis window ------------------------------------------------------
# NOTE(review): the queries filter with BETWEEN ANALYSIS_START AND ANALYSIS_END
# using date-only literals. If the event columns are timestamps, the upper
# bound is presumably midnight at the start of ANALYSIS_END, so the window
# spans 25 full weeks while N_WEEKS scaffolds 26 - the last scaffold week
# would then be (almost) empty. Confirm which of the two values is intended.
ANALYSIS_START = '2025-03-24'
ANALYSIS_END = '2025-09-15'
N_WEEKS = 26

# ---- Output location ------------------------------------------------------
DATA_DIR = Path('./data')
DATA_DIR.mkdir(exist_ok=True)

# ---- Snowflake session ----------------------------------------------------
# One connection is opened here and reused by every query cell; it is closed
# explicitly in the final cell.
conn = snowflake.connector.connect(
    account=os.environ.get('SNOWFLAKE_ACCOUNT'),
    user=os.environ.get('SNOWFLAKE_USER'),
    password=os.environ.get('SNOWFLAKE_PASSWORD'),
    warehouse=os.environ.get('SNOWFLAKE_WAREHOUSE', 'COMPUTE_WH'),
    database='INCREMENTALITY',
    schema='INCREMENTALITY_RESEARCH',
)
print("[OK] Snowflake connected")
print(f"Analysis window: {ANALYSIS_START} to {ANALYSIS_END} ({N_WEEKS} weeks)")

In [None]:
# Cell 2: Q1 - Bridge Integrity Check (Orphan Rate)
# CRITICAL PATH: If orphan GMV > 50%, project stops.
# A purchase row is an "orphan" when its PRODUCT_ID has no match in CATALOG
# (LEFT JOIN miss); IDs are compared lower-cased and trimmed.

orphan_query = f"""
SELECT
    COUNT(DISTINCT p.PURCHASE_ID) as total_purchases,
    COUNT(DISTINCT CASE WHEN c.PRODUCT_ID IS NULL THEN p.PURCHASE_ID END) as orphan_purchases,
    SUM(p.UNIT_PRICE * p.QUANTITY) as total_gmv,
    SUM(CASE WHEN c.PRODUCT_ID IS NULL THEN p.UNIT_PRICE * p.QUANTITY ELSE 0 END) as orphan_gmv
FROM PURCHASES p
LEFT JOIN CATALOG c ON LOWER(TRIM(p.PRODUCT_ID)) = LOWER(TRIM(c.PRODUCT_ID))
WHERE p.PURCHASED_AT BETWEEN '{ANALYSIS_START}' AND '{ANALYSIS_END}'
"""

print("Executing orphan rate query...")
orphan_stats = pd.read_sql(orphan_query, conn)

# The query returns a single row. Read each scalar from its own column so the
# value keeps that column's dtype (a row-wise read would upcast the counts).
q1_total_purchases = orphan_stats['TOTAL_PURCHASES'].iloc[0]
q1_orphan_purchases = orphan_stats['ORPHAN_PURCHASES'].iloc[0]
q1_total_gmv = orphan_stats['TOTAL_GMV'].iloc[0]
q1_orphan_gmv = orphan_stats['ORPHAN_GMV'].iloc[0]

# Orphan rate is a GMV share; fall back to 0 when there is no GMV to divide by.
if q1_total_gmv > 0:
    orphan_rate = q1_orphan_gmv / q1_total_gmv
else:
    orphan_rate = 0

# Traffic-light thresholds: >50% stops the project, <10% is fine.
if orphan_rate > 0.5:
    q1_status = 'CRITICAL - STOP'
elif orphan_rate < 0.1:
    q1_status = 'OK'
else:
    q1_status = 'WARNING'

q1_rule = "=" * 60
print(q1_rule)
print("Q1: BRIDGE INTEGRITY CHECK")
print(q1_rule)
print(f"Total Purchases: {q1_total_purchases:,}")
print(f"Orphan Purchases: {q1_orphan_purchases:,}")
print(f"Total GMV: ${q1_total_gmv:,.0f}")
print(f"Orphan GMV: ${q1_orphan_gmv:,.0f}")
print(f"Orphan Rate: {orphan_rate:.1%}")
print(f"Status: {q1_status}")

In [None]:
# Cell 3: Q2 - Panel Balance Check
# Compares two vendor head-counts in a single round trip:
#   n_bidders         - distinct hex-encoded VENDOR_IDs on winning rows of
#                       AUCTIONS_RESULTS inside the analysis window
#   n_catalog_vendors - distinct values flattened out of CATALOG.VENDORS
#                       (no date restriction)

balance_query = f"""
SELECT
    (SELECT COUNT(DISTINCT LOWER(TO_VARCHAR(VENDOR_ID, 'HEX')))
     FROM AUCTIONS_RESULTS
     WHERE CREATED_AT BETWEEN '{ANALYSIS_START}' AND '{ANALYSIS_END}'
       AND IS_WINNER = TRUE) as n_bidders,
    (SELECT COUNT(DISTINCT v.value::STRING)
     FROM CATALOG, LATERAL FLATTEN(input => VENDORS) v) as n_catalog_vendors
"""

print("Executing panel balance query...")
balance = pd.read_sql(balance_query, conn)

# Single-row result: pull both counts out before printing.
q2_bidders = balance['N_BIDDERS'].iloc[0]
q2_catalog_vendors = balance['N_CATALOG_VENDORS'].iloc[0]

q2_rule = "=" * 60
print(q2_rule)
print("Q2: PANEL BALANCE")
print(q2_rule)
print(f"Bidding Vendors (winners): {q2_bidders:,}")
print(f"Catalog Vendors: {q2_catalog_vendors:,}")

In [None]:
# Cell 4: Build Vendor-Week Panel (Server-Side Aggregation)
# THE CORE QUERY - builds the entire panel inside Snowflake and returns one
# row per (vendor, week) with explicit zeros where a vendor had no activity.
#
# Fixes relative to the previous version:
#   * Week scaffold: SEQ4() is not guaranteed to be gap-free, so week offsets
#     are now taken from ROW_NUMBER() over the generator output (0..N_WEEKS-1).
#   * Promoted GMV: a purchase line that followed several qualifying clicks
#     used to be summed once per click. Attributed lines are de-duplicated
#     before summing, so promoted_gmv is consistent with n_attributed (which
#     already used COUNT(DISTINCT ...)).
#
# NOTE(review): all event filters use BETWEEN with date-only literals. If the
# event columns are timestamps, events after midnight on ANALYSIS_END are
# presumably excluded, which would leave the last scaffold week (almost)
# empty - confirm N_WEEKS against the window.
# NOTE(review): total GMV requires a catalog match while promoted GMV does
# not, so organic_gmv (total - promoted) may still be negative for products
# missing from CATALOG - verify before interpreting it as a level.

panel_query = f"""
WITH
-- Step 1: Vendor Universe from Catalog
VENDOR_UNIVERSE AS (
    SELECT DISTINCT v.value::STRING AS VENDOR_ID
    FROM CATALOG, LATERAL FLATTEN(input => VENDORS) v
),

-- Step 2: Week Scaffold (gap-free offsets 0..N-1; SEQ4() alone may skip values)
WEEK_SCAFFOLD AS (
    SELECT DATEADD('week', ROW_NUMBER() OVER (ORDER BY SEQ4()) - 1, DATE '{ANALYSIS_START}') AS week
    FROM TABLE(GENERATOR(ROWCOUNT => {N_WEEKS}))
),

-- Step 3: Vendor x Week Scaffold (densification)
VENDOR_WEEK_SCAFFOLD AS (
    SELECT v.VENDOR_ID, w.week
    FROM VENDOR_UNIVERSE v
    CROSS JOIN WEEK_SCAFFOLD w
),

-- Step 4: Treatment (Spend from winning bids)
SPEND_AGG AS (
    SELECT
        LOWER(TO_VARCHAR(VENDOR_ID, 'HEX')) AS VENDOR_ID,
        DATE_TRUNC('week', CREATED_AT) AS week,
        SUM(FINAL_BID) AS total_spend,
        COUNT(*) AS wins
    FROM AUCTIONS_RESULTS
    WHERE IS_WINNER = TRUE
      AND CREATED_AT BETWEEN '{ANALYSIS_START}' AND '{ANALYSIS_END}'
    GROUP BY 1, 2
),

-- Step 5: Total GMV (via Catalog bridge - THE KEY JOIN)
TOTAL_GMV_AGG AS (
    SELECT
        v.value::STRING AS VENDOR_ID,
        DATE_TRUNC('week', p.PURCHASED_AT) AS week,
        SUM(p.UNIT_PRICE * p.QUANTITY) AS total_gmv,
        COUNT(DISTINCT p.PURCHASE_ID) AS n_purchases
    FROM PURCHASES p
    JOIN CATALOG c ON LOWER(TRIM(p.PRODUCT_ID)) = LOWER(TRIM(c.PRODUCT_ID))
    CROSS JOIN LATERAL FLATTEN(input => c.VENDORS) v
    WHERE p.PURCHASED_AT BETWEEN '{ANALYSIS_START}' AND '{ANALYSIS_END}'
    GROUP BY 1, 2
),

-- Step 6a: Click-attributed purchase lines (7-day window), one row per
-- (vendor, purchase line) no matter how many clicks preceded the purchase
ATTRIBUTED_LINES AS (
    SELECT DISTINCT
        LOWER(REPLACE(cl.VENDOR_ID, '-', '')) AS VENDOR_ID,
        p.PURCHASE_ID,
        LOWER(TRIM(p.PRODUCT_ID)) AS PRODUCT_ID,
        p.PURCHASED_AT,
        p.UNIT_PRICE,
        p.QUANTITY
    FROM CLICKS cl
    JOIN PURCHASES p
        ON cl.USER_ID = p.USER_ID
        AND LOWER(TRIM(cl.PRODUCT_ID)) = LOWER(TRIM(p.PRODUCT_ID))
        AND p.PURCHASED_AT BETWEEN cl.OCCURRED_AT AND DATEADD('day', 7, cl.OCCURRED_AT)
    WHERE cl.OCCURRED_AT BETWEEN '{ANALYSIS_START}' AND '{ANALYSIS_END}'
),

-- Step 6b: Promoted GMV from the de-duplicated attributed lines
PROMOTED_GMV_AGG AS (
    SELECT
        VENDOR_ID,
        DATE_TRUNC('week', PURCHASED_AT) AS week,
        SUM(UNIT_PRICE * QUANTITY) AS promoted_gmv,
        COUNT(DISTINCT PURCHASE_ID) AS n_attributed
    FROM ATTRIBUTED_LINES
    GROUP BY 1, 2
),

-- Step 7: Impressions aggregation
IMPRESSIONS_AGG AS (
    SELECT
        LOWER(REPLACE(VENDOR_ID, '-', '')) AS VENDOR_ID,
        DATE_TRUNC('week', OCCURRED_AT) AS week,
        COUNT(*) AS impressions
    FROM IMPRESSIONS
    WHERE OCCURRED_AT BETWEEN '{ANALYSIS_START}' AND '{ANALYSIS_END}'
    GROUP BY 1, 2
),

-- Step 8: Clicks aggregation
CLICKS_AGG AS (
    SELECT
        LOWER(REPLACE(VENDOR_ID, '-', '')) AS VENDOR_ID,
        DATE_TRUNC('week', OCCURRED_AT) AS week,
        COUNT(*) AS clicks
    FROM CLICKS
    WHERE OCCURRED_AT BETWEEN '{ANALYSIS_START}' AND '{ANALYSIS_END}'
    GROUP BY 1, 2
)

-- Final Join with COALESCE for explicit zeros
SELECT
    vws.VENDOR_ID,
    vws.week,
    COALESCE(s.total_spend, 0) AS total_spend,
    COALESCE(s.wins, 0) AS wins,
    COALESCE(t.total_gmv, 0) AS total_gmv,
    COALESCE(t.n_purchases, 0) AS n_purchases,
    COALESCE(p.promoted_gmv, 0) AS promoted_gmv,
    COALESCE(p.n_attributed, 0) AS n_attributed,
    COALESCE(i.impressions, 0) AS impressions,
    COALESCE(c.clicks, 0) AS clicks,
    COALESCE(t.total_gmv, 0) - COALESCE(p.promoted_gmv, 0) AS organic_gmv,
    CASE WHEN COALESCE(s.total_spend, 0) > 0 THEN 1 ELSE 0 END AS has_spend
FROM VENDOR_WEEK_SCAFFOLD vws
LEFT JOIN SPEND_AGG s ON vws.VENDOR_ID = s.VENDOR_ID AND vws.week = s.week
LEFT JOIN TOTAL_GMV_AGG t ON vws.VENDOR_ID = t.VENDOR_ID AND vws.week = t.week
LEFT JOIN PROMOTED_GMV_AGG p ON vws.VENDOR_ID = p.VENDOR_ID AND vws.week = p.week
LEFT JOIN IMPRESSIONS_AGG i ON vws.VENDOR_ID = i.VENDOR_ID AND vws.week = i.week
LEFT JOIN CLICKS_AGG c ON vws.VENDOR_ID = c.VENDOR_ID AND vws.week = c.week
ORDER BY vws.VENDOR_ID, vws.week
"""

print("Executing panel query (this may take several minutes)...")
print(f"Building {N_WEEKS}-week panel for all vendors...")
# The query is a single blocking call, so the bar only marks start and finish.
with tqdm(desc="Building panel") as pbar:
    panel = pd.read_sql(panel_query, conn)
    pbar.update(1)

# Persist the panel so downstream work can reload it without re-querying.
panel_path = DATA_DIR / 'vendor_week_panel.parquet'
print(f"\nPanel: {len(panel):,} rows, {panel['VENDOR_ID'].nunique():,} vendors, {panel['WEEK'].nunique()} weeks")
panel.to_parquet(panel_path, index=False)
print(f"Saved to {panel_path}")
panel.head()

In [None]:
# Cell 5: Q3 - Treatment Absorbing (Flicker Rate)
# Flicker rate = share of vendor-weeks that were ON (had spend) in the previous
# week and are OFF in the current one, out of all previously-ON vendor-weeks.
# NOTE(review): if the final scaffold week carries no events, every vendor that
# was ON in the week before registers an ON->OFF transition there - confirm
# the window/N_WEEKS alignment before reading this number.

# Order each vendor's rows chronologically and attach the previous week's flag.
panel_sorted = panel.sort_values(['VENDOR_ID', 'WEEK']).assign(
    prev_has_spend=lambda frame: frame.groupby('VENDOR_ID')['HAS_SPEND'].shift(1)
)

# A vendor's first week has no predecessor, so it cannot be a transition.
transitions = panel_sorted.dropna(subset=['prev_has_spend'])
was_on = transitions['prev_has_spend'].eq(1)
on_to_off = (was_on & transitions['HAS_SPEND'].eq(0)).sum()
on_to_on = (was_on & transitions['HAS_SPEND'].eq(1)).sum()

from_on_total = on_to_off + on_to_on
if from_on_total > 0:
    flicker_rate = on_to_off / from_on_total
else:
    flicker_rate = 0

q3_status = 'OK (<20%)' if flicker_rate < 0.2 else 'WARNING'

q3_rule = "=" * 60
print(q3_rule)
print("Q3: TREATMENT ABSORBING (FLICKER RATE)")
print(q3_rule)
print(f"ON->OFF transitions: {on_to_off:,}")
print(f"ON->ON transitions: {on_to_on:,}")
print(f"Flicker Rate: {flicker_rate:.2%}")
print(f"Status: {q3_status}")

In [None]:
# Cell 6: Q4 - Adoption Velocity
# Assigns each vendor that ever spends to the cohort of its first spending
# week, then tabulates cohort sizes, their running total and percentage share.

# Earliest week with HAS_SPEND == 1 per vendor (vendors that never spend drop out).
spenders = panel.loc[panel['HAS_SPEND'] == 1]
first_spend = (
    spenders.groupby('VENDOR_ID')['WEEK']
    .min()
    .rename('cohort_week')
    .reset_index()
)

adoption_by_week = first_spend.groupby('cohort_week').size().reset_index(name='n_new')
new_per_week = adoption_by_week['n_new']
adoption_by_week['cumulative'] = new_per_week.cumsum()
adoption_by_week['pct_new'] = new_per_week / new_per_week.sum() * 100

q4_rule = "=" * 60
print(q4_rule)
print("Q4: ADOPTION VELOCITY")
print(q4_rule)
print(f"Total cohorts: {len(adoption_by_week)}")
if len(adoption_by_week) > 0:
    # "Week 1" here is the earliest cohort present in the table.
    earliest_cohort_pct = adoption_by_week['pct_new'].iloc[0]
    largest_cohort_pct = adoption_by_week['pct_new'].max()
    print(f"Week 1 adoption: {earliest_cohort_pct:.1f}%")
    print(f"Largest cohort: {largest_cohort_pct:.1f}%")
print("\nAdoption by Week:")
print(adoption_by_week.to_string(index=False))

In [None]:
# Cell 7: Q5 - Ashenfelter's Dip
# Event-study view of TOTAL_GMV around each vendor's first spending week
# (relative_week 0). A decline over the weeks just before adoption would
# suggest vendors start advertising in response to a slump.

# Number of weeks shown and tested on each side of the adoption week.
EVENT_WINDOW = 5

# Merge cohort week to panel; vendors that never spend get a missing cohort_week.
panel_with_cohort = panel.merge(first_spend, on='VENDOR_ID', how='left')
panel_with_cohort['relative_week'] = (
    (pd.to_datetime(panel_with_cohort['WEEK']) -
     pd.to_datetime(panel_with_cohort['cohort_week'])).dt.days // 7
)

# Average total_gmv by relative week for treated vendors
treated = panel_with_cohort[panel_with_cohort['cohort_week'].notna()]
event_study = treated.groupby('relative_week')['TOTAL_GMV'].agg(['mean', 'std', 'count']).reset_index()
event_study.columns = ['relative_week', 'mean_gmv', 'std_gmv', 'n_obs']

print("="*60)
print("Q5: ASHENFELTER'S DIP (Pre-treatment GMV trajectory)")
print("="*60)
display_range = event_study[
    (event_study['relative_week'] >= -EVENT_WINDOW) & (event_study['relative_week'] <= EVENT_WINDOW)
]
print(display_range.to_string(index=False))

# Test for pre-trend.
# Only the EVENT_WINDOW weeks immediately before adoption are compared.
# Previously the trend ran from the earliest relative week in the data
# (possibly e=-25) while the label claimed "e=-5 to e=-1".
pre_window = event_study[
    (event_study['relative_week'] >= -EVENT_WINDOW) & (event_study['relative_week'] < 0)
].sort_values('relative_week')
pre_treatment = pre_window['mean_gmv']
if len(pre_treatment) >= 2:
    # Report the endpoints actually used, in case part of the window is missing.
    first_e = int(pre_window['relative_week'].iloc[0])
    last_e = int(pre_window['relative_week'].iloc[-1])
    trend = pre_treatment.iloc[-1] - pre_treatment.iloc[0]
    print(f"\nPre-treatment trend (e={first_e} to e={last_e}): ${trend:.2f}")
    print(f"Interpretation: {'Declining (Ashenfelter Dip?)' if trend < 0 else 'Stable/Increasing'}")

In [None]:
# Cell 8: Q8 - Zero-Inflation (True Population)
# For each outcome column, reports the share of vendor-weeks that are exactly
# zero plus the mean and median of the strictly positive values.

q8_columns = ['TOTAL_GMV', 'PROMOTED_GMV', 'IMPRESSIONS', 'CLICKS', 'TOTAL_SPEND']

print("="*60)
print("Q8: ZERO-INFLATION")
print("="*60)
print(f"{'Variable':<20} {'Zero %':>10} {'Non-Zero Mean':>15} {'Non-Zero Median':>15}")
print("-"*60)

for column in q8_columns:
    values = panel[column]
    zero_pct = (values == 0).sum() / len(panel) * 100
    positive = panel.loc[values > 0, column]
    if len(positive) > 0:
        mean_val = positive.mean()
        median_val = positive.median()
    else:
        # No positive observations: show zeros rather than NaN.
        mean_val = 0
        median_val = 0
    print(f"{column:<20} {zero_pct:>9.1f}% {mean_val:>15,.2f} {median_val:>15,.2f}")

In [None]:
# Cell 9: Q9 - Whale Concentration
# Measures how concentrated GMV is across vendors: the share held by the
# top 1% / top 10% of vendors and the Gini coefficient of per-vendor totals.

# Per-vendor GMV over the whole panel, largest first.
vendor_totals = panel.groupby('VENDOR_ID')['TOTAL_GMV'].sum().sort_values(ascending=False)
total_gmv = vendor_totals.sum()
n_vendors = len(vendor_totals)

# Always include at least one vendor in each top bucket, even for tiny panels.
top_1_n = max(1, int(n_vendors * 0.01))
top_10_n = max(1, int(n_vendors * 0.10))

if total_gmv > 0:
    top_1_pct = vendor_totals.head(top_1_n).sum() / total_gmv * 100
    top_10_pct = vendor_totals.head(top_10_n).sum() / total_gmv * 100
else:
    top_1_pct = 0
    top_10_pct = 0

# Gini coefficient from the ascending-sorted totals:
#   G = (2 * sum(rank_i * x_i) - (n + 1) * sum(x)) / (n * sum(x))
ascending_totals = np.sort(vendor_totals.values)
n_obs = len(ascending_totals)
if np.sum(ascending_totals) > 0 and n_obs > 0:
    ranks = np.arange(1, n_obs+1)
    rank_weighted_sum = np.sum((ranks * ascending_totals))
    gini = (2 * rank_weighted_sum - (n_obs + 1) * np.sum(ascending_totals)) / (n_obs * np.sum(ascending_totals))
else:
    gini = 0

q9_rule = "=" * 60
print(q9_rule)
print("Q9: WHALE CONCENTRATION")
print(q9_rule)
print(f"Total Vendors: {n_vendors:,}")
print(f"Total GMV: ${total_gmv:,.0f}")
print(f"Top 1% ({top_1_n} vendors) share of GMV: {top_1_pct:.1f}%")
print(f"Top 10% ({top_10_n} vendors) share of GMV: {top_10_pct:.1f}%")
print(f"Gini coefficient: {gini:.3f}")
print("\nTop 10 Vendors by GMV:")
print(vendor_totals.head(10).to_string())

In [None]:
# Cell 10: Q10 - Cannibalization
# Correlates promoted and organic GMV across active vendor-weeks. A clearly
# negative correlation is read as substitution, a clearly positive one as
# complementarity.
# NOTE(review): ORGANIC_GMV is total minus promoted GMV and can be negative
# when the two are measured on different bases - verify before relying on
# the level statistics below.

# Filter to observations with some activity
active = panel[(panel['PROMOTED_GMV'] > 0) | (panel['ORGANIC_GMV'] > 0)]

print("="*60)
print("Q10: CANNIBALIZATION")
print("="*60)
print(f"Active observations (promoted or organic GMV > 0): {len(active):,}")

if len(active) > 1:
    corr = active['PROMOTED_GMV'].corr(active['ORGANIC_GMV'])
    # The correlation is NaN when either series has no variation. NaN fails
    # both threshold tests, so it used to be reported as 'Independent'.
    if pd.isna(corr):
        interpretation = 'Undefined (no variation in promoted or organic GMV)'
    elif corr < -0.1:
        interpretation = 'Substitution'
    elif corr > 0.1:
        interpretation = 'Complement'
    else:
        interpretation = 'Independent'
    print(f"Correlation(promoted_gmv, organic_gmv): {corr:.4f}")
    print(f"Interpretation: {interpretation}")

    # Additional analysis
    print(f"\nMean Promoted GMV: ${active['PROMOTED_GMV'].mean():,.2f}")
    print(f"Mean Organic GMV: ${active['ORGANIC_GMV'].mean():,.2f}")
    print(f"Mean Total GMV: ${active['TOTAL_GMV'].mean():,.2f}")
else:
    # Reset so a value left over from an earlier run cannot leak into later cells.
    corr = float('nan')
    print("Insufficient data for correlation analysis.")

In [None]:
# Cell 11: Part 2 - Mechanism Sample (Q6 & Q7)
# Pulls a row-level sample of auction results (capped at 50,000 rows) with
# non-null bid, quality and pacing, and stores it for the Q6/Q7 cells.
# NOTE(review): the date range is hard-coded rather than derived from
# ANALYSIS_START/ANALYSIS_END, and SAMPLE(0.01) is presumably a percentage
# (0.01%, not 1%) - confirm both are intended.

mechanism_query = """
SELECT
    RANKING, FINAL_BID, QUALITY, PACING, CONVERSION_RATE, IS_WINNER
FROM AUCTIONS_RESULTS
SAMPLE(0.01)
WHERE CREATED_AT BETWEEN '2025-09-01' AND '2025-09-07'
  AND FINAL_BID IS NOT NULL
  AND QUALITY IS NOT NULL
  AND PACING IS NOT NULL
LIMIT 50000
"""

auction_sample_path = DATA_DIR / 'auction_sample.parquet'

print("Fetching mechanism sample...")
auction_sample = pd.read_sql(mechanism_query, conn)
# Persist the sample so the mechanism checks can be rerun without re-querying.
auction_sample.to_parquet(auction_sample_path, index=False)
print(f"Mechanism sample: {len(auction_sample):,} rows")
auction_sample.head()

In [None]:
# Cell 12: Q6 - CPC Verification
# Summarises the FINAL_BID distribution for the whole sample and for the
# winning rows only.

q6_rule = "=" * 60
print(q6_rule)
print("Q6: CPC VERIFICATION (FINAL_BID Distribution)")
print(q6_rule)

# Keep only rows whose IS_WINNER flag equals True.
winner_mask = auction_sample['IS_WINNER'].eq(True)
winners = auction_sample[winner_mask]

print(f"Total bids in sample: {len(auction_sample):,}")
print(f"Winners: {len(winners):,}")

all_bid_summary = auction_sample['FINAL_BID'].describe()
print("\nFINAL_BID statistics (all bids):")
print(all_bid_summary)

winner_bid_summary = winners['FINAL_BID'].describe()
print("\nFINAL_BID statistics (winners only):")
print(winner_bid_summary)

In [None]:
# Cell 13: Q7 - Rank Determinism
# Regresses RANKING on FINAL_BID, QUALITY and PACING and reports R-squared as
# a rough measure of how much of the ranking those three inputs explain.

from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Prepare data.
# Drop incomplete rows across features AND target together. The sample query
# does not filter RANKING for NULLs, and a missing target would make the fit
# raise; previously only the feature columns were checked.
rank_features = ['FINAL_BID', 'QUALITY', 'PACING']
rank_data = auction_sample[rank_features + ['RANKING']].dropna()
X = rank_data[rank_features]
y = rank_data['RANKING']

print("="*60)
print("Q7: RANK DETERMINISM")
print("="*60)

if len(X) > 10:
    model = LinearRegression().fit(X, y)
    r2 = r2_score(y, model.predict(X))

    print(f"Sample size: {len(X):,}")
    print(f"R-squared (RANKING ~ FINAL_BID + QUALITY + PACING): {r2:.4f}")
    print(f"\nCoefficients:")
    # Coefficients are returned in the same order as the feature columns.
    for feature_name, coefficient in zip(rank_features, model.coef_):
        print(f"  {feature_name}: {coefficient:.6f}")
    print(f"  Intercept: {model.intercept_:.6f}")
    print(f"\nInterpretation: {'Deterministic' if r2 > 0.9 else 'Some noise' if r2 > 0.5 else 'Noisy'}")
else:
    print("Insufficient data for regression analysis.")
    # Placeholder so the summary cell can still format r2.
    r2 = 0

In [None]:
# Cell 14: Summary Table
# Collects the headline number from each check into one dict (insertion order
# is the print order) and renders it as a two-column table.

# Guards for metrics that may be unavailable.
q4_has_cohorts = len(adoption_by_week) > 0
# `corr` only exists if the cannibalization cell ran; check the name first so
# `active` is not touched when it did not.
q10_has_corr = 'corr' in globals() and len(active) > 1

results = {}
results['Q1_orphan_rate'] = f"{orphan_rate:.1%}"
results['Q2_bidders'] = f"{balance['N_BIDDERS'].iloc[0]:,}"
results['Q2_catalog_vendors'] = f"{balance['N_CATALOG_VENDORS'].iloc[0]:,}"
results['Q3_flicker_rate'] = f"{flicker_rate:.2%}"
results['Q4_n_cohorts'] = len(adoption_by_week)
if q4_has_cohorts:
    results['Q4_week1_pct'] = f"{adoption_by_week['pct_new'].iloc[0]:.1f}%"
else:
    results['Q4_week1_pct'] = 'N/A'
results['Q8_total_gmv_zero_pct'] = f"{(panel['TOTAL_GMV'] == 0).mean()*100:.1f}%"
results['Q9_top1_pct'] = f"{top_1_pct:.1f}%"
results['Q9_gini'] = f"{gini:.3f}"
if q10_has_corr:
    results['Q10_cannibalization_corr'] = f"{corr:.4f}"
else:
    results['Q10_cannibalization_corr'] = 'N/A'
results['Q6_n_winners'] = f"{len(winners):,}"
results['Q7_rank_r2'] = f"{r2:.4f}"

print("="*60)
print("EDA SUMMARY")
print("="*60)
print(f"{'Metric':<30} {'Value':<20}")
print("-"*50)
for metric_name, metric_value in results.items():
    print(f"{metric_name:<30} {metric_value:<20}")

In [None]:
# Cell 15: Close Connection
# Releases the Snowflake session opened in the first cell and lists the
# parquet files written by the notebook.

conn.close()
print("[OK] Snowflake connection closed")

print("\nData saved to:")
for saved_name in ('vendor_week_panel.parquet', 'auction_sample.parquet'):
    print(f"  - {DATA_DIR / saved_name}")