# 01_simple_holdout_analysis.ipynb: Direct 97-3 RCT Analysis

## Overview
Leverage the platform's 97-3 treatment/control split where 3% of users are randomly blocked from ads.
This provides clean causal identification.

## Simple Approach
- Get ALL users who ever appeared in AUCTIONS_USERS (treatment group)
- Get ALL users who purchased in analysis period
- Apply 97-3 split to reconstruct control group size
- Calculate treatment effects

In [29]:
# Complete analysis in one cell
import os
import sys
import pandas as pd
import numpy as np
from datetime import datetime
from dotenv import load_dotenv
import snowflake.connector
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

load_dotenv()

print("="*80)
print("DIRECT 97-3 RCT ANALYSIS")
print("="*80)

# Configuration
ANALYSIS_START = '2025-03-01'
ANALYSIS_END = '2025-09-26'
TREATMENT_PROPORTION = 0.97
CONTROL_PROPORTION = 0.03

print(f"\nAnalysis Period: {ANALYSIS_START} to {ANALYSIS_END}")
print(f"Population Split: {TREATMENT_PROPORTION:.0%} treatment, {CONTROL_PROPORTION:.0%} control")

# Connect to Snowflake
try:
    conn = snowflake.connector.connect(
        user=os.getenv('SNOWFLAKE_USER'),
        password=os.getenv('SNOWFLAKE_PASSWORD'),
        account=os.getenv('SNOWFLAKE_ACCOUNT'),
        warehouse=os.getenv('SNOWFLAKE_WAREHOUSE', 'COMPUTE_WH'),
        database='INCREMENTALITY',
        schema='INCREMENTALITY_RESEARCH'
    )
    print("✅ Connected to Snowflake\n")
except Exception as e:
    print(f"❌ Connection failed: {e}")
    sys.exit(1)

# Main query - simple and direct
query = f"""
WITH
    -- ALL users who EVER entered the ad funnel (treatment group)
    treatment_users AS (
        SELECT DISTINCT OPAQUE_USER_ID AS user_id
        FROM AUCTIONS_USERS
    ),
    
    -- Users who purchased in analysis period
    purchasers AS (
        SELECT DISTINCT USER_ID AS user_id
        FROM PURCHASES
        WHERE PURCHASED_AT >= '{ANALYSIS_START}'
          AND PURCHASED_AT < '{ANALYSIS_END}'
    ),
    
    -- Combine all active users
    all_users AS (
        SELECT user_id FROM treatment_users
        UNION
        SELECT user_id FROM purchasers
    ),
    
    -- Classify each user
    classified AS (
        SELECT
            u.user_id,
            (t.user_id IS NOT NULL) AS is_treatment,
            (p.user_id IS NOT NULL) AS did_purchase
        FROM all_users u
        LEFT JOIN treatment_users t ON u.user_id = t.user_id
        LEFT JOIN purchasers p ON u.user_id = p.user_id
    )

-- Get counts
SELECT
    COUNT_IF(is_treatment AND did_purchase) AS T_p,     -- Treatment purchasers
    COUNT_IF(is_treatment AND NOT did_purchase) AS T_np, -- Treatment non-purchasers
    COUNT_IF(NOT is_treatment AND did_purchase) AS C_p   -- Control purchasers
FROM classified
"""

print("Executing query...")
df = pd.read_sql(query, conn)

# Extract counts
T_p = int(df['T_P'].iloc[0])
T_np = int(df['T_NP'].iloc[0])
C_p = int(df['C_P'].iloc[0])

# Calculate metrics
Observed_T = T_p + T_np
Observed_C = int(Observed_T * (CONTROL_PROPORTION / TREATMENT_PROPORTION))
C_np = max(0, Observed_C - C_p)

# Purchase rates
Rate_T = T_p / Observed_T if Observed_T > 0 else 0
Rate_C = C_p / Observed_C if Observed_C > 0 else 0

# Treatment effect
ATE = Rate_T - Rate_C
Lift = (ATE / Rate_C * 100) if Rate_C > 0 else 0

# Statistical test
from statsmodels.stats.proportion import proportions_ztest
counts = np.array([T_p, C_p])
nobs = np.array([Observed_T, Observed_C])
z_stat, p_value = proportions_ztest(counts, nobs)

# Revenue analysis
revenue_query = f"""
WITH
    treatment_users AS (
        SELECT DISTINCT OPAQUE_USER_ID AS user_id
        FROM AUCTIONS_USERS
    ),
    
    user_revenue AS (
        SELECT 
            USER_ID as user_id,
            SUM(QUANTITY * UNIT_PRICE) as revenue
        FROM PURCHASES
        WHERE PURCHASED_AT >= '{ANALYSIS_START}'
          AND PURCHASED_AT < '{ANALYSIS_END}'
        GROUP BY USER_ID
    )

SELECT
    CASE WHEN t.user_id IS NOT NULL THEN 'Treatment' ELSE 'Control' END as group_type,
    COUNT(DISTINCT COALESCE(r.user_id, t.user_id)) as n_users,
    COALESCE(SUM(r.revenue), 0) as total_revenue,
    COALESCE(AVG(r.revenue), 0) as avg_revenue_purchasers,
    PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY COALESCE(r.revenue, 0)) as median_revenue
FROM user_revenue r
FULL OUTER JOIN treatment_users t ON r.user_id = t.user_id
WHERE r.user_id IS NOT NULL OR t.user_id IS NOT NULL
GROUP BY group_type
"""

df_revenue = pd.read_sql(revenue_query, conn)
conn.close()

# Display results
print("\n" + "="*80)
print("RESULTS")
print("="*80)

print("\n1. CONTINGENCY TABLE:")
print("-"*40)
print(f"                   Purchased    Did Not Purchase        Total")
print(f"Treatment Group  {T_p:>11,}  {T_np:>18,}  {Observed_T:>11,}")
print(f"Control Group    {C_p:>11,}  {C_np:>18,}  {Observed_C:>11,}")

print("\n2. PURCHASE RATES:")
print("-"*40)
print(f"Treatment: {Rate_T:.4f} ({Rate_T*100:.2f}%)")
print(f"Control:   {Rate_C:.4f} ({Rate_C*100:.2f}%)")

print("\n3. CAUSAL EFFECT:")
print("-"*40)
print(f"ATE:           {ATE:.4f} ({ATE*100:.2f} percentage points)")
print(f"Relative Lift: {Lift:.1f}%")
print(f"P-value:       {p_value:.6f}")
print(f"Significant:   {'Yes' if p_value < 0.05 else 'No'}")

print("\n4. REVENUE ANALYSIS:")
print("-"*40)
for _, row in df_revenue.iterrows():
    group = row['GROUP_TYPE']
    n = int(row['N_USERS'])
    total_rev = row['TOTAL_REVENUE']
    avg_rev = row['AVG_REVENUE_PURCHASERS']
    
    # Adjust control counts using 97-3 split
    if group == 'Control':
        estimated_n = Observed_C
        arpu = total_rev / estimated_n  # Include zeros
    else:
        arpu = total_rev / n if n > 0 else 0
    
    print(f"{group:10} - Users: {n:,}, ARPU: ${arpu:.2f}, Avg (purchasers): ${avg_rev:.2f}")

# Calculate revenue lift
treatment_row = df_revenue[df_revenue['GROUP_TYPE'] == 'Treatment'].iloc[0]
control_row = df_revenue[df_revenue['GROUP_TYPE'] == 'Control'].iloc[0]

treatment_arpu = treatment_row['TOTAL_REVENUE'] / Observed_T
control_arpu = control_row['TOTAL_REVENUE'] / Observed_C
revenue_lift = ((treatment_arpu - control_arpu) / control_arpu * 100) if control_arpu > 0 else 0

print(f"\nRevenue Lift: {revenue_lift:.1f}%")
print(f"Incremental Revenue per User: ${treatment_arpu - control_arpu:.2f}")

print("\n" + "="*80)
print("INTERPRETATION")
print("="*80)

if ATE > 0:
    print(f"✅ Advertising INCREASES purchase probability by {ATE*100:.2f} pp")
    print(f"   This is a {Lift:.1f}% relative increase")
elif ATE < 0:
    print(f"⚠️  Advertising DECREASES purchase probability by {abs(ATE)*100:.2f} pp")
    print(f"   This is a {abs(Lift):.1f}% relative decrease")
    print(f"\n   Note: This unexpected result suggests potential data issues:")
    print(f"   - Control users (no ads) have {Rate_C*100:.1f}% purchase rate")
    print(f"   - Treatment users (saw ads) have {Rate_T*100:.1f}% purchase rate")
    print(f"   - Possible explanations: selection bias, data quality, or reverse causality")
else:
    print(f"No significant effect detected")

print(f"\nStatistical confidence: p-value = {p_value:.6f}")
print(f"Based on {Observed_T:,} treatment users and {C_p:,} observed control purchasers")

DIRECT 97-3 RCT ANALYSIS

Analysis Period: 2025-03-01 to 2025-09-26
Population Split: 97% treatment, 3% control
✅ Connected to Snowflake

Executing query...

RESULTS

1. CONTINGENCY TABLE:
----------------------------------------
                   Purchased    Did Not Purchase        Total
Treatment Group    4,592,276          13,142,165   17,734,441
Control Group        666,882                   0      548,487

2. PURCHASE RATES:
----------------------------------------
Treatment: 0.2589 (25.89%)
Control:   1.2159 (121.59%)

3. CAUSAL EFFECT:
----------------------------------------
ATE:           -0.9569 (-95.69 percentage points)
Relative Lift: -78.7%
P-value:       0.000000
Significant:   Yes

4. REVENUE ANALYSIS:
----------------------------------------
Treatment  - Users: 17,734,640, ARPU: $5704.01, Avg (purchasers): $22027.98
Control    - Users: 666,880, ARPU: $6503.39, Avg (purchasers): $5348.82

Revenue Lift: -12.3%
Incremental Revenue per User: $-799.31

INTERPRETATION
⚠️  A