# 01_simple_holdout_analysis_fixed.ipynb: Corrected 97-3 RCT Analysis

## Key Fix
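The previous approach had a logical error: it could report more control purchasers than total control users, which is impossible.
This version fixes the population reconstruction by never letting the estimated control size fall below the number of observed control purchasers.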

## Approach
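- Treatment: Users in AUCTIONS_USERS
- Control: Users NOT in AUCTIONS_USERS who purchased (control users who did not purchase are not observed)
- Use observed data to infer true proportions: the control group size is estimated from the treatment group size via the 97-3 split, and is never allowed to be smaller than the observed number of control purchasers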

In [None]:
# Complete analysis with corrected logic
import os
import sys
import pandas as pd
import numpy as np
from datetime import datetime
from dotenv import load_dotenv
import snowflake.connector
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# Load settings from a .env file before any os.getenv lookups below.
load_dotenv()

# Report banner.
print("=" * 80, "CORRECTED 97-3 RCT ANALYSIS", "=" * 80, sep="\n")

# Configuration: analysis window as ISO date strings. Both values are
# interpolated verbatim into the SQL date filters of the queries below.
ANALYSIS_START, ANALYSIS_END = '2025-03-01', '2025-09-26'

print(f"\nAnalysis Period: {ANALYSIS_START} to {ANALYSIS_END}")

# Open the Snowflake session used by every query in this script.
# Credentials are read from environment variables; the warehouse falls back
# to 'COMPUTE_WH' when SNOWFLAKE_WAREHOUSE is unset. Database and schema are
# fixed. Any connection failure is reported and the script exits with status 1.
try:
    conn = snowflake.connector.connect(
        database='INCREMENTALITY',
        schema='INCREMENTALITY_RESEARCH',
        warehouse=os.environ.get('SNOWFLAKE_WAREHOUSE', 'COMPUTE_WH'),
        account=os.environ.get('SNOWFLAKE_ACCOUNT'),
        user=os.environ.get('SNOWFLAKE_USER'),
        password=os.environ.get('SNOWFLAKE_PASSWORD'),
    )
    print("✅ Connected to Snowflake\n")
except Exception as connect_error:
    print(f"❌ Connection failed: {connect_error}")
    sys.exit(1)

# First, let's understand the data
#
# One-row diagnostic: counts distinct users in AUCTIONS_USERS (treatment),
# distinct purchasers inside the analysis window, and how the purchasers split
# between users present in AUCTIONS_USERS (treatment) and absent from it
# (control). The FULL OUTER JOIN keeps treatment users who never purchased so
# that n_treatment_users covers the whole treatment group.
# NOTE(review): treatment_users has no date filter, so it includes every user
# in AUCTIONS_USERS regardless of when they appeared - confirm this is intended.
diagnostic_query = f"""
WITH
    treatment_users AS (
        SELECT DISTINCT OPAQUE_USER_ID AS user_id
        FROM AUCTIONS_USERS
    ),
    
    all_purchasers AS (
        SELECT DISTINCT USER_ID AS user_id
        FROM PURCHASES
        WHERE PURCHASED_AT >= '{ANALYSIS_START}'
          AND PURCHASED_AT < '{ANALYSIS_END}'
    )

SELECT
    COUNT(DISTINCT t.user_id) as n_treatment_users,
    COUNT(DISTINCT p.user_id) as n_all_purchasers,
    COUNT(DISTINCT CASE WHEN t.user_id IS NOT NULL AND p.user_id IS NOT NULL THEN p.user_id END) as n_treatment_purchasers,
    COUNT(DISTINCT CASE WHEN t.user_id IS NULL AND p.user_id IS NOT NULL THEN p.user_id END) as n_control_purchasers
FROM all_purchasers p
FULL OUTER JOIN treatment_users t ON p.user_id = t.user_id
"""

print("Running diagnostic query...")
try:
    df_diag = pd.read_sql(diagnostic_query, conn)
except Exception:
    # Do not leave the Snowflake session open when the query fails: the only
    # other close in this script happens after the revenue query.
    conn.close()
    raise

# The result columns are read back under upper-case names.
n_treatment_users = int(df_diag['N_TREATMENT_USERS'].iloc[0])
n_all_purchasers = int(df_diag['N_ALL_PURCHASERS'].iloc[0])
n_treatment_purchasers = int(df_diag['N_TREATMENT_PURCHASERS'].iloc[0])
n_control_purchasers = int(df_diag['N_CONTROL_PURCHASERS'].iloc[0])

print("\n" + "="*80)
print("DATA DIAGNOSTICS")
print("="*80)
print(f"Treatment users (in AUCTIONS_USERS):      {n_treatment_users:,}")
print(f"Total purchasers in period:               {n_all_purchasers:,}")
print(f"  - Treatment purchasers:                 {n_treatment_purchasers:,}")
print(f"  - Control purchasers (never saw ads):   {n_control_purchasers:,}")

# Calculate implied split: share of purchasers in each arm (0 when there are
# no purchasers at all, to avoid dividing by zero).
implied_treatment_pct = n_treatment_purchasers / n_all_purchasers if n_all_purchasers > 0 else 0
implied_control_pct = n_control_purchasers / n_all_purchasers if n_all_purchasers > 0 else 0

print(f"\nImplied split among purchasers:")
print(f"  - Treatment: {implied_treatment_pct:.1%}")
print(f"  - Control:   {implied_control_pct:.1%}")

# Method 1: Conservative approach - use observed data directly
#
# Compares the treatment purchase rate (fully observed) with a control
# purchase rate whose denominator is imputed from the 97-3 split.
print("\n" + "="*80)
print("METHOD 1: OBSERVED DATA APPROACH")
print("="*80)

# We know treatment group size exactly
T_total = n_treatment_users
T_p = n_treatment_purchasers
T_np = T_total - T_p

# For control, we only observe purchasers
C_p = n_control_purchasers

# Estimate total control size using 97-3 split: control = treatment * 3 / 97.
# Exact integer floor division is used instead of int(T_total * (0.03 / 0.97)):
# the float product can land just below a whole number (e.g. when T_total is a
# multiple of 97) and int() would then truncate the estimate one user too low.
C_total_estimated = (T_total * 3) // 97
# But ensure it's at least as large as observed purchasers
C_total = max(C_total_estimated, C_p)  # Can't be less than observed purchasers
C_np = C_total - C_p

# Calculate rates (0 for an empty group, to avoid dividing by zero)
Rate_T = T_p / T_total if T_total > 0 else 0
Rate_C = C_p / C_total if C_total > 0 else 0

# ATE is the difference in purchase rates; Lift expresses it relative to the
# control rate, in percent (reported as 0 when the control rate is 0).
ATE = Rate_T - Rate_C
Lift = (ATE / Rate_C * 100) if Rate_C > 0 else 0

print(f"Treatment Group:")
print(f"  - Total users:     {T_total:,}")
print(f"  - Purchasers:      {T_p:,}")
print(f"  - Purchase rate:   {Rate_T:.4f} ({Rate_T*100:.2f}%)")

print(f"\nControl Group:")
print(f"  - Observed purchasers:   {C_p:,}")
print(f"  - Estimated total:       {C_total:,}")
print(f"  - Purchase rate:         {Rate_C:.4f} ({Rate_C*100:.2f}%)")

print(f"\nCausal Effect:")
print(f"  - ATE:           {ATE:.4f} ({ATE*100:.2f} pp)")
print(f"  - Relative Lift: {Lift:.1f}%")

# Statistical test: two-sample z-test on the purchase proportions.
# NOTE(review): C_total is imputed rather than observed, so this p-value is
# only as trustworthy as the 97-3 assumption behind it.
from statsmodels.stats.proportion import proportions_ztest
counts = np.array([T_p, C_p])
nobs = np.array([T_total, C_total])
if T_total > 0 and C_total > 0:
    z_stat, p_value = proportions_ztest(counts, nobs)
    print(f"  - P-value:       {p_value:.6f}")
else:
    # A proportion is undefined for an empty group; report that explicitly
    # instead of running the test on a zero denominator.
    z_stat = p_value = float('nan')
    print("  - P-value:       n/a (empty treatment or control group)")

# Method 2: Bounds approach
#
# Brackets the treatment effect between two assumptions about the unobserved
# control group size: the smallest feasible size (every control user
# purchased) and the size implied by the 97-3 split.
print("\n" + "="*80)
print("METHOD 2: BOUNDS ON TREATMENT EFFECT")
print("="*80)

# Lower bound: Assume control non-purchasers = 0 (all control users purchased)
Rate_C_upper = 1.0  # 100% purchase rate
ATE_lower = Rate_T - Rate_C_upper

# Most likely: Assume control has similar activity level as treatment
# Guarded like the other ratios in this script so that empty groups do not
# raise ZeroDivisionError.
active_users_total = T_total + C_total
activity_ratio = T_total / active_users_total if active_users_total > 0 else 0  # What % of active users are in treatment
expected_control_activity = 0.03 / 0.97  # Expected ratio if equal activity
# C_total_estimated (from Method 1) is already this same 97-3 scaling of
# T_total, so reuse it instead of recomputing; the floor at C_p keeps the
# control size from dropping below the purchasers actually observed.
C_total_adjusted = max(C_total_estimated, C_p)
Rate_C_adjusted = C_p / C_total_adjusted if C_total_adjusted > 0 else 0
ATE_adjusted = Rate_T - Rate_C_adjusted

# Upper bound: Use 97-3 split estimate
# The denominator is floored at the observed purchasers: dividing by the raw
# estimate yields a rate above 1.0 whenever C_p > C_total_estimated, which
# would push the "upper" ATE bound below the lower one.
Rate_C_lower = C_p / C_total_adjusted if C_total_adjusted > 0 else 0
ATE_upper = Rate_T - Rate_C_lower

print(f"Treatment purchase rate: {Rate_T:.4f}")
print(f"\nControl purchase rate bounds:")
print(f"  - Upper bound (all purchased): 1.0000")
print(f"  - Lower bound (97-3 split):    {Rate_C_lower:.4f}")
print(f"  - Adjusted (equal activity):   {Rate_C_adjusted:.4f}")

print(f"\nATE bounds:")
print(f"  - Lower bound: {ATE_lower:.4f} ({ATE_lower*100:.2f} pp)")
print(f"  - Upper bound: {ATE_upper:.4f} ({ATE_upper*100:.2f} pp)")
print(f"  - Adjusted:    {ATE_adjusted:.4f} ({ATE_adjusted*100:.2f} pp)")

# Revenue analysis
#
# Sums QUANTITY * UNIT_PRICE per purchaser inside the analysis window, labels
# each purchaser 'Treatment' (present in AUCTIONS_USERS) or 'Control'
# (absent), then aggregates count, total, mean, median and 99th percentile of
# per-purchaser revenue for each group.
revenue_query = f"""
WITH
    treatment_users AS (
        SELECT DISTINCT OPAQUE_USER_ID AS user_id
        FROM AUCTIONS_USERS
    ),
    
    revenue_data AS (
        SELECT 
            p.USER_ID as user_id,
            SUM(p.QUANTITY * p.UNIT_PRICE) as revenue,
            CASE WHEN t.user_id IS NOT NULL THEN 'Treatment' ELSE 'Control' END as group_type
        FROM PURCHASES p
        LEFT JOIN treatment_users t ON p.USER_ID = t.user_id
        WHERE p.PURCHASED_AT >= '{ANALYSIS_START}'
          AND p.PURCHASED_AT < '{ANALYSIS_END}'
        GROUP BY p.USER_ID, group_type
    )

SELECT
    group_type,
    COUNT(*) as n_purchasers,
    SUM(revenue) as total_revenue,
    AVG(revenue) as avg_revenue,
    PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY revenue) as median_revenue,
    PERCENTILE_CONT(0.99) WITHIN GROUP (ORDER BY revenue) as p99_revenue
FROM revenue_data
GROUP BY group_type
"""

# This is the last query of the script, so the session is closed here; the
# finally clause makes sure that also happens when the query itself fails.
try:
    df_revenue = pd.read_sql(revenue_query, conn)
finally:
    conn.close()

print("\n" + "="*80)
print("REVENUE ANALYSIS")
print("="*80)

for _, row in df_revenue.iterrows():
    group = row['GROUP_TYPE']
    n = int(row['N_PURCHASERS'])
    total_rev = row['TOTAL_REVENUE']
    avg_rev = row['AVG_REVENUE']
    median_rev = row['MEDIAN_REVENUE']
    p99_rev = row['P99_REVENUE']

    print(f"\n{group}:")
    print(f"  - Purchasers:        {n:,}")
    print(f"  - Total revenue:     ${total_rev:,.2f}")
    print(f"  - Avg per purchaser: ${avg_rev:,.2f}")
    print(f"  - Median:            ${median_rev:,.2f}")
    # The query computes the 99th percentile; report it alongside the median.
    print(f"  - P99:               ${p99_rev:,.2f}")

    # Calculate ARPU including non-purchasers
    if group == 'Treatment':
        arpu = total_rev / T_total
        print(f"  - ARPU (all users):  ${arpu:,.2f}")
    else:
        # Use adjusted control size (imputed from the 97-3 split, so this
        # figure is an estimate rather than an observed average).
        arpu = total_rev / C_total_adjusted
        print(f"  - ARPU (estimated):  ${arpu:,.2f}")

# Final interpretation: flags an infeasible control-size estimate and states
# the direction of the adjusted treatment effect.
print("\n" + "="*80)
print("FINAL INTERPRETATION")
print("="*80)

# True when the 97-3 split implies fewer control users than the control
# purchasers actually observed. In that case the adjusted control size was
# floored at C_p, i.e. the control purchase rate was forced to 100%.
control_size_infeasible = C_p > C_total_estimated

if control_size_infeasible:
    print("⚠️  WARNING: Data anomaly detected!")
    print(f"   - We observe {C_p:,} control purchasers")
    print(f"   - But 97-3 split suggests only {C_total_estimated:,} total control users")
    print(f"   - This implies >100% purchase rate, which is impossible")
    print(f"\n   Possible explanations:")
    print(f"   1. The 97-3 split is not accurate")
    print(f"   2. Control users are much more active than treatment users")
    print(f"   3. There's selection bias in who enters the ad funnel")
    print(f"\n   Using bounds approach for conservative estimates.")

print(f"\nBest estimate of treatment effect:")
print(f"  - ATE: {ATE_adjusted:.4f} ({ATE_adjusted*100:.2f} pp)")
if control_size_infeasible:
    # With the control rate capped at 100% the ATE cannot be positive, so a
    # negative value here reflects the cap, not a measured effect of ads.
    print("  - Direction is NOT interpretable: the control purchase rate was capped at 100%")
    print("    because the control group size is unknown, so the sign reflects that cap")
elif ATE_adjusted > 0:
    print("  - Ads INCREASE purchase probability")
elif ATE_adjusted < 0:
    print("  - Ads DECREASE purchase probability")
    print("  - This negative effect is concerning and warrants investigation")
else:
    # Exactly zero was previously reported as a decrease.
    print("  - No difference in purchase probability between groups")