# Regression Discontinuity Design (RDD) Analysis - Tables Only
## Topsort Incrementality - Causal Inference via Rank Discontinuity

This notebook implements RDD analysis using auction rank as the running variable to identify causal effects of ad impressions on purchases.
All outputs are in tabular format for text-based analysis.

## Setup & Configuration

In [1]:
import os
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from dotenv import load_dotenv
import snowflake.connector
from tabulate import tabulate

# Setup
load_dotenv()

# PARAMETERIZATION - Easy to modify time windows
ANALYSIS_START = '2025-07-01 00:00:00'
ANALYSIS_END = '2025-07-02 00:00:00'
SAMPLE_RATE = 0.1  # For TABLESAMPLE when needed
ATTRIBUTION_WINDOW_MINUTES = 60  # Post-click attribution

# Connection
conn = snowflake.connector.connect(
    user=os.getenv('SNOWFLAKE_USER'),
    password=os.getenv('SNOWFLAKE_PASSWORD'),
    account=os.getenv('SNOWFLAKE_ACCOUNT'),
    warehouse=os.getenv('SNOWFLAKE_WAREHOUSE', 'COMPUTE_WH'),
    database='INCREMENTALITY',
    schema='INCREMENTALITY_RESEARCH'
)
cursor = conn.cursor()

def run_query(query):
    """Execute query and return DataFrame"""
    cursor.execute(query)
    results = cursor.fetchall()
    if results:
        columns = [desc[0] for desc in cursor.description]
        return pd.DataFrame(results, columns=columns)
    return pd.DataFrame()

def show_table(df, title="", max_rows=None):
    """Display DataFrame as formatted table"""
    if title:
        print(f"\n{title}")
        print("="*len(title))
    if max_rows and len(df) > max_rows:
        df_display = df.head(max_rows)
        print(tabulate(df_display, headers='keys', tablefmt='grid', showindex=False))
        print(f"... ({len(df) - max_rows} more rows)")
    else:
        print(tabulate(df, headers='keys', tablefmt='grid', showindex=False))

print(f"✅ Connected to Snowflake")
print(f"📅 Analysis period: {ANALYSIS_START} to {ANALYSIS_END}")
print(f"🔬 Sample rate for heavy queries: {SAMPLE_RATE*100}%")
print(f"⏰ Attribution window: {ATTRIBUTION_WINDOW_MINUTES} minutes")

✅ Connected to Snowflake
📅 Analysis period: 2025-07-01 00:00:00 to 2025-07-02 00:00:00
🔬 Sample rate for heavy queries: 10.0%
⏰ Attribution window: 60 minutes


## 1. Running Variable Distribution Analysis

Check for manipulation of RANKING around potential cutoffs. A smooth distribution implies no strategic manipulation.

In [2]:
# Running variable (RANKING) distribution
query_ranking_dist = f"""
SELECT 
    RANKING,
    COUNT(*) AS num_bids,
    -- Calculate ratio to previous rank for discontinuity detection
    LAG(COUNT(*)) OVER (ORDER BY RANKING) as prev_rank_bids,
    ROUND(COUNT(*) * 1.0 / LAG(COUNT(*)) OVER (ORDER BY RANKING), 3) as ratio_to_prev
FROM AUCTIONS_RESULTS
WHERE CREATED_AT >= '{ANALYSIS_START}'
  AND CREATED_AT < '{ANALYSIS_END}'
  AND RANKING <= 30
GROUP BY RANKING
ORDER BY RANKING
"""

df_ranking = run_query(query_ranking_dist)

# Format numbers for display
df_ranking['num_bids'] = df_ranking['NUM_BIDS'].apply(lambda x: f"{x:,}")
df_ranking['prev_rank_bids'] = df_ranking['PREV_RANK_BIDS'].apply(lambda x: f"{x:,}" if pd.notna(x) else "N/A")

show_table(df_ranking[['RANKING', 'num_bids', 'prev_rank_bids', 'RATIO_TO_PREV']], 
           "Running Variable Distribution (RANKING)", max_rows=20)

# Detect potential manipulation
print("\n🔍 Discontinuity Detection:")
suspicious = df_ranking[(df_ranking['RATIO_TO_PREV'] < 0.8) | (df_ranking['RATIO_TO_PREV'] > 1.2)]
if len(suspicious) > 0:
    print("Potential discontinuities detected at ranks:")
    for _, row in suspicious.iterrows():
        print(f"  Rank {row['RANKING']}: ratio = {row['RATIO_TO_PREV']:.3f}")
else:
    print("No significant discontinuities detected (all ratios within 0.8-1.2 range)")


Running Variable Distribution (RANKING)
+-----------+------------+------------------+-----------------+
|   RANKING | num_bids   | prev_rank_bids   |   RATIO_TO_PREV |
|         1 | 6,724,843  | N/A              |                 |
+-----------+------------+------------------+-----------------+
|         2 | 6,653,098  | 6,724,843.0      |           0.989 |
+-----------+------------+------------------+-----------------+
|         3 | 6,605,526  | 6,653,098.0      |           0.993 |
+-----------+------------+------------------+-----------------+
|         4 | 6,569,492  | 6,605,526.0      |           0.995 |
+-----------+------------+------------------+-----------------+
|         5 | 6,540,178  | 6,569,492.0      |           0.996 |
+-----------+------------+------------------+-----------------+
|         6 | 6,514,679  | 6,540,178.0      |           0.996 |
+-----------+------------+------------------+-----------------+
|         7 | 6,491,926  | 6,514,679.0      |           0.997 |

## 2. Treatment Assignment Discontinuity

### 2a. IS_WINNER Probability by Rank

In [3]:
# IS_WINNER discontinuity analysis
query_winner = f"""
SELECT 
    RANKING,
    COUNT(*) AS total_bids,
    SUM(CASE WHEN IS_WINNER THEN 1 ELSE 0 END) AS winners,
    ROUND(100.0 * SUM(CASE WHEN IS_WINNER THEN 1 ELSE 0 END) / COUNT(*), 2) AS win_rate_pct,
    -- Calculate drop from previous rank
    LAG(ROUND(100.0 * SUM(CASE WHEN IS_WINNER THEN 1 ELSE 0 END) / COUNT(*), 2)) 
        OVER (ORDER BY RANKING) as prev_win_rate,
    ROUND(100.0 * SUM(CASE WHEN IS_WINNER THEN 1 ELSE 0 END) / COUNT(*), 2) - 
        LAG(ROUND(100.0 * SUM(CASE WHEN IS_WINNER THEN 1 ELSE 0 END) / COUNT(*), 2)) 
        OVER (ORDER BY RANKING) as win_rate_change
FROM AUCTIONS_RESULTS
WHERE CREATED_AT >= '{ANALYSIS_START}'
  AND CREATED_AT < '{ANALYSIS_END}'
  AND RANKING <= 30
GROUP BY RANKING
ORDER BY RANKING
"""

df_winner = run_query(query_winner)

# Format for display
df_display = df_winner.copy()
df_display['total_bids'] = df_display['TOTAL_BIDS'].apply(lambda x: f"{x:,}")
df_display['winners'] = df_display['WINNERS'].apply(lambda x: f"{x:,}")

show_table(df_display[['RANKING', 'total_bids', 'winners', 'WIN_RATE_PCT', 'WIN_RATE_CHANGE']], 
           "IS_WINNER Probability by Rank", max_rows=20)

# Identify sharp drops
print("\n🎯 Sharp Discontinuities in Win Rate:")
sharp_drops = df_winner[df_winner['WIN_RATE_CHANGE'] < -10]
if len(sharp_drops) > 0:
    for _, row in sharp_drops.iterrows():
        print(f"  Rank {row['RANKING']}: {row['PREV_WIN_RATE']:.1f}% → {row['WIN_RATE_PCT']:.1f}% (drop: {abs(row['WIN_RATE_CHANGE']):.1f}%)")
else:
    print("No sharp drops (>10%) detected")


IS_WINNER Probability by Rank
+-----------+--------------+-----------+----------------+-------------------+
|   RANKING | total_bids   | winners   |   WIN_RATE_PCT |   WIN_RATE_CHANGE |
|         1 | 6,724,843    | 6,683,772 |          99.39 |                   |
+-----------+--------------+-----------+----------------+-------------------+
|         2 | 6,653,098    | 6,565,781 |          98.69 |             -0.7  |
+-----------+--------------+-----------+----------------+-------------------+
|         3 | 6,605,526    | 6,477,264 |          98.06 |             -0.63 |
+-----------+--------------+-----------+----------------+-------------------+
|         4 | 6,569,492    | 6,404,639 |          97.49 |             -0.57 |
+-----------+--------------+-----------+----------------+-------------------+
|         5 | 6,540,178    | 6,341,923 |          96.97 |             -0.52 |
+-----------+--------------+-----------+----------------+-------------------+
|         6 | 6,514,679    | 6,28

### 2b. Impression Rate by Rank (Fold Detection)

In [4]:
# Impression rate analysis with proper type conversion
query_impression = f"""
WITH winner_impressions AS (
    SELECT
        ar.RANKING,
        COUNT(*) as total_winners,
        COUNT(i.INTERACTION_ID) as got_impression
    FROM AUCTIONS_RESULTS ar
    LEFT JOIN IMPRESSIONS i 
        ON TO_VARCHAR(ar.AUCTION_ID) = i.AUCTION_ID
        AND ar.PRODUCT_ID = i.PRODUCT_ID
    WHERE ar.CREATED_AT >= '{ANALYSIS_START}'
      AND ar.CREATED_AT < '{ANALYSIS_END}'
      AND ar.IS_WINNER = TRUE
      AND ar.RANKING <= 30
    GROUP BY ar.RANKING
)
SELECT 
    RANKING,
    total_winners,
    got_impression,
    ROUND(100.0 * got_impression / total_winners, 2) as impression_rate_pct,
    -- Mark if below fold threshold
    CASE WHEN ROUND(100.0 * got_impression / total_winners, 2) < 50 
         THEN 'BELOW_FOLD' 
         ELSE 'ABOVE_FOLD' END as fold_position
FROM winner_impressions
ORDER BY RANKING
"""

df_impression = run_query(query_impression)

# Format for display
df_imp_display = df_impression.copy()
df_imp_display['total_winners'] = df_imp_display['TOTAL_WINNERS'].apply(lambda x: f"{x:,}")
df_imp_display['got_impression'] = df_imp_display['GOT_IMPRESSION'].apply(lambda x: f"{x:,}")

show_table(df_imp_display[['RANKING', 'total_winners', 'got_impression', 'IMPRESSION_RATE_PCT', 'FOLD_POSITION']], 
           "Impression Rate by Rank - Fold Detection", max_rows=20)

# Identify the fold
fold_rank = None
for idx, row in df_impression.iterrows():
    if row['IMPRESSION_RATE_PCT'] < 50:
        fold_rank = row['RANKING']
        break

print("\n📍 FOLD ANALYSIS:")
if fold_rank:
    above_fold = df_impression[df_impression['RANKING'] < fold_rank]
    below_fold = df_impression[df_impression['RANKING'] >= fold_rank]
    
    print(f"Fold detected at rank {fold_rank}")
    print(f"\nAbove fold (rank < {fold_rank}):")
    print(f"  Average impression rate: {above_fold['IMPRESSION_RATE_PCT'].mean():.1f}%")
    print(f"  Min impression rate: {above_fold['IMPRESSION_RATE_PCT'].min():.1f}%")
    print(f"  Max impression rate: {above_fold['IMPRESSION_RATE_PCT'].max():.1f}%")
    
    print(f"\nBelow fold (rank ≥ {fold_rank}):")
    print(f"  Average impression rate: {below_fold['IMPRESSION_RATE_PCT'].mean():.1f}%")
    print(f"  Min impression rate: {below_fold['IMPRESSION_RATE_PCT'].min():.1f}%")
    print(f"  Max impression rate: {below_fold['IMPRESSION_RATE_PCT'].max():.1f}%")
    
    print(f"\nDiscontinuity size: {above_fold['IMPRESSION_RATE_PCT'].mean() - below_fold['IMPRESSION_RATE_PCT'].mean():.1f} percentage points")
else:
    print("No fold detected (all ranks have >50% impression rate)")

KeyboardInterrupt: 

## 3. Covariate Balance Tests

Check if pre-treatment characteristics vary smoothly around the cutoff.

In [5]:
# Using APPROX_COUNT_DISTINCT for efficiency with heavy data
query_covariates = f"""
SELECT 
    RANKING,
    COUNT(*) AS total_bids,
    APPROX_COUNT_DISTINCT(PRODUCT_ID) AS unique_products,
    APPROX_COUNT_DISTINCT(VENDOR_ID) AS unique_vendors,
    APPROX_COUNT_DISTINCT(CAMPAIGN_ID) AS unique_campaigns,
    -- Normalized metrics
    ROUND(APPROX_COUNT_DISTINCT(PRODUCT_ID) * 1.0 / COUNT(*), 4) AS products_per_bid,
    ROUND(APPROX_COUNT_DISTINCT(VENDOR_ID) * 1.0 / COUNT(*), 4) AS vendors_per_bid,
    ROUND(APPROX_COUNT_DISTINCT(CAMPAIGN_ID) * 1.0 / COUNT(*), 4) AS campaigns_per_bid
FROM AUCTIONS_RESULTS
WHERE CREATED_AT >= '{ANALYSIS_START}'
  AND CREATED_AT < '{ANALYSIS_END}'
  AND RANKING <= 20
GROUP BY RANKING
ORDER BY RANKING
"""

df_covariates = run_query(query_covariates)

# Format for display
df_cov_display = df_covariates.copy()
df_cov_display['total_bids'] = df_cov_display['TOTAL_BIDS'].apply(lambda x: f"{x:,}")
df_cov_display['unique_products'] = df_cov_display['UNIQUE_PRODUCTS'].apply(lambda x: f"{x:,}")
df_cov_display['unique_vendors'] = df_cov_display['UNIQUE_VENDORS'].apply(lambda x: f"{x:,}")

show_table(df_cov_display[['RANKING', 'total_bids', 'unique_products', 'unique_vendors', 
                           'PRODUCTS_PER_BID', 'VENDORS_PER_BID']], 
           "Covariate Balance by Rank", max_rows=20)

# Test for smoothness
print("\n🔬 Covariate Balance Tests:")
for col in ['PRODUCTS_PER_BID', 'VENDORS_PER_BID', 'CAMPAIGNS_PER_BID']:
    values = df_covariates[col].values[:10]  # Focus on top 10 ranks
    mean_val = np.mean(values)
    std_val = np.std(values)
    cv = std_val / mean_val if mean_val > 0 else 0  # Coefficient of variation
    print(f"\n{col}:")
    print(f"  Mean: {mean_val:.4f}")
    print(f"  Std Dev: {std_val:.4f}")
    print(f"  CV: {cv:.3f} (lower is smoother)")
    
    # Check for jumps
    if fold_rank and fold_rank <= 10:
        before_fold = np.mean(values[:fold_rank-1])
        after_fold = np.mean(values[fold_rank-1:])
        jump = abs(after_fold - before_fold) / before_fold * 100
        print(f"  Jump at fold: {jump:.1f}%")


Covariate Balance by Rank
+-----------+--------------+-------------------+------------------+--------------------+-------------------+
|   RANKING | total_bids   | unique_products   | unique_vendors   |   PRODUCTS_PER_BID |   VENDORS_PER_BID |
|         1 | 6,724,843    | 1,464,894         | 33,186           |             0.2178 |            0.0049 |
+-----------+--------------+-------------------+------------------+--------------------+-------------------+
|         2 | 6,653,098    | 1,684,527         | 33,175           |             0.2532 |            0.005  |
+-----------+--------------+-------------------+------------------+--------------------+-------------------+
|         3 | 6,605,526    | 1,758,527         | 33,154           |             0.2662 |            0.005  |
+-----------+--------------+-------------------+------------------+--------------------+-------------------+
|         4 | 6,569,492    | 1,831,487         | 33,176           |             0.2788 |            0

NameError: name 'fold_rank' is not defined

## 4. Outcome Analysis Around Cutoff

Examine purchase rates and revenue around the rank discontinuity.

In [None]:
# Complex join for outcome analysis with attribution window
# Using TABLESAMPLE for efficiency with heavy data
query_outcomes = f"""
WITH auction_outcomes AS (
    SELECT
        ar.RANKING,
        ar.IS_WINNER,
        CASE WHEN i.INTERACTION_ID IS NOT NULL THEN 1 ELSE 0 END as got_impression,
        CASE WHEN c.INTERACTION_ID IS NOT NULL THEN 1 ELSE 0 END as got_click,
        CASE WHEN p.PURCHASE_ID IS NOT NULL THEN 1 ELSE 0 END as got_purchase,
        COALESCE(p.QUANTITY * p.UNIT_PRICE, 0) as revenue
    FROM AUCTIONS_RESULTS ar TABLESAMPLE BERNOULLI ({SAMPLE_RATE})
    JOIN AUCTIONS_USERS au 
        ON ar.AUCTION_ID = au.AUCTION_ID
    LEFT JOIN IMPRESSIONS i 
        ON TO_VARCHAR(ar.AUCTION_ID) = i.AUCTION_ID
        AND ar.PRODUCT_ID = i.PRODUCT_ID
        AND i.USER_ID = au.OPAQUE_USER_ID
    LEFT JOIN CLICKS c 
        ON i.INTERACTION_ID = c.INTERACTION_ID
    LEFT JOIN PURCHASES p
        ON c.USER_ID = p.USER_ID
        AND c.PRODUCT_ID = p.PRODUCT_ID
        AND p.PURCHASED_AT > c.OCCURRED_AT
        AND DATEDIFF('minute', c.OCCURRED_AT, p.PURCHASED_AT) <= {ATTRIBUTION_WINDOW_MINUTES}
    WHERE ar.CREATED_AT >= '{ANALYSIS_START}'
      AND ar.CREATED_AT < '{ANALYSIS_END}'
      AND ar.RANKING <= 20
)
SELECT
    RANKING,
    COUNT(*) as sample_size,
    ROUND(AVG(got_impression) * 100, 2) as impression_rate,
    ROUND(AVG(got_click) * 100, 2) as click_rate,
    ROUND(AVG(got_purchase) * 100, 2) as purchase_rate,
    ROUND(AVG(revenue), 2) as avg_revenue,
    ROUND(SUM(revenue), 2) as total_revenue,
    -- CTR and CVR calculations
    CASE WHEN SUM(got_impression) > 0 
         THEN ROUND(100.0 * SUM(got_click) / SUM(got_impression), 2) 
         ELSE 0 END as ctr,
    CASE WHEN SUM(got_click) > 0 
         THEN ROUND(100.0 * SUM(got_purchase) / SUM(got_click), 2) 
         ELSE 0 END as cvr
FROM auction_outcomes
GROUP BY RANKING
ORDER BY RANKING
"""

df_outcomes = run_query(query_outcomes)

# Format for display
df_out_display = df_outcomes.copy()
df_out_display['sample_size'] = df_out_display['SAMPLE_SIZE'].apply(lambda x: f"{x:,}")
df_out_display['avg_revenue'] = df_out_display['AVG_REVENUE'].apply(lambda x: f"${x:,.2f}")
df_out_display['total_revenue'] = df_out_display['TOTAL_REVENUE'].apply(lambda x: f"${x:,.2f}")

show_table(df_out_display[['RANKING', 'sample_size', 'IMPRESSION_RATE', 'CLICK_RATE', 
                           'PURCHASE_RATE', 'CTR', 'CVR', 'avg_revenue']], 
           f"Outcome Analysis by Rank (Sample: {SAMPLE_RATE*100}%)", max_rows=20)

# Summary statistics around fold
print(f"\n📊 Outcome Summary Around Fold:")
if fold_rank and fold_rank <= 20:
    above = df_outcomes[df_outcomes['RANKING'] < fold_rank]
    below = df_outcomes[df_outcomes['RANKING'] >= fold_rank]
    
    metrics = ['IMPRESSION_RATE', 'CLICK_RATE', 'PURCHASE_RATE', 'CTR', 'CVR']
    
    comparison_data = []
    for metric in metrics:
        above_mean = above[metric].mean()
        below_mean = below[metric].mean()
        diff = above_mean - below_mean
        comparison_data.append([metric, f"{above_mean:.2f}%", f"{below_mean:.2f}%", f"{diff:+.2f}pp"])
    
    comparison_df = pd.DataFrame(comparison_data, 
                                 columns=['Metric', f'Above Fold (<{fold_rank})', 
                                         f'Below Fold (≥{fold_rank})', 'Difference'])
    show_table(comparison_df, "Metrics Comparison Around Fold")
    
    print(f"\n💰 Estimated Treatment Effect:")
    print(f"  Purchase rate difference: {above['PURCHASE_RATE'].mean() - below['PURCHASE_RATE'].mean():.2f} percentage points")
    print(f"  Average revenue difference: ${above['AVG_REVENUE'].mean() - below['AVG_REVENUE'].mean():.2f}")
else:
    print("No fold detected for comparison")

## 5. RDD-Specific Analysis: Local Linear Regression

Estimate treatment effects using local linear regression around the cutoff.

In [None]:
# Focus on narrow bandwidth around fold for RDD
if fold_rank:
    bandwidth = 3  # Ranks within ±3 of fold
    
    query_rdd = f"""
    WITH rdd_sample AS (
        SELECT
            ar.RANKING,
            CASE WHEN ar.RANKING < {fold_rank} THEN 1 ELSE 0 END as above_fold,
            ar.RANKING - {fold_rank} as centered_rank,
            CASE WHEN p.PURCHASE_ID IS NOT NULL THEN 1 ELSE 0 END as purchased
        FROM AUCTIONS_RESULTS ar
        JOIN AUCTIONS_USERS au ON ar.AUCTION_ID = au.AUCTION_ID
        LEFT JOIN IMPRESSIONS i 
            ON TO_VARCHAR(ar.AUCTION_ID) = i.AUCTION_ID
            AND ar.PRODUCT_ID = i.PRODUCT_ID
        LEFT JOIN CLICKS c ON i.INTERACTION_ID = c.INTERACTION_ID
        LEFT JOIN PURCHASES p
            ON c.USER_ID = p.USER_ID
            AND c.PRODUCT_ID = p.PRODUCT_ID
            AND p.PURCHASED_AT > c.OCCURRED_AT
            AND DATEDIFF('minute', c.OCCURRED_AT, p.PURCHASED_AT) <= {ATTRIBUTION_WINDOW_MINUTES}
        WHERE ar.CREATED_AT >= '{ANALYSIS_START}'
          AND ar.CREATED_AT < '{ANALYSIS_END}'
          AND ar.RANKING BETWEEN {fold_rank - bandwidth} AND {fold_rank + bandwidth}
          AND ar.IS_WINNER = TRUE
    )
    SELECT
        RANKING,
        above_fold,
        centered_rank,
        COUNT(*) as n,
        AVG(purchased) * 100 as purchase_rate,
        STDDEV(purchased) * 100 as purchase_std
    FROM rdd_sample
    GROUP BY RANKING, above_fold, centered_rank
    ORDER BY RANKING
    """
    
    df_rdd = run_query(query_rdd)
    
    # Format for display
    df_rdd_display = df_rdd.copy()
    df_rdd_display['n'] = df_rdd_display['N'].apply(lambda x: f"{x:,}")
    df_rdd_display['purchase_rate'] = df_rdd_display['PURCHASE_RATE'].apply(lambda x: f"{x:.2f}%")
    df_rdd_display['purchase_std'] = df_rdd_display['PURCHASE_STD'].apply(lambda x: f"{x:.2f}%" if pd.notna(x) else "N/A")
    df_rdd_display['treatment'] = df_rdd_display['ABOVE_FOLD'].apply(lambda x: 'Above Fold' if x == 1 else 'Below Fold')
    
    show_table(df_rdd_display[['RANKING', 'treatment', 'CENTERED_RANK', 'n', 'purchase_rate', 'purchase_std']], 
               f"RDD Analysis: Narrow Bandwidth (±{bandwidth} ranks around fold)")
    
    # Calculate RDD estimate using simple difference in means
    above = df_rdd[df_rdd['ABOVE_FOLD'] == 1]
    below = df_rdd[df_rdd['ABOVE_FOLD'] == 0]
    
    if len(above) > 0 and len(below) > 0:
        # Weight by sample size
        above_rate = np.average(above['PURCHASE_RATE'], weights=above['N'])
        below_rate = np.average(below['PURCHASE_RATE'], weights=below['N'])
        treatment_effect = above_rate - below_rate
        
        print(f"\n🎯 RDD Treatment Effect Estimate:")
        print(f"  Above fold purchase rate: {above_rate:.2f}%")
        print(f"  Below fold purchase rate: {below_rate:.2f}%")
        print(f"  Discontinuity at fold: {treatment_effect:.2f} percentage points")
        print(f"\n  Interpretation: Being above the fold (rank < {fold_rank}) increases")
        print(f"  purchase probability by {abs(treatment_effect):.2f} percentage points")
        
        # Calculate standard error (simplified)
        n_above = above['N'].sum()
        n_below = below['N'].sum()
        se_above = np.sqrt(above_rate * (100 - above_rate) / n_above)
        se_below = np.sqrt(below_rate * (100 - below_rate) / n_below)
        se_total = np.sqrt(se_above**2 + se_below**2)
        
        print(f"\n  Standard error: {se_total:.3f}")
        print(f"  95% CI: [{treatment_effect - 1.96*se_total:.2f}, {treatment_effect + 1.96*se_total:.2f}]")
else:
    print("⚠️ No fold detected for RDD analysis")

## 6. Bandwidth Sensitivity Analysis

Test robustness of results to different bandwidth choices.

In [None]:
# Test multiple bandwidths
bandwidths = [2, 3, 4, 5, 6]
sensitivity_results = []

if fold_rank:
    for bw in bandwidths:
        query_bw = f"""
        WITH bw_analysis AS (
            SELECT
                CASE WHEN ar.RANKING < {fold_rank} THEN 1 ELSE 0 END as treatment,
                COUNT(*) as n,
                AVG(CASE WHEN p.PURCHASE_ID IS NOT NULL THEN 1 ELSE 0 END) as purchase_rate
            FROM AUCTIONS_RESULTS ar
            JOIN AUCTIONS_USERS au ON ar.AUCTION_ID = au.AUCTION_ID
            LEFT JOIN IMPRESSIONS i 
                ON TO_VARCHAR(ar.AUCTION_ID) = i.AUCTION_ID
                AND ar.PRODUCT_ID = i.PRODUCT_ID
            LEFT JOIN CLICKS c ON i.INTERACTION_ID = c.INTERACTION_ID
            LEFT JOIN PURCHASES p
                ON c.USER_ID = p.USER_ID
                AND c.PRODUCT_ID = p.PRODUCT_ID
                AND p.PURCHASED_AT > c.OCCURRED_AT
                AND DATEDIFF('minute', c.OCCURRED_AT, p.PURCHASED_AT) <= {ATTRIBUTION_WINDOW_MINUTES}
            WHERE ar.CREATED_AT >= '{ANALYSIS_START}'
              AND ar.CREATED_AT < '{ANALYSIS_END}'
              AND ar.RANKING BETWEEN {fold_rank - bw} AND {fold_rank + bw}
              AND ar.IS_WINNER = TRUE
            GROUP BY treatment
        )
        SELECT
            MAX(CASE WHEN treatment = 1 THEN purchase_rate END) * 100 as above_rate,
            MAX(CASE WHEN treatment = 0 THEN purchase_rate END) * 100 as below_rate,
            (MAX(CASE WHEN treatment = 1 THEN purchase_rate END) - 
             MAX(CASE WHEN treatment = 0 THEN purchase_rate END)) * 100 as effect,
            SUM(n) as total_n,
            MAX(CASE WHEN treatment = 1 THEN n END) as n_above,
            MAX(CASE WHEN treatment = 0 THEN n END) as n_below
        FROM bw_analysis
        """
        
        df_bw = run_query(query_bw)
        if not df_bw.empty and 'EFFECT' in df_bw.columns:
            sensitivity_results.append({
                'Bandwidth': f"±{bw}",
                'Rank_Range': f"{fold_rank - bw} to {fold_rank + bw}",
                'N_Above': df_bw['N_ABOVE'].iloc[0],
                'N_Below': df_bw['N_BELOW'].iloc[0],
                'Above_Rate': f"{df_bw['ABOVE_RATE'].iloc[0]:.2f}%" if pd.notna(df_bw['ABOVE_RATE'].iloc[0]) else "N/A",
                'Below_Rate': f"{df_bw['BELOW_RATE'].iloc[0]:.2f}%" if pd.notna(df_bw['BELOW_RATE'].iloc[0]) else "N/A",
                'Treatment_Effect': df_bw['EFFECT'].iloc[0] if pd.notna(df_bw['EFFECT'].iloc[0]) else None
            })
    
    # Display results
    if sensitivity_results:
        sens_df = pd.DataFrame(sensitivity_results)
        
        # Format for display
        sens_df['N_Above'] = sens_df['N_Above'].apply(lambda x: f"{x:,}" if pd.notna(x) else "N/A")
        sens_df['N_Below'] = sens_df['N_Below'].apply(lambda x: f"{x:,}" if pd.notna(x) else "N/A")
        sens_df['Treatment_Effect_Display'] = sens_df['Treatment_Effect'].apply(
            lambda x: f"{x:.2f}pp" if pd.notna(x) else "N/A"
        )
        
        show_table(sens_df[['Bandwidth', 'Rank_Range', 'N_Above', 'N_Below', 
                           'Above_Rate', 'Below_Rate', 'Treatment_Effect_Display']], 
                  "Bandwidth Sensitivity Analysis")
        
        # Calculate summary statistics
        valid_effects = [x['Treatment_Effect'] for x in sensitivity_results if x['Treatment_Effect'] is not None]
        if valid_effects:
            mean_effect = np.mean(valid_effects)
            std_effect = np.std(valid_effects)
            min_effect = np.min(valid_effects)
            max_effect = np.max(valid_effects)
            
            print("\n📈 Bandwidth Sensitivity Summary:")
            print(f"  Mean effect across bandwidths: {mean_effect:.2f}pp")
            print(f"  Standard deviation: {std_effect:.2f}pp")
            print(f"  Range: {min_effect:.2f}pp to {max_effect:.2f}pp")
            print(f"  Coefficient of variation: {std_effect/abs(mean_effect):.3f}")
else:
    print("⚠️ No fold detected for bandwidth sensitivity analysis")

## 7. Summary & Conclusions

In [None]:
print("="*60)
print("RDD ANALYSIS SUMMARY")
print("="*60)
print(f"\n📅 Analysis Period: {ANALYSIS_START} to {ANALYSIS_END}")
print(f"🔍 Sample Rate Used: {SAMPLE_RATE*100}% (for heavy queries)")
print(f"⏰ Attribution Window: {ATTRIBUTION_WINDOW_MINUTES} minutes post-click")

if fold_rank:
    print(f"\n🎯 KEY FINDINGS:")
    print(f"  1. Fold detected at rank {fold_rank}")
    print(f"  2. Sharp discontinuity in impression probability at this threshold")
    
    if 'mean_effect' in locals():
        print(f"  3. Estimated causal effect of being above fold: {mean_effect:.1f}pp increase in purchases")
        print(f"  4. Effect is {'robust' if std_effect < abs(mean_effect) * 0.3 else 'sensitive'} across bandwidth specifications")
    
    print(f"\n✅ RDD VALIDITY CHECKS:")
    
    # Check for manipulation
    if 'suspicious' in locals() and len(suspicious) == 0:
        print(f"  ✓ No evidence of manipulation around cutoff")
    else:
        print(f"  ⚠️ Some discontinuities in running variable distribution")
    
    # Check covariate balance
    if 'df_covariates' in locals():
        cv_values = [df_covariates['PRODUCTS_PER_BID'].std() / df_covariates['PRODUCTS_PER_BID'].mean(),
                     df_covariates['VENDORS_PER_BID'].std() / df_covariates['VENDORS_PER_BID'].mean()]
        if all(cv < 0.2 for cv in cv_values):
            print(f"  ✓ Covariates vary smoothly across threshold")
        else:
            print(f"  ⚠️ Some covariate imbalance detected")
    
    print(f"  ✓ Clear discontinuity in treatment assignment")
    
    print(f"\n📋 FINAL ESTIMATES:")
    summary_data = [
        ['Fold Rank', str(fold_rank)],
        ['Impression Rate Jump', f"{above_fold['IMPRESSION_RATE_PCT'].mean() - below_fold['IMPRESSION_RATE_PCT'].mean():.1f}pp" if 'above_fold' in locals() else "N/A"],
        ['Purchase Rate Effect', f"{mean_effect:.2f}pp" if 'mean_effect' in locals() else "N/A"],
        ['Revenue Effect', f"${above['AVG_REVENUE'].mean() - below['AVG_REVENUE'].mean():.2f}" if 'above' in locals() and 'AVG_REVENUE' in above.columns else "N/A"]
    ]
    summary_df = pd.DataFrame(summary_data, columns=['Metric', 'Value'])
    show_table(summary_df, "")
    
else:
    print("\n⚠️ No clear discontinuity detected for RDD analysis")
    print("Consider:")
    print("  - Expanding the analysis period")
    print("  - Examining different rank thresholds")
    print("  - Checking data quality and join conditions")

print("\n" + "="*60)

In [None]:
# Close connection
cursor.close()
conn.close()
print("✅ Connection closed")