In [9]:
import os
import sys
import pandas as pd
from dotenv import load_dotenv
import snowflake.connector

# Load environment variables from the .env file
# This command looks for the .env file in the same directory as your notebook
load_dotenv()

# --- Snowflake Connection Block ---
# `conn` is intentionally left open when this cell finishes: it is a notebook-level
# handle that stays None if the connection attempt fails.
conn = None
try:
    # Fail early with a readable message when a credential is missing or empty,
    # instead of relying on a less specific error from the connector.
    required_vars = ('SNOWFLAKE_USER', 'SNOWFLAKE_PASSWORD', 'SNOWFLAKE_ACCOUNT')
    missing_vars = [name for name in required_vars if not os.getenv(name)]
    if missing_vars:
        raise ValueError(f"Missing environment variable(s): {', '.join(missing_vars)}")

    # Establish the connection using credentials from the .env file
    conn = snowflake.connector.connect(
        user=os.getenv('SNOWFLAKE_USER'),
        password=os.getenv('SNOWFLAKE_PASSWORD'),
        account=os.getenv('SNOWFLAKE_ACCOUNT'),
        warehouse=os.getenv('SNOWFLAKE_WAREHOUSE'),
        database='INCREMENTALITY',
        schema='INCREMENTALITY_RESEARCH'
    )
    print("✅ Connection to Snowflake successful!")

    # Optional: Verify the connection with a simple query.
    # The cursor is opened in a `with` block so it is closed even if the query fails.
    with conn.cursor() as cursor:
        cursor.execute("SELECT CURRENT_VERSION()")
        one_row = cursor.fetchone()
    print(f"   Snowflake version: {one_row[0]}")

except Exception as e:
    # Report on stderr and keep the notebook running; the error is not re-raised.
    print("❌ ERROR: Could not connect to Snowflake.", file=sys.stderr)
    print("   Please check your credentials in the .env file and network connection.", file=sys.stderr)
    print(f"   Details: {e}", file=sys.stderr)


✅ Connection to Snowflake successful!
   Snowflake version: 9.29.0


In [15]:
import pandas as pd
from tabulate import tabulate
import warnings
import numpy as np

# Suppress the UserWarning for cleaner output
warnings.filterwarnings(
    "ignore",
    message="pandas only supports SQLAlchemy connectable.*",
    category=UserWarning,
)

# --- 1. Configuration ---
# NOTE(review): the query applies BETWEEN to these bounds, which is inclusive on
# both ends. If OCCURRED_AT / PURCHASED_AT are timestamps, the end bound presumably
# resolves to midnight at the start of that day -- confirm the window is intended.
ANALYSIS_START_DATE = '2025-08-20'
ANALYSIS_END_DATE = '2025-08-21'

# --- 2. SQL Query to Get Raw Counts ---
# Returns a single row with four counts: N1, y11, d11 and n01.
# NULL USER_IDs are filtered out of every CTE. A single NULL in impressed_users
# would make the NOT IN test evaluate to NULL for every purchaser and silently
# turn n01 into 0; the other three counts are unaffected by the filter because
# COUNT(DISTINCT ...) ignores NULLs and `NULL IN (...)` is never true.
counts_query = f"""
WITH
-- CTE 1: Identify the groups of users within the period
impressed_users AS (
    SELECT DISTINCT USER_ID FROM IMPRESSIONS
    WHERE OCCURRED_AT BETWEEN '{ANALYSIS_START_DATE}' AND '{ANALYSIS_END_DATE}'
      AND USER_ID IS NOT NULL
),
clicked_users AS (
    SELECT DISTINCT USER_ID FROM CLICKS
    WHERE OCCURRED_AT BETWEEN '{ANALYSIS_START_DATE}' AND '{ANALYSIS_END_DATE}'
      AND USER_ID IS NOT NULL
),
purchasers AS (
    SELECT DISTINCT USER_ID FROM PURCHASES
    WHERE PURCHASED_AT BETWEEN '{ANALYSIS_START_DATE}' AND '{ANALYSIS_END_DATE}'
      AND USER_ID IS NOT NULL
)
-- Final Aggregation: We only need counts related to Z=1 and the single n01 count
SELECT
    (SELECT COUNT(DISTINCT USER_ID) FROM impressed_users) AS N1,
    (SELECT COUNT(DISTINCT USER_ID) FROM impressed_users WHERE USER_ID IN (SELECT USER_ID FROM purchasers)) AS y11,
    (SELECT COUNT(DISTINCT USER_ID) FROM impressed_users WHERE USER_ID IN (SELECT USER_ID FROM clicked_users)) AS d11,
    (SELECT COUNT(DISTINCT USER_ID) FROM purchasers WHERE USER_ID NOT IN (SELECT USER_ID FROM impressed_users)) AS n01
"""

# --- 3. Execute Query and Run Sensitivity Analysis ---
# `conn` is expected to come from the connection cell; a falsy value skips the analysis.
if conn:
    print("--- Running LATE Sensitivity Analysis for Different Holdout Probabilities ---")
    try:
        df_counts = pd.read_sql(counts_query, conn)
        counts = df_counts.iloc[0]

        # The result columns are read back under upper-case names.
        N1 = counts['N1']
        y11 = counts['Y11']
        d11 = counts['D11']
        n01 = counts['N01']

        print("\n✅ Query successful. Observable counts retrieved:")
        print(f"   - N1 (Exposed Count): {N1:,.0f}")
        print(f"   - y11 (Exposed Purchases): {y11:,.0f}")
        print(f"   - d11 (Exposed Clicks): {d11:,.0f}")
        print(f"   - n01 (Holdout Purchases): {n01:,.0f}")

        # Both counts are used as denominators below.
        if N1 == 0 or d11 == 0:
            raise ValueError("No treated users or no clicks in the treatment group. Cannot calculate LATE.")

        # Pre-calculate constant rates for the loop
        a = y11 / N1
        c = d11 / N1

        # --- 4. Loop Through Different Values of π₀ ---
        sensitivity_results = []
        late_estimates = []          # numeric LATE per π₀, used to choose the interpretation
        impossible_rate_labels = []  # π₀ values whose control purchase rate exceeds 100%

        # Define the range of holdout probabilities to test (1% to 10%).
        # Built from integer percentages so the number of grid points does not
        # depend on floating-point rounding of a fractional step.
        holdout_probabilities = np.arange(1, 11) / 100

        for pi_0 in holdout_probabilities:
            pi_1 = 1 - pi_0

            # Calculate N₀ and b̂ for the current pi_0
            N0 = (pi_0 / pi_1) * N1
            b_hat = n01 / N0 if N0 > 0 else 0

            # A rate above 1 means n01 is larger than the reconstructed N₀.
            if b_hat > 1:
                impossible_rate_labels.append(f"{pi_0:.0%}")

            # Calculate the LATE estimate
            LATE_estimate = (a - b_hat) / c
            late_estimates.append(LATE_estimate)

            sensitivity_results.append({
                "Assumed π₀ (Holdout %)": f"{pi_0:.0%}",
                "Est. N₀ (Control Size)": f"{N0:,.0f}",
                "Est. b̂ (Control Purchase Rate)": f"{b_hat:.2%}",
                "LATE Estimate": f"{LATE_estimate:+.2%}"
            })

        # --- 5. Generate Final Report ---
        print("\n" + "="*80)
        print("    LATE Sensitivity Analysis with Reconstructed Control Group")
        print("="*80)
        print("\nThis table shows how the LATE estimate changes based on the assumed")
        print("randomization probability for the holdout group (π₀).\n")

        print("Constant Rates (Unaffected by π₀):")
        print(f"  - a (Purchase Rate | Z=1): {a:.4%}")
        print(f"  - c (Click Rate | Z=1):    {c:.4%}\n")

        # Display the results in a clean table
        results_df = pd.DataFrame(sensitivity_results)
        print(tabulate(results_df, headers='keys', tablefmt='grid', showindex=False))

        if impossible_rate_labels:
            print("\n⚠️ WARNING: the estimated control purchase rate exceeds 100% for π₀ = "
                  + ", ".join(impossible_rate_labels) + ".")
            print("   n01 is larger than the reconstructed control size N₀ in those rows,")
            print("   so they cannot describe a real holdout group.")

        # Only state the "negative everywhere" conclusion when the numbers support it.
        print("\nInterpretation:")
        if all(estimate < 0 for estimate in late_estimates):
            print("The LATE estimate remains strongly negative across all plausible holdout probabilities.")
            print("This confirms the result is not an artifact of assuming π₀=5%, but is due to")
            print("a fundamental selection bias: the holdout purchasers (n01) are a fundamentally")
            print("different, higher-converting group than the general population (N1).")
        else:
            print("The LATE estimate is not negative for every tested holdout probability,")
            print("so its sign depends on the assumed π₀.")
        print("="*80)

    except Exception as e:
        # Any failure (query or calculation) is reported and not re-raised.
        print(f"❌ ERROR executing analysis: {e}")
else:
    print("❌ Snowflake connection ('conn') not found. Please run the connection cell first.")

--- Running LATE Sensitivity Analysis for Different Holdout Probabilities ---

✅ Query successful. Observable counts retrieved:
   - N1 (Exposed Count): 738,417
   - y11 (Exposed Purchases): 69,314
   - d11 (Exposed Clicks): 270,925
   - n01 (Holdout Purchases): 22,550

    LATE Sensitivity Analysis with Reconstructed Control Group

This table shows how the LATE estimate changes based on the assumed
randomization probability for the holdout group (π₀).

Constant Rates (Unaffected by π₀):
  - a (Purchase Rate | Z=1): 9.3868%
  - c (Click Rate | Z=1):    36.6900%

+--------------------------+--------------------------+----------------------------------+-----------------+
| Assumed π₀ (Holdout %)   | Est. N₀ (Control Size)   | Est. b̂ (Control Purchase Rate)   | LATE Estimate   |
| 1%                       | 7,459                    | 302.33%                          | -798.43%        |
+--------------------------+--------------------------+----------------------------------+-------------

In [19]:
import pandas as pd
from tabulate import tabulate
import numpy as np

# --- 1. Observed Data (The Ground Truth) ---
# Counts are hard-coded here, so this cell does not query the database.
OBSERVED_COUNTS = {
    'N1': 738417,
    'N0_est': 38864, # Estimated from N1 * (0.05 / 0.95)
    'y11': 69314,   # count(Z=1, Y=1)
    'd11': 270925,  # count(Z=1, D=1)
    'n01': 22550   # count(Z=0, Y=1)
}

# --- 2. Calculate Empirical Moments (m_D1, m_Y0, m_Y1) ---
m_D1 = OBSERVED_COUNTS['d11'] / OBSERVED_COUNTS['N1']
m_Y0 = OBSERVED_COUNTS['n01'] / OBSERVED_COUNTS['N0_est']
m_Y1 = OBSERVED_COUNTS['y11'] / OBSERVED_COUNTS['N1']

# --- 3. Solve for Best-Fit Parameters (b_hat, c_hat, d_hat) ---
# This is the analytical solution from the Method of Moments objective function.
b_hat = m_D1
c_hat = m_Y0
# d_hat falls back to 0 when b_hat is not positive, avoiding a division by zero.
d_hat = (m_Y1 - c_hat) / b_hat if b_hat > 0 else 0

# --- 4. Diagnosis and Report ---
print("="*80)
print("             Method of Moments Diagnosis of Causal Model")
print("="*80)

print("\n1. Empirical Moments (Observed Data):")
moments_data = [
    ["Compliance Rate in Treatment (m_D1)", f"{m_D1:.4%}"],
    ["Purchase Rate in Control (m_Y0)", f"{m_Y0:.4%}"],
    ["Purchase Rate in Treatment (m_Y1)", f"{m_Y1:.4%}"]
]
print(tabulate(moments_data, headers=["Moment", "Observed Value"], tablefmt="grid"))

print("\n2. Best-Fit Causal Parameters (Method of Moments Solution):")
params_data = [
    ["b̂ (Best-fit Compliance Rate)", f"{b_hat:.4%}", "Matches m_D1 exactly."],
    ["ĉ (Best-fit Baseline Purchase Rate)", f"{c_hat:.4%}", "Matches m_Y0 exactly."],
    ["d̂ (Best-fit Incrementality / LATE)", f"{d_hat:.4%}", "Calculated to reconcile the other moments."]
]
print(tabulate(params_data, headers=["Parameter", "Value", "Derivation"], tablefmt="grid"))

print("\n3. Diagnosis: The Contradiction")
print("The Method of Moments reveals a fundamental contradiction in the data,")
print("proving that a single causal model cannot explain the behavior of both groups:\n")

# The contradiction is that a baseline purchase rate this high is nonsensical for the general population.
# Every figure quoted in the narrative is formatted from the values computed above,
# so the text cannot drift from the tables printed earlier in this cell.
contradiction_table = [
    ["The Problem", f"To match the observed holdout purchasers, the model's baseline purchase rate (`ĉ`) must be an incredibly high **{c_hat:.2%}**."],
    ["The Consequence", f"However, the observed purchase rate for the entire treatment group (`m_Y1`) is only **{m_Y1:.2%}**."],
    ["The Impossibility", f"It is mathematically impossible for a click to have a negative enough effect (`d̂` = {d_hat:.2%}) to drag a {c_hat:.2%} baseline down to {m_Y1:.2%}. A purchase rate cannot be negative."],
    ["The Conclusion", "The model fails because the assumption of a single baseline rate (`c`) for both groups is false. The control group (`Z=0`) and treatment group (`Z=1`) are two different populations with two different baseline purchase rates, proving severe selection bias."]
]
print(tabulate(contradiction_table, tablefmt="fancy_grid", maxcolwidths=[17, 90]))
print("="*80)

             Method of Moments Diagnosis of Causal Model

1. Empirical Moments (Observed Data):
+-------------------------------------+------------------+
| Moment                              | Observed Value   |
| Compliance Rate in Treatment (m_D1) | 36.6900%         |
+-------------------------------------+------------------+
| Purchase Rate in Control (m_Y0)     | 58.0228%         |
+-------------------------------------+------------------+
| Purchase Rate in Treatment (m_Y1)   | 9.3868%          |
+-------------------------------------+------------------+

2. Best-Fit Causal Parameters (Method of Moments Solution):
+-------------------------------------+------------+--------------------------------------------+
| Parameter                           | Value      | Derivation                                 |
| b̂ (Best-fit Compliance Rate)        | 36.6900%   | Matches m_D1 exactly.                      |
+-------------------------------------+------------+------------------------