# 01_pull_data_optimized.ipynb: Ultra-Fast Minimal Data Extraction

## Key Optimization
- Single SQL query returns just 1 row with 4 essential numbers
- NO segmentation, NO features, NO complexity
- Runs in seconds, not minutes
- Absolute minimum data needed for bounded estimation

## Cohort Definition
Users who appear in BOTH:
1. AUCTIONS_USERS (ever entered the ad funnel)
2. PURCHASES in Period 1 (made at least one purchase)

In [7]:
import os
import sys
import pandas as pd
from dotenv import load_dotenv
import snowflake.connector
import time

load_dotenv()

# Time period definitions
PERIOD1_START = '2025-03-10'
PERIOD1_END = '2025-06-30'
PERIOD2_START = '2025-07-01'
PERIOD2_END = '2025-09-15'

print(f"Period 1 (Cohort definition): {PERIOD1_START} to {PERIOD1_END}")
print(f"Period 2 (Outcomes): {PERIOD2_START} to {PERIOD2_END}")

Period 1 (Cohort definition): 2025-03-10 to 2025-06-30
Period 2 (Outcomes): 2025-07-01 to 2025-09-15


In [8]:
# Establish Snowflake connection
conn = None
try:
    conn = snowflake.connector.connect(
        user=os.getenv('SNOWFLAKE_USER'),
        password=os.getenv('SNOWFLAKE_PASSWORD'),
        account=os.getenv('SNOWFLAKE_ACCOUNT'),
        warehouse=os.getenv('SNOWFLAKE_WAREHOUSE'),
        database='INCREMENTALITY',
        schema='INCREMENTALITY_RESEARCH'
    )
    print("✅ Connection to Snowflake successful!")
    
    cursor = conn.cursor()
    cursor.execute("SELECT CURRENT_VERSION()")
    version = cursor.fetchone()
    print(f"   Snowflake version: {version[0]}")
    cursor.close()
    
except Exception as e:
    print(f"❌ ERROR: Could not connect to Snowflake.", file=sys.stderr)
    print(f"   Details: {e}", file=sys.stderr)
    sys.exit(1)

✅ Connection to Snowflake successful!
   Snowflake version: 9.28.1


## Execute the Ultra-Fast Query

This minimal query:
1. Identifies users in BOTH AUCTIONS_USERS and Period 1 PURCHASES
2. Determines treatment status (saw impressions or not)
3. Identifies Period 2 outcomes
4. Returns a single row with 4 essential counts

In [9]:
print("Executing ultra-fast query with EXISTS and COUNT_IF...")
start_time = time.time()

# Ultra-optimized query using EXISTS and COUNT_IF for maximum speed
ultra_fast_query = f"""
WITH CohortUsers AS (
    -- Get users who are in BOTH auctions and Period 1 purchases
    SELECT DISTINCT p1.USER_ID
    FROM PURCHASES p1
    INNER JOIN AUCTIONS_USERS au ON p1.USER_ID = au.OPAQUE_USER_ID
    WHERE p1.PURCHASED_AT BETWEEN '{PERIOD1_START}' AND '{PERIOD1_END}'
),
FinalUserLevel AS (
    SELECT
        cu.USER_ID,
        -- Check if user saw impressions (treatment)
        EXISTS (
            SELECT 1 FROM IMPRESSIONS i 
            WHERE i.USER_ID = cu.USER_ID
        ) AS is_treated,
        -- Check if user purchased in Period 2
        EXISTS (
            SELECT 1 FROM PURCHASES p2
            WHERE p2.USER_ID = cu.USER_ID
              AND p2.PURCHASED_AT BETWEEN '{PERIOD2_START}' AND '{PERIOD2_END}'
        ) AS purchased_p2
    FROM CohortUsers cu
)
-- Final aggregation using COUNT_IF
SELECT
    COUNT_IF(is_treated) AS N_1,
    COUNT_IF(is_treated AND purchased_p2) AS n_11,
    COUNT_IF(NOT is_treated) AS N_0,
    COUNT_IF(NOT is_treated AND purchased_p2) AS n_01
FROM FinalUserLevel
"""

# Execute the query
df_counts = pd.read_sql(ultra_fast_query, conn)

# Convert column names to lowercase for consistency
df_counts.columns = [col.lower() for col in df_counts.columns]

elapsed_time = time.time() - start_time
print(f"✅ Query executed in {elapsed_time:.2f} seconds")
print(f"   Ultra-fast execution using EXISTS and COUNT_IF")

Executing ultra-fast query with EXISTS and COUNT_IF...


  df_counts = pd.read_sql(ultra_fast_query, conn)


KeyboardInterrupt: 

In [None]:
# Display the results
print("\nRESULTS:")
print("="*60)
print(df_counts)

# Extract the single row of data
if len(df_counts) > 0:
    row = df_counts.iloc[0]
    total_cohort = row['n_1'] + row['n_0']
    treatment_rate = row['n_1'] / total_cohort if total_cohort > 0 else 0
    
    print("\n" + "="*60)
    print("SUMMARY:")
    print("="*60)
    print(f"Total cohort size: {total_cohort:,.0f} users")
    print(f"  - Must be in AUCTIONS_USERS: ✓")
    print(f"  - Must have Period 1 purchase: ✓")
    print(f"\nTreatment assignment:")
    print(f"  - Ad-seers (saw impressions): {row['n_1']:,.0f} ({treatment_rate:.1%})")
    print(f"  - Secret shoppers (no impressions): {row['n_0']:,.0f} ({1-treatment_rate:.1%})")
    print(f"\nPeriod 2 outcomes:")
    print(f"  - Ad-seers who purchased: {row['n_11']:,.0f}")
    print(f"  - Secret shoppers who purchased: {row['n_01']:,.0f}")
    
    # Calculate rates
    p_1 = row['n_11'] / row['n_1'] if row['n_1'] > 0 else 0
    p_0_observed = row['n_01'] / row['n_0'] if row['n_0'] > 0 else 0
    
    print(f"\nPurchase rates:")
    print(f"  - Ad-seers (p_1): {p_1:.4f}")
    print(f"  - Secret shoppers (observed): {p_0_observed:.4f}")
    print(f"  - Difference (biased): {p_1 - p_0_observed:.4f}")

## Save Results for Analysis

In [None]:
# Close Snowflake connection
if conn:
    conn.close()
    print("\n✅ Snowflake connection closed.")