# 01_pull_data.ipynb: Hyper-Efficient Data Extraction for Bounded Causal Analysis

## Objective
Extract aggregated counts from PURCHASES and AUCTIONS_USERS tables only, avoiding complex joins.

## Key Design Decisions
- Period 1 (Segmentation): 2025-03-10 to 2025-06-30
- Period 2 (Outcomes): 2025-07-01 onwards
- Analysis Cohort: Users with ≥1 purchase in Period 1
- Treatment: Presence in AUCTIONS_USERS table

In [1]:
import os
import sys
import pandas as pd
import numpy as np
from dotenv import load_dotenv
import snowflake.connector
import time
from tqdm import tqdm

load_dotenv()

# Time period definitions
PERIOD1_START = '2025-03-10'
PERIOD1_END = '2025-06-30'
PERIOD2_START = '2025-07-01'
PERIOD2_END = '2025-09-15'

print(f"Period 1 (Segmentation): {PERIOD1_START} to {PERIOD1_END}")
print(f"Period 2 (Outcomes): {PERIOD2_START} to {PERIOD2_END}")

Period 1 (Segmentation): 2025-03-10 to 2025-06-30
Period 2 (Outcomes): 2025-07-01 to 2025-09-15


In [2]:
# Establish Snowflake connection
conn = None
try:
    conn = snowflake.connector.connect(
        user=os.getenv('SNOWFLAKE_USER'),
        password=os.getenv('SNOWFLAKE_PASSWORD'),
        account=os.getenv('SNOWFLAKE_ACCOUNT'),
        warehouse=os.getenv('SNOWFLAKE_WAREHOUSE'),
        database='INCREMENTALITY',
        schema='INCREMENTALITY_RESEARCH'
    )
    print("✅ Connection to Snowflake successful!")
    
    cursor = conn.cursor()
    cursor.execute("SELECT CURRENT_VERSION()")
    version = cursor.fetchone()
    print(f"   Snowflake version: {version[0]}")
    cursor.close()
    
except Exception as e:
    print(f"❌ ERROR: Could not connect to Snowflake.", file=sys.stderr)
    print(f"   Details: {e}", file=sys.stderr)
    sys.exit(1)

✅ Connection to Snowflake successful!
   Snowflake version: 9.28.1


## Step 1: Build Analysis Cohort and Segmentation

Extract all users with purchases in Period 1 and calculate segmentation variables.

In [3]:
# Query to extract Period 1 user features for segmentation
print("Extracting Period 1 user features...")
start_time = time.time()

period1_features_query = f"""
WITH period1_purchases AS (
    SELECT 
        USER_ID,
        PURCHASE_ID,
        PURCHASED_AT,
        QUANTITY,
        UNIT_PRICE,
        (QUANTITY * UNIT_PRICE / 100.0) AS purchase_value_dollars
    FROM PURCHASES
    WHERE PURCHASED_AT >= '{PERIOD1_START}' 
      AND PURCHASED_AT <= '{PERIOD1_END}'
),
user_features AS (
    SELECT
        USER_ID,
        COUNT(DISTINCT PURCHASE_ID) AS purchase_frequency,
        SUM(purchase_value_dollars) AS total_spend,
        AVG(purchase_value_dollars) AS avg_order_value,
        MIN(PURCHASED_AT) AS first_purchase_date,
        DATEDIFF('day', MIN(PURCHASED_AT), MAX(PURCHASED_AT)) AS purchase_span_days
    FROM period1_purchases
    GROUP BY USER_ID
)
SELECT * FROM user_features
"""

df_period1_features = pd.read_sql(period1_features_query, conn)
print(f"✅ Extracted {len(df_period1_features):,} users with Period 1 purchases")
print(f"   Time taken: {time.time() - start_time:.2f} seconds")
print(f"\nSample of Period 1 features:")
print(df_period1_features.head())
print(f"\nSummary statistics:")
print(df_period1_features.describe())

Extracting Period 1 user features...


  df_period1_features = pd.read_sql(period1_features_query, conn)


✅ Extracted 3,430,304 users with Period 1 purchases
   Time taken: 34.53 seconds

Sample of Period 1 features:
                                     USER_ID  PURCHASE_FREQUENCY  TOTAL_SPEND  \
0  ext1:46e99e8d-b96a-4eab-ba68-33c1197e9f68                   1         45.0   
1  ext1:93530b45-e549-43ff-aeea-9ade41b2fc7d                   1         60.0   
2  ext1:3ded7e05-a29d-4e7a-8ffd-50258e596160                   1        132.0   
3  ext1:f100c7ab-337b-47be-b425-e1ca028549d7                   1        250.0   
4  ext1:d0ca3816-e99f-44a5-968b-3566638b1029                   1        115.0   

   AVG_ORDER_VALUE FIRST_PURCHASE_DATE  PURCHASE_SPAN_DAYS  
0             45.0 2025-03-31 16:43:32                   0  
1             60.0 2025-05-27 13:42:38                   0  
2            132.0 2025-03-15 19:12:00                   0  
3            250.0 2025-06-18 05:20:24                   0  
4            115.0 2025-03-23 04:02:34                   0  

Summary statistics:
       PURCHASE

## Step 2: Create User Segments

Define segments based on Period 1 purchase behavior.

In [5]:
# Create segmentation variables
print("Creating user segments...")
print(f"\nAvailable columns: {df_period1_features.columns.tolist()}")

# Snowflake returns uppercase column names - convert to lowercase
df_period1_features.columns = [col.lower() for col in df_period1_features.columns]
print(f"Columns after lowercase conversion: {df_period1_features.columns.tolist()}")

# Monetary value quintiles
df_period1_features['spend_quintile'] = pd.qcut(
    df_period1_features['total_spend'], 
    q=5, 
    labels=['Q1_Low', 'Q2', 'Q3', 'Q4', 'Q5_High'],
    duplicates='drop'
)

# Purchase frequency groups
freq_bins = [0, 1, 3, 10, float('inf')]
freq_labels = ['1_purchase', '2-3_purchases', '4-10_purchases', '11+_purchases']
df_period1_features['frequency_group'] = pd.cut(
    df_period1_features['purchase_frequency'],
    bins=freq_bins,
    labels=freq_labels,
    include_lowest=True
)

# Tenure segments (days from period start to first purchase)
period1_start_date = pd.to_datetime(PERIOD1_START)
df_period1_features['first_purchase_date'] = pd.to_datetime(df_period1_features['first_purchase_date'])
df_period1_features['days_since_period_start'] = (
    df_period1_features['first_purchase_date'] - period1_start_date
).dt.days

tenure_bins = [-1, 30, 60, 90, float('inf')]  # Changed to start from -1 to include day 0
tenure_labels = ['0-30_days', '31-60_days', '61-90_days', '91+_days']
df_period1_features['tenure_group'] = pd.cut(
    df_period1_features['days_since_period_start'],
    bins=tenure_bins,
    labels=tenure_labels,
    include_lowest=True
)

# Average order value quintiles
df_period1_features['aov_quintile'] = pd.qcut(
    df_period1_features['avg_order_value'],
    q=5,
    labels=['AOV_Q1_Low', 'AOV_Q2', 'AOV_Q3', 'AOV_Q4', 'AOV_Q5_High'],
    duplicates='drop'
)

print("\nSegment distributions:")
print("\nSpend Quintiles:")
print(df_period1_features['spend_quintile'].value_counts().sort_index())
print("\nFrequency Groups:")
print(df_period1_features['frequency_group'].value_counts().sort_index())
print("\nTenure Groups:")
print(df_period1_features['tenure_group'].value_counts().sort_index())
print("\nAOV Quintiles:")
print(df_period1_features['aov_quintile'].value_counts().sort_index())

Creating user segments...

Available columns: ['USER_ID', 'PURCHASE_FREQUENCY', 'TOTAL_SPEND', 'AVG_ORDER_VALUE', 'FIRST_PURCHASE_DATE', 'PURCHASE_SPAN_DAYS']
Columns after lowercase conversion: ['user_id', 'purchase_frequency', 'total_spend', 'avg_order_value', 'first_purchase_date', 'purchase_span_days']

Segment distributions:

Spend Quintiles:
spend_quintile
Q1_Low     690501
Q2         683756
Q3         687322
Q4         683264
Q5_High    685461
Name: count, dtype: int64

Frequency Groups:
frequency_group
1_purchase        1823163
2-3_purchases      822747
4-10_purchases     564330
11+_purchases      220064
Name: count, dtype: int64

Tenure Groups:
tenure_group
0-30_days     1310754
31-60_days     915513
61-90_days     748294
91+_days       455743
Name: count, dtype: int64

AOV Quintiles:
aov_quintile
AOV_Q1_Low     710103
AOV_Q2         679787
AOV_Q3         668430
AOV_Q4         689551
AOV_Q5_High    682433
Name: count, dtype: int64


## Step 3: Identify Treatment and Control Groups

Check which users appear in AUCTIONS_USERS table.

In [None]:
# Get all users who have been in auctions (treatment group)
print("Identifying treatment group (users in AUCTIONS_USERS)...")
start_time = time.time()

# The challenge: AUCTIONS_USERS has OPAQUE_USER_ID, PURCHASES has USER_ID
# We need to identify which purchase users were exposed to auctions

print("   Checking for users exposed to ads (auctions)...")

# Since we know from the previous holdout detection that impressions can be linked to users,
# let's use a more direct approach through impressions/clicks which have USER_ID
treatment_query = """
WITH auction_exposed_users AS (
    -- Get users who received impressions (these are the treated users)
    SELECT DISTINCT USER_ID
    FROM IMPRESSIONS
    WHERE USER_ID IS NOT NULL
    
    UNION
    
    -- Also include users who clicked (subset of impressions but for completeness)
    SELECT DISTINCT USER_ID  
    FROM CLICKS
    WHERE USER_ID IS NOT NULL
)
SELECT DISTINCT USER_ID
FROM auction_exposed_users
"""

print("   Querying for users with ad exposure (impressions or clicks)...")
df_treatment_users = pd.read_sql(treatment_query, conn)
df_treatment_users.columns = [col.lower() for col in df_treatment_users.columns]
treatment_users_set = set(df_treatment_users['user_id'])

print(f"✅ Found {len(treatment_users_set):,} unique users with ad exposure")
print(f"   Time taken: {time.time() - start_time:.2f} seconds")

# Add treatment indicator to features dataframe
# Match treatment users with our analysis cohort
cohort_user_ids = set(df_period1_features['user_id'])
treatment_in_cohort = treatment_users_set.intersection(cohort_user_ids)

df_period1_features['is_treated'] = df_period1_features['user_id'].isin(treatment_in_cohort).astype(int)

print(f"\nTreatment assignment in analysis cohort:")
print(f"   Treated (D=1): {df_period1_features['is_treated'].sum():,} users")
print(f"   Control (D=0): {(1 - df_period1_features['is_treated']).sum():,} users")
print(f"   Treatment rate: {df_period1_features['is_treated'].mean():.4f}")

# Sanity check
if df_period1_features['is_treated'].sum() == 0:
    print("\n⚠️ WARNING: No treated users found in the analysis cohort!")
    print("   This could mean all purchasers are in the holdout group.")
elif df_period1_features['is_treated'].mean() > 0.99:
    print("\n⚠️ WARNING: Almost all users are treated!")
    print("   Control group is very small - estimates may be unstable.")

Identifying treatment group (users in AUCTIONS_USERS)...
   Checking for users exposed to ads (auctions)...
   Querying for users with ad exposure (impressions or clicks)...


  df_treatment_users = pd.read_sql(treatment_query, conn)


## Step 4: Get Period 2 Outcomes

Check which users made purchases in Period 2.

In [None]:
# Get Period 2 purchase outcomes
print("Extracting Period 2 outcomes...")
start_time = time.time()

period2_outcomes_query = f"""
SELECT DISTINCT USER_ID, 1 AS purchased_p2
FROM PURCHASES
WHERE PURCHASED_AT >= '{PERIOD2_START}'
  AND PURCHASED_AT <= '{PERIOD2_END}'
"""

df_period2_outcomes = pd.read_sql(period2_outcomes_query, conn)
# Convert columns to lowercase
df_period2_outcomes.columns = [col.lower() for col in df_period2_outcomes.columns]
period2_purchasers = set(df_period2_outcomes['user_id'])

print(f"✅ Found {len(period2_purchasers):,} users who purchased in Period 2")
print(f"   Time taken: {time.time() - start_time:.2f} seconds")

# Add outcome to features dataframe
df_period1_features['purchased_p2'] = df_period1_features['user_id'].isin(period2_purchasers).astype(int)

print(f"\nPeriod 2 purchase rates:")
print(f"   Overall: {df_period1_features['purchased_p2'].mean():.4f}")
print(f"   Treated: {df_period1_features[df_period1_features['is_treated']==1]['purchased_p2'].mean():.4f}")
print(f"   Control: {df_period1_features[df_period1_features['is_treated']==0]['purchased_p2'].mean():.4f}")

## Step 5: Aggregate Counts by Segment

Create the aggregated counts table needed for bounded estimation.

In [None]:
def aggregate_by_segment(df, segment_col, segment_type):
    """
    Aggregate counts for a specific segmentation variable.
    
    Returns DataFrame with columns:
    - segment_type: Type of segmentation
    - segment_name: Name of the segment
    - N_1: Count of treated users
    - n_11: Count of treated users who purchased in P2
    - n_01: Count of control users who purchased in P2
    - N_0: Count of control users
    """
    results = []
    
    for segment_value in df[segment_col].dropna().unique():
        segment_df = df[df[segment_col] == segment_value]
        
        treated = segment_df[segment_df['is_treated'] == 1]
        control = segment_df[segment_df['is_treated'] == 0]
        
        N_1 = len(treated)
        n_11 = treated['purchased_p2'].sum()
        N_0 = len(control)
        n_01 = control['purchased_p2'].sum()
        
        results.append({
            'segment_type': segment_type,
            'segment_name': str(segment_value),
            'N_1': N_1,
            'n_11': n_11,
            'N_0': N_0,
            'n_01': n_01
        })
    
    return pd.DataFrame(results)

# Aggregate for each segment type
print("Aggregating counts by segment...")

segment_configs = [
    ('spend_quintile', 'Spend_Quintile'),
    ('frequency_group', 'Purchase_Frequency'),
    ('tenure_group', 'Tenure'),
    ('aov_quintile', 'AOV_Quintile')
]

all_aggregates = []

for segment_col, segment_type in tqdm(segment_configs, desc="Processing segments"):
    segment_agg = aggregate_by_segment(df_period1_features, segment_col, segment_type)
    all_aggregates.append(segment_agg)
    print(f"\n{segment_type}:")
    print(segment_agg.to_string(index=False))

# Add overall cohort
overall_treated = df_period1_features[df_period1_features['is_treated'] == 1]
overall_control = df_period1_features[df_period1_features['is_treated'] == 0]

overall_agg = pd.DataFrame([{
    'segment_type': 'Overall_Cohort',
    'segment_name': 'All_Users',
    'N_1': len(overall_treated),
    'n_11': overall_treated['purchased_p2'].sum(),
    'N_0': len(overall_control),
    'n_01': overall_control['purchased_p2'].sum()
}])

all_aggregates.append(overall_agg)

# Combine all aggregates
df_final_aggregates = pd.concat(all_aggregates, ignore_index=True)

print("\n" + "="*80)
print("FINAL AGGREGATED COUNTS TABLE:")
print("="*80)
print(df_final_aggregates.to_string(index=False))

## Step 6: Save Results

In [None]:
# Save aggregated counts
output_file = 'causal_estimation_counts.csv'
df_final_aggregates.to_csv(output_file, index=False)
print(f"✅ Saved aggregated counts to {output_file}")

# Save detailed user-level data for validation
user_level_file = 'user_level_features.parquet'
df_period1_features.to_parquet(user_level_file, index=False)
print(f"✅ Saved user-level features to {user_level_file}")

# Generate summary report
with open('data_extraction_summary.txt', 'w') as f:
    f.write("DATA EXTRACTION SUMMARY\n")
    f.write("=" * 80 + "\n\n")
    
    f.write(f"Period 1 (Segmentation): {PERIOD1_START} to {PERIOD1_END}\n")
    f.write(f"Period 2 (Outcomes): {PERIOD2_START} to {PERIOD2_END}\n\n")
    
    f.write("COHORT STATISTICS\n")
    f.write("-" * 40 + "\n")
    f.write(f"Total users in analysis cohort: {len(df_period1_features):,}\n")
    f.write(f"Treated users (D=1): {df_period1_features['is_treated'].sum():,}\n")
    f.write(f"Control users (D=0): {(1 - df_period1_features['is_treated']).sum():,}\n")
    f.write(f"Treatment rate: {df_period1_features['is_treated'].mean():.4f}\n\n")
    
    f.write("OUTCOME STATISTICS\n")
    f.write("-" * 40 + "\n")
    f.write(f"Overall P2 purchase rate: {df_period1_features['purchased_p2'].mean():.4f}\n")
    f.write(f"Treated P2 purchase rate: {df_period1_features[df_period1_features['is_treated']==1]['purchased_p2'].mean():.4f}\n")
    f.write(f"Control P2 purchase rate: {df_period1_features[df_period1_features['is_treated']==0]['purchased_p2'].mean():.4f}\n\n")
    
    f.write("AGGREGATED COUNTS BY SEGMENT\n")
    f.write("-" * 40 + "\n")
    f.write(df_final_aggregates.to_string(index=False))

print("\n✅ Data extraction complete!")
print(f"   - Aggregated counts: {output_file}")
print(f"   - User-level features: {user_level_file}")
print(f"   - Summary report: data_extraction_summary.txt")