In [None]:
# CELL 1: IMPORTS + CONNECTION (run once, handles duo)
import os
import textwrap
from pathlib import Path
import warnings
import pandas as pd
from dotenv import load_dotenv
import snowflake.connector
from tqdm import tqdm

# Suppress all Python warnings for the remainder of the session.
warnings.filterwarnings('ignore')

# load_dotenv() (python-dotenv) runs before the os.environ lookups below,
# which read the SNOWFLAKE_* connection settings.
load_dotenv()

# Directory that receives the Round 3 parquet extracts; created if missing.
OUTPUT_DIR = Path("./data_r3")
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

# Open the session used by every query in this notebook. Only the warehouse
# has a fallback value; user, password and account are passed through as
# whatever the environment holds (None when unset).
conn = snowflake.connector.connect(
    account=os.environ.get('SNOWFLAKE_ACCOUNT'),
    user=os.environ.get('SNOWFLAKE_USER'),
    password=os.environ.get('SNOWFLAKE_PASSWORD'),
    warehouse=os.environ.get('SNOWFLAKE_WAREHOUSE', 'COMPUTE_WH'),
    database='INCREMENTALITY',
    schema='INCREMENTALITY_RESEARCH',
)
print("[SUCCESS] Connected to Snowflake")

In [None]:
# CELL 2: DATA PULL (Round 3 - expanded parameters)
# CONFIG
MINUTES_WINDOW = 60  # 60 minutes (was 15 in R1)
SAMPLE_FRACTION = 0.10  # 10% of users (expanded from 1%)
TOTAL_BUCKETS = 10000
# round() instead of int(): the float product is not guaranteed to be exact,
# and int() would silently truncate a value that lands just below the
# intended whole number.
SELECTION_THRESHOLD = round(TOTAL_BUCKETS * SAMPLE_FRACTION)

print(f"[Round 3] Pulling {MINUTES_WINDOW}min, ALL placements, {SAMPLE_FRACTION:.0%} users...")
print("Expected yield: ~800K auctions, ~70K users, ~240K clicks, ~7.7M impressions")

# Anchor the pull window ONCE for the whole cell. Each query below runs at a
# different wall-clock time; evaluating CURRENT_TIMESTAMP() inside every query
# would shift the window (and therefore the sampled user set) between pulls,
# so the extracts would not describe the same cohort. A session variable
# keeps the anchor server-side, avoiding any client-side timestamp/timezone
# formatting.
_anchor_cursor = conn.cursor()
try:
    _anchor_cursor.execute("SET PULL_WINDOW_END = CURRENT_TIMESTAMP()")
finally:
    _anchor_cursor.close()


def _in_window(column):
    """Return a SQL predicate keeping `column` inside the anchored pull window.

    The window is half-open: [PULL_WINDOW_END - MINUTES_WINDOW minutes,
    PULL_WINDOW_END). The upper bound stops later queries from picking up
    events that arrived after the anchor was taken.
    """
    return (
        f"{column} >= DATEADD(minute, -{MINUTES_WINDOW}, $PULL_WINDOW_END) "
        f"AND {column} < $PULL_WINDOW_END"
    )


# CTE for deterministic user sampling (all placements).
# Users with at least one auction in the window are hashed into TOTAL_BUCKETS
# buckets; those in buckets below SELECTION_THRESHOLD are kept.
CTE_SQL = f"""
WITH SAMPLED_USERS AS (
    SELECT OPAQUE_USER_ID FROM (
        SELECT OPAQUE_USER_ID, MOD(ABS(HASH(OPAQUE_USER_ID)), {TOTAL_BUCKETS}) AS bucket
        FROM (SELECT DISTINCT OPAQUE_USER_ID FROM AUCTIONS_USERS
              WHERE {_in_window('CREATED_AT')})
    ) WHERE bucket < {SELECTION_THRESHOLD}
)
"""

# 1. AUCTIONS_USERS
print("\n1/6 AUCTIONS_USERS...")
auctions_users = pd.read_sql(CTE_SQL + f"""
SELECT LOWER(TO_VARCHAR(au.AUCTION_ID, 'HEX')) AS AUCTION_ID,
       au.OPAQUE_USER_ID AS USER_ID, au.PLACEMENT, au.CREATED_AT
FROM AUCTIONS_USERS au
JOIN SAMPLED_USERS s ON au.OPAQUE_USER_ID = s.OPAQUE_USER_ID
WHERE {_in_window('au.CREATED_AT')}
""", conn)
print(f"  {len(auctions_users):,} rows, {auctions_users['USER_ID'].nunique():,} users")
print(f"  Placements: {auctions_users['PLACEMENT'].value_counts().to_dict()}")

# 2. AUCTIONS_RESULTS
# NOTE(review): the join to AUCTIONS_USERS is not time-bounded on the au side;
# only ar.CREATED_AT is filtered. Left as in the original - confirm intent.
print("\n2/6 AUCTIONS_RESULTS...")
auctions_results = pd.read_sql(CTE_SQL + f"""
SELECT LOWER(TO_VARCHAR(ar.AUCTION_ID, 'HEX')) AS AUCTION_ID,
       LOWER(TO_VARCHAR(ar.VENDOR_ID, 'HEX')) AS VENDOR_ID,
       LOWER(TO_VARCHAR(ar.CAMPAIGN_ID, 'HEX')) AS CAMPAIGN_ID,
       LOWER(TRIM(ar.PRODUCT_ID)) AS PRODUCT_ID,
       ar.RANKING, ar.IS_WINNER, ar.FINAL_BID, ar.QUALITY,
       ar.CONVERSION_RATE, ar.PACING, ar.PRICE, ar.CREATED_AT
FROM AUCTIONS_RESULTS ar
JOIN AUCTIONS_USERS au ON ar.AUCTION_ID = au.AUCTION_ID
JOIN SAMPLED_USERS s ON au.OPAQUE_USER_ID = s.OPAQUE_USER_ID
WHERE {_in_window('ar.CREATED_AT')}
""", conn)
print(f"  {len(auctions_results):,} rows")

# 3. IMPRESSIONS
# Ids are lower-cased with dashes stripped so they line up with the hex form
# produced for the auction tables above.
print("\n3/6 IMPRESSIONS...")
impressions = pd.read_sql(CTE_SQL + f"""
SELECT i.INTERACTION_ID, LOWER(REPLACE(i.AUCTION_ID, '-', '')) AS AUCTION_ID,
       LOWER(TRIM(i.PRODUCT_ID)) AS PRODUCT_ID, i.USER_ID,
       LOWER(REPLACE(i.CAMPAIGN_ID, '-', '')) AS CAMPAIGN_ID,
       LOWER(REPLACE(i.VENDOR_ID, '-', '')) AS VENDOR_ID, i.OCCURRED_AT
FROM IMPRESSIONS i
JOIN SAMPLED_USERS s ON i.USER_ID = s.OPAQUE_USER_ID
WHERE {_in_window('i.OCCURRED_AT')}
""", conn)
print(f"  {len(impressions):,} rows")

# 4. CLICKS
print("\n4/6 CLICKS...")
clicks = pd.read_sql(CTE_SQL + f"""
SELECT c.INTERACTION_ID, LOWER(REPLACE(c.AUCTION_ID, '-', '')) AS AUCTION_ID,
       LOWER(TRIM(c.PRODUCT_ID)) AS PRODUCT_ID, c.USER_ID,
       LOWER(REPLACE(c.CAMPAIGN_ID, '-', '')) AS CAMPAIGN_ID,
       LOWER(REPLACE(c.VENDOR_ID, '-', '')) AS VENDOR_ID, c.OCCURRED_AT
FROM CLICKS c
JOIN SAMPLED_USERS s ON c.USER_ID = s.OPAQUE_USER_ID
WHERE {_in_window('c.OCCURRED_AT')}
""", conn)
print(f"  {len(clicks):,} rows")

# 5. PURCHASES (new in R2)
print("\n5/6 PURCHASES...")
purchases = pd.read_sql(CTE_SQL + f"""
SELECT p.PURCHASE_ID,
       LOWER(TRIM(p.PRODUCT_ID)) AS PRODUCT_ID,
       p.USER_ID,
       p.PURCHASED_AT,
       p.QUANTITY,
       p.UNIT_PRICE,
       p.PURCHASE_LINE
FROM PURCHASES p
JOIN SAMPLED_USERS s ON p.USER_ID = s.OPAQUE_USER_ID
WHERE {_in_window('p.PURCHASED_AT')}
""", conn)
print(f"  {len(purchases):,} rows, {purchases['USER_ID'].nunique():,} users")
print(f"  Total revenue: ${(purchases['QUANTITY'] * purchases['UNIT_PRICE']).sum():,.0f}")

# 6. CATALOG (only products that received impressions)
print("\n6/6 CATALOG...")
# Distinct, non-null product ids seen in the impression pull.
product_ids = impressions['PRODUCT_ID'].dropna().unique().tolist()
print(f"  Products to fetch: {len(product_ids):,}")
if not product_ids:
    # No impressions carried a product id, so there is nothing to look up.
    catalog = pd.DataFrame()
else:
    batch_size = 10000
    catalog_dfs = []
    # Look the ids up in slices of at most batch_size bound values per query.
    for offset in tqdm(range(0, len(product_ids), batch_size), desc="Catalog"):
        chunk = product_ids[offset:offset + batch_size]
        # One %s placeholder per id; the values are bound via `params`.
        placeholders = ', '.join('%s' for _ in chunk)
        catalog_query = f"""
        SELECT LOWER(TRIM(PRODUCT_ID)) AS PRODUCT_ID, NAME, PRICE AS CATALOG_PRICE,
               ACTIVE, IS_DELETED, CATEGORIES, DESCRIPTION
        FROM CATALOG WHERE LOWER(TRIM(PRODUCT_ID)) IN ({placeholders})
        """
        catalog_dfs.append(pd.read_sql(catalog_query, conn, params=chunk))
    catalog = pd.concat(catalog_dfs, ignore_index=True) if catalog_dfs else pd.DataFrame()
print(f"  {len(catalog):,} rows")

# SAVE
print("\nSaving parquet files...")
# (frame, file name) pairs, written to OUTPUT_DIR in this order without the index.
for frame, filename in (
    (auctions_results, "auctions_results_r3.parquet"),
    (auctions_users, "auctions_users_r3.parquet"),
    (impressions, "impressions_r3.parquet"),
    (clicks, "clicks_r3.parquet"),
    (purchases, "purchases_r3.parquet"),
    (catalog, "catalog_r3.parquet"),
):
    frame.to_parquet(OUTPUT_DIR / filename, index=False)

# Row-count recap of everything that was pulled.
banner = "=" * 50
print("\n" + banner)
print("DONE - Round 3")
print(banner)
sampled_user_count = auctions_users['USER_ID'].nunique()
print(f"auctions_users:   {len(auctions_users):,} ({sampled_user_count:,} users)")
print(f"auctions_results: {len(auctions_results):,}")
print(f"impressions:      {len(impressions):,}")
print(f"clicks:           {len(clicks):,}")
print(f"purchases:        {len(purchases):,}")
print(f"catalog:          {len(catalog):,}")

In [None]:
# CELL 3: DATA QUALITY CHECKS (from notes.md Q1-Q6)
print("="*60)
print("DATA QUALITY CHECKS")
print("="*60)

# Q1: Are impression timestamps unique within auction, or batched?
print("\n--- Q1: Impression timestamp uniqueness within auction ---")
# Per auction: number of distinct impression timestamps and number of
# (non-null) impression timestamps.
imp_ts = impressions.groupby('AUCTION_ID')['OCCURRED_AT'].agg(['nunique', 'count'])
# Only auctions with 2+ impressions can say anything about batching: an
# auction with a single impression trivially has one unique timestamp and
# would otherwise inflate the "batched" count and its denominator.
multi_imp = imp_ts[imp_ts['count'] > 1]
print(f"Auctions with >1 impression: {len(multi_imp):,}")
print(f"Among those, unique timestamps: {multi_imp['nunique'].describe().to_dict()}")
print(f"Batched (all same timestamp): {(multi_imp['nunique'] == 1).sum():,} / {len(multi_imp):,}")

# Q2: Distribution of positions per auction
# "Positions" is measured as the number of distinct products that received
# an impression within each auction.
print("\n--- Q2: Positions per auction ---")
products_by_auction = impressions.groupby('AUCTION_ID')['PRODUCT_ID']
positions_per_auction = products_by_auction.nunique()
print(positions_per_auction.describe())

# Q3: Maximum rank that receives impression
print("\n--- Q3: Max rank receiving impression ---")
# Attach each impression's auction ranking by matching on auction + product.
# NOTE(review): if auctions_results can hold several rows for the same
# (AUCTION_ID, PRODUCT_ID), this left join repeats impressions and the
# counts below are per joined row, not per impression - confirm.
rank_lookup = auctions_results[['AUCTION_ID', 'PRODUCT_ID', 'RANKING']]
imp_with_rank = pd.merge(
    impressions,
    rank_lookup,
    how='left',
    on=['AUCTION_ID', 'PRODUCT_ID'],
)
shown_rank = imp_with_rank['RANKING']
print(f"Impressions with ranking: {shown_rank.notna().sum():,} / {len(imp_with_rank):,}")
print("Ranking distribution for impressions:")
print(shown_rank.describe())
print(f"\nMax rank shown: {shown_rank.max()}")

# Q4: Products appearing at multiple positions
print("\n--- Q4: Product position variation across auctions ---")
# Ranking statistics per product across all auction result rows.
rank_by_product = auctions_results.groupby('PRODUCT_ID')['RANKING']
prod_pos = rank_by_product.agg(['mean', 'std', 'count', 'nunique'])
# Keep products with at least 5 non-null ranking rows.
prod_pos = prod_pos.loc[prod_pos['count'] >= 5]
varying_products = (prod_pos['nunique'] > 1).sum()
print(f"Products with 5+ auctions: {len(prod_pos):,}")
print(f"Avg unique positions per product: {prod_pos['nunique'].mean():.1f}")
print(f"Products with position variation (nunique > 1): {varying_products:,}")

# Q5: User auction frequency and position variation
print("\n--- Q5: User-level auction frequency ---")
# Number of auctions_users rows per user.
user_auctions = auctions_users.groupby('USER_ID').size()
heavy_users = (user_auctions >= 10).sum()
print("Auctions per user:")
print(user_auctions.describe())
print(f"\nUsers with 10+ auctions: {heavy_users:,}")

# Q6: Time between auctions for same user (session definition)
print("\n--- Q6: Time between auctions (session gaps) ---")
# Order each user's auctions chronologically, then difference consecutive
# timestamps within the user; each user's first auction has no predecessor
# and is dropped.
auctions_users_sorted = auctions_users.sort_values(['USER_ID', 'CREATED_AT'])
auctions_users_sorted['time_gap'] = (
    auctions_users_sorted.groupby('USER_ID')['CREATED_AT'].diff()
)
gaps = auctions_users_sorted['time_gap'].dropna().dt.total_seconds()
median_gap = gaps.median()
p75_gap = gaps.quantile(0.75)
print("Gap between auctions (seconds):")
print(gaps.describe())
print(f"\nMedian gap: {median_gap:.1f}s = {median_gap/60:.1f}min")
print(f"75th percentile: {p75_gap:.1f}s = {p75_gap/60:.1f}min")

# Additional: Purchase linkage check
# Compares the distinct product ids seen in purchases with those seen in
# impressions; all figures below count products, not purchase rows.
print("\n--- Purchase-to-impression linkage ---")
purchase_products = set(purchases['PRODUCT_ID'].unique())
impression_products = set(impressions['PRODUCT_ID'].unique())
overlap = purchase_products.intersection(impression_products)
never_shown = purchase_products.difference(impression_products)
print(f"Unique products in purchases: {len(purchase_products):,}")
print(f"Unique products in impressions: {len(impression_products):,}")
print(f"Overlap (promoted purchases): {len(overlap):,}")
print(f"Organic-only purchases: {len(never_shown):,}")

print("\n" + "=" * 60)