In [12]:
# CELL 1: IMPORTS + CONNECTION (run once, handles duo)
import os
import textwrap
from pathlib import Path
import warnings
import pandas as pd
from dotenv import load_dotenv
import snowflake.connector
from tqdm import tqdm

warnings.filterwarnings('ignore')
load_dotenv()

OUTPUT_DIR = Path("./data")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

conn = snowflake.connector.connect(
    user=os.getenv('SNOWFLAKE_USER'),
    password=os.getenv('SNOWFLAKE_PASSWORD'),
    account=os.getenv('SNOWFLAKE_ACCOUNT'),
    warehouse=os.getenv('SNOWFLAKE_WAREHOUSE', 'COMPUTE_WH'),
    database='INCREMENTALITY',
    schema='INCREMENTALITY_RESEARCH'
)
print("[SUCCESS] Connected to Snowflake")

[SUCCESS] Connected to Snowflake


In [13]:
# CELL 2: DATA PULL (rerun this to adjust time window)
# CONFIG
MINUTES_WINDOW = 15  # 15 minutes
SAMPLE_FRACTION = 0.01  # 1% of users
TOTAL_BUCKETS = 10000
SELECTION_THRESHOLD = int(TOTAL_BUCKETS * SAMPLE_FRACTION)

print(f"Pulling {MINUTES_WINDOW}min, ALL placements, {SAMPLE_FRACTION:.0%} users...")

# CTE for deterministic user sampling (all placements)
CTE_SQL = f"""
WITH SAMPLED_USERS AS (
    SELECT OPAQUE_USER_ID FROM (
        SELECT OPAQUE_USER_ID, MOD(ABS(HASH(OPAQUE_USER_ID)), {TOTAL_BUCKETS}) AS bucket
        FROM (SELECT DISTINCT OPAQUE_USER_ID FROM AUCTIONS_USERS 
              WHERE CREATED_AT >= DATEADD(minute, -{MINUTES_WINDOW}, CURRENT_TIMESTAMP()))
    ) WHERE bucket < {SELECTION_THRESHOLD}
)
"""

# 1. AUCTIONS_USERS
print("\n1/5 AUCTIONS_USERS...")
auctions_users = pd.read_sql(CTE_SQL + f"""
SELECT LOWER(TO_VARCHAR(au.AUCTION_ID, 'HEX')) AS AUCTION_ID,
       au.OPAQUE_USER_ID AS USER_ID, au.PLACEMENT, au.CREATED_AT
FROM AUCTIONS_USERS au
JOIN SAMPLED_USERS s ON au.OPAQUE_USER_ID = s.OPAQUE_USER_ID
WHERE au.CREATED_AT >= DATEADD(minute, -{MINUTES_WINDOW}, CURRENT_TIMESTAMP())
""", conn)
print(f"  {len(auctions_users):,} rows, {auctions_users['USER_ID'].nunique():,} users")
print(f"  Placements: {auctions_users['PLACEMENT'].value_counts().to_dict()}")

# 2. AUCTIONS_RESULTS
print("\n2/5 AUCTIONS_RESULTS...")
auctions_results = pd.read_sql(CTE_SQL + f"""
SELECT LOWER(TO_VARCHAR(ar.AUCTION_ID, 'HEX')) AS AUCTION_ID,
       LOWER(TO_VARCHAR(ar.VENDOR_ID, 'HEX')) AS VENDOR_ID,
       LOWER(TO_VARCHAR(ar.CAMPAIGN_ID, 'HEX')) AS CAMPAIGN_ID,
       LOWER(TRIM(ar.PRODUCT_ID)) AS PRODUCT_ID,
       ar.RANKING, ar.IS_WINNER, ar.FINAL_BID, ar.QUALITY,
       ar.CONVERSION_RATE, ar.PACING, ar.PRICE, ar.CREATED_AT
FROM AUCTIONS_RESULTS ar
JOIN AUCTIONS_USERS au ON ar.AUCTION_ID = au.AUCTION_ID
JOIN SAMPLED_USERS s ON au.OPAQUE_USER_ID = s.OPAQUE_USER_ID
WHERE ar.CREATED_AT >= DATEADD(minute, -{MINUTES_WINDOW}, CURRENT_TIMESTAMP())
""", conn)
print(f"  {len(auctions_results):,} rows")

# 3. IMPRESSIONS
print("\n3/5 IMPRESSIONS...")
impressions = pd.read_sql(CTE_SQL + f"""
SELECT i.INTERACTION_ID, LOWER(REPLACE(i.AUCTION_ID, '-', '')) AS AUCTION_ID,
       LOWER(TRIM(i.PRODUCT_ID)) AS PRODUCT_ID, i.USER_ID,
       LOWER(REPLACE(i.CAMPAIGN_ID, '-', '')) AS CAMPAIGN_ID,
       LOWER(REPLACE(i.VENDOR_ID, '-', '')) AS VENDOR_ID, i.OCCURRED_AT
FROM IMPRESSIONS i
JOIN SAMPLED_USERS s ON i.USER_ID = s.OPAQUE_USER_ID
WHERE i.OCCURRED_AT >= DATEADD(minute, -{MINUTES_WINDOW}, CURRENT_TIMESTAMP())
""", conn)
print(f"  {len(impressions):,} rows")

# 4. CLICKS
print("\n4/5 CLICKS...")
clicks = pd.read_sql(CTE_SQL + f"""
SELECT c.INTERACTION_ID, LOWER(REPLACE(c.AUCTION_ID, '-', '')) AS AUCTION_ID,
       LOWER(TRIM(c.PRODUCT_ID)) AS PRODUCT_ID, c.USER_ID,
       LOWER(REPLACE(c.CAMPAIGN_ID, '-', '')) AS CAMPAIGN_ID,
       LOWER(REPLACE(c.VENDOR_ID, '-', '')) AS VENDOR_ID, c.OCCURRED_AT
FROM CLICKS c
JOIN SAMPLED_USERS s ON c.USER_ID = s.OPAQUE_USER_ID
WHERE c.OCCURRED_AT >= DATEADD(minute, -{MINUTES_WINDOW}, CURRENT_TIMESTAMP())
""", conn)
print(f"  {len(clicks):,} rows")

# 5. CATALOG
print("\n5/5 CATALOG...")
product_ids = auctions_results['PRODUCT_ID'].dropna().unique().tolist()
print(f"  Products to fetch: {len(product_ids):,}")
if len(product_ids) > 0:
    batch_size = 10000
    catalog_dfs = []
    for i in tqdm(range(0, len(product_ids), batch_size), desc="Catalog"):
        batch = product_ids[i:i+batch_size]
        placeholders = ', '.join(['%s'] * len(batch))
        batch_df = pd.read_sql(f"""
        SELECT LOWER(TRIM(PRODUCT_ID)) AS PRODUCT_ID, NAME, PRICE AS CATALOG_PRICE,
               ACTIVE, IS_DELETED, CATEGORIES, DESCRIPTION
        FROM CATALOG WHERE LOWER(TRIM(PRODUCT_ID)) IN ({placeholders})
        """, conn, params=batch)
        catalog_dfs.append(batch_df)
    catalog = pd.concat(catalog_dfs, ignore_index=True) if catalog_dfs else pd.DataFrame()
else:
    catalog = pd.DataFrame()
print(f"  {len(catalog):,} rows")

# SAVE
print("\nSaving parquet files...")
auctions_results.to_parquet(OUTPUT_DIR / "auctions_results_all.parquet", index=False)
auctions_users.to_parquet(OUTPUT_DIR / "auctions_users_all.parquet", index=False)
impressions.to_parquet(OUTPUT_DIR / "impressions_all.parquet", index=False)
clicks.to_parquet(OUTPUT_DIR / "clicks_all.parquet", index=False)
catalog.to_parquet(OUTPUT_DIR / "catalog_all.parquet", index=False)

print("\n" + "="*50)
print("DONE")
print("="*50)
print(f"auctions_users:   {len(auctions_users):,} ({auctions_users['USER_ID'].nunique():,} users)")
print(f"auctions_results: {len(auctions_results):,}")
print(f"impressions:      {len(impressions):,}")
print(f"clicks:           {len(clicks):,}")
print(f"catalog:          {len(catalog):,}")

Pulling 15min, ALL placements, 1% users...

1/5 AUCTIONS_USERS...
  78,318 rows, 7,239 users
  Placements: {'3': 46844, '1': 14543, '5': 8606, '2': 8325}

2/5 AUCTIONS_RESULTS...
  3,748,381 rows

3/5 IMPRESSIONS...
  192,307 rows

4/5 CLICKS...
  6,090 rows

5/5 CATALOG...
  Products to fetch: 1,043,111


Catalog: 100%|██████████| 105/105 [1:51:19<00:00, 63.62s/it]


  1,039,783 rows

Saving parquet files...

DONE
auctions_users:   78,318 (7,239 users)
auctions_results: 3,748,381
impressions:      192,307
clicks:           6,090
catalog:          1,039,783


In [None]:
print("="*60)                                                                                                        
print("DIAGNOSTIC QUERIES")                                                                                          
print("="*60)                                                                                                        
                                                                                                                    
# 1. What predicts RANKING?                                                                                          
print("\n--- Q1: What predicts RANKING? ---")                                                                        
q1 = pd.read_sql("""                                                                                                 
SELECT                                                                                                               
    CORR(RANKING, FINAL_BID) as corr_bid,                                                                            
    CORR(RANKING, QUALITY) as corr_quality,                                                                          
    CORR(RANKING, QUALITY * FINAL_BID) as corr_score,                                                                
    CORR(RANKING, CONVERSION_RATE * FINAL_BID) as corr_cvr_bid,                                                      
    CORR(RANKING, CONVERSION_RATE) as corr_cvr                                                                       
FROM AUCTIONS_RESULTS                                                                                                
WHERE CREATED_AT >= DATEADD(hour, -24, CURRENT_TIMESTAMP())                                                          
""", conn)                                                                                                           
print(q1.T.to_string())                                                                                              
                                                                                                                    
# 2. Placement types                                                                                                 
print("\n--- Q2: What are the placement types? ---")                                                                 
q2 = pd.read_sql("""                                                                                                 
SELECT PLACEMENT, COUNT(*) as n_auctions                                                                             
FROM AUCTIONS_USERS                                                                                                  
WHERE CREATED_AT >= DATEADD(hour, -24, CURRENT_TIMESTAMP())                                                          
GROUP BY PLACEMENT                                                                                                   
ORDER BY n_auctions DESC                                                                                             
""", conn)                                                                                                           
print(q2.to_string())                                                                                                
                                                                                                                    
# 3. Impressions per auction                                                                                         
print("\n--- Q3: Impressions per auction (how many slots shown?) ---")                                               
q3 = pd.read_sql("""                                                                                                 
SELECT                                                                                                               
    AVG(products_shown) as avg_shown,                                                                                
    MEDIAN(products_shown) as median_shown,                                                                          
    MIN(products_shown) as min_shown,                                                                                
    MAX(products_shown) as max_shown,                                                                                
    COUNT(*) as n_auctions                                                                                           
FROM (                                                                                                               
    SELECT AUCTION_ID, COUNT(DISTINCT PRODUCT_ID) as products_shown                                                  
    FROM IMPRESSIONS                                                                                                 
    WHERE OCCURRED_AT >= DATEADD(hour, -24, CURRENT_TIMESTAMP())                                                     
    GROUP BY AUCTION_ID                                                                                              
)                                                                                                                    
""", conn)                                                                                                           
print(q3.to_string())                                                                                                
                                                                                                                    
# 4. Winners vs impressions                                                                                          
print("\n--- Q4: Why do winners not get impressions? ---")                                                           
q4 = pd.read_sql("""                                                                                                 
SELECT                                                                                                               
    ar.IS_WINNER,                                                                                                    
    COUNT(*) as n_bids,                                                                                              
    COUNT(DISTINCT CASE WHEN i.AUCTION_ID IS NOT NULL                                                                
        THEN LOWER(TO_VARCHAR(ar.AUCTION_ID, 'HEX')) || LOWER(TRIM(ar.PRODUCT_ID)) END) as got_impression          
FROM AUCTIONS_RESULTS ar                                                                                             
LEFT JOIN IMPRESSIONS i                                                                                              
    ON LOWER(TO_VARCHAR(ar.AUCTION_ID, 'HEX')) = LOWER(REPLACE(i.AUCTION_ID, '-', ''))                               
    AND LOWER(TRIM(ar.PRODUCT_ID)) = LOWER(TRIM(i.PRODUCT_ID))                                                       
WHERE ar.CREATED_AT >= DATEADD(hour, -24, CURRENT_TIMESTAMP())                                                       
GROUP BY ar.IS_WINNER                                                                                                
""", conn)                                                                                                           
print(q4.to_string())                                                                                                
                                                                                                                    
# 5. Max RANKING among winners                                                                                       
print("\n--- Q5: Max RANKING among winners (how many slots?) ---")                                                   
q5 = pd.read_sql("""                                                                                                 
SELECT                                                                                                               
    MAX(RANKING) as max_winner_rank,                                                                                 
    MIN(RANKING) as min_winner_rank,                                                                                 
    AVG(RANKING) as avg_winner_rank,                                                                                 
    COUNT(*) as n_winners                                                                                            
FROM AUCTIONS_RESULTS                                                                                                
WHERE IS_WINNER = TRUE                                                                                               
AND CREATED_AT >= DATEADD(hour, -24, CURRENT_TIMESTAMP())                                                          
""", conn)                                                                                                           
print(q5.to_string())                                                                                                
                                                                                                                    
# 6. Check ranking formula candidates                                                                                
print("\n--- Q6: Rank prediction accuracy by formula ---")                                                           
sample = pd.read_sql("""                                                                                             
SELECT LOWER(TO_VARCHAR(AUCTION_ID, 'HEX')) as AUCTION_ID, PRODUCT_ID, RANKING, FINAL_BID, QUALITY, CONVERSION_RATE, 
PACING                                                                                                               
FROM AUCTIONS_RESULTS                                                                                                
WHERE CREATED_AT >= DATEADD(hour, -1, CURRENT_TIMESTAMP())                                                           
LIMIT 100000                                                                                                         
""", conn)                                                                                                           
                                                                                                                    
def check_rank_match(grp, score_col):                                                                                
    grp = grp.copy()                                                                                                 
    grp['predicted_rank'] = grp[score_col].rank(ascending=False, method='first')                                     
    return (grp['RANKING'] == grp['predicted_rank']).mean()                                                          
                                                                                                                    
sample['score_bid'] = sample['FINAL_BID']                                                                            
sample['score_quality_bid'] = sample['QUALITY'] * sample['FINAL_BID']                                                
sample['score_cvr_bid'] = sample['CONVERSION_RATE'] * sample['FINAL_BID']                                            
sample['score_quality'] = sample['QUALITY']                                                                          
sample['score_pacing_bid'] = sample['PACING'] * sample['FINAL_BID']                                                  
                                                                                                                    
for score in ['score_bid', 'score_quality_bid', 'score_cvr_bid', 'score_quality', 'score_pacing_bid']:               
    match = sample.groupby('AUCTION_ID').apply(lambda g: check_rank_match(g, score)).mean()                          
    print(f"  {score}: {match:.1%} match")                                                                           
                                                                                                                    
print("\n" + "="*60)  

DIAGNOSTIC QUERIES

--- Q1: What predicts RANKING? ---
                     0
CORR_BID     -0.138181
CORR_QUALITY -0.069196
CORR_SCORE   -0.116432
CORR_CVR_BID -0.101949
CORR_CVR      0.015812

--- Q2: What are the placement types? ---
  PLACEMENT  N_AUCTIONS
0         3    15407865
1         1     5205703
2         5     3501237
3         2     2744936
4         4       95057

--- Q3: Impressions per auction (how many slots shown?) ---
   AVG_SHOWN  MEDIAN_SHOWN  MIN_SHOWN  MAX_SHOWN  N_AUCTIONS
0   7.043868           4.0          1         64     9026665

--- Q4: Why do winners not get impressions? ---


KeyboardInterrupt: 

In [18]:
                                                                                            
# Q1: Timestamp batching check                                                                
print("Q1: Timestamp uniqueness within auctions")                                             
ts_check = imp.groupby('AUCTION_ID').agg({                                                    
    'OCCURRED_AT': ['count', 'nunique']                                                       
}).reset_index()                                                                              
ts_check.columns = ['AUCTION_ID', 'n_impressions', 'n_unique_ts']                             
ts_check['uniqueness_ratio'] = ts_check['n_unique_ts'] / ts_check['n_impressions']            
print(f"  Mean uniqueness ratio: {ts_check['uniqueness_ratio'].mean():.3f}")                  
print(f"  Auctions with all same timestamp: {(ts_check['uniqueness_ratio'] ==                 
1/ts_check['n_impressions']).sum()}")                                                         
                                                                                            
# Q3: Max rank with impression                                                                
imp_with_rank = imp.merge(winners[['AUCTION_ID', 'PRODUCT_ID', 'RANKING']],                   
                        on=['AUCTION_ID', 'PRODUCT_ID'], how='left')                        
print(f"\nQ3: Max rank ever getting impression: {imp_with_rank['RANKING'].max()}")            
print(f"    P99 rank: {imp_with_rank['RANKING'].quantile(0.99)}")                             
                                                                                            
# Q5: Users with more auctions -> more rank variation?                                        
user_auction_counts = ar.groupby('USER_ID')['AUCTION_ID'].nunique().reset_index()             
user_auction_counts.columns = ['USER_ID', 'n_auctions']                                       
user_rank_var = ar.groupby(['USER_ID', 'PRODUCT_ID'])['RANKING'].std().reset_index()          
user_rank_var.columns = ['USER_ID', 'PRODUCT_ID', 'rank_std']                                 
user_rank_var = user_rank_var.groupby('USER_ID')['rank_std'].mean().reset_index()             
merged = user_auction_counts.merge(user_rank_var, on='USER_ID')                               
print(f"\nQ5: Correlation(n_auctions, mean_rank_std):                                         
{merged['n_auctions'].corr(merged['rank_std']):.3f}")                                         
                                                                                            
# Q6: Time between auctions for same user                                                     
au_sorted = au.sort_values(['USER_ID', 'CREATED_AT'])                                         
au_sorted['prev_time'] = au_sorted.groupby('USER_ID')['CREATED_AT'].shift(1)                  
au_sorted['gap_minutes'] = (pd.to_datetime(au_sorted['CREATED_AT']) -                         
                            pd.to_datetime(au_sorted['prev_time'])).dt.total_seconds() / 60   
gaps = au_sorted['gap_minutes'].dropna()                                                      
print(f"\nQ6: Time between auctions (same user):")                                            
print(f"    Median: {gaps.median():.1f} min")                                                 
print(f"    Mean: {gaps.mean():.1f} min")                                                     
print(f"    P25: {gaps.quantile(0.25):.1f} min")                                              
print(f"    P75: {gaps.quantile(0.75):.1f} min")                                              
print(f"    % gaps < 30 min (same session): {(gaps < 30).mean()*100:.1f}%")                   
                                                                

SyntaxError: unterminated string literal (detected at line 9) (3626487021.py, line 9)