# 02 Canonical Tables

**Purpose:** Build auditable canonical tables for panel construction.

**Outputs:**
1. `PROMOTED_EVENTS` - One row per promoted click with full auction metadata
2. `PURCHASES_MAPPED` - All purchases, with vendor attribution filled in only where a promoted click links the user to the product
3. `EVENTS_WITH_SESSIONS` - Click and post-click purchase events with session IDs at multiple gap thresholds (1/2/3/5/7 days)

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Both locations are relative to the notebook's working directory.
DATA_DIR = Path('../eda/data')  # source parquet files are read from here
OUTPUT_DIR = Path('data')       # canonical tables are written here
OUTPUT_DIR.mkdir(exist_ok=True)

# Echo the absolute paths so the run log records exactly what was used.
for label, folder in (("Input", DATA_DIR), ("Output", OUTPUT_DIR)):
    print(f"{label}: {folder.resolve()}")

## 1. Load Source Tables

In [None]:
print("Loading source tables...")

# One parquet file per source table, all read from DATA_DIR.
clicks = pd.read_parquet(DATA_DIR / 'clicks_365d.parquet')
impressions = pd.read_parquet(DATA_DIR / 'impressions_365d.parquet')
bids = pd.read_parquet(DATA_DIR / 'auctions_results_365d.parquet')
auctions = pd.read_parquet(DATA_DIR / 'auctions_users_365d.parquet')
purchases = pd.read_parquet(DATA_DIR / 'purchases_365d.parquet')

# Row counts, as a quick sanity check on what was loaded.
for table_label, table in (
    ("Clicks", clicks),
    ("Impressions", impressions),
    ("Bids", bids),
    ("Auctions", auctions),
    ("Purchases", purchases),
):
    print(f"{table_label}: {len(table):,}")

In [None]:
# Parse timestamps: add a datetime column to each table, derived from its
# raw timestamp column (the raw column is left in place).
print("Parsing timestamps...")
for frame, raw_col, parsed_col in (
    (clicks, 'OCCURRED_AT', 'click_time'),
    (impressions, 'OCCURRED_AT', 'impression_time'),
    (bids, 'CREATED_AT', 'bid_time'),
    (auctions, 'CREATED_AT', 'auction_time'),
    (purchases, 'PURCHASED_AT', 'purchase_time'),
):
    frame[parsed_col] = pd.to_datetime(frame[raw_col])

print("Done.")

## 2. Build PROMOTED_EVENTS Table

Join chain: CLICKS → IMPRESSIONS → BIDS → AUCTIONS

In [None]:
print("=" * 80)
print("BUILDING PROMOTED_EVENTS")
print("=" * 80)

# Composite key (source column names) identifying one bid within an auction
COMPOSITE_KEYS = ['AUCTION_ID', 'PRODUCT_ID', 'VENDOR_ID', 'CAMPAIGN_ID']

# Clicks are the spine of the table: one row per click, with the source
# columns renamed to snake_case.
promoted_events = clicks[
    ['INTERACTION_ID', 'AUCTION_ID', 'PRODUCT_ID', 'USER_ID',
     'VENDOR_ID', 'CAMPAIGN_ID', 'click_time']
].rename(columns={
    'INTERACTION_ID': 'click_id',
    'AUCTION_ID': 'auction_id',
    'PRODUCT_ID': 'product_id',
    'USER_ID': 'user_id',
    'VENDOR_ID': 'vendor_id',
    'CAMPAIGN_ID': 'campaign_id',
})

print(f"Starting clicks: {len(promoted_events):,}")

In [None]:
# Attach the earliest impression timestamp to each click
print("\nJoining to impressions...")

imp_key_cols = ['auction_id', 'product_id', 'vendor_id', 'campaign_id']

# Lower-casing the selected source columns yields the snake_case names used
# in promoted_events ('impression_time' is already lower case).
impressions_slim = impressions[
    ['AUCTION_ID', 'PRODUCT_ID', 'VENDOR_ID', 'CAMPAIGN_ID', 'impression_time']
].rename(columns=str.lower)

# One row per composite key: sorting by time first means keep='first'
# retains the earliest impression, so the left merge cannot fan out clicks.
impressions_slim = impressions_slim.sort_values('impression_time')
impressions_slim = impressions_slim.drop_duplicates(subset=imp_key_cols, keep='first')

promoted_events = promoted_events.merge(impressions_slim, how='left', on=imp_key_cols)

# A non-null impression_time marks a click that found its impression.
has_impression = promoted_events['impression_time'].notna()
imp_match_rate = has_impression.mean() * 100
print(f"Clicks with impression match: {imp_match_rate:.1f}%")

In [None]:
# Join to bids to get auction metadata
print("\nJoining to bids...")

bid_key_cols = ['auction_id', 'product_id', 'vendor_id', 'campaign_id']

bid_cols = ['AUCTION_ID', 'PRODUCT_ID', 'VENDOR_ID', 'CAMPAIGN_ID',
            'RANKING', 'IS_WINNER', 'FINAL_BID', 'QUALITY', 'PACING',
            'CONVERSION_RATE', 'PRICE']
# Metadata columns are optional; keep whichever ones this extract has.
bid_cols_available = [c for c in bid_cols if c in bids.columns]

bids_slim = bids[bid_cols_available].copy()
bids_slim.columns = [c.lower() for c in bids_slim.columns]

# Dedupe bids to one row per composite key, preferring the winning bid and
# then the lowest ranking value (assumes ranking 1 is the best slot, as the
# original ordering did). Only preference columns that are actually present
# are used, so a missing IS_WINNER or RANKING column neither skips the
# ordering entirely nor raises.
dedupe_order = [
    (col, ascending)
    for col, ascending in (('is_winner', False), ('ranking', True))
    if col in bids_slim.columns
]
if dedupe_order:
    bids_slim = bids_slim.sort_values(
        [col for col, _ in dedupe_order],
        ascending=[ascending for _, ascending in dedupe_order],
        kind='stable',
    )
bids_slim = bids_slim.drop_duplicates(subset=bid_key_cols, keep='first')

# The merge indicator records whether a bid row was found for each click,
# independent of whether optional columns such as `ranking` exist or are
# null. The helper column is removed again right after it is read.
promoted_events = promoted_events.merge(
    bids_slim,
    on=bid_key_cols,
    how='left',
    indicator='_bid_merge',
)

bid_match_rate = (promoted_events.pop('_bid_merge') == 'both').mean() * 100
print(f"Clicks with bid match: {bid_match_rate:.1f}%")

In [None]:
# Attach auction-level fields (opaque user id, and placement when present)
print("\nJoining to auctions...")

# Source column -> output column; PLACEMENT is optional in the extract.
auction_col_map = {'AUCTION_ID': 'auction_id', 'OPAQUE_USER_ID': 'opaque_user_id'}
if 'PLACEMENT' in auctions.columns:
    auction_col_map['PLACEMENT'] = 'placement'

# One row per auction so the left merge cannot duplicate clicks.
auctions_slim = (
    auctions[list(auction_col_map)]
    .rename(columns=auction_col_map)
    .drop_duplicates(subset=['auction_id'], keep='first')
)

promoted_events = promoted_events.merge(auctions_slim, how='left', on=['auction_id'])

# A non-null opaque_user_id is used as the marker of a matched auction.
has_auction = promoted_events['opaque_user_id'].notna()
auction_match_rate = has_auction.mean() * 100
print(f"Clicks with auction match: {auction_match_rate:.1f}%")

In [None]:
# Report the shape and coverage of PROMOTED_EVENTS, then persist it
rule = "=" * 80
print("\n" + rule)
print("PROMOTED_EVENTS SUMMARY")
print(rule)

n_rows = len(promoted_events)
column_names = list(promoted_events.columns)
print(f"Rows: {n_rows:,}")
print(f"Columns: {column_names}")

n_users = promoted_events['user_id'].nunique()
n_vendors = promoted_events['vendor_id'].nunique()
n_products = promoted_events['product_id'].nunique()
print(f"\nUnique users: {n_users:,}")
print(f"Unique vendors: {n_vendors:,}")
print(f"Unique products: {n_products:,}")

first_click = promoted_events['click_time'].min()
last_click = promoted_events['click_time'].max()
print(f"Date range: {first_click} to {last_click}")

# Write the table without the pandas index.
promoted_events_path = OUTPUT_DIR / 'promoted_events.parquet'
promoted_events.to_parquet(promoted_events_path, index=False)
print(f"\nSaved to {promoted_events_path}")

## 3. Build PURCHASES_MAPPED Table

Map purchases to vendors via promoted journey linkage only.

In [None]:
print("=" * 80)
print("BUILDING PURCHASES_MAPPED")
print("=" * 80)

# Purchases are the spine of this table; source columns become snake_case
# ('purchase_time' was already added during timestamp parsing).
purchases_mapped = purchases[
    ['PURCHASE_ID', 'USER_ID', 'PRODUCT_ID', 'purchase_time',
     'QUANTITY', 'UNIT_PRICE', 'PURCHASE_LINE']
].rename(columns={
    'PURCHASE_ID': 'purchase_id',
    'USER_ID': 'user_id',
    'PRODUCT_ID': 'product_id',
    'QUANTITY': 'quantity',
    'UNIT_PRICE': 'unit_price',
    'PURCHASE_LINE': 'purchase_line',
})

# Line spend = quantity x unit price, divided by 100 (cents to dollars,
# per the original note on this step).
purchases_mapped['spend'] = (
    purchases_mapped['quantity'].mul(purchases_mapped['unit_price']).div(100)
)

n_purchases = len(purchases_mapped)
spend_sum = purchases_mapped['spend'].sum()
print(f"Total purchases: {n_purchases:,}")
print(f"Total spend: ${spend_sum:,.2f}")

In [None]:
# Get promoted click info for each (user, product).
# Attribution uses the EARLIEST click, and every attributed field (vendor,
# campaign, ranking, winner flag) is read from that same click row.
# The previous groupby/agg paired min(click_time) with 'first' for the other
# columns; 'first' follows row order and skips nulls column by column, so the
# attributed values could come from a later click, or from several clicks.
pair_keys = ['user_id', 'product_id']
attribution_cols = ['click_time', 'vendor_id', 'campaign_id', 'ranking', 'is_winner']

click_info = (
    promoted_events
    # groupby ignored rows with a null key; keep that behaviour explicitly.
    .dropna(subset=pair_keys)
    # Stable sort: earliest click first, ties keep their original row order.
    .sort_values('click_time', kind='stable')
    .drop_duplicates(subset=pair_keys, keep='first')
    # Same key ordering the groupby result had.
    .sort_values(pair_keys)
    .loc[:, pair_keys + attribution_cols]
    .reset_index(drop=True)
)
click_info.columns = ['user_id', 'product_id', 'first_click_time',
                      'click_vendor_id', 'click_campaign_id',
                      'click_ranking', 'click_is_winner']

print(f"\nUnique (user, product) pairs with clicks: {len(click_info):,}")

In [None]:
# Bring the per-(user, product) click attribution onto every purchase.
# Left merge: purchases without a promoted click are kept with null click
# fields.
purchases_mapped = purchases_mapped.merge(
    click_info, how='left', on=['user_id', 'product_id']
)

# A purchase is promoted-linked when a click exists for its (user, product).
linked_mask = purchases_mapped['first_click_time'].notna()
purchases_mapped['is_promoted_linked'] = linked_mask

# Hours from first click to purchase; NaN for purchases with no linked click.
click_to_purchase = purchases_mapped['purchase_time'] - purchases_mapped['first_click_time']
hours_since_click = click_to_purchase.dt.total_seconds() / 3600
purchases_mapped['click_to_purchase_hours'] = hours_since_click.where(linked_mask, np.nan)

# Valid attribution: linked, and the purchase did not precede the click.
# NaN lags compare as False, so unlinked purchases are never flagged.
purchases_mapped['is_post_click'] = (
    linked_mask & purchases_mapped['click_to_purchase_hours'].ge(0)
)

In [None]:
# Coverage report: how many purchases, and how much spend, the promoted
# linkage explains.
linked_flag = purchases_mapped['is_promoted_linked']
valid_flag = purchases_mapped['is_post_click']

print("\n--- Mapping Summary ---")
print(f"Total purchases: {len(purchases_mapped):,}")
print(f"Promoted-linked: {linked_flag.sum():,} ({linked_flag.mean()*100:.1f}%)")
print(f"Post-click (valid): {valid_flag.sum():,} ({valid_flag.mean()*100:.1f}%)")

spend = purchases_mapped['spend']
total_spend = spend.sum()
linked_spend = spend[linked_flag].sum()
valid_spend = spend[valid_flag].sum()

print("\n--- Spend Coverage ---")
print(f"Total spend: ${total_spend:,.2f}")
print(f"Promoted-linked spend: ${linked_spend:,.2f} ({linked_spend/total_spend*100:.1f}%)")
print(f"Valid post-click spend: ${valid_spend:,.2f} ({valid_spend/total_spend*100:.1f}%)")

In [None]:
# Distribution of the click-to-purchase lag, restricted to valid
# (post-click) purchases
valid_purchases = purchases_mapped[purchases_mapped['is_post_click']].copy()
valid_lag_hours = valid_purchases['click_to_purchase_hours']

print("\n--- Click-to-Purchase Lag (valid only) ---")
print(valid_lag_hours.describe())

print("\nPercentiles (hours):")
percentile_points = [10, 25, 50, 75, 90, 95, 99]
# Compute all quantiles in one call, then report each in hours and days.
lag_quantiles = valid_lag_hours.quantile([pct / 100 for pct in percentile_points])
for p, val in zip(percentile_points, lag_quantiles):
    print(f"  P{p}: {val:.1f}h ({val/24:.1f} days)")

In [None]:
# Persist PURCHASES_MAPPED (all purchases, linked or not) without the index
purchases_mapped_path = OUTPUT_DIR / 'purchases_mapped.parquet'
purchases_mapped.to_parquet(purchases_mapped_path, index=False)
print(f"\nSaved to {purchases_mapped_path}")

## 4. Build Session IDs with Multiple Gap Thresholds

In [None]:
print("=" * 80)
print("BUILDING SESSION IDS")
print("=" * 80)

# Build one event stream per user for sessionization.
# Events included: promoted clicks and valid post-click purchases.

# Clicks: the click time becomes the event timestamp.
click_events = (
    promoted_events[['user_id', 'click_time', 'vendor_id', 'product_id']]
    .rename(columns={'click_time': 'timestamp'})
    .assign(event_type='click')
)

# Purchases: only post-click valid ones, carrying the vendor attributed
# from the click plus the purchase spend.
valid_purchase_rows = purchases_mapped['is_post_click']
purchase_events = (
    purchases_mapped.loc[
        valid_purchase_rows,
        ['user_id', 'purchase_time', 'click_vendor_id', 'product_id', 'spend'],
    ]
    .rename(columns={'purchase_time': 'timestamp', 'click_vendor_id': 'vendor_id'})
    .assign(event_type='purchase')
)

events_list = [click_events, purchase_events]

# Stack both event types and order each user's events chronologically.
all_events = (
    pd.concat(events_list, ignore_index=True)
    .sort_values(['user_id', 'timestamp'])
    .reset_index(drop=True)
)

print(f"Total events: {len(all_events):,}")
print(f"Unique users: {all_events['user_id'].nunique():,}")

In [None]:
# Assign session IDs under several inactivity thresholds
SESSION_GAPS = [1, 2, 3, 5, 7]  # days

print("\nCreating session IDs for multiple gap thresholds...")

# Time elapsed since the same user's previous event (null on a user's first
# event). Relies on all_events already being ordered by user and timestamp.
elapsed = all_events.groupby('user_id')['timestamp'].diff()
is_first_event = elapsed.isnull()
user_label = all_events['user_id'].astype(str)

for gap_days in tqdm(SESSION_GAPS, desc="Session gaps"):
    gap_threshold = pd.Timedelta(days=gap_days)
    col_name = f'session_id_{gap_days}d'

    # A session starts on a user's first event or after a gap longer than
    # the threshold.
    starts_session = is_first_event | (elapsed > gap_threshold)

    # Running count of session starts within each user = session number.
    session_number = starts_session.groupby(all_events['user_id']).cumsum()

    # Session ID is '<user_id>_S<session number>'.
    all_events[col_name] = user_label + '_S' + session_number.astype(str)

In [None]:
# How many sessions each gap threshold produces, overall and per user
print("\n--- Session Counts by Gap Threshold ---")
n_event_users = all_events['user_id'].nunique()
for gap_days in SESSION_GAPS:
    col = f'session_id_{gap_days}d'
    n_sessions = all_events[col].nunique()
    avg_per_user = n_sessions / n_event_users
    print(f"  {gap_days}-day gap: {n_sessions:,} sessions ({avg_per_user:.1f} per user)")

In [None]:
# Add week index.
# Week number and year are both taken from the ISO calendar so they always
# describe the same week. Pairing the ISO week with the calendar year
# mislabels days around New Year: 2024-12-30 belongs to ISO week 1 of 2025,
# and calendar year + ISO week would tag it '2024_W01', merging it with the
# first week of January 2024.
iso_calendar = all_events['timestamp'].dt.isocalendar()
all_events['week'] = iso_calendar['week']
all_events['year'] = iso_calendar['year']  # ISO year, consistent with 'week'
all_events['year_week'] = (
    all_events['year'].astype(str) + '_W' + all_events['week'].astype(str).str.zfill(2)
)

print(f"\nWeeks in data: {all_events['year_week'].nunique()}")
# Zero-padded labels sort lexicographically in chronological order.
print(f"Range: {all_events['year_week'].min()} to {all_events['year_week'].max()}")

In [None]:
# Persist the sessionized event stream without the index, then list its
# final columns
events_path = OUTPUT_DIR / 'events_with_sessions.parquet'
all_events.to_parquet(events_path, index=False)
print(f"\nSaved to {events_path}")

event_columns = list(all_events.columns)
print(f"\nColumns: {event_columns}")

## 5. Summary Statistics

In [None]:
# Closing recap of the three canonical tables built above
rule = "=" * 80
print(rule)
print("CANONICAL TABLES SUMMARY")
print(rule)

print("\n1. PROMOTED_EVENTS")
print(f"   Rows: {len(promoted_events):,}")
print(f"   Users: {promoted_events['user_id'].nunique():,}")
print(f"   Vendors: {promoted_events['vendor_id'].nunique():,}")

# "Valid" means the purchase is linked to a click and happened after it.
valid_mask = purchases_mapped['is_post_click']
print("\n2. PURCHASES_MAPPED")
print(f"   Total purchases: {len(purchases_mapped):,}")
print(f"   Valid (post-click): {valid_mask.sum():,}")
print(f"   Valid spend: ${purchases_mapped.loc[valid_mask, 'spend'].sum():,.2f}")

print("\n3. EVENTS_WITH_SESSIONS")
print(f"   Events: {len(all_events):,}")
print(f"   Users: {all_events['user_id'].nunique():,}")
print(f"   Weeks: {all_events['year_week'].nunique()}")
for gap in SESSION_GAPS:
    session_col = f'session_id_{gap}d'
    print(f"   Sessions ({gap}d gap): {all_events[session_col].nunique():,}")

In [None]:
# Final banner plus an inventory of the parquet files in the output folder
rule = "=" * 80
print("\n" + rule)
print("CANONICAL TABLES COMPLETE")
print(rule)

print("\nOutput files in data/:")
for f in OUTPUT_DIR.glob('*.parquet'):
    # Size is reported in decimal megabytes (bytes / 1e6).
    size_bytes = f.stat().st_size
    size_mb = size_bytes / 1e6
    print(f"  {f.name}: {size_mb:.1f} MB")

print("\nReady for 03_panel_construction.ipynb")