# 01 Data Audit

**Purpose:** Validate data integrity, join rates, and purchase mappability before panel construction.

**Data Source:** `eda/data/` 365-day files

**Outputs:**
- Row counts, date ranges, unique entities
- Join rate diagnostics (composite key validation)
- Purchase mappability to promoted journey

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Folder holding the 365-day parquet extracts, relative to the notebook's
# working directory. Printed resolved so a wrong cwd is obvious immediately.
DATA_DIR = Path('..') / 'eda' / 'data'
resolved_dir = DATA_DIR.resolve()
dir_exists = DATA_DIR.exists()
print(f"Data directory: {resolved_dir}")
print(f"Exists: {dir_exists}")

## 1. Load All Source Tables

In [None]:
print("Loading source tables...")

# (logical table name, parquet file expected under DATA_DIR)
files = [
    ('auctions_users', 'auctions_users_365d.parquet'),
    ('auctions_results', 'auctions_results_365d.parquet'),
    ('impressions', 'impressions_365d.parquet'),
    ('clicks', 'clicks_365d.parquet'),
    ('purchases', 'purchases_365d.parquet'),
    ('catalog', 'catalog_365d.parquet'),
]

# Loaded frames keyed by logical name; a missing file is reported and simply
# absent from this dict, so later cells check `name in tables` before use.
tables = {}

for name, filename in tqdm(files, desc="Loading tables"):
    path = DATA_DIR / filename
    if not path.exists():
        print(f"  {name}: FILE NOT FOUND at {path}")
        continue
    frame = pd.read_parquet(path)
    tables[name] = frame
    n_rows, n_cols = frame.shape
    print(f"  {name}: {n_rows:,} rows, {n_cols} cols")

print(f"\nLoaded {len(tables)} tables")

## 2. Basic Statistics

In [None]:
print("=" * 80)
print("TABLE SUMMARY")
print("=" * 80)

# Per-table overview: row count, column list, memory footprint, and the
# min/max of every column that looks like a timestamp.
for name, df in tables.items():
    print(f"\n--- {name.upper()} ---")
    print(f"Rows: {len(df):,}")
    print(f"Columns: {list(df.columns)}")
    print(f"Memory: {df.memory_usage(deep=True).sum() / 1e6:.1f} MB")

    # Timestamp candidates by name. 'AT' must be a whole underscore-separated
    # token (e.g. OCCURRED_AT) so names like CATEGORY or CATALOG_ID are not
    # picked up by accident; TIME / DATE are still matched as substrings.
    ts_cols = [
        c for c in df.columns
        if 'AT' in c.upper().split('_') or 'TIME' in c.upper() or 'DATE' in c.upper()
    ]
    for col in ts_cols:
        # pd.to_datetime would read plain numbers as epoch offsets and the
        # assignment below would then overwrite a numeric column with
        # 1970-era timestamps, so numeric/bool columns are left alone.
        if pd.api.types.is_numeric_dtype(df[col]):
            continue
        try:
            parsed = pd.to_datetime(df[col])
        except (ValueError, TypeError, OverflowError) as err:
            # Best-effort: a name match that is not actually a timestamp is
            # reported instead of being silently swallowed.
            print(f"{col}: not parseable as datetime ({type(err).__name__}); left unchanged")
            continue
        # The parsed column is written back to the table (as before), so the
        # frames in `tables` carry real datetimes from here on.
        df[col] = parsed
        print(f"{col}: {parsed.min()} to {parsed.max()}")

## 3. Unique Entity Counts

In [None]:
print("=" * 80)
print("UNIQUE ENTITY COUNTS")
print("=" * 80)

# Users: the identifier column has a different name in AUCTIONS_USERS.
user_cols = {
    'auctions_users': 'OPAQUE_USER_ID',
    'impressions': 'USER_ID',
    'clicks': 'USER_ID',
    'purchases': 'USER_ID',
}
print("\nUnique Users by Table:")
for name, col in user_cols.items():
    if name not in tables or col not in tables[name].columns:
        continue
    n = tables[name][col].nunique()
    print(f"  {name}: {n:,} unique users")

# Vendors, products and auctions use one column name across tables, so they
# are driven from a single spec: (heading, column, noun, tables to inspect).
entity_specs = [
    ("\nUnique Vendors by Table:", 'VENDOR_ID', 'vendors',
     ['auctions_results', 'impressions', 'clicks']),
    ("\nUnique Products by Table:", 'PRODUCT_ID', 'products',
     ['auctions_results', 'impressions', 'clicks', 'purchases', 'catalog']),
    ("\nUnique Auctions by Table:", 'AUCTION_ID', 'auctions',
     ['auctions_users', 'auctions_results', 'impressions', 'clicks']),
]
for heading, col, noun, table_names in entity_specs:
    print(heading)
    for name in table_names:
        # Tables that failed to load, or lack the column, are skipped quietly.
        if name not in tables or col not in tables[name].columns:
            continue
        n = tables[name][col].nunique()
        print(f"  {name}: {n:,} unique {noun}")

In [None]:
# CRITICAL BLOCKER CHECK: User ID Coherence
# Compares the set of OPAQUE_USER_ID values in AUCTIONS_USERS with the set of
# USER_ID values in CLICKS to decide whether they are the same identifier.
print("\n" + "=" * 80)
print("BLOCKER A: USER ID COHERENCE CHECK")
print("=" * 80)
print("\nAUCTIONS_USERS uses OPAQUE_USER_ID; CLICKS/IMPRESSIONS/PURCHASES use USER_ID")
print("Must verify: Are these the same identifier or is mapping needed?\n")

if 'auctions_users' in tables and 'clicks' in tables:
    auctions = tables['auctions_users']
    clicks = tables['clicks']

    # Null IDs are dropped first: a missing value on both sides is not
    # evidence that the two columns share an identifier, and would otherwise
    # be counted as one overlapping "user".
    opaque_users = set(auctions['OPAQUE_USER_ID'].dropna().unique())
    click_users = set(clicks['USER_ID'].dropna().unique())

    # Check overlap
    overlap = opaque_users & click_users
    only_opaque = opaque_users - click_users
    only_click = click_users - opaque_users

    print(f"OPAQUE_USER_ID unique values: {len(opaque_users):,}")
    print(f"USER_ID unique values: {len(click_users):,}")
    print(f"Overlap (same IDs in both): {len(overlap):,}")
    print(f"Only in OPAQUE_USER_ID: {len(only_opaque):,}")
    print(f"Only in USER_ID: {len(only_click):,}")

    # Overlap is measured against the smaller set, so 100% means the smaller
    # population is fully contained in the larger one.
    smaller_side = min(len(opaque_users), len(click_users))
    if smaller_side == 0:
        # Nothing to compare; avoid dividing by zero.
        print("\nOverlap rate: n/a (one table has no non-null user IDs)")
        print("STATUS: CANNOT ASSESS - at least one side has no user IDs")
    else:
        overlap_rate = len(overlap) / smaller_side * 100
        print(f"\nOverlap rate: {overlap_rate:.1f}%")

        if overlap_rate > 90:
            print("STATUS: IDs appear to be THE SAME identifier")
        elif overlap_rate > 50:
            print("STATUS: PARTIAL overlap - may need investigation")
        else:
            print("STATUS: LOW overlap - IDs may be DIFFERENT identifiers, need mapping")

    # Check format similarity
    print("\n--- Sample ID Formats ---")
    print(f"OPAQUE_USER_ID samples: {list(auctions['OPAQUE_USER_ID'].head(3))}")
    print(f"USER_ID samples: {list(clicks['USER_ID'].head(3))}")

In [None]:
# CRITICAL BLOCKER CHECK: Auction Scope
# Compares auction IDs across AUCTIONS_USERS, IMPRESSIONS and
# AUCTIONS_RESULTS to judge whether AUCTIONS_USERS also covers auctions that
# never produced an impression.
print("\n" + "=" * 80)
print("BLOCKER B: AUCTION SCOPE CHECK")
print("=" * 80)
print("\nDoes AUCTIONS_USERS represent ALL searches/queries, or only when ads are served?")
print("If only promoted, sessions built from AUCTIONS_USERS miss organic browsing.\n")

# Built once, outside either check: both sections below need the impression
# auction IDs, and the bids section must not depend on the first section
# having run (it previously raised NameError when AUCTIONS_USERS was absent).
auction_ids_from_impressions = None
if 'impressions' in tables:
    impressions = tables['impressions']
    auction_ids_from_impressions = set(impressions['AUCTION_ID'].unique())

if 'auctions_users' in tables and auction_ids_from_impressions is not None:
    auctions = tables['auctions_users']

    auction_ids_from_auctions = set(auctions['AUCTION_ID'].unique())

    # Every impression should have a parent auction
    impressions_with_auction = len(auction_ids_from_impressions & auction_ids_from_auctions)
    impressions_without_auction = len(auction_ids_from_impressions - auction_ids_from_auctions)

    print(f"Auctions in AUCTIONS_USERS: {len(auction_ids_from_auctions):,}")
    print(f"Auctions in IMPRESSIONS: {len(auction_ids_from_impressions):,}")
    print(f"Impression auctions found in AUCTIONS_USERS: {impressions_with_auction:,}")
    print(f"Impression auctions NOT in AUCTIONS_USERS: {impressions_without_auction:,}")

    # Check if auctions >> impressions (suggests auctions without ads)
    if not auction_ids_from_impressions:
        # No impression auctions at all: the ratio is undefined.
        print("\nAuctions / Impressions ratio: n/a (IMPRESSIONS has no auction IDs)")
        print("STATUS: CANNOT ASSESS - no impression auctions to compare against")
    else:
        ratio = len(auction_ids_from_auctions) / len(auction_ids_from_impressions)
        print(f"\nAuctions / Impressions ratio: {ratio:.2f}")

        if ratio > 2:
            print("STATUS: AUCTIONS_USERS likely includes searches WITHOUT promoted results")
            print("        (Good for session construction - captures organic browsing)")
        else:
            print("STATUS: AUCTIONS_USERS may only include auctions WITH promoted results")
            print("        (Sessions will miss organic-only browsing activity)")

if 'auctions_results' in tables:
    bids = tables['auctions_results']
    auction_ids_from_bids = set(bids['AUCTION_ID'].unique())

    # How many auctions have bids but no impressions?
    if auction_ids_from_impressions is None:
        print("\nAuctions with bids but no impressions: n/a (impressions table not loaded)")
    else:
        bids_without_impressions = len(auction_ids_from_bids - auction_ids_from_impressions)
        print(f"\nAuctions with bids but no impressions: {bids_without_impressions:,}")
        print("(These are auctions where ads were bid but not shown)")

## 4. Join Rate Diagnostics

Test composite-key joins on `AUCTION_ID + PRODUCT_ID + VENDOR_ID + CAMPAIGN_ID`. Each join below uses whichever of these keys are present in both tables being compared.

In [None]:
print("=" * 80)
print("JOIN RATE DIAGNOSTICS")
print("=" * 80)

# Join keys shared by the diagnostics cells that follow.
COMPOSITE_KEYS = ['AUCTION_ID', 'PRODUCT_ID', 'VENDOR_ID', 'CAMPAIGN_ID']
MINIMAL_KEYS = ['AUCTION_ID', 'PRODUCT_ID']

# For every loaded table, list which composite-key columns it carries
# (reported in COMPOSITE_KEYS order).
print("\nKey availability by table:")
for name, df in tables.items():
    table_columns = df.columns
    available = []
    for key in COMPOSITE_KEYS:
        if key in table_columns:
            available.append(key)
    print(f"  {name}: {available}")

In [None]:
# CLICKS -> AUCTIONS_RESULTS: what share of distinct click keys also appear
# among the bids, and what share of distinct bid keys were clicked.
print("\n--- CLICKS -> AUCTIONS_RESULTS (bids) ---")
if 'clicks' in tables and 'auctions_results' in tables:
    clicks = tables['clicks']
    bids = tables['auctions_results']

    # Only composite-key columns present on both sides can be joined on.
    common_keys = [
        key for key in COMPOSITE_KEYS
        if key in clicks.columns and key in bids.columns
    ]
    print(f"Common keys: {common_keys}")

    # Work on distinct key tuples so rates are per key, not per row.
    distinct_click_keys = clicks.loc[:, common_keys].drop_duplicates()
    distinct_bid_keys = bids.loc[:, common_keys].drop_duplicates()
    matched_keys = pd.merge(distinct_click_keys, distinct_bid_keys, how='inner', on=common_keys)

    n_matched = len(matched_keys)
    n_click_keys = len(distinct_click_keys)
    n_bid_keys = len(distinct_bid_keys)

    # Forward: click keys that have a matching bid.
    forward_rate = n_matched / n_click_keys * 100
    print(f"Clicks with matching bids: {n_matched:,} / {n_click_keys:,} ({forward_rate:.1f}%)")

    # Backward: bid keys that have a matching click.
    backward_rate = n_matched / n_bid_keys * 100
    print(f"Bids with matching clicks: {n_matched:,} / {n_bid_keys:,} ({backward_rate:.1f}%)")

In [None]:
# IMPRESSIONS -> AUCTIONS_RESULTS: share of distinct impression keys that
# also appear among the bids.
print("\n--- IMPRESSIONS -> AUCTIONS_RESULTS (bids) ---")
if 'impressions' in tables and 'auctions_results' in tables:
    impressions = tables['impressions']
    bids = tables['auctions_results']

    # Join only on composite-key columns that both tables carry.
    common_keys = [
        key for key in COMPOSITE_KEYS
        if key in impressions.columns and key in bids.columns
    ]
    print(f"Common keys: {common_keys}")

    distinct_imp_keys = impressions.loc[:, common_keys].drop_duplicates()
    distinct_bid_keys = bids.loc[:, common_keys].drop_duplicates()
    matched_keys = pd.merge(distinct_imp_keys, distinct_bid_keys, how='inner', on=common_keys)

    n_matched = len(matched_keys)
    n_imp_keys = len(distinct_imp_keys)
    forward_rate = n_matched / n_imp_keys * 100
    print(f"Impressions with matching bids: {n_matched:,} / {n_imp_keys:,} ({forward_rate:.1f}%)")

In [None]:
# CLICKS -> IMPRESSIONS: share of distinct click keys that also appear
# among the impressions.
print("\n--- CLICKS -> IMPRESSIONS ---")
if 'clicks' in tables and 'impressions' in tables:
    clicks = tables['clicks']
    impressions = tables['impressions']

    # Join only on composite-key columns that both tables carry.
    common_keys = [
        key for key in COMPOSITE_KEYS
        if key in clicks.columns and key in impressions.columns
    ]
    print(f"Common keys: {common_keys}")

    distinct_click_keys = clicks.loc[:, common_keys].drop_duplicates()
    distinct_imp_keys = impressions.loc[:, common_keys].drop_duplicates()
    matched_keys = pd.merge(distinct_click_keys, distinct_imp_keys, how='inner', on=common_keys)

    n_matched = len(matched_keys)
    n_click_keys = len(distinct_click_keys)
    forward_rate = n_matched / n_click_keys * 100
    print(f"Clicks with matching impressions: {n_matched:,} / {n_click_keys:,} ({forward_rate:.1f}%)")

In [None]:
# AUCTIONS_RESULTS -> AUCTIONS_USERS: share of distinct bid auction IDs that
# also appear in AUCTIONS_USERS (i.e. have user info attached).
print("\n--- AUCTIONS_RESULTS -> AUCTIONS_USERS ---")
if 'auctions_results' in tables and 'auctions_users' in tables:
    bids = tables['auctions_results']
    auctions = tables['auctions_users']

    # Distinct auction IDs seen in bids; membership is tested with a single
    # vectorised isin() rather than a Python-level loop over every ID.
    bids_auctions = bids['AUCTION_ID'].drop_duplicates()
    matched = int(bids_auctions.isin(auctions['AUCTION_ID']).sum())

    n_bid_auctions = len(bids_auctions)
    # An empty bids table reports 0.0% instead of dividing by zero.
    rate = matched / n_bid_auctions * 100 if n_bid_auctions else 0.0
    print(f"Bid auctions with user info: {matched:,} / {n_bid_auctions:,} ({rate:.1f}%)")

## 5. Purchase Mappability to Promoted Journey

How many purchases can we reliably link to a promoted click?

In [None]:
print("=" * 80)
print("PURCHASE MAPPABILITY")
print("=" * 80)

# Measures how many purchases (and how much spend) share a (USER_ID,
# PRODUCT_ID) pair with at least one click. Leaves `purchases`, `clicks`,
# `total_spend`, `spend_mapped` and `purchases_with_click` in scope for the
# cells that follow.
if 'purchases' in tables and 'clicks' in tables:
    # Copies, so the parsed timestamps and the spend column added below do
    # not modify the frames stored in `tables`.
    purchases = tables['purchases'].copy()
    clicks = tables['clicks'].copy()

    print(f"\nTotal purchases: {len(purchases):,}")
    print(f"Total clicks: {len(clicks):,}")

    # Parse timestamps
    purchases['PURCHASED_AT'] = pd.to_datetime(purchases['PURCHASED_AT'])
    clicks['OCCURRED_AT'] = pd.to_datetime(clicks['OCCURRED_AT'])

    # Calculate spend
    purchases['spend'] = purchases['QUANTITY'] * purchases['UNIT_PRICE'] / 100  # cents to dollars
    total_spend = purchases['spend'].sum()
    print(f"Total spend: ${total_spend:,.2f}")

    pair_keys = ['USER_ID', 'PRODUCT_ID']

    # Distinct (user, product) pairs that were ever clicked. This is the
    # table used for the mappability counts: because each pair occurs once,
    # joining it to purchases can never duplicate a purchase row.
    click_pairs = clicks[pair_keys].drop_duplicates()
    print(f"\nUnique (user, product) pairs in clicks: {len(click_pairs):,}")

    # Vendor/campaign-level click keys. A pair clicked under several vendors
    # or campaigns appears several times here, so the join below repeats the
    # purchase once per vendor/campaign. That is what the vendor coverage
    # cell needs, but it must NOT be used for purchase or spend totals.
    click_user_product = clicks[['USER_ID', 'PRODUCT_ID', 'VENDOR_ID', 'CAMPAIGN_ID']].drop_duplicates()
    purchases_with_click = purchases.merge(click_user_product, on=pair_keys, how='inner')

    # Flag each purchase row exactly once: a left join against the unique
    # pairs keeps row count and order, and `_merge == 'both'` marks a match.
    flagged = purchases.merge(click_pairs, on=pair_keys, how='left', indicator=True)
    is_mapped = flagged['_merge'].eq('both').to_numpy()

    n_mapped = int(is_mapped.sum())
    spend_mapped = purchases.loc[is_mapped, 'spend'].sum()

    # Report 0.0% rather than dividing by zero on an empty purchases table.
    n_purchases = len(purchases)
    purchase_pct = n_mapped / n_purchases * 100 if n_purchases else 0.0
    spend_pct = spend_mapped / total_spend * 100 if total_spend else 0.0

    print(f"\n--- Mappability (user + product match) ---")
    print(f"Purchases mappable: {n_mapped:,} / {n_purchases:,} ({purchase_pct:.1f}%)")
    print(f"Spend mappable: ${spend_mapped:,.2f} / ${total_spend:,.2f} ({spend_pct:.1f}%)")

In [None]:
# More strict: same-day or within-window purchase after click
# Relies on `purchases`, `clicks` and `total_spend` from the purchase
# mappability cell; leaves `purchases_timed` in scope for the lag cell.
print("\n--- Mappability with time constraint ---")

if 'purchases' in tables and 'clicks' in tables:
    # For each user-product pair, get earliest click time
    first_clicks = clicks.groupby(['USER_ID', 'PRODUCT_ID'])['OCCURRED_AT'].min().reset_index()
    first_clicks.columns = ['USER_ID', 'PRODUCT_ID', 'first_click_time']

    # Join to purchases (one row per purchase: first_clicks is unique per pair)
    purchases_timed = purchases.merge(first_clicks, on=['USER_ID', 'PRODUCT_ID'], how='left')

    purchased_at = purchases_timed['PURCHASED_AT']
    first_click = purchases_timed['first_click_time']

    # Shared by every window, so computed once: the purchase has a click and
    # did not happen before it. The same-day window previously skipped the
    # ordering check and counted purchases made earlier that day than the
    # click, which could make "Same day" larger than "Within 1d".
    after_click = first_click.notna() & (purchased_at >= first_click)

    n_purchases = len(purchases)

    # Check different windows
    for window_days in [0, 1, 7, 14, 30]:
        if window_days == 0:
            # Same calendar date as the first click (and not before it).
            mask = after_click & (purchased_at.dt.date == first_click.dt.date)
            label = "Same day"
        else:
            mask = after_click & (purchased_at <= first_click + pd.Timedelta(days=window_days))
            label = f"Within {window_days}d"

        n = mask.sum()
        spend = purchases_timed.loc[mask, 'spend'].sum()
        # Report 0.0% rather than dividing by zero on empty inputs.
        n_pct = n / n_purchases * 100 if n_purchases else 0.0
        spend_pct = spend / total_spend * 100 if total_spend else 0.0
        print(f"  {label}: {n:,} purchases ({n_pct:.1f}%), ${spend:,.2f} ({spend_pct:.1f}%)")

In [None]:
# Distribution of the delay between a pair's first click and each purchase.
print("\n--- Click-to-Purchase Lag Distribution ---")

if 'purchases_timed' in dir():
    # Keep only purchases that have a first click to measure from.
    has_click = purchases_timed['first_click_time'].notna()
    mapped = purchases_timed.loc[has_click].copy()

    elapsed = mapped['PURCHASED_AT'] - mapped['first_click_time']
    mapped['lag_hours'] = elapsed.dt.total_seconds() / 3600

    # Purchases at or after the click (lag >= 0); earlier purchases are dropped.
    positive_lag = mapped.loc[mapped['lag_hours'] >= 0]
    lag_hours = positive_lag['lag_hours']

    print(f"Purchases with positive lag: {len(positive_lag):,}")
    print(f"\nLag (hours) statistics:")
    print(lag_hours.describe())

    # Selected percentiles, shown in hours and in days.
    print(f"\nLag percentiles:")
    for p in (10, 25, 50, 75, 90, 95, 99):
        val = lag_hours.quantile(p / 100)
        val_days = val / 24
        print(f"  P{p}: {val:.1f} hours ({val_days:.1f} days)")

## 6. Vendor Coverage in Purchases

In [None]:
print("=" * 80)
print("VENDOR COVERAGE")
print("=" * 80)

# Uses `purchases_with_click` and `clicks` from the purchase mappability
# cell: how many clicked vendors have at least one mapped purchase, and
# which vendors carry the most mapped spend.
if 'purchases_with_click' in dir():
    # Vendors in mapped purchases
    vendors_in_purchases = purchases_with_click['VENDOR_ID'].nunique()
    vendors_in_clicks = clicks['VENDOR_ID'].nunique()

    # Report 0.0% rather than dividing by zero when clicks has no vendors.
    coverage = vendors_in_purchases / vendors_in_clicks * 100 if vendors_in_clicks else 0.0

    print(f"\nVendors with mapped purchases: {vendors_in_purchases:,}")
    print(f"Vendors in clicks: {vendors_in_clicks:,}")
    print(f"Coverage: {coverage:.1f}%")

    # Top vendors by mapped spend
    print(f"\nTop 10 vendors by mapped spend:")
    top_vendors = purchases_with_click.groupby('VENDOR_ID')['spend'].sum().sort_values(ascending=False).head(10)
    for i, (vendor, spend) in enumerate(top_vendors.items(), 1):
        # str() so integer (or other non-string) vendor IDs can be truncated
        # for display instead of raising TypeError on the slice.
        print(f"  {i}. {str(vendor)[:20]}...: ${spend:,.2f}")

## 7. Summary Statistics for Panel Construction

In [None]:
print("=" * 80)
print("SUMMARY FOR PANEL CONSTRUCTION")
print("=" * 80)

# Counts are read straight from `tables` rather than from the `clicks` /
# `purchases` / `bids` / `impressions` aliases: those aliases are only
# created by earlier cells under stricter conditions (e.g. `purchases` needs
# the clicks table too), so `name in tables` alone does not guarantee that
# they exist. A table that failed to load contributes 0.
clicks_tbl = tables.get('clicks')
purchases_tbl = tables.get('purchases')
bids_tbl = tables.get('auctions_results')
impressions_tbl = tables.get('impressions')

# Spend figures come from the purchase mappability cell, if it ran.
have_spend = 'total_spend' in dir() and 'spend_mapped' in dir()

summary = {
    'n_users_clicks': clicks_tbl['USER_ID'].nunique() if clicks_tbl is not None else 0,
    'n_users_purchases': purchases_tbl['USER_ID'].nunique() if purchases_tbl is not None else 0,
    'n_vendors': bids_tbl['VENDOR_ID'].nunique() if bids_tbl is not None else 0,
    'n_products_clicks': clicks_tbl['PRODUCT_ID'].nunique() if clicks_tbl is not None else 0,
    'n_clicks': len(clicks_tbl) if clicks_tbl is not None else 0,
    'n_impressions': len(impressions_tbl) if impressions_tbl is not None else 0,
    'n_purchases': len(purchases_tbl) if purchases_tbl is not None else 0,
    'total_spend': total_spend if 'total_spend' in dir() else 0,
    'mappable_spend': spend_mapped if 'spend_mapped' in dir() else 0,
    # 0 when spend was never computed or total spend is zero (no division by zero).
    'mappable_rate': spend_mapped / total_spend * 100 if have_spend and total_spend else 0,
}

# Formatting is chosen from the key name: rates as percentages, spend as
# dollars, everything else as a thousands-separated count.
print("\nKey Metrics:")
for k, v in summary.items():
    if 'rate' in k:
        print(f"  {k}: {v:.1f}%")
    elif 'spend' in k:
        print(f"  {k}: ${v:,.2f}")
    else:
        print(f"  {k}: {v:,}")

In [None]:
# Date range for analysis
# Uses the `clicks` / `purchases` frames prepared by the purchase
# mappability cell (their timestamp columns are already parsed there).
print("\n--- Date Range ---")
if 'clicks' in tables:
    print(f"Clicks: {clicks['OCCURRED_AT'].min()} to {clicks['OCCURRED_AT'].max()}")
if 'purchases' in tables:
    print(f"Purchases: {purchases['PURCHASED_AT'].min()} to {purchases['PURCHASED_AT'].max()}")

# Weeks available
if 'clicks' in tables:
    # ISO week numbers must be paired with the ISO year, not the calendar
    # year: around New Year the two differ (e.g. 2024-12-30 is ISO week 1 of
    # 2025), and mixing them merges or splits weeks and miscounts the total.
    iso_calendar = clicks['OCCURRED_AT'].dt.isocalendar()
    clicks['week'] = iso_calendar['week']
    clicks['year'] = iso_calendar['year']
    n_weeks = clicks.groupby(['year', 'week']).size().shape[0]
    print(f"\nWeeks with click data: {n_weeks}")

In [None]:
# Closing banner and pointer to the next notebook in the pipeline.
rule = "=" * 80
print("\n" + rule)
print("AUDIT COMPLETE")
print(rule)
print("\nReady for 02_canonical_tables.ipynb")