# Attribution Panel Data Construction with Ad-Stock

This notebook builds panel data for attribution analysis with ad-stock decay functions.
We control for both impressions and clicks as exposure variables with temporal decay.

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import pyarrow.parquet as pq
from tqdm import tqdm
import os
import warnings
# Silence every Python warning for the rest of the kernel session.
# NOTE(review): this blanket filter also hides pandas deprecation / dtype warnings that
# could matter for the merges below — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

# Confirmation that the import cell ran to completion
print('Libraries loaded successfully')

Libraries loaded successfully


## 1. Load Shopping Sessions Data

In [2]:
# Read the shopping-session table from parquet, then report its row count,
# the overall time span it covers, and its column names.
df_sessions = pd.read_parquet(path='data/shopping_sessions.parquet')
print(
    f'Loaded {len(df_sessions):,} shopping sessions',
    f'Date range: {df_sessions.shopping_start.min()} to {df_sessions.shopping_end.max()}',
    f'\nColumns: {df_sessions.columns.tolist()}',
    sep='\n',
)

Loaded 9,214 shopping sessions
Date range: 2025-03-25 00:00:27.672000 to 2025-09-20 23:59:56

Columns: ['shopping_session_id', 'user_id', 'shopping_start', 'shopping_end', 'total_events', 'total_auctions', 'total_impressions', 'total_clicks', 'total_purchases', 'total_revenue', 'num_browsing_sessions', 'total_unique_products', 'total_unique_auctions', 'total_browse_minutes', 'shopping_duration_days', 'did_purchase', 'avg_browsing_session_minutes', 'total_revenue_usd']


## 2. Load Ad Exposure Data (Impressions & Clicks)

In [3]:
# Raw impression events; `date` is the calendar day taken from the OCCURRED_AT timestamp
impressions = pd.read_parquet('data/raw_sample_impressions.parquet').assign(
    date=lambda events: pd.to_datetime(events['OCCURRED_AT']).dt.date
)
print(f'Loaded {len(impressions):,} impressions')

# Raw click events, with the same derived calendar-day column
clicks = pd.read_parquet('data/raw_sample_clicks.parquet').assign(
    date=lambda events: pd.to_datetime(events['OCCURRED_AT']).dt.date
)
print(f'Loaded {len(clicks):,} clicks')

Loaded 1,118,310 impressions
Loaded 34,260 clicks


## 3. Create Daily User-Vendor Panel

Unit of analysis: user-vendor-day

In [4]:
# Keys that define one row of the daily exposure table
exposure_keys = ['USER_ID', 'VENDOR_ID', 'date']

# Count impression events per user-vendor-day
daily_impressions = impressions.groupby(exposure_keys).size().reset_index(name='impressions')
print(f'Daily user-vendor impressions: {len(daily_impressions):,}')

# Count click events per user-vendor-day
daily_clicks = clicks.groupby(exposure_keys).size().reset_index(name='clicks')
print(f'Daily user-vendor clicks: {len(daily_clicks):,}')

# Outer join keeps days that have only impressions or only clicks; the side with no
# events becomes NaN, which is filled with 0 and cast back to integer counts.
daily_exposure = (
    daily_impressions
    .merge(daily_clicks, on=exposure_keys, how='outer')
    .fillna(0)
    .astype({'impressions': int, 'clicks': int})
)

print(f'\nCombined daily exposure records: {len(daily_exposure):,}')

Daily user-vendor impressions: 753,304
Daily user-vendor clicks: 29,173

Combined daily exposure records: 754,095


## 4. Create Balanced Panel

Fill in missing days with zero exposure

In [5]:
# Calendar spanned by the observed exposure data, one entry per day
min_date = daily_exposure['date'].min()
max_date = daily_exposure['date'].max()
all_dates = pd.date_range(start=min_date, end=max_date, freq='D').date

print(f'Creating balanced panel from {min_date} to {max_date} ({len(all_dates)} days)')

# Every user-vendor pair that appears at least once in the exposure data
user_vendor_pairs = daily_exposure[['USER_ID', 'VENDOR_ID']].drop_duplicates()
print(f'Unique user-vendor pairs: {len(user_vendor_pairs):,}')

# Cap the number of pairs so the pair x day cross product stays manageable
max_pairs = 10000  # Increased from 5000 since we have real conversions now
if len(user_vendor_pairs) > max_pairs:
    # Rank pairs by total exposure, counting each click as 10 impressions
    pair_activity = (
        daily_exposure
        .groupby(['USER_ID', 'VENDOR_ID'])[['impressions', 'clicks']]
        .sum()
        .reset_index()
        .assign(total_activity=lambda totals: totals['impressions'] + totals['clicks'] * 10)
    )

    # Keep only the most active pairs
    user_vendor_pairs = pair_activity.nlargest(max_pairs, 'total_activity')[['USER_ID', 'VENDOR_ID']]
    print(f'Selected top {max_pairs:,} most active user-vendor pairs')

# Cross join pairs x dates through a constant helper key, then attach the observed
# exposure counts; pair-days with no recorded exposure are filled with 0.
dates_df = pd.DataFrame({'date': all_dates, 'key': 1})
balanced_panel = (
    user_vendor_pairs
    .assign(key=1)
    .merge(dates_df, on='key')
    .drop(columns='key')
    .merge(daily_exposure, on=['USER_ID', 'VENDOR_ID', 'date'], how='left')
    .fillna(0)
    .astype({'impressions': int, 'clicks': int})
)

print(f'\nBalanced panel size: {len(balanced_panel):,} observations')
print(f'Panel covers {balanced_panel.USER_ID.nunique()} users and {balanced_panel.VENDOR_ID.nunique()} vendors')

Creating balanced panel from 2025-03-25 to 2025-09-20 (180 days)
Unique user-vendor pairs: 599,032
Selected top 10,000 most active user-vendor pairs

Balanced panel size: 1,800,000 observations
Panel covers 818 users and 7597 vendors


## 5. Calculate Ad-Stock with Exponential Decay

Ad-stock formula: $S_t = x_t + \lambda \cdot S_{t-1}$

where $\lambda$ is the decay parameter ($0 < \lambda < 1$), i.e. the fraction of the previous day's ad-stock that carries over to the current day.

In [6]:
def calculate_adstock(exposures, decay_rate=0.5):
    """
    Calculate ad-stock with exponential decay: S_t = x_t + decay_rate * S_{t-1}.

    Parameters
    ----------
    exposures : array-like
        Exposures in chronological order (one entry per period). Lists and
        integer arrays are accepted; values are converted to float.
    decay_rate : float, default 0.5
        Share of the previous period's ad-stock carried into the current
        period (intended range 0 < decay_rate < 1; not enforced).

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as `exposures`. The first value equals
        the first exposure (no carry-over is assumed before the series starts).
        An empty input yields an empty array instead of raising IndexError.
    """
    exposures = np.asarray(exposures, dtype=float)
    adstock = np.zeros_like(exposures)

    # Nothing to accumulate; indexing element 0 below would fail on an empty series
    if exposures.size == 0:
        return adstock

    adstock[0] = exposures[0]

    # The recursion depends on the previous output value, so it is evaluated sequentially
    for t in range(1, len(exposures)):
        adstock[t] = exposures[t] + decay_rate * adstock[t-1]

    return adstock

# Rows must be in chronological order within each user-vendor pair,
# because the ad-stock recursion walks each series from first to last day.
balanced_panel = balanced_panel.sort_values(by=['USER_ID', 'VENDOR_ID', 'date'])

print('Calculating ad-stock with different decay rates...')

decay_rates = [0.3, 0.5, 0.7, 0.9]

for decay in tqdm(decay_rates):
    # Output column -> raw exposure column it is built from (impressions first, then clicks)
    adstock_targets = {
        f'adstock_imp_{decay}': 'impressions',
        f'adstock_click_{decay}': 'clicks',
    }
    for adstock_col, exposure_col in adstock_targets.items():
        per_pair_series = balanced_panel.groupby(['USER_ID', 'VENDOR_ID'])[exposure_col]
        balanced_panel[adstock_col] = per_pair_series.transform(
            lambda series: calculate_adstock(series.values, decay)
        )

print('Ad-stock calculation complete')

Calculating ad-stock with different decay rates...


100%|██████████| 4/4 [00:05<00:00,  1.42s/it]

Ad-stock calculation complete





## 6. Add Outcome Variables (Purchases)

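We'll match purchases to impressions/clicks using **product IDs**, which are consistent across all datasets. A purchase counts as a conversion for a vendor when the same user was shown (or clicked) that vendor's ad for the same product on the same calendar day.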

In [7]:
# Load purchases and derive the calendar day of each purchase
purchases = pd.read_parquet('data/raw_sample_purchases.parquet')
purchases['date'] = pd.to_datetime(purchases['PURCHASED_AT']).dt.date
print(f'Loaded {len(purchases):,} purchases')

# IMPORTANT: We match through PRODUCT_ID, not vendor
# The vendor IDs in catalog don't match ad data, but product IDs do!

# First, let's check product overlap
imp_products = set(impressions['PRODUCT_ID'].unique())
click_products = set(clicks['PRODUCT_ID'].unique())
purchase_products = set(purchases['PRODUCT_ID'].unique())

print(f'\nProduct overlap:')
print(f'  Products with impressions AND purchases: {len(imp_products & purchase_products)}')
print(f'  Products with clicks AND purchases: {len(click_products & purchase_products)}')

# Create user-product-date level purchase data.
# GMV is computed per purchase line (QUANTITY * UNIT_PRICE) and then summed. Multiplying
# the summed quantity by the *mean* unit price misstates GMV whenever a user buys the
# same product more than once in a day at different prices or quantities.
purchase_daily = (
    purchases
    .assign(gmv=purchases['QUANTITY'] * purchases['UNIT_PRICE'])
    .groupby(['USER_ID', 'PRODUCT_ID', 'date'])
    .agg({'QUANTITY': 'sum', 'UNIT_PRICE': 'mean', 'gmv': 'sum'})
    .reset_index()
)

print(f'\nDaily user-product purchases: {len(purchase_daily):,}')

# Now we need to link these to our vendor-level panel
# We'll match impressions/clicks to purchases via USER_ID + PRODUCT_ID + date
# Then attribute the purchase to the vendor who showed the ad
# NOTE(review): matching is same-day only and does not check that the exposure happened
# before the purchase; a product advertised by several vendors is credited to each of them.

# Get impression/click data with vendor info at product level
imp_with_product = impressions.groupby(['USER_ID', 'VENDOR_ID', 'PRODUCT_ID', 'date']).size().reset_index(name='product_impressions')
click_with_product = clicks.groupby(['USER_ID', 'VENDOR_ID', 'PRODUCT_ID', 'date']).size().reset_index(name='product_clicks')

# Match impressions to purchases
imp_conversions = pd.merge(
    imp_with_product,
    purchase_daily[['USER_ID', 'PRODUCT_ID', 'date', 'gmv']],
    on=['USER_ID', 'PRODUCT_ID', 'date'],
    how='inner'
)

# Match clicks to purchases
click_conversions = pd.merge(
    click_with_product,
    purchase_daily[['USER_ID', 'PRODUCT_ID', 'date', 'gmv']],
    on=['USER_ID', 'PRODUCT_ID', 'date'],
    how='inner'
)

print(f'\nREAL conversion matches:')
print(f'  Impression->purchase matches: {len(imp_conversions)}')
print(f'  Click->purchase matches: {len(click_conversions)}')

# Combine click and impression matches (clicks first so they win the de-duplication).
# PRODUCT_ID is kept so that different products bought from the same vendor on the
# same day stay separate rows.
conversion_keys = ['USER_ID', 'VENDOR_ID', 'PRODUCT_ID', 'date']
all_conversions = pd.concat(
    [
        click_conversions[conversion_keys + ['gmv']],
        imp_conversions[conversion_keys + ['gmv']],
    ],
    ignore_index=True,
)

# A purchase matched by both a click and an impression must be counted once. De-duplicate
# at the product level: de-duplicating on user-vendor-date alone would keep only one
# product per pair-day and silently drop the GMV of every other product bought that day.
all_conversions = all_conversions.drop_duplicates(subset=conversion_keys, keep='first')

# Aggregate to user-vendor-date level (sums GMV across the distinct products)
daily_gmv = all_conversions.groupby(['USER_ID', 'VENDOR_ID', 'date'])['gmv'].sum().reset_index()

print(f'\nUnique user-vendor-date conversions: {len(daily_gmv)}')
print(f'Total GMV from real conversions: ${daily_gmv.gmv.sum():,.2f}')

# Merge with panel. Outcome columns left over from an earlier run of this cell are
# dropped first; otherwise the merge would produce gmv_x / gmv_y and the lines below fail.
n_panel_rows = len(balanced_panel)
balanced_panel = pd.merge(
    balanced_panel.drop(columns=['gmv', 'conversion'], errors='ignore'),
    daily_gmv,
    on=['USER_ID', 'VENDOR_ID', 'date'],
    how='left'
)
# daily_gmv has one row per key, so the left merge must not add or remove panel rows
assert len(balanced_panel) == n_panel_rows, 'outcome merge changed the panel row count'

# Pair-days without a matched purchase get zero GMV and no conversion
balanced_panel['gmv'] = balanced_panel['gmv'].fillna(0)
balanced_panel['conversion'] = (balanced_panel['gmv'] > 0).astype(int)

print(f'\nPanel with REAL outcomes:')
print(f'  Total observations: {len(balanced_panel):,}')
print(f'  Observations with conversions: {balanced_panel.conversion.sum():,}')
print(f'  Conversion rate: {balanced_panel.conversion.mean():.4%}')
print(f'  Total GMV: ${balanced_panel.gmv.sum():,.2f}')

Loaded 4,904 purchases

Product overlap:
  Products with impressions AND purchases: 275
  Products with clicks AND purchases: 238

Daily user-product purchases: 4,760

REAL conversion matches:
  Impression->purchase matches: 157
  Click->purchase matches: 145

Unique user-vendor-date conversions: 155
Total GMV from real conversions: $501,800.00

Panel with REAL outcomes:
  Total observations: 1,800,000
  Observations with conversions: 78
  Conversion rate: 0.0043%
  Total GMV: $236,900.00


## 7. Add Time and Fixed Effects Variables

In [8]:
# Promote `date` to a datetime column so the .dt accessor is available
balanced_panel = balanced_panel.assign(date=pd.to_datetime(balanced_panel['date']))
date_parts = balanced_panel['date'].dt

# Calendar features plus month/week period labels for fixed effects.
# `is_weekend` is derived from `weekday` (dayofweek 5 and 6).
balanced_panel = balanced_panel.assign(
    year=date_parts.year,
    month=date_parts.month,
    week=date_parts.isocalendar().week,
    weekday=date_parts.dayofweek,
    is_weekend=lambda panel: (panel['weekday'] >= 5).astype(int),
    year_month=date_parts.to_period('M').astype(str),
    year_week=date_parts.to_period('W').astype(str),
)

print('Time variables added')
print(f'\nPanel dimensions:')
print(f'  Users: {balanced_panel.USER_ID.nunique():,}')
print(f'  Vendors: {balanced_panel.VENDOR_ID.nunique():,}')
print(f'  Days: {balanced_panel.date.nunique():,}')
print(f'  Total observations: {len(balanced_panel):,}')

Time variables added

Panel dimensions:
  Users: 818
  Vendors: 7,597
  Days: 180
  Total observations: 1,800,000


## 8. Create Lagged Variables

In [9]:
# Shifts and rolling windows operate on row order, so order each pair's rows by date first
balanced_panel = balanced_panel.sort_values(by=['USER_ID', 'VENDOR_ID', 'date'])

# Outcome lags: value of the outcome k rows (days) earlier within the same user-vendor pair;
# the first k rows of each pair are NaN.
lag_vars = ['gmv', 'conversion']
lag_periods = [1, 7, 14]

print('Creating lagged variables...')
for var in lag_vars:
    for lag in lag_periods:
        outcome_by_pair = balanced_panel.groupby(['USER_ID', 'VENDOR_ID'])[var]
        balanced_panel[f'{var}_lag{lag}'] = outcome_by_pair.shift(periods=lag)

# Trailing exposure totals over 7/14/30 rows, current day included; min_periods=1 means
# the first days of each pair sum over however many rows are available.
exposure_vars = ['impressions', 'clicks']
windows = [7, 14, 30]

print('Creating rolling window exposures...')
for var in exposure_vars:
    for window in windows:
        exposure_by_pair = balanced_panel.groupby(['USER_ID', 'VENDOR_ID'])[var]
        balanced_panel[f'{var}_sum{window}d'] = exposure_by_pair.transform(
            lambda series: series.rolling(window=window, min_periods=1).sum()
        )

print('Lagged variables created')

Creating lagged variables...
Creating rolling window exposures...
Lagged variables created


## 9. Summary Statistics

In [10]:
# Row-level indicators reused by several lines of the report
has_impressions = balanced_panel.impressions > 0
has_clicks = balanced_panel.clicks > 0
has_gmv = balanced_panel.gmv > 0

# Basic summary
print('Panel Summary Statistics\n' + '='*50)
print(f'\nPanel Structure:')
print(f'  Total observations: {len(balanced_panel):,}')
print(f'  Unique users: {balanced_panel.USER_ID.nunique():,}')
print(f'  Unique vendors: {balanced_panel.VENDOR_ID.nunique():,}')
print(f'  Date range: {balanced_panel.date.min()} to {balanced_panel.date.max()}')
print(f'  Days covered: {balanced_panel.date.nunique():,}')

print(f'\nExposure Statistics:')
print(f'  Observations with impressions: {has_impressions.sum():,} ({has_impressions.mean():.2%})')
print(f'  Observations with clicks: {has_clicks.sum():,} ({has_clicks.mean():.2%})')
print(f'  Mean daily impressions (when > 0): {balanced_panel.loc[has_impressions, "impressions"].mean():.2f}')
print(f'  Mean daily clicks (when > 0): {balanced_panel.loc[has_clicks, "clicks"].mean():.2f}')

print(f'\nOutcome Statistics:')
# Conversions are rare on a pair-day panel; four decimals keep the share from rounding
# to a misleading "0.00%" (same precision as the conversion rate printed in section 6).
print(f'  Observations with purchases: {(balanced_panel.conversion == 1).sum():,} ({balanced_panel.conversion.mean():.4%})')
print(f'  Mean GMV (when > 0): ${balanced_panel.loc[has_gmv, "gmv"].mean():.2f}')
print(f'  Total GMV: ${balanced_panel.gmv.sum():,.2f}')

# Ad-stock statistics
print(f'\nAd-Stock Statistics (decay=0.5):')
print(f'  Mean impression ad-stock: {balanced_panel["adstock_imp_0.5"].mean():.2f}')
print(f'  Mean click ad-stock: {balanced_panel["adstock_click_0.5"].mean():.4f}')
print(f'  Max impression ad-stock: {balanced_panel["adstock_imp_0.5"].max():.2f}')
print(f'  Max click ad-stock: {balanced_panel["adstock_click_0.5"].max():.2f}')

Panel Summary Statistics

Panel Structure:
  Total observations: 1,800,000
  Unique users: 818
  Unique vendors: 7,597
  Date range: 2025-03-25 00:00:00 to 2025-09-20 00:00:00
  Days covered: 180

Exposure Statistics:
  Observations with impressions: 51,342 (2.85%)
  Observations with clicks: 10,095 (0.56%)
  Mean daily impressions (when > 0): 2.96
  Mean daily clicks (when > 0): 1.50

Outcome Statistics:
  Observations with purchases: 78 (0.00%)
  Mean GMV (when > 0): $3037.18
  Total GMV: $236,900.00

Ad-Stock Statistics (decay=0.5):
  Mean impression ad-stock: 0.17
  Mean click ad-stock: 0.0168
  Max impression ad-stock: 173.47
  Max click ad-stock: 18.00


## 10. Save Panel Data

In [11]:
# Columns shown in the preview at the end of this cell
display_cols = [
    'USER_ID', 'VENDOR_ID', 'date', 'impressions', 'clicks',
    'adstock_imp_0.5', 'adstock_click_0.5', 'gmv', 'conversion',
]

# Write the full panel (without the index) to parquet and report its on-disk size
output_path = 'data/attribution_panel.parquet'
balanced_panel.to_parquet(output_path, index=False)
print(f'Panel saved to {output_path}')
print(f'File size: {os.path.getsize(output_path) / 1024**2:.2f} MB')

# First ten rows; kept as the cell's last expression so the notebook renders it as a table
print('\nSample of panel data:')
balanced_panel.loc[:, display_cols].head(10)

Panel saved to data/attribution_panel.parquet
File size: 26.25 MB

Sample of panel data:


Unnamed: 0,USER_ID,VENDOR_ID,date,impressions,clicks,adstock_imp_0.5,adstock_click_0.5,gmv,conversion
0,ext1:024ff6f7-6340-4225-a29a-a404643efd74,0197e14e14dd7a039876fef882411846,2025-03-25,0,0,0.0,0.0,0.0,0
1,ext1:024ff6f7-6340-4225-a29a-a404643efd74,0197e14e14dd7a039876fef882411846,2025-03-26,0,0,0.0,0.0,0.0,0
2,ext1:024ff6f7-6340-4225-a29a-a404643efd74,0197e14e14dd7a039876fef882411846,2025-03-27,0,0,0.0,0.0,0.0,0
3,ext1:024ff6f7-6340-4225-a29a-a404643efd74,0197e14e14dd7a039876fef882411846,2025-03-28,0,0,0.0,0.0,0.0,0
4,ext1:024ff6f7-6340-4225-a29a-a404643efd74,0197e14e14dd7a039876fef882411846,2025-03-29,0,0,0.0,0.0,0.0,0
5,ext1:024ff6f7-6340-4225-a29a-a404643efd74,0197e14e14dd7a039876fef882411846,2025-03-30,0,0,0.0,0.0,0.0,0
6,ext1:024ff6f7-6340-4225-a29a-a404643efd74,0197e14e14dd7a039876fef882411846,2025-03-31,0,0,0.0,0.0,0.0,0
7,ext1:024ff6f7-6340-4225-a29a-a404643efd74,0197e14e14dd7a039876fef882411846,2025-04-01,0,0,0.0,0.0,0.0,0
8,ext1:024ff6f7-6340-4225-a29a-a404643efd74,0197e14e14dd7a039876fef882411846,2025-04-02,0,0,0.0,0.0,0.0,0
9,ext1:024ff6f7-6340-4225-a29a-a404643efd74,0197e14e14dd7a039876fef882411846,2025-04-03,0,0,0.0,0.0,0.0,0
