# 02_data_processing.ipynb
## Session-Based Data Processing for Causal Transformer

This notebook transforms the raw unified event stream into model-ready sessions.

### Workflow:
1. Load the unified event stream from checkpoint
2. Sessionize events based on inactivity threshold (24 hours)
3. Create vocabulary mappings for events and products
4. Transform sessions into (X, T, Y) format for causal modeling
5. Split data into train/validation/test sets at user level
6. Save processed datasets for model training

In [1]:
# --- IMPORTS ---
import os
import json
from datetime import datetime, timedelta
from pathlib import Path
from typing import List, Tuple, Dict, Any
import warnings

import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split

warnings.filterwarnings('ignore')
tqdm.pandas()

In [2]:
# --- CONFIGURATION ---

# Session configuration
SESSION_GAP_HOURS = 24  # Hours of inactivity that define a new session
MIN_SESSION_LENGTH = 2  # Minimum number of events in a valid session
MAX_SESSION_LENGTH = 500  # Maximum number of events to keep per session

# Special tokens
PAD_TOKEN = '[PAD]'
AUCTION_TOKEN = '[AUCTION]'
UNK_TOKEN = '[UNK]'

# Data split ratios
TRAIN_RATIO = 0.7
VAL_RATIO = 0.15
TEST_RATIO = 0.15

# Random seed for reproducibility
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

print("Configuration:")
print(f"  Session gap: {SESSION_GAP_HOURS} hours")
print(f"  Min session length: {MIN_SESSION_LENGTH} events")
print(f"  Max session length: {MAX_SESSION_LENGTH} events")
print(f"  Data split: {TRAIN_RATIO:.0%} train / {VAL_RATIO:.0%} val / {TEST_RATIO:.0%} test")

Configuration:
  Session gap: 24 hours
  Min session length: 2 events
  Max session length: 500 events
  Data split: 70% train / 15% val / 15% test


In [3]:
# --- LOAD DATA ---

# Use fixed filename (no timestamp)
data_dir = Path('./data')
unified_events_file = data_dir / 'unified_events.parquet'

if not unified_events_file.exists():
    raise FileNotFoundError("No unified_events.parquet file found. Please run 01_data_pull.ipynb first.")

print(f"Loading data from: {unified_events_file}")

df_events = pd.read_parquet(unified_events_file)
print(f"Loaded {len(df_events):,} events")
print(f"Unique users: {df_events['USER_ID'].nunique():,}")
print(f"Date range: {df_events['event_timestamp'].min()} to {df_events['event_timestamp'].max()}")

# Ensure timestamp is datetime
df_events['event_timestamp'] = pd.to_datetime(df_events['event_timestamp'])

Loading data from: data/unified_events.parquet
Loaded 5,408 events
Unique users: 24
Date range: 2025-09-05 00:05:17.255000 to 2025-09-06 23:59:37


## Sessionization

In [4]:
def sessionize_user_events(user_df: pd.DataFrame, gap_hours: int = 24) -> pd.DataFrame:
    """
    Split a user's event stream into sessions based on inactivity gaps.
    
    Args:
        user_df: DataFrame of events for a single user, sorted by timestamp
        gap_hours: Hours of inactivity that define a new session
    
    Returns:
        DataFrame with session_id column added
    """
    user_df = user_df.sort_values('event_timestamp').copy()
    
    # Calculate time gaps between consecutive events
    user_df['time_gap'] = user_df['event_timestamp'].diff()
    
    # Mark session boundaries where gap exceeds threshold
    session_gap = pd.Timedelta(hours=gap_hours)
    user_df['new_session'] = (user_df['time_gap'] > session_gap) | user_df['time_gap'].isna()
    
    # Assign session IDs using cumsum
    user_df['session_id'] = user_df['new_session'].cumsum()
    
    # Clean up temporary columns
    user_df = user_df.drop(['time_gap', 'new_session'], axis=1)
    
    return user_df

In [5]:
# Perform sessionization for all users
print("\nSessionizing user events...")

# Apply sessionization per user
sessionized_dfs = []

for user_id in tqdm(df_events['USER_ID'].unique(), desc="Processing users"):
    user_events = df_events[df_events['USER_ID'] == user_id]
    user_sessions = sessionize_user_events(user_events, SESSION_GAP_HOURS)
    # Create global session ID by combining user_id and session_id
    user_sessions['global_session_id'] = user_id + '_' + user_sessions['session_id'].astype(str)
    sessionized_dfs.append(user_sessions)

df_sessionized = pd.concat(sessionized_dfs, ignore_index=True)

print(f"\nTotal sessions created: {df_sessionized['global_session_id'].nunique():,}")

# Filter out sessions that are too short
session_lengths = df_sessionized.groupby('global_session_id').size()
valid_sessions = session_lengths[session_lengths >= MIN_SESSION_LENGTH].index
df_sessionized = df_sessionized[df_sessionized['global_session_id'].isin(valid_sessions)]

print(f"Valid sessions (>= {MIN_SESSION_LENGTH} events): {df_sessionized['global_session_id'].nunique():,}")


Sessionizing user events...


Processing users: 100%|██████████| 24/24 [00:00<00:00, 841.60it/s]


Total sessions created: 24
Valid sessions (>= 2 events): 22





## Vocabulary Creation

In [6]:
def create_vocabularies(df: pd.DataFrame) -> Dict[str, Dict]:
    """
    Create vocabulary mappings for event types and product IDs.
    
    Returns:
        Dictionary containing event and item vocabularies
    """
    print("\nCreating vocabularies...")
    
    # Event type vocabulary (fixed ordering)
    event_types = ['auction', 'impression', 'click', 'purchase']
    event_to_int = {event: i for i, event in enumerate(event_types)}
    int_to_event = {i: event for event, i in event_to_int.items()}
    
    print(f"Event types: {len(event_to_int)}")
    for event, idx in event_to_int.items():
        print(f"  {event}: {idx}")
    
    # Product/Item vocabulary
    # Reserve special indices
    item_to_int = {
        PAD_TOKEN: 0,
        AUCTION_TOKEN: 1,
        UNK_TOKEN: 2
    }
    
    # Get all unique product IDs
    unique_products = df['product_id'].dropna().unique()
    print(f"\nUnique products: {len(unique_products):,}")
    
    # Assign indices to products
    for i, product in enumerate(sorted(unique_products), start=3):
        item_to_int[product] = i
    
    int_to_item = {i: item for item, i in item_to_int.items()}
    
    print(f"Total vocabulary size: {len(item_to_int):,}")
    print(f"  Special tokens: {PAD_TOKEN}(0), {AUCTION_TOKEN}(1), {UNK_TOKEN}(2)")
    print(f"  Product IDs: 3-{len(item_to_int)-1}")
    
    vocab = {
        'event_to_int': event_to_int,
        'int_to_event': int_to_event,
        'item_to_int': item_to_int,
        'int_to_item': int_to_item,
        'special_tokens': {
            'pad': PAD_TOKEN,
            'auction': AUCTION_TOKEN,
            'unk': UNK_TOKEN
        }
    }
    
    return vocab

In [7]:
# Create vocabularies
vocab = create_vocabularies(df_sessionized)

# Save vocabularies
models_dir = Path('./models')
models_dir.mkdir(exist_ok=True)

vocab_path = models_dir / 'vocab.json'
with open(vocab_path, 'w') as f:
    # Convert int keys to strings for JSON serialization
    vocab_serializable = {
        'event_to_int': vocab['event_to_int'],
        'int_to_event': {str(k): v for k, v in vocab['int_to_event'].items()},
        'item_to_int': {str(k): v for k, v in vocab['item_to_int'].items()},
        'int_to_item': {str(k): v for k, v in vocab['int_to_item'].items()},
        'special_tokens': vocab['special_tokens']
    }
    json.dump(vocab_serializable, f, indent=2)
print(f"\nVocabulary saved to {vocab_path}")


Creating vocabularies...
Event types: 4
  auction: 0
  impression: 1
  click: 2
  purchase: 3

Unique products: 2,952
Total vocabulary size: 2,955
  Special tokens: [PAD](0), [AUCTION](1), [UNK](2)
  Product IDs: 3-2954

Vocabulary saved to models/vocab.json


## Transform Sessions to Model Format

In [8]:
def transform_session_to_sequence(
    session_df: pd.DataFrame,
    vocab: Dict[str, Dict]
) -> Tuple[List[Tuple[int, int, float]], int, int]:
    """
    Transform a session DataFrame into the (X, T, Y) format for modeling.
    
    CRITICAL: X must only contain pre-treatment events to avoid conditioning on bad controls!
    
    Args:
        session_df: DataFrame of events in a single session
        vocab: Vocabulary mappings
    
    Returns:
        X: List of (event_type_id, item_id, timedelta_minutes) tuples (PRE-TREATMENT ONLY)
        T: Binary treatment indicator (1 if any click in session)
        Y: Binary outcome indicator (1 if any purchase in session)
    """
    session_df = session_df.sort_values('event_timestamp').copy()
    
    # Get first timestamp as reference
    first_timestamp = session_df['event_timestamp'].iloc[0]
    
    # Find the first click event (if any)
    click_mask = session_df['event_type'] == 'click'
    has_click = click_mask.any()
    
    # Determine treatment
    T = 1 if has_click else 0
    
    # Determine outcome (ANY purchase in the full session)
    has_purchase = (session_df['event_type'] == 'purchase').any()
    Y = 1 if has_purchase else 0
    
    # CRITICAL: Select events for X based on treatment status
    if has_click:
        # Find index of first click
        first_click_idx = click_mask.idxmax()  # Gets index of first True value
        first_click_position = session_df.index.get_loc(first_click_idx)
        
        # Use only events BEFORE the first click for X
        pre_treatment_df = session_df.iloc[:first_click_position]
        
        # If no events before click, use a minimal sequence
        if len(pre_treatment_df) == 0:
            # At least include the auction that led to the click
            pre_treatment_df = session_df.iloc[:1]
    else:
        # No click - use all events for X
        pre_treatment_df = session_df
    
    # Build sequence X from pre-treatment events only
    X = []
    for _, row in pre_treatment_df.iterrows():
        # Get event type ID
        event_type = row['event_type']
        event_type_id = vocab['event_to_int'][event_type]
        
        # Get item ID
        if event_type == 'auction':
            item_id = vocab['item_to_int'][AUCTION_TOKEN]
        elif pd.isna(row['product_id']):
            item_id = vocab['item_to_int'][UNK_TOKEN]
        else:
            product_id = row['product_id']
            item_id = vocab['item_to_int'].get(product_id, vocab['item_to_int'][UNK_TOKEN])
        
        # Calculate time delta in minutes
        timedelta_minutes = (row['event_timestamp'] - first_timestamp).total_seconds() / 60.0
        
        # Append to sequence
        X.append((event_type_id, item_id, timedelta_minutes))
    
    # Ensure minimum sequence length
    if len(X) < MIN_SESSION_LENGTH:
        # Pad with auction events if too short
        while len(X) < MIN_SESSION_LENGTH:
            X.append((vocab['event_to_int']['auction'], vocab['item_to_int'][AUCTION_TOKEN], 0.0))
    
    # Truncate if too long
    if len(X) > MAX_SESSION_LENGTH:
        X = X[:MAX_SESSION_LENGTH]
    
    return X, T, Y

In [9]:
# Transform all sessions
print("\nTransforming sessions to model format...")

session_data = []

for session_id in tqdm(df_sessionized['global_session_id'].unique(), desc="Processing sessions"):
    session_df = df_sessionized[df_sessionized['global_session_id'] == session_id]
    
    X, T, Y = transform_session_to_sequence(session_df, vocab)
    
    # Extract user_id from global_session_id
    user_id = session_id.rsplit('_', 1)[0]
    
    session_data.append({
        'session_id': session_id,
        'user_id': user_id,
        'sequence': X,
        'sequence_length': len(X),
        'treatment': T,
        'outcome': Y
    })

df_sessions = pd.DataFrame(session_data)

print(f"\nTotal sessions: {len(df_sessions):,}")
print(f"Sessions with treatment (click): {df_sessions['treatment'].sum():,} ({df_sessions['treatment'].mean():.1%})")
print(f"Sessions with outcome (purchase): {df_sessions['outcome'].sum():,} ({df_sessions['outcome'].mean():.1%})")

# Analyze sequence lengths
print(f"\nSequence length statistics:")
print(f"  Mean: {df_sessions['sequence_length'].mean():.1f}")
print(f"  Median: {df_sessions['sequence_length'].median():.1f}")
print(f"  Min: {df_sessions['sequence_length'].min()}")
print(f"  Max: {df_sessions['sequence_length'].max()}")
print(f"  95th percentile: {df_sessions['sequence_length'].quantile(0.95):.1f}")


Transforming sessions to model format...


Processing sessions: 100%|██████████| 22/22 [00:00<00:00, 670.26it/s]


Total sessions: 22
Sessions with treatment (click): 16 (72.7%)
Sessions with outcome (purchase): 16 (72.7%)

Sequence length statistics:
  Mean: 46.7
  Median: 26.0
  Min: 2
  Max: 250
  95th percentile: 111.6





## Train/Validation/Test Split

In [10]:
def user_level_split(
    df: pd.DataFrame,
    train_ratio: float = 0.7,
    val_ratio: float = 0.15,
    test_ratio: float = 0.15,
    random_state: int = 42
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Split data at the user level to prevent leakage.
    
    Returns:
        train_df, val_df, test_df
    """
    assert abs(train_ratio + val_ratio + test_ratio - 1.0) < 1e-6, "Ratios must sum to 1"
    
    # Get unique users
    unique_users = df['user_id'].unique()
    np.random.seed(random_state)
    np.random.shuffle(unique_users)
    
    # Calculate split points
    n_users = len(unique_users)
    train_end = int(n_users * train_ratio)
    val_end = int(n_users * (train_ratio + val_ratio))
    
    # Split users
    train_users = unique_users[:train_end]
    val_users = unique_users[train_end:val_end]
    test_users = unique_users[val_end:]
    
    # Split dataframe
    train_df = df[df['user_id'].isin(train_users)].copy()
    val_df = df[df['user_id'].isin(val_users)].copy()
    test_df = df[df['user_id'].isin(test_users)].copy()
    
    return train_df, val_df, test_df

In [11]:
# Perform train/val/test split
print("\nSplitting data at user level...")

train_df, val_df, test_df = user_level_split(
    df_sessions,
    train_ratio=TRAIN_RATIO,
    val_ratio=VAL_RATIO,
    test_ratio=TEST_RATIO,
    random_state=RANDOM_SEED
)

print(f"\nTrain set:")
print(f"  Sessions: {len(train_df):,}")
print(f"  Users: {train_df['user_id'].nunique():,}")
print(f"  Treatment rate: {train_df['treatment'].mean():.1%}")
print(f"  Outcome rate: {train_df['outcome'].mean():.1%}")

print(f"\nValidation set:")
print(f"  Sessions: {len(val_df):,}")
print(f"  Users: {val_df['user_id'].nunique():,}")
print(f"  Treatment rate: {val_df['treatment'].mean():.1%}")
print(f"  Outcome rate: {val_df['outcome'].mean():.1%}")

print(f"\nTest set:")
print(f"  Sessions: {len(test_df):,}")
print(f"  Users: {test_df['user_id'].nunique():,}")
print(f"  Treatment rate: {test_df['treatment'].mean():.1%}")
print(f"  Outcome rate: {test_df['outcome'].mean():.1%}")

# Verify no user overlap
train_users = set(train_df['user_id'].unique())
val_users = set(val_df['user_id'].unique())
test_users = set(test_df['user_id'].unique())

assert len(train_users & val_users) == 0, "Train and validation sets have user overlap!"
assert len(train_users & test_users) == 0, "Train and test sets have user overlap!"
assert len(val_users & test_users) == 0, "Validation and test sets have user overlap!"
print("\n✓ No user overlap between splits")


Splitting data at user level...

Train set:
  Sessions: 15
  Users: 15
  Treatment rate: 66.7%
  Outcome rate: 80.0%

Validation set:
  Sessions: 3
  Users: 3
  Treatment rate: 66.7%
  Outcome rate: 33.3%

Test set:
  Sessions: 4
  Users: 4
  Treatment rate: 100.0%
  Outcome rate: 75.0%

✓ No user overlap between splits


## Save Processed Data

In [12]:
# Save processed datasets with FIXED FILENAMES (no timestamps)
print("\nSaving processed datasets...")

# Use fixed filenames that overwrite
train_path = data_dir / 'train_sessions.parquet'
val_path = data_dir / 'validation_sessions.parquet'
test_path = data_dir / 'test_sessions.parquet'

train_df.to_parquet(train_path, index=False)
val_df.to_parquet(val_path, index=False)
test_df.to_parquet(test_path, index=False)

print(f"  Train saved to: {train_path.name}")
print(f"  Validation saved to: {val_path.name}")
print(f"  Test saved to: {test_path.name}")

# ALSO SAVE AS PICKLE FOR COMPATIBILITY
train_pkl_path = data_dir / 'train_sessions.pkl'
val_pkl_path = data_dir / 'validation_sessions.pkl'
test_pkl_path = data_dir / 'test_sessions.pkl'

import pickle

with open(train_pkl_path, 'wb') as f:
    pickle.dump(train_df, f)
with open(val_pkl_path, 'wb') as f:
    pickle.dump(val_df, f)
with open(test_pkl_path, 'wb') as f:
    pickle.dump(test_df, f)

print(f"\n  Also saved as pickle for compatibility:")
print(f"  Train: {train_pkl_path.name}")
print(f"  Val: {val_pkl_path.name}")
print(f"  Test: {test_pkl_path.name}")

# Save metadata (fixed filename)
metadata = {
    'session_gap_hours': SESSION_GAP_HOURS,
    'min_session_length': MIN_SESSION_LENGTH,
    'max_session_length': MAX_SESSION_LENGTH,
    'vocab_size': {
        'events': len(vocab['event_to_int']),
        'items': len(vocab['item_to_int'])
    },
    'data_splits': {
        'train': {
            'sessions': len(train_df),
            'users': train_df['user_id'].nunique(),
            'treatment_rate': float(train_df['treatment'].mean()),
            'outcome_rate': float(train_df['outcome'].mean())
        },
        'validation': {
            'sessions': len(val_df),
            'users': val_df['user_id'].nunique(),
            'treatment_rate': float(val_df['treatment'].mean()),
            'outcome_rate': float(val_df['outcome'].mean())
        },
        'test': {
            'sessions': len(test_df),
            'users': test_df['user_id'].nunique(),
            'treatment_rate': float(test_df['treatment'].mean()),
            'outcome_rate': float(test_df['outcome'].mean())
        }
    }
}

metadata_path = data_dir / 'processing_metadata.json'
with open(metadata_path, 'w') as f:
    json.dump(metadata, f, indent=2)

print(f"\n  Metadata saved to: {metadata_path.name}")

print("\n" + "="*80)
print("DATA PROCESSING COMPLETE")
print("="*80)
print("\nReady for model training!")
print(f"Vocabulary: {vocab_path}")
print(f"Train data: {train_path}")
print(f"Validation data: {val_path}")
print(f"Test data: {test_path}")


Saving processed datasets...
  Train saved to: train_sessions.parquet
  Validation saved to: validation_sessions.parquet
  Test saved to: test_sessions.parquet

  Also saved as pickle for compatibility:
  Train: train_sessions.pkl
  Val: validation_sessions.pkl
  Test: test_sessions.pkl

  Metadata saved to: processing_metadata.json

DATA PROCESSING COMPLETE

Ready for model training!
Vocabulary: models/vocab.json
Train data: data/train_sessions.parquet
Validation data: data/validation_sessions.parquet
Test data: data/test_sessions.parquet


## Analysis and Sanity Checks

In [13]:
# Analyze treatment effect correlation
print("\nTreatment-Outcome Analysis:")
print("="*50)

for name, df in [('Train', train_df), ('Validation', val_df), ('Test', test_df)]:
    # Calculate purchase rates by treatment status
    treated = df[df['treatment'] == 1]
    control = df[df['treatment'] == 0]
    
    treated_purchase_rate = treated['outcome'].mean() if len(treated) > 0 else 0
    control_purchase_rate = control['outcome'].mean() if len(control) > 0 else 0
    
    print(f"\n{name} Set:")
    print(f"  Control (no click) purchase rate: {control_purchase_rate:.2%}")
    print(f"  Treated (click) purchase rate: {treated_purchase_rate:.2%}")
    print(f"  Naive lift: {(treated_purchase_rate - control_purchase_rate):.2%}")
    
print("\nNote: This naive lift includes selection bias and is not the true causal effect.")
print("The Causal Transformer will estimate the true effect.")


Treatment-Outcome Analysis:

Train Set:
  Control (no click) purchase rate: 80.00%
  Treated (click) purchase rate: 80.00%
  Naive lift: 0.00%

Validation Set:
  Control (no click) purchase rate: 0.00%
  Treated (click) purchase rate: 50.00%
  Naive lift: 50.00%

Test Set:
  Control (no click) purchase rate: 0.00%
  Treated (click) purchase rate: 75.00%
  Naive lift: 75.00%

Note: This naive lift includes selection bias and is not the true causal effect.
The Causal Transformer will estimate the true effect.


In [14]:
# Sample sequence inspection
print("\nSample Sequences:")
print("="*50)

# Get a few interesting examples
examples = [
    train_df[train_df['outcome'] == 1].iloc[0],  # Purchase example
    train_df[(train_df['treatment'] == 1) & (train_df['outcome'] == 0)].iloc[0],  # Click but no purchase
    train_df[(train_df['treatment'] == 0) & (train_df['outcome'] == 1)].iloc[0] if len(train_df[(train_df['treatment'] == 0) & (train_df['outcome'] == 1)]) > 0 else None  # Purchase without click
]

for i, example in enumerate(examples, 1):
    if example is None:
        continue
        
    print(f"\nExample {i}:")
    print(f"  Session ID: {example['session_id']}")
    print(f"  Treatment: {example['treatment']}, Outcome: {example['outcome']}")
    print(f"  Sequence length: {example['sequence_length']}")
    print(f"  First 5 events:")
    
    for j, (event_type_id, item_id, timedelta) in enumerate(example['sequence'][:5]):
        event_name = vocab['int_to_event'][event_type_id]
        print(f"    {j+1}. {event_name} (item={item_id}, time={timedelta:.1f}min)")
    
    if len(example['sequence']) > 5:
        print(f"    ... ({len(example['sequence']) - 5} more events)")


Sample Sequences:

Example 1:
  Session ID: ext1:0bd275fb-ce99-4062-b49b-9ed813f01791_1
  Treatment: 1, Outcome: 1
  Sequence length: 21
  First 5 events:
    1. auction (item=1, time=0.0min)
    2. auction (item=1, time=0.8min)
    3. auction (item=1, time=5.4min)
    4. impression (item=129, time=5.4min)
    5. impression (item=72, time=5.4min)
    ... (16 more events)

Example 2:
  Session ID: ext1:369ec0f1-6f2b-477c-b571-cbf43a4fda65_1
  Treatment: 1, Outcome: 0
  Sequence length: 100
  First 5 events:
    1. auction (item=1, time=0.0min)
    2. impression (item=690, time=0.0min)
    3. impression (item=1083, time=0.0min)
    4. auction (item=1, time=0.1min)
    5. auction (item=1, time=0.3min)
    ... (95 more events)

Example 3:
  Session ID: ext1:91f1cbc8-4881-4e1e-9cdd-1a2ecd3f094c_1
  Treatment: 0, Outcome: 1
  Sequence length: 250
  First 5 events:
    1. auction (item=1, time=0.0min)
    2. impression (item=1356, time=0.0min)
    3. impression (item=1913, time=0.0min)
    4