# Deep Learning Ad Recommender - Complete Tutorial

This notebook demonstrates the complete pipeline for building a production-ready ad recommendation system with two-stage retrieval.

## Architecture

1. **Stage 1**: Two-Tower Neural Network (1M → 500 candidates in <50ms)
2. **Stage 2**: Transformer Ranker (500 → 10 ads in <50ms)
3. **FAISS**: Fast similarity search over the ad embeddings (the index that Stage 1 retrieves candidates from)

## Contents

1. Data Preparation
2. Two-Tower Model Training
3. FAISS Index Building
4. Transformer Ranker Training
5. Inference & Evaluation
6. Performance Analysis

## Setup

In [None]:
import sys
import time
sys.path.append('/home/claude/ad_recommender')

import torch
from torch.utils.data import DataLoader
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm

# Our modules
from data_preprocessing import CriteoDataPreprocessor, create_synthetic_criteo_data
from two_tower_model import TwoTowerModel
from transformer_ranker import TransformerRanker
from faiss_retrieval import FAISSIndex
from training_pipeline import AdDataset, TwoTowerTrainer, TransformerTrainer
from inference import AdRecommenderInference

# Configure plotting defaults for every figure in the notebook.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Seed the global NumPy and PyTorch generators so that the random draws made
# later in the notebook (np.random.*, torch.randn, torch.randint, and any
# PyTorch-side random initialisation) repeat across "Restart & Run All".
# NOTE: DataLoader worker processes and some CUDA kernels can still introduce
# run-to-run variation; this makes the notebook repeatable, not bit-exact.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Prefer the GPU when PyTorch can see one; models and tensors below are moved
# to this device.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

## 1. Data Preparation

We'll create synthetic Criteo-like data for this demo. In production, use the real Criteo dataset instead.

In [None]:
# Build the synthetic demo dataset; the generator is also given a path to
# save the data to.
print("Creating synthetic dataset...")
demo_data_path = '/home/claude/ad_recommender/data/demo_data.txt'
df = create_synthetic_criteo_data(n_samples=50000, save_path=demo_data_path)

# Quick look at the result: first rows, overall shape, and the mean of the
# label column (the click-through rate, assuming 0/1 labels).
print("\nDataset preview:")
print(df.head())
overall_ctr = df['label'].mean()
print(f"\nShape: {df.shape}")
print(f"CTR: {overall_ctr:.4f}")

In [None]:
# Visualize the label balance (left) and three numerical features (right).
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# Label distribution
df['label'].value_counts().plot(kind='bar', ax=axes[0])
axes[0].set_title('Label Distribution')
axes[0].set_xlabel('Label')
axes[0].set_ylabel('Count')

# Numerical feature distribution.
# DataFrame.hist() draws one subplot per column; when it is handed a single
# Axes for several columns pandas warns and clears the whole figure, which
# would erase the label bar chart drawn above. DataFrame.plot(kind='hist')
# instead overlays all columns on the one Axes it is given.
df[['I1', 'I2', 'I3']].plot(kind='hist', ax=axes[1], bins=30, alpha=0.7)
axes[1].set_title('Numerical Features Distribution')
axes[1].set_xlabel('Value')
axes[1].set_ylabel('Count')

plt.tight_layout()
plt.show()

In [None]:
# Split and preprocess.
# First hold out 30% of the rows, then halve that hold-out into validation
# and test sets; both splits use a fixed random_state so they are repeatable.
holdout_fraction = 0.3
train_df, temp_df = train_test_split(df, test_size=holdout_fraction, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

print(f"Train: {len(train_df):,} samples")
print(f"Val:   {len(val_df):,} samples")
print(f"Test:  {len(test_df):,} samples")

# Fit the preprocessor on the training rows only, then apply the already
# fitted preprocessor to the validation and test rows (in that order).
preprocessor = CriteoDataPreprocessor()
train_data = preprocessor.fit_transform(train_df)
val_data, test_data = (preprocessor.transform(frame) for frame in (val_df, test_df))

print(f"\nFeature dimensions: {preprocessor.feature_dims}")

## 2. Two-Tower Model Training (Stage 1)

The two-tower model learns separate embeddings for users and ads.

In [None]:
# Prepare features
def split_features(data, num_user_cat=6):
    """Split preprocessed features into user-side and ad-side inputs.

    Args:
        data: Mapping with the keys 'categorical', 'numerical' and 'labels'.
            'categorical' must support 2-D slicing (rows, columns).
        num_user_cat: Number of leading categorical columns that belong to
            the user; the remaining columns are treated as ad features.
            Defaults to 6, the value this notebook uses throughout.

    Returns:
        dict with keys 'user_categorical', 'ad_categorical', 'numerical'
        and 'labels'. The numerical features and labels are passed through
        unchanged.
    """
    categorical = data['categorical']
    return {
        'user_categorical': categorical[:, :num_user_cat],
        'ad_categorical': categorical[:, num_user_cat:],
        'numerical': data['numerical'],
        'labels': data['labels']
    }

# Apply the same user/ad column split to the training and validation data.
train_split, val_split = (split_features(part) for part in (train_data, val_data))

In [None]:
# Create datasets
from torch.utils.data import DataLoader


def _build_ad_dataset(split):
    """Wrap one feature split (a dict produced by split_features) in an AdDataset."""
    return AdDataset(
        user_categorical=split['user_categorical'],
        ad_categorical=split['ad_categorical'],
        numerical=split['numerical'],
        labels=split['labels']
    )


train_dataset = _build_ad_dataset(train_split)
val_dataset = _build_ad_dataset(val_split)

# Training batches are shuffled; validation keeps dataset order (shuffle=False)
# and uses a larger batch size.
train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True, num_workers=2)
val_loader = DataLoader(val_dataset, batch_size=512, shuffle=False, num_workers=2)

print(f"Train batches: {len(train_loader)}")
print(f"Val batches: {len(val_loader)}")

In [None]:
# Create model
# Columns C1-C6 are fed to the user tower and C7-C26 to the ad tower, matching
# the 6 / 20 column split used for the categorical features.
user_cat_cols = [f'C{idx}' for idx in range(1, 7)]
ad_cat_cols = [f'C{idx}' for idx in range(7, 27)]

# Per-column dimensions reported by the fitted preprocessor.
fitted_dims = preprocessor.feature_dims
user_feature_dims = {name: fitted_dims[name] for name in user_cat_cols}
ad_feature_dims = {name: fitted_dims[name] for name in ad_cat_cols}

two_tower_config = dict(
    user_feature_dims=user_feature_dims,
    ad_feature_dims=ad_feature_dims,
    numerical_dim=13,
    embedding_dim=16,
    hidden_dims=[256, 128],  # Smaller for demo
    output_dim=128,
    dropout=0.3,
)
two_tower_model = TwoTowerModel(**two_tower_config)

two_tower_param_count = sum(weights.numel() for weights in two_tower_model.parameters())
print(f"Model parameters: {two_tower_param_count:,}")

In [None]:
# Train the two-tower model for a few epochs (quick demo run).
two_tower_save_dir = '/home/claude/ad_recommender/models'

trainer = TwoTowerTrainer(model=two_tower_model, device=device, learning_rate=0.001)
trainer.train(
    train_loader=train_loader,
    val_loader=val_loader,
    epochs=3,  # Quick demo
    save_dir=two_tower_save_dir,
)

In [None]:
# Plot the loss histories recorded on the two-tower trainer.
plt.figure(figsize=(10, 5))
two_tower_curves = (
    (trainer.train_losses, 'Train Loss', 'o'),
    (trainer.val_losses, 'Val Loss', 's'),
)
for loss_history, curve_label, curve_marker in two_tower_curves:
    plt.plot(loss_history, label=curve_label, marker=curve_marker)
plt.title('Two-Tower Training Progress')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.grid(True, alpha=0.3)
plt.legend()
plt.show()

## 3. FAISS Index Building

Build an index of ad embeddings for fast retrieval.

In [None]:
# Generate ad embeddings
# Embed the ad-side features of every validation row. val_loader is built with
# shuffle=False, so row i of the stacked result comes from row i of the
# validation dataset.
two_tower_model.eval()
embedding_chunks = []

with torch.no_grad():
    for batch in tqdm(val_loader, desc="Generating embeddings"):
        ad_features = batch['ad_categorical'].to(device)
        chunk = two_tower_model.get_ad_embeddings(ad_features)
        # Move each batch to host memory before accumulating.
        embedding_chunks.append(chunk.cpu().numpy())

# Stack the per-batch arrays row-wise into a single array.
ad_embeddings = np.vstack(embedding_chunks)
print(f"Generated {len(ad_embeddings)} ad embeddings")
print(f"Embedding dimension: {ad_embeddings.shape[1]}")

In [None]:
# Create and populate FAISS index
# The index dimension is taken from the width of the embedding matrix.
index_dim = ad_embeddings.shape[1]
faiss_index = FAISSIndex(
    dimension=index_dim,
    index_type='Flat',  # Exact search for small dataset
    use_gpu=False,
)
faiss_index.add(ad_embeddings)

print(f"\nIndex contains {faiss_index.index.ntotal} vectors")

In [None]:
# Benchmark retrieval speed
# (`time` is imported with the other modules in the setup cell.)
NUM_BENCH_QUERIES = 100   # upper bound on queries taken from the first batch
NUM_BENCH_RUNS = 10       # timed repetitions of the batched search

# Generate query embeddings from the first validation batch.
test_batch = next(iter(val_loader))
user_cat = test_batch['user_categorical'][:NUM_BENCH_QUERIES].to(device)
user_num = test_batch['numerical'][:NUM_BENCH_QUERIES].to(device)

with torch.no_grad():
    user_emb = two_tower_model.get_user_embeddings(user_cat, user_num)
    user_emb_np = user_emb.cpu().numpy()

# The batch may hold fewer rows than requested, so count what we actually got
# rather than assuming NUM_BENCH_QUERIES when computing the per-query figure.
num_queries = len(user_emb_np)

# One untimed search so that first-call overhead does not skew the average.
faiss_index.search(user_emb_np, k=100)

# Benchmark. perf_counter() is a monotonic, high-resolution clock intended for
# measuring short intervals; time.time() can jump and may be coarse.
times = []
for _ in range(NUM_BENCH_RUNS):
    start = time.perf_counter()
    candidates, scores = faiss_index.search(user_emb_np, k=100)
    times.append((time.perf_counter() - start) * 1000)

print(f"Average retrieval time: {np.mean(times):.2f}ms")
# Four decimals: per-query times are a small fraction of a millisecond and
# would otherwise round to 0.00.
print(f"Per-query time: {np.mean(times)/num_queries:.4f}ms")

## 4. Transformer Ranker Training (Stage 2)

The transformer ranker refines the candidate set using attention and feature interactions.

In [None]:
# Create multi-task datasets

def _simulate_task_labels(click_labels, threshold, rng):
    """Derive a synthetic binary auxiliary label from the click labels.

    Each sample's click label is multiplied by an independent uniform draw in
    [0, 1); the result is 1.0 where that product exceeds `threshold` and 0.0
    elsewhere. With 0/1 click labels this means un-clicked samples are always
    0 and clicked samples are positive with probability 1 - threshold.

    Args:
        click_labels: 1-D NumPy array of click labels.
        threshold: Cut-off applied to label * uniform draw.
        rng: numpy.random.Generator used for the draws.

    Returns:
        Float NumPy array of 0.0 / 1.0 with the same length as `click_labels`.
    """
    draws = rng.random(len(click_labels))
    return (click_labels * draws > threshold).astype(float)


# A dedicated, seeded generator makes the simulated engagement / revenue
# labels identical on every run, independent of any other np.random use.
label_rng = np.random.default_rng(42)

train_engagement = _simulate_task_labels(train_split['labels'], 0.3, label_rng)
train_revenue = _simulate_task_labels(train_split['labels'], 0.2, label_rng)

val_engagement = _simulate_task_labels(val_split['labels'], 0.3, label_rng)
val_revenue = _simulate_task_labels(val_split['labels'], 0.2, label_rng)

# Same features as the two-tower datasets, plus the two auxiliary targets.
train_dataset_mt = AdDataset(
    user_categorical=train_split['user_categorical'],
    ad_categorical=train_split['ad_categorical'],
    numerical=train_split['numerical'],
    labels=train_split['labels'],
    engagement_labels=train_engagement,
    revenue_labels=train_revenue
)

val_dataset_mt = AdDataset(
    user_categorical=val_split['user_categorical'],
    ad_categorical=val_split['ad_categorical'],
    numerical=val_split['numerical'],
    labels=val_split['labels'],
    engagement_labels=val_engagement,
    revenue_labels=val_revenue
)

train_loader_mt = DataLoader(train_dataset_mt, batch_size=256, shuffle=True, num_workers=2)
val_loader_mt = DataLoader(val_dataset_mt, batch_size=512, shuffle=False, num_workers=2)

In [None]:
# Create transformer model
# The ranker reuses the per-column feature dimensions defined for the
# two-tower model.
ranker_config = dict(
    user_feature_dims=user_feature_dims,
    ad_feature_dims=ad_feature_dims,
    numerical_dim=13,
    embedding_dim=16,
    d_model=128,  # Smaller for demo
    num_heads=4,
    num_layers=2,
    d_ff=512,
    dropout=0.1,
)
transformer_ranker = TransformerRanker(**ranker_config)

ranker_param_count = sum(weights.numel() for weights in transformer_ranker.parameters())
print(f"Model parameters: {ranker_param_count:,}")

In [None]:
# Train the ranker on the multi-task loaders.
# The task weights are passed to the trainer: CTR is weighted highest, then
# engagement, then revenue.
ranker_task_weights = {'ctr': 1.0, 'engagement': 0.5, 'revenue': 0.3}
ranker_save_dir = '/home/claude/ad_recommender/models'

transformer_trainer = TransformerTrainer(
    model=transformer_ranker,
    device=device,
    learning_rate=0.0001,
    task_weights=ranker_task_weights,
)
transformer_trainer.train(
    train_loader=train_loader_mt,
    val_loader=val_loader_mt,
    epochs=3,
    save_dir=ranker_save_dir,
)

In [None]:
# Plot the loss histories recorded on the transformer trainer.
plt.figure(figsize=(10, 5))
ranker_curves = (
    (transformer_trainer.train_losses, 'Train Loss', 'o'),
    (transformer_trainer.val_losses, 'Val Loss', 's'),
)
for loss_history, curve_label, curve_marker in ranker_curves:
    plt.plot(loss_history, label=curve_label, marker=curve_marker)
plt.title('Transformer Ranker Training Progress')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.grid(True, alpha=0.3)
plt.legend()
plt.show()

## 5. Inference & Evaluation

Test the complete two-stage system.

In [None]:
# Create synthetic user for demo
# Six categorical values ('cat_<0..49>') keyed C1-C6 and thirteen numerical
# values in [0, 100) keyed I1-I13. The categorical draws are made first, then
# the numerical ones.
# NOTE(review): this dict is only printed here; the inference cells that
# follow build their inputs from dummy tensors instead.
demo_categorical = {}
for idx in range(1, 7):
    demo_categorical[f'C{idx}'] = f'cat_{np.random.randint(0, 50)}'

demo_numerical = {}
for idx in range(1, 14):
    demo_numerical[f'I{idx}'] = np.random.random() * 100

user_data = {'categorical': demo_categorical, 'numerical': demo_numerical}

print("User features:")
print(f"  Categorical: {list(user_data['categorical'].values())[:3]}...")
print(f"  Numerical: {list(user_data['numerical'].values())[:3]}...")

In [None]:
# Manual two-stage inference
# Stage 1: Retrieve candidates
# A single query row: six placeholder categorical indices and thirteen
# standard-normal numerical features.
dummy_user_indices = [[0, 1, 2, 3, 4, 5]]
user_cat_tensor = torch.tensor(dummy_user_indices).to(device)  # Dummy indices
user_num_tensor = torch.randn(1, 13).to(device)

# Embed the query with the user tower and hand it to FAISS as a NumPy array.
with torch.no_grad():
    user_emb = two_tower_model.get_user_embeddings(user_cat_tensor, user_num_tensor)
    user_emb_np = user_emb.cpu().numpy()

candidates, stage1_scores = faiss_index.search(user_emb_np, k=100)
retrieved_for_user = candidates[0]
print(f"Stage 1: Retrieved {len(retrieved_for_user)} candidates")
print(f"  Top scores: {stage1_scores[0][:5]}")

In [None]:
# Stage 2: Rank candidates
candidate_ids = candidates[0]
num_candidates = len(candidate_ids)

# Repeat the single user's features once per candidate so each row of the
# batch pairs the same user with a different ad.
batch_user_cat = user_cat_tensor.repeat(num_candidates, 1)
batch_user_num = user_num_tensor.repeat(num_candidates, 1)

# Score the ads that were actually retrieved. The index was filled from the
# validation loader in dataset order, so a candidate id is a row number into
# the validation split (assumes FAISSIndex.search returns positional ids -
# TODO confirm). Random ad features here would make the printed
# "Ad <id>: CTR" pairs meaningless.
batch_ad_cat = torch.as_tensor(
    val_split['ad_categorical'][candidate_ids], dtype=torch.long
).to(device)

# Put the ranker in inference mode (disables dropout), as is done for the
# two-tower model before embedding generation.
transformer_ranker.eval()
with torch.no_grad():
    predictions = transformer_ranker(batch_user_cat, batch_ad_cat, batch_user_num)
    # Flatten so the scores are 1-D whether the CTR head returns (N,) or (N, 1);
    # argsort over an (N, 1) array would not rank the candidates.
    ctr_scores = torch.sigmoid(predictions['ctr']).cpu().numpy().reshape(-1)

# Get top-10: argsort is ascending, so reverse it and keep the first ten.
top_indices = np.argsort(ctr_scores)[::-1][:10]
final_ads = candidate_ids[top_indices]
final_scores = ctr_scores[top_indices]

print(f"\nStage 2: Ranked to top-10")
print(f"\nTop-10 Recommendations:")
for i, (ad_id, score) in enumerate(zip(final_ads, final_scores), 1):
    print(f"  {i}. Ad {ad_id}: CTR={score:.4f}")

## 6. Performance Analysis

In [None]:
# Analyze score distributions
# One histogram per stage, each with a dashed red line at the mean score.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

score_panels = (
    # (axes, values, bar colour, title, x-axis label)
    (axes[0], stage1_scores[0], 'blue', 'Stage 1: Retrieval Scores', 'Similarity Score'),
    (axes[1], ctr_scores, 'green', 'Stage 2: CTR Predictions', 'Predicted CTR'),
)
for panel_ax, panel_values, panel_color, panel_title, panel_xlabel in score_panels:
    panel_ax.hist(panel_values, bins=50, alpha=0.7, color=panel_color)
    panel_ax.axvline(panel_values.mean(), color='red', linestyle='--', label='Mean')
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel(panel_xlabel)
    panel_ax.set_ylabel('Count')
    panel_ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Latency analysis
# Times both stages of single-user inference over 100 repetitions.
# (`time` is imported with the other modules in the setup cell.)

def _sync_device():
    """Wait for queued GPU work so wall-clock timings cover the real compute.

    CUDA kernels are launched asynchronously; without a synchronisation point
    the timer would only capture the launch. No-op on CPU.
    """
    if device == 'cuda':
        torch.cuda.synchronize()


latencies = {'stage1': [], 'stage2': [], 'total': []}

# Inference mode for both models (disables dropout).
two_tower_model.eval()
transformer_ranker.eval()

# Loop-local names are prefixed with `bench_` so this cell does not overwrite
# the results (`candidates`, `predictions`, ...) produced by earlier cells.
with torch.no_grad():
    for _ in range(100):
        # Stage 1: embed the user and search the index.
        _sync_device()
        start = time.perf_counter()
        bench_user_emb = two_tower_model.get_user_embeddings(user_cat_tensor, user_num_tensor)
        bench_candidates, _bench_scores = faiss_index.search(bench_user_emb.cpu().numpy(), k=100)
        stage1_time = (time.perf_counter() - start) * 1000

        # Stage 2: build the (user, candidate ad) batch and score it.
        start = time.perf_counter()
        bench_ids = bench_candidates[0]
        bench_user_cat = user_cat_tensor.repeat(len(bench_ids), 1)
        bench_user_num = user_num_tensor.repeat(len(bench_ids), 1)
        # Features of the retrieved ads; candidate ids are treated as row
        # numbers into the validation split, which the index was built from
        # (assumes FAISSIndex.search returns positional ids - TODO confirm).
        bench_ad_cat = torch.as_tensor(
            val_split['ad_categorical'][bench_ids], dtype=torch.long
        ).to(device)
        bench_predictions = transformer_ranker(bench_user_cat, bench_ad_cat, bench_user_num)
        _sync_device()  # make sure the forward pass has actually finished
        stage2_time = (time.perf_counter() - start) * 1000

        latencies['stage1'].append(stage1_time)
        latencies['stage2'].append(stage2_time)
        latencies['total'].append(stage1_time + stage2_time)

# Plot latency distribution
fig, ax = plt.subplots(figsize=(10, 5))
positions = [1, 2, 3]
data = [latencies['stage1'], latencies['stage2'], latencies['total']]
bp = ax.boxplot(data, positions=positions)
# Label the boxes through the tick API rather than boxplot's `labels=`
# keyword, which was renamed in Matplotlib 3.9; this works on old and new
# versions alike.
ax.set_xticks(positions)
ax.set_xticklabels(['Stage 1', 'Stage 2', 'Total'])
ax.set_ylabel('Latency (ms)')
ax.set_title('Inference Latency Distribution')
ax.grid(True, alpha=0.3)

# Add statistics (mean ± standard deviation per stage) in the top-left corner.
stats_text = (
    f"Stage 1: {np.mean(latencies['stage1']):.2f}ms (±{np.std(latencies['stage1']):.2f})\n"
    f"Stage 2: {np.mean(latencies['stage2']):.2f}ms (±{np.std(latencies['stage2']):.2f})\n"
    f"Total: {np.mean(latencies['total']):.2f}ms (±{np.std(latencies['total']):.2f})"
)
ax.text(0.02, 0.98, stats_text, transform=ax.transAxes,
        verticalalignment='top', bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

plt.tight_layout()
plt.show()

## Summary

✅ **Completed:**
1. Two-Tower Model trained for candidate generation
2. FAISS index built for fast retrieval
3. Transformer Ranker trained for final ranking
4. Complete inference pipeline tested
5. Performance benchmarked

**Next Steps:**
- Train on full Criteo dataset (45M+ samples)
- Experiment with different architectures
- Add online learning capabilities
- Deploy to production
- Implement A/B testing