# Recommendation System - Exploratory Analysis

This notebook demonstrates:
1. Data exploration and preprocessing
2. Feature engineering
3. Model architecture setup (Two-Tower and DLRM)
4. Embedding visualization
5. Model evaluation metrics
6. A/B test analysis

In [None]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE
from sklearn.metrics import roc_auc_score, ndcg_score
import torch

# Plot appearance: matplotlib's seaborn-style white grid plus the 'husl' palette
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')

# DataFrame rendering limits: show at most 50 columns and 100 rows
pd.options.display.max_columns = 50
pd.options.display.max_rows = 100

## 1. Data Exploration

In [None]:
# Load sample data
# In production, this would load from your data warehouse

# Generate synthetic data for demonstration
np.random.seed(42)

n_users = 10000
n_items = 5000
n_interactions = 100000

# User data: one row per user with simple demographic attributes
users = pd.DataFrame({
    'user_id': [f'user_{i}' for i in range(n_users)],
    'age': np.random.randint(18, 70, n_users),
    'gender': np.random.choice(['M', 'F', 'O'], n_users),
    'country': np.random.choice(['US', 'UK', 'DE', 'FR', 'JP'], n_users, p=[0.4, 0.2, 0.15, 0.15, 0.1]),
    'signup_days': np.random.randint(1, 1000, n_users),
})

# Item data: one row per item; price is log-normal, popularity is Pareto-distributed
items = pd.DataFrame({
    'item_id': [f'item_{i}' for i in range(n_items)],
    'category': np.random.choice(['Electronics', 'Clothing', 'Home', 'Sports', 'Books'], n_items),
    'price': np.random.lognormal(4, 1, n_items).round(2),
    'popularity': np.random.pareto(2, n_items),
})

# Interactions
# Draw every user/item index in a single vectorized call instead of one RNG
# call per row. Users are uniform; items follow a Zipf draw folded into the
# valid id range with a modulo, which concentrates traffic on a few items.
user_idx = np.random.randint(0, n_users, n_interactions)
item_idx = np.random.zipf(2, n_interactions) % n_items

interactions = pd.DataFrame({
    'user_id': [f'user_{i}' for i in user_idx],
    'item_id': [f'item_{i}' for i in item_idx],
    'event_type': np.random.choice(['view', 'click', 'add_cart', 'purchase'], n_interactions, p=[0.5, 0.3, 0.15, 0.05]),
    # One event per minute; 'min' is used because the 'T' alias is deprecated
    'timestamp': pd.date_range('2024-01-01', periods=n_interactions, freq='min'),
})

print(f"Users: {len(users):,}")
print(f"Items: {len(items):,}")
print(f"Interactions: {len(interactions):,}")

In [None]:
# Interaction distribution: three side-by-side views of the event log
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
ax_events, ax_users, ax_items = axes

# Panel 1 - how often each event type occurs
event_counts = interactions['event_type'].value_counts()
event_counts.plot.bar(ax=ax_events, color='steelblue')
ax_events.set_title('Event Type Distribution')
ax_events.set_ylabel('Count')

# Panel 2 - histogram of the number of interactions recorded per user
user_counts = interactions.groupby('user_id').size()
ax_users.hist(user_counts, bins=50, color='steelblue', edgecolor='white')
ax_users.set_title('Interactions per User')
ax_users.set_xlabel('Number of interactions')
ax_users.set_ylabel('Number of users')

# Panel 3 - items ranked by interaction count; log y-axis exposes the long tail
item_counts = interactions.groupby('item_id').size().sort_values(ascending=False)
item_ranks = np.arange(len(item_counts))
ax_items.plot(item_ranks, item_counts.to_numpy(), color='steelblue')
ax_items.set_title('Item Popularity (Long Tail)')
ax_items.set_xlabel('Item rank')
ax_items.set_ylabel('Number of interactions')
ax_items.set_yscale('log')

plt.tight_layout()
plt.show()

## 2. Feature Engineering

In [None]:
# User features
# Per-user activity counts derived from the interaction log. The purchase flag
# is computed once as a boolean column so the group-by can use the built-in
# 'sum' rather than calling a Python lambda for every user.
user_stats = (
    interactions
    .assign(is_purchase=interactions['event_type'].eq('purchase'))
    .groupby('user_id')
    .agg(
        total_interactions=('item_id', 'count'),
        total_purchases=('is_purchase', 'sum'),
    )
)

# Left-join so users with no recorded interactions are kept.
user_features = users.merge(user_stats, on='user_id', how='left')

# Only the derived count columns are zero-filled: a user absent from the log
# has zero interactions, whereas a missing demographic value must not be
# silently turned into 0. Casting back to int undoes the float upcast that the
# join introduces whenever a NaN appears.
user_stat_cols = ['total_interactions', 'total_purchases']
user_features[user_stat_cols] = user_features[user_stat_cols].fillna(0).astype(int)

# Clip the denominator at 1 so users with no interactions get a rate of 0
# instead of a division by zero.
user_features['conversion_rate'] = (
    user_features['total_purchases'] / user_features['total_interactions'].clip(lower=1)
)

print("User Features Sample:")
user_features.head()

In [None]:
# Item features
# Per-item engagement counts derived from the interaction log. Each event type
# becomes a boolean column so the group-by can use the built-in 'sum', and
# named aggregation labels every output column explicitly instead of relying
# on positional renaming of a MultiIndex.
item_stats = (
    interactions
    .assign(
        is_view=interactions['event_type'].eq('view'),
        is_click=interactions['event_type'].eq('click'),
        is_purchase=interactions['event_type'].eq('purchase'),
    )
    .groupby('item_id')
    .agg(
        unique_users=('user_id', 'nunique'),
        views=('is_view', 'sum'),
        clicks=('is_click', 'sum'),
        purchases=('is_purchase', 'sum'),
    )
    .reset_index()
)

# Left-join so catalogue items that never appear in the log are kept.
item_features = items.merge(item_stats, on='item_id', how='left')

# Only the derived count columns are zero-filled: an item absent from the log
# has zero engagement, whereas missing catalogue attributes must not be
# silently turned into 0. Casting back to int undoes the float upcast caused
# by the NaNs the join introduces.
item_stat_cols = ['unique_users', 'views', 'clicks', 'purchases']
item_features[item_stat_cols] = item_features[item_stat_cols].fillna(0).astype(int)

# Clip the denominator at 1 so items with no views get a CTR of 0 instead of
# a division by zero.
item_features['ctr'] = item_features['clicks'] / item_features['views'].clip(lower=1)

print("Item Features Sample:")
item_features.head()

## 3. Model Architecture Demo

In [None]:
import sys
sys.path.insert(0, '..')

from src.models.two_tower import TwoTowerModel, TwoTowerConfig
from src.models.dlrm import DLRM, DLRMConfig

# Initialize Two-Tower model
# Vocabulary size of each categorical input, per tower
user_cardinalities = [100, 3, 5]  # age_bucket, gender, country
item_cardinalities = [5, 100]  # category, price_bucket

two_tower_config = TwoTowerConfig(
    user_embedding_dim=64,
    item_embedding_dim=64,
    output_embedding_dim=128,
    # Feature counts follow directly from the cardinality lists above
    num_user_categorical_features=len(user_cardinalities),
    num_item_categorical_features=len(item_cardinalities),
    user_categorical_cardinalities=user_cardinalities,
    item_categorical_cardinalities=item_cardinalities,
)

two_tower = TwoTowerModel(two_tower_config)

# Report model size as the total element count over all parameter tensors
two_tower_param_count = sum(param.numel() for param in two_tower.parameters())
print(f"Two-Tower parameters: {two_tower_param_count:,}")

In [None]:
# Initialize DLRM model
# Ten sparse features, each configured with a cardinality of 100, alongside
# thirteen dense features.
n_sparse_features = 10

dlrm_config = DLRMConfig(
    num_sparse_features=n_sparse_features,
    sparse_cardinalities=[100] * n_sparse_features,
    sparse_embedding_dim=64,
    num_dense_features=13,
    bottom_mlp_dims=[512, 256, 64],
    top_mlp_dims=[512, 256, 1],
)

dlrm = DLRM(dlrm_config)

# Report model size as the total element count over all parameter tensors
dlrm_param_count = sum(param.numel() for param in dlrm.parameters())
print(f"DLRM parameters: {dlrm_param_count:,}")

## 4. Embedding Visualization

In [None]:
# Generate random embeddings for visualization
# In production, these would come from trained models

# Seed inside the cell, like the other simulation cells in this notebook, so
# re-running it on its own gives the same clusters regardless of what was
# executed before.
np.random.seed(42)

n_samples = 1000
embedding_dim = 128

# Simulate item embeddings with cluster structure: one Gaussian blob per
# category, centred on a randomly drawn point.
categories = ['Electronics', 'Clothing', 'Home', 'Sports', 'Books']
n_per_category = n_samples // len(categories)
category_centers = np.random.randn(len(categories), embedding_dim) * 2

embeddings = np.vstack([
    center + np.random.randn(n_per_category, embedding_dim) * 0.5
    for center in category_centers
])

# One label per embedding row, in the same category order as the stacked blobs
labels = [cat for cat in categories for _ in range(n_per_category)]
label_array = np.asarray(labels)

# t-SNE projection down to 2-D for plotting
tsne = TSNE(n_components=2, random_state=42, perplexity=30)
embeddings_2d = tsne.fit_transform(embeddings)

# Plot: one scatter series per category so each gets its own colour and legend entry
fig, ax = plt.subplots(figsize=(10, 8))
for cat in categories:
    in_category = label_array == cat
    ax.scatter(
        embeddings_2d[in_category, 0],
        embeddings_2d[in_category, 1],
        label=cat,
        alpha=0.6,
        s=20,
    )

ax.legend()
ax.set_title('Item Embeddings (t-SNE Projection)')
ax.set_xlabel('Dimension 1')
ax.set_ylabel('Dimension 2')
plt.tight_layout()
plt.show()

## 5. Model Evaluation Metrics

In [None]:
def compute_metrics(y_true, y_pred, k=10):
    """Compute recommendation metrics for one set of scored items.

    Args:
        y_true: 1-D array-like of relevance labels (1 = relevant,
            0 = not relevant). Lists are accepted and converted to arrays.
        y_pred: 1-D array-like of predicted scores, aligned with ``y_true``;
            higher means more relevant.
        k: Number of top-scored items used for precision/recall. Must be >= 1.

    Returns:
        dict with keys ``'auc'``, ``f'precision@{k}'`` and ``f'recall@{k}'``.
        Precision and recall are plain Python floats. Precision always
        divides by ``k``, even when fewer than ``k`` items were scored.
        Recall is 0.0 when ``y_true`` has no positives.

    Raises:
        ValueError: If ``k`` is smaller than 1 or the inputs differ in shape.
        Any error raised by ``roc_auc_score`` propagates unchanged.
    """
    # Coerce up front: the fancy indexing below needs ndarray semantics
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, got {y_true.shape} and {y_pred.shape}"
        )
    # Guard k explicitly: with k == 0 the slice [-0:] would select every item
    # and the precision below would divide by zero.
    if k < 1:
        raise ValueError(f"k must be >= 1, got {k}")

    metrics = {}

    # AUC
    metrics['auc'] = roc_auc_score(y_true, y_pred)

    # Precision@K, Recall@K: argsort is ascending, so the last k indices are
    # the k highest-scored items.
    top_k_idx = np.argsort(y_pred)[-k:]
    relevant = y_true[top_k_idx].sum()
    total_relevant = y_true.sum()
    metrics[f'precision@{k}'] = float(relevant / k)
    metrics[f'recall@{k}'] = float(relevant / total_relevant) if total_relevant > 0 else 0.0

    return metrics

# Simulate predictions
np.random.seed(42)
n_test = 1000

# Binary labels with a 10% positive rate
y_true = np.random.binomial(1, 0.1, n_test)
# Noisy scores: positives get a 0.7 head start, plus uniform noise in [0, 0.3)
y_pred = 0.7 * y_true + 0.3 * np.random.rand(n_test)

metrics = compute_metrics(y_true, y_pred)

print("Model Metrics:")
for metric_name in metrics:
    print(f"  {metric_name}: {metrics[metric_name]:.4f}")

## 6. A/B Test Analysis

In [None]:
from scipy import stats

# Simulate A/B test results
np.random.seed(42)

# Control: baseline CTR 2.5%
control_impressions = 50000
control_clicks = np.random.binomial(control_impressions, 0.025)
control_ctr = control_clicks / control_impressions

# Treatment: improved CTR 2.8% (+12% lift)
treatment_impressions = 50000
treatment_clicks = np.random.binomial(treatment_impressions, 0.028)
treatment_ctr = treatment_clicks / treatment_impressions

# Statistical significance test: two-sided two-proportion z-test.
# The standard error uses the pooled CTR, i.e. the rate under the null
# hypothesis that both arms share one click probability.
pooled_ctr = (control_clicks + treatment_clicks) / (control_impressions + treatment_impressions)
se = np.sqrt(pooled_ctr * (1 - pooled_ctr) * (1/control_impressions + 1/treatment_impressions))
z_score = (treatment_ctr - control_ctr) / se
# Use the survival function rather than 1 - cdf: for large |z| the cdf rounds
# to exactly 1.0 and the subtraction would report a p-value of 0.
p_value = 2 * stats.norm.sf(abs(z_score))

print("A/B Test Results:")
print(f"  Control CTR: {control_ctr:.4f} ({control_clicks:,} / {control_impressions:,})")
print(f"  Treatment CTR: {treatment_ctr:.4f} ({treatment_clicks:,} / {treatment_impressions:,})")
print(f"  Relative Lift: {(treatment_ctr - control_ctr) / control_ctr * 100:.1f}%")
print(f"  Z-score: {z_score:.2f}")
print(f"  P-value: {p_value:.4f}")
print(f"  Significant (α=0.05): {p_value < 0.05}")

In [None]:
# Visualize A/B test
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
ax_bars, ax_ci = axes

arm_names = ['Control', 'Treatment']
ctr_percent = [control_ctr * 100, treatment_ctr * 100]

# Left panel: CTR of each arm as labelled bars
bars = ax_bars.bar(arm_names, ctr_percent, color=['steelblue', 'coral'])
ax_bars.set_ylabel('CTR (%)')
ax_bars.set_title('Click-Through Rate Comparison')
ax_bars.bar_label(bars, fmt='%.2f%%')

# Right panel: per-arm standard errors (unpooled binomial), drawn as
# +/- 1.96 SE error bars, i.e. normal-approximation 95% intervals
control_se = np.sqrt(control_ctr * (1 - control_ctr) / control_impressions)
treatment_se = np.sqrt(treatment_ctr * (1 - treatment_ctr) / treatment_impressions)

positions = [0, 1]
half_widths = [1.96 * control_se * 100, 1.96 * treatment_se * 100]

ax_ci.errorbar(positions, ctr_percent, yerr=half_widths, fmt='o', capsize=5, capthick=2, markersize=10)
ax_ci.set_xticks(positions)
ax_ci.set_xticklabels(arm_names)
ax_ci.set_ylabel('CTR (%)')
ax_ci.set_title('95% Confidence Intervals')

plt.tight_layout()
plt.show()

## Summary

This notebook demonstrated:

1. **Data Exploration**: Understanding interaction patterns and the long-tail distribution
2. **Feature Engineering**: Creating user and item features from raw data
3. **Model Architecture**: Two-Tower for retrieval, DLRM for ranking
4. **Embedding Visualization**: t-SNE projection showing category clusters
5. **Evaluation Metrics**: AUC, Precision@K, Recall@K
6. **A/B Testing**: Statistical significance testing for CTR lift

For production deployment, see the deployment guide in `docs/deployment.md`.