# 🎯 Self-Improving Recommendation System - Quick Start

This notebook demonstrates the complete pipeline for training an RL-based recommendation system.

## 1️⃣ Setup

In [1]:
import sys
sys.path.insert(0, '..')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Seed every RNG this notebook may draw from so a Restart & Run All is repeatable.
# Seeding NumPy alone leaves Python's `random` module and PyTorch unseeded.
import random

SEED = 42

try:
    # NOTE(review): the DQN agent exposes `.device`, so it is presumably
    # PyTorch-backed — confirm. The import is guarded so this cell still runs
    # when PyTorch is not installed.
    import torch
except ImportError:
    pass
else:
    torch.manual_seed(SEED)

random.seed(SEED)
np.random.seed(SEED)
# NOTE(review): `env.reset()` / `env.action_space.sample()` may use their own
# generators that the calls above do not reach — verify against RecommenderEnv.

## 2️⃣ Download & Preprocess Data

In [2]:
from src.data.download import download_movielens, verify_dataset

# Fetch the MovieLens 1M dataset into the raw-data folder, then run the
# project's verification on whatever path the downloader reports back.
dataset_name = 'movielens-1m'
data_path = download_movielens(dataset_name, '../data/raw')
verify_dataset(data_path, dataset_name)

Dataset already exists at ..\data\raw\ml-1m
Dataset verification passed!


True

In [None]:
from src.data.preprocess import preprocess_data

# Preprocessing options gathered in one place so they are easy to tune.
# NOTE(review): the two thresholds presumably drop users/items with fewer
# interactions than the given count — confirm in src.data.preprocess.
preprocess_options = dict(
    data_path='../data/raw/ml-1m',
    output_path='../data/processed',
    min_user_interactions=20,
    min_item_interactions=10,
)

processed_data = preprocess_data(**preprocess_options)

In [None]:
# Summarise the processed dataset: entity counts first, then split sizes.
dataset_summary = [
    ("Number of users", processed_data['n_users']),
    ("Number of items", processed_data['n_items']),
    ("Training interactions", len(processed_data['train_df'])),
    ("Validation interactions", len(processed_data['val_df'])),
    ("Test interactions", len(processed_data['test_df'])),
]
for label, value in dataset_summary:
    print(f"{label}: {value}")

# Peek at the first rows of the training split (rich display as the cell output)
processed_data['train_df'].head(10)

## 3️⃣ Train Baseline Model (Matrix Factorization)

In [None]:
from src.models.matrix_factorization import MatrixFactorization

# Latent dimension of the baseline; the DQN agent cell reuses this value.
embedding_dim = 64

# Build the SVD-based matrix-factorization baseline sized to the dataset.
n_users, n_items = processed_data['n_users'], processed_data['n_items']
baseline = MatrixFactorization(
    n_users=n_users, n_items=n_items, embedding_dim=embedding_dim, method='svd'
)

# Fit on the training interaction matrix
baseline.fit(processed_data['train_matrix'])

In [None]:
# Pull the learned user/item representations out of the fitted baseline;
# later cells feed them to the simulator and the environment.
user_embeddings = baseline.get_all_user_embeddings()
item_embeddings = baseline.get_all_item_embeddings()

for label, matrix in (("User", user_embeddings), ("Item", item_embeddings)):
    print(f"{label} embeddings shape: {matrix.shape}")

In [None]:
# Persist the baseline so later runs can reuse it
import os

models_dir = '../results/models'
os.makedirs(models_dir, exist_ok=True)  # no-op when the folder already exists
baseline.save('../results/models/baseline.npz')

## 4️⃣ Explore User Simulator

In [None]:
from src.environment.user_simulator import UserSimulator, FeedbackConfig

# Feedback thresholds and rewards, defined separately from the simulator so
# they can be read and tuned on their own.
feedback_config = FeedbackConfig(
    purchase_threshold=0.7,
    click_threshold=0.4,
    reward_purchase=5.0,
    reward_click_dwell=2.0,
    reward_skip=-1.0,
)

# Simulator driven by the baseline's user/item embeddings
simulator = UserSimulator(
    user_embeddings=user_embeddings,
    item_embeddings=item_embeddings,
    config=feedback_config,
)

In [None]:
# Start a session for the first user
user_idx = simulator.reset(user_idx=0)
print(f"Session started for user {user_idx}")

# Show the simulator's feedback for 10 uniformly random items
num_recommendations = 10
for step_number in range(1, num_recommendations + 1):
    item_idx = np.random.randint(0, len(item_embeddings))
    reward, info = simulator.get_feedback(item_idx)

    feedback_type = info['feedback_type']
    similarity = info['similarity']
    print(f"Step {step_number}: Item {item_idx} -> {feedback_type} (reward: {reward:.1f}, sim: {similarity:.3f})")

## 5️⃣ Create Recommendation Environment

In [None]:
from src.environment.recommender_env import RecommenderEnv

# Episode settings for the environment.
# NOTE(review): num_candidates must match the value given to the DQN agent below.
env_settings = dict(max_steps=20, num_candidates=100, history_length=10)

env = RecommenderEnv(
    user_embeddings=user_embeddings,
    item_embeddings=item_embeddings,
    **env_settings,
)

for label, space in (("State", env.observation_space), ("Action", env.action_space)):
    print(f"{label} space: {space}")

In [None]:
# Smoke-test the environment: play one full episode with random actions
state, info = env.reset()
print(f"Initial state shape: {state.shape}")

total_reward, step = 0, 0
done = False

while not done:
    # Sample a random action and advance the environment by one step
    next_state, reward, terminated, truncated, info = env.step(env.action_space.sample())
    done = terminated or truncated  # the episode ends on either signal
    total_reward += reward
    step += 1

print(f"Episode finished after {step} steps")
print(f"Total reward: {total_reward:.2f}")

## 6️⃣ Initialize DQN Agent

In [None]:
from src.models.dqn_agent import DQNAgent

# The agent's input size is read from the environment's observation shape,
# so that shape must be concrete.
assert env.observation_space.shape is not None, "Observation space shape must be defined"
obs_shape = env.observation_space.shape
state_dim = obs_shape[0]
print(f"State dimension: {state_dim}")

# Agent hyperparameters collected in one mapping for easy tuning.
# NOTE(review): num_candidates repeats the value passed to RecommenderEnv — keep them in sync.
agent_hparams = dict(
    state_dim=state_dim,
    embedding_dim=embedding_dim,
    num_candidates=100,
    hidden_layers=[256, 128],
    learning_rate=0.001,
    gamma=0.99,
    epsilon_start=1.0,
    epsilon_end=0.01,
    epsilon_decay=0.995,
    buffer_size=10000,
    batch_size=64,
    double_dqn=True,
)
agent = DQNAgent(**agent_hparams)

print(f"Agent initialized on device: {agent.device}")

## 7️⃣ Quick Training Demo (100 episodes)

In [None]:
from tqdm.notebook import tqdm

# Training parameters
num_episodes = 100
min_replay_size = 500  # transitions that must be buffered before agent.train() is called

# Per-episode metrics consumed by the learning-curve plot
episode_rewards = []
episode_lengths = []

for episode in tqdm(range(num_episodes), desc="Training"):
    state, info = env.reset()
    episode_reward = 0
    episode_steps = 0
    done = False

    while not done:
        # Candidates are fetched before stepping so they are the set the
        # action was chosen from.
        candidate_embeddings = env.get_candidate_embeddings()

        # Select action (training=True; presumably enables exploration — confirm in DQNAgent)
        action = agent.select_action(state, candidate_embeddings, training=True)

        # Take step; the episode ends on either termination or truncation
        next_state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated

        # Store experience
        agent.store_experience(
            state, action, reward, next_state, done, candidate_embeddings
        )

        # Train only once the replay buffer holds enough transitions
        if len(agent.replay_buffer) >= min_replay_size:
            agent.train()

        state = next_state
        episode_reward += reward
        episode_steps += 1

    episode_rewards.append(episode_reward)
    # Count env.step() calls directly: the previous info.get('step', env.max_steps)
    # silently recorded max_steps whenever the env omitted the 'step' key.
    episode_lengths.append(episode_steps)

print("\nTraining complete!")
print(f"Final epsilon: {agent.epsilon:.3f}")
print(f"Average reward (last 20): {np.mean(episode_rewards[-20:]):.2f}")

In [None]:
# Plot learning curve: raw and smoothed episode rewards (left), episode lengths (right)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Rewards
ax1.plot(episode_rewards, alpha=0.6, label='Per episode')
window = 10
# Smooth only when at least one full window exists. With fewer episodes,
# np.convolve(mode='valid') swaps its inputs and returns a length that no
# longer matches the x-range below (and it raises on empty input).
if len(episode_rewards) >= window:
    smoothed = np.convolve(episode_rewards, np.ones(window)/window, mode='valid')
    # The first smoothed point covers episodes 0..window-1, so plot it at window-1
    ax1.plot(range(window-1, len(episode_rewards)), smoothed, color='red', linewidth=2,
             label=f'{window}-episode moving average')
ax1.set_xlabel('Episode')
ax1.set_ylabel('Cumulative Reward')
ax1.set_title('Learning Curve')
ax1.legend()
ax1.grid(True, alpha=0.3)

# Episode lengths
ax2.plot(episode_lengths, alpha=0.6)
ax2.axhline(y=env.max_steps, color='green', linestyle='--', label='Max steps')
ax2.set_xlabel('Episode')
ax2.set_ylabel('Episode Length')
ax2.set_title('Session Length Over Time')
ax2.legend()
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 8️⃣ Evaluate Agent vs Random Policy

In [None]:
def evaluate_policy(env, agent=None, num_episodes=50):
    """Run complete episodes and return the cumulative reward of each one.

    Args:
        env: Environment providing ``reset() -> (state, info)``,
            ``step(action) -> (state, reward, terminated, truncated, info)``,
            ``action_space.sample()`` and ``get_candidate_embeddings()``.
        agent: Policy to evaluate. When given, actions come from
            ``agent.select_action(state, candidates, training=False)``.
            When ``None``, actions are sampled from ``env.action_space``.
        num_episodes: Number of episodes to run.

    Returns:
        list: One cumulative reward per episode, in the order played.
    """
    rewards = []

    for _ in range(num_episodes):
        state, _info = env.reset()
        episode_reward = 0
        done = False

        while not done:
            # Compare against None explicitly: an agent object that defines
            # __len__ or __bool__ could be falsy and would otherwise be
            # silently evaluated as the random policy.
            if agent is not None:
                candidate_embeddings = env.get_candidate_embeddings()
                action = agent.select_action(state, candidate_embeddings, training=False)
            else:
                action = env.action_space.sample()

            state, reward, terminated, truncated, _info = env.step(action)
            done = terminated or truncated
            episode_reward += reward

        rewards.append(episode_reward)

    return rewards

# Evaluate both policies on the same environment, 50 episodes each
print("Evaluating RL Agent...")
rl_rewards = evaluate_policy(env, agent, num_episodes=50)

print("Evaluating Random Policy...")
random_rewards = evaluate_policy(env, agent=None, num_episodes=50)

# Report mean ± standard deviation of the episode rewards per policy.
# The separator was previously the mis-encoded sequence "¬±"; it is now the
# intended plus-minus sign.
print(f"\n{'='*50}")
print("RESULTS")
print(f"{'='*50}")
print(f"RL Agent:     {np.mean(rl_rewards):.2f} ± {np.std(rl_rewards):.2f}")
print(f"Random:       {np.mean(random_rewards):.2f} ± {np.std(random_rewards):.2f}")
print(f"Improvement:  {(np.mean(rl_rewards) - np.mean(random_rewards)):.2f}")

In [None]:
# Box plot of per-episode rewards: random policy vs. trained agent
fig, ax = plt.subplots(figsize=(10, 6))

# Group order (random first, DQN second) must match the tick labels set below
positions = [1, 2]
reward_groups = [random_rewards, rl_rewards]
bp = ax.boxplot(reward_groups, positions=positions, widths=0.6, patch_artist=True)

# patch_artist=True makes the boxes fillable; colour one per policy
for box, face_color in zip(bp['boxes'], ['#ff6b6b', '#45b7d1']):
    box.set_facecolor(face_color)
    box.set_alpha(0.7)

ax.set_xticks(positions)
ax.set_xticklabels(['Random Policy', 'DQN Agent'])
ax.set_ylabel('Cumulative Reward')
ax.set_title('Policy Comparison', fontsize=14, fontweight='bold')
# Zero-reward reference line
ax.axhline(y=0, color='gray', linestyle='--', alpha=0.5)
ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

## ✅ Summary

In this notebook, we:

1. **Downloaded and preprocessed** MovieLens 1M dataset
2. **Trained a baseline** Matrix Factorization model
3. **Built a user simulator** based on embedding similarity
4. **Created a Gym environment** for RL training
5. **Implemented a DQN agent** with experience replay
6. **Trained and evaluated** the agent

### Key Interview Points:

- "The user simulator uses cosine similarity between user and item embeddings to generate probabilistic feedback"
- "We use DQN with experience replay to break correlation between samples"
- "The reward function is designed to optimize long-term engagement, not just immediate clicks"
- "ε-greedy exploration ensures the agent discovers new user preferences"