# Movie Recommendation Engine — Training Notebook

This notebook trains an SVD-based collaborative filtering model on the MovieLens ml-latest-small dataset.

**Steps:**
1. Load & explore data (dataset overview & EDA)
2. Prepare the Surprise dataset
3. Baseline predictor (BaselineOnly ALS)
4. SVD with default parameters
5. Hyperparameter tuning (GridSearchCV)
6. Results summary
7. Train final model & export

**Target:** ~22% MAE reduction vs baseline

In [None]:
import pandas as pd
import numpy as np
import json, pickle, time, os
from surprise import Dataset, Reader, SVD, BaselineOnly
from surprise.model_selection import cross_validate, GridSearchCV

## 1. Load & Explore Data

In [None]:
# Read the processed ratings table and the movie metadata table from disk.
ratings = pd.read_csv('../data/processed/all_ratings.csv')
movies = pd.read_csv('../data/processed/movies_metadata.csv')

# Headline sizes of the ratings table.
n_ratings = len(ratings)
print(f'Ratings: {n_ratings:,}')

n_users = ratings['userId'].nunique()
print(f'Users:   {n_users}')

n_movies = ratings['movieId'].nunique()
print(f'Movies:  {n_movies}')

# Sparsity: one minus (observed ratings / all possible user-movie pairs).
sparsity = 1 - n_ratings / (n_users * n_movies)
print(f'Sparsity: {sparsity:.2%}')
print()

# Last expression of the cell, so the summary statistics render as the cell output.
ratings.describe()

In [None]:
# Frequency of each distinct rating value, ordered by the rating value itself.
print('Rating Distribution:')
rating_values = ratings['rating']
rating_counts = rating_values.value_counts().sort_index()
print(rating_counts)

# Central tendency of the rating column.
mean_rating = rating_values.mean()
median_rating = rating_values.median()
print(f'\nMean rating: {mean_rating:.3f}')
print(f'Median rating: {median_rating}')

## 2. Prepare Surprise Dataset

In [None]:
# Reader declares the lower and upper bounds of the rating scale (0.5 to 5.0).
reader = Reader(rating_scale=(0.5, 5.0))

# Hand Surprise only the three columns it needs, in user / item / rating order.
rating_triplets = ratings[['userId', 'movieId', 'rating']]
data = Dataset.load_from_df(rating_triplets, reader)

print('Surprise dataset loaded.')

## 3. Baseline Predictor

Using Surprise's `BaselineOnly` with ALS optimization. This models user and item biases only (no latent factors).

In [None]:
# Bias-only reference model: baselines estimated with ALS over 10 epochs.
als_options = {'method': 'als', 'n_epochs': 10}
baseline = BaselineOnly(bsl_options=als_options)

# 5-fold cross-validation reporting both MAE and RMSE (verbose output enabled).
baseline_results = cross_validate(
    baseline,
    data,
    measures=['MAE', 'RMSE'],
    cv=5,
    verbose=True,
)

# Average each metric across the folds; these are the reference numbers
# that later cells compare against.
baseline_mae = baseline_results['test_mae'].mean()
baseline_rmse = baseline_results['test_rmse'].mean()

print(f'\nBaseline MAE:  {baseline_mae:.4f}')
print(f'Baseline RMSE: {baseline_rmse:.4f}')

## 4. SVD with Default Parameters

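SVD factorizes the rating matrix as $R \approx P Q^\top$, where $P$ and $Q$ are low-rank user and item factor matrices. Surprise's `SVD` additionally learns a global mean and per-user / per-item biases by default, so a single prediction is $\hat{r}_{ui} = \mu + b_u + b_i + q_i^\top p_u$.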

In [None]:
# SVD with the library's default hyperparameters, scored with the same
# 5-fold MAE/RMSE protocol as the baseline cell.
svd_default = SVD()
svd_results = cross_validate(
    svd_default,
    data,
    measures=['MAE', 'RMSE'],
    cv=5,
    verbose=True,
)

# Fold-averaged metrics.
default_mae = svd_results['test_mae'].mean()
default_rmse = svd_results['test_rmse'].mean()

print(f'\nDefault SVD MAE:  {default_mae:.4f}')
print(f'Default SVD RMSE: {default_rmse:.4f}')

# Relative MAE reduction against the baseline, as a percentage
# (baseline_mae comes from the baseline cell).
default_gain_pct = (baseline_mae - default_mae) / baseline_mae * 100
print(f'Improvement vs Baseline: {default_gain_pct:.1f}%')

## 5. Hyperparameter Tuning (GridSearchCV)

Searching over n_factors, n_epochs, learning rate, and regularization.

In [None]:
# Candidate values per SVD hyperparameter; the search evaluates every combination.
param_grid = {
    'n_factors': [50, 100, 150, 200],
    'n_epochs': [20, 30, 40],
    'lr_all': [0.002, 0.005, 0.01],
    'reg_all': [0.02, 0.05, 0.1],
}

# Fold counts. The search uses fewer folds because it fits every combination;
# the final score uses the same 5 folds as the baseline and default-SVD cells
# so that the reported improvement compares like with like.
search_folds = 3
report_folds = 5

# Size of the grid = product of the number of candidates per hyperparameter.
total = 1
for candidates in param_grid.values():
    total *= len(candidates)
print(f'Searching {total} combinations ({search_folds}-fold CV)...')

gs = GridSearchCV(SVD, param_grid, measures=['mae', 'rmse'], cv=search_folds, n_jobs=-1)
gs.fit(data)

# Select on MAE, the metric the notebook's target is expressed in.
best_params = gs.best_params['mae']
print(f'\nGrid-search best MAE ({search_folds}-fold): {gs.best_score["mae"]:.4f}')

# Re-score the selected configuration with the 5-fold protocol.
# Two reasons:
#   * the 3-fold search score is not comparable with the 5-fold baseline numbers
#     (each model is trained on a smaller share of the data);
#   * gs.best_score['rmse'] is the best RMSE over *all* combinations and may not
#     belong to the MAE-selected parameters; here both metrics come from the
#     same model.
# NOTE: the parameters were chosen on this same data, so the score is still
# somewhat optimistic; a held-out test split would be needed to remove that.
tuned_results = cross_validate(
    SVD(**best_params),
    data,
    measures=['MAE', 'RMSE'],
    cv=report_folds,
    verbose=False,
)
tuned_mae = tuned_results['test_mae'].mean()
tuned_rmse = tuned_results['test_rmse'].mean()

print(f'Tuned MAE  ({report_folds}-fold): {tuned_mae:.4f}')
print(f'Tuned RMSE ({report_folds}-fold): {tuned_rmse:.4f}')
print(f'Best params: {json.dumps(best_params, indent=2)}')

## 6. Results Summary

In [None]:
# Relative MAE reduction of the tuned model against the baseline, in percent.
improvement = (baseline_mae - tuned_mae) / baseline_mae * 100

# Cut-off for reporting the target as met.
# NOTE(review): the notebook header and the fallback message below both state a
# target of ~22%, while the check accepts anything from 20% up — confirm that
# this tolerance is intentional.
target_met_threshold_pct = 20

banner = '=' * 50
print(banner)
print('FINAL RESULTS')
print(banner)

# One row per model: label left-aligned in 30 columns, MAE to 4 decimals.
mae_rows = [
    ('Baseline MAE (ALS)', baseline_mae),
    ('Default SVD MAE', default_mae),
    ('Tuned SVD MAE', tuned_mae),
]
for label, mae_value in mae_rows:
    print(f'{label:<30} {mae_value:.4f}')
print(f'{"MAE reduction vs Baseline":<30} {improvement:.1f}%')
print()

print('Best Hyperparameters:')
for param_name, param_value in best_params.items():
    print(f'  {param_name}: {param_value}')

# Final verdict line.
if improvement < target_met_threshold_pct:
    print(f'\n⚠ Achieved {improvement:.1f}% (target ~22%)')
else:
    print(f'\n✓ TARGET MET: {improvement:.1f}% MAE reduction')

## 7. Train Final Model & Export

In [None]:
# Fit a single model on every available rating using the selected hyperparameters.
trainset = data.build_full_trainset()
final_svd = SVD(**best_params)
final_svd.fit(trainset)

# Output directory for the exported artefacts (created if missing).
model_dir = '../models'
os.makedirs(model_dir, exist_ok=True)

# Persist the fitted model together with the trainset it was fitted on.
# The file is a pickle: only load it back from a trusted location.
model_bundle = {'model': final_svd, 'trainset': trainset}
with open(f'{model_dir}/svd_model.pkl', 'wb') as model_file:
    pickle.dump(model_bundle, model_file)

# Summary of the run, written next to the model as JSON.
metrics = {
    'baseline_mae': round(baseline_mae, 4),
    'default_svd_mae': round(default_mae, 4),
    'tuned_svd_mae': round(tuned_mae, 4),
    'mae_reduction_pct': round(improvement, 1),
    'best_params': best_params,
    'dataset': 'MovieLens ml-latest-small',
    'algorithm': 'SVD (Matrix Factorization)',
    'library': 'Surprise',
}
with open(f'{model_dir}/metrics.json', 'w') as metrics_file:
    json.dump(metrics, metrics_file, indent=2)

print('Model and metrics saved successfully.')