In [3]:
# Recommender system (user-item matrix) using
# 1) user-based collaborative filtering (cosine similarity)
# 2) matrix factorization (SGD low-rank factorization)
# Includes movie titles in recommendations

import pandas as pd
import numpy as np
import os, zipfile, urllib.request
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from math import sqrt

# --- auto-download MovieLens ---
url = 'https://files.grouplens.org/datasets/movielens/ml-latest-small.zip'
filename = 'ml-latest-small.zip'
folder = 'ml-latest-small'

if not os.path.exists(folder):
    print('Downloading MovieLens dataset...')
    urllib.request.urlretrieve(url, filename)
    with zipfile.ZipFile(filename, 'r') as zip_ref:
        zip_ref.extractall('.')
    print('Extracted to', folder)

# --- load ratings ---
ratings = pd.read_csv(os.path.join(folder, 'ratings.csv'))
ratings = ratings[['userId','movieId','rating']]
ratings.rename(columns={'movieId':'itemId'}, inplace=True)

# --- load movies ---
movies = pd.read_csv(os.path.join(folder, 'movies.csv'))
movie_dict = pd.Series(movies.title.values,index=movies.movieId).to_dict()

# --- create mappings ---
users = ratings['userId'].unique()
items = ratings['itemId'].unique()
user_to_idx = {u:i for i,u in enumerate(users)}
item_to_idx = {i:j for j,i in enumerate(items)}
ratings['uidx'] = ratings['userId'].map(user_to_idx)
ratings['iidx'] = ratings['itemId'].map(item_to_idx)

# --- train-test split ---
train, test = train_test_split(ratings, test_size=0.2, random_state=42)

# --- build user-item matrix ---
n_users = len(users)
nn_items = len(items)
train_mat = np.zeros((n_users, nn_items), dtype=np.float32)
for r in train.itertuples(index=False):
    train_mat[r.uidx, r.iidx] = r.rating

# --- 1) User-based CF ---
user_sim = cosine_similarity(train_mat)

def predict_user_based(u, i, k=20):
    sim = user_sim[u]
    topk = np.argsort(sim)[-k-1:-1][::-1]
    nums = 0.0
    dens = 0.0
    for v in topk:
        if train_mat[v,i] > 0:
            nums += sim[v]*train_mat[v,i]
            dens += abs(sim[v])
    if dens==0:
        return train_mat[u].mean() if train_mat[u].sum()>0 else train['rating'].mean()
    return nums/dens

# --- Evaluate user-based CF ---
preds = [predict_user_based(r.uidx, r.iidx) for r in test.itertuples(index=False)]
trues = [r.rating for r in test.itertuples(index=False)]
rmse_user = sqrt(mean_squared_error(trues, preds))

# --- 2) Matrix factorization ---
K = 20
P = np.random.normal(0,0.1,(n_users,K)).astype(np.float32)
Q = np.random.normal(0,0.1,(nn_items,K)).astype(np.float32)

lr = 0.01
reg = 0.02
n_epochs = 10

for epoch in range(n_epochs):
    for r in train.itertuples(index=False):
        u = r.uidx
        i = r.iidx
        rating = r.rating
        pred = P[u].dot(Q[i])
        e = rating - pred
        P[u] += lr*(e*Q[i] - reg*P[u])
        Q[i] += lr*(e*P[u] - reg*Q[i])

# --- evaluate MF ---
preds_mf = [P[r.uidx].dot(Q[r.iidx]) for r in test.itertuples(index=False)]
trues_mf = [r.rating for r in test.itertuples(index=False)]
rmse_mf = sqrt(mean_squared_error(trues_mf, preds_mf))

# --- recommend for a user ---
user_id_example = users[0]
u = user_to_idx[user_id_example]

scores_cf = np.array([predict_user_based(u,i) if train_mat[u,i]==0 else 0 for i in range(nn_items)])
topk_cf = np.argsort(scores_cf)[-10:][::-1]

scores_mf = np.array([P[u].dot(Q[i]) if train_mat[u,i]==0 else 0 for i in range(nn_items)])
topk_mf = np.argsort(scores_mf)[-10:][::-1]

inv_item = {v:k for k,v in item_to_idx.items()}
rec_cf = [movie_dict[inv_item[i]] for i in topk_cf]
rec_mf = [movie_dict[inv_item[i]] for i in topk_mf]

print(f'User-based CF RMSE: {rmse_user:.4f}')
print(f'Matrix Factorization RMSE: {rmse_mf:.4f}')
print('\nRecommendations for user', user_id_example)
print('User-based CF top-10:')
for t in rec_cf: print('-', t)
print('MF top-10:')
for t in rec_mf: print('-', t)

User-based CF RMSE: 1.5350
Matrix Factorization RMSE: 1.1601

Recommendations for user 1
User-based CF top-10:
- Rosencrantz and Guildenstern Are Dead (1990)
- Touch of Evil (1958)
- General, The (1926)
- Swing Kids (1993)
- Summer of Sam (1999)
- Talladega Nights: The Ballad of Ricky Bobby (2006)
- Let the Right One In (Låt den rätte komma in) (2008)
- You Can't Take It with You (1938)
- 12 Angry Men (1957)
- No Country for Old Men (2007)
MF top-10:
- Great Escape, The (1963)
- Shawshank Redemption, The (1994)
- Casablanca (1942)
- Spirited Away (Sen to Chihiro no kamikakushi) (2001)
- Spotlight (2015)
- Philadelphia Story, The (1940)
- Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001)
- Jaws (1975)
- Sunset Blvd. (a.k.a. Sunset Boulevard) (1950)
- Hoop Dreams (1994)
