# Implementing a recommendation system for recommending movies using collaborative filtering

In [2]:
import numpy as np
import pandas as pd
import zipfile
import os

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!unzip '/content/drive/My Drive/movie_reviews.zip' -d "/content/drive/My Drive/movies"

Archive:  /content/drive/My Drive/movie_reviews.zip
replace /content/drive/My Drive/movies/movie_genres.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: /content/drive/My Drive/movies/movie_genres.csv  
replace /content/drive/My Drive/movies/user_reviews.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: /content/drive/My Drive/movies/user_reviews.csv  


In [4]:
import torch

df_genres = pd.read_csv("/content/drive/My Drive/movies/movie_genres.csv")
df_ratings = pd.read_csv("/content/drive/My Drive/movies/user_reviews.csv")

# Movies x Genres
X = df_genres.iloc[:, 2:].values.astype(np.float32)

# Users x Movies
y = df_ratings.iloc[:, 2:].values.astype(np.float32)

$\theta$, User Embeddings

X, Item (Movie) Embeddings

Collaborative filtering: Learn $\theta$ and X through optimizing:

Minimizing the squared loss $\sum_{(i,j)∈R}(\theta_i^T x_j - y_{ij})^2$

In [5]:
def train_test_split(ratings: torch.Tensor, split_ratio: float = 0.8):
    user_idx, movie_idx = ratings.nonzero(as_tuple=True)
    all_pairs = torch.stack([user_idx, movie_idx], dim=1)

    # Shuffle the indices
    num_ratings = all_pairs.size(0)
    permutation = torch.randperm(num_ratings)
    all_pairs = all_pairs[permutation]

    # Train-test split
    train_size = int(split_ratio * num_ratings)
    train_pairs = all_pairs[:train_size]
    test_pairs  = all_pairs[train_size:]

    # Create train and test matrices
    r_train = torch.zeros_like(ratings)
    r_test  = torch.zeros_like(ratings)

    r_train[train_pairs[:, 0], train_pairs[:, 1]] = ratings[train_pairs[:, 0], train_pairs[:, 1]]
    r_test[test_pairs[:, 0], test_pairs[:, 1]] = ratings[test_pairs[:, 0], test_pairs[:, 1]]

    # Ensure every user has at least one rating in train set
    unique_users_in_train = set(train_pairs[:, 0].tolist())
    all_users = set(user_idx.tolist())

    missing_users = all_users - unique_users_in_train
    for user in missing_users:
        user_ratings = (user_idx == user).nonzero(as_tuple=True)[0]
        first_rating_idx = user_ratings[0]  # Take at least one rating from test
        r_train[user_idx[first_rating_idx], movie_idx[first_rating_idx]] = ratings[user_idx[first_rating_idx], movie_idx[first_rating_idx]]
        r_test[user_idx[first_rating_idx], movie_idx[first_rating_idx]] = 0  # Remove from test

    return r_train, r_test

# convert to tensors. collaborative filtering will only use user_reviews
user_reviews = torch.tensor(y)
movie_genres = torch.tensor(X)

r_train, r_test = train_test_split(user_reviews, split_ratio=0.8)


In [24]:
class Recommender(torch.nn.Module):
  def __init__(self, num_users, num_movies, num_features):
    super(Recommender, self).__init__()

    # vectors for users- and movies latent features
    self.user_embeddings = torch.nn.Embedding(num_users, num_features)
    self.movie_embeddings = torch.nn.Embedding(num_movies, num_features)

    # bias terms to capture systematic patterns
    self.user_bias = torch.nn.Embedding(num_users, 1)
    self.movie_bias = torch.nn.Embedding(num_movies, 1)
    self.global_bias = torch.nn.Parameter(torch.tensor(0.0))

    # initialize with a normal distribution
    torch.nn.init.normal_(self.user_embeddings.weight, std=0.01)
    torch.nn.init.normal_(self.movie_embeddings.weight, std=0.01)

  def forward(self, user_ids, movie_ids):

    # get the vector with latent features
    user_vector = self.user_embeddings(user_ids)
    movie_vector = self.movie_embeddings(movie_ids)

    # interactions between users and movies
    dot_product = (user_vector * movie_vector).sum(1)

    # get biases
    user_b = self.user_bias(user_ids).squeeze(1)
    movie_b = self.movie_bias(movie_ids).squeeze(1)

    # predicted rating
    return dot_product + user_b + movie_b + self.global_bias


num_users = y.shape[0]
num_movies = X.shape[0]
num_features = 32 # latent features

model = Recommender(num_users, num_movies, num_features)
loss = torch.nn.MSELoss()
# stochastic gradient descent with l2 regularization
optimizer = torch.optim.Adam( #stochastic gradient descent
    model.parameters(),
    lr = 0.001,
    weight_decay = 0.0005)

user_idx, movie_idx = r_train.nonzero(as_tuple=True)
ratings = r_train[user_idx, movie_idx]  # get actual ratings

losses = []
epochs = 10000
best_loss = float("inf")

for epoch in range(epochs):
    optimizer.zero_grad()

    y_pred = model(user_idx, movie_idx)

    loss_value = loss(y_pred, ratings)

    if loss_value < best_loss:
      best_loss = loss_value
      losses.append(loss_value.item())
    else:
      break

    loss_value.backward()
    optimizer.step()

    if epoch % 1000 == 0:
        print(f"Epoch {epoch}/{epochs} - Loss: {loss_value.item():.4f}")

print("Training completed.")


Epoch 0/10000 - Loss: 15.5807
Epoch 1000/10000 - Loss: 0.4078
Epoch 2000/10000 - Loss: 0.3129
Epoch 3000/10000 - Loss: 0.2868
Epoch 4000/10000 - Loss: 0.2717
Epoch 5000/10000 - Loss: 0.2660
Epoch 6000/10000 - Loss: 0.2628
Epoch 7000/10000 - Loss: 0.2611
Training completed.


In [11]:
# Evaluate on the test set
model.eval()  # Set model to evaluation mode
with torch.no_grad():
    user_idx_test, movie_idx_test = r_test.nonzero(as_tuple=True)  # Get nonzero test ratings
    ratings_test = r_test[user_idx_test, movie_idx_test]  # Get actual ratings

    y_pred_test = model(user_idx_test, movie_idx_test)  # Predict ratings

    test_loss = loss(y_pred_test, ratings_test)  # Compute loss
    print(f"Test Loss (MSE): {test_loss.item():.4f}")
    print(f"Test RMSE: {test_loss.sqrt().item():.4f}")


Test Loss (MSE): 1.7051
Test RMSE: 1.3058


In [30]:

# get the index for movies so that we can get the titles
movie_index_to_name = {idx: title for idx, title in enumerate(df_genres["movie_title"].tolist())}

def recommend_movies(model, user_id, top_k=5):
    num_movies = len(movie_index_to_name)

    # predict movie ratings for a user:
    # create tensors for: repeat user id for each movie and movie indices
    user_tensor = torch.tensor([user_id] * num_movies)
    movie_tensor = torch.tensor(list(range(num_movies)))

    # get predicted ratings for all movies
    #model.eval()
    with torch.no_grad():
        predicted_ratings = model(user_tensor, movie_tensor)

    # Mask already rated movies
    rated_mask = y[user_id] > 0  # Mask for rated movies
    predicted_ratings[rated_mask] = -float("inf")  # Set to lowest value


    # get top K movie indices
    top_movie_indices = predicted_ratings.argsort(descending=True)[:top_k]

    # convert movie indices to names
    recommended_movies = [movie_index_to_name[idx.item()] for idx in top_movie_indices]

    return recommended_movies

user_names = ["Vincent", "Edgar", "Addilyn", "Marlee", "Javier"]
user_index = {name: idx for idx, name in enumerate(user_names)}

for user in user_names:
    user_id = user_index[user]  # Convert name to index
    top_movies = recommend_movies(model, user_id, top_k=5)
    print(f"Top 5 recommendations for {user}: {top_movies}")
    print("")



Top 5 recommendations for Vincent: ['Homefront', 'Alpha and Omega 4: The Legend of the Saw Toothed Cave', 'The Cabin in the Woods', 'Harry Potter and the Chamber of Secrets', 'Jaws']

Top 5 recommendations for Edgar: ['The Magic Sword: Quest for Camelot', 'Force 10 from Navarone', 'Jonah: A VeggieTales Movie', 'Dylan Dog: Dead of Night', 'Wasabi']

Top 5 recommendations for Addilyn: ['Bottle Rocket', 'Last Action Hero', 'Alexander', 'The Tempest', "Perrier's Bounty"]

Top 5 recommendations for Marlee: ['Max Payne', 'Homefront', 'Harley Davidson and the Marlboro Man', 'Bottle Rocket', 'The Good Thief']

Top 5 recommendations for Javier: ['Seeking a Friend for the End of the World', 'Chill Factor', 'Arthur Christmas', 'Monster House', 'Good Dick']

