Download MovieLens Dataset


In [None]:
# !wget http://files.grouplens.org/datasets/movielens/ml-latest-small.zip
# !unzip ml-latest-small.zip -d ./data
# !rm -rf ml-latest-small.zip

In [13]:
%load_ext autoreload
%autoreload 2


In [None]:
import pandas as pd
import torch


### Load Data

In [6]:
df_links = pd.read_csv("./data/ml-latest-small/links.csv")
df_movies = pd.read_csv("./data/ml-latest-small/movies.csv")
df_ratings = pd.read_csv("./data/ml-latest-small/ratings.csv")
df_tags = pd.read_csv("./data/ml-latest-small/tags.csv")


In [64]:
df_tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [7]:
df_links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [8]:
df_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [65]:
# Number of unique movies with data

num_movies = df_movies.movieId.nunique()
num_movies
# Number of unique movies with ratings

9742

In [9]:
df_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [67]:
print(f"Number of movies: {num_movies}")

movies_with_ratings = df_ratings.movieId.nunique()
print(f"Number of movies with ratings: {movies_with_ratings}") 

# Number of users with ratings
num_users = df_ratings.userId.nunique()
print(f"Number of users with ratings: {num_users}")

# Number of unique movies without ratings
print(f"Number of movies without ratings: {num_movies - movies_with_ratings}")

Number of movies: 9742
Number of movies with ratings: 9724
Number of users with ratings: 610
Number of movies without ratings: 18


In [71]:
df_ratings.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,100836.0,100836.0,100836.0,100836.0
mean,326.127564,19435.295718,3.501557,1205946000.0
std,182.618491,35530.987199,1.042529,216261000.0
min,1.0,1.0,0.5,828124600.0
25%,177.0,1199.0,3.0,1019124000.0
50%,325.0,2991.0,3.5,1186087000.0
75%,477.0,8122.0,4.0,1435994000.0
max,610.0,193609.0,5.0,1537799000.0


In [None]:
* We can observe that the user_id and movie_id both are 1-indexed.
* Rating Range is 0.5 to 5.0

### Prepare Training/Validation Dataset

In [None]:
from sklearn.model_selection import train_test_split


# mapping of user_id to index and movie_id to index
user_id_to_index = {user_id: i for i, user_id in enumerate(sorted(df_ratings.userId.unique()))}
movie_id_to_index = {movie_id: i for i, movie_id in enumerate(sorted(df_ratings.movieId.unique()))}

# create a new column with the index of the user_id and movie_id
df_ratings['user_id_index'] = df_ratings['userId'].map(user_id_to_index)
df_ratings['movie_id_index'] = df_ratings['movieId'].map(movie_id_to_index)


# Split the data into training and testing sets
df_ratings_train, df_ratings_test = train_test_split(df_ratings, test_size=0.2, random_state=42)

In [85]:
df_ratings_train.head()

Unnamed: 0,userId,movieId,rating,timestamp,user_id_index,movie_id_index
80568,509,7347,3.0,1435994597,508,4893
50582,326,71462,4.0,1322252335,325,7127
8344,57,2115,3.0,965798155,56,1575
99603,610,1127,4.0,1479544102,609,855
71701,462,2409,2.0,1174438249,461,1808


In [144]:
from torch.utils.data import Dataset

class MovieDataset(Dataset):
    def __init__(self, rating_file):
        self.df_ratings = pd.read_csv(rating_file)
        
        # Create mappings and store as instance attributes
        self.user_id_to_index = {user_id: i for i, user_id in enumerate(sorted(self.df_ratings.userId.unique()))}
        self.movie_id_to_index = {movie_id: i for i, movie_id in enumerate(sorted(self.df_ratings.movieId.unique()))}
        
        # Apply mappings
        self.df_ratings['user_id_index'] = self.df_ratings['userId'].map(self.user_id_to_index)
        self.df_ratings['movie_id_index'] = self.df_ratings['movieId'].map(self.movie_id_to_index)
        
        # Store number of unique users and movies
        self.num_users = len(self.user_id_to_index)
        self.num_movies = len(self.movie_id_to_index)

    def __len__(self):
        return len(self.df_ratings)
    
    def __getitem__(self, idx):
        """
        Returns a single training example as tensors
        """
        row = self.df_ratings.iloc[idx]
        
        user_id = torch.tensor(row['user_id_index'], dtype=torch.long)
        movie_id = torch.tensor(row['movie_id_index'], dtype=torch.long)
        rating = torch.tensor(row['rating'], dtype=torch.float)
        
        return user_id, movie_id, rating

In [145]:
from torch.utils.data import DataLoader, random_split

dataset = MovieDataset('./data/ml-latest-small/ratings.csv')
# Split dataset
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=2, shuffle=False)

In [146]:
print("Training Data", df_ratings_train.shape)
print("Testing Data", df_ratings_test.shape)

Training Data (80668, 6)
Testing Data (20168, 6)


### Modelling
* We will build a collaborative filtering model to recommend movies to users.


1. Need User embedding matrix, init randomly => U
2. similarly Movie embedding matrix, init randomly => M
3. Forward Pass:
    - To compute A(i,j) = MatMul(Ui, Mj)
    - Perform backward pass and update the matrix U and M

In [160]:
import torch

a = torch.tensor([[1, 2, 3], [4, 5, 6]])
b = torch.tensor([[10, 10, 10], [-10, -10, -10]])

a_bias = torch.tensor([100, -100])
b_bias = torch.tensor([2, 2])

w1w2 = einsum(a, b, "batch d, batch d -> batch")
print(w1w2)
result = w1w2 + a_bias + b_bias
print(result)

tensor([  60, -150])
tensor([ 162, -248])


In [249]:
from torch import nn
from einops import einsum, rearrange, reduce

class CollaborativeFiltering(nn.Module):
    def __init__(self, num_users, num_movies, embedding_dim):
        super().__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_dim, max_norm=1.0)
        self.movie_embedding = nn.Embedding(num_movies, embedding_dim, max_norm=1.0)
        self.user_bias = nn.Embedding(num_users, 1)
        self.movie_bias = nn.Embedding(num_movies, 1)

    def forward(self, user_ids, movie_ids):
        user_embeddings = self.user_embedding(user_ids)
        movie_embeddings = self.movie_embedding(movie_ids)
        user_bias = self.user_bias(user_ids)
        movie_bias = self.movie_bias(movie_ids)
        w1w2 =  einsum(user_embeddings, movie_embeddings, "batch embedding_dim, batch embedding_dim -> batch")
        result = w1w2 + rearrange(user_bias, "batch 1-> batch") + rearrange(movie_bias, "batch 1-> batch")
        # pass the model through the sigmoid function and scale the result to the range 0.0 to 5.5
        result = torch.sigmoid(result) * 5.5
        return result


In [None]:
# Lets keep the number of latent factors to 10
# It's a hyperparameter that we can tune
embedding_dim = 10
model = CollaborativeFiltering(num_users, num_movies, embedding_dim=embedding_dim)
print(model)

# define loss function
loss_fn = nn.MSELoss()
# define optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

CollaborativeFiltering(
  (user_embedding): Embedding(610, 10, max_norm=1.0)
  (movie_embedding): Embedding(9742, 10, max_norm=1.0)
  (user_bias): Embedding(610, 1)
  (movie_bias): Embedding(9742, 1)
)


### Manual Validation

In [251]:
pytorch_total_params = sum(p.numel() for p in model.parameters())
print("Number of parameters in the model", pytorch_total_params)

Number of parameters in the model 113872


In [244]:
num_movies*10 + num_users* 10 + num_movies + num_users

113872

In [252]:
batch = next(iter(train_loader))
batch

[tensor([437, 461]), tensor([4426, 6983]), tensor([5., 4.])]

In [253]:
user_embeddings =  model.user_embedding(batch[0])
movie_embeddings = model.movie_embedding(batch[1])
user_bias =        model.user_bias(batch[0])
movie_bias =       model.movie_bias(batch[1])

print(user_embeddings.shape)
print(movie_embeddings.shape)

print(user_bias.shape)
print(movie_bias.shape)

w1w2 =  einsum(user_embeddings, movie_embeddings, "batch embedding_dim, batch embedding_dim -> batch")
result = w1w2 + rearrange(user_bias, "batch 1-> batch") + rearrange(movie_bias, "batch 1-> batch")
torch.sigmoid(result) * 5.5

torch.Size([2, 10])
torch.Size([2, 10])
torch.Size([2, 1])
torch.Size([2, 1])


tensor([2.7433, 0.1405], grad_fn=<MulBackward0>)

In [254]:
model(batch[0], batch[1])

tensor([2.7433, 0.1405], grad_fn=<MulBackward0>)

### Training loop

In [255]:
# Now based on the model, loss function and optimizer, we can train the model
# training loop

for epoch in range(20):
    # Training loop
    model.train()
    train_loss = 0
    for batch in dataloader:
        predictions = model(batch[0], batch[1])
        # compute loss
        loss = loss_fn(predictions, batch[2])
        # backward pass
        optimizer.zero_grad()
        loss.backward()
        # update weights
        optimizer.step()
        train_loss += loss.item()

    # Evaluation loop
    model.eval()
    test_loss = 0
    with torch.no_grad():
        for batch in dataloader:
            predictions = model(batch[0], batch[1])
            loss = loss_fn(predictions, batch[2])
            test_loss += loss.item()

    # print loss
    print(f"Epoch {epoch+1}, Train Loss: {train_loss/len(dataloader)}, Test Loss: {test_loss/len(dataloader)}")


Epoch 1, Train Loss: 3.743292913937629, Test Loss: 3.644081073969149
Epoch 2, Train Loss: 3.5640319542685135, Test Loss: 3.47058545934216
Epoch 3, Train Loss: 3.394427059643765, Test Loss: 3.3061117074135598
Epoch 4, Train Loss: 3.235082643324046, Test Loss: 3.149911215160099
Epoch 5, Train Loss: 3.083901254167109, Test Loss: 3.002606145905208
Epoch 6, Train Loss: 2.9412228887342864, Test Loss: 2.865289363348242
Epoch 7, Train Loss: 2.8079830226181124, Test Loss: 2.7366817366653287
Epoch 8, Train Loss: 2.683718777625694, Test Loss: 2.6164081699314155
Epoch 9, Train Loss: 2.5672303843430155, Test Loss: 2.504119653008916
Epoch 10, Train Loss: 2.4576354682407704, Test Loss: 2.3982223388809842
Epoch 11, Train Loss: 2.3556340212371145, Test Loss: 2.2992840166851347
Epoch 12, Train Loss: 2.2601554932138943, Test Loss: 2.205773183480283
Epoch 13, Train Loss: 2.169024942641331, Test Loss: 2.1189295471962635
Epoch 14, Train Loss: 2.0839133851116682, Test Loss: 2.0374778481153064
Epoch 15, Train

This is a healthy training loop.
- we can observe that with each epoch, the training loss is reducing and also the test loss is also reducing.
- No Overfitting is observed. Since the test loss is also reducing.
- We can also observe that the training loss is reducing faster than the test loss.
- This is a good sign that the model is learning.



In [256]:
# Now lets save the model
torch.save(model.state_dict(), "collaborative_filtering.pth")

### Test

In [None]:
# Now we have learned User and Movie Embeddings.
# For a given movie, fetch top-n similar movies

In [257]:
df_movies["movie_id_index"] = df_movies["movieId"].map(dataset.movie_id_to_index)

In [274]:
df_movies.head()

Unnamed: 0,movieId,title,genres,movie_id_index
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,1.0
2,3,Grumpier Old Men (1995),Comedy|Romance,2.0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,3.0
4,5,Father of the Bride Part II (1995),Comedy,4.0


In [280]:

df_movies[df_movies.title.str.contains("Titanic")]

Unnamed: 0,movieId,title,genres,movie_id_index
1291,1721,Titanic (1997),Drama|Romance,1290.0
2542,3403,Raise the Titanic (1980),Drama|Thriller,2539.0
2543,3404,Titanic (1953),Action|Drama,2540.0
3553,4864,Titanica (1992),Documentary|IMAX,3548.0


#### Build Faiss Index 

In [265]:
import faiss
import numpy as np
import torch
import pandas as pd
from typing import Tuple, Dict

def build_cosine_similarity_index(model: torch.nn.Module) -> faiss.Index:
    """
    Build a FAISS index for cosine similarity search using movie embeddings.
    
    Returns:
        faiss.Index: Index configured for cosine similarity (inner product on normalized vectors)
    """
    dimension = model.movie_embedding.weight.shape[1]
    
    # Use IndexFlatIP (Inner Product) for cosine similarity
    index = faiss.IndexFlatIP(dimension)
    
    # Get and normalize movie embeddings
    movie_embeddings = model.movie_embedding.weight.detach().numpy().astype(np.float32)
    faiss.normalize_L2(movie_embeddings)  # Normalize for cosine similarity
    
    index.add(movie_embeddings)
    return index

def get_similar_movies_cosine(
    model: torch.nn.Module, 
    index: faiss.Index, 
    movie_idx: int, 
    df_movies: pd.DataFrame,
    n: int = 10
) -> pd.DataFrame:
    """
    Find similar movies using cosine similarity.
    
    Args:
        model: Trained collaborative filtering model
        index: FAISS index built with build_cosine_similarity_index
        movie_idx: Index of the query movie
        df_movies: DataFrame containing movie information
        n: Number of similar movies to return
    
    Returns:
        DataFrame with similar movies and their cosine similarity scores
    """
    # Get and normalize query movie embedding
    query_embedding = model.movie_embedding.weight[movie_idx:movie_idx+1].detach().numpy().astype(np.float32)
    faiss.normalize_L2(query_embedding)
    
    # Search for similar movies (returns similarity scores, not distances)
    similarities, similar_movie_indices = index.search(query_embedding, n)
    
    # Create results DataFrame
    results = _create_results_dataframe(
        similar_movie_indices[0], 
        similarities[0], 
        df_movies, 
        metric_name='cosine_similarity'
    )
    
    return results

def _create_results_dataframe(
    movie_indices: np.ndarray, 
    scores: np.ndarray, 
    df_movies: pd.DataFrame,
    metric_name: str = 'score'
) -> pd.DataFrame:
    """Helper function to create results DataFrame"""
    
    # Create mapping from movie index to score
    score_map = dict(zip(movie_indices, scores))
    
    # Filter movies and add scores
    results = df_movies[df_movies.movieId.isin(movie_indices)].copy()
    results[metric_name] = results['movieId'].map(score_map)
    
    # Sort by score (descending for similarity, ascending for distance)
    ascending = metric_name.endswith('distance')
    results = results.sort_values(metric_name, ascending=ascending)
    
    return results



In [281]:
# Usage
index = build_cosine_similarity_index(model)
similar_movies = get_similar_movies_cosine(model, index, movie_idx=1290, df_movies=df_movies, n=10)
print(similar_movies)

      movieId                                   title  \
989      1290           Some Kind of Wonderful (1987)   
3945     5553                         Stakeout (1987)   
3747     5220                         Showtime (2002)   
197       231  Dumb & Dumber (Dumb and Dumber) (1994)   
4343     6344    101 Reykjavik (101 Reykjavík) (2000)   
1809     2409                         Rocky II (1979)   

                             genres  movie_id_index  cosine_similarity  
989                   Drama|Romance           988.0           1.000000  
3945  Comedy|Crime|Romance|Thriller          3940.0           0.943884  
3747                  Action|Comedy          3742.0           0.884455  
197                Adventure|Comedy           197.0           0.871591  
4343           Comedy|Drama|Romance          4337.0           0.853902  
1809                   Action|Drama          1808.0           0.818806  
