[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/secretorange/ml-recommender/blob/main/MatrixFactorization.ipynb)

This is a bare-bones example of using matrix factorization to build a recommendation system with PyTorch.

Matrix factorization is a popular technique used in recommender systems to predict user preferences. Imagine a big grid where rows are users, columns are items (like movies or products), and the cells contain ratings. Many cells in this grid are empty because not every user has rated every item. Matrix factorization works by breaking this large table into two smaller tables: one representing users and the other representing items. These smaller tables capture the underlying **factors** that influence user preferences, like genre preferences in movies. By multiplying these smaller tables, we can estimate the missing ratings in the original table, helping us recommend items that users are likely to enjoy.

## Dataset

In [None]:
from torch.utils.data import Dataset, DataLoader
import zipfile
import pandas as pd
import pdb;
from collections import defaultdict
from sklearn.preprocessing import LabelEncoder

class MovieDataset(Dataset):
  """MovieLens ``ml-latest-small`` ratings exposed as a PyTorch ``Dataset``.

  Constructing the dataset downloads the archive into the working directory,
  extracts it under ``data/`` and loads ``ratings.csv``. Raw user and movie
  ids are replaced by label-encoded values so they can be used directly as
  embedding row indexes.

  Attributes set by ``__init__``:
    d: maps the column name ('userId' or 'movieId') to the ``LabelEncoder``
      fitted on that column; use it to translate raw ids to indexes.
    x: tensor with one ``[encoded userId, encoded movieId]`` row per rating.
    y: tensor with the rating for each row of ``x``.
    n_users: number of distinct users in the ratings.
    n_items: number of distinct movies in the ratings.
  """

  def __init__(self):
    # Imported locally so this class does not rely on another notebook cell
    # having imported these modules first.
    import urllib.request
    import torch

    archive_path = 'ml-latest-small.zip'

    # Download the popular movielens dataset. Plain Python is used instead of
    # a notebook shell escape so the class also works outside IPython and a
    # failed HTTP request raises here rather than surfacing later as a
    # corrupt-zip error.
    urllib.request.urlretrieve(
        'http://files.grouplens.org/datasets/movielens/ml-latest-small.zip',
        archive_path)

    with zipfile.ZipFile(archive_path, 'r') as archive:
      archive.extractall('data')

    ratings_df = pd.read_csv('data/ml-latest-small/ratings.csv')

    # Label Encode the ids (the encodings will then match the indexes of the embeddings)
    self.d = defaultdict(LabelEncoder)
    for c in ['userId', 'movieId']:
      # Encode the ids
      self.d[c].fit(ratings_df[c].unique())

      # Swap out the ids for the encoded values
      ratings_df[c] = self.d[c].transform(ratings_df[c])

    # Select the feature columns explicitly so the (user, movie) column order
    # that the model relies on does not depend on the CSV layout.
    self.x = torch.tensor(ratings_df[['userId', 'movieId']].values)
    self.y = torch.tensor(ratings_df['rating'].values)

    self.n_users = ratings_df['userId'].nunique()
    self.n_items = ratings_df['movieId'].nunique()

  def __getitem__(self, index):
    """Return the ``(x, y)`` pair stored at ``index``."""
    return (self.x[index], self.y[index])

  def __len__(self):
    """Return the number of ratings in the dataset."""
    return len(self.x)

## Matrix Factorization

In [None]:
import torch
import numpy as np

class MatrixFactorization(torch.nn.Module):
  """Matrix factorization model that scores a (user, item) pair by dot product.

  Args:
    n_users: number of rows in the user embedding table.
    n_items: number of rows in the item embedding table.
    n_factors: length of each user/item embedding vector.
  """

  def __init__(self, n_users, n_items, n_factors=20):
    super().__init__()
    # Create the embeddings that will be trained
    self.user_factors = torch.nn.Embedding(n_users, n_factors)
    self.item_factors = torch.nn.Embedding(n_items, n_factors)

    # Initialise to random weights drawn uniformly between 0 and 0.05
    torch.nn.init.uniform_(self.user_factors.weight, 0, 0.05)
    torch.nn.init.uniform_(self.item_factors.weight, 0, 0.05)

  def forward(self, data):
    """Return one score per row of ``data``.

    ``data`` is indexed as ``data[:, 0]`` (user indexes) and ``data[:, 1]``
    (item indexes); the score is the dot product of the two embeddings.
    """
    users, items = data[:, 0], data[:, 1]

    user_embedding = self.user_factors(users)
    item_embedding = self.item_factors(items)

    # Element-wise product summed over the factor dimension == dot product
    return (user_embedding * item_embedding).sum(1)

  def predict(self, user, item):
    """Score a single (user index, item index) pair.

    Returns the one-element tensor produced by ``forward``.
    """
    # Build the index tensor on the same device as the embeddings so this
    # keeps working after the model has been moved off the CPU.
    device = self.user_factors.weight.device
    data = torch.tensor([[user, item]], dtype=torch.long, device=device)
    return self.forward(data)


## Train Test Split

In [None]:
from torch.utils.data import DataLoader, Dataset, random_split
from sklearn.model_selection import train_test_split

# Build the full ratings dataset; despite its name it holds every sample,
# and the train/test subsets are carved out of it below.
train_set = MovieDataset()

# Randomly hold out 20% of the samples for testing
n_samples = len(train_set)
train_size = int(0.8 * n_samples)
test_size = n_samples - train_size
train_dataset, test_dataset = random_split(train_set, [train_size, test_size])

# Batch both subsets; only the training batches are reshuffled each epoch
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)

# Training configuration: mean-squared-error loss, 8 latent factors, Adam
epochs = 128
loss_fn = torch.nn.MSELoss()
model = MatrixFactorization(train_set.n_users, train_set.n_items, n_factors=8)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
for epoch_number in range(1, epochs + 1):
    model.train()
    running_loss = 0.0
    for features, targets in train_loader:
        optimizer.zero_grad()

        # Forward pass; targets are cast to float32 before computing the loss
        batch_loss = loss_fn(model(features), targets.type(torch.float32))

        # Backward pass and parameter update
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()

    # Report the mean of the per-batch losses for this epoch
    mean_loss = running_loss / len(train_loader)
    print(f"Epoch {epoch_number}/{epochs}, Loss: {mean_loss}")

# Evaluation function
def evaluate(model, loader):
    """Return the mean of the per-batch losses of ``model`` over ``loader``.

    The model is switched to eval mode (and left there) and gradients are
    disabled while each batch is scored with the module-level ``loss_fn``.
    The sum of the batch losses is divided by the number of batches, not by
    the number of samples.
    """
    model.eval()
    summed_loss = 0
    with torch.no_grad():
        for features, targets in loader:
            batch_loss = loss_fn(model(features), targets.type(torch.float32))
            summed_loss += batch_loss.item()

    n_batches = len(loader)
    return summed_loss / n_batches

# Score the trained model on the held-out test batches and report the result
test_loss = evaluate(model=model, loader=test_loader)
print(f"Test Loss: {test_loss}")

## Predict

In [None]:
def predict(userId, movieId):
  """Print the model's predicted rating for a raw user id / movie id pair.

  Args:
    userId: user id as it appears in the ratings data (not the encoded index).
    movieId: movie id as it appears in the ratings data (not the encoded index).

  Both ids are translated with the label encoders stored on ``train_set``;
  an id the encoder was not fitted on is rejected by ``transform``.
  """
  # Translate the raw ids into the embedding row indexes used during training
  movie_index = train_set.d['movieId'].transform([movieId])[0]
  user_index = train_set.d['userId'].transform([userId])[0]

  # The model must receive the encoded user index, not the raw userId:
  # passing the raw id would score a different user or index out of range.
  predicted_rating = model.predict(int(user_index), int(movie_index))
  print(f"Predicted rating for user {userId} and item {movieId}: {predicted_rating}")

# Example predictions: user 1 against movies 1 and 2
for example_movie_id in (1, 2):
  predict(1, example_movie_id)