In [1]:
from pathlib import Path
import pandas as pd
import numpy as np

#### Lets build a recommender system on the MovieLens dataset

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from tqdm import tqdm


In [3]:
! head data/ml-100k/u.data

196	242	3	881250949
186	302	3	891717742
22	377	1	878887116
244	51	2	880606923
166	346	1	886397596
298	474	4	884182806
115	265	2	881171488
253	465	5	891628467
305	451	3	886324817
6	86	3	883603013


In [4]:
# import the ml100k dataset
df = pd.read_csv('data/ml-100k/u.data', sep='\t', header=None, names=['user_id', 'item_id', 'rating', 'timestamp'])
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [5]:
# count the number of unique users and items
print(df.user_id.unique().shape[0]), print(df.item_id.unique().shape[0])

943
1682


(None, None)

In [6]:
# number of ratings
print(df.shape[0])

100000


In [7]:
# split the data into train and test sets
train, test = train_test_split(df, test_size=0.2, random_state=42)

In [8]:
# instead, lets create a function which creates a dictionary of user ids and item ids
def create_user_item_dict(df):
    user_ids = df.user_id.unique().tolist()
    item_ids = df.item_id.unique().tolist()
    user2idx = {o:i for i,o in enumerate(user_ids)}
    item2idx = {o:i for i,o in enumerate(item_ids)}
    return user2idx, item2idx

In [9]:
# get the user and item dictionaries
user2idx, item2idx = create_user_item_dict(train)

In [10]:
train.head()

Unnamed: 0,user_id,item_id,rating,timestamp
75220,807,1411,1,893082619
48955,474,659,5,887925187
44966,463,268,4,877384940
13568,139,286,4,879537844
92727,621,751,4,883799651


In [11]:
# create a dataset class that does the encoding too
class MovieLensDataset(Dataset):
    def __init__(self, df, user2idx, item2idx):
        self.users = df.user_id.values
        self.items = df.item_id.values
        self.ratings = df.rating.values
        self.user2idx = user2idx
        self.item2idx = item2idx
        
    def __len__(self):
        return len(self.users)
    
    def __getitem__(self, idx):
        user = self.user2idx.get(self.users[idx], -1)
        item = self.item2idx.get(self.items[idx], -1)
        rating = self.ratings[idx]
        return user, item, rating

In [12]:
# initialize the dataset
train_ds = MovieLensDataset(train, user2idx, item2idx)
test_ds = MovieLensDataset(test, user2idx, item2idx)

In [13]:
#check the dataset class
train_ds[0]


(0, 0, 1)

In [14]:

# create a dataloader

train_dl = DataLoader(train_ds, batch_size=1024, shuffle=True)
test_dl = DataLoader(test_ds, batch_size=1024, shuffle=False)


In [15]:
next(iter(train_dl))

[tensor([786, 417, 242,  ..., 382, 348, 390]),
 tensor([173, 478, 154,  ..., 374, 455, 272]),
 tensor([3, 2, 2,  ..., 4, 3, 4])]

In [18]:
device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.has_mps else "cpu")
print(device)

mps


### i. Trying Simple Matrix Factorization model
- Here Embedding layer is used to learn the latent factors

In [38]:
# create a model class
class MatrixFactorization(nn.Module):
    def __init__(self, n_users, n_items, n_factors=40):
        super().__init__()
        self.user_factors = nn.Embedding(n_users, n_factors)
        self.item_factors = nn.Embedding(n_items, n_factors)
        #initialize the embeddings, this is important. Else, model can struggle to learn
        nn.init.normal_(self.user_factors.weight, std=0.01)
        nn.init.normal_(self.item_factors.weight, std=0.01)
        
    def forward(self, user, item):
        return (self.user_factors(user) * self.item_factors(item)).sum(1)

In [39]:

# initialize the model
model = MatrixFactorization(len(user2idx), len(item2idx), n_factors=60).to(device)

# define the loss function. We will use mean squared error because we are predicting ratings
# if we were predicting a binary outcome, we would use binary cross entropy
criterion = nn.MSELoss()

# define the optimizer
optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)

# define the number of epochs
epochs = 20

In [40]:
# no of batches
len(train_dl)


79

In [41]:
# train the model.
# use the tqdm library to show the progress bar

for epoch in range(epochs):
    running_loss = 0.0
    train_tqdm = tqdm(train_dl)
    for i, data in enumerate(train_tqdm):
        users, items, ratings = data
        users = users.long().to(device)
        items = items.long().to(device)
        ratings = ratings.float().to(device)
        
        # zero the parameter gradients
        optimizer.zero_grad()
        
        outputs = model(users, items)
        loss = criterion(outputs, ratings)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
        train_tqdm.set_description(f"Epoch {epoch}, Loss: {loss.item():.4f}")

Epoch 0, Loss: 13.0230: 100%|██████████| 79/79 [00:00<00:00, 120.79it/s]
Epoch 1, Loss: 14.2037: 100%|██████████| 79/79 [00:00<00:00, 126.59it/s]
Epoch 2, Loss: 14.8197: 100%|██████████| 79/79 [00:00<00:00, 130.84it/s]
Epoch 3, Loss: 13.8361: 100%|██████████| 79/79 [00:00<00:00, 126.19it/s]
Epoch 4, Loss: 14.0629: 100%|██████████| 79/79 [00:00<00:00, 127.42it/s]
Epoch 5, Loss: 13.8282: 100%|██████████| 79/79 [00:00<00:00, 120.67it/s]
Epoch 6, Loss: 13.3747: 100%|██████████| 79/79 [00:00<00:00, 120.88it/s]
Epoch 7, Loss: 13.8676: 100%|██████████| 79/79 [00:00<00:00, 121.34it/s]
Epoch 8, Loss: 12.7109: 100%|██████████| 79/79 [00:00<00:00, 123.39it/s]
Epoch 9, Loss: 13.7339: 100%|██████████| 79/79 [00:00<00:00, 119.36it/s]
Epoch 10, Loss: 14.3827: 100%|██████████| 79/79 [00:00<00:00, 126.05it/s]
Epoch 11, Loss: 13.9675: 100%|██████████| 79/79 [00:00<00:00, 128.62it/s]
Epoch 12, Loss: 13.0315: 100%|██████████| 79/79 [00:00<00:00, 130.17it/s]
Epoch 13, Loss: 13.2345: 100%|██████████| 79/79 

In [42]:
# predict the ratings for the test set
y_true = []
y_pred = []
for i, data in enumerate(test_dl):
    users, items, ratings = data
    users = users.long().to(device)
    items = items.long().to(device)
    ratings = ratings.float().to(device)
    
    outputs = model(users, items)
    y_true.extend(ratings.cpu().detach().numpy())
    y_pred.extend(outputs.cpu().detach().numpy())

In [43]:
# calculate the mean squared error
from sklearn.metrics import mean_squared_error
mean_squared_error(y_true, y_pred)


13.683395