In [None]:
# Importing initial packages
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset

# Setting the device to GPU for parallelization.
# Preference order: CUDA GPU, then Apple-silicon MPS, then plain CPU.
# The getattr guard keeps this cell working on PyTorch builds that do not ship the mps backend.
if torch.cuda.is_available():
    device = 'cuda'
elif getattr(torch.backends, 'mps', None) is not None and torch.backends.mps.is_available():
    device = 'mps'
else:
    device = 'cpu'
print(device)

In [None]:
# Setting Hyperparameters
batch_size = 16  # number of ratings per mini-batch
epochs = 100     # number of full passes over the training ratings

# Seed the random number generators so that weight initialisation and batch
# shuffling repeat across "Restart Kernel -> Run All" (42 matches the
# random_state used for the train/test split).
# NOTE(review): some GPU kernels may still be non-deterministic — confirm if exact repeatability matters.
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)

## Data Loading, Cleaning, and Preprocessing

### Data Loading

In [None]:
# Both CSV files sit in a local data/ folder (download links are in the repo README);
# the folder is git-ignored because the files are too large to push.
ratings = pd.read_csv('data/ratings.csv')
movies = pd.read_csv('data/movies.csv')

#### Inspecting the data

In [None]:
# Report the (rows, columns) shape of each DataFrame
print(f"""The shape of movies is: {movies.shape}
The shape of rating is: {ratings.shape}""")

# List the column names of each DataFrame, followed by a reminder about the shared movieId key
print(f"""The columns of movies is: {movies.columns.to_list()}
The columns of rating is: {ratings.columns.to_list()}

* Notice that both df's have a movieId column, which will be useful for merging""")

In [None]:
# Preview the first rows of the movies table
movies.head()

In [None]:
# Preview the first rows of the ratings table
ratings.head()

Since this dataset has already been cleaned, we can skip straight to data preprocessing.

### Data Preprocessing

For this project we will first need to create a user-movie interaction matrix.

In [None]:
# Reshape the long ratings table into a wide matrix: one row per userId, one column per movieId.
# Pairs with no rating come out of the pivot as NaN; we treat them as "not watched" and store 0.
user_rating_matrix = (
    ratings
    .pivot(index='userId', columns='movieId', values='rating')
    .fillna(0)
)
user_rating_matrix.head()

In [None]:
# Convert this from a pandas df to a numpy array.
# to_numpy() is the accessor the pandas documentation recommends over the older .values attribute.
user_rating_matrix_np = user_rating_matrix.to_numpy()
print(f"Shape of the numpy matrix: {user_rating_matrix_np.shape}; representing {user_rating_matrix_np.shape[0]} users with {user_rating_matrix_np.shape[1]} movies.")

In [None]:
# Split into training and test split.
#
# The split is done over the observed ratings, NOT over whole user rows.
# Splitting rows would renumber the users inside each subset (row 0 of the test
# matrix would be a different person than row 0 of the train matrix), so the
# model's user embeddings would be evaluated against the wrong users.
# Here both matrices keep the full (users x movies) shape; each one simply has
# the ratings belonging to the other subset zeroed out, so row/column positions
# mean the same user/movie everywhere.
observed_users, observed_items = np.nonzero(user_rating_matrix_np)
train_idx, test_idx = train_test_split(
    np.arange(observed_users.size), test_size=0.2, random_state=42
)


def _masked_copy(keep_idx):
    """Return a zero-filled copy of the rating matrix holding only the selected observed ratings."""
    masked = np.zeros_like(user_rating_matrix_np)
    rows, cols = observed_users[keep_idx], observed_items[keep_idx]
    masked[rows, cols] = user_rating_matrix_np[rows, cols]
    return masked


# Convert to PyTorch tensors (float32, the dtype the model's linear layers use by default)
train_data = torch.tensor(_masked_copy(train_idx), dtype=torch.float32)
test_data = torch.tensor(_masked_copy(test_idx), dtype=torch.float32)

## Model Development

### Defining Recommender model class

In [None]:
class Recommender(nn.Module):
    """Rating predictor built on learned user and item embeddings.

    A user index and an item index are each looked up in their own embedding
    table. The two vectors are concatenated and passed through three linear
    layers (ReLU after the first two), producing one output value per pair.

    Args:
        num_users: number of rows in the user embedding table.
        num_items: number of rows in the item embedding table.
        n_embd: width of each embedding vector (default 50).
    """

    def __init__(self, num_users, num_items, n_embd = 50):
        super().__init__()
        concat_width = 2 * n_embd  # user vector and item vector placed side by side

        # Lookup tables: one learned vector per user / per item
        self.user_embedding = nn.Embedding(num_users, n_embd)
        self.item_embedding = nn.Embedding(num_items, n_embd)

        # MLP head: concat_width -> 128 -> 64 -> 1
        self.fc1 = nn.Linear(concat_width, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)

    def forward(self, user, item):
        """Return the model output for each (user, item) index pair.

        The result keeps a trailing dimension of size 1 (the output of ``fc3``).
        """
        hidden = torch.cat(
            (self.user_embedding(user), self.item_embedding(item)),
            dim=-1,
        )
        # ReLU follows the two hidden layers only; the output layer stays linear
        for layer in (self.fc1, self.fc2):
            hidden = torch.relu(layer(hidden))
        return self.fc3(hidden)
    
# Size the embedding tables from the interaction matrix: one row per user, one column per movie
num_users, num_items = user_rating_matrix_np.shape
model = Recommender(num_users=num_users, num_items=num_items)
model = model.to(device)  # move the parameters onto the selected device

### Defining Loss Function and Optimizer
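Purpose: the loss function (mean squared error) measures how far the predicted ratings are from the true ratings, and the optimizer (Adam) updates the model's weights to reduce that loss.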

In [None]:
# Mean squared error between predicted and actual ratings
criterion = nn.MSELoss()
# Adam updates every model parameter with a learning rate of 0.003
optimizer = optim.Adam(params=model.parameters(), lr=0.003)

### Creating DataLoader

In [None]:
class RatingsDataset(Dataset):
    """Dataset of the non-zero entries of a rating tensor.

    Each non-zero entry becomes one sample ``(user, item, rating)``, where
    ``user`` and ``item`` are the entry's positions along the first and second
    dimensions. Zero entries are skipped.

    Args:
        data: rating tensor — presumably 2-D with users on rows and items on
            columns; confirm against the caller.
    """

    def __init__(self, data):
        # One index tensor per dimension, locating every non-zero entry
        self.data = data.nonzero(as_tuple=True)
        # The rating values at those positions, in the same order
        self.ratings = data[self.data]

    def __len__ (self):
        # One sample per non-zero entry
        return self.data[0].shape[0]

    def __getitem__ (self, idx):
        """Return the ``(user, item, rating)`` triple stored at position ``idx``."""
        user_indices, item_indices = self.data[0], self.data[1]
        return user_indices[idx], item_indices[idx], self.ratings[idx]
    
# Wrap each rating matrix in a dataset of (user, item, rating) samples
train_dataset = RatingsDataset(train_data)
test_dataset = RatingsDataset(test_data)

# Training batches are drawn in shuffled order; test batches keep the dataset order
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

print(f"Batch size: {batch_size}")
print(f"Number of samples in train_dataset: {len(train_dataset)}")
print(f"Number of samples in test_dataset: {len(test_dataset)}")


Now that we have created our model and prepared the data for training, we can move on to model training!

## Model Training

In [None]:
# Train the recommender with mini-batch gradient descent over the training ratings.
model.train()  # put the model in training mode
for epoch in range(epochs):
    total_loss = 0
    for user, item, rating in train_loader:
        # Move the batch onto the same device as the model
        user, item, rating = user.to(device), item.to(device), rating.to(device)
        optimizer.zero_grad()  # clear gradients left over from the previous step
        # Drop only the trailing size-1 output dimension. A bare squeeze() would
        # also collapse the batch dimension when the final batch holds a single
        # sample, leaving a 0-d prediction that no longer matches the target shape.
        output = model(user, item).squeeze(-1)
        loss = criterion(output, rating)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Report every 10th epoch to keep the log short
    if (epoch+1)%10==0:
        print(f"Epoch {epoch+1}, Loss: {(total_loss/len(train_loader))}")  # Print the average loss for this epoch

## Model Evaluation

In [None]:
def evaluate(model, data_loader, loss_fn=None):
    """Return the average per-sample loss of ``model`` over ``data_loader``.

    Args:
        model: module called as ``model(user, item)``; its output is expected
            to carry a trailing dimension of size 1.
        data_loader: iterable yielding ``(user, item, rating)`` batches.
        loss_fn: loss to apply to ``(prediction, rating)``. Defaults to the
            notebook-level ``criterion``. It must use mean reduction, because
            each batch loss is re-weighted by the batch size.

    Returns:
        The loss averaged over every sample seen, or ``nan`` when the loader
        yields no samples.
    """
    if loss_fn is None:
        loss_fn = criterion

    # Run on whatever device the model lives on (CPU if it has no parameters)
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    model.eval()
    total_loss = 0.0
    total_samples = 0
    with torch.no_grad():
        for user, item, rating in data_loader:
            user = user.to(target_device)
            item = item.to(target_device)
            rating = rating.to(target_device)
            # squeeze(-1) removes only the output dimension, so a batch holding
            # a single sample keeps its batch dimension and matches the target
            output = model(user, item).squeeze(-1)
            loss = loss_fn(output, rating)
            # Weight by batch size so a short final batch is not over-represented
            batch_samples = rating.numel()
            total_loss += loss.item() * batch_samples
            total_samples += batch_samples

    if total_samples == 0:
        return float('nan')
    return total_loss / total_samples

# Score the trained model on the held-out ratings
test_loss = evaluate(model=model, data_loader=test_loader)
print(f"Test Loss: {test_loss}")