In [25]:
# Importing initial packages
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset

# Setting the device to GPU for parallelization
device = 'mps' if torch.backends.mps.is_available() else 'cpu'
print(device)

mps


## Data Loading, Cleaning, and Preprocessing

### Data Loading

In [26]:
# In my project folder I added a data folder with both files (found in repo README) but did not push (gitignore) due to size
movies = pd.read_csv('data/movies.csv')

ratings = pd.read_csv('data/ratings.csv')

#### Inspecting the data

In [27]:
print(f"""The shape of movies is: {movies.shape}
The shape of rating is: {ratings.shape}""")

print(f"""The columns of movies is: {movies.columns.to_list()}
The columns of rating is: {ratings.columns.to_list()}

* Notice that both df's have a movieId column, which will be useful for merging""")

The shape of movies is: (9742, 3)
The shape of rating is: (100836, 4)
The columns of movies is: ['movieId', 'title', 'genres']
The columns of rating is: ['userId', 'movieId', 'rating', 'timestamp']

* Notice that both df's have a movieId column, which will be useful for merging


In [28]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [29]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


As this is an already cleaned dataset, we can skip over to data preprocessing.

### Data Preprocessing

For this project we will first need to create a user-movie interaction matrix.

In [30]:
# Assume NAs are not watched by user, so rating of 0
user_rating_matrix = ratings.pivot(index='userId', columns='movieId', values='rating').fillna(0)
user_rating_matrix.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
# Convert this from a pandas df to numpy array
user_rating_matrix_np = user_rating_matrix.values
print(f"Shape of the numpy matrix: {user_rating_matrix_np.shape}; represting {user_rating_matrix_np.shape[0]} users with {user_rating_matrix_np.shape[1]} movies.")

Shape of the numpy matrix: (610, 9724); represting 610 users with 9724 movies.


In [32]:
# Split into training and test split
train_data, test_data = train_test_split(user_rating_matrix_np, test_size=0.2, random_state=42)

# Convert to PyTorch tensors
train_data = torch.FloatTensor(train_data)
test_data = torch.FloatTensor(test_data)

## Model Development

### Defining Recommender model class

In [33]:
class Recommender(nn.Module):
    def __init__(self, num_users, num_items, n_embd = 50):
        super(Recommender, self).__init__()
        self.user_embedding = nn.Embedding(num_users, n_embd) # Embedding the users
        self.item_embedding = nn.Embedding(num_items, n_embd) # Embedding items
        self.fc1 = nn.Linear(n_embd * 2, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)

    def forward(self, user, item):
        user_embed = self.user_embedding(user)
        item_embed = self.item_embedding(item)
        x = torch.cat([user_embed, item_embed], dim=-1) # Concatenating the users and items 
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x
    
num_users, num_items = user_rating_matrix_np.shape
model = Recommender(num_users, num_items)

### Defining Loss Function and Optimizer
Purpose:

In [34]:
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr = 0.001)

### Creating DataLoader

In [41]:
class RatingsDataset(Dataset):
    def __init__(self,data):
        self.data = data.nonzero() # Get the indices of non-zero elements
        self.ratings = data[self.data] # Using non-zero indiced to extract those ratings

    def __len__ (self):
        
        return len(self.data)
    
    def forward (self, idx):
        user, item = self.data[idx]
        rating = self.ratings[idx]
        
        return user, item, rating
    
train_dataset = RatingsDataset(train_data)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle = True)

IndexError: index 2172 is out of bounds for dimension 0 with size 488

Now that we have created our model, and prepared the data for training, we can move onto model training!

## Model Training

In [None]:
epochs = 10
model.train()
for epoch in range(epochs):
    total_loss = 0
    for user, item, rating in train_loader:
        optimizer.zero_grad()
        output = model(user, item).squeeze()
        loss = criterion(output, rating)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader)}")  # Print the average loss for this epoch