## Deep Learning based Recommender Systems
### Neural Collaborative Filtering model

In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm.notebook import tqdm, trange
#import pytorch_lightning as pl

np.random.seed(5412087)

In [2]:
## Load Large file
# https://www.kaggle.com/grouplens/movielens-20m-dataset
ratings = pd.read_csv('/kaggle/input/movielens-20m-dataset/rating.csv', 
                      parse_dates=['timestamp'])

In [None]:
## Load Small Test file
ratings = pd.read_csv('../input/testcsv/ratings.csv')
ratings['timestamp'] = pd.to_datetime(ratings['timestamp'], unit = 's')

In [None]:
ratings

In [3]:
## Randomly sample data 
rand_userIds = np.random.choice(ratings['userId'].unique(), 
                                size=int(len(ratings['userId'].unique())*0.03), 
                                replace=False)

ratings = ratings.loc[ratings['userId'].isin(rand_userIds)]

In [4]:
## Leave most recent rating as test data
ratings['rank_latest'] = ratings.groupby(['userId'])['timestamp'].rank(method='first', ascending=False)

train_ratings = ratings[ratings['rank_latest'] != 1]
test_ratings = ratings[ratings['rank_latest'] == 1]

train_ratings = train_ratings[['userId', 'movieId', 'rating']]
test_ratings = test_ratings[['userId', 'movieId', 'rating']]

In [None]:
test_ratings

In [5]:
## Code ratings to binary
train_ratings.loc[:, 'rating'] = 1

train_ratings.sample(5)

Unnamed: 0,userId,movieId,rating
19266381,133331,1371,1
17451604,120701,2730,1
5949008,40986,161,1
4563090,31234,51575,1
18842020,130485,296,1


In [6]:
## Combine positive data with negative data
## Create tensor 
class MovieLensTrainDataset(Dataset):
    """MovieLens PyTorch Dataset for Training
    
    Args:
        ratings (pd.DataFrame): Dataframe containing the movie ratings
        all_movieIds (list): List containing all movieIds
    
    """

    def __init__(self, ratings, all_movieIds):
        self.users, self.items, self.labels = self.get_dataset(ratings, all_movieIds)

    def __len__(self):
        return len(self.users)
  
    def __getitem__(self, idx):
        return self.users[idx], self.items[idx], self.labels[idx]

    def get_dataset(self, ratings, all_movieIds):
        users, items, labels = [], [], []
        user_item_set = set(zip(ratings['userId'], ratings['movieId']))

        num_negatives = 4
        for u, i in user_item_set:
            users.append(u)
            items.append(i)
            labels.append(1)
            for _ in range(num_negatives):
                negative_item = np.random.choice(all_movieIds)
                while (u, negative_item) in user_item_set:
                    negative_item = np.random.choice(all_movieIds)
                users.append(u)
                items.append(negative_item)
                labels.append(0)

        return torch.tensor(users), torch.tensor(items), torch.tensor(labels)

In [8]:
# Get all movie IDs
all_movieIds = ratings['movieId'].unique()

# Placeholders that will hold the training data
#users, items, labels = [], [], []

# This is the set of items that each user has interaction with
#user_item_set = set(zip(train_ratings['userId'], train_ratings['movieId']))

In [9]:
train_loader = DataLoader(MovieLensTrainDataset(ratings, all_movieIds),
                          batch_size=512, num_workers=4,  pin_memory=True)

In [10]:
## Test tqdm
from tqdm.notebook import tqdm
for i in tqdm(range(int(9e6))):
    pass

HBox(children=(FloatProgress(value=0.0, max=9000000.0), HTML(value='')))




In [11]:
## Generate total num_users, num_items
num_users = ratings['userId'].max()+1
num_items = ratings['movieId'].max()+1


In [12]:
## Write model
import torch.nn as nn

class NCF(nn.Module):
    def __init__(self, num_users, num_items):
        super().__init__()
        self.user_embedding = nn.Embedding(num_embeddings=num_users, embedding_dim=8)
        self.item_embedding = nn.Embedding(num_embeddings=num_items, embedding_dim=8)
        self.fc1 = nn.Linear(in_features=16, out_features=64)
        self.fc2 = nn.Linear(in_features=64, out_features=32)
        self.output = nn.Linear(in_features=32, out_features=1)
        self.ratings = ratings
        self.all_movieIds = all_movieIds

    def forward(self, user_input, item_input):
        
        # Pass through embedding layers
        user_embedded = self.user_embedding(user_input)
        item_embedded = self.item_embedding(item_input)

        # Concat the two embedding layers
        vector = torch.cat([user_embedded, item_embedded], dim=-1)

        # Pass through dense layer
        vector = nn.ReLU()(self.fc1(vector))
        vector = nn.ReLU()(self.fc2(vector))

        # Output layer
        pred = nn.Sigmoid()(self.output(vector))

        return pred     



## Training
# Instantiate model  

model1 = NCF(num_users, num_items)
model1.cuda()
# Loss and Optimizer
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model1.parameters(), lr=0.001)

# Iterate through train set minibatchs 
for epoch in trange(5):  # <---- change here
    for user_input, item_input, labels in tqdm(train_loader):
        user_input, item_input, labels = user_input.cuda(), item_input.cuda(), labels.cuda()
        # Zero out the gradients
        optimizer.zero_grad()

        # Forward pass
        predicted_labels = model1(user_input, item_input)
        loss = criterion(predicted_labels, labels.view(-1, 1).float())
        # Backward pass
        loss.backward()
        optimizer.step()



HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=5755.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5755.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5755.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5755.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5755.0), HTML(value='')))





In [None]:
## Alarm when model done running
from IPython.lib.display import Audio
import numpy as np

framerate = 4410
play_time_seconds = 15

t = np.linspace(0, play_time_seconds, framerate*play_time_seconds)
audio_data = np.sin(2*np.pi*300*t) + np.sin(2*np.pi*240*t)
Audio(audio_data, rate=framerate, autoplay=True)

In [13]:
## Move model back to cpu
device = 'cpu'
model1.to(device)

## Test model using The Hit Ratio @ 10 
# User-item pairs for testing
test_user_item_set = set(zip(test_ratings['userId'], test_ratings['movieId']))

# Dict of all items that are interacted with by each user
user_interacted_items = ratings.groupby('userId')['movieId'].apply(list).to_dict()

hits = []
for (u,i) in tqdm(test_user_item_set):
    interacted_items = user_interacted_items[u]
    not_interacted_items = set(all_movieIds) - set(interacted_items)
    selected_not_interacted = list(np.random.choice(list(not_interacted_items), 99))
    test_items = selected_not_interacted + [i]
    
    predicted_labels = np.squeeze(model1(torch.tensor([u]*100), 
                                        torch.tensor(test_items)).detach().numpy())
    
    top10_items = [test_items[i] for i in np.argsort(predicted_labels)[::-1][0:10].tolist()]
    
    if i in top10_items:
        hits.append(1)
    else:
        hits.append(0)
        
print("The Hit Ratio @ 10 is {:.2f}".format(np.average(hits)))

HBox(children=(FloatProgress(value=0.0, max=4154.0), HTML(value='')))


The Hit Ratio @ 10 is 0.71
