In [None]:
import os

import numpy as np 
import pandas as pd 

import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset, random_split

import pytorch_lightning as pl

In [None]:
class LightningMatrixFactorization(pl.LightningModule):
    """Matrix-factorization rating model built on two embedding tables.

    Every user id and every book id is looked up in its own embedding
    table; the predicted rating is the dot product of the two resulting
    vectors. Training and validation both use mean squared error against
    the observed ratings.
    """

    def __init__(self, number_of_books, number_of_users, **kwargs):
        """Create the user and book embedding tables.

        Args:
            number_of_books: number of rows in the book embedding table.
                Book ids are used directly as row indices, so they must
                lie in ``[0, number_of_books)``.
            number_of_users: number of rows in the user embedding table,
                with the same indexing requirement for user ids.
            **kwargs: optional settings.
                ``lr`` -- learning rate handed to Adam (default ``1e-3``).
                ``embed_dim`` -- embedding vector length (default ``32``).
        """
        super().__init__()

        self.learning_rate = kwargs.get("lr", 1e-3)
        self.number_of_books = number_of_books
        self.number_of_users = number_of_users
        self.embed_dim = kwargs.get("embed_dim", 32)

        self.embed_users = torch.nn.Embedding(
            self.number_of_users, self.embed_dim)
        self.embed_books = torch.nn.Embedding(
            self.number_of_books, self.embed_dim)

    def forward(self, users, books):
        """Predict ratings for paired user and book id tensors.

        Args:
            users: integer tensor of user ids.
            books: integer tensor of book ids, paired element-wise with
                ``users``.

        Returns:
            Tensor of predicted ratings with the embedding axis summed out.
        """
        embedded_books = self.embed_books(books)
        embedded_users = self.embed_users(users)

        # Dot product along the embedding axis.
        predicted = torch.sum(
            torch.multiply(embedded_users, embedded_books),
            dim=-1)

        return predicted

    def _batch_mse(self, batch):
        """Return the MSE between predictions and ratings for one batch.

        ``batch`` is indexed as ``(users, books, ratings)``.
        """
        users = batch[0]
        books = batch[1]
        ratings = batch[2]

        # Go through forward() so training, validation and inference all
        # share a single definition of the prediction.
        predicted = self(users, books)

        return F.mse_loss(predicted, ratings)

    def training_step(self, batch, batch_idx):
        """Compute and return the training MSE loss for ``batch``."""
        return self._batch_mse(batch)

    def validation_step(self, batch, batch_idx):
        """Compute the validation MSE for ``batch`` and log it as ``val_loss``."""
        val_loss = self._batch_mse(batch)
        self.log("val_loss", val_loss)

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters at ``self.learning_rate``."""
        optimizer = torch.optim.Adam(
            self.parameters(), lr=self.learning_rate)
        return optimizer

In [None]:
# Training hyperparameters.
batch_size = 2 ** 18  # 262144 samples per batch
learning_rate = 0.001
embedding_size = 32
max_epochs = 1_000

# Number of DataLoader worker processes; chosen to match the CPU cores
# available on Kaggle GPU notebooks.
num_workers = 2

In [None]:
# Fixed seed so the shuffle -- and therefore the subsample and the
# train/val/test split derived from row order -- is identical on every
# Restart & Run All.
shuffle_seed = 42
# Upper bound on the number of rows kept after shuffling.
max_rows = 100000

my_filepath = "../data/ratings.csv"
df = pd.read_csv(my_filepath)
# Shuffle every row, renumber the index, then keep the first `max_rows`.
df = df.sample(frac=1, random_state=shuffle_seed).reset_index(drop=True)
df = df[:max_rows]
print(len(df))

df.head()

In [None]:
# 60 / 20 / 20 split by row position: train first, then validation, then test.
n_rows = len(df)
test_split = int(0.2 * n_rows)

# Use explicit non-negative boundaries. With negative offsets a zero-sized
# split turns `df[:-0]` into an empty training set and `df[-0:]` into the
# whole frame, which silently swaps the roles of train and test.
train_end = n_rows - 2 * test_split
val_end = n_rows - test_split

train_df = df.iloc[:train_end]
val_df = df.iloc[train_end:val_end]
test_df = df.iloc[val_end:]

In [None]:
# Training tensors: ids as int64 for embedding lookup, ratings as float32
# to match the dtype of the model's predictions in the MSE loss.
users = torch.tensor(train_df.user_id.values).long()
books = torch.tensor(train_df.book_id.values).long()
ratings = torch.tensor(train_df.rating.values).float()
dataset = TensorDataset(users, books, ratings)

train_loader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers)

# Validation tensors, built the same way from the validation split.
val_users = torch.tensor(val_df.user_id.values).long()
val_books = torch.tensor(val_df.book_id.values).long()
val_ratings = torch.tensor(val_df.rating.values).float()
val_dataset = TensorDataset(val_users, val_books, val_ratings)

# The validation loader must wrap `val_dataset`; wrapping the training
# `dataset` would report training loss under the name `val_loss`.
val_loader = DataLoader(val_dataset, batch_size=batch_size, num_workers=num_workers)



In [None]:
# Ids are used directly as embedding row indices, so each table needs
# (largest id + 1) rows. Sizes are taken from the full frame so ids that
# appear only in the validation or test rows are still in range.
number_of_users = int(np.max(df["user_id"])) + 1
number_of_books = int(np.max(df["book_id"])) + 1

# Pass the configured hyperparameters through explicitly; otherwise the
# model falls back to its own built-in defaults and edits to
# `learning_rate` / `embedding_size` have no effect.
lmf = LightningMatrixFactorization(
    number_of_books,
    number_of_users,
    lr=learning_rate,
    embed_dim=embedding_size,
)

In [None]:
# Build the Trainer arguments once: always cap the epoch count, and request
# a single GPU only when CUDA is available. Without CUDA nothing else is
# passed, so the Trainer's own defaults apply.
trainer_kwargs = {"max_epochs": max_epochs}
if torch.cuda.is_available():
    trainer_kwargs.update(accelerator="gpu", devices=1)

trainer = pl.Trainer(**trainer_kwargs)

trainer.fit(
    model=lmf,
    train_dataloaders=train_loader,
    val_dataloaders=val_loader,
)

In [None]:
# Evaluate the fitted model on the held-out test rows.
# Inputs are placed on whatever device the model currently lives on so the
# embedding lookups do not mix devices.
device = lmf.device
test_users = torch.tensor(test_df.user_id.values).long().to(device)
test_books = torch.tensor(test_df.book_id.values).long().to(device)
test_ratings = torch.tensor(test_df.rating.values).float().to(device)

# Switch to eval mode once, up front, instead of on every loop iteration.
lmf.eval()

# Both the aggregate loss and the individual example predictions are pure
# inference, so all of it runs without building autograd graphs.
with torch.no_grad():
    test_prediction = lmf(test_users, test_books)

    test_loss = F.mse_loss(test_prediction, test_ratings)

    test_msg = f"MSE loss for test data = {test_loss:.3} \n"
    print(test_msg)

    for hh in range(10):
        # see a few examples of predictions
        my_index = np.random.randint(len(test_users))

        my_prediction = lmf(test_users[my_index], test_books[my_index])

        msg = f"Test set prediction {my_prediction}, ground truth: {test_ratings[my_index]}"
        print(msg)