In [1]:
# Much of this comes from the site https://www.ethanrosenthal.com/2017/06/20/matrix-factorization-in-pytorch/
# Accessed on March 7th, 2021

import torch.nn.functional as F
from torch import optim
from tqdm import tqdm
from torch import nn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib notebook
import torch
import os

from pathlib import Path

In [2]:
#%%
# Root of the shared project drive.
#
# Set the CMPUT466_DRIVE_PATH environment variable to point somewhere else
# instead of editing this cell.
#
# If using colab, mount the drive first and point the variable at it:
# from google.colab import drive
# drive.mount('/content/drive')
# os.environ["CMPUT466_DRIVE_PATH"] = "/content/drive/Shared drives/CMPUT466 Project"
#
# If using windows, the default below (the path originally hard-coded here) is used.
DRIVE_PATH = Path(os.environ.get("CMPUT466_DRIVE_PATH", "G:/Shared drives/CMPUT466 Project"))
DATA_FOLDER = DRIVE_PATH / "src/data/"

# Fail early with an actionable message; pd.read_csv would raise the same
# exception type, but without saying how to fix the path.
if not DATA_FOLDER.is_dir():
    raise FileNotFoundError(
        f"Data folder not found: {DATA_FOLDER}. "
        "Set the CMPUT466_DRIVE_PATH environment variable to the project drive root."
    )

train = pd.read_csv(DATA_FOLDER / "user_data_train.csv")
validation = pd.read_csv(DATA_FOLDER / "user_data_validation.csv")

In [3]:
def to_categorical(df, col):
    """Encode the values of ``df[col]`` as integer codes.

    Codes are handed out in order of first appearance: the first distinct
    value becomes 0, the next previously unseen value becomes 1, and so on.

    Args:
        df: DataFrame holding the column to encode.
        col: Name of the column in ``df``.

    Returns:
        A NumPy array with one code per row of ``df``, in row order.
    """
    column = df[col]

    # Build the value -> code lookup from the distinct values.
    code_for = {}
    for code, label in enumerate(column.unique()):
        code_for[label] = code

    return np.array([code_for[label] for label in column])

def get_sparse_matrix(df, usr_col, item_col, rating_col):
    """Pivot long-format ratings into a user-by-item table.

    Despite the name, the result is a dense DataFrame: one row per distinct
    value of ``usr_col``, one column per distinct value of ``item_col``.

    Args:
        df: Long-format DataFrame with one row per rating.
        usr_col: Column identifying the user (becomes the row index).
        item_col: Column identifying the item (becomes the columns).
        rating_col: Column holding the rating value.

    Returns:
        An integer DataFrame. When a (user, item) pair occurs more than once
        the largest rating is kept; pairs that never occur are filled with 0.
    """
    # Collapse duplicate (user, item) pairs to their highest rating.
    best_rating = df.groupby([usr_col, item_col])[rating_col].max()

    # Move the item level into the columns; absent pairs come out as NaN.
    table = best_rating.unstack()
    table = table.fillna(0)
    return table.astype(int)

# Encode users and titles with ONE vocabulary shared by both splits.
# Encoding train and validation separately (as this cell used to) gives the
# same user/title a different id in each split, so the two rating matrices
# could not be compared. Train rows come first in the concatenation, so the
# training codes are exactly what a train-only encoding would produce;
# values seen only in validation get the codes after them.
n_train_rows = len(train)
for id_col in ("Username", "Title"):
    both_splits = pd.concat([train[[id_col]], validation[[id_col]]], ignore_index=True)
    codes = to_categorical(both_splits, id_col)
    train[id_col] = codes[:n_train_rows]
    validation[id_col] = codes[n_train_rows:]

train_matrix = get_sparse_matrix(train, "Username", "Title", "Userscore")
ratings = train_matrix.to_numpy()

# Align the validation matrix to the training axes: users/titles that never
# occur in training are dropped (they have no row/column in `ratings`), and
# pairs without a validation rating are filled with 0.
val_ratings = (
    get_sparse_matrix(validation, "Username", "Title", "Userscore")
    .reindex(index=train_matrix.index, columns=train_matrix.columns, fill_value=0)
    .to_numpy()
)
assert val_ratings.shape == ratings.shape, "validation matrix must share the training axes"

In [94]:
""" https://arxiv.org/pdf/1812.01478v1.pdf """

class DMF(nn.Module):
    """Two-tower deep matrix factorization model.

    One fully connected tower embeds users from their rating rows, a second
    tower embeds items from their rating columns, and the prediction for a
    (user, item) pair is the cosine similarity of the two embeddings scaled
    to the rating range.

    Args:
        n_users: Number of users, i.e. the length of an item's rating column.
        n_items: Number of items, i.e. the length of a user's rating row.
        n_factors: Size of the embedding produced by each tower.
        n_hidden: Width of the two hidden layers in each tower.
        eps: Lower bound applied to the product of the embedding norms so an
            all-zero embedding yields similarity 0 instead of NaN.
    """

    def __init__(self, n_users, n_items, n_factors=40, n_hidden=512, eps=1e-8):
        super().__init__()
        self.eps = eps

        # User tower: the array of users will be (BATCH_SIZE x n_items).
        self.user_fc1 = nn.Linear(n_items, n_hidden)
        self.user_fc2 = nn.Linear(n_hidden, n_hidden)
        self.user_fc3 = nn.Linear(n_hidden, n_factors)

        # Item tower: the array of items will be (BATCH_SIZE x n_users).
        self.item_fc1 = nn.Linear(n_users, n_hidden)
        self.item_fc2 = nn.Linear(n_hidden, n_hidden)
        self.item_fc3 = nn.Linear(n_hidden, n_factors)

        # Use the GPU when there is one, but still work on CPU-only machines
        # (calling .cuda() on every layer raised there).
        self.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

    def forward(self, users, items):
        """Predict ratings for every (user, item) pair of the two batches.

        Args:
            users: Float tensor (n_user_batch x n_items) of user rating rows.
            items: Float tensor (n_item_batch x n_users) of item rating columns.

        Returns:
            Float tensor (n_user_batch x n_item_batch) with values in [0, 10].
        """
        # Users network
        users = F.relu(self.user_fc1(users))
        users = F.relu(self.user_fc2(users))
        users = F.relu(self.user_fc3(users))

        # Items network; transposed to (n_factors x n_item_batch) for the matmul.
        items = F.relu(self.item_fc1(items))
        items = F.relu(self.item_fc2(items))
        items = F.relu(self.item_fc3(items)).T

        # Pair-wise cosine similarity: dot products over products of norms.
        res = torch.matmul(users, items)
        user_norm = torch.linalg.norm(users, dim=1).view(-1, 1)
        item_norm = torch.linalg.norm(items, dim=0).view(1, -1)
        # A tower whose ReLUs are all inactive emits a zero vector; clamping
        # the denominator turns the resulting 0/0 into 0 instead of NaN.
        norms = torch.matmul(user_norm, item_norm).clamp(min=self.eps)
        res = res / norms

        # Both embeddings are ReLU outputs (non-negative), so the cosine lies
        # in [0, 1], not [-1, 1]. Scale by 10 to cover [0, 10]; the previous
        # `5 * res + 5` could only ever produce values in [5, 10].
        return torch.mul(res, 10)


# Size the two towers from the training matrix: rows are users, columns are items.
model = DMF(n_users=ratings.shape[0], n_items=ratings.shape[1], n_factors=40)

In [96]:
# Default training hyper-parameters; SGD() reads them at call time unless overridden.
BATCH_SIZE = 256
EPOCHS = 3

def SGD(model, data, epochs=None, batch_size=None, lr=1e-6):
    """Train ``model`` on a dense rating matrix with mini-batch SGD and MSE loss.

    The matrix is tiled into (batch_size x batch_size) blocks. For each block
    the model receives the full rating rows of the block's users and the full
    rating columns of the block's items, and is fitted to the block's ratings.
    Every cell of the block contributes to the loss, including zeros.

    Args:
        model: Module called as ``model(user_rows, item_cols)`` that returns a
            (n_user_batch x n_item_batch) tensor of predictions.
        data: 2-D array-like of ratings, users on rows and items on columns.
        epochs: Number of passes over the matrix; defaults to ``EPOCHS``.
        batch_size: Side length of a block; defaults to ``BATCH_SIZE``.
        lr: Learning rate passed to ``torch.optim.SGD``.

    Returns:
        List with the mean block loss of each epoch (NaN for an epoch that
        saw no blocks because ``data`` is empty).

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    epochs = EPOCHS if epochs is None else epochs
    batch_size = BATCH_SIZE if batch_size is None else batch_size
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    loss_func = torch.nn.MSELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)  # learning rate

    # Put the data wherever the model lives instead of assuming a GPU.
    device = next(model.parameters()).device
    data = torch.as_tensor(data, dtype=torch.float32, device=device)
    n_users, n_items = data.shape

    history = []
    for epoch in range(epochs):
        # Accumulate on-device so the loss is only synchronised once per epoch.
        running_loss = torch.zeros((), device=device)
        n_steps = 0
        for row_i in tqdm(range(0, n_users, batch_size)):
            rows = data[row_i:row_i + batch_size, :]
            for col_i in range(0, n_items, batch_size):
                cols = data[:, col_i:col_i + batch_size].T

                # Set gradients to zero
                optimizer.zero_grad()

                # Get the actual ratings for this block
                rating = rows[:, col_i:col_i + batch_size]

                # Predict and calculate loss
                prediction = model(rows, cols)
                loss = loss_func(prediction, rating)

                # Backpropagate
                loss.backward()

                # Update the parameters
                optimizer.step()

                running_loss += loss.detach()
                n_steps += 1
        history.append(running_loss.item() / n_steps if n_steps else float("nan"))
    return history
                

# Fit the model on the dense training rating matrix.
SGD(model=model, data=ratings)


 41%|████      | 46/112 [00:03<00:05, 12.29it/s]


KeyboardInterrupt: 

In [81]:
def test(model, data, sample=1000):
    with torch.no_grad():
        rows, cols = data.nonzero()
        data = torch.Tensor(data).cuda()
        samples = np.random.choice(len(rows), sample)
        rows = torch.LongTensor(rows[samples]).cuda()
        cols = torch.LongTensor(cols[samples]).cuda()
        mean = data[rows,cols].mean()
        print(model(rows, cols).max(), model(rows, cols).min())
        print("Data:", data[rows, cols][:5])
        print("Preds: ", model(rows, cols)[:5])
        print("Baseline score:", ((data[rows,cols] - mean)**2).sum() / len(rows))
        print("Model score:", ((data[rows,cols] - model(rows,cols))**2).sum() / len(rows))

# Score the model against the mean baseline on the training rating matrix.
test(model=model, data=ratings)

KeyboardInterrupt: 