In [32]:
# Much of this comes from the site https://www.ethanrosenthal.com/2017/06/20/matrix-factorization-in-pytorch/
# Accessed on March 7th, 2021

import torch.nn.functional as F
from torch import optim
from tqdm import tqdm
from torch import nn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib notebook
import torch
import os

from pathlib import Path

In [2]:
#%%
# If using colab
# from google.colab import drive
# drive.mount('/content/drive')
# DRIVE_PATH = "/content/drive/Shared drives/CMPUT466 Project"

# If using windows
# The project root defaults to the Windows shared-drive mount below. Set the
# CMPUT466_DRIVE_PATH environment variable to point at another checkout or
# mount without editing this cell.

DRIVE_PATH = Path(os.environ.get("CMPUT466_DRIVE_PATH", "G:/Shared drives/CMPUT466 Project"))
DATA_FOLDER = DRIVE_PATH / "src" / "data"

# Both CSVs are read with pandas defaults; later cells use their "Username",
# "Title" and "Userscore" columns.
train = pd.read_csv(DATA_FOLDER / "user_data_train.csv")
validation = pd.read_csv(DATA_FOLDER / "user_data_validation.csv")

In [62]:
def to_categorical(df, col):
    """Encode the values of ``df[col]`` as integer codes.

    Codes are assigned in order of first appearance: the first distinct value
    seen gets 0, the next new value gets 1, and so on.

    Args:
        df: DataFrame holding the column to encode.
        col: Name of the column to encode.

    Returns:
        A NumPy array with one code per row of ``df``, in row order.
    """
    column = df[col]
    code_of = {}
    for position, value in enumerate(column.unique()):
        code_of[value] = position
    encoded = [code_of[value] for value in column]
    return np.array(encoded)

def get_sparse_matrix(df, usr_col, item_col, rating_col):
    """Pivot long-format ratings into a dense user x item integer table.

    Args:
        df: DataFrame with one row per (user, item, rating) record.
        usr_col: Column whose distinct values become the rows.
        item_col: Column whose distinct values become the columns.
        rating_col: Column holding the rating values.

    Returns:
        A DataFrame indexed by user with one column per item. Duplicate
        (user, item) pairs keep their maximum rating, missing pairs are
        filled with 0, and the result is cast to ``int``.
    """
    # Collapse duplicate (user, item) pairs to their highest rating.
    best_rating = df.groupby([usr_col, item_col])[rating_col].max()
    # Move the item level of the index into the columns.
    wide = best_rating.unstack()
    # Pairs with no rating become 0 so the table can be stored as integers.
    return wide.fillna(0).astype(int)

# Build the id maps from the training split only and reuse them for the
# validation split. Encoding each split independently would give the same
# integer a different meaning in each one, and would produce a validation
# matrix whose shape does not match the training matrix.
user_ids = {name: idx for idx, name in enumerate(train["Username"].unique())}
title_ids = {title: idx for idx, title in enumerate(train["Title"].unique())}

train["Username"] = to_categorical(train, "Username")
train["Title"] = to_categorical(train, "Title")

# Users/titles that never occur in training have no id in the maps above
# (map() yields NaN for them), so those validation rows are dropped rather
# than being assigned an arbitrary id.
validation["Username"] = validation["Username"].map(user_ids)
validation["Title"] = validation["Title"].map(title_ids)
validation = (
    validation
    .dropna(subset=["Username", "Title"])
    .astype({"Username": int, "Title": int})
)

ratings = get_sparse_matrix(train, "Username", "Title", "Userscore").to_numpy()
# Reindex to the full training id ranges so val_ratings[u, i] addresses the
# same user/title as ratings[u, i]; pairs absent from validation are 0.
val_ratings = (
    get_sparse_matrix(validation, "Username", "Title", "Userscore")
    .reindex(index=range(len(user_ids)), columns=range(len(title_ids)), fill_value=0)
    .to_numpy()
)

In [141]:
""" https://arxiv.org/pdf/1812.01478v1.pdf """

class MF(nn.Module):
    """Embedding-based rating predictor with a small MLP head.

    Each user id and item id is looked up in its own embedding table. The two
    embeddings are concatenated and passed through two 128-unit ReLU layers
    and a sigmoid output that is rescaled to the rating range.

    Args:
        n_users: Number of rows in the user embedding table.
        n_items: Number of rows in the item embedding table.
        n_factors: Size of each embedding vector.
        min_rating: Lower bound used when rescaling the sigmoid output.
        max_rating: Upper bound used when rescaling the sigmoid output.
        device: Device to place all parameters on. When ``None`` (default),
            CUDA is used if available and the CPU otherwise.
    """

    def __init__(self, n_users, n_items, n_factors=40, min_rating=0, max_rating=10, device=None):
        super().__init__()
        if device is None:
            # Fall back to the CPU instead of crashing on machines without CUDA.
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.min_rating = min_rating
        self.max_rating = max_rating

        # sparse=True makes the embedding gradients sparse tensors.
        self.user_embs = nn.Embedding(n_users, n_factors, sparse=True)
        self.item_embs = nn.Embedding(n_items, n_factors, sparse=True)
        self.fc1 = nn.Linear(n_factors * 2, 128)
        self.fc2 = nn.Linear(128, 128)
        self.fc3 = nn.Linear(128, 1)

        self.to(device)

        # Replace the default embedding init with U(-1, 1).
        with torch.no_grad():
            self.user_embs.weight.uniform_(-1, 1)
            self.item_embs.weight.uniform_(-1, 1)

    def forward(self, user, item):
        """Predict ratings for paired user and item ids.

        Args:
            user: 1-D integer tensor of user ids.
            item: 1-D integer tensor of item ids, same length as ``user``.

        Returns:
            Tensor of shape ``(batch, 1)``. Because the sigmoid output is
            scaled by ``max_rating - min_rating + 1`` and shifted by
            ``min_rating - 0.5``, values lie strictly between
            ``min_rating - 0.5`` and ``max_rating + 0.5``.
        """
        x = torch.cat([self.user_embs(user), self.item_embs(item)], dim=1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = torch.sigmoid(self.fc3(x))
        return x * (self.max_rating - self.min_rating + 1) + self.min_rating - 0.5

# One user-embedding row per row of the rating matrix and one item-embedding
# row per column.
model = MF(ratings.shape[0], ratings.shape[1], n_factors=40)

In [142]:
# Training hyper-parameters read by SGD() below.
BATCH_SIZE = 64  # (user, item) pairs per optimizer step
EPOCHS = 3       # full passes over the observed ratings

def SGD(model, data, lr=1e-6, epochs=None, batch_size=None):
    """Train ``model`` in place with mini-batch SGD on the non-zero entries of ``data``.

    Args:
        model: Module called as ``model(user_ids, item_ids)`` that returns one
            prediction per pair.
        data: 2-D user x item rating array; entries equal to 0 are treated as
            unobserved and skipped.
        lr: Learning rate passed to ``torch.optim.SGD``.
        epochs: Number of passes over the observed entries. Defaults to the
            module-level ``EPOCHS``.
        batch_size: Number of (user, item) pairs per step. Defaults to the
            module-level ``BATCH_SIZE``.

    Returns:
        None. The model's parameters are updated in place and a tqdm progress
        bar is shown for each epoch.
    """
    if epochs is None:
        epochs = EPOCHS
    if batch_size is None:
        batch_size = BATCH_SIZE

    loss_func = torch.nn.MSELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)

    # Keep every tensor on the same device as the model instead of assuming CUDA.
    device = next(model.parameters()).device

    # Use the matrix that was passed in (not the global `ratings`) to find the
    # observed entries.
    rows, cols = np.asarray(data).nonzero()
    targets = torch.as_tensor(data, dtype=torch.float32, device=device)
    rows = torch.as_tensor(rows, dtype=torch.long, device=device)
    cols = torch.as_tensor(cols, dtype=torch.long, device=device)

    for _ in range(epochs):
        # Reshuffle the (row, col) pairs together at the start of each epoch.
        order = torch.as_tensor(np.random.permutation(len(rows)), dtype=torch.long, device=device)
        rows, cols = rows[order], cols[order]

        for start in tqdm(range(0, len(rows), batch_size)):
            row = rows[start:start + batch_size]
            col = cols[start:start + batch_size]

            # Set gradients to zero
            optimizer.zero_grad()

            # Look up the true ratings for this batch
            rating = targets[row, col]

            # Predict and calculate loss. The prediction is flattened so it has
            # the same shape as `rating`; a (B, 1) prediction against a (B,)
            # target would be broadcast to (B, B) by MSELoss and the loss would
            # average over every prediction/target combination.
            prediction = model(row, col).reshape(-1)
            loss = loss_func(prediction, rating)

            # Backpropagate
            loss.backward()

            # Update the parameters
            optimizer.step()
# Fit the model in place on the training rating matrix.
SGD(model, ratings)


100%|██████████| 1919/1919 [00:02<00:00, 686.73it/s]
100%|██████████| 1919/1919 [00:03<00:00, 606.01it/s]
100%|██████████| 1919/1919 [00:03<00:00, 580.24it/s]


In [None]:
def test(model, data, sample=1000):
    with torch.no_grad():
        rows, cols = data.nonzero()
        data = torch.Tensor(data).cuda()
        samples = np.random.choice(len(rows), sample)
        rows = torch.LongTensor(rows[samples]).cuda()
        cols = torch.LongTensor(cols[samples]).cuda()
        mean = data[rows,cols].mean()
        print(model(rows, cols).max(), model(rows, cols).min())
        print("Data:", data[rows, cols][:5])
        print("Preds: ", model(rows, cols)[:5])
        print("Baseline score:", ((data[rows,cols] - mean)**2).sum() / len(rows))
        print("Model score:", ((data[rows,cols] - model(rows,cols))**2).sum() / len(rows))

# Sampled error report on the training matrix (the same data passed to SGD),
# so this is a training-set check, not a validation score.
test(model, ratings)