In [106]:
# Much of this comes from the site https://www.ethanrosenthal.com/2017/06/20/matrix-factorization-in-pytorch/
# Accessed on March 7th, 2021

import pandas as pd
import numpy as np
import scipy.sparse as sparse
import random
import implicit 
import matplotlib.pyplot as plt
%matplotlib notebook
import seaborn as sns
from tqdm import tqdm
import os

from pathlib import Path

In [107]:
#%%
# If using colab
# from google.colab import drive
# drive.mount('/content/drive')
# DRIVE_PATH = "/content/drive/Shared drives/CMPUT466 Project"

# If using windows

DRIVE_PATH = Path("G:/Shared drives/CMPUT466 Project")
DATA_FOLDER=DRIVE_PATH/"src/data/"
train = pd.read_csv(DATA_FOLDER/"user_data_train.csv")
validation = pd.read_csv(DATA_FOLDER/"user_data_validation.csv")
test = pd.read_csv(DATA_FOLDER/"user_data_test.csv")
data = pd.concat([train, validation, test])
data["Title-Platform"] = data["Title"] + data["Platform"]
data.head()

Unnamed: 0.1,Unnamed: 0,Title,Platform,Userscore,Comment,Username,Title-Platform
0,39153,BioShock Infinite,PC,10,One of the greatest games I've ever played. Th...,sti4thewin,BioShock InfinitePC
1,177986,DmC: Devil May Cry,Xbox360,9,Coming from a long time DMC fan this game is g...,sti4thewin,DmC: Devil May CryXbox360
2,8791,Metroid Prime,GameCube,10,aWESOME... nEeD i SaY MoRe?,L.S.,Metroid PrimeGameCube
3,65251,Team Fortress 2,PC,10,A fun game that can end in countless ways eac...,L.S.,Team Fortress 2PC
4,2037,Grand Theft Auto IV,Xbox360,3,"Great presentation and idea, however failing ...",JamesD.,Grand Theft Auto IVXbox360


In [108]:
data["Username"] = data["Username"].astype("category").cat.codes
data["Title-Platform"] = data["Title-Platform"].astype("category").cat.codes
data.head()
ratings = data[['Username', 'Title-Platform', 'Userscore']]

In [109]:
sparse = sparse.csr_matrix((ratings['Userscore'].astype(float), (ratings['Username'], ratings['Title-Platform'])))

# Training the ALS Model

According to [SAP](https://help.sap.com/viewer/2cfbc5cf2bc14f028cfbe2a2bba60a50/2.0.03/en-US/7129de6bddcc490698bee0c2c95e9c73.html), the latent factors for an explicit feedback model are trained by solving the optimization problem
$$ argmin_{f_{u}, f_{v}} \frac{1}{|D|}\left(\sum_{r_{u, v}\in D}\left(r_{u, v} - f_{u}^T f_{v}\right)^2 + \lambda \left(\sum_{u}n_{u}\lVert f_{u}\rVert^2 + \sum_{v} n_{v}\lVert f_{v}\rVert^2 \right)\right) $$

Where:
- $f_{u}, f_{v}$ are the latent features for users, items respectively.
- $r_{u, v}$ is the rating of user $u$ with item $v$.
- $D$ is the set of all observed feedback.
- $n_{u}, n_{v}$ are the number of feedbacks for $u$ and $v$ respectively.
- $\lambda$ is the regularization parameter.

In [110]:
# Try to vectorize
def loss(data, nonzero_indices, f_u, f_v):
    ans = 0
    for i in tqdm(range(indices.shape[0])):
        ans += sparse[indices[i][0], indices[i][1]] - f_u[:,indices[i][0]].T.dot(f_v[:,indices[i][1]])
    print(ans)
#     sq_diff = (sparse[indices.T[0], indices.T[1]] - f_u[indices.T[0]].T.dot(f_v[indices.T][1])).sum()
    

In [111]:
# Get all indices of the observed explicit feedback.
indices = np.append(nonzero[0].reshape(1, -1), nonzero[1].reshape(1, -1), axis=0).T

# Accessing all non-zero elements: sparse[indices.T[0], indices.T[1]].shape
# Hyperparameters
n_factors = 40
LAMBDA = 0

np.random.seed(69)
f_u = np.random.random((n_factors, sparse.shape[0]))
f_v = np.random.random((n_factors, sparse.shape[1]))
print(loss(sparse, indices, f_u, f_v))
def ALS(data, f_u, f_v, users=True):
    if users:
        update, remain = f_u, f_v
    else:
        update, remain = f_v, f_u
        
    print(remain.shape)
    remain.dot(remain.T)

ALS(sparse, f_u, f_v, True)
    
# def ALS(dataset, X, Y, reg, n_factors, alpha=10, user=True):
#     if user:
#         data = dataset.train_user
#     else:
#         data = dataset.train_item
 
#     YtY = Y.T.dot(Y)
#     for s in data:
#         A = YtY + reg * np.eye(n_factors)
#         b = np.zeros(n_factors)
#         for i in data[s]:
#             factor = Y[i]
#             confidence = 1 + alpha * data[s][i]
#             A += (confidence-1) * np.outer(factor, factor) # calculate the outer product
#             b += confidence * factor
 
#         X[s] = np.linalg.solve(A, b)

  3%|▎         | 4767/178271 [00:00<00:03, 47666.64it/s]

(1, 178271)


100%|██████████| 178271/178271 [00:03<00:00, 45512.21it/s]

-306043.6724336051
None
(40, 3316)



