In [None]:
from typing import Tuple, List

import os
import pandas as pd
import numpy as np
import random
from matplotlib import pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader 

# Select the compute device: the first CUDA GPU when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print("device:[%s]." % (device))

%matplotlib inline

In [None]:
# Seed every random number generator the notebook draws from so that a fresh
# Restart & Run All reproduces the same results.
seed = 0

random.seed(seed)
np.random.seed(seed)
# torch has its own generator (used for nn.Embedding initialisation and for the
# shuffling DataLoader); seeding only `random` and numpy leaves it unseeded.
torch.manual_seed(seed)

In [None]:

# NOTE(review): `dpath` is not assigned in any cell visible above, so on a fresh kernel
# this cell raised NameError. Keep an existing value if one was defined elsewhere,
# otherwise fall back to the current working directory — confirm the intended root.
try:
    dpath
except NameError:
    dpath = os.getcwd()

# <dpath>/data holds the datasets; the ratings live in its `ml-latest-small` sub-folder.
base_path = os.path.join(dpath, 'data')
data_path = os.path.join(base_path, 'ml-latest-small')

In [None]:
# Read the ratings CSV and add `user` / `item` columns holding the raw
# `userId` / `movieId` values shifted down by one.
rating_path = os.path.join(data_path, 'ratings.csv')

ratings_df = pd.read_csv(rating_path, encoding='utf-8').assign(
    user=lambda frame: frame['userId'] - 1,
    item=lambda frame: frame['movieId'] - 1,
)
ratings_df.head()

In [None]:
# Map every distinct value of the `item` column to a dense index, numbered in the
# order the values are returned by `unique()`.
item_encoder = {
    raw_item: dense_idx
    for dense_idx, raw_item in enumerate(ratings_df['item'].unique())
}
item_encoder

In [None]:
# Dataset-level statistics: number of distinct users and items, and the mean rating.
user_num = ratings_df['user'].unique().size
item_num = ratings_df['item'].unique().size
global_bias = ratings_df['rating'].mean()

MF Model Implementation

In [None]:
class MovieLens(Dataset):
    """Map-style dataset exposing ``(user, item, rating)`` triples from a ratings frame.

    Args:
        df: frame with ``user``, ``item`` and ``rating`` columns.
        item_encoder: mapping from each value in ``df['item']`` to the index stored for it.
    """

    def __init__(self, df, item_encoder):
        self.df = df
        self.item_encoder = item_encoder

        user_ids = self.df['user'].tolist()
        encoded_items = [self.item_encoder[raw] for raw in self.df['item'].tolist()]
        ratings = self.df['rating'].tolist()

        # Convert each column to a tensor once, so __getitem__ is a plain tensor lookup.
        self.user = torch.tensor(user_ids)
        self.item = torch.tensor(encoded_items)
        self.y = torch.tensor(ratings)

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return self.df.shape[0]

    def __getitem__(self, idx):
        """Return the ``(user, encoded item, rating)`` tensors stored at ``idx``."""
        sample = (self.user[idx], self.item[idx], self.y[idx])
        return sample

        

In [None]:
# Wrap the ratings in a dataset and a shuffling mini-batch loader for the MF model.
BATCH_SIZE = 16
MovieLens_dataset = MovieLens(ratings_df, item_encoder)
train_iter = DataLoader(
    dataset=MovieLens_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=1,
)

In [None]:
class MF(nn.Module):
    """Biased matrix factorisation.

    The prediction for a (user, item) pair is the dot product of their embeddings
    plus a per-user bias, a per-item bias and a constant global bias.

    Args:
        user_num: number of rows in the user embedding / bias tables.
        item_num: number of rows in the item embedding / bias tables.
        emb_dim: width of the user and item embeddings.
        global_bias: constant added to every prediction.
    """

    def __init__(self, user_num, item_num, emb_dim, global_bias):
        super().__init__()
        self.user_emb = nn.Embedding(user_num, emb_dim)
        self.item_emb = nn.Embedding(item_num, emb_dim)
        self.user_bias = nn.Embedding(user_num, 1)
        self.item_bias = nn.Embedding(item_num, 1)
        self.global_bias = global_bias

    def forward(self, user_id, item_id):
        """Return a flat tensor of predictions for the given user / item index tensors."""
        interaction = (self.user_emb(user_id) * self.item_emb(item_id)).sum(dim=1)
        b_user = self.user_bias(user_id).squeeze()
        b_item = self.item_bias(item_id).squeeze()
        prediction = interaction + b_user + b_item + self.global_bias
        return prediction.view(-1)

# Embedding width; passed as `emb_dim` when the model is constructed below.
K = 20

# Step size handed to the SGD optimizer below.
learning_rate = 0.01

def RMSELoss(yhat,y):
    """Return the root-mean-squared error between predictions ``yhat`` and targets ``y``."""
    residual = yhat - y
    mean_squared = residual.pow(2).mean()
    return mean_squared.sqrt()

# NOTE(review): this rebinds the name `MF` from the class to the model instance, so the
# cell cannot be re-run without re-running the class definition first. Later cells use
# `MF` as the instance, so the name is kept here.
MF = MF(
    user_num=user_num,
    item_num=item_num,
    emb_dim=K,
    global_bias=global_bias,
).to(device)

# Train with RMSE as the objective and plain SGD over all model parameters.
loss = RMSELoss
optm = optim.SGD(params=MF.parameters(), lr=learning_rate)
print ("Done.")



In [None]:
from tqdm import tqdm

print ("Start training.")
EPOCHS = 10
for epoch in tqdm(range(EPOCHS)):
    # Accumulate a plain Python float. Summing the loss tensors themselves would keep
    # every batch's autograd graph alive until the end of the epoch.
    loss_val_sum = 0.0
    for user,item,rating in train_iter:
        # Call the module (not .forward()) so that any registered hooks run.
        y_pred = MF(user.to(device),item.to(device))
        loss_out = loss(y_pred,rating.to(device))
        optm.zero_grad()
        loss_out.backward()
        optm.step()
        loss_val_sum += loss_out.item()
    # Mean of the per-batch losses over the epoch.
    loss_val_avg = loss_val_sum/len(train_iter)
    print(f"epoch : {epoch}, loss : {loss_val_avg}")

print ("Done")

BPRMF Model Implementation

In [None]:
# Implicit-feedback copy of the ratings: every observed row gets the label 1.
BPRratings_df = ratings_df.assign(rating=1)
BPRratings_df

In [None]:
class NegativeSampleMovieLens(Dataset):
    def __init__(self,df,item_encoder,negative_num):
        self.df = df
        self.item_encoder = item_encoder
        self.negative_num = negative_num
        self.user = torch.tensor(self.df['user'].tolist())
        self.item = torch.tensor([self.item_encoder[i] for i in self.df['item'].tolist()])
        self.y = torch.tensor(self.df['rating'].tolist())
        self.negative_dict = self.make_negative_dataset()

    def __len__(self):
        return len(self.df)

    def __getitem__(self,idx):
        return self.user[idx], self.item[idx], self.y[idx]

    def make_negative_dataset(self):
        negative_dict = {}
        for idx,row in enumerate(self.df['user'].unique()):
            negative_list = np.random.choice(list(set(self.df['item'].unique()).difference(set(self.df.loc[self.df['user']==row,'item'].values))),self.negative_num,replace=False)
            negative_dict[row] = negative_list
        return negative_dict

In [None]:
class BPRMF(nn.Module):
    """Matrix-factorisation scorer for BPR training.

    The score of a (user, item) pair is the dot product of the two embeddings.

    Args:
        user_num: number of rows in the user embedding table.
        item_num: number of rows in the item embedding table.
        emb_dim: width of the user and item embeddings.
    """

    def __init__(self,user_num,item_num,emb_dim):
        super(BPRMF,self).__init__()
        self.user_emb = nn.Embedding(user_num,emb_dim)
        self.item_emb = nn.Embedding(item_num,emb_dim)

    def forward(self,user_id,item_id):
        """Return a flat tensor of raw (unbounded) preference scores.

        No sigmoid is applied here: the BPR objective already applies a sigmoid to the
        difference ``pos - neg``. Squashing each score into (0, 1) first would confine
        that difference to (-1, 1) and stop the loss from dropping below
        ``-log(sigmoid(1))`` (about 0.313). The ordering of items by score is unaffected.
        """
        user_emb = self.user_emb(user_id)
        item_emb = self.item_emb(item_id)
        # Reduce over the embedding axis (the last one) to get one score per pair.
        out = torch.sum(user_emb * item_emb, dim=-1)

        return out.view(-1)

def BPRLoss(pos,neg):
    """Return the BPR loss: the mean of ``-log(sigmoid(pos - neg))``.

    Args:
        pos: scores of the observed (positive) items.
        neg: scores of the sampled negative items; must broadcast against ``pos``.
    """
    # nn.LogSigmoid is a module class, so the score difference cannot be passed to its
    # constructor; F.logsigmoid applies the function directly to the tensor.
    return -F.logsigmoid(pos - neg).mean()

class BPR_Loss(nn.Module):
    """Module form of the BPR loss: the mean of ``-log(sigmoid(pos - neg))``."""

    def __init__(self):
        super(BPR_Loss, self).__init__()

    def forward(self, pos, neg):
        """Return the scalar BPR loss for positive scores ``pos`` and negative scores ``neg``."""
        # F.logsigmoid stays finite for large negative margins, where
        # log(sigmoid(x)) underflows to log(0) = -inf.
        bpr_loss = -torch.mean(F.logsigmoid(pos - neg))
        return bpr_loss

# Embedding width and SGD step size for the BPR model.
K = 20

learning_rate = 0.01

# NOTE(review): this rebinds the name `BPRMF` from the class to the model instance, so
# the cell cannot be re-run without re-running the class definition first. The training
# cell uses `BPRMF` as the instance, so the name is kept here.
BPRMF = BPRMF(user_num=user_num,item_num=item_num,emb_dim=K).to(device)
loss = BPR_Loss()
# The optimizer must own the BPR model's parameters. `MF` is the separate
# explicit-rating model, and optimising its parameters would leave BPRMF untrained.
optm = optim.SGD(BPRMF.parameters(),lr=learning_rate)
print ("Done.")


In [None]:
# Number of unseen items pre-sampled for every user.
NEGATIVE_NUM = 4
BPRdataset = NegativeSampleMovieLens(BPRratings_df,item_encoder,NEGATIVE_NUM)
# One positive pair per batch: the training loop looks negatives up by a single user id.
BATCH_SIZE = 1
# Iterate over the BPR dataset built above, not the explicit-rating MovieLens_dataset.
train_iter = DataLoader(BPRdataset,batch_size=BATCH_SIZE,shuffle=True,num_workers=1)

In [None]:
from tqdm import tqdm

print ("Start training.")
EPOCHS = 10
for epoch in tqdm(range(EPOCHS)):
    # Accumulate plain floats; summing loss tensors would keep their autograd graphs alive.
    loss_val_sum = 0.0
    num_updates = 0
    for user,item, _ in train_iter:
        user = user.to(device)
        item = item.to(device)
        # negative_dict is keyed by a single user id, so this requires one user per
        # batch (BATCH_SIZE = 1); .item() raises for larger batches.
        for raw_neg in BPRdataset.negative_dict[user.item()]:
            # negative_dict stores raw item ids; encode to the embedding index and give
            # it the same (1,) shape as `user`.
            neg_item = torch.tensor([BPRdataset.item_encoder[raw_neg]], device=device)
            # Re-score the positive for every update: the weights change after each
            # optimizer step, so a score computed before the step is stale (and reusing
            # its graph is what required retain_graph=True).
            pos = BPRMF(user, item)
            neg = BPRMF(user, neg_item)
            loss_out = loss(pos=pos,neg=neg)
            optm.zero_grad()
            loss_out.backward()
            optm.step()
            loss_val_sum += loss_out.item()
            num_updates += 1
    # Average over the number of (positive, negative) updates actually performed;
    # dividing by the number of batches would inflate the value by the negatives per user.
    loss_val_avg = loss_val_sum/max(num_updates, 1)
    print(f"epoch : {epoch}, loss : {loss_val_avg}")

print ("Done")