In [97]:
import os
import numpy as np
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch import nn
from torch.utils.data import Dataset, DataLoader
torch.manual_seed(0)

<torch._C.Generator at 0x112002c30>

In [98]:
import argparse
from time import time
import pickle
import scipy.sparse as sp
# import pandas as pd
np.random.seed(7)
import math
import heapq

In [99]:
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")
# set the device as gpu or cpu depending upon the machine its running on

In [100]:
epochs = 50
batch_size = 256
weight_decay = 0.00001
num_negatives_pretrain = 4
num_negatives_pasttest = 100
learning_rate = 0.001
dropout = 0.2
optimizer = 'adam'


In [101]:
# movie lens dataset has some positive interactions between user and movie means movie has been watched by user but doesnt
# necessarily imply that user liked the movie. Below class, generates a sparse matrix with value 1 for positive interactions
# and 0 otherwise. Initialzing the class with default parameters would return training data with 4 negative interaction for
# for every positive interaction and test data with 100 negative interactions for every positive interaction
class MovieLensDataset(Dataset):
    
    def __init__(self, file_name, num_negatives_train=5, num_negatives_test=100):
        self.train_matrix = self.load_matrix(file_name + ".train.rating")
        self.n_users, self.n_items = self.train_matrix.shape
        self.user_input, self.item_input, self.ratings = self.get_train_instances(self.train_matrix, num_negatives_train)
        self.testRatings = self.load_list(file_name + ".test.rating")
        self.testNegatives = self.create_negatives(num_samples=num_negatives_test)
        assert len(self.testRatings) == len(self.testNegatives)

    def __len__(self):
        return len(self.user_input)

    def __getitem__(self, index):
        return {'user_id': self.user_input[index],
                'item_id': self.item_input[index],
                'rating': self.ratings[index]}
    def load_matrix(self, filename):
        num_users, num_items = 0, 0
        with open(filename, "r") as f:
            line = f.readline()
            while line != None and line != "":
                arr = line.split("\t")
                u, i = int(arr[0]), int(arr[1])
                num_users = max(num_users, u)
                num_items = max(num_items, i)
                line = f.readline()
        mat = sp.dok_matrix((num_users+1, num_items+1), dtype=np.float32)
        with open(filename, "r") as f:
            line = f.readline()
            while line != None and line != "":
                arr = line.split("\t")
                user, item, rating = int(arr[0]), int(arr[1]), float(arr[2])
                if (rating > 0):
                    mat[user, item] = 1.0
                line = f.readline()
        return mat

    def get_train_instances(self, train, num_negatives):
        user_input, item_input, ratings = [], [], []
        num_users, num_items = train.shape
        for (u, i) in train.keys():
            user_input.append(u)
            item_input.append(i)
            ratings.append(1)
            for t in range(num_negatives):
                j = np.random.randint(1, num_items)
                while (u, j) in train:
                    j = np.random.randint(1, num_items)
                user_input.append(u)
                item_input.append(j)
                ratings.append(0)
        return user_input, item_input, ratings

    def load_list(self, filename):
        ratingList = []
        with open(filename, "r") as f:
            line = f.readline()
            while line != None and line != "":
                arr = line.split("\t")
                user, item = int(arr[0]), int(arr[1])
                ratingList.append([user, item])
                line = f.readline()
        return ratingList

    def create_negatives(self, num_samples=100):
        negativeList = []
        for user_item_pair in self.testRatings:
            user = user_item_pair[0]
            item = user_item_pair[1]
            negatives = []
            for t in range(num_samples):
                j = np.random.randint(1, self.n_items)
                while (user, j) in self.train_matrix or j == item:
                    j = np.random.randint(1, self.n_items)
                negatives.append(j)
            negativeList.append(negatives)
        return negativeList

    


In [102]:
# neural network with 3 hidden layers and input as concated embeddings for users and items. It return the probability of 
# that user will watch the given item
class recommendationNetwork(nn.Module):
    def __init__(self, n_users, n_items,embedding_dim):
        super().__init__()
        self.user_embed = torch.nn.Embedding(n_users, embedding_dim)
        self.item_embed = torch.nn.Embedding(n_items, embedding_dim)
        self.linear1 = nn.Linear(2*embedding_dim,32)
        self.linear2 = nn.Linear(32,16)
        self.linear3 = nn.Linear(16,8)
        self.output = nn.Linear(8,1)
#         self.D = torch.nn.Dropout(0.1)
        
    
    def forward(self, inputUserItem):
        users = inputUserItem['user_id']
        items = inputUserItem['item_id']
        user_embedding = self.user_embed(users)
        item_embedding = self.item_embed(items)
        # concatenate user and item embeddings to form input
        input_embedding = torch.cat([user_embedding, item_embedding], 1)
        hidden1 = (F.relu(self.linear1(input_embedding)))
        hidden2 = (F.relu(self.linear2(hidden1)))
        hidden3 = (F.relu(self.linear3(hidden2)))
        output = torch.sigmoid(self.output(hidden3))
        return output

    def predict(self, inputUserItem):
        # return the score, inputs and outputs are numpy arrays
        for key in inputUserItem:
            if type(inputUserItem[key])!= type(None):
                inputUserItem[key] = torch.from_numpy(inputUserItem[key]).to(dtype=torch.long, device=device)
        output_scores = self.forward(inputUserItem)
        return output_scores.cpu().detach().numpy()



In [103]:
import math
import heapq
# function to evaluate the performance of trained model on complete test dataset using the metric of hit-ratio. 

def evaluate_model(model,testRatings,testNegatives,topK: int):
    hitratio = []
    for idx in range(len(testRatings)):
    
        itemsList = testNegatives[idx]
        u = testRatings[idx][0]
        positiveItem = testRatings[idx][1]
        itemsList.append(positiveItem)
        map_item_score = {}
        users = np.full(len(itemsList), u, dtype='int32')

        data = {
            'user_id': users,
            'item_id': np.array(itemsList),
        }
        predictions = model.predict(data)
        for i in range(len(itemsList)):
            item = itemsList[i]
            map_item_score[item] = predictions[i]
        ranklist = heapq.nlargest(topK, map_item_score, key=map_item_score.get)
        hr = getHitRatio(ranklist, positiveItem)
        hitratio.append(hr)
    return hitratio

def getHitRatio(ranklist, positiveItem):
    for item in ranklist:
        if item == positiveItem:
            return 1
    return 0

In [104]:
# reading the dataset from local and creating movielensdataset class to generate a spare train matrix and test datastet
dataset = MovieLensDataset("/Users/shiprajain/Desktop/RecommenderSystem/movielens/movielens")

In [105]:
# pytorch's dataloader shuffles the datapoints and divide it in batches of fixed size for batch training 
training_data_generator = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=0)

In [106]:
train = dataset.train_matrix
num_users, num_items = train.shape
model = recommendationNetwork(num_users, num_items,8)
model.to(device)
loss_fn = torch.nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), weight_decay=weight_decay)

In [107]:
# training the model with adam optimizer and binary cross entropy loss function and testing its performance for every epoch
for epoch in range(epochs):
    epoch_loss = []
    model.train()
    for data in training_data_generator:
        for key in data:
            if type(data[key]) != type(None):
                data[key] = data[key].to(dtype = torch.long, device = device)
        prediction = model(data)
        rating = data['rating']
        rating = rating.float().view(prediction.size())  
        loss = loss_fn(prediction, rating)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        epoch_loss.append(loss.item())
    hitRatios = evaluate_model(model,dataset.testRatings,dataset.testNegatives,10)
    print("epoch : ", epoch)
    print("average loss : ",np.mean(epoch_loss))
    print("average hit ratio : ",np.mean(hitRatios))
    



epoch :  0
average loss :  0.39431801856744914
average hit ratio :  0.39660657476139977
epoch :  1
average loss :  0.32912987462925974
average hit ratio :  0.4061505832449629
epoch :  2
average loss :  0.322510433796405
average hit ratio :  0.41145281018027574
epoch :  3
average loss :  0.3191268353640366
average hit ratio :  0.40721102863202546
epoch :  4
average loss :  0.3166667561236894
average hit ratio :  0.40827147401908803
epoch :  5
average loss :  0.3137715369219106
average hit ratio :  0.4252386002120891
epoch :  6
average loss :  0.30879563881881683
average hit ratio :  0.45387062566277836
epoch :  7
average loss :  0.30085556898955945
average hit ratio :  0.47932131495228
epoch :  8
average loss :  0.29245117974758766
average hit ratio :  0.5058324496288441
epoch :  9
average loss :  0.2856951417797706
average hit ratio :  0.5185577942735949
epoch :  10
average loss :  0.28030305889071727
average hit ratio :  0.5323435843054083
epoch :  11
average loss :  0.275827277673153