In [3]:
import json 
import torch 
from torch.utils.data import Dataset, DataLoader 
import torch.nn as nn 
import torch.nn.functional as F
import os
import json
import gzip
import pandas as pd
import numpy as np 
import math
from urllib.request import urlopen
from tqdm import tqdm 
import random 
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics

In [4]:
!wget http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Video_Games_5.json.gz

In [5]:
# Ensure deterministic behavior.
# A fixed integer seed is used on purpose: the built-in hash() of a str is salted
# per interpreter process (see PYTHONHASHSEED), so seeds derived from hash("...")
# change on every kernel restart and do not make runs repeatable.
SEED = 42
torch.backends.cudnn.deterministic = True
# The cuDNN autotuner may select different kernels from run to run; turn it off.
torch.backends.cudnn.benchmark = False
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

In [29]:
class config:
    """Hyper-parameters and runtime settings read by the cells below."""

    # --- optimisation ---
    epochs = 20
    batch_size = 1024
    learning_rate = 0.01

    # --- model ---
    emb_sz = 100    # embedding width handed to the model constructors
    sparse = True   # forwarded as the `sparse` constructor argument

    # --- data ---
    # NOTE(review): same value as the literal in the commented-out sorted split;
    # not referenced by any other visible cell.
    train_idx = 315218

    # --- hardware: first CUDA device when available, CPU otherwise ---
    device = 'cpu'
    if torch.cuda.is_available():
        device = 'cuda:0'

In [7]:
# Local name of the gzipped review archive (same file name the wget cell downloads).
datafile = 'Video_Games_5.json.gz'

In [8]:
def load_data_into_df(datafile):
    """Read a gzip-compressed JSON-lines file into a DataFrame.

    Every line of the archive is parsed as one JSON document and becomes one
    row. The number of parsed records is printed before the frame is built.

    Args:
        datafile: path of the ``.gz`` file to read.

    Returns:
        pandas.DataFrame with one row per line of the file.
    """
    with gzip.open(datafile) as handle:
        records = [json.loads(raw_line.strip()) for raw_line in handle]

    print(len(records))
    frame = pd.DataFrame(records)
    # Drop the list of dicts as soon as the frame exists.
    del records
    return frame

# Parse the whole archive into memory; `df` is the full (unsplit) review table.
df = load_data_into_df(datafile)

In [8]:
# df = df.sort_values('reviewerID') # Sort by reviewer ID so that we can split on reviewerID during train test split
# df_train = df[:315218].sample(frac=1., random_state=42).reset_index()
# df_test = df[315218:].sample(frac=1., random_state=42).reset_index()

In [9]:
# 80/20 train/test split with a pinned shuffle (random_state=42).
# Stratifying on reviewerID keeps each reviewer's share of rows roughly equal in
# both parts, so reviewers are represented in train as well as in test.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42, stratify=df['reviewerID'])

In [10]:
# Give both splits a fresh 0..n-1 index. Because drop=True is not passed, the
# previous row labels are kept in a new 'index' column.
df_train = df_train.reset_index()
df_test = df_test.reset_index()

In [11]:
def create_vocab(df, field):
    """Build forward and reverse integer vocabularies for one column.

    Distinct values of ``df[field]`` are numbered 0, 1, 2, ... in order of
    first appearance.

    Args:
        df: DataFrame holding the column.
        field: name of the column to index.

    Returns:
        (id2int, int2id): value -> integer and integer -> value dictionaries.
    """
    distinct_values = df[field].unique()
    id2int = {value: position for position, value in enumerate(distinct_values)}
    int2id = dict(enumerate(distinct_values))
    return id2int, int2id

In [12]:
# Vocabularies are built from the full frame (train + test rows), so every id that
# occurs in either split has an embedding index.
product2int, int2product = create_vocab(df, 'asin')
reviewer2int, int2reviewer = create_vocab(df, 'reviewerID')

In [13]:
class RecSysDataset(Dataset):
    """Map-style dataset yielding one (rating, product, reviewer) sample per row.

    Raw ids are translated to embedding indices through the notebook-level
    ``product2int`` / ``reviewer2int`` dictionaries, which must exist before
    items are requested.
    """

    def __init__(self, df):
        self.df = df
        # Pull the needed columns out once as arrays for per-item access.
        self.reviewer_ids = df['reviewerID'].values
        self.product_ids = df['asin'].values
        self.ratings = df['overall'].values

    def __len__(self):
        return len(self.ratings)

    def __getitem__(self, idx):
        # The stored rating is shifted down by one before being returned.
        shifted_rating = self.ratings[idx] - 1
        product_index = product2int[self.product_ids[idx]]
        reviewer_index = reviewer2int[self.reviewer_ids[idx]]
        # review_text = item['reviewText'] # Add with bert tokenizer later

        sample = {
            'rating': torch.tensor(shifted_rating, dtype=torch.float),
            'product_id': torch.tensor(product_index, dtype=torch.long),
            'reviewer_id': torch.tensor(reviewer_index, dtype=torch.long),
        }
        return sample


In [20]:
# Wrap each split in a torch Dataset.
train_dataset = RecSysDataset(df_train)
test_dataset = RecSysDataset(df_test)

In [21]:
class MFModel(nn.Module):
    """Biased matrix-factorisation rating model.

    prediction = reviewer_bias + product_bias + dot(reviewer_vec, product_vec)

    Args:
        num_reviewers: number of rows in the reviewer tables.
        num_products: number of rows in the product tables.
        emb_sz: width of the latent vectors.
        sparse: forwarded to every ``nn.Embedding``; when True the tables
            produce sparse gradients.
    """

    def __init__(self, num_reviewers, num_products, emb_sz, sparse):
        super().__init__()
        self.reviewer_embeddings = nn.Embedding(num_reviewers, emb_sz, sparse=sparse)
        self.product_embeddings = nn.Embedding(num_products, emb_sz, sparse=sparse)

        self.reviewer_biases = nn.Embedding(num_reviewers, 1, sparse=sparse)
        self.product_biases = nn.Embedding(num_products, 1, sparse=sparse)

        # Only the latent tables are re-initialised; the bias tables keep the
        # nn.Embedding default initialisation.
        torch.nn.init.xavier_uniform_(self.reviewer_embeddings.weight)
        torch.nn.init.xavier_uniform_(self.product_embeddings.weight)

    def forward(self, product_id, reviewer_id):
        """Score (product, reviewer) index pairs.

        Args:
            product_id: 1-D long tensor of product indices.
            reviewer_id: 1-D long tensor of reviewer indices, same length.

        Returns:
            1-D float tensor with one prediction per pair.
        """
        bias = self.reviewer_biases(reviewer_id) + self.product_biases(product_id)
        interaction = (
            self.reviewer_embeddings(reviewer_id) * self.product_embeddings(product_id)
        ).sum(dim=1, keepdim=True)
        # Squeeze only the trailing singleton. A bare .squeeze() would also drop
        # the batch dimension for a batch of one and return a 0-d tensor, which
        # breaks per-sample iteration and loss shape matching downstream.
        return (bias + interaction).squeeze(-1)



In [22]:
class MFNeuralNetwork(nn.Module):
    """Two-layer MLP over concatenated reviewer and product embeddings.

    Args:
        num_reviewers: number of rows in the reviewer table.
        num_products: number of rows in the product table.
        emb_sz: width of each embedding vector.
        sparse: accepted so the constructor signature matches ``MFModel``;
            it is not used here and the embeddings are always dense.
    """

    def __init__(self, num_reviewers, num_products, emb_sz, sparse):
        super().__init__()
        self.reviewer_embeddings = nn.Embedding(num_reviewers, emb_sz)
        self.product_embeddings = nn.Embedding(num_products, emb_sz)
        self.linear1 = nn.Linear(2 * emb_sz, 64)
        self.linear2 = nn.Linear(64, 1)

        # Re-initialise both tables (reviewer first, then product).
        for table in (self.reviewer_embeddings, self.product_embeddings):
            nn.init.xavier_uniform_(table.weight)

    def forward(self, product_id, reviewer_id):
        """Return one predicted rating per (product, reviewer) index pair.

        The intermediate activations are kept on the module as ``emb_out``,
        ``out1`` and ``out2``.
        """
        reviewer_vec = self.reviewer_embeddings(reviewer_id)
        product_vec = self.product_embeddings(product_id)

        self.emb_out = torch.cat((reviewer_vec, product_vec), dim=1)
        hidden = self.linear1(self.emb_out)
        self.out1 = F.relu(hidden)
        # Drop only the trailing singleton so the batch dimension survives.
        self.out2 = self.linear2(self.out1).squeeze(-1)

        return self.out2



In [23]:
# Training batches are reshuffled every epoch; the test loader keeps row order so
# saved predictions line up with the rows of df_test.
train_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=config.batch_size, shuffle=False)

In [24]:
def train(train_loader, model, loss_fn, optimizer, device):
    """Run one optimisation pass over ``train_loader`` and report RMSE / MAE.

    The metrics are computed over the predictions made during the pass, i.e.
    before each batch's weight update takes effect.

    Args:
        train_loader: iterable of dict batches with 'rating', 'reviewer_id'
            and 'product_id' tensors.
        model: callable as ``model(product_id, reviewer_id)``.
        loss_fn: loss used for back-propagation.
        optimizer: optimizer stepping the model parameters.
        device: device the batch tensors are moved to.

    Returns:
        (rmse, mae, predictions, ratings). ``predictions`` and ``ratings`` are
        lists of 0-d CPU tensors, one per sample, in iteration order.
    """
    pred_chunks = []
    rating_chunks = []
    for data in tqdm(train_loader):
        optimizer.zero_grad()
        rating = data['rating'].to(device)
        reviewer_id = data['reviewer_id'].to(device)
        product_id = data['product_id'].to(device)

        preds = model(product_id, reviewer_id)

        loss = loss_fn(preds, rating)
        loss.backward()
        optimizer.step()

        # reshape(-1) keeps a 0-d model output (a batch of one squeezed to a
        # scalar) concatenable with the other batches.
        pred_chunks.append(preds.detach().cpu().reshape(-1))
        rating_chunks.append(rating.detach().cpu().reshape(-1))

    all_preds = torch.cat(pred_chunks)
    all_ratings = torch.cat(rating_chunks)

    # RMSE / MAE are computed directly with numpy: the `squared=False` argument
    # of sklearn's mean_squared_error is not available in recent releases.
    errors = (all_preds.double() - all_ratings.double()).numpy()
    rmse_loss = float(np.sqrt(np.mean(errors ** 2)))
    mae_loss = float(np.mean(np.abs(errors)))
    print(f'Training RMSE: {rmse_loss}, Training MAE: {mae_loss}')

    return rmse_loss, mae_loss, list(all_preds), list(all_ratings)



In [25]:
def test(test_loader, model, loss_fn, device):
    accuracy = []
    total_loss = 0
    predictions = []
    ratings = []
    with torch.no_grad():
        for idx, data in enumerate(tqdm(test_loader)):

            rating = data['rating'].to(device)
            reviewer_id = data['reviewer_id'].to(device)
            product_id = data['product_id'].to(device)

            preds = model(product_id, reviewer_id)
            loss = loss_fn(preds, rating)
            total_loss += loss.item()

            predictions.extend(list(preds.cpu().detach()))
            ratings.extend(list(rating.cpu().detach()))
#         rmse_loss = np.sqrt(total_loss / len(train_loader))
        rmse_loss = metrics.mean_squared_error(ratings, predictions, squared=False)
        mae_loss = metrics.mean_absolute_error(ratings, predictions)
        print(f'Testing RMSE: {rmse_loss}, Testing MAE: {mae_loss}')
    
    return rmse_loss, mae_loss, predictions, ratings

In [88]:
# Build the neural-network recommender and its optimizer.
# config.sparse is passed through, but MFNeuralNetwork does not forward it to its
# embeddings, so all parameters are dense and can be handed to torch.optim.Adam.
model = MFNeuralNetwork(len(reviewer2int), len(product2int), config.emb_sz, config.sparse)
model = model.to(config.device)
loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)
# optimizer_reviewer = torch.optim.SparseAdam([model.reviewer_embeddings.weight, model.reviewer_biases.weight], lr=config.learning_rate)
# optimizer_product = torch.optim.SparseAdam([model.product_embeddings.weight, model.product_biases.weight], lr=config.learning_rate)

In [27]:
# model = MFModel(len(reviewer2int), len(product2int), config.emb_sz, config.sparse)
# model = model.to(config.device)
# loss_fn = nn.MSELoss()
# optimizer = torch.optim.SparseAdam(model.parameters(), lr=config.learning_rate)

In [92]:
def engine(train_loader, test_loader, model, loss_fn, optimizer, epochs, device,
           max_patience=5, run_name='nn'):
    """Train for up to ``epochs`` epochs with early stopping on test RMSE.

    After every epoch the model is evaluated on ``test_loader``. Whenever the
    test RMSE improves, the weights, test predictions and test ratings are
    written to ``best_model_<run_name>.pth``, ``best_predictions_<run_name>.npy``
    and ``ratings_<run_name>.npy`` in the working directory.

    Args:
        train_loader, test_loader: batch iterables handed to ``train`` / ``test``.
        model: the ``nn.Module`` being trained.
        loss_fn: loss passed through to ``train`` / ``test``.
        optimizer: optimizer passed through to ``train``.
        epochs: maximum number of epochs.
        device: device passed through to ``train`` / ``test``.
        max_patience: stop once this many consecutive epochs have not
            improved the best test RMSE (default 5).
        run_name: suffix of the three output files (default 'nn'); use a
            different value per model so runs do not overwrite each other.

    Returns:
        (train_losses, train_maes, test_losses, test_maes, best_test_loss,
        best_test_mae): the four per-epoch histories plus the best test RMSE
        and the MAE recorded at that epoch.
    """
    best_test_loss = float('inf')
    best_test_mae = float('inf')
    train_losses, train_maes = [], []
    test_losses, test_maes = [], []
    patience = 0

    for _ in range(epochs):
        print("Starting Training ...")
        model.train()
        train_loss, train_mae, _, _ = train(train_loader, model, loss_fn, optimizer, device)
        train_losses.append(train_loss)
        train_maes.append(train_mae)

        print("Starting Testing ...")
        # Switch to inference mode for evaluation; model.train() at the top of
        # the loop switches back for the next epoch.
        model.eval()
        test_loss, test_mae, test_predictions, test_ratings = test(test_loader, model, loss_fn, device)
        test_losses.append(test_loss)
        test_maes.append(test_mae)

        if test_loss < best_test_loss:
            best_test_loss = test_loss
            best_test_mae = test_mae
            torch.save(model.state_dict(), f'best_model_{run_name}.pth')
            np.save(f'best_predictions_{run_name}.npy', np.array(test_predictions))
            np.save(f'ratings_{run_name}.npy', np.array(test_ratings))
            patience = 0
        else:
            patience += 1

        if patience >= max_patience:
            break

    return train_losses, train_maes, test_losses, test_maes, best_test_loss, best_test_mae

In [31]:
# Train/evaluate and keep the histories under the un-suffixed names.
# NOTE(review): the plotting cell labels these series "Matrix Factorization", while
# the model cell visible above builds MFNeuralNetwork - confirm which model this
# run is meant to use.
train_losses, train_maes, test_losses, test_maes, best_test_loss, best_test_mae = engine(train_loader, test_loader, model, loss_fn, optimizer, config.epochs, config.device)

In [91]:
# Display the best test RMSE and the MAE recorded at that epoch.
best_test_loss, best_test_mae 

In [103]:
# NOTE(review): these two names are first assigned by the engine(...) call in the
# next cell, so on a fresh kernel this cell raises NameError when run top to bottom.
best_test_loss_nn, best_test_mae_nn

In [93]:
# Same engine(...) call as before, with the results kept under *_nn names.
# NOTE(review): `model` and `optimizer` are reused as they are at this point; if the
# earlier engine(...) cell already trained these objects, this run continues from
# those weights instead of starting from a fresh initialisation.
train_losses_nn, train_maes_nn, test_losses_nn, test_maes_nn, best_test_loss_nn, best_test_mae_nn = engine(train_loader, test_loader, model, loss_fn, optimizer, config.epochs, config.device)

In [117]:
# NOTE(review): engine() as defined in this notebook writes 'ratings_nn.npy' and
# 'best_predictions_nn.npy'; no visible cell produces the *_mf.npy files loaded
# here, so they must already exist on disk.
test_ratings = np.load('ratings_mf.npy')
test_predictions = np.load('best_predictions_mf.npy')

In [None]:
def print_metrics(test_ratings, test_predictions):
    """Print macro-averaged recall, precision and F1, in that order.

    Ratings and predictions are rounded to the nearest integer with ``np.rint``
    and the rounded values are scored as class labels.

    Args:
        test_ratings: array-like of true ratings.
        test_predictions: array-like of predicted ratings.
    """
    for score_fn in (metrics.recall_score, metrics.precision_score, metrics.f1_score):
        true_labels = np.rint(test_ratings)
        predicted_labels = np.rint(test_predictions)
        print(score_fn(true_labels, predicted_labels, average='macro'))

In [82]:
import matplotlib.pyplot as plt 

In [102]:

# 2x2 grid of RMSE-per-epoch curves: one panel per (model, split) history.
panels = [
    ('Matrix Factorization Train', train_losses),
    ('Matrix Factorization Test', test_losses),
    ('Neural Network Train', train_losses_nn),
    ('Neural Network Test', test_losses_nn),
]

for position, (panel_title, rmse_history) in enumerate(panels, start=1):
    plt.subplot(2, 2, position)
    plt.plot(np.arange(len(rmse_history)), rmse_history)
    plt.xlabel("Epochs")
    plt.ylabel("RMSE Loss")
    plt.title(panel_title)

# Widen the gaps so titles and axis labels of neighbouring panels do not overlap.
plt.subplots_adjust(left=0.1, bottom=0.1, right=0.9, top=0.9, wspace=0.4, hspace=0.6)

In [94]:
# Stand-alone plot of the neural-network training RMSE per epoch.
plt.plot(np.arange(len(train_losses_nn)),train_losses_nn)

plt.xlabel("Epochs")
plt.ylabel("RMSE Loss")
plt.title('Neural Network')

In [37]:
# Move the model to the CPU: get_product_ratings below builds its index tensors
# with torch.tensor(...) and no device argument, i.e. on the CPU.
model = model.to('cpu')

In [73]:
def get_product_ratings(df_test, model, reviewer2int, product2int, int2product, k=10):
    """Recommend the ``k`` highest-scoring test products for each active reviewer.

    Only reviewers with more than 10 rows in ``df_test`` are considered, and the
    candidate products are the distinct products reviewed by those reviewers.

    Args:
        df_test: DataFrame with 'reviewerID' and 'asin' columns.
        model: callable as ``model(product_id, reviewer_id)`` on CPU long
            tensors, returning one score per pair.
        reviewer2int: reviewer id -> embedding index.
        product2int: product id -> embedding index.
        int2product: not used; kept so existing call sites keep working. Ranked
            positions refer to the candidate list, not to the global vocabulary,
            so they are resolved against the candidate list instead.
        k: number of products to return per reviewer (default 10). When there
            are fewer candidates, all of them are returned.

    Returns:
        dict mapping reviewer id -> list of product ids, best first.
    """
    df_test_1 = df_test.groupby('reviewerID').filter(lambda x: len(x) > 10)
    reviewers_in_test_dataset = df_test_1['reviewerID'].unique()
    products_in_test_dataset = df_test_1['asin'].unique()

    top10products_user = {}
    if len(products_in_test_dataset) == 0:
        return top10products_user

    # Vocabulary indices of all candidates, built once and reused for every reviewer.
    candidate_ids = torch.tensor(
        [product2int[product] for product in products_in_test_dataset], dtype=torch.long
    )
    top_n = min(k, len(candidate_ids))

    # Inference only: no autograd graph, and one batched forward pass per reviewer
    # instead of one model call per (reviewer, product) pair.
    with torch.no_grad():
        for reviewer in tqdm(reviewers_in_test_dataset):
            reviewer_ids = torch.full_like(candidate_ids, reviewer2int[reviewer])
            scores = model(candidate_ids, reviewer_ids).reshape(-1)
            top_positions = torch.topk(scores, top_n).indices.tolist()
            # top_positions index the candidate list, so map them back through it.
            top10products_user[reviewer] = [
                products_in_test_dataset[position] for position in top_positions
            ]
    return top10products_user

In [49]:
# Per-reviewer recommendations for reviewers with more than 10 test rows.
top10products_user = get_product_ratings(df_test, model, reviewer2int, product2int, int2product)

In [74]:
# NOTE(review): this call passes exactly the same arguments as the cell above, so as
# written it yields 10 products per reviewer despite the `top50` name.
top50products_user = get_product_ratings(df_test, model, reviewer2int, product2int, int2product)

In [75]:
def calculate_precision(topk, total, num_k):
    """Precision@k: hits among the recommendations divided by ``num_k``.

    Args:
        topk: iterable of recommended items.
        total: container of relevant items (anything supporting ``in``).
        num_k: number of recommendation slots used as the denominator.

    Returns:
        float in [0, 1] for sensible inputs; 0.0 when ``num_k`` is not positive
        (instead of raising ZeroDivisionError).
    """
    if num_k <= 0:
        return 0.0
    relevant = sum(1 for item in topk if item in total)
    return relevant / num_k

def calculate_recall(topk, total):
    """Recall: hits among the recommendations divided by ``len(total)``.

    Args:
        topk: iterable of recommended items.
        total: sized container of relevant items (supports ``in`` and ``len``).

    Returns:
        float in [0, 1]; 0.0 when ``total`` is empty (instead of raising
        ZeroDivisionError).
    """
    if len(total) == 0:
        return 0.0
    relevant = sum(1 for item in topk if item in total)
    return relevant / len(total)
    

In [77]:
def calculate_metrics(df_test, top10products_user):
    """Per-reviewer precision, recall and F1 of the recommendations.

    Only reviewers with more than 10 rows in ``df_test`` are scored. For each of
    them the relevant items are the distinct products they reviewed in
    ``df_test``.

    Args:
        df_test: DataFrame with 'reviewerID' and 'asin' columns.
        top10products_user: dict reviewer id -> list of recommended product
            ids; must hold an entry for every scored reviewer.

    Returns:
        (precisions, recalls, f1s): three lists with one value per scored
        reviewer, in order of first appearance in ``df_test``.

    Raises:
        KeyError: if a scored reviewer has no entry in ``top10products_user``.
    """
    df_test_1 = df_test.groupby('reviewerID').filter(lambda x: len(x) > 10)

    precisions = []
    recalls = []
    f1s = []
    # One grouped pass instead of re-filtering the frame for every reviewer;
    # sort=False keeps reviewers in first-appearance order.
    for reviewer, reviewer_rows in df_test_1.groupby('reviewerID', sort=False):
        products_true = set(reviewer_rows['asin'])
        products_recom = top10products_user[reviewer]

        hits = sum(1 for product in products_recom if product in products_true)
        # Precision is taken over the recommendations actually supplied, not a
        # fixed list length.
        precision = hits / len(products_recom) if len(products_recom) else 0.0
        recall = hits / len(products_true)
        # F1 is the harmonic mean 2PR / (P + R); it is 0 when there are no hits.
        f1 = 2 * precision * recall / (precision + recall) if hits else 0.0

        precisions.append(precision)
        recalls.append(recall)
        f1s.append(f1)
    return precisions, recalls, f1s
        