In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import os
import sys
import re
import pickle
import ast
from itertools import chain
import random
from datetime import datetime

os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

# Downloading Datasets

In [None]:
# !curl http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Electronics_5.json.gz --output Electronics_5.json.gz

In [None]:
# !curl http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_Electronics.json.gz --output electronics_metadata.json.gz

# Converting JSON to DataFrames

In [None]:
%%time
def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield json.loads(l.strip())

def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

df = getDF('./raw/Electronics_5.json.gz')
df_meta = getDF('./raw/electronics_metadata.json.gz')
df.shape, df_meta.shape

In [None]:
# cleaning reviews DataFrame
df = df.drop(columns=['image','reviewTime'])
df['reviewTime'] = pd.to_datetime(df['unixReviewTime'], unit='s')
df = df.drop(columns=['reviewText','summary','reviewerName'])
df.drop('style', axis=1).nunique()

In [None]:
# cleaning meta DataFrame
df_meta = df_meta[['asin','main_cat','title']]
df_meta = df_meta.drop_duplicates()
df_meta = df_meta[~df_meta['main_cat'].str.contains('img src')]
df_meta['main_cat'] = df_meta['main_cat'].replace(r'^\s*$', 'others', regex=True)
df_meta.nunique()

In [None]:
# Merge Reviews and Meta Datasets

data = df.merge(df_meta,how='inner', left_on='asin', right_on='asin')

# Create Sequential Data

In [None]:
seq = data[['reviewerID','asin','main_cat','reviewTime']]
seq = seq.sort_values('reviewTime')

# Calculating length of sequences
seq = seq.groupby('reviewerID').agg(list).reset_index()
seq['length'] = seq.apply(lambda row: len(row['asin']), axis=1)

print('Mean length of sequences per user: {}'.format(seq['length'].mean()))
print('Median length of sequences per user: {}'.format(seq['length'].median()))

In [None]:
seq.to_csv('./seq_data.csv', index=False)

# Creating Datsets for Training

In [2]:
seq = pd.read_csv('./seq_data.csv')
seq = seq[seq['length'] > 7]
seq['asin'] = seq['asin'].apply(lambda x: ast.literal_eval(x))
seq

Unnamed: 0,reviewerID,asin,main_cat,reviewTime,length
1,A0003214FKMKJE0PCW3D,"[B000P1O73A, B008562SXS, B00BFO14W8, B004XVN1V...","['Computers', 'Home Audio & Theater', 'Cell Ph...","[Timestamp('2014-10-29 00:00:00'), Timestamp('...",9
3,A00101847G3FJTWYGNQA,"[B00AWRUIY4, B00C7NSIO8, B005F778JO, B006T9B6R...","['Computers', 'Computers', 'All Electronics', ...","[Timestamp('2013-09-19 00:00:00'), Timestamp('...",11
4,A00222906VX8GH7X6J6B,"[B00DI89IQS, B00EPIWY2U, B00JFR8UQ0, B00J8DL78...","['Home Audio & Theater', 'Computers', 'Compute...","[Timestamp('2015-01-22 00:00:00'), Timestamp('...",13
8,A0072193KFP6LUHKEXLT,"[B00008N6Y8, B00AF56QA8, B000I97G0U, B000EPNB5...","['All Electronics', 'Home Audio & Theater', 'C...","[Timestamp('2013-10-24 00:00:00'), Timestamp('...",9
9,A007917716EGEEP4D8LB,"[B00HSGR8PE, B0063X4BNK, B00SJVCT88, B000WJTEN...","['Computers', 'Computers', 'Computers', 'All E...","[Timestamp('2015-10-15 00:00:00'), Timestamp('...",11
...,...,...,...,...,...
524812,AZZXJAE2DILET,"[B00DKFF386, B00DU45VWU, B0013FRNKG, B00EAM87E...","['Computers', 'Computers', 'Computers', 'Compu...","[Timestamp('2014-08-13 00:00:00'), Timestamp('...",20
524813,AZZY4W8E5AX2K,"[B0092U4140, B0046S54GC, B008HCX6S6, B00HZZGY8...","['Amazon Devices', 'Home Audio & Theater', 'Ca...","[Timestamp('2013-10-21 00:00:00'), Timestamp('...",8
524814,AZZYJH0XNZ896,"[B006WHPQSQ, B0099TX7O4, B008X8NK0I, B00E3R0S1...","['Home Audio & Theater', 'Computers', 'Compute...","[Timestamp('2014-06-14 00:00:00'), Timestamp('...",23
524816,AZZYW4YOE1B6E,"[B001AK0496, B002PHM0G8, B004Z0S7K6, B009FWEKO...","['Home Audio & Theater', 'Computers', 'All Ele...","[Timestamp('2009-01-28 00:00:00'), Timestamp('...",44


## Example of a split

In [3]:
# https://stackoverflow.com/a/65216213/15751564
a = pd.Series(seq['asin'].iloc[0])
b = [window.to_list() for window in a.rolling(window=6)]
c = a.values.tolist()
d = [i[:-1] for i in b]
l = ['train'] * len(c)
l[-1] = 'test'
l[-2] = 'valid'
for i in range(1, len(c)):
    print(d[i], c[i], l[i])

['B000P1O73A'] B008562SXS train
['B000P1O73A', 'B008562SXS'] B00BFO14W8 train
['B000P1O73A', 'B008562SXS', 'B00BFO14W8'] B004XVN1V2 train
['B000P1O73A', 'B008562SXS', 'B00BFO14W8', 'B004XVN1V2'] B0016BVDIK train
['B000P1O73A', 'B008562SXS', 'B00BFO14W8', 'B004XVN1V2', 'B0016BVDIK'] B0016BVDIK train
['B008562SXS', 'B00BFO14W8', 'B004XVN1V2', 'B0016BVDIK', 'B0016BVDIK'] B004XJ6R0Q train
['B00BFO14W8', 'B004XVN1V2', 'B0016BVDIK', 'B0016BVDIK', 'B004XJ6R0Q'] B000Z9R2QQ valid
['B004XVN1V2', 'B0016BVDIK', 'B0016BVDIK', 'B004XJ6R0Q', 'B000Z9R2QQ'] B000Z9R2QQ test


In [4]:
%%time

def create_split(asin, window_size=6):
    a = pd.Series(asin)
    windows = [window.to_list() for window in a.rolling(window_size)] 
    # https://stackoverflow.com/a/65216213/15751564
    input = [i[:-1] for i in windows]
    labels = ['train'] * len(asin)
    labels[-1] = 'test'
    labels[-2] = 'valid'

    return labels[1:], input[1:], asin[1:]

def save_files(seq, sample_percentage=0.1):
    test = seq[['reviewerID','asin']]
    test[['split', 'input', 'target']] = test.apply(lambda r: create_split(r['asin']), axis=1, result_type="expand")
    test = test.drop(columns=['asin'])
    t = test.set_index(['reviewerID']).apply(pd.Series.explode).reset_index()
    t['label'] = 1
    t.to_csv('./splitted.csv', sep='\t', index=False)
    t[t['split']=='train'].sample(frac=sample_percentage, random_state=4535).to_csv('./train_data.csv', sep='\t', header=False, index=False)
    t[t['split']=='valid'].sample(frac=sample_percentage, random_state=4535).to_csv('./valid_data.csv', sep='\t', header=False, index=False)
    t[t['split']=='test'].sample(frac=sample_percentage, random_state=4535).to_csv('./test_data.csv', sep='\t', header=False, index=False)
    return t

save_files(seq)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


CPU times: user 2min 56s, sys: 3.08 s, total: 2min 59s
Wall time: 2min 58s


Unnamed: 0,reviewerID,split,input,target,label
0,A0003214FKMKJE0PCW3D,train,[B000P1O73A],B008562SXS,1
1,A0003214FKMKJE0PCW3D,train,"[B000P1O73A, B008562SXS]",B00BFO14W8,1
2,A0003214FKMKJE0PCW3D,train,"[B000P1O73A, B008562SXS, B00BFO14W8]",B004XVN1V2,1
3,A0003214FKMKJE0PCW3D,train,"[B000P1O73A, B008562SXS, B00BFO14W8, B004XVN1V2]",B0016BVDIK,1
4,A0003214FKMKJE0PCW3D,train,"[B000P1O73A, B008562SXS, B00BFO14W8, B004XVN1V...",B0016BVDIK,1
...,...,...,...,...,...
4009267,AZZZU3P1UQZ0C,train,"[B0007NWL70, B01DMHPT3U, B007PUMCWC]",B00ESW9SZ4,1
4009268,AZZZU3P1UQZ0C,train,"[B0007NWL70, B01DMHPT3U, B007PUMCWC, B00ESW9SZ4]",B00092DZ6U,1
4009269,AZZZU3P1UQZ0C,train,"[B0007NWL70, B01DMHPT3U, B007PUMCWC, B00ESW9SZ...",B014Q8XTZE,1
4009270,AZZZU3P1UQZ0C,valid,"[B01DMHPT3U, B007PUMCWC, B00ESW9SZ4, B00092DZ6...",B018APC50Y,1


In [7]:
# create an Item Dict
def create_item_dict(complete):
    asin = complete['asin'].values.tolist()
    asin = list(chain.from_iterable(asin)) # https://stackoverflow.com/a/29244327/15751564
    asin = list(set(asin))
    item_dict = {asin[i-1]:i for i in range(1,len(asin)+1)}

    file = open(b"item_dict.pkl","wb")
    pickle.dump(item_dict, file)
    file.close()

    return item_dict, asin

item_dict, asin = create_item_dict(seq)

len(item_dict)

159305

In [8]:
%%time

# Negative sampling
def negative_sampling(filename, asin, num_negs=5):
    with open(filename, "r") as f:
        test_lines = f.readlines()
    write_test = open(filename[:-4]+'_output.csv', "w")
    for line in test_lines:
        write_test.write(line)
        words = line.strip().split('\t')
        positive_item = words[-2]
        count = 0
        neg_items = set()
        while count < num_negs:
            neg_item = random.choice(asin)
            if neg_item == positive_item or neg_item in neg_items:
                continue
            count += 1
            neg_items.add(neg_item)
            words[-1] = "0"
            words[-2] = neg_item
            a = "\t".join(words) + "\n"
            write_test.write("\t".join(words) + "\n")

negative_sampling('./train_data.csv', asin, num_negs=1)
negative_sampling('./valid_data.csv', asin)
negative_sampling('./test_data.csv', asin, num_negs=50)

train = pd.read_csv('./train_data_output.csv', sep='\t')
valid = pd.read_csv('./valid_data_output.csv', sep='\t')
test = pd.read_csv('./test_data_output.csv', sep='\t')
train.shape, valid.shape, test.shape

CPU times: user 4.58 s, sys: 556 ms, total: 5.14 s
Wall time: 5.14 s


((678951, 5), (184355, 5), (1567025, 5))

# Dataset and Dataloader class

In [9]:
%%time

from torch.utils.data import Dataset
from torch.utils.data import DataLoader

class CustomDataset(Dataset):
    def __init__(self, file, item_dict, pad=5):
        self.file = pd.read_csv(file, sep='\t', names=[1,2,3,4,5])
        self.file[3] = self.file[3].apply(lambda x: ast.literal_eval(x))
        self.item_dict = item_dict
        self.input = self.file[3].values
        self.target = self.file[4].values
        self.labels = self.file[5].values
        self.pad = pad

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        input = self.input[idx]
        input = [self.item_dict[i] for i in input]
        target = self.item_dict[self.target[idx]]
        label = [self.labels[idx]]
        input = torch.tensor(input)
        label = torch.tensor(label)
        input = nn.ConstantPad1d((self.pad-input.shape[0], 0), 0)(input)
        e_i = torch.zeros(len(self.item_dict) + 1)
        e_i[target] = 1

        return (input,e_i), label

training = CustomDataset('./train_data_output.csv', item_dict)
validation = CustomDataset('./valid_data_output.csv', item_dict)
train_dataloader = DataLoader(training, batch_size=64, shuffle=True)
valid_dataloader = DataLoader(validation, batch_size=60, shuffle=True)

CPU times: user 8.72 s, sys: 244 ms, total: 8.97 s
Wall time: 8.97 s


# Model Class

In [10]:
class seqrecModel(nn.Module):
    def __init__(self, cell, len_item, embedding_dim, hidden_dim, final_dim, seq_length,num_layers, bidirectional=False):
        super(seqrecModel, self).__init__()
        self.len_item = len_item + 1
        self.embs = nn.Embedding(self.len_item, embedding_dim, padding_idx=0)
        self.rnn = cell(embedding_dim, hidden_dim, num_layers, bidirectional=bidirectional,batch_first=True)
        if bidirectional:
            self.l = nn.Linear(2*hidden_dim, final_dim)
        else:
            self.l = nn.Linear(hidden_dim, final_dim)
        self.l1 = nn.Linear(final_dim*seq_length+self.len_item, hidden_dim-final_dim)
        self.l2 = nn.Linear(hidden_dim-final_dim, 1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

        nn.init.xavier_uniform_(self.l.weight)
        nn.init.zeros_(self.l.bias)
        nn.init.xavier_uniform_(self.l1.weight)
        nn.init.zeros_(self.l1.bias)
        nn.init.xavier_uniform_(self.l2.weight)
        nn.init.zeros_(self.l2.bias)

    def forward(self, input):
        embeds = self.embs(input[0])
        rnn_out, _ = self.rnn(embeds)
        l = self.l(rnn_out)
        inp = torch.cat((l.flatten(1),input[1]), 1)
        l1 = self.relu(self.l1(inp))
        score = self.sigmoid(self.l2(l1))
        return score

# Evaluation Metrics

In [11]:
def ndcg_score(y_true, y_score, k=10):
    """Computing ndcg score metric at k.

    Args:
        y_true (np.ndarray): Ground-truth labels.
        y_score (np.ndarray): Predicted labels.

    Returns:
        numpy.ndarray: ndcg scores.
    """
    best = dcg_score(y_true, y_true, k)
    actual = dcg_score(y_true, y_score, k)
    ans = actual / best
    if best == 0:
        return 0
    else:
        return actual / best

def hit_score(y_true, y_score, k=10):
    """Computing hit score metric at k.

    Args:
        y_true (np.ndarray): ground-truth labels.
        y_score (np.ndarray): predicted labels.

    Returns:
        np.ndarray: hit score.
    """
    ground_truth = np.where(y_true == 1)[0]
    argsort = np.argsort(y_score)[::-1][:k]
    for idx in argsort:
        if idx in ground_truth:
            return 1
    return 0


def dcg_score(y_true, y_score, k=10):
    """Computing dcg score metric at k.

    Args:
        y_true (np.ndarray): Ground-truth labels.
        y_score (np.ndarray): Predicted labels.

    Returns:
        np.ndarray: dcg scores.
    """
    k = min(np.shape(y_true)[-1], k)
    order = np.argsort(y_score)[::-1]
    y_true = np.take(y_true, order[:k])
    gains = 2 ** y_true - 1
    discounts = np.log2(np.arange(len(y_true)) + 2)
    return np.sum(gains / discounts)

def evaluate_metrics(metric, labels, preds):
    res = {}
    for i in metric:
        if i.startswith("ndcg"):  # format like:  ndcg@2;4;6;8
            ndcg_list = [1, 2]
            ks = i.split("@")
            if len(ks) > 1:
                ndcg_list = [int(token) for token in ks[1].split(";")]
            for k in ndcg_list:
                ndcg_temp = np.mean(
                    [
                        ndcg_score(each_labels, each_preds, k)
                        for each_labels, each_preds in zip(labels, preds)
                    ]
                )
                res["ndcg@{0}".format(k)] = round(ndcg_temp, 4)
                
        elif i.startswith("hit"):  # format like:  hit@2;4;6;8
            hit_list = [1, 2]
            ks = i.split("@")
            if len(ks) > 1:
                hit_list = [int(token) for token in ks[1].split(";")]
            for k in hit_list:
                hit_temp = np.mean(
                    [
                        hit_score(each_labels, each_preds, k)
                        for each_labels, each_preds in zip(labels, preds)
                    ]
                )
                res["hit@{0}".format(k)] = round(hit_temp, 4)
    return res

# Training and Test Loops

In [12]:
# Train function
def train(dataloader, model, loss_fn, optimizer, DEVICE):
    size = len(dataloader.dataset)
    for batch, (input, label) in enumerate(dataloader):
        model.zero_grad()
        input[0], input[1] = input[0].to(DEVICE), input[1].to(DEVICE)
        label = label.to(DEVICE)
        preds = model(input)
        
        label = label.float()
        loss = loss_fn(preds,label)
        loss.backward()
        optimizer.step()

        if batch % 1000 == 0:
            loss, current = loss.item(), batch*len(input[0])
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

def test(dataloader, model, DEVICE, num_negs=5):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    
    group = num_negs + 1
    predictions = []
    labels = []

    with torch.no_grad():
        for input, label in dataloader:
            input[0], input[1] = input[0].to(DEVICE), input[1].to(DEVICE)
            pred = model(input)
            pred = pred.cpu().numpy()
            label = label.numpy()
            
            predictions.extend(np.reshape(pred, (-1, group)))
            labels.extend(np.reshape(label, (-1, group)))
            

    res = evaluate_metrics(['ndcg@10', 'hit@10'], labels, predictions)
    print(res)
    return res

In [41]:
def trainandSaving(epochs, model, lossfn, optim, PATH):
    for t in range(epochs):
        start = datetime.now()
        print(f"Epoch {t+1}\n-------------------------------")
        train(train_dataloader, model, loss_fn, optimizer, DEVICE)
        results = test(valid_dataloader, model, DEVICE)
        diff = (datetime.now() - start).total_seconds() / 60.0
        print(f"Done in {diff} mins")
    
    torch.save(model.state_dict(), PATH)

        
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

for cell in [(nn.RNN,'RNN'), (nn.LSTM,'LSTM'), (nn.GRU,'GRU')]:
    for bidirection in [True, False]:
        model = seqrecModel(cell[0],len(item_dict), 100, 256, 128, seq_length=5,num_layers=2,bidirectional=bidirection)
        model = model.to(DEVICE)
        
        loss_fn = nn.BCELoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

        trainandSaving(10, model, loss_fn, optimizer, f"./trained/{cell[1]}.pt")

Epoch 1
-------------------------------
loss: 0.690807  [    0/678952]
loss: 0.135633  [64000/678952]
loss: 0.169644  [128000/678952]
loss: 0.251018  [192000/678952]
loss: 0.125674  [256000/678952]
loss: 0.119854  [320000/678952]
loss: 0.187529  [384000/678952]
loss: 0.172008  [448000/678952]
loss: 0.120195  [512000/678952]
loss: 0.074819  [576000/678952]
loss: 0.051090  [640000/678952]


  ans = actual / best


{'ndcg@10': 0.4384, 'hit@10': 0.6628}
Done in 6.59540895 mins
Epoch 2
-------------------------------
loss: 0.115195  [    0/678952]
loss: 0.161450  [64000/678952]
loss: 0.111647  [128000/678952]
loss: 0.314923  [192000/678952]
loss: 0.078330  [256000/678952]
loss: 0.188367  [320000/678952]
loss: 0.191598  [384000/678952]
loss: 0.124986  [448000/678952]
loss: 0.114240  [512000/678952]
loss: 0.208225  [576000/678952]
loss: 0.069257  [640000/678952]
{'ndcg@10': 0.4382, 'hit@10': 0.6702}
Done in 6.461520916666667 mins
Epoch 3
-------------------------------
loss: 0.120227  [    0/678952]
loss: 0.238389  [64000/678952]
loss: 0.083099  [128000/678952]
loss: 0.100993  [192000/678952]
loss: 0.109095  [256000/678952]
loss: 0.048874  [320000/678952]
loss: 0.081494  [384000/678952]
loss: 0.108552  [448000/678952]
loss: 0.124853  [512000/678952]
loss: 0.101285  [576000/678952]
loss: 0.034829  [640000/678952]
{'ndcg@10': 0.4402, 'hit@10': 0.6657}
Done in 6.429085116666667 mins
Epoch 4
------------

# Loading the trained models for evaluation

In [18]:
test_data = CustomDataset('./test_data_output.csv', item_dict)
test_dataloader = DataLoader(test_data, batch_size=102, shuffle=True)

RESULTS = []
for cell in [(nn.RNN,'RNN'), (nn.LSTM,'LSTM'), (nn.GRU,'GRU')]:
    for bidirection in [True, False]:
        device = torch.device("cuda")
        model = seqrecModel(cell[0],len(item_dict), 100, 256, 128, seq_length=5,num_layers=2,bidirectional=bidirection)
        model.load_state_dict(torch.load(f"./trained/{cell[1]}_{str(bidirection)}.pt", map_location="cuda:0")) 
        model = model.to(device)
        
        print(f"Model : {cell[1]}_{str(bidirection)}")
        res = test(test_dataloader, model, device, num_negs=50)
        print('-------------------')
        RESULTS.append((f"Model_{cell[1]}_{str(bidirection)}", res))     

Model : RNN_True


  ans = actual / best


{'ndcg@10': 0.0631, 'hit@10': 0.1792}
-------------------
Model : RNN_False
{'ndcg@10': 0.0641, 'hit@10': 0.1807}
-------------------
Model : LSTM_True
{'ndcg@10': 0.0636, 'hit@10': 0.1796}
-------------------
Model : LSTM_False
{'ndcg@10': 0.0636, 'hit@10': 0.1801}
-------------------
Model : GRU_True
{'ndcg@10': 0.0643, 'hit@10': 0.1819}
-------------------
Model : GRU_False
{'ndcg@10': 0.0647, 'hit@10': 0.1825}
-------------------


# Results

In [None]:
# RNN = {'ndcg@10': 0.0641, 'hit@10': 0.1807}
# RNN Biderectional = {'ndcg@10': 0.0631, 'hit@10': 0.1792}
# LSTM = {'ndcg@10': 0.0636, 'hit@10': 0.1801}
# LSTM Biderctional = {'ndcg@10': 0.0636, 'hit@10': 0.1796}
# GRU Biderctional = {'ndcg@10': 0.0643, 'hit@10': 0.1819}
# GRU = {'ndcg@10': 0.0647, 'hit@10': 0.1825}