In [1]:
import pandas as pd
import numpy as np
import json
import os
import scipy.sparse as spp
import torch
from spotlight.evaluation import rmse_score
from collections import defaultdict
import csv

### Process items to match Interactions object in Spotlight

In [2]:
users = []
items = []
score = []
timestamp = []
helpful = []
    
with open('reviews.training.json', 'r') as f:
    for line in f:
        x = json.loads(line)
        users.append(x['reviewerID'])
        items.append(x['asin'])
        score.append(x['overall'])
        timestamp.append(x['unixReviewTime'])
        helpful.append(x['helpful'])

trainusers = users[:]
trainitems = items[:]
        
with open('reviews.dev.json', 'r') as g:
    for line in g:
        x = json.loads(line)
        users.append(x['reviewerID'])
        items.append(x['asin'])

with open('reviews.test.unlabeled.csv', 'r') as h:
    next(h, None)
    for line in h:
        x = line.split(',')
        users.append(x[1].strip())
        items.append(x[2].strip())

user_labels = {y: x for x, y in enumerate(sorted(set(users)))}
item_labels = {y: x for x, y in enumerate(sorted(set(items)))}

In [3]:
user_index = np.int32([user_labels[x] for x in trainusers])
item_index = np.int32([item_labels[x] for x in trainitems])

score = np.float32(score)

In [4]:
train = spp.coo_matrix((score, (user_index, item_index)))

In [5]:
len(set(users)), len(set(items))

(123960, 51744)

In [6]:
len(set(trainusers)), len(set(trainitems))

(123952, 50050)

### Process dev items

In [7]:
devusers = []
devitems = []
devscore = []
dev_tups = []

with open('reviews.dev.json', 'r') as g:
    for line in g:
        x = json.loads(line)
        devusers.append(x['reviewerID'])
        devitems.append(x['asin'])
        devscore.append(x['overall'])
        dev_tups.append((user_labels[x['reviewerID']], item_labels[x['asin']], x['overall']))

dev_ui = np.int32([user_labels[x] for x in devusers])
dev_ii = np.int32([item_labels[x] for x in devitems])

devscore = np.float32(devscore)

### Spotlight Models: Explicit Factorization (full dataset)

In [8]:
import spotlight as sl
from spotlight.interactions import Interactions
import torch
from spotlight.factorization.explicit import ExplicitFactorizationModel
from spotlight.factorization.implicit import ImplicitFactorizationModel

In [9]:
sp_train = Interactions(user_ids = user_index, 
                        item_ids = item_index,
                        ratings = score,
                        num_users = 123960,
                        num_items = 51744 
            )

In [10]:
sp_dev = Interactions(user_ids = dev_ui, 
                        item_ids = dev_ii,
                        ratings = devscore,
                        num_users = 123960,
                        num_items = 51744 
            )

In [11]:
latent = 128
iterations = 4
mbatch = 1024
L2 = 1e-9
learning = 1e-3

In [14]:
expmodel = ExplicitFactorizationModel(loss='regression',
                                       embedding_dim=latent,  # latent dimensionality
                                       n_iter=iterations,  # number of epochs of training
                                       batch_size=mbatch,  # minibatch size
                                       l2=L2,  # strength of L2 regularization
                                       learning_rate=learning,
                                       use_cuda=torch.cuda.is_available()
                                     )

In [15]:
expmodel.fit(sp_train, verbose=True)

Epoch 0: loss 13.042520043359453
Epoch 1: loss 3.242231524969818
Epoch 2: loss 1.2984356079320756
Epoch 3: loss 0.9573760417591115


In [16]:
rmse1 = rmse_score(expmodel, sp_train)
rmse2 = rmse_score(expmodel, sp_dev)
rmse1, rmse2

(0.8623668, 1.0816336)

In [17]:
from sklearn.metrics import mean_squared_error

In [18]:
test_tups = []

with open('reviews.test.unlabeled.csv', 'r') as h:
    next(h, None)
    for line in h:
        x = line.split(',')
        test_tups.append((x[0].strip(), user_labels[x[1].strip()], item_labels[x[2].strip()]))

In [19]:
y_true = []
y_pred = []

for x in dev_tups:
    predictions = expmodel.predict(np.array([x[0], x[0]], dtype=np.int64), np.array([x[1], x[1]], dtype=np.int64))[0]
    if predictions > 5:
        predictions = 5
    if predictions < 1:
        predictions = 1
    y_true.append(x[2])
    y_pred.append(predictions)

In [20]:
true_rmse = np.sqrt(mean_squared_error(y_true, y_pred))
true_rmse

1.0708745074851083

In [22]:
expmodel2 = ExplicitFactorizationModel(loss='regression',
                                       embedding_dim=128,  # latent dimensionality
                                       n_iter=8,  # number of epochs of training
                                       batch_size=mbatch,  # minibatch size
                                       l2=L2,  # strength of L2 regularization
                                       learning_rate=learning,
                                       use_cuda=torch.cuda.is_available()
                                     )

In [23]:
expmodel2.fit(sp_train, verbose=True)

Epoch 0: loss 13.089207182095835
Epoch 1: loss 3.278919423174481
Epoch 2: loss 1.3041685572583241
Epoch 3: loss 0.949660100609895
Epoch 4: loss 0.8103529811354625
Epoch 5: loss 0.6844400304012399
Epoch 6: loss 0.5470632019578378
Epoch 7: loss 0.40772226946912676


In [24]:
y_true = []
y_pred = []

for x in dev_tups[:50]:
    predictions = expmodel2.predict(np.array([x[0], x[0]], dtype=np.int64), np.array([x[1], x[1]], dtype=np.int64))[0]
    if predictions > 5:
        predictions = 5
    if predictions < 1:
        predictions = 1
    y_true.append(x[2])
    y_pred.append(predictions)
    print(x, predictions)

(69616, 27706, 5.0) 4.9988813
(75326, 9399, 2.0) 3.4523451
(33338, 44134, 5.0) 4.611212
(98604, 12860, 3.0) 1.8130231
(85381, 15444, 3.0) 4.698665
(24044, 23391, 5.0) 4.2680655
(55350, 9671, 5.0) 3.9705033
(120327, 45513, 5.0) 3.96565
(1611, 45869, 5.0) 5
(67328, 43732, 5.0) 4.7994933
(60921, 21823, 4.0) 4.433361
(69654, 49595, 5.0) 3.478121
(57987, 41782, 3.0) 3.8890612
(59435, 8582, 1.0) 4.7688513
(59451, 9625, 5.0) 5
(88766, 30047, 4.0) 3.3429453
(11995, 15195, 4.0) 4.1063943
(76608, 16672, 2.0) 2.2537487
(11138, 17614, 5.0) 5
(113174, 44869, 5.0) 4.1313214
(68519, 4941, 5.0) 4.7420883
(117221, 28960, 3.0) 3.1606114
(121393, 13106, 3.0) 3.205855
(35633, 33759, 4.0) 3.8210588
(120045, 49512, 5.0) 5
(67037, 5020, 4.0) 3.4852266
(43999, 40664, 5.0) 5
(85683, 39286, 5.0) 3.6200361
(40336, 9764, 3.0) 3.8191195
(9938, 9669, 5.0) 4.243241
(40170, 191, 5.0) 4.7740946
(81278, 14262, 5.0) 5
(61990, 933, 3.0) 3.5157237
(78207, 23318, 4.0) 3.7575312
(36150, 1991, 5.0) 4.42806
(83329, 21306, 5.0

In [25]:
true_rmse = np.sqrt(mean_squared_error(y_true, y_pred))
true_rmse

1.0171161321256215

In [26]:
y_true = []
y_pred = []

for x in dev_tups:
    predictions = expmodel2.predict(np.array([x[0], x[0]], dtype=np.int64), np.array([x[1], x[1]], dtype=np.int64))[0]
    if predictions > 5:
        predictions = 5
    if predictions < 1:
        predictions = 1
    y_true.append(x[2])
    y_pred.append(predictions)

In [27]:
with open('submit_sp_moretraining.csv', 'w', newline='') as g:
    csvw = csv.writer(g, delimiter=',')
    csvw.writerow(['datapointID','overall'])
    for x in test_tups:
        predictions = expmodel2.predict(np.array([x[1], x[1]], dtype=np.int64), np.array([x[2], x[2]], dtype=np.int64))[0]
        if predictions > 5:
            predictions = 5
        if predictions < 1:
            predictions = 1
        csvw.writerow([x[0], predictions])

In [28]:
expmodel3 = ExplicitFactorizationModel(loss='regression',
                                       embedding_dim=100,  # latent dimensionality
                                       n_iter=6,  # number of epochs of training
                                       batch_size=mbatch,  # minibatch size
                                       l2=1e-7,  # strength of L2 regularization
                                       learning_rate=learning,
                                       use_cuda=torch.cuda.is_available()
                                     )

In [29]:
expmodel3.fit(sp_train, verbose=True)

Epoch 0: loss 13.603769565760505
Epoch 1: loss 3.8392041759914983
Epoch 2: loss 1.4731016009035376
Epoch 3: loss 1.0013021011952628
Epoch 4: loss 0.8485163029370312
Epoch 5: loss 0.7418312040054448


In [32]:
with open('submit_sp_reducedim.csv', 'w', newline='') as g:
    csvw = csv.writer(g, delimiter=',')
    csvw.writerow(['datapointID','overall'])
    for x in test_tups:
        predictions = expmodel3.predict(np.array([x[1], x[1]], dtype=np.int64), np.array([x[2], x[2]], dtype=np.int64))[0]
        if predictions > 5:
            predictions = 5
        if predictions < 1:
            predictions = 1
        csvw.writerow([x[0], predictions])

In [33]:
y_true = []
y_pred = []

for x in dev_tups[:50]:
    predictions = expmodel3.predict(np.array([x[0], x[0]], dtype=np.int64), np.array([x[1], x[1]], dtype=np.int64))[0]
    if predictions > 5:
        predictions = 5
    if predictions < 1:
        predictions = 1
    y_true.append(x[2])
    y_pred.append(predictions)

true_rmse = np.sqrt(mean_squared_error(y_true, y_pred))
true_rmse

0.9816580881823934