In [1]:
import pandas as pd                         
from collections import defaultdict
import scipy
import scipy.optimize
import numpy
import random

In [18]:
df = pd.read_csv("data/Office_Products_5.csv")             
data = df.to_numpy()

In [19]:
train = data[:int(0.85*len(data))]
data = data[int(0.85*len(data)):]   #test data 

In [20]:
tr4 = []
tr5 = []
tr_other = []
for d in data:
    if d[2] == 4:
        tr4.append(d)
    elif d[2] == 5:
        tr5.append(d)
    else:
        tr_other.append(d)
data = tr_other + tr4[:10000] + tr5[:10000]
random.shuffle(data)

In [21]:
tr4 = []
tr5 = []
tr_other = []
for d in train:
    if d[2] == 4:
        tr4.append(d)
    elif d[2] == 5:
        tr5.append(d)
    else:
        tr_other.append(d)
train = tr_other + tr4[:50000] + tr5[:50000]
random.shuffle(train)

In [25]:
dataf = pd.DataFrame.from_records(data)
dataf.columns = ['reviewerID', 'asin','overall']
trainf = pd.DataFrame.from_records(train)
trainf.columns = ['reviewerID', 'asin','overall']

In [23]:
reviewsPerUser = defaultdict(list)
reviewsPerItem = defaultdict(list)
for d in data:
    user,item = d[0], d[1]
    reviewsPerUser[user].append(d)
    reviewsPerItem[item].append(d)

## Simple (bias only) latent factor-based recommender

In [6]:
N = len(data)
nUsers = len(reviewsPerUser)
nItems = len(reviewsPerItem)
users = list(reviewsPerUser.keys())
items = list(reviewsPerItem.keys())
alpha = sum([d[2] for d in data]) / len(data)
labels = [d[2] for d in data]

In [7]:
userBiases = defaultdict(float)
itemBiases = defaultdict(float)

In [8]:
def MSE(predictions, labels):
    differences = [(x-y)**2 for x,y in zip(predictions,labels)]
    return sum(differences) / len(differences)

In [9]:
def prediction(user, item):
    return alpha + userBiases[user] + itemBiases[item]

In [10]:
def unpack(theta):
    global alpha
    global userBiases
    global itemBiases
    alpha = theta[0]
    userBiases = dict(zip(users, theta[1:nUsers+1]))
    itemBiases = dict(zip(items, theta[1+nUsers:]))


In [11]:
def cost(theta, labels, lamb):
    unpack(theta)
    predictions = [prediction(d[0], d[1]) for d in data]
    cost = MSE(predictions, labels)
    print("MSE = " + str(cost))
    for u in userBiases:
        cost += lamb*userBiases[u]**2
    for i in itemBiases:
        cost += lamb*itemBiases[i]**2
    return cost

In [12]:
def derivative(theta, labels, lamb):
    unpack(theta)
    N = len(data)
    dalpha = 0
    dUserBiases = defaultdict(float)
    dItemBiases = defaultdict(float)
    for d in data:
        u,i = d[0], d[1]
        pred = prediction(u, i)
        diff = pred - d[2]
        dalpha += 2/N*diff
        dUserBiases[u] += 2/N*diff
        dItemBiases[i] += 2/N*diff
    for u in userBiases:
        dUserBiases[u] += 2*lamb*userBiases[u]
    for i in itemBiases:
        dItemBiases[i] += 2*lamb*itemBiases[i]
    dtheta = [dalpha] + [dUserBiases[u] for u in users] + [dItemBiases[i] for i in items]
    return numpy.array(dtheta)

In [13]:
scipy.optimize.fmin_l_bfgs_b(cost, [alpha] + [0.0]*(nUsers+nItems),
                             derivative, args = (labels, 0.001))

MSE = 1.7930627363636482
MSE = 1.7349223215285619
MSE = 2.8043078245334043
MSE = 1.6795845118470718
MSE = 1.5497686449877404
MSE = 1.54750179717496
MSE = 1.5390552507881587
MSE = 1.443921923461243
MSE = 1.3942257831204012
MSE = 1.3564022816190109
MSE = 1.3392963700607183
MSE = 1.3176841699818578
MSE = 1.304251715870892
MSE = 1.3026225885271114
MSE = 1.3068176757029466
MSE = 1.3082436620318303
MSE = 1.3086929797073603
MSE = 1.294267973614376
MSE = 1.3030153039305588
MSE = 1.298763286329454
MSE = 1.2933768873836964
MSE = 1.2923834189092662
MSE = 1.2923129425017659
MSE = 1.2681034276828804
MSE = 1.2867813928446419
MSE = 1.2898642401478653
MSE = 1.2916810753625214
MSE = 1.290576569394126
MSE = 1.2911661154622958
MSE = 1.2900320506174994
MSE = 1.2892082700378702
MSE = 1.2885779672951738
MSE = 1.2887383122967884
MSE = 1.2889769333730619
MSE = 1.2895425509351979
MSE = 1.2898682309386658
MSE = 1.29032624896914
MSE = 1.289166756806326
MSE = 1.2898566288762834
MSE = 1.2893411783571764
MSE = 1.28

(array([ 3.31651145, -0.00844758,  0.02683877, ..., -0.00867198,
         0.01871117, -0.00918939]),
 1.4641853485509875,
 {'grad': array([ 2.78781354e-05, -5.95337578e-09, -3.83187053e-08, ...,
         -7.95177502e-09, -1.55812360e-08,  1.05613356e-08]),
  'task': b'CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH',
  'funcalls': 57,
  'nit': 48,
  'warnflag': 0})

## Complete latent factor model 

In [14]:
alpha =  sum([d[2] for d in data]) / len(data)
userBiases = defaultdict(float)
itemBiases = defaultdict(float)
userGamma = {}
itemGamma = {}

In [15]:
K = 2

In [16]:
for u in reviewsPerUser:
    userGamma[u] = [random.random() * 0.1 - 0.05 for k in range(K)]
for i in reviewsPerItem:
    itemGamma[i] = [random.random() * 0.1 - 0.05 for k in range(K)]

In [17]:
def unpack(theta):   
    global alpha
    global userBiases
    global itemBiases
    global userGamma
    global itemGamma
    index = 0
    alpha = theta[index]
    index += 1
    userBiases = dict(zip(users, theta[index:index+nUsers]))
    index += nUsers
    itemBiases = dict(zip(items, theta[index:index+nItems]))
    index += nItems
    for u in users:
        userGamma[u] = theta[index:index+K]
        index += K
    for i in items:
        itemGamma[i] = theta[index:index+K]
        index += K

In [18]:
def inner(x, y):
    return sum([a*b for a,b in zip(x,y)])

In [19]:
def prediction(user, item):
    return alpha + userBiases[user] + itemBiases[item] + inner(userGamma[user], itemGamma[item])

In [20]:
def cost(theta, labels, lamb):
    unpack(theta)
    predictions = [prediction(d[0], d[1]) for d in data]
    cost = MSE(predictions, labels)
    print("MSE = " + str(cost))
    for u in users:
        cost += lamb*userBiases[u]**2
        for k in range(K):
            cost += lamb*userGamma[u][k]**2
    for i in items:
        cost += lamb*itemBiases[i]**2
        for k in range(K):
            cost += lamb*itemGamma[i][k]**2
    return cost

In [21]:
def derivative(theta, labels, lamb):
    unpack(theta)
    N = len(data)
    dalpha = 0
    dUserBiases = defaultdict(float)
    dItemBiases = defaultdict(float)
    dUserGamma = {}
    dItemGamma = {}
    for u in reviewsPerUser:
        dUserGamma[u] = [0.0 for k in range(K)]
    for i in reviewsPerItem:
        dItemGamma[i] = [0.0 for k in range(K)]
    for d in data:
        u,i = d[0], d[1]
        pred = prediction(u, i)
        diff = pred - d[2]
        dalpha += 2/N*diff
        dUserBiases[u] += 2/N*diff
        dItemBiases[i] += 2/N*diff
        for k in range(K):
            dUserGamma[u][k] += 2/N*itemGamma[i][k]*diff
            dItemGamma[i][k] += 2/N*userGamma[u][k]*diff
    for u in userBiases:
        dUserBiases[u] += 2*lamb*userBiases[u]
        for k in range(K):
            dUserGamma[u][k] += 2*lamb*userGamma[u][k]
    for i in itemBiases:
        dItemBiases[i] += 2*lamb*itemBiases[i]
        for k in range(K):
            dItemGamma[i][k] += 2*lamb*itemGamma[i][k]
    dtheta = [dalpha] + [dUserBiases[u] for u in users] + [dItemBiases[i] for i in items]
    for u in users:
        dtheta += dUserGamma[u]
    for i in items:
        dtheta += dItemGamma[i]
    return numpy.array(dtheta)

In [22]:
scipy.optimize.fmin_l_bfgs_b(cost, [alpha] + # Initialize alpha
                                   [0.0]*(nUsers+nItems) + # Initialize beta
                                   [random.random() * 0.1 - 0.05 for k in range(K*(nUsers+nItems))], # Gamma
                             derivative, args = (labels, 0.001))

MSE = 1.793077624531897
MSE = 1.7364904992624735
MSE = 3.022247568264985
MSE = 1.6820334149774039
MSE = 1.5462013523386424
MSE = 1.5436182874447077
MSE = 1.5339822382166852
MSE = 1.4281174587497003
MSE = 1.3788760616968054
MSE = 1.3353908056682389
MSE = 1.317377988094165
MSE = 1.295659097635261
MSE = 1.2886645892604183
MSE = 1.2941245880131866
MSE = 1.2980712819475477
MSE = 1.3005444643291082
MSE = 1.3029727065926633
MSE = 1.294425482362959
MSE = 1.2887002515796606
MSE = 1.2839539686833157
MSE = 1.2820820484960798
MSE = 1.2862037421259636
MSE = 1.287443438127511
MSE = 1.2887584573052338
MSE = 1.290036610990684
MSE = 1.289761915279949
MSE = 1.2880751741052996
MSE = 1.2874786876845201
MSE = 1.2875287254245396
MSE = 1.2882136059971223
MSE = 1.288544085165325
MSE = 1.2887433650910787
MSE = 1.289160990808683
MSE = 1.2892512182991125
MSE = 1.2878943590518963
MSE = 1.289005765713097
MSE = 1.2890327326555933
MSE = 1.2891374335779524
MSE = 1.2885965034904212
MSE = 1.2905719732958836
MSE = 1.288

(array([ 3.31650101e+00, -8.44328403e-03,  2.68601752e-02, ...,
         7.59092016e-06, -3.25352584e-06,  6.02116384e-06]),
 1.4641853539213276,
 {'grad': array([ 9.74672190e-06,  3.08381540e-09,  2.99047675e-08, ...,
          1.55056668e-08, -6.56288632e-09,  1.18736817e-08]),
  'task': b'CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL',
  'funcalls': 69,
  'nit': 61,
  'warnflag': 0})

## BPR model

In [28]:
import cornac
import math
from sklearn.metrics import mean_squared_error
SEED = 42
NUM_FACTORS = 2000
NUM_EPOCHS = 1000

In [29]:
train = trainf
test = dataf

In [32]:
train_set = cornac.data.Dataset.from_uir(train.itertuples(index=False), seed=SEED)



In [33]:
bpr = cornac.models.BPR(
    k=NUM_FACTORS,
    max_iter=NUM_EPOCHS,
    learning_rate=0.01,
    lambda_reg=0.001,
    verbose=True,
    seed=SEED
)

In [34]:
bpr.fit(train_set)

100%|██████████| 1000/1000 [10:13<00:00,  1.63it/s, correct=99.87%, skipped=0.02%]

Optimization finished!





<cornac.models.bpr.recom_bpr.BPR at 0x123403450>

In [35]:
def predict_rating(
        model,
        data,
        usercol,
        itemcol,
        predcol,
):
    uid_map = model.train_set.uid_map
    iid_map = model.train_set.iid_map
    predictions = [
        [
            getattr(row, usercol),
            getattr(row, itemcol),
            model.rate(user_idx=uid_map.get(getattr(row, usercol), len(uid_map)),
                       item_idx=iid_map.get(getattr(row, itemcol), len(iid_map)))
        ]
        for row in data.itertuples()
    ]
    predictions = pd.DataFrame(data=predictions, columns=[usercol, itemcol, predcol])
    return predictions

In [36]:
prediction = predict_rating(bpr, train, usercol='reviewerID', itemcol='asin', predcol='prediction')

### MSE for Test set

In [37]:
def merge_rating_true_pred(
    rating_true,
    rating_pred,
    col_user,
    col_item,
    col_rating,
    col_prediction,
):

    # pd.merge will apply suffixes to columns which have the same name across both dataframes
    suffixes = ["_true", "_pred"]
    rating_true_pred = pd.merge(
        rating_true, rating_pred, on=[col_user, col_item], suffixes=suffixes
    )
    if col_rating in rating_pred.columns:
        col_rating = col_rating + suffixes[0]
    if col_prediction in rating_true.columns:
        col_prediction = col_prediction + suffixes[1]
    return rating_true_pred[col_rating], rating_true_pred[col_prediction]

In [38]:
def rmse(
    rating_true,
    rating_pred,
    col_user,
    col_item,
    col_rating,
    col_prediction,
):
    y_true, y_pred = merge_rating_true_pred(
        rating_true=rating_true,
        rating_pred=rating_pred,
        col_user=col_user,
        col_item=col_item,
        col_rating=col_rating,
        col_prediction=col_prediction,
    )
    return numpy.sqrt(mean_squared_error(y_true, y_pred))


In [39]:
print(rmse(test, prediction, 'reviewerID','asin', 'overall', 'prediction'))

0.834058282640287
