In [1]:
import gzip
import math
import os
import pickle
import random
import string
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import scipy.optimize
import tensorflow as tf
from sklearn import linear_model
from sklearn import svm
from tqdm import tqdm

In [2]:
# Load the pickled US reviews collection into memory.
# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
with open("us_reviews_data.pkl",'rb') as reviews_file:
    us_reviews = pickle.load(reviews_file)

In [3]:
# Number of entries loaded from us_reviews_data.pkl.
len(us_reviews)

4896184

In [6]:
# Load the pickled restaurant-review training split.
# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
with open("us_reviews_restuarants_train.pkl",'rb') as train_file:
    us_restaurant_train = pickle.load(train_file)

In [8]:
# Load the pickled restaurant-review held-out split (later cut into
# validation and test halves).
# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
with open("us_reviews_restuarants_test.pkl",'rb') as heldout_file:
    us_restaurant_test = pickle.load(heldout_file)

In [9]:
# Number of reviews in the training split.
len(us_restaurant_train)

454830

In [10]:
# Number of reviews in the held-out split (before the validation/test cut).
len(us_restaurant_test)

125468

In [11]:
# Inspect the first training record to see which fields a review carries.
us_restaurant_train[0]

{'rating': 5.0,
 'reviewerName': 'Chris Johnson',
 'reviewText': None,
 'categories': ['American Restaurant', 'Bar'],
 'gPlusPlaceId': '103654778391814923896',
 'unixReviewTime': 1311509683,
 'reviewTime': 'Jul 24, 2011',
 'gPlusUserId': '100000524810171549476'}

In [12]:
# Inspect the first held-out record; it should expose the same fields as a
# training record.
us_restaurant_test[0]

{'rating': 5.0,
 'reviewerName': 'Chris Johnson',
 'reviewText': "They have worked miracles for me trying to cater to that big-azz agency upstairs on a moment's notice.  Love love love them.",
 'categories': ['New American Restaurant', 'Mediterranean Restaurant'],
 'gPlusPlaceId': '118154383123752939812',
 'unixReviewTime': 1294448618,
 'reviewTime': 'Jan 7, 2011',
 'gPlusUserId': '100000524810171549476'}

In [13]:
# Shuffle the training reviews in place with a dedicated, seeded generator so
# that a fresh "Restart & Run All" yields the same order every time. Using a
# private Random instance leaves the global `random` state untouched.
random.Random(42).shuffle(us_restaurant_train)

In [15]:
# Shuffle the held-out reviews in place with a dedicated, seeded generator.
# This order decides which reviews land in the validation vs. test halves, so
# it must be reproducible across fresh runs. Using a private Random instance
# leaves the global `random` state untouched.
random.Random(42).shuffle(us_restaurant_test)

In [18]:
# Midpoint of the held-out split (integer division); the following cell slices
# the list at this index to form the validation and test halves.
x = len(us_restaurant_test)//2
x

62734

In [19]:
# Cut the held-out reviews at index x: everything before it becomes the
# validation set, everything from it onward becomes the test set.
us_restaurant_valid_data, us_restaurant_test_data = (
    us_restaurant_test[:x],
    us_restaurant_test[x:],
)

In [5]:
# data_size = len(us_reviews)
# train_data = us_reviews[:int(data_size*0.8)]
# valid_data = us_reviews[int(data_size*0.8):int(data_size*0.9)]
# test_data = us_reviews[int(data_size*0.9):]

In [20]:
# Report the size of each split, in order: train, validation, test.
for split in (us_restaurant_train, us_restaurant_valid_data, us_restaurant_test_data):
    print(len(split))

454830
62734
62734


In [21]:
# Collect every training rating and take their arithmetic mean; `mu` is later
# passed to the latent factor model as the initial global offset.
train_ratings = [review['rating'] for review in tqdm(us_restaurant_train)]
mu = sum(train_ratings)/len(train_ratings)
mu

100%|███████████████████████████████| 454830/454830 [00:00<00:00, 641275.74it/s]


3.939010179627553

In [29]:
# Give every training user and place a dense integer index (numbered in order
# of first appearance) and collect the (user, place, rating) triples.
userIDs = {}
placeIDs = {}
interactions_train = []

for review in tqdm(us_restaurant_train):
    user, place = review['gPlusUserId'], review['gPlusPlaceId']
    # len(...) is evaluated before any insertion, so a new key receives the
    # next free index and an existing key keeps its current one.
    userIDs.setdefault(user, len(userIDs))
    placeIDs.setdefault(place, len(placeIDs))
    interactions_train.append((user, place, review['rating']))

100%|███████████████████████████████| 454830/454830 [00:01<00:00, 285127.35it/s]


In [31]:
# Show the first five (user id, place id, rating) training triples.
interactions_train[:5]

[('102402605559816764426', '114524455263681124737', 4.0),
 ('117078683766767369032', '101421411984715145689', 4.0),
 ('111079365364134389793', '105982662324380119076', 5.0),
 ('117595887129268694192', '112246755620018649114', 1.0),
 ('104426882443773508676', '104288077033725316535', 2.0)]

In [32]:
# Build the validation (user, place, rating) triples, plus index maps for the
# users and places seen in validation. These maps start empty, so their
# numbering is independent of the training maps.
userIDs_valid, placeIDs_valid = {}, {}
interactions_valid = []

for review in tqdm(us_restaurant_valid_data):
    uid = review['gPlusUserId']
    pid = review['gPlusPlaceId']
    if uid not in userIDs_valid:
        userIDs_valid[uid] = len(userIDs_valid)
    if pid not in placeIDs_valid:
        placeIDs_valid[pid] = len(placeIDs_valid)
    interactions_valid.append((uid, pid, review['rating']))

100%|█████████████████████████████████| 62734/62734 [00:00<00:00, 153505.50it/s]


In [33]:
# Build the test (user, place, rating) triples, plus index maps for the users
# and places seen in test. These maps start empty, so their numbering is
# independent of the training maps.
userIDs_test = dict()
placeIDs_test = dict()
interactions_test = list()

for review in tqdm(us_restaurant_test_data):
    triple = (review['gPlusUserId'], review['gPlusPlaceId'], review['rating'])
    user_key, place_key, _ = triple
    # Register first-time users/places under the next free integer index.
    if user_key not in userIDs_test:
        userIDs_test[user_key] = len(userIDs_test)
    if place_key not in placeIDs_test:
        placeIDs_test[place_key] = len(placeIDs_test)
    interactions_test.append(triple)

100%|█████████████████████████████████| 62734/62734 [00:00<00:00, 288162.15it/s]


In [None]:
##### Latent Factor Model #####
# placesPerUser = defaultdict(list)
# usersPerPlace = defaultdict(list)
# for u,p,r in interactions_train:
#     placesPerUser[u].append(p)
#     usersPerPlace[p].append(u)
# Optimizer read by trainingStep as a module-level name. The 0.1 is Adam's
# first positional argument (the learning rate).
optimizer = tf.keras.optimizers.Adam(0.1)

In [None]:
class LatentFactorModel(tf.keras.Model):
    """Latent factor rating model.

    A rating for user ``u`` and place ``p`` is predicted as
    ``alpha + betaU[u] + betaP[p] + dot(gammaU[u], gammaP[p])``.

    The sizes of the parameter tables are taken from the module-level
    ``userIDs`` and ``placeIDs`` dictionaries at construction time, so ``u``
    and ``p`` must be the integer indices stored in those dictionaries.
    """

    def __init__(self, mu, K, lamb):
        """Create the trainable parameters.

        Args:
            mu: Initial value of the global offset ``alpha``.
            K: Number of latent dimensions per user and per place.
            lamb: Regularization strength applied by ``reg``.
        """
        super(LatentFactorModel, self).__init__()
        # Initialize to average
        self.alpha = tf.Variable(mu)
        # Initialize to small random values
        self.betaU = tf.Variable(tf.random.normal([len(userIDs)],stddev=0.001))
        self.betaP = tf.Variable(tf.random.normal([len(placeIDs)],stddev=0.001))
        self.gammaU = tf.Variable(tf.random.normal([len(userIDs),K],stddev=0.001))
        self.gammaP = tf.Variable(tf.random.normal([len(placeIDs),K],stddev=0.001))
        self.lamb = lamb

    # Prediction for a single instance (useful for evaluation)
    def predict(self, u, p):
        """Return the predicted rating tensor for user index ``u`` and place index ``p``."""
        pred = self.alpha + self.betaU[u] + self.betaP[p] +\
            tf.tensordot(self.gammaU[u], self.gammaP[p], 1)
        return pred

    # Regularizer
    def reg(self):
        """Return ``lamb`` times the sum of squared biases and latent factors.

        ``alpha`` is not regularized.
        """
        # The place-side parameters are named betaP / gammaP in __init__; the
        # previous betaI / gammaI names did not exist and raised AttributeError.
        return self.lamb * (tf.reduce_sum(self.betaU**2) +\
                            tf.reduce_sum(self.betaP**2) +\
                            tf.reduce_sum(self.gammaU**2) +\
                            tf.reduce_sum(self.gammaP**2))

    # Prediction for a sample of instances
    def predictSample(self, sampleU, sampleP):
        """Return predicted ratings for parallel sequences of user and place indices.

        Args:
            sampleU: Sequence of integer user indices.
            sampleP: Sequence of integer place indices, aligned with ``sampleU``.

        Returns:
            A tensor holding one prediction per (user, place) pair.
        """
        u = tf.convert_to_tensor(sampleU, dtype=tf.int32)
        p = tf.convert_to_tensor(sampleP, dtype=tf.int32)
        beta_u = tf.nn.embedding_lookup(self.betaU, u)
        beta_p = tf.nn.embedding_lookup(self.betaP, p)
        gamma_u = tf.nn.embedding_lookup(self.gammaU, u)
        gamma_p = tf.nn.embedding_lookup(self.gammaP, p)
        # Row-wise dot product of the gathered user and place factors.
        pred = self.alpha + beta_u + beta_p +\
               tf.reduce_sum(tf.multiply(gamma_u, gamma_p), 1)
        return pred

    # Loss
    def call(self, sampleU, sampleP, sampleR):
        """Return the data loss for a sample (regularization is added by the caller).

        The value is ``tf.nn.l2_loss(pred - r) / len(sampleR)``. Since
        ``tf.nn.l2_loss`` is documented as ``sum(t ** 2) / 2``, this equals
        half of the mean squared error over the sample.
        """
        pred = self.predictSample(sampleU, sampleP)
        r = tf.convert_to_tensor(sampleR, dtype=tf.float32)
        return tf.nn.l2_loss(pred - r) / len(sampleR)

In [None]:
# Instantiate the model: the global offset starts at the training mean rating,
# with 5 latent dimensions and a regularization weight of 1e-5.
modelLFM = LatentFactorModel(mu, K=5, lamb=1e-5)

In [None]:
def trainingStep(model, interactions, Nsamples=50000):
    """Run one optimization step on a random sample of interactions.

    Args:
        model: Object callable as ``model(sampleU, sampleP, sampleR)`` to get
            the data loss, and exposing ``reg()`` and ``trainable_variables``.
        interactions: Non-empty sequence of ``(user, place, rating)`` tuples
            whose user and place keys are present in the module-level
            ``userIDs`` and ``placeIDs`` dictionaries.
        Nsamples: Number of interactions drawn per step, independently and
            with replacement. Defaults to the previously hard-coded 50000.

    Returns:
        The regularized objective on the sampled batch, as a NumPy value.

    Raises:
        ValueError: If ``Nsamples`` is not positive (the loss divides by the
            sample size).
    """
    if Nsamples <= 0:
        raise ValueError("Nsamples must be a positive integer")

    # Sampling is plain Python with nothing to differentiate, so it is done
    # before the gradient tape is opened.
    sampleU, sampleP, sampleR = [], [], []
    for _ in range(Nsamples):
        u,p,r = random.choice(interactions)
        sampleU.append(userIDs[u])
        sampleP.append(placeIDs[p])
        sampleR.append(r)

    with tf.GradientTape() as tape:
        loss = model(sampleU,sampleP,sampleR)
        loss += model.reg()
    gradients = tape.gradient(loss, model.trainable_variables)
    # Variables that did not contribute to the loss get a None gradient; skip them.
    optimizer.apply_gradients((grad, var) for
                              (grad, var) in zip(gradients, model.trainable_variables)
                              if grad is not None)
    return loss.numpy()

In [None]:
# Run `epochs` optimization steps (each on a freshly sampled batch) and report
# the objective after every 10th step.
epochs = 100
for step in range(1, epochs + 1):
    obj = trainingStep(modelLFM, interactions_train)
    if step % 10 == 0:
        print("iteration " + str(step) + ", objective = " + str(obj))

In [None]:
# Predict a rating for every validation interaction.
#
# The model is indexed by the *training* maps (userIDs / placeIDs), and a
# validation review may mention a user or place that never appeared in
# training. Such a key has no learned parameters, so its terms are dropped:
#   unseen user  -> alpha + betaP[p]
#   unseen place -> alpha + betaU[u]
#   both unseen  -> alpha
# (Direct userIDs[u] / placeIDs[p] lookups raised KeyError in that case.)
#
# Parameters are copied to NumPy once so the loop does not make one eager
# TensorFlow call per interaction.
alpha_value = float(modelLFM.alpha.numpy())
betaU_values = modelLFM.betaU.numpy()
betaP_values = modelLFM.betaP.numpy()
gammaU_values = modelLFM.gammaU.numpy()
gammaP_values = modelLFM.gammaP.numpy()

valid_data_predict = []
for u,p,r in interactions_valid:
    user_index = userIDs.get(u)
    place_index = placeIDs.get(p)
    pred = alpha_value
    if user_index is not None:
        pred = pred + float(betaU_values[user_index])
    if place_index is not None:
        pred = pred + float(betaP_values[place_index])
    if user_index is not None and place_index is not None:
        pred = pred + float(np.dot(gammaU_values[user_index], gammaP_values[place_index]))
    valid_data_predict.append(pred)
valid_predict = np.array(valid_data_predict)

In [None]:
# Ground-truth ratings for the validation reviews, in the same order as the
# validation data itself.
gt = [review['rating'] for review in tqdm(us_restaurant_valid_data)]

In [None]:
# Mean squared error of the validation predictions against the ground truth.
squared_errors = (valid_predict - gt)**2
mse = np.sum(squared_errors)/len(gt)