In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
! pip install surprise



In [None]:
import gzip
from collections import defaultdict
from sklearn import linear_model
import csv
import pandas as pd
import random
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np
# from implicit import bpr
from surprise import SVD, Reader, Dataset
from sklearn.model_selection import train_test_split

%matplotlib inline
plt.rcParams.update({'figure.figsize':(7,5), 'figure.dpi':100})

In [None]:

def readGz(path):
    """Yield one Python object per line of a gzip-compressed text file.

    Each line is evaluated as a Python expression. The file is opened in text
    mode and is closed once the generator is exhausted or closed.

    SECURITY: ``eval`` executes whatever code the file contains, so this must
    only be used on trusted data. It is kept as-is so that existing inputs
    parse exactly as before.
    """
    with gzip.open(path, 'rt') as f:
        for l in f:
            yield eval(l)

def readCSV(path):
    """Yield ``(user_id, recipe_id, row)`` for each data row of a gzipped CSV.

    The first line of the file is the header. Every following line is zipped
    with the header into a dict ``row`` (all values are strings, as produced
    by ``csv.reader``). An empty file yields nothing.

    Raises:
        KeyError: if the header has no ``user_id`` or ``recipe_id`` column.
    """
    # newline='' leaves line-ending handling to the csv module; the `with`
    # block closes the file when the generator finishes or is closed.
    with gzip.open(path, 'rt', newline='') as f:
        c = csv.reader(f)
        header = next(c, None)
        if header is None:
            # Empty file: a bare next(c) would raise StopIteration inside the
            # generator, which Python turns into a RuntimeError.
            return
        for l in c:
            d = dict(zip(header, l))
            yield d['user_id'], d['recipe_id'], d


In [None]:
# Directory holding the assignment data files. This is a relative path, so it
# resolves against the notebook's current working directory.
data_dir = "drive/MyDrive/assignment1_data/"

In [None]:

### Rating baseline: per-user average rating, with the global average as the
### fallback for any user that has no entry in userAverage.
# NOTE(review): these statistics are computed over every row of
# trainInteractions, including the rows that the later train_test_split call
# holds out for validation.

allRatings = []
userRatings = defaultdict(list)
userRecipeDict = defaultdict(set)
totalRecipe = set()
dataset = []

for user, recipe, d in readCSV(data_dir + "trainInteractions.csv.gz"):
    r = int(d['rating'])
    allRatings.append(r)
    userRatings[user].append(r)
    userRecipeDict[user].add(recipe)
    totalRecipe.add(recipe)
    # Third field is a constant 1; the rating is stored last.
    dataset.append((user, recipe, 1, r))

globalAverage = sum(allRatings) / len(allRatings)

# Only users with more than 10 ratings get their own average.
userAverage = {
    u: sum(ratings) / len(ratings)
    for u, ratings in userRatings.items()
    if len(ratings) > 10
}


In [None]:
globalAverage

4.580794

In [None]:
train_size = 400000  # not referenced by the split below

# Features are (user, recipe) pairs; the target is the rating (last field).
X = [(user, recipe) for user, recipe, _, _ in dataset]
y = [rating for _, _, _, rating in dataset]

# Hold out 20% of the interactions; fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


## Question 1 

In [None]:
import pandas as pd

In [None]:
# One row per training interaction: (user_id, recipe_id, rating).
data = pd.DataFrame(
    [(user, recipe, rating) for (user, recipe), rating in zip(X_train, y_train)],
    columns=["user_id", "recipe_id", "rating"],
)

In [None]:
data.head()

Unnamed: 0,user_id,recipe_id,rating
0,35619582,44587855,0
1,50554282,51482760,5
2,93702895,9843811,4
3,93484810,80488857,5
4,72132392,62506910,5


In [None]:

# Per-user and per-recipe interaction counts, computed once and reused below.
user_counts = data.groupby("user_id").size()
recipe_counts = data.groupby("recipe_id").size()

median_no_rating = user_counts.median()
median_recipe_count = recipe_counts.median()

# assign() overwrites the count columns if they already exist, so this cell
# can be re-run safely (join() raises on the overlapping column names the
# second time it is executed).
data = data.assign(
    user_count=data["user_id"].map(user_counts),
    recipe_count=data["recipe_id"].map(recipe_counts),
)

# Keep rows whose user AND recipe each have at least 4x the median count.
is_active = (data["user_count"] >= 4 * median_no_rating) & (
    data["recipe_count"] >= 4 * median_recipe_count
)
active_user_data = data[is_active]

# Ids that survive the filter; these define the model's user/recipe vocabulary.
active_userid = set(active_user_data["user_id"])
active_recipe = set(active_user_data["recipe_id"])

In [None]:
data.groupby("user_id").size().max()

3809

In [None]:
median_recipe_count, median_no_rating

(1.0, 10.0)

In [None]:
active_user_data.shape

(148985, 5)

In [None]:
# Filtered interactions as plain Python rows of [user_id, recipe_id, rating].
train_data_active = active_user_data[["user_id", "recipe_id", "rating"]].values.tolist()

## Question 9

In [None]:
# Mean rating over the training split; passed to the model as its initial offset.
mu_train = sum(y_train) / len(y_train)
mu_train

4.580835

In [None]:
import tensorflow as tf
# train_data[0]

In [None]:
# Shared Adam optimizer used by trainingStep; 0.01 is passed as its first
# positional argument (the learning rate).
optimizer = tf.keras.optimizers.Adam(0.01)

In [None]:
# Collect every rating each recipe received in the active training subset.
recipeRating = defaultdict(list)
for _, recipe, rating in train_data_active:
    recipeRating[recipe].append(rating)

# Average rating per recipe, stored only for recipes with at least 10 ratings.
# recipeAverage is a defaultdict(float), so a missing recipe reads as 0.0.
recipeAverage = defaultdict(float)
for recipe, ratings in recipeRating.items():
    if len(ratings) >= 10:
        recipeAverage[recipe] = sum(ratings) / len(ratings)

In [None]:
# Forward maps: raw id -> dense index, in the iteration order of each set.
userIDs = {userId: idx for idx, userId in enumerate(active_userid)}
recipeIds = {recipe: idx for idx, recipe in enumerate(active_recipe)}

# Reverse maps: dense index -> raw id, derived from the forward maps.
userRevMap = {idx: userId for userId, idx in userIDs.items()}
recipeIdRevMap = {idx: recipe for recipe, idx in recipeIds.items()}

In [None]:
class LatentFactorModel(tf.keras.Model):
    """Latent-factor rating model.

    A prediction is ``alpha + betaU[u] + betaI[i] + gammaU[u] . gammaI[i]``,
    where ``u`` and ``i`` are dense indices. The parameter tables are sized
    from the module-level ``userIDs`` and ``recipeIds`` maps.
    """

    def __init__(self, mu, K, lamb):
        """Create the parameters.

        Args:
            mu: initial value of the global offset ``alpha``.
            K: number of latent dimensions per user / recipe.
            lamb: stored as ``self.lamb``; note that ``reg()`` uses its own
                fixed coefficients and does not read this value.
        """
        super().__init__()
        n_users = len(userIDs)
        n_items = len(recipeIds)
        # Global offset starts at the supplied mean.
        self.alpha = tf.Variable(mu)
        # Biases and factors start as small random values.
        self.betaU = tf.Variable(tf.random.normal([n_users], stddev=0.01))
        self.betaI = tf.Variable(tf.random.normal([n_items], stddev=0.01))
        self.gammaU = tf.Variable(tf.random.normal([n_users, K], stddev=0.01))
        self.gammaI = tf.Variable(tf.random.normal([n_items, K], stddev=0.01))
        self.lamb = lamb

    def predict(self, u, i):
        """Return the predicted rating for one (user index, recipe index) pair."""
        bias = self.alpha + self.betaU[u] + self.betaI[i]
        interaction = tf.tensordot(self.gammaU[u], self.gammaI[i], 1)
        return bias + interaction

    def reg(self):
        """Return the weighted sum of squared parameters (alpha is excluded).

        The user bias is weighted by 0.0001; the recipe bias and both factor
        tables are weighted by 0.001.
        """
        penalty = 0.0001 * tf.reduce_sum(self.betaU**2)
        for weights in (self.betaI, self.gammaU, self.gammaI):
            penalty = penalty + 0.001 * tf.reduce_sum(weights**2)
        return penalty

    def predictSample(self, sampleU, sampleI):
        """Return predictions for parallel sequences of user and recipe indices."""
        u = tf.convert_to_tensor(sampleU, dtype=tf.int32)
        i = tf.convert_to_tensor(sampleI, dtype=tf.int32)
        bias = (
            self.alpha
            + tf.nn.embedding_lookup(self.betaU, u)
            + tf.nn.embedding_lookup(self.betaI, i)
        )
        gamma_u = tf.nn.embedding_lookup(self.gammaU, u)
        gamma_i = tf.nn.embedding_lookup(self.gammaI, i)
        # Row-wise dot product of the two factor matrices.
        interaction = tf.reduce_sum(gamma_u * gamma_i, axis=1)
        return bias + interaction

    def call(self, sampleU, sampleI, sampleR):
        """Return ``tf.nn.l2_loss`` of the residuals divided by the batch size.

        ``tf.nn.l2_loss`` is half the sum of squares, so this value is half
        the batch mean squared error.
        """
        r = tf.convert_to_tensor(sampleR, dtype=tf.float32)
        residual = self.predictSample(sampleU, sampleI) - r
        return tf.nn.l2_loss(residual) / len(sampleR)


In [None]:
def trainingStep(interactions, Nsamples=50000):
    """Run one optimizer step on a random mini-batch and return the objective.

    Uses the module-level ``model``, ``optimizer``, ``userIDs`` and
    ``recipeIds``.

    Args:
        interactions: sequence of ``(user_id, recipe_id, rating)`` triples
            whose ids are keys of ``userIDs`` / ``recipeIds``.
        Nsamples: number of interactions drawn uniformly with replacement for
            this step (defaults to the previously hard-coded 50000).

    Returns:
        The batch loss plus the regularizer, as a NumPy scalar.

    Raises:
        IndexError: if ``interactions`` is empty.
        KeyError: if a sampled id is missing from the index maps.
    """
    # Sampling is plain Python and needs no gradient tracking, so it happens
    # before the tape is opened; one choices() call replaces Nsamples
    # individual choice() calls.
    batch = random.choices(interactions, k=Nsamples)
    sampleU = [userIDs[u] for u, _, _ in batch]
    sampleI = [recipeIds[i] for _, i, _ in batch]
    sampleR = [r for _, _, r in batch]

    with tf.GradientTape() as tape:
        loss = model(sampleU, sampleI, sampleR)
        loss += model.reg()

    gradients = tape.gradient(loss, model.trainable_variables)
    # Skip variables that did not take part in the loss (gradient is None).
    grads_and_vars = [
        (grad, var)
        for grad, var in zip(gradients, model.trainable_variables)
        if grad is not None
    ]
    optimizer.apply_gradients(grads_and_vars)
    return loss.numpy()


In [None]:
# Training triples and labels come from the active subset; validation pairs
# and labels are copies of the held-out split.
trainData = [tuple(row) for row in train_data_active]
train_labels = [row[2] for row in train_data_active]
valData = list(X_test)
val_labels = list(y_test)

In [None]:
len(val_labels)

100000

In [None]:
# 100 mini-batch training steps on a model with 256 latent dimensions.
model = LatentFactorModel(mu_train, 256, 0.0002)

for i in range(100):
    obj = trainingStep(trainData)
    print("iteration %d, objective = %s" % (i, obj))


iteration 0, objective = 1.021687
iteration 1, objective = 0.62277114
iteration 2, objective = 0.4204625
iteration 3, objective = 0.46294641
iteration 4, objective = 0.555688
iteration 5, objective = 0.5787035
iteration 6, objective = 0.5471002
iteration 7, objective = 0.4839044
iteration 8, objective = 0.4471284
iteration 9, objective = 0.42574608
iteration 10, objective = 0.4141975
iteration 11, objective = 0.39965934
iteration 12, objective = 0.40172973
iteration 13, objective = 0.39251924
iteration 14, objective = 0.38497174
iteration 15, objective = 0.38278458
iteration 16, objective = 0.36155355
iteration 17, objective = 0.36192504
iteration 18, objective = 0.35479853
iteration 19, objective = 0.35511798
iteration 20, objective = 0.3415732
iteration 21, objective = 0.3404727
iteration 22, objective = 0.34419113
iteration 23, objective = 0.32937884
iteration 24, objective = 0.33129236
iteration 25, objective = 0.32034189
iteration 26, objective = 0.32921845
iteration 27, objective

In [None]:
def MSE(predictions, labels):
    """Return the mean squared error between two equal-length sequences.

    Args:
        predictions: sized sequence of predicted values.
        labels: sized sequence of true values, aligned with ``predictions``.

    Raises:
        ValueError: if the lengths differ (``zip`` would otherwise silently
            drop the unmatched tail and report a misleading score) or if the
            inputs are empty.
    """
    if len(predictions) != len(labels):
        raise ValueError(
            "predictions and labels differ in length: %d vs %d"
            % (len(predictions), len(labels))
        )
    if len(predictions) == 0:
        raise ValueError("MSE is undefined for empty input")
    differences = [(x - y) ** 2 for x, y in zip(predictions, labels)]
    return sum(differences) / len(differences)


In [None]:

# Compare the model with the user-average baseline on the first 2000 training
# interactions.
train_predictions = []
train_labels_active = []
orginalPred = []
pair = []
p_count, np_count = 0, 0

for j, (u, i, _) in enumerate(tqdm(trainData[:2000])):
    if u in active_userid and i in active_recipe:
        p_count += 1
        pair.append((userIDs[u], recipeIds[i]))
        train_labels_active.append(train_labels[j])
        # The baseline is recorded in the same branch as the label so both
        # lists stay aligned row for row.
        orginalPred.append(userAverage.get(u, globalAverage))
    else:
        np_count += 1

train_predictions = model.predictSample([p[0] for p in pair], [p[1] for p in pair]).numpy()
print("\n", p_count, np_count, len(train_predictions), len(train_labels_active), MSE(train_predictions, train_labels_active))
print(MSE(orginalPred, train_labels_active))

100%|██████████| 2000/2000 [00:00<00:00, 277483.64it/s]


 2000 0 2000 2000 0.5499088622715448
0.5636414198252752





In [None]:
# Compare the model with the user-average baseline on the validation pairs
# whose user and recipe are both in the model's vocabulary.
val_predictions = []
val_labels_active = []
orginalPred = []
pair = []
p_count, np_count = 0, 0

for j, (u, i) in enumerate(tqdm(valData)):
    if u in active_userid and i in active_recipe:
        p_count += 1
        pair.append((userIDs[u], recipeIds[i]))
        val_labels_active.append(val_labels[j])
        orginalPred.append(userAverage.get(u, globalAverage))
    else:
        # Count the pairs that are skipped so the printed np_count is the
        # real number of uncovered pairs instead of always 0.
        np_count += 1

val_predictions = model.predictSample([p[0] for p in pair], [p[1] for p in pair]).numpy()


print("\n", p_count, np_count, len(val_predictions), len(val_labels_active), len(orginalPred), MSE(val_predictions, val_labels_active))
print(MSE(orginalPred, val_labels_active))

100%|██████████| 100000/100000 [00:00<00:00, 597780.65it/s]



 31443 0 31443 31443 31443 0.5861009934204994
0.5766266840052291


In [None]:

# Hybrid evaluation over the whole validation split: pairs the model knows
# are scored by the model, all other pairs by the user-average baseline.
# In this cell `predictions` / `val_labels_active` hold the baseline-scored
# pairs, while `pair` / `pair_val` hold the model-scored pairs.
predictions = []
val_labels_active = []
pair = []
pair_val = []
p_count, np_count = 0, 0

for j, (u, i) in enumerate(tqdm(valData)):
    label = val_labels[j]
    if not (u in active_userid and i in active_recipe):
        np_count += 1
        predictions.append(userAverage.get(u, globalAverage))
        val_labels_active.append(label)
        continue
    p_count += 1
    pair.append((userIDs[u], recipeIds[i]))
    pair_val.append(label)


pPred = model.predictSample([p[0] for p in pair], [p[1] for p in pair]).numpy()

# Baseline-scored rows first, model-scored rows second, in both lists.
all_predictions = predictions + list(pPred)
all_labels = val_labels_active + pair_val
print("\n", p_count, np_count, len(all_predictions), len(all_labels), MSE(all_predictions, all_labels))

100%|██████████| 100000/100000 [00:00<00:00, 473238.04it/s]



 31443 68557 100000 100000 0.7934786710064207


In [None]:
# Baseline-only evaluation: every validation pair is scored with the user's
# average rating, or the global average when the user has none.
predictions = []
val_labels_active = []
p_count, np_count = 0, 0

for j, (u, i) in enumerate(tqdm(valData)):
    np_count += 1
    predictions.append(userAverage.get(u, globalAverage))
    val_labels_active.append(val_labels[j])

# predictions =\
#     [model.predict(userIDs[u],recipeIds[i]).numpy() for u,i,_ in tqdm(valData[:10000])]
print("\n", p_count, np_count, MSE(predictions, val_labels_active))

100%|██████████| 100000/100000 [00:00<00:00, 660007.90it/s]


 0 100000 0.7904996638967468





In [None]:
print(p_count, np_count)

0 100000


In [None]:
print("MSE for bias only predictions ", MSE(predictions, val_labels_active))


MSE for bias only predictions  0.7904996638967468


In [None]:
# Copy of the baseline predictions, one entry per (prediction, label) pair.
predictions_new = [pred for pred, _ in zip(predictions, val_labels_active)]

MSE(predictions_new, val_labels_active)

0.7904996638967468

In [None]:
# predictions = open(data_dir+"predictions_Rated.txt", 'w')
# count = 0
# nonpred = 0
# for l in open(data_dir+"stub_Rated.txt"):
#   if l.startswith("user_id"):
#     #header
#     predictions.write(l)
#     continue
#   u,i = l.strip().split('-')
#   pred = 0 
#   if u in userIDs and i in recipeIds:
#     count += 1
#     pred = model.predict(userIDs[u],recipeIds[i]).numpy()
#   else:
#     nonpred += 1
#     if u in userAverage:
#       pred = userAverage[u]
#     elif i in recipeAverage:
#       pred = recipeAverage[i]
#     else:
#       pred = globalAverage
#   if pred > 4.85:
#     pred = 5
#   predictions.write(u + '-' + i + ',' + str(pred) + '\n')

# predictions.close()


In [None]:
# `count` and `nonpred` are assigned only inside the commented-out
# prediction-writing cell above, so evaluating them raises NameError on a
# fresh kernel. Re-enable this line together with that cell.
# count, nonpred

NameError: ignored