In [2]:
import gzip
from collections import defaultdict
import math
import scipy.optimize
from sklearn import svm
import numpy as np
import string
import random
import string
from sklearn import linear_model

In [3]:
import warnings
warnings.filterwarnings("ignore")

In [4]:
def readGz(path):
    for l in gzip.open(path, 'rt'):
        yield eval(l)

def readCSV(path):
    f = gzip.open(path, 'rt')
    f.readline()
    for l in f:
        u,b,r = l.strip().split(',')
        r = int(r)
        yield u,b,r

In [5]:
allRatings = []
for l in readCSV("train_Interactions.csv.gz"):
    allRatings.append(l)

In [6]:
ratingsTrain = allRatings[:190000]
ratingsValid = allRatings[190000:]
ratingsPerUser = defaultdict(list)
ratingsPerItem = defaultdict(list)
for u,b,r in ratingsTrain:
    ratingsPerUser[u].append((b,r))
    ratingsPerItem[b].append((u,r))



In [7]:
def Jaccard(s1, s2):
    inter = len(set(s1).intersection(set(s2)))
    union = len(set(s1).union(set(s2)))
    if union == 0:
        return 0
    return inter / union

In [54]:
import tensorflow as tf
import tensorflow_recommenders as tfrs

# Define user and item embeddings
class SimpleModel(tfrs.Model):
    def __init__(self):
        super().__init__()
        self.user_embedding = tf.keras.Sequential([tf.keras.layers.Embedding(1000, 32)])
        self.item_embedding = tf.keras.Sequential([tf.keras.layers.Embedding(1000, 32)])
        self.task = tfrs.tasks.Retrieval(metrics=tfrs.metrics.FactorizedTopK())

    def compute_loss(self, features, training=False):
        user_embeddings = self.user_embedding(features["user_id"])
        item_embeddings = self.item_embedding(features["item_id"])
        return self.task(user_embeddings, item_embeddings)

In [47]:
K = 7
alpha = np.mean([r for _, _, r in ratingsTrain])
beta_user = defaultdict(float)
beta_book = defaultdict(float)
gamma_user = defaultdict(lambda: np.zeros(K))
gamma_book = defaultdict(lambda: np.zeros(K))

for u in ratingsPerUser:
    beta_user[u] = 0.0
    gamma_user[u] = np.random.normal(scale=0.1, size=K)

for b in ratingsPerItem:
    beta_book[b] = 0.0
    gamma_book[b] = np.random.normal(scale=0.1, size=K)

alpha = np.mean([r for _, _, r in ratingsTrain])
for u in ratingsPerUser:
    beta_user[u] = 0

for b in ratingsPerItem:
    beta_book[b] = 0

def solve(L1, L2, L3):
    newAlpha = 0
    # Update global bias
    for u, b, r in ratingsTrain:
        newAlpha += r - (beta_user[u] + beta_book[b] + np.dot(gamma_user[u], gamma_book[b]))
    alpha = newAlpha / len(ratingsTrain)
           # Update user and book biases
    for u in ratingsPerUser:
        newBetaU = 0
        for b, r in ratingsPerUser[u]:
            pred = alpha + beta_user[u] + beta_book[b] + np.dot(gamma_user[u], gamma_book[b])
            newBetaU += r - pred
        beta_user[u] = newBetaU / (L1 + len(ratingsPerUser[u]))

    for b in ratingsPerItem:
        newBetaI = 0
        for u, r in ratingsPerItem[b]:
            pred = alpha + beta_user[u] + beta_book[b] + np.dot(gamma_user[u], gamma_book[b])
            newBetaI += r - pred
        beta_book[b] = newBetaI / (L2 + len(ratingsPerItem[b]))

    # Update latent factors using SGD
    for u, b, r in ratingsTrain:
        pred = alpha + beta_user[u] + beta_book[b] + np.dot(gamma_user[u], gamma_book[b])
        err = r - pred

        # Gradient updates
        gamma_user[u] += 0.01 * (err * gamma_book[b] - L3 * gamma_user[u])
        gamma_book[b] += 0.01 * (err * gamma_user[u] - L3 * gamma_book[b])


    validMSE = 0
    for u, b, r in ratingsTrain:
        pred = alpha + beta_user[u] + beta_book[b] + np.dot(gamma_user[u], gamma_book[b])
        validMSE += (r - pred) ** 2
    reg_user = sum(beta_user[u] ** 2 for u in beta_user) 
    reg_book = sum(beta_book[b] ** 2 for b in beta_book)

    return (validMSE, validMSE + L1 * reg_user + L2 * reg_book + L3 * (np.linalg.norm(gamma_user[u]) ** 2 + np.linalg.norm(gamma_book[b]) ** 2))


In [48]:
def solve2(L1, L2, L3):
    mse, minimize = solve(L1, L2, L3)
    new_mse, new_minimize = solve(L1, L2, L3)
    iter = 2
    while iter < 10 and minimize - new_minimize > 0.0001:
        mse, minimize = new_mse, new_minimize
        new_mse, new_minimize = solve(L1, L2, L3)
        iter+=1

    validMSE = 0
    for u,b,r in ratingsValid:
        bu = 0
        bi = 0
        if u in beta_user:
            bu = beta_user[u]
        if b in beta_book:
            bi = beta_book[b]
        prediction = alpha + bu + bi + np.dot(gamma_user[u], gamma_book[b])
        validMSE += (r - prediction)**2

    validMSE /= len(ratingsValid)
    print("Validation MSE = " + str(validMSE))
    return validMSE
    


In [50]:
print(solve2(4, 18, 0.005))


Validation MSE = 1.5022850042058067
1.5022850042058067


In [None]:
predictions = open("predictions_Rating.csv", 'w')
for l in open("pairs_Rating.csv"):
    if l.startswith("userID"): # header
        predictions.write(l)
        continue
    u,b = l.strip().split(',') # Read the user and item from the "pairs" file and write out your prediction
    prediction = alpha + beta_user.get(u, 0) + beta_book.get(b, 0)
    predictions.write(u + ',' + b + ',' + str(prediction) + '\n')
    
predictions.close()