In [8]:
import gzip
from collections import defaultdict
import math
import scipy.optimize
from sklearn import svm
import numpy as np
import string
import random
import string
from sklearn import linear_model

In [9]:
import warnings
warnings.filterwarnings("ignore")

In [10]:
def readGz(path):
    for l in gzip.open(path, 'rt'):
        yield eval(l)

def readCSV(path):
    f = gzip.open(path, 'rt')
    f.readline()
    for l in f:
        u,b,r = l.strip().split(',')
        r = int(r)
        yield u,b,r

In [11]:
allRatings = []
for l in readCSV("train_Interactions.csv.gz"):
    allRatings.append(l)

In [12]:
ratingsTrain = allRatings[:190000]
ratingsValid = allRatings[190000:]
ratingsPerUser = defaultdict(list)
ratingsPerItem = defaultdict(list)
for u,b,r in ratingsTrain:
    ratingsPerUser[u].append((b,r))
    ratingsPerItem[b].append((u,r))



In [13]:
def Jaccard(s1, s2):
    inter = len(set(s1).intersection(set(s2)))
    union = len(set(s1).union(set(s2)))
    if union == 0:
        return 0
    return inter / union

In [14]:
def initialize_parameters(ratingsPerUser, ratingsPerItem, K, seed=42):
    np.random.seed(seed)
    beta_user = defaultdict(float)
    beta_book = defaultdict(float)
    gamma_user = defaultdict(lambda: np.zeros(K))
    gamma_book = defaultdict(lambda: np.zeros(K))
    
    for u in ratingsPerUser:
        beta_user[u] = 0.0
        gamma_user[u] = np.random.normal(scale=0.1, size=K)
    
    for b in ratingsPerItem:
        beta_book[b] = 0.0
        gamma_book[b] = np.random.normal(scale=0.1, size=K)
    
    alpha = np.mean([r for _, _, r in ratingsTrain])
    
    return alpha, beta_user, beta_book, gamma_user, gamma_book

In [22]:
def train(ratingsTrain, ratingsValid, L1, L2, L3, L4, K, epochs, learning_rate):
    # Initialize parameters
    alpha, beta_user, beta_book, gamma_user, gamma_book = initialize_parameters(ratingsPerUser, ratingsPerItem, K)
    
    for u in ratingsPerUser:
        beta_user[u] = 0.0
    for b in ratingsPerItem:
        beta_book[b] = 0.0
    for u in ratingsPerUser:
        gamma_user[u] = np.random.normal(scale=0.1, size=K)
    for b in ratingsPerItem:
        gamma_book[b] = np.random.normal(scale=0.1, size=K)
    
    last_obj = 10**9
    for epoch in range(epochs):
        np.random.shuffle(ratingsTrain)  # Shuffle training data each epoch
        for u, b, r in ratingsTrain:
            prediction = alpha + beta_user[u] + beta_book[b] + np.dot(gamma_user[u], gamma_book[b])
            error = r - prediction
            alpha += learning_rate * error / len(ratingsTrain)
            beta_user[u] += learning_rate * (error - L1 * beta_user[u])
            beta_book[b] += learning_rate * (error - L2 * beta_book[b])
            gamma_user[u] += learning_rate * (error * gamma_book[b] - L3 * gamma_user[u])
            gamma_book[b] += learning_rate * (error * gamma_user[u] - L4 * gamma_book[b])

        sse, obj = compute_objective(alpha, beta_user, beta_book, gamma_user, gamma_book, ratingsValid, L1, L2, L3, L4)
        print(f"Epoch {epoch}: SSE={sse}, Objective={obj}")
        print(last_obj - obj)
        if last_obj - obj < 0.1: break
        last_obj = obj
    
    return alpha, beta_user, beta_book, gamma_user, gamma_book

def compute_objective(alpha, beta_user, beta_book, gamma_user, gamma_book, ratingsValid, L1, L2, L3, L4):
    sse = 0
    for u, b, r in ratingsValid:
        prediction = alpha + beta_user[u] + beta_book[b] + np.dot(gamma_user[u], gamma_book[b])
        sse += (r - prediction) ** 2
    
    reg_user = L1 * sum(beta ** 2 for beta in beta_user.values())
    reg_book = L2 * sum(beta ** 2 for beta in beta_book.values())
    reg_gamma_user = L3 * sum(np.dot(gamma, gamma) for gamma in gamma_user.values())
    reg_gamma_book = L4 * sum(np.dot(gamma, gamma) for gamma in gamma_book.values())
    
    objective = sse + reg_user + reg_book + reg_gamma_user + reg_gamma_book
    return sse, objective


In [24]:
alpha, beta_user, beta_book, gamma_user, gamma_book = train(ratingsTrain, ratingsValid, 3, 21, 0, 0, 6, 25, 0.01)
validMSE = 0
for u,b,r in ratingsValid:
    bu = 0
    bi = 0
    if u in beta_user:
        bu = beta_user[u]
    if b in beta_book:
        bi = beta_book[b]
    prediction = alpha + bu + bi + np.dot(gamma_user[u], gamma_book[b])
    validMSE += (r - prediction)**2

validMSE /= len(ratingsValid)
print("Validation MSE = " + str(validMSE))


Epoch 0: SSE=16684.56753213684, Objective=16957.92280644155
999983042.0771936
Epoch 1: SSE=16425.67405595806, Objective=17049.376143074198
-91.45333663264682
Validation MSE = 1.6425674055958062


In [None]:
predictions = open("predictions_Rating.csv", 'w')
for l in open("pairs_Rating.csv"):
    if l.startswith("userID"): # header
        predictions.write(l)
        continue
    u,b = l.strip().split(',') # Read the user and item from the "pairs" file and write out your prediction
    prediction = alpha + beta_user.get(u, 0) + beta_book.get(b, 0)
    predictions.write(u + ',' + b + ',' + str(prediction) + '\n')
    
predictions.close()