In [1]:
import gzip
from collections import defaultdict
import math
import scipy.optimize
from sklearn import svm
import numpy as np
import string
import random
import string
from sklearn import linear_model

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
def readGz(path):
    for l in gzip.open(path, 'rt'):
        yield eval(l)

def readCSV(path):
    f = gzip.open(path, 'rt')
    f.readline()
    for l in f:
        u,b,r = l.strip().split(',')
        r = int(r)
        yield u,b,r

In [4]:
allRatings = []
for l in readCSV("train_Interactions.csv.gz"):
    allRatings.append(l)

In [5]:
ratingsTrain = allRatings[:190000]
ratingsValid = allRatings[190000:]
ratingsPerUser = defaultdict(list)
ratingsPerItem = defaultdict(list)
for u,b,r in ratingsTrain:
    ratingsPerUser[u].append((b,r))
    ratingsPerItem[b].append((u,r))



In [6]:
def Jaccard(s1, s2):
    inter = len(set(s1).intersection(set(s2)))
    union = len(set(s1).union(set(s2)))
    if union == 0:
        return 0
    return inter / union

In [7]:
def initialize_parameters(ratingsPerUser, ratingsPerItem, K, seed=42):
    np.random.seed(seed)
    beta_user = defaultdict(float)
    beta_book = defaultdict(float)
    gamma_user = defaultdict(lambda: np.zeros(K))
    gamma_book = defaultdict(lambda: np.zeros(K))
    
    for u in ratingsPerUser:
        beta_user[u] = 0.0
        gamma_user[u] = np.random.normal(scale=0.1, size=K)
    
    for b in ratingsPerItem:
        beta_book[b] = 0.0
        gamma_book[b] = np.random.normal(scale=0.1, size=K)
    
    alpha = np.mean([r for _, _, r in ratingsTrain])
    
    return alpha, beta_user, beta_book, gamma_user, gamma_book

In [8]:
def compute_objective(alpha, beta_user, beta_item, gamma_user, gamma_item, ratingsValid, L1, L2, L3, L4):
    sse = 0
    for u, i, r in ratingsValid:
        prediction = alpha + beta_user[u] + beta_item[i] + np.dot(gamma_user[u], gamma_item[i])
        sse += (r - prediction) ** 2
    
    reg_user = L1 * sum(beta ** 2 for beta in beta_user.values())
    reg_item = L2 * sum(beta ** 2 for beta in beta_item.values())
    reg_gamma_user = L3 * sum(np.dot(gamma, gamma) for gamma in gamma_user.values())
    reg_gamma_item = L4 * sum(np.dot(gamma, gamma) for gamma in gamma_item.values())
    
    objective = sse + reg_user + reg_item + reg_gamma_user + reg_gamma_item
    return sse, objective


In [9]:
def train(ratingsTrain, ratingsValid, ratingsPerUser, ratingsPerItem, L1, L2, L3, L4, K, epochs, learning_rate, seed=42):
    # Initialize parameters
    alpha, beta_user, beta_item, gamma_user, gamma_item = initialize_parameters(ratingsPerUser, ratingsPerItem, K, seed)
    
    for epoch in range(1, epochs + 1):
        np.random.shuffle(ratingsTrain)  # Shuffle training data each epoch
        for u, i, r in ratingsTrain:
            prediction = alpha + beta_user[u] + beta_item[i] + np.dot(gamma_user[u], gamma_item[i])
            error = r - prediction
            
            # Update alpha
            alpha += learning_rate * (error / len(ratingsTrain))
            
            # Update biases
            beta_user[u] += learning_rate * (error - L1 * beta_user[u])
            beta_item[i] += learning_rate * (error - L2 * beta_item[i])
            
            # Update latent factors
            gamma_user[u] += learning_rate * (error * gamma_item[i] - L3 * gamma_user[u])
            gamma_item[i] += learning_rate * (error * gamma_user[u] - L4 * gamma_item[i])
        
        # Compute and print objective
        sse, obj = compute_objective(alpha, beta_user, beta_item, gamma_user, gamma_item, ratingsValid, L1, L2, L3, L4)
        print(f"Epoch {epoch}: SSE={sse:.4f}, Objective={obj:.4f}")
    
    return alpha, beta_user, beta_item, gamma_user, gamma_item


In [10]:
train(ratingsTrain, ratingsValid, ratingsPerUser, ratingsPerItem, 3, 12, 1, 1, 5, 10, 0.01)

Epoch 1: SSE=16626.4534, Objective=18357.1001
Epoch 2: SSE=16351.1044, Objective=18261.3453
Epoch 3: SSE=16177.9494, Objective=18300.8628
Epoch 4: SSE=16068.2590, Objective=18387.5457
Epoch 5: SSE=15998.1931, Objective=18496.3063
Epoch 6: SSE=15933.8706, Objective=18585.2066
Epoch 7: SSE=15896.0417, Objective=18675.5739


KeyboardInterrupt: 

In [None]:
predictions = open("predictions_Rating.csv", 'w')
for l in open("pairs_Rating.csv"):
    if l.startswith("userID"): # header
        predictions.write(l)
        continue
    u,b = l.strip().split(',') # Read the user and item from the "pairs" file and write out your prediction
    prediction = alpha + beta_user.get(u, 0) + beta_book.get(b, 0)
    predictions.write(u + ',' + b + ',' + str(prediction) + '\n')
    
predictions.close()