In [4]:
import gzip
from collections import defaultdict
import math
import scipy.optimize
from sklearn import svm
import numpy as np
import string
import random
import string
from sklearn import linear_model

In [5]:
import warnings
warnings.filterwarnings("ignore")

In [6]:
def readGz(path):
    for l in gzip.open(path, 'rt'):
        yield eval(l)

def readCSV(path):
    f = gzip.open(path, 'rt')
    f.readline()
    for l in f:
        u,b,r = l.strip().split(',')
        r = int(r)
        yield u,b,r

In [7]:
allRatings = []
for l in readCSV("train_Interactions.csv.gz"):
    allRatings.append(l)

In [8]:
ratingsTrain = allRatings[:190000]
ratingsValid = allRatings[190000:]
ratingsPerUser = defaultdict(list)
ratingsPerItem = defaultdict(list)
for u,b,r in ratingsTrain:
    ratingsPerUser[u].append((b,r))
    ratingsPerItem[b].append((u,r))



In [9]:
def Jaccard(s1, s2):
    inter = len(set(s1).intersection(set(s2)))
    union = len(set(s1).union(set(s2)))
    if union == 0:
        return 0
    return inter / union

In [73]:
alpha = np.mean([r for _, _, r in ratingsTrain])
beta_user = defaultdict(float)
beta_book = defaultdict(float)
for u in ratingsPerUser:
    beta_user[u] = 0

for b in ratingsPerItem:
    beta_book[b] = 0

def solve(L1, L2):
    alpha = sum(rating - (beta_user[user] + beta_book[book]) for user, book, rating in ratingsTrain) / len(ratingsTrain)
    for user, items in ratingsPerUser.items():
        beta_user[user] = sum(rating - (alpha + beta_book[book]) for book, rating in items) / (L1 + len(items))
    for book, items in ratingsPerItem.items():
        beta_book[book] = sum(rating - (alpha + beta_user[user]) for user, rating in items) / (L2 + len(items))

    valid_error = [(rating - (alpha + beta_user.get(user, 0) + beta_book.get(book, 0))) ** 2 for user, book, rating in ratingsValid]
    validMSE = np.mean(valid_error).item()
    reg_user = 0
    reg_book = 0
    for user in beta_user:
        reg_user += beta_user[user] ** 2
    for book in beta_book:
        reg_book += beta_book[book] ** 2
    return (validMSE, validMSE + L1 * reg_user + L2 * reg_book)


In [74]:
def solve2(L1, L2):
    mse, minimize = solve(L1, L2)
    new_mse, new_minimize = solve(L1, L2)
    iter = 2
    while iter < 10 and minimize - new_minimize > 0.0001:
        mse, minimize = new_mse, new_minimize
        new_mse, new_minimize = solve(L1, L2)
        iter+=1

    validMSE = 0
    for u,b,r in ratingsValid:
        bu = 0
        bi = 0
        if u in beta_user:
            bu = beta_user[u]
        if b in beta_book:
            bi = beta_book[b]
        prediction = alpha + bu + bi
        validMSE += (r - prediction)**2

    validMSE /= len(ratingsValid)
    print("Validation MSE = " + str(validMSE))
    return validMSE
    


In [75]:
best = 10000
for L1 in np.arange(2, 8, 0.5):
    for L2 in np.arange(2, 8, 0.5):
        validMSA = solve2(L1, L2)
        print(L1, L2)
        print(validMSA)
        best = min(best, validMSA)


Validation MSE = 1.428909973297286
2.0 2.0
1.428909973297286
Validation MSE = 1.4340296279494604
2.0 2.5
1.4340296279494604
Validation MSE = 1.4396709580381868
2.0 3.0
1.4396709580381868
Validation MSE = 1.4415130913676943
2.0 3.5
1.4415130913676943
Validation MSE = 1.4415569725797233
2.0 4.0
1.4415569725797233
Validation MSE = 1.4413342195793477
2.0 4.5
1.4413342195793477
Validation MSE = 1.4408994419868588
2.0 5.0
1.4408994419868588
Validation MSE = 1.4402533762093452
2.0 5.5
1.4402533762093452
Validation MSE = 1.4394685704652461
2.0 6.0
1.4394685704652461
Validation MSE = 1.4387269714704114
2.0 6.5
1.4387269714704114
Validation MSE = 1.4380676676226483
2.0 7.0
1.4380676676226483
Validation MSE = 1.4374905128994184
2.0 7.5
1.4374905128994184
Validation MSE = 1.441312925189919
2.5 2.0
1.441312925189919
Validation MSE = 1.440426343233893
2.5 2.5
1.440426343233893
Validation MSE = 1.439455213935539
2.5 3.0
1.439455213935539
Validation MSE = 1.4385305077918926
2.5 3.5
1.4385305077918926


In [15]:
predictions = open("predictions_Rating.csv", 'w')
for l in open("pairs_Rating.csv"):
    if l.startswith("userID"): # header
        predictions.write(l)
        continue
    u,b = l.strip().split(',') # Read the user and item from the "pairs" file and write out your prediction
    prediction = alpha + beta_user.get(u, 0) + beta_book.get(b, 0)
    predictions.write(u + ',' + b + ',' + str(prediction) + '\n')
    
predictions.close()