In [None]:
# 0.81125 - Full
# 0.8242344 - Partial

import gzip
from collections import defaultdict
from sklearn import linear_model
import csv
import scipy
import numpy

def readGz(path):
    """Lazily yield one evaluated Python object per line of a gzip text file.

    Args:
        path: Path to a gzip-compressed text file in which every line is a
            Python expression (e.g. a dict literal).

    Yields:
        The result of evaluating each line, in file order.
    """
    # The context manager closes the gzip handle once the generator is
    # exhausted, closed, or garbage collected (the original leaked it).
    with gzip.open(path, 'rt') as f:
        for l in f:
            # SECURITY: eval() executes arbitrary code found in the file. Only
            # use this on trusted data; if every line is a plain literal,
            # ast.literal_eval is the safer choice.
            yield eval(l)

def readCSV(path):
    """Lazily read a gzip-compressed CSV file that has a header row.

    Args:
        path: Path to the gzip-compressed CSV file. The header must contain
            the columns 'user_id' and 'recipe_id'.

    Yields:
        Tuples (user_id, recipe_id, row), where row is a dict mapping every
        header name to that row's string value.

    Raises:
        KeyError: If a row lacks a 'user_id' or 'recipe_id' column.
    """
    # newline='' is what the csv module asks for when it is handed a file
    # object; the context manager closes the handle the original leaked.
    with gzip.open(path, 'rt', newline='') as f:
        c = csv.reader(f)
        header = next(c)
        for l in c:
            d = dict(zip(header, l))
            yield d['user_id'], d['recipe_id'], d

In [None]:
# Interaction statistics accumulated over the whole interactions file.
allRatings = []
# NOTE(review): userRatings is declared but nothing in this cell fills it.
userRatings = defaultdict(list)
recipesPerUser = defaultdict(set)
usersPerRecipe = defaultdict(set)
all_recipes = set()

# Each dataset row is [user_id, recipe_id, rating-as-int].
dataset = []
for user, recipe, d in readCSV("trainInteractions.csv.gz"):
    r = int(d['rating'])
    allRatings.append(r)
    # readCSV yields d['user_id'] and d['recipe_id'] as `user` and `recipe`.
    recipesPerUser[user].add(recipe)
    usersPerRecipe[recipe].add(user)
    all_recipes.add(recipe)
    dataset.append([user, recipe, r])

# The first TRAIN_SIZE rows form the training split.
TRAIN_SIZE = 400000
training_set = dataset[:TRAIN_SIZE]

# NOTE(review): userRatingsTraining is declared but nothing in this cell fills it.
userRatingsTraining = defaultdict(list)
recipesPerUserTraining = defaultdict(set)
usersPerRecipeTraining = defaultdict(set)

# Build the per-split lookups; previously these stayed empty, which made both
# counts below evaluate to 0 regardless of the data.
for train_user, train_recipe, _train_rating in training_set:
    recipesPerUserTraining[train_user].add(train_recipe)
    usersPerRecipeTraining[train_recipe].add(train_user)

nUsersTraining = len(recipesPerUserTraining)
nItemsTraining = len(usersPerRecipeTraining)


In [None]:
import random

# Number of latent dimensions per user and per item.
K = 2


def _small_random_vector(size):
    """Return `size` values drawn uniformly from [-0.05, 0.05)."""
    return [random.random() * 0.1 - 0.05 for _ in range(size)]


# Users are drawn before items so the sequence of random draws is the same
# as filling the two dictionaries with consecutive loops.
userGamma = {u: _small_random_vector(K) for u in recipesPerUser}
itemGamma = {i: _small_random_vector(K) for i in usersPerRecipe}

In [None]:
# Ratings in dataset order (third field of every row) and their global mean.
labels = [row[2] for row in dataset]
ratingMean = sum(labels) / len(labels)

In [None]:
# Fixed key orderings; unpack() and derivative() both use these lists to map
# positions in the flat parameter vector to users and items.
users = list(recipesPerUser)
items = list(usersPerRecipe)

nUsers, nItems = len(recipesPerUser), len(usersPerRecipe)
N = len(dataset)

# The global offset starts at the mean rating.
alpha = ratingMean

In [None]:
# Display the global mean rating, which is also the initial value of alpha.
ratingMean

4.580794

In [None]:


def MSE(predictions, labels):
    """Return the mean squared error between predictions and labels.

    The two sequences are paired with zip, so any surplus elements in the
    longer one are ignored. Raises ZeroDivisionError when no pairs exist.
    """
    squared_errors = []
    for predicted, actual in zip(predictions, labels):
        squared_errors.append((predicted - actual) ** 2)
    return sum(squared_errors) / len(squared_errors)

In [None]:
def unpack(theta):
    """Scatter the flat parameter vector `theta` into the global model state.

    Layout of theta:
        [alpha | nUsers user biases | nItems item biases |
         K gamma values per user (in `users` order) |
         K gamma values per item (in `items` order)]

    Rebinds the globals alpha, userBiases and itemBiases, and overwrites the
    entries of the existing userGamma / itemGamma dictionaries with slices of
    theta.
    """
    global alpha, userBiases, itemBiases, userGamma, itemGamma

    # Start offset of each section inside theta.
    user_bias_start = 1
    item_bias_start = user_bias_start + nUsers
    user_gamma_start = item_bias_start + nItems
    item_gamma_start = user_gamma_start + K * len(users)

    alpha = theta[0]
    userBiases = dict(zip(users, theta[user_bias_start:item_bias_start]))
    itemBiases = dict(zip(items, theta[item_bias_start:user_gamma_start]))

    for position, u in enumerate(users):
        begin = user_gamma_start + position * K
        userGamma[u] = theta[begin:begin + K]
    for position, i in enumerate(items):
        begin = item_gamma_start + position * K
        itemGamma[i] = theta[begin:begin + K]

In [None]:
def inner(x, y):
    """Return the dot product of x and y (pairs formed by zip, so the longer
    input is truncated to the length of the shorter one)."""
    products = []
    for left, right in zip(x, y):
        products.append(left * right)
    return sum(products)

In [None]:
def prediction(user, item):
    """Predict the rating `user` would give `item` from the global parameters.

    The result is alpha plus the user bias plus the item bias; a missing bias
    counts as 0. The latent-factor interaction inner(gamma_user, gamma_item)
    is added only when both the user and the item have a gamma vector.
    """
    baseline = alpha + userBiases.get(user, 0) + itemBiases.get(item, 0)
    if user not in userGamma or item not in itemGamma:
        # At least one side has no latent factors: biases only.
        return baseline
    return baseline + inner(userGamma[user], itemGamma[item])

In [None]:
def cost(theta, labels, lamb):
    """Regularized objective evaluated at the flat parameter vector `theta`.

    Unpacks theta into the global parameters, computes the MSE of the
    predictions over `dataset` against `labels`, prints that MSE, and returns
    it plus lamb times the sum of squares of every bias and gamma component
    (alpha is not penalized).
    """
    unpack(theta)
    fitted = [prediction(row[0], row[1]) for row in dataset]
    objective = MSE(fitted, labels)
    print("MSE = " + str(objective))

    # Users are penalized before items, one term at a time, so the floating
    # point accumulation order is fixed.
    parameter_groups = (
        (users, userBiases, userGamma),
        (items, itemBiases, itemGamma),
    )
    for keys, biases, gammas in parameter_groups:
        for key in keys:
            objective += lamb * biases[key] ** 2
            for k in range(K):
                objective += lamb * gammas[key][k] ** 2
    return objective

In [None]:
# Preview only the first few interactions; echoing the whole list floods the
# notebook output and bloats the saved file.
dataset[:5]

In [None]:
def derivative(theta, labels, lamb):
    """Gradient of the regularized objective with respect to `theta`.

    Unpacks theta into the global parameters, accumulates the gradient of the
    mean squared error over `dataset`, adds the gradient of the squared-norm
    penalty (weight `lamb`) for every bias and gamma component, and returns
    the result as a numpy array laid out like theta:
    [alpha | user biases | item biases | user gammas | item gammas].

    `labels` is accepted for signature parity with cost(); the ratings are
    read from the third field of each dataset row.
    """
    unpack(theta)
    scale = 2 / len(dataset)   # factor from differentiating the mean of squares
    shrink = 2 * lamb          # factor from differentiating lamb * x**2

    grad_alpha = 0
    grad_user_bias = defaultdict(float)
    grad_item_bias = defaultdict(float)
    grad_user_gamma = {u: [0.0] * K for u in recipesPerUser}
    grad_item_gamma = {i: [0.0] * K for i in usersPerRecipe}

    # Data term.
    for row in dataset:
        u, i = row[0], row[1]
        residual = prediction(u, i) - row[2]
        grad_alpha += scale * residual
        grad_user_bias[u] += scale * residual
        grad_item_bias[i] += scale * residual
        for k in range(K):
            grad_user_gamma[u][k] += scale * itemGamma[i][k] * residual
            grad_item_gamma[i][k] += scale * userGamma[u][k] * residual

    # Regularization term.
    for u in userBiases:
        grad_user_bias[u] += shrink * userBiases[u]
        for k in range(K):
            grad_user_gamma[u][k] += shrink * userGamma[u][k]
    for i in itemBiases:
        grad_item_bias[i] += shrink * itemBiases[i]
        for k in range(K):
            grad_item_gamma[i][k] += shrink * itemGamma[i][k]

    # Flatten in the same order unpack() reads theta.
    flat = [grad_alpha]
    flat.extend([grad_user_bias[u] for u in users])
    flat.extend([grad_item_bias[i] for i in items])
    for u in users:
        flat.extend(grad_user_gamma[u])
    for i in items:
        flat.extend(grad_item_gamma[i])
    return numpy.array(flat)

In [None]:
# Starting point, laid out the way unpack() expects:
# [alpha | user and item biases | user and item gammas].
theta0 = ([alpha] +  # Initialize alpha
          [0.0] * (nUsers + nItems) +  # Initialize beta
          [random.random() * 0.1 - 0.05 for k in range(K * (nUsers + nItems))])  # Gamma

# Minimize the regularized objective (regularization strength 0.001).
result = scipy.optimize.fmin_l_bfgs_b(cost, theta0, derivative,
                                      args=(labels, 0.001))

# cost() and derivative() leave the global parameters at whatever point was
# evaluated last, which need not be the minimizer that is returned. Load the
# returned solution explicitly so later prediction() calls use it.
unpack(result[0])

# (theta, objective value, info dict) as returned by the optimizer.
result

MSE = 0.9008931639898173
MSE = 0.89432067156482
MSE = 0.8818474803763278
MSE = 5.901699843515148
MSE = 0.8848451254807598
MSE = 0.8995866923542163
MSE = 0.892268346553329
MSE = 0.8700979842096767
MSE = 0.8534219049603212
MSE = 0.8532294252452209
MSE = 0.8559699014592056
MSE = 0.8570206728598808
MSE = 0.8571209681247338
MSE = 0.8572282408815318
MSE = 0.8573286545478771
MSE = 0.8573925717508543
MSE = 0.8574047960198145
MSE = 0.8573956436916906
MSE = 0.8574037313437505
MSE = 0.8573947046061768
MSE = 0.8573945878140727


(array([ 4.54397957e+00, -1.28475833e-02, -5.62909030e-03, ...,
        -6.83254900e-08, -6.27388629e-07,  1.91818463e-07]),
 0.8735105060004237,
 {'grad': array([-9.97275617e-06, -3.25211507e-08, -4.86109116e-09, ...,
         -1.36700732e-10, -1.23646143e-09,  3.98361719e-10]),
  'task': 'CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL',
  'funcalls': 21,
  'nit': 16,
  'warnflag': 0})

In [None]:
# [user_id, recipe_id] pairs parsed from the stub file, in file order. The
# first row is kept here as well; later cells skip it with test_set[1:].
test_set = []

with open("stub_Rated.txt") as file:
    for raw_line in file:
        line = raw_line.rstrip()
        if not line:
            # Skip blank lines instead of silently ending the read at the
            # first one.
            continue
        # A row that does not split into exactly two parts raises ValueError
        # here; the earlier bare except never covered this line, so nothing
        # was being guarded by it.
        user_id, recipe_id = line.split('-')
        test_set.append([user_id, recipe_id])

In [None]:
lfPredictions = []

# Sanity check: print the prediction for the first pair after the leading
# row of test_set, if there is one.
first_pair = test_set[1:2]
if first_pair:
    print(prediction(first_pair[0][0], first_pair[0][1]))

4.613109232425409


In [None]:
# Preview a handful of learned user biases; echoing the whole dictionary
# floods the notebook output.
dict(list(userBiases.items())[:5])

In [None]:
# Write one "user_id-recipe_id,prediction" line per test pair. The context
# manager guarantees the file is flushed and closed even if prediction()
# raises part-way through.
with open("A24Gamma.txt", "w") as f:
    f.write("user_id-recipe_id,prediction\n")

    # test_set[0] is skipped, mirroring the header line written above.
    for user_id, recipe_id in test_set[1:]:
        f.write(str(user_id)+"-"+str(recipe_id)+","+str(prediction(user_id, recipe_id))+"\n")