In [59]:
import gzip
from collections import defaultdict
import math
import scipy.optimize
from sklearn import svm
import numpy
import string
import random
import string
from sklearn import linear_model

In [60]:
def readGz(path):
    for l in gzip.open(path, 'rt'):
        yield eval(l)

In [61]:
def readCSV(path):
    f = gzip.open(path, 'rt')
    f.readline()
    for l in f:
        u,b,r = l.strip().split(',')
        r = int(r)
        yield u,b,r

In [62]:
answers = {}

In [63]:
# Some data structures that will be useful

In [64]:
allRatings = []
for l in readCSV("train_Interactions.csv.gz"):
    allRatings.append(l)

In [65]:
len(allRatings)

200000

In [66]:
ratingsTrain = allRatings[:190000]
ratingsValid = allRatings[190000:]
ratingsPerUser = defaultdict(list)
ratingsPerItem = defaultdict(list)
for u,b,r in ratingsTrain:
    ratingsPerUser[u].append((b,r))
    ratingsPerItem[b].append((u,r))

In [67]:
##################################################
# Read prediction                                #
##################################################

In [68]:
# From baseline code
bookCount = defaultdict(int)
totalRead = 0

for user,book,_ in readCSV("train_Interactions.csv.gz"):
    bookCount[book] += 1
    totalRead += 1

mostPopular = [(bookCount[x], x) for x in bookCount]
mostPopular.sort()
mostPopular.reverse()

return1 = set()
count = 0
for ic, i in mostPopular:
    count += ic
    return1.add(i)
    if count > totalRead/2: break

In [69]:
# Generate a negative set

userSet = set()
bookSet = set()
readSet = set()

for u,b,r in allRatings:
    userSet.add(u)
    bookSet.add(b)
    readSet.add((u,b))

lUserSet = list(userSet)
lBookSet = list(bookSet)

notRead = set()
for u,b,r in ratingsValid:
    #u = random.choice(lUserSet)
    b = random.choice(lBookSet)
    while ((u,b) in readSet or (u,b) in notRead):
        b = random.choice(lBookSet)
    notRead.add((u,b))

readValid = set()
for u,b,r in ratingsValid:
    readValid.add((u,b))

In [70]:
def Jaccard(s1, s2):
    numer = len(s1.intersection(s2))
    denom = len(s1.union(s2))
    if denom > 0:
        return numer/denom
    return 0

In [71]:
### Question 1

In [72]:
# Evaluate baseline strategy

correct = 0
p0, p1 = 0,0
for (label,sample) in [(1, readValid), (0, notRead)]:
    for (u,b) in sample:
        pred = 0
        if b in return1:
            pred = 1
        if pred == label:
            correct += 1

In [73]:
correct / (len(readValid) + len(notRead))

0.7141

In [74]:
answers['Q1'] = correct / (len(readValid) + len(notRead))

In [75]:
assert type(answers['Q1']) == float

In [76]:
### Question 2

In [77]:
# Improved strategy

return1 = set()
count = 0
for ic, i in mostPopular:
    count += ic
    return1.add(i)
    if count > 1.5 * totalRead/2: break

In [78]:
# Evaluate baseline strategy

correct = 0
p0, p1 = 0,0
for (label,sample) in [(1, readValid), (0, notRead)]:
    for (u,b) in sample:
        pred = 0
        if b in return1:
            pred = 1
        if pred == label:
            correct += 1

In [79]:
correct / (len(readValid) + len(notRead))

0.75585

In [80]:
answers['Q2'] = [1.5 * totalRead/2, correct / (len(readValid) + len(notRead))]

In [81]:
assert type(answers['Q2'][0]) == float
assert type(answers['Q2'][1]) == float

In [82]:
### Question 3/4

In [83]:
# Slow implementation, could easily be improved

correct = 0
for (label,sample) in [(1, readValid), (0, notRead)]:
    for (u,b) in sample:
        maxSim = 0
        users = set(ratingsPerItem[b])
        for b2,_ in ratingsPerUser[u]:
            sim = Jaccard(users,set(ratingsPerItem[b2]))
            if sim > maxSim:
                maxSim = sim
        pred = 0
        if maxSim > 0.013 or len(ratingsPerItem[b]) > 40:
            pred = 1
        if pred == label:
            correct += 1

In [84]:
correct / (len(readValid) + len(notRead))

0.751

In [85]:
answers['Q3'] = correct / (len(readValid) + len(notRead))
answers['Q4'] = correct / (len(readValid) + len(notRead))

In [86]:
assert type(answers['Q3']) == float
assert type(answers['Q4']) == float

In [87]:
predictions = open("predictions_Read.csv", 'w')
for l in open("/home/julian/teaching/2024/assignment1/pairs_Read.csv"):
    if l.startswith("userID"):
        predictions.write(l)
        continue
    u,b = l.strip().split(',')
    maxSim = 0
    users = set(ratingsPerItem[b])
    for b2,_ in ratingsPerUser[u]:
        sim = Jaccard(users,set(ratingsPerItem[b2]))
        if sim > maxSim:
            maxSim = sim
    pred = 0
    if maxSim > 0.013 or len(ratingsPerItem[b]) > 40:
        pred = 1
    _ = predictions.write(u + ',' + b + ',' + str(pred) + '\n')

predictions.close()

FileNotFoundError: [Errno 2] No such file or directory: '/home/julian/teaching/2024/assignment1/pairs_Read.csv'

In [None]:
answers['Q5'] = "I confirm that I have uploaded an assignment submission to gradescope"

In [None]:
assert type(answers['Q5']) == str

In [None]:
##################################################
# Rating prediction                              #
##################################################

In [None]:
trainRatings = [r[2] for r in ratingsTrain]
globalAverage = sum(trainRatings) * 1.0 / len(trainRatings)

In [None]:
validMSE = 0
for u,b,r in ratingsValid:
  se = (r - globalAverage)**2
  validMSE += se

validMSE /= len(ratingsValid)

print("Validation MSE (average only) = " + str(validMSE))

Validation MSE (average only) = 1.7272009396120143


In [None]:
### Question 6

In [None]:
betaU = {}
betaI = {}
for u in ratingsPerUser:
    betaU[u] = 0

for b in ratingsPerItem:
    betaI[b] = 0

In [None]:
alpha = globalAverage # Could initialize anywhere, this is a guess

In [None]:
def iterate(lamb):
    newAlpha = 0
    for u,b,r in ratingsTrain:
        newAlpha += r - (betaU[u] + betaI[b])
    alpha = newAlpha / len(ratingsTrain)
    for u in ratingsPerUser:
        newBetaU = 0
        for b,r in ratingsPerUser[u]:
            newBetaU += r - (alpha + betaI[b])
        betaU[u] = newBetaU / (lamb + len(ratingsPerUser[u]))
    for b in ratingsPerItem:
        newBetaI = 0
        for u,r in ratingsPerItem[b]:
            newBetaI += r - (alpha + betaU[u])
        betaI[b] = newBetaI / (lamb + len(ratingsPerItem[b]))
    mse = 0
    for u,b,r in ratingsTrain:
        prediction = alpha + betaU[u] + betaI[b]
        mse += (r - prediction)**2
    regularizer = 0
    for u in betaU:
        regularizer += betaU[u]**2
    for b in betaI:
        regularizer += betaI[b]**2
    mse /= len(ratingsTrain)
    return mse, mse + lamb*regularizer

In [None]:
mse,objective = iterate(1)
newMSE,newObjective = iterate(1)
iterations = 2

In [None]:
while iterations < 10 or objective - newObjective > 0.0001:
    mse, objective = newMSE, newObjective
    newMSE, newObjective = iterate(1)
    iterations += 1
    print("Objective after "
        + str(iterations) + " iterations = " + str(newObjective))
    print("MSE after "
        + str(iterations) + " iterations = " + str(newMSE))

Objective after 3 iterations = 13116.588597112237
MSE after 3 iterations = 1.0532999574393143
Objective after 4 iterations = 13126.061979708516
MSE after 4 iterations = 1.0531700104166957
Objective after 5 iterations = 13123.880213845163
MSE after 5 iterations = 1.053139460478934
Objective after 6 iterations = 13119.024310980794
MSE after 6 iterations = 1.0531277418089378
Objective after 7 iterations = 13113.705317224998
MSE after 7 iterations = 1.05312075617977
Objective after 8 iterations = 13108.483752331393
MSE after 8 iterations = 1.0531152532250623
Objective after 9 iterations = 13103.503333711971
MSE after 9 iterations = 1.0531103584002324
Objective after 10 iterations = 13098.795339035578
MSE after 10 iterations = 1.0531058094104893
Objective after 11 iterations = 13094.358965836527
MSE after 11 iterations = 1.0531015180428667
Objective after 12 iterations = 13090.184043329817
MSE after 12 iterations = 1.0530974490161191
Objective after 13 iterations = 13086.257833272284
MSE af

In [None]:
validMSE = 0
for u,b,r in ratingsValid:
    bu = 0
    bi = 0
    if u in betaU:
        bu = betaU[u]
    if b in betaI:
        bi = betaI[b]
    prediction = alpha + bu + bi
    validMSE += (r - prediction)**2

validMSE /= len(ratingsValid)
print("Validation MSE = " + str(validMSE))

Validation MSE = 1.4867655009847975


In [None]:
answers['Q6'] = validMSE

In [None]:
assert type(answers['Q6']) == float

In [None]:
### Question 7

In [None]:
betaUs = [(betaU[u], u) for u in betaU]
betaIs = [(betaI[i], i) for i in betaI]
betaUs.sort()
betaIs.sort()

print("Maximum betaU = " + str(betaUs[-1][1]) + ' (' + str(betaUs[-1][0]) + ')')
print("Maximum betaI = " + str(betaIs[-1][1]) + ' (' + str(betaIs[-1][0]) + ')')
print("Minimum betaU = " + str(betaUs[0][1]) + ' (' + str(betaUs[0][0]) + ')')
print("Minimum betaI = " + str(betaIs[0][1]) + ' (' + str(betaIs[0][0]) + ')')

Maximum betaU = u18223169 (1.8200651107262364)
Maximum betaI = b22273615 (1.7887811142876646)
Minimum betaU = u88024921 (-3.562782354660326)
Minimum betaI = b85650308 (-2.308817425988121)


In [None]:
answers['Q7'] = [betaUs[-1][1], betaUs[0][1], betaUs[-1][0], betaUs[0][0]]

In [None]:
assert [type(x) for x in answers['Q7']] == [str, str, float, float]

In [None]:
### Question 8

In [None]:
# Better lambda...

iterations = 1
while iterations < 10 or objective - newObjective > 0.0001:
    mse, objective = newMSE, newObjective
    newMSE, newObjective = iterate(0.6)
    iterations += 1
    print("Objective after " + str(iterations) + " iterations = " + str(newObjective))
    print("MSE after " + str(iterations) + " iterations = " + str(newMSE))

NameError: name 'newMSE' is not defined

In [None]:
validMSE = 0
for u,b,r in ratingsValid:
    bu = 0
    bi = 0
    if u in betaU:
        bu = betaU[u]
    if b in betaI:
        bi = betaI[b]
    prediction = alpha + bu + bi
    validMSE += (r - prediction)**2

validMSE /= len(ratingsValid)
print("Validation MSE = " + str(validMSE))

Validation MSE = 1.4320141515744742


In [None]:
answers['Q8'] = (5.0, validMSE)

In [None]:
assert type(answers['Q8'][0]) == float
assert type(answers['Q8'][1]) == float

In [None]:
predictions = open("predictions_Rating.csv", 'w')
for l in open("/home/julian/teaching/2024/assignment1/pairs_Rating.csv"):
    if l.startswith("userID"):
        predictions.write(l)
        continue
    u,b = l.strip().split(',')
    bu = 0
    bi = 0
    if u in betaU:
        bu = betaU[u]
    if b in betaI:
        bi = betaI[b]
    _ = predictions.write(u + ',' + b + ',' + str(alpha + bu + bi) + '\n')

predictions.close()

In [None]:
f = open("answers_hw3.txt", 'w')
f.write(str(answers) + '\n')
f.close()