In [1]:
import json
import gzip
import math
from collections import defaultdict
import numpy
from sklearn import linear_model
import random
import statistics

In [2]:
def assertFloat(x):
    assert type(float(x)) == float

def assertFloatList(items, N):
    assert len(items) == N
    assert [type(float(x)) for x in items] == [float]*N

In [3]:
answers = {}

In [4]:
z = gzip.open("/Users/tiyang/Downloads/MGTA461_Midterm2023/train.json.gz")

In [5]:
dataset = []
for l in z:
    d = eval(l)
    dataset.append(d)

In [6]:
z.close()

In [7]:
### Question 1

In [8]:
def MSE(y, ypred):
    return numpy.mean((y - ypred)**2)

In [9]:
def MAE(y, ypred):
    return numpy.mean(numpy.abs(y - ypred))

In [10]:
reviewsPerUser = defaultdict(list)
reviewsPerItem = defaultdict(list)

for d in dataset:
    u,i = d['userID'],d['gameID']
    reviewsPerUser[u].append(d)
    reviewsPerItem[i].append(d)
    
for u in reviewsPerUser:
    reviewsPerUser[u].sort(key=lambda x: x['date'])
    
for i in reviewsPerItem:
    reviewsPerItem[i].sort(key=lambda x: x['date'])

In [11]:
def feat1(d):
    return [d['hours']]

In [12]:
X = numpy.array([feat1(d) for d in dataset])
y = numpy.array([len(d['text']) for d in dataset])

In [13]:
mod = linear_model.LinearRegression()
mod.fit(X,y)
predictions = mod.predict(X)

In [14]:
mse_q1 = MSE(y, predictions)
theta_1 = mod.coef_[0]

In [15]:
answers['Q1'] = [theta_1, mse_q1]

In [16]:
assertFloatList(answers['Q1'], 2)

print(answers['Q1'])

[0.00785726970433403, 570936.2842458971]


In [17]:
### Question 2

In [18]:
hours_list = [d['hours'] for d in dataset]
median_hours = statistics.median(hours_list)

In [19]:
def feat2(d):
    hours = d['hours']
    return [
        1,  # for the intercept term, θ0
        hours,  # θ1 * (hours)
        math.log2(hours + 1),  # θ2 * log2(hours + 1)
        math.sqrt(hours),  # θ3 * sqrt(hours)
        1 if hours > median_hours else 0  # θ4 * δ(hours > median)
    ]

In [20]:
X = [feat2(d) for d in dataset]

In [21]:
mod = linear_model.LinearRegression(fit_intercept=False)
mod.fit(X,y)
predictions = mod.predict(X)

In [22]:
mse_q2 = MSE(y, predictions)

In [23]:
answers['Q2'] = mse_q2

In [24]:
assertFloat(answers['Q2'])

print(answers['Q2'])

565419.5340402179


In [25]:
### Question 3

In [26]:
def feat3(d):
    hours = d['hours']
    return [
        1,  # for the intercept term, θ0
        1 if hours > 1 else 0,    # θ1 * δ(h > 1)
        1 if hours > 5 else 0,    # θ2 * δ(h > 5)
        1 if hours > 10 else 0,   # θ3 * δ(h > 10)
        1 if hours > 100 else 0,  # θ4 * δ(h > 100)
        1 if hours > 1000 else 0  # θ5 * δ(h > 1000)
    ]

In [27]:
X = [feat3(d) for d in dataset]

In [28]:
mod = linear_model.LinearRegression(fit_intercept=False)
mod.fit(X,y)
predictions = mod.predict(X)

In [29]:
mse_q3 = MSE(y, predictions)

In [30]:
answers['Q3'] = mse_q3

In [31]:
assertFloat(answers['Q3'])

print(answers['Q3'])

565405.439588582


In [32]:
### Question 4

In [33]:
def feat4(d):
    return [len(d['text'])]

In [34]:
X = [feat4(d) for d in dataset]
y = [d['hours'] for d in dataset]

In [35]:
mod = linear_model.LinearRegression(fit_intercept=False)
mod.fit(X,y)
predictions = mod.predict(X)

In [36]:
mse = MSE(y, predictions)
mae = MAE(y, predictions)

In [37]:
explanation = "MAE may be more suitable when dealing with datasets with potential outliers, as it is less sensitive to extreme values than MSE."

In [38]:
answers['Q4'] = [mse, mae, explanation]

In [39]:
assertFloatList(answers['Q4'][:2], 2)

print(answers['Q4'])

[79171.87488518466, 67.44841239495095, 'MAE may be more suitable when dealing with datasets with potential outliers, as it is less sensitive to extreme values than MSE.']


In [40]:
### Question 5

In [41]:
y_trans = [math.log2(d['hours'] + 1) for d in dataset]

In [42]:
mod = linear_model.LinearRegression(fit_intercept=False)
mod.fit(X,y_trans)
predictions_trans = mod.predict(X)

In [43]:
mse_trans = MSE(y_trans, predictions_trans) # MSE using the transformed variable

In [44]:
y_array = numpy.array(y)

In [45]:
predictions_untrans = numpy.array([2**p - 1 for p in predictions_trans]) # Undoing the transformation

In [46]:
mse_untrans = MSE(y_array, predictions_untrans)

In [47]:
answers['Q5'] = [mse_trans, mse_untrans]

In [48]:
assertFloatList(answers['Q5'], 2)

print(answers['Q5'])

[15.684260629190968, 24345628.474495023]


In [49]:
### Question 6

In [50]:
def feat6(d):
    hours = int(d['hours'])  # Get the integer part of the hours
    one_hot = [0]*100
    one_hot[min(hours, 99)] = 1  # Cap the feature at 99 hours
    return one_hot

In [51]:
X = [feat6(d) for d in dataset]
y = [len(d['text']) for d in dataset]

In [52]:
Xtrain, Xvalid, Xtest = X[:len(X)//2], X[len(X)//2:(3*len(X))//4], X[(3*len(X))//4:]
ytrain, yvalid, ytest = y[:len(X)//2], y[len(X)//2:(3*len(X))//4], y[(3*len(X))//4:]

In [53]:
from sklearn.linear_model import Ridge

In [54]:
models = {}
mses = {}
bestC = None
bestMSE = float('inf')

for c in [1, 10, 100, 1000, 10000]:
    models[c] = Ridge(alpha=c)
    models[c].fit(Xtrain, ytrain)

    predictions_valid = models[c].predict(Xvalid)

    mses[c] = MSE(yvalid, predictions_valid)

    if mses[c] < bestMSE:
        bestC = c
        bestMSE = mses[c]

In [55]:
best_model = models[bestC]

In [56]:
predictions_test = best_model.predict(Xtest)

In [57]:
mse_valid = bestMSE

In [58]:
mse_test = MSE(ytest, predictions_test)

In [59]:
answers['Q6'] = [bestC, mse_valid, mse_test]

In [60]:
assertFloatList(answers['Q6'], 3)

print(answers['Q6'])

[1000, 581432.8208480754, 560786.7645482325]


In [61]:
### Question 7

In [62]:
times = [d['hours_transformed'] for d in dataset]
median = statistics.median(times)

In [63]:
notPlayed = [time for time in times if time < 1]
nNotPlayed = len(notPlayed)

In [64]:
answers['Q7'] = [median, nNotPlayed]

In [65]:
assertFloatList(answers['Q7'], 2)

print(answers['Q7'])

[3.4724877714627436, 19913]


In [66]:
### Question 8

In [67]:
def feat8(d):
    return [len(d['text'])]

In [68]:
X = [feat8(d) for d in dataset]
y = [d['hours_transformed'] > median for d in dataset]

In [69]:
mod = linear_model.LogisticRegression(class_weight='balanced')
mod.fit(X,y)
predictions = mod.predict(X) # Binary vector of predictions

In [70]:
def rates(predictions, y):
    TP = sum(p and l for p, l in zip(predictions, y))
    TN = sum(not p and not l for p, l in zip(predictions, y))
    FP = sum(p and not l for p, l in zip(predictions, y))
    FN = sum(not p and l for p, l in zip(predictions, y))
    return TP, TN, FP, FN

In [71]:
TP, TN, FP, FN = rates(predictions, y)

In [72]:
P = sum(y) 
N = len(y) - P
BER = 0.5 * ((FP / N) + (FN / P))

In [73]:
answers['Q8'] = [TP, TN, FP, FN, BER]

In [74]:
assertFloatList(answers['Q8'], 5)

print(answers['Q8'])

[24656, 67811, 20007, 62526, 0.472506390561468]


In [75]:
### Question 9

In [76]:
precision = 
recall = 

SyntaxError: invalid syntax (688621682.py, line 1)

In [77]:
probabilities = mod.predict_proba(X)[:,1]
sorted_by_prob = sorted(zip(probabilities, y), key=lambda x: x[0], reverse=True)

In [78]:
precs = []
recs = []

for i in [5, 10, 100, 1000]:
    sorted_data = sorted(zip(dataset, probabilities), key=lambda x: x[1], reverse=True)

    threshold = sorted_data[i-1][1] if i <= len(sorted_data) else sorted_data[-1][1]
    
    threshold_data = [(d, prob) for d, prob in sorted_data if prob >= threshold]
    
    TP = sum(1 for d, prob in threshold_data if d['hours_transformed'] > median)
    precision = TP / len(threshold_data)
    
    recall = TP / sum(1 for d in dataset if d['hours_transformed'] > median)
    
    precs.append(precision)
    recs.append(recall)

In [79]:
answers['Q9'] = precs

In [80]:
assertFloatList(answers['Q9'], 4)

print(answers['Q9'])

[0.5454545454545454, 0.5454545454545454, 0.67, 0.685]


In [81]:
### Question 10

In [82]:
y_trans = [d['hours_transformed'] for d in dataset]

In [83]:
mod = linear_model.LinearRegression(fit_intercept=False)
mod.fit(X,y_trans)
predictions_trans = mod.predict(X)

In [84]:
ber_to_beat = answers['Q8'][4]

def calculate_ber(predictions, actual):
    TP = sum((p and a) for p, a in zip(predictions, actual))
    TN = sum((not p and not a) for p, a in zip(predictions, actual))
    FP = sum((p and not a) for p, a in zip(predictions, actual))
    FN = sum((not p and a) for p, a in zip(predictions, actual))
    P = sum(actual)  
    N = len(actual) - P  
    return 0.5 * ((FP / N) + (FN / P))

best_threshold = None
best_ber = float('inf')
for threshold in numpy.linspace(min(predictions_trans), max(predictions_trans), 1000):
    predictions_thresh = predictions_trans > threshold
    current_ber = calculate_ber(predictions_thresh, y)
    if current_ber < best_ber:
        best_ber = current_ber
        best_threshold = threshold

In [85]:
predictions_thresh = predictions_trans > best_threshold # Using a fixed threshold to make predictions

In [86]:
TP, TN, FP, FN = rates(predictions_thresh, y)

In [87]:
BER = 0.5 * ((FP / (FP + TN)) + (FN / (TP + FN)))

In [88]:
answers['Q10'] = [best_threshold, BER]

In [89]:
assertFloatList(answers['Q10'], 2)

print(answers['Q10'])

[0.5917041695431507, 0.4712697914365896]


In [90]:
### Question 11

In [91]:
dataTrain = dataset[:int(len(dataset)*0.9)]
dataTest = dataset[int(len(dataset)*0.9):]

In [92]:
userMedian = defaultdict(list)
itemMedian = defaultdict(list)

# Compute medians on training data

In [93]:
for d in dataTrain:
    user, item, playtime = d['userID'], d['gameID'], d['hours']
    userMedian[user].append(playtime)
    itemMedian[item].append(playtime)

for user in userMedian:
    userMedian[user] = statistics.median(userMedian[user])

for item in itemMedian:
    itemMedian[item] = statistics.median(itemMedian[item])

first_item = dataTrain[0]['gameID']
first_user = dataTrain[0]['userID']

In [95]:
answers['Q11'] = [itemMedian['g35322304'], userMedian['u55351001']]

In [96]:
assertFloatList(answers['Q11'], 2)

print(answers['Q11'])

[0.5, 3.9]


In [None]:
### Question 12

In [97]:
global_median = statistics.median([d['hours'] for d in dataTrain])

In [98]:
def f12(u,i):
    # Function returns a single value (0 or 1)
    if i in itemMedian:
        if itemMedian[i] > global_median:
            return 1
        else:
            return 0
    else:
        if u in userMedian and userMedian[u] > global_median:
            return 1
        else:
            return 0

In [99]:
preds = [f12(d['userID'], d['gameID']) for d in dataTest]

In [100]:
y = [1 if d['hours'] > global_median else 0 for d in dataTest]

In [101]:
accuracy = sum(1 for (pred, actual) in zip(preds, y) if pred == actual) / len(preds)

In [102]:
answers['Q12'] = accuracy

In [103]:
assertFloat(answers['Q12'])

print(answers['Q12'])

0.7410857142857142


In [None]:
### Question 13

In [104]:
usersPerItem = defaultdict(set) # Maps an item to the users who rated it
itemsPerUser = defaultdict(set) # Maps a user to the items that they rated
itemNames = {}

for d in dataset:
    user,item = d['userID'], d['gameID']
    usersPerItem[item].add(user)
    itemsPerUser[user].add(item)

In [105]:
def Jaccard(s1, s2):
    numer = len(s1.intersection(s2))
    denom = len(s1.union(s2))
    return numer / denom

In [106]:
def mostSimilar(i, func, N):
    similarities = []
    users = usersPerItem[i]
    for i2 in usersPerItem:
        if i2 == i: continue
        sim = func(users, usersPerItem[i2])
        similarities.append((sim, i2))
    similarities.sort(reverse=True)
    return similarities[:N]

In [107]:
ms = mostSimilar(dataset[0]['gameID'], Jaccard, 10)

In [108]:
answers['Q13'] = [ms[0][0], ms[-1][0]]

In [109]:
assertFloatList(answers['Q13'], 2)

print(answers['Q13'])

[0.07988165680473373, 0.04390243902439024]


In [None]:
### Question 14

In [110]:
def mostSimilar14(i, func, N):
    similarities = []
    users_i1 = usersPerItem[i]
    for i2 in usersPerItem:
        if i2 == i: continue
        sim = func(i, i2)
        similarities.append((sim, i2))
    similarities.sort(reverse=True)
    return similarities[:N]

In [111]:
ratingDict = {}

for d in dataset:
    u,i = d['userID'], d['gameID']
    lab = # Set the label based on a rule
    ratingDict[(u,i)] = lab

SyntaxError: invalid syntax (362479486.py, line 5)

In [112]:
def Cosine(i1, i2):
    # Between two items
    inter = len(usersPerItem[i1].intersection(usersPerItem[i2]))
    sum_sq_i1 = len(usersPerItem[i1])
    sum_sq_i2 = len(usersPerItem[i2])
    if inter == 0: return 0
    return inter / math.sqrt(sum_sq_i1 * sum_sq_i2)

In [113]:
ms = mostSimilar14(dataset[0]['gameID'], Cosine, 10)

In [114]:
answers['Q14'] = [ms[0][0], ms[-1][0]]

In [115]:
assertFloatList(answers['Q14'], 2)

print(answers['Q14'])

[0.16402709233688792, 0.08907534498272675]


In [None]:
### Question 15

In [116]:
ratingDict = {}

for d in dataset:
    u,i = d['userID'], d['gameID']
    lab = # Set the label based on a rule
    ratingDict[(u,i)] = lab

SyntaxError: invalid syntax (362479486.py, line 5)

In [117]:
def CosineHours(i1, i2):
    users_i1 = usersPerItem[i1]
    users_i2 = usersPerItem[i2]
    inter_users = users_i1.intersection(users_i2)

    numerator = sum(ratingDict[(u, i1)] * ratingDict[(u, i2)] for u in inter_users)

    sum_sq_i1 = sum(ratingDict[(u, i1)]**2 for u in users_i1)
    sum_sq_i2 = sum(ratingDict[(u, i2)]**2 for u in users_i2)
    
    if sum_sq_i1 == 0 or sum_sq_i2 == 0:
        return 0
    
    return numerator / math.sqrt(sum_sq_i1 * sum_sq_i2)

In [118]:
ms = mostSimilar14(dataset[0]['gameID'], Cosine, 10)

In [119]:
answers['Q15'] = [ms[0][0], ms[-1][0]]

In [120]:
assertFloatList(answers['Q15'], 2)

print(answers['Q15'])

[0.16402709233688792, 0.08907534498272675]


In [121]:
f = open("answers_midterm.txt", 'w')
f.write(str(answers) + '\n')
f.close()