In [8]:
import json
import gzip
import math
from collections import defaultdict
import numpy as np
from sklearn import linear_model
import random
import statistics

In [2]:
def assertFloat(x):
    """Assert that ``x`` can be converted to a built-in ``float``.

    Any error raised by ``float(x)`` itself (e.g. ``ValueError`` for a
    non-numeric string) propagates unchanged.
    """
    converted = float(x)
    assert type(converted) is float

def assertFloatList(items, N):
    """Assert ``items`` holds exactly ``N`` entries, each convertible to ``float``.

    The length is checked first; conversion errors from ``float`` propagate.
    """
    assert len(items) == N
    converted_types = list(map(lambda value: type(float(value)), items))
    assert converted_types == [float] * N

In [3]:
answers = {}

In [4]:
# Read the gzipped training file line by line; each line is evaluated as a
# Python expression and the resulting record is collected into `dataset`.
# NOTE(review): eval() executes whatever the file contains -- only run this on
# a trusted copy of the data. If the records are plain literals,
# ast.literal_eval on the decoded line would be a safer parser (TODO confirm).
# The `with` block closes the handle even if a line fails to parse.
with gzip.open("data/train.json.gz") as z:
    dataset = [eval(l) for l in z]

In [7]:
### Question 1

In [9]:
def MSE(y, ypred):
    """Mean squared error between targets and predictions.

    Parameters
    ----------
    y, ypred : array-like of numbers with matching length
        Plain lists/tuples are accepted as well as NumPy arrays.

    Returns
    -------
    The sum of squared differences divided by ``len(y)``.
    """
    # Coerce up front: subtracting two Python lists would raise TypeError.
    y = np.asarray(y, dtype=float)
    ypred = np.asarray(ypred, dtype=float)
    return np.sum((y - ypred) ** 2) / len(y)

In [13]:
def MAE(y, ypred):
    """Mean absolute error between targets and predictions.

    Parameters
    ----------
    y, ypred : array-like of numbers with matching length
        Plain lists/tuples are accepted as well as NumPy arrays.

    Returns
    -------
    The sum of absolute differences divided by ``len(y)``.
    """
    # Coerce up front: subtracting two Python lists would raise TypeError.
    y = np.asarray(y, dtype=float)
    ypred = np.asarray(ypred, dtype=float)
    return np.sum(np.abs(y - ypred)) / len(y)

In [14]:
# Index every record by its 'userID' and by its 'gameID', then order each
# bucket by the record's 'date' field.
reviewsPerUser = defaultdict(list)
reviewsPerItem = defaultdict(list)

for review in dataset:
    reviewsPerUser[review['userID']].append(review)
    reviewsPerItem[review['gameID']].append(review)

for bucket in reviewsPerUser.values():
    bucket.sort(key=lambda r: r['date'])

for bucket in reviewsPerItem.values():
    bucket.sort(key=lambda r: r['date'])

In [18]:
for d in dataset:
    print(d)
    break

{'hours': 0.3, 'gameID': 'g35322304', 'hours_transformed': 0.37851162325372983, 'early_access': False, 'date': '2015-04-08', 'text': '+1', 'userID': 'u55351001'}


In [19]:
def feat1(d):
    """Question 1 features: a constant offset term followed by ``d['hours']``."""
    return [1, d['hours']]

In [20]:
X = [feat1(datum) for datum in dataset]
y = [len(datum['text']) for datum in dataset]

In [25]:
mod = linear_model.LinearRegression(fit_intercept=False)
mod.fit(X,y)
predictions = mod.predict(X)

In [49]:
theta_1 = mod.coef_[1]
mse_q1 = MSE(np.array(y), np.array(predictions))

In [50]:
answers['Q1'] = [theta_1, mse_q1]

In [51]:
assertFloatList(answers['Q1'], 2)

In [52]:
answers

{'Q1': [0.007857269704335848, 570936.2842458971]}

In [53]:
### Question 2

In [54]:
medianHours = np.median([datum['hours'] for datum in dataset])

In [60]:
dataset[0]

{'hours': 0.3,
 'gameID': 'g35322304',
 'hours_transformed': 0.37851162325372983,
 'early_access': False,
 'date': '2015-04-08',
 'text': '+1',
 'userID': 'u55351001'}

In [62]:
def feat2(d):
    """Question 2 features.

    Returns ``[1, hours, hours_transformed, sqrt(hours), above_median]`` where
    the last entry is 1 when ``d['hours']`` exceeds the module-level
    ``medianHours`` and 0 otherwise.
    """
    h = d['hours']
    above = 1 if h > medianHours else 0
    return [1, h, d['hours_transformed'], np.sqrt(h), above]

In [63]:
X = [feat2(d) for d in dataset]

In [65]:
mod = linear_model.LinearRegression(fit_intercept=False)
mod.fit(X,y)
predictions = mod.predict(X)

In [66]:
mse_q2 = MSE(np.array(y), np.array(predictions))

In [67]:
answers['Q2'] = mse_q2

In [68]:
assertFloat(answers['Q2'])

In [70]:
answers

{'Q1': [0.007857269704335848, 570936.2842458971], 'Q2': 565419.5340402179}

In [69]:
### Question 3

In [71]:
def feat3(d):
    """Question 3 features: offset term plus one indicator per hours threshold.

    Each indicator is 1 when ``d['hours']`` is strictly greater than the
    corresponding cut-off (1, 5, 10, 100, 1000), else 0.
    """
    played = d['hours']
    cutoffs = (1, 5, 10, 100, 1000)
    return [1] + [1 if played > cutoff else 0 for cutoff in cutoffs]

In [72]:
X = [feat3(d) for d in dataset]

In [74]:
mod = linear_model.LinearRegression(fit_intercept=False)
mod.fit(X,y)
predictions = mod.predict(X)

In [75]:
mse_q3 = MSE(np.array(y), np.array(predictions))

In [76]:
answers['Q3'] = mse_q3

In [77]:
assertFloat(answers['Q3'])

In [78]:
answers

{'Q1': [0.007857269704335848, 570936.2842458971],
 'Q2': 565419.5340402179,
 'Q3': 565405.4395885819}

In [79]:
### Question 4

In [80]:
def feat4(d):
    """Question 4 features: offset term plus the character length of ``d['text']``."""
    return [1, len(d['text'])]

In [81]:
X = [feat4(d) for d in dataset]
y = [datum['hours'] for datum in dataset]

In [84]:
mod = linear_model.LinearRegression(fit_intercept=False)
mod.fit(X,y)
predictions = mod.predict(X)

In [85]:
mse = MSE(np.array(y), np.array(predictions))
mae = MAE(np.array(y), np.array(predictions))

In [89]:
print(f'Min error: {np.min(y-predictions)}')
print(f'Max error: {np.max(y-predictions)}')
print(f'Diff: {np.max(y-predictions) - np.min(y-predictions)}')

Min error: -74.17849111189709
Max error: 16473.699183669458
Diff: 16547.877674781354


In [94]:
answers['Q4'] = [mse, mae, "Since we have a sizable range of error (16548), the MAE is better as it uses the absolute differences of errors, whereas the MSE squares errors, which exacerbates our errors, especially when the difference is large."]

In [95]:
assertFloatList(answers['Q4'][:2], 2)

In [96]:
answers

{'Q1': [0.007857269704335848, 570936.2842458971],
 'Q2': 565419.5340402179,
 'Q3': 565405.4395885819,
 'Q4': [75735.70018272949,
  90.35613031985204,
  'Since we have a sizable range of error (16548), the MAE is better as it uses the absolute differences of errors, whereas the MSE squares errors, which exacerbates our errors, especially when the difference is large.']}

In [97]:
### Question 5

In [99]:
y_trans = [datum['hours_transformed'] for datum in dataset]

In [101]:
mod = linear_model.LinearRegression(fit_intercept=False)
mod.fit(X,y_trans)
predictions_trans = mod.predict(X)

In [102]:
mse_trans = MSE(np.array(y_trans), np.array(predictions_trans))

In [103]:
predictions_untrans = (2**(np.array(predictions_trans)))-1

In [105]:
predictions_trans

array([3.63648953, 3.65594151, 3.6802042 , ..., 3.64213688, 3.64924836,
       3.66535376])

In [104]:
predictions_untrans

array([11.43633547, 11.60515115, 11.81893227, ..., 11.48511221,
       11.546807  , 11.68765692])

In [108]:
mse_untrans = MSE(np.array(y), np.array(predictions_untrans))

In [109]:
answers['Q5'] = [mse_trans, mse_untrans]

In [110]:
assertFloatList(answers['Q5'], 2)

In [111]:
answers

{'Q1': [0.007857269704335848, 570936.2842458971],
 'Q2': 565419.5340402179,
 'Q3': 565405.4395885819,
 'Q4': [75735.70018272949,
  90.35613031985204,
  'Since we have a sizable range of error (16548), the MAE is better as it uses the absolute differences of errors, whereas the MSE squares errors, which exacerbates our errors, especially when the difference is large.'],
 'Q5': [5.255254235328314, 78668.56502956731]}

In [112]:
### Question 6

In [124]:
def feat6(d):
    """Question 6 features: offset term plus a 100-slot one-hot of floored hours.

    ``d['hours']`` is rounded down and used as the slot index; any value whose
    floor exceeds 99 lands in the last slot.
    """
    # Clamping to 99 sends every floor above 99 to the final slot.
    slot = min(math.floor(d['hours']), 99)
    one_hot = [0] * 100
    one_hot[slot] = 1
    return [1] + one_hot

In [125]:
X = [feat6(d) for d in dataset]
y = [len(d['text']) for d in dataset]

In [135]:
dataset[88]

{'hours': 281.6,
 'text': "Kerbal Space Program is just that game that when you love space stuff, you simply have no word to describe the beauty of exploration. The challenges you have overcome to reach your goal. What about the physics? Awsome! Keep it on devs, awsome job and keep inspiring people! This isn't just a game. Big things start with small beginnings. Orbital mechanics might save your life one day, in the future ahahahah",
 'gameID': 'g57834425',
 'hours_transformed': 8.142617655446577,
 'early_access': False,
 'user_id': '76561198155677805',
 'date': '2016-11-18',
 'userID': 'u47037823'}

In [139]:
Xtrain, Xvalid, Xtest = X[:len(X)//2], X[len(X)//2:(3*len(X))//4], X[(3*len(X))//4:]
ytrain, yvalid, ytest = y[:len(X)//2], y[len(X)//2:(3*len(X))//4], y[(3*len(X))//4:]

In [146]:
# Fit one ridge model per regularisation strength on the training split and
# score it on the validation split; every model and score is kept for lookup.
models = {}
mses = {}

for c in [1, 10, 100, 1000, 10000]:
    mdl = linear_model.Ridge(alpha=c, fit_intercept=False)
    mdl.fit(Xtrain, ytrain)
    preds = mdl.predict(Xvalid)
    mse = MSE(np.array(yvalid), np.array(preds))
    models[c] = mdl
    mses[c] = mse

# min() keeps the first key with the lowest score, same as a strict '<' scan.
bestC = min(mses, key=mses.get)
bestMSE = mses[bestC]

In [149]:
predictions_test = models[bestC].predict(Xtest)

In [150]:
mse_valid = mses[bestC]

In [151]:
mse_test = MSE(np.array(ytest), np.array(predictions_test))

In [152]:
answers['Q6'] = [bestC, mse_valid, mse_test]

In [153]:
assertFloatList(answers['Q6'], 3)

In [154]:
answers

{'Q1': [0.007857269704335848, 570936.2842458971],
 'Q2': 565419.5340402179,
 'Q3': 565405.4395885819,
 'Q4': [75735.70018272949,
  90.35613031985204,
  'Since we have a sizable range of error (16548), the MAE is better as it uses the absolute differences of errors, whereas the MSE squares errors, which exacerbates our errors, especially when the difference is large.'],
 'Q5': [5.255254235328314, 78668.56502956731],
 'Q6': [100, 581556.5979507361, 560698.1850582251]}

In [155]:
### Question 7

In [156]:
times = [d['hours_transformed'] for d in dataset]
median = statistics.median(times)

In [164]:
notPlayed = [datum['hours'] for datum in dataset]
nNotPlayed = np.sum([1 for h in notPlayed if h < 1])

In [169]:
answers['Q7'] = [median, nNotPlayed]

In [170]:
assertFloatList(answers['Q7'], 2)

In [171]:
answers

{'Q1': [0.007857269704335848, 570936.2842458971],
 'Q2': 565419.5340402179,
 'Q3': 565405.4395885819,
 'Q4': [75735.70018272949,
  90.35613031985204,
  'Since we have a sizable range of error (16548), the MAE is better as it uses the absolute differences of errors, whereas the MSE squares errors, which exacerbates our errors, especially when the difference is large.'],
 'Q5': [5.255254235328314, 78668.56502956731],
 'Q6': [100, 581556.5979507361, 560698.1850582251],
 'Q7': [3.4724877714627436, 19913]}

In [172]:
### Question 8

In [173]:
def feat8(d):
    """Question 8 features: a single entry, the character length of ``d['text']``."""
    text_length = len(d['text'])
    return [text_length]


In [174]:
X = [feat8(d) for d in dataset]
y = [d['hours_transformed'] > median for d in dataset]

In [177]:
mod = linear_model.LogisticRegression(class_weight='balanced')
mod.fit(X,y)
predictions = mod.predict(X) # Binary vector of predictions

In [247]:
def rates(predictions, y):
    """Confusion-matrix counts for binary predictions.

    Entries are interpreted by truthiness and compared position by position
    over ``range(len(y))``.

    Returns
    -------
    (TP, TN, FP, FN) as integers.
    """
    TP = TN = FP = FN = 0
    for idx in range(len(y)):
        predicted, actual = predictions[idx], y[idx]
        if actual:
            if predicted:
                TP += 1
            else:
                FN += 1
        else:
            if predicted:
                FP += 1
            else:
                TN += 1
    return TP, TN, FP, FN

In [248]:
TP, TN, FP, FN = rates(predictions, y)

In [249]:
SEN = TP / (TP + FN)  # sensitivity / TPR 
FPR = FP / (FP + TN)  # FPR
SPE = TN / (TN + FP)  # specificity / TNR
BER = 0.5*(FPR + (1-SEN))

In [250]:
answers['Q8'] = [TP, TN, FP, FN, BER]

In [251]:
assertFloatList(answers['Q8'], 5)

In [254]:
answers

{'Q1': [0.007857269704335848, 570936.2842458971],
 'Q2': 565419.5340402179,
 'Q3': 565405.4395885819,
 'Q4': [75735.70018272949,
  90.35613031985204,
  'Since we have a sizable range of error (16548), the MAE is better as it uses the absolute differences of errors, whereas the MSE squares errors, which exacerbates our errors, especially when the difference is large.'],
 'Q5': [5.255254235328314, 78668.56502956731],
 'Q6': [100, 581556.5979507361, 560698.1850582251],
 'Q7': [3.4724877714627436, 19913],
 'Q8': [24656, 67811, 20007, 62526, 0.472506390561468]}

In [252]:
### Question 9

In [None]:
# confidences = reg_q5.decision_function(X)

# sortedByConfidence = list(zip(confidences, y))
# sortedByConfidence.sort(reverse=True)

# sortedByConfidence

# precs = []

# for k in [1,100,1000,10000]:
#     topK = sortedByConfidence[:k]
#     prec = np.sum([1 if pred[1] else 0 for pred in topK]) / k
#     precs.append(prec)

In [275]:
def precision(label):
    """Fraction of entries in a ranked slice whose true label is truthy.

    Parameters
    ----------
    label : sequence of (confidence, true_label) pairs
        The slice to score. (The parameter name is kept from the original
        lambda; it is the whole slice, not a single label.)

    Returns
    -------
    Positives in the slice divided by the slice length; 0.0 for an empty slice.
    """
    # The original lambda ignored its argument and read the global `topK`.
    if len(label) == 0:
        return 0.0
    return np.sum([1 if pair[1] else 0 for pair in label]) / len(label)


def recall(label, labels=None):
    """Fraction of all positives that appear in a ranked slice.

    Parameters
    ----------
    label : sequence of (confidence, true_label) pairs
        The slice to score (name kept from the original lambda).
    labels : sequence of truthy/falsy values, optional
        Full ground-truth vector used for the denominator. Defaults to the
        module-level ``y``, which is what the original lambda read.

    Returns
    -------
    Positives in the slice divided by the total number of positives; 0.0 when
    there are no positives at all.
    """
    if labels is None:
        labels = y
    total_positives = np.sum(labels)
    if total_positives == 0:
        return 0.0
    return np.sum([1 if pair[1] else 0 for pair in label]) / total_positives

In [276]:
confidences = mod.decision_function(X)
sortedByConfidence = list(zip(confidences, y))
sortedByConfidence.sort(reverse=True)

In [292]:
# Precision / recall at several cut-offs of the confidence-sorted list.
# Each cut-off is widened to include every later entry whose confidence ties
# with the last entry of the slice.
precs = []
recs = []

for K in [5, 10, 100, 1000]:
    topK = sortedByConfidence[:K]
    threshold = topK[-1][0]
    end = len(topK)
    # Bounds check: ties may run to the end of the list, or the list may be
    # shorter than K; indexing past the end would raise IndexError.
    while end < len(sortedByConfidence) and sortedByConfidence[end][0] == threshold:
        topK.append(sortedByConfidence[end])
        end += 1
    precs.append(precision(topK))
    recs.append(recall(topK))


In [293]:
answers['Q9'] = precs

In [294]:
assertFloatList(answers['Q9'], 4)

In [295]:
answers

{'Q1': [0.007857269704335848, 570936.2842458971],
 'Q2': 565419.5340402179,
 'Q3': 565405.4395885819,
 'Q4': [75735.70018272949,
  90.35613031985204,
  'Since we have a sizable range of error (16548), the MAE is better as it uses the absolute differences of errors, whereas the MSE squares errors, which exacerbates our errors, especially when the difference is large.'],
 'Q5': [5.255254235328314, 78668.56502956731],
 'Q6': [100, 581556.5979507361, 560698.1850582251],
 'Q7': [3.4724877714627436, 19913],
 'Q8': [24656, 67811, 20007, 62526, 0.472506390561468],
 'Q9': [0.5454545454545454, 0.5454545454545454, 0.67, 0.685]}

In [296]:
### Question 10

In [297]:
y_trans = [d['hours_transformed'] for d in dataset]

In [298]:
mod = linear_model.LinearRegression(fit_intercept=False)
mod.fit(X,y_trans)
predictions_trans = mod.predict(X)

In [342]:
# Decision threshold on the regression output: the median prediction shifted
# by a hand-tuned offset of 0.31.
# NOTE(review): presumably the offset was picked to lower the BER computed
# below -- confirm how it was selected.
# Defined before it is displayed so the notebook survives Restart & Run All.
predictions_thresh = np.median(predictions_trans)+0.31
predictions_thresh

In [364]:
binaryPreds = [True if pred > predictions_thresh else False for pred in predictions_trans]

In [365]:
TP, TN, FP, FN = rates(binaryPreds, y)

In [366]:
SEN = TP / (TP + FN)  # sensitivity / TPR 
FPR = FP / (FP + TN)  # FPR
SPE = TN / (TN + FP)  # specificity / TNR
BER = 0.5*(FPR + (1-SEN))

In [367]:
BER

0.47126979143658965

In [369]:
your_threshold = predictions_thresh

In [370]:
answers['Q10'] = [your_threshold, BER]

In [371]:
assertFloatList(answers['Q10'], 2)

In [372]:
answers

{'Q1': [0.007857269704335848, 570936.2842458971],
 'Q2': 565419.5340402179,
 'Q3': 565405.4395885819,
 'Q4': [75735.70018272949,
  90.35613031985204,
  'Since we have a sizable range of error (16548), the MAE is better as it uses the absolute differences of errors, whereas the MSE squares errors, which exacerbates our errors, especially when the difference is large.'],
 'Q5': [5.255254235328314, 78668.56502956731],
 'Q6': [100, 581556.5979507361, 560698.1850582251],
 'Q7': [3.4724877714627436, 19913],
 'Q8': [24656, 67811, 20007, 62526, 0.472506390561468],
 'Q9': [0.5454545454545454, 0.5454545454545454, 0.67, 0.685],
 'Q10': [0.592516987127106, 0.47126979143658965]}

In [373]:
### Question 11

In [374]:
dataTrain = dataset[:int(len(dataset)*0.9)]
dataTest = dataset[int(len(dataset)*0.9):]

In [375]:
userMedian = defaultdict(list)
itemMedian = defaultdict(list)

# Compute medians on training data

In [377]:
dataTrain[0]

{'hours': 0.3,
 'gameID': 'g35322304',
 'hours_transformed': 0.37851162325372983,
 'early_access': False,
 'date': '2015-04-08',
 'text': '+1',
 'userID': 'u55351001'}

In [381]:
# Gather the training 'hours' values per user and per game in scratch dicts,
# then publish the per-key medians. Building from scratch keeps this cell
# idempotent: appending into userMedian/itemMedian directly breaks on a re-run,
# because by then their values are medians rather than lists.
hoursByUser = defaultdict(list)
hoursByItem = defaultdict(list)

for datum in dataTrain:
    hoursByUser[datum['userID']].append(datum['hours'])
    hoursByItem[datum['gameID']].append(datum['hours'])

# Kept as defaultdict(list) so lookups of unseen keys behave as they did before.
userMedian = defaultdict(list, {u: np.median(h) for u, h in hoursByUser.items()})
itemMedian = defaultdict(list, {i: np.median(h) for i, h in hoursByItem.items()})

In [382]:
answers['Q11'] = [itemMedian['g35322304'], userMedian['u55351001']]

In [383]:
assertFloatList(answers['Q11'], 2)

In [387]:
answers

{'Q1': [0.007857269704335848, 570936.2842458971],
 'Q2': 565419.5340402179,
 'Q3': 565405.4395885819,
 'Q4': [75735.70018272949,
  90.35613031985204,
  'Since we have a sizable range of error (16548), the MAE is better as it uses the absolute differences of errors, whereas the MSE squares errors, which exacerbates our errors, especially when the difference is large.'],
 'Q5': [5.255254235328314, 78668.56502956731],
 'Q6': [100, 581556.5979507361, 560698.1850582251],
 'Q7': [3.4724877714627436, 19913],
 'Q8': [24656, 67811, 20007, 62526, 0.472506390561468],
 'Q9': [0.5454545454545454, 0.5454545454545454, 0.67, 0.685],
 'Q10': [0.592516987127106, 0.47126979143658965],
 'Q11': [0.5, 3.9]}

In [386]:
### Question 12

In [388]:
globalMedian = np.median([datum['hours'] for datum in dataTrain])

In [398]:
def f12(u,i):
    """Predict 1/0 from the item's median, falling back to the user's median.

    Parameters
    ----------
    u : user id looked up in the module-level ``userMedian``
    i : item id looked up in the module-level ``itemMedian``

    Returns
    -------
    1 when the selected median exceeds ``globalMedian``, otherwise 0. If
    neither the item nor the user has a recorded median, returns 0.
    """
    # Direct dict membership: no per-call key list, and no accidental insertion
    # of an empty entry into the defaultdict.
    if i in itemMedian:
        return 1 if itemMedian[i] > globalMedian else 0
    if u in userMedian:
        return 1 if userMedian[u] > globalMedian else 0
    # Neither seen in training: previously this compared an empty list with a
    # number; default to the negative class instead.
    return 0

In [399]:
preds = [f12(d['userID'], d['gameID']) for d in dataTest]

In [402]:
testMedian = np.median([datum['hours'] for datum in dataTest])
y = [1 if datum['hours'] > testMedian else 0 for datum in dataTest]

In [411]:
accuracy = np.sum(np.array(preds)==np.array(y)) / len(y)

In [412]:
answers['Q12'] = accuracy

In [413]:
assertFloat(answers['Q12'])

In [414]:
answers

{'Q1': [0.007857269704335848, 570936.2842458971],
 'Q2': 565419.5340402179,
 'Q3': 565405.4395885819,
 'Q4': [75735.70018272949,
  90.35613031985204,
  'Since we have a sizable range of error (16548), the MAE is better as it uses the absolute differences of errors, whereas the MSE squares errors, which exacerbates our errors, especially when the difference is large.'],
 'Q5': [5.255254235328314, 78668.56502956731],
 'Q6': [100, 581556.5979507361, 560698.1850582251],
 'Q7': [3.4724877714627436, 19913],
 'Q8': [24656, 67811, 20007, 62526, 0.472506390561468],
 'Q9': [0.5454545454545454, 0.5454545454545454, 0.67, 0.685],
 'Q10': [0.592516987127106, 0.47126979143658965],
 'Q11': [0.5, 3.9],
 'Q12': 0.7410857142857142}

In [415]:
### Question 13

In [416]:
usersPerItem = defaultdict(set) # Maps an item to the users who rated it
itemsPerUser = defaultdict(set) # Maps a user to the items that they rated
itemNames = {}

for d in dataset:
    user,item = d['userID'], d['gameID']
    usersPerItem[item].add(user)
    itemsPerUser[user].add(item)

In [418]:
def Jaccard(s1, s2):
    """Jaccard similarity of two sets: |intersection| / |union|.

    Returns 0 when the union is empty.
    """
    union_size = len(s1.union(s2))
    if not union_size:
        return 0
    return len(s1.intersection(s2)) / union_size

In [471]:
def mostSimilar(i, func, N):
    """Return the ``N`` items most similar to item ``i``.

    ``func`` is called with the user set of ``i`` and the user set of each
    other key of the module-level ``usersPerItem``. The result is a list of
    ``(similarity, item)`` tuples sorted in descending order.
    """
    scored = [
        (func(usersPerItem[i], usersPerItem[other]), other)
        for other in usersPerItem
        if other != i
    ]
    return sorted(scored, reverse=True)[:N]

In [472]:
ms = mostSimilar(dataset[0]['gameID'], Jaccard, 10)

In [473]:
answers['Q13'] = [ms[0][0], ms[-1][0]]

In [474]:
assertFloatList(answers['Q13'], 2)

In [475]:
answers

{'Q1': [0.007857269704335848, 570936.2842458971],
 'Q2': 565419.5340402179,
 'Q3': 565405.4395885819,
 'Q4': [75735.70018272949,
  90.35613031985204,
  'Since we have a sizable range of error (16548), the MAE is better as it uses the absolute differences of errors, whereas the MSE squares errors, which exacerbates our errors, especially when the difference is large.'],
 'Q5': [5.255254235328314, 78668.56502956731],
 'Q6': [100, 581556.5979507361, 560698.1850582251],
 'Q7': [3.4724877714627436, 19913],
 'Q8': [24656, 67811, 20007, 62526, 0.472506390561468],
 'Q9': [0.5454545454545454, 0.5454545454545454, 0.67, 0.685],
 'Q10': [0.592516987127106, 0.47126979143658965],
 'Q11': [0.5, 3.9],
 'Q12': 0.7410857142857142,
 'Q13': [0.07988165680473373, 0.04390243902439024]}

In [476]:
### Question 14

In [477]:
def mostSimilar14(i, func, N):
    """Return the ``N`` items most similar to item ``i``.

    Unlike ``mostSimilar``, ``func`` receives the two item ids themselves
    (``func(i, other)``), not their user sets. Candidates are the keys of the
    module-level ``usersPerItem``; the result is a list of
    ``(similarity, item)`` tuples sorted in descending order.
    """
    ranked = []
    for candidate in usersPerItem:
        if candidate != i:
            ranked.append((func(i, candidate), candidate))
    ranked.sort(reverse=True)
    return ranked[:N]

In [478]:
median = np.median([datum['hours'] for datum in dataset])

In [479]:
ratingDict = {}

for d in dataset:
    u,i = d['userID'], d['gameID']
    lab = 1 if d['hours'] > median else -1
    ratingDict[(u,i)] = lab

In [488]:
def Cosine(i1, i2):
    """Cosine-style similarity between two items.

    The numerator sums rating products over users present for both items; each
    norm sums squared ratings over all users of the respective item. Ratings
    come from the module-level ``ratingDict`` keyed by ``(user, item)``.
    Returns 0 when the denominator is 0.
    """
    users1, users2 = usersPerItem[i1], usersPerItem[i2]
    shared = users1.intersection(users2)
    dot = np.sum([ratingDict[(u, i1)] * ratingDict[(u, i2)] for u in shared])
    sq1 = np.sum([ratingDict[(u, i1)] ** 2 for u in users1])
    sq2 = np.sum([ratingDict[(u, i2)] ** 2 for u in users2])
    scale = math.sqrt(sq1) * math.sqrt(sq2)
    return dot / scale if scale != 0 else 0

In [489]:
ms = mostSimilar14(dataset[0]['gameID'], Cosine, 10)

In [490]:
answers['Q14'] = [ms[0][0], ms[-1][0]]

In [491]:
assertFloatList(answers['Q14'], 2)

In [492]:
answers

{'Q1': [0.007857269704335848, 570936.2842458971],
 'Q2': 565419.5340402179,
 'Q3': 565405.4395885819,
 'Q4': [75735.70018272949,
  90.35613031985204,
  'Since we have a sizable range of error (16548), the MAE is better as it uses the absolute differences of errors, whereas the MSE squares errors, which exacerbates our errors, especially when the difference is large.'],
 'Q5': [5.255254235328314, 78668.56502956731],
 'Q6': [100, 581556.5979507361, 560698.1850582251],
 'Q7': [3.4724877714627436, 19913],
 'Q8': [24656, 67811, 20007, 62526, 0.472506390561468],
 'Q9': [0.5454545454545454, 0.5454545454545454, 0.67, 0.685],
 'Q10': [0.592516987127106, 0.47126979143658965],
 'Q11': [0.5, 3.9],
 'Q12': 0.7410857142857142,
 'Q13': [0.07988165680473373, 0.04390243902439024],
 'Q14': [0.10251693271055495, 0.061667331307041336]}

In [485]:
### Question 15

In [493]:
ratingDict = {}

for d in dataset:
    u,i = d['userID'], d['gameID']
    lab = d['hours_transformed']
    ratingDict[(u,i)] = lab

In [495]:
ms = mostSimilar14(dataset[0]['gameID'], Cosine, 10)

In [496]:
answers['Q15'] = [ms[0][0], ms[-1][0]]

In [497]:
assertFloatList(answers['Q15'], 2)

In [498]:
answers

{'Q1': [0.007857269704335848, 570936.2842458971],
 'Q2': 565419.5340402179,
 'Q3': 565405.4395885819,
 'Q4': [75735.70018272949,
  90.35613031985204,
  'Since we have a sizable range of error (16548), the MAE is better as it uses the absolute differences of errors, whereas the MSE squares errors, which exacerbates our errors, especially when the difference is large.'],
 'Q5': [5.255254235328314, 78668.56502956731],
 'Q6': [100, 581556.5979507361, 560698.1850582251],
 'Q7': [3.4724877714627436, 19913],
 'Q8': [24656, 67811, 20007, 62526, 0.472506390561468],
 'Q9': [0.5454545454545454, 0.5454545454545454, 0.67, 0.685],
 'Q10': [0.592516987127106, 0.47126979143658965],
 'Q11': [0.5, 3.9],
 'Q12': 0.7410857142857142,
 'Q13': [0.07988165680473373, 0.04390243902439024],
 'Q14': [0.10251693271055495, 0.061667331307041336],
 'Q15': [0.3301567230633553, 0.12290154232706592]}

In [499]:
# Persist the collected answers as a single line of text. The `with` block
# closes the file even if the write raises.
with open("answers_midterm.txt", 'w') as f:
    f.write(str(answers) + '\n')