In [82]:
import gzip
import json
import math
import random
import statistics
from collections import defaultdict

import numpy as np
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error

In [83]:
def assertFloat(x):
    """Assert that ``x`` converts to a built-in ``float``.

    Any exception raised by ``float(x)`` for non-convertible input propagates.
    """
    converted = float(x)
    assert type(converted) == float

def assertFloatList(items, N):
    """Assert that ``items`` has exactly ``N`` entries, each convertible to ``float``."""
    assert len(items) == N
    for entry in items:
        assert type(float(entry)) == float

In [84]:
answers = {}

In [85]:
# From https://cseweb.ucsd.edu/classes/fa24/cse258-b/files/steam.json.gz
z = gzip.open("steam.json.gz")

Each data point is a record of a video game review. A few relevant fields include:
- `userID`, `gameID`: the ID of the user and the game.
- `hours`: the amount of time the user played the game.
- `hours_transformed`: $\log_2(\text{hours} + 1)$; i.e., the above, log-transformed.
- `text`: the text of the user's review.

In [86]:
# Parse every line of the open gzip handle `z` into a Python object and collect them.
dataset = []
for l in z:
    # NOTE(review): eval() executes arbitrary expressions. That is tolerable only
    # because the file is a trusted course dataset; switch to a literal-only
    # parser if the data source ever changes.
    d = eval(l)
    dataset.append(d)

In [87]:
dataset[0]

{'hours': 0.3,
 'gameID': 'g35322304',
 'hours_transformed': 0.37851162325372983,
 'early_access': False,
 'date': '2015-04-08',
 'text': '+1',
 'userID': 'u55351001'}

In [88]:
z.close()

## Section 1: Regression

### Question 1
Fit a linear regressor of the form
$$\text{time played} = \theta_0 + \theta_1 \cdot (\text{review length})$$
where the review length is the number of characters in the review (i.e., `len(d['text'])`) and the time played is
the original (i.e., not transformed) `hours` variable.
Report the value of $\theta_1$ and the Mean Squared Error of the prediction.
Be careful to use `fit_intercept=False` if manually including an offset feature.

In [89]:
def MSE(y, y_pred):
    """Return the mean squared error between labels ``y`` and predictions ``y_pred``."""
    error = mean_squared_error(y, y_pred)
    return error

In [90]:
def feat1(d):
    """Return the review-length feature: the number of characters in ``d['text']``."""
    review_text = d['text']
    return len(review_text)

In [91]:
# Q1 design matrix: a constant offset column followed by the review length.
X = [[1, feat1(record)] for record in dataset]
# Q1 target: the raw (untransformed) hours played.
y = [record['hours'] for record in dataset]

In [92]:
mod = LinearRegression(fit_intercept=False)
mod.fit(X, y)

In [93]:
y_pred = mod.predict(X)
mse1 = MSE(y, y_pred)

In [94]:
answers['Q1'] = [float(mod.coef_[1]), float(mse1)] # Remember to cast things to float rather than (e.g.) np.float64

In [95]:
answers['Q1']

[0.0010422806169488943, 75735.70018272949]

In [96]:
assertFloatList(answers['Q1'], 2)

### Question 2
Split your data into train and test portions with ratios 80/20, following the code in the stub. Train a
model as in Q1 using the training set and report (a) its Mean Squared Error; (b) how often the model
underpredicts (i.e., prediction less than label); and (c) how often the model overpredicts (i.e., prediction
greater than label) (all quantities on test set)

In [97]:
# Positional 80/20 split: the first 80% of records train, the remainder test.
n_train = int(len(dataset)*0.8)
dataTrain, dataTest = dataset[:n_train], dataset[n_train:]

In [98]:
# Training design matrix (offset column + review length) and raw-hours target.
X_train = [[1, feat1(record)] for record in dataTrain]
y_train = [record['hours'] for record in dataTrain]

In [99]:
mod = LinearRegression(fit_intercept=False)
mod.fit(X_train, y_train)

In [100]:
# The test design matrix must mirror X_train (offset column + review length):
# the model was fit with fit_intercept=False on two-column rows, so a
# single-column X_test would not match the fitted feature count.
X_test = [[1, feat1(d)] for d in dataTest]
y_test = [d['hours'] for d in dataTest]

In [101]:
X_test = [[1, feat1(d)] for d in dataTest]  # Ensure X_test has the correct number of features

y_pred_test = mod.predict(X_test)

mse2 = MSE(y_test, y_pred_test)  # Use the MSE function defined earlier

In [102]:
# Tally Q2 test-set errors by direction; exact ties count toward neither.
under = sum(1 for actual, predicted in zip(y_test, y_pred_test) if predicted < actual)
over = sum(1 for actual, predicted in zip(y_test, y_pred_test) if predicted > actual)

In [103]:
# Cast to built-in types (as done for Q1) so no numpy scalar repr reaches the answers file.
answers['Q2'] = [float(mse2), int(under), int(over)]

In [104]:
answers['Q2']

[76047.19578054463, 5249, 29751]

In [105]:
assertFloatList(answers['Q2'], 3)

### Question 3
You (probably) found that the model overpredicts much more often than it underpredicts. Let’s try to
‘fix’ our model to correct this behavior. Attempt the following interventions:

(a) Delete outliers from the training set. Specifically, keep only those instances with target values among
the lowest 90% (among values in the training set).

(b) Use the `hours_transformed` variable as the target (i.e., $\log_2(\text{hours} + 1)$).

(c) (Hard) Fit a model with parameters θ0 and θ1 (as in Q2) such that:

• θ0 is the same value as in your solution to Q2

• θ1 is chosen such that the prediction for the median review length in the training set is equal
to the median number of hours. Hint: it is probably easiest to just solve this analytically, i.e., don’t try to use library function.
(that is, θ1 is chosen such that your line passes through (0, θ0) and (median length, median hours))
For each of the above interventions, report how many times the model overpredicts and underpredicts on
the corresponding test set.

In [106]:
# Threshold for Q3(a): the value at the 90% position of the sorted *training* targets.
# Uses y_train rather than the full-dataset y, so no test labels influence the
# cut-off and the cell does not depend on whatever `y` is currently bound to.
y2 = sorted(y_train)
perc90 = y2[int(len(y2) * 0.9)]  # 90th percentile threshold for hours

In [107]:
# 3a
# Drop training outliers: keep only records whose hours do not exceed perc90.
kept_records = [d for d in dataTrain if d['hours'] <= perc90]
X3a = [[1, feat1(d)] for d in kept_records]  # offset column + review length
y3a = [d['hours'] for d in kept_records]

# Fit on the trimmed training set (the offset is the explicit first column).
mod3a = LinearRegression(fit_intercept=False)
mod3a.fit(X3a, y3a)

# Score the untouched test set.
pred3a = mod3a.predict(X_test)


In [108]:
# Tally Q3(a) test-set errors by direction; exact ties count toward neither.
under3a = sum(1 for actual, predicted in zip(y_test, pred3a) if predicted < actual)
over3a = sum(1 for actual, predicted in zip(y_test, pred3a) if predicted > actual)

In [109]:
# 3b
# Same design matrix as Q2; the target becomes log2(hours + 1).
X3b = X_train
y3b = [np.log2(d['hours'] + 1) for d in dataTrain]

# Fit against the log-scale target.
mod3b = LinearRegression(fit_intercept=False)
mod3b.fit(X3b, y3b)

# Test-set predictions, still on the log scale.
pred3b_log = mod3b.predict(X_test)

# Invert the transform so predictions are comparable with the raw hours in y_test.
pred3b = [2 ** log_hours - 1 for log_hours in pred3b_log]

In [110]:
# Tally Q3(b) test-set errors by direction; exact ties count toward neither.
under3b = sum(1 for actual, predicted in zip(y_test, pred3b) if predicted < actual)
over3b = sum(1 for actual, predicted in zip(y_test, pred3b) if predicted > actual)


In [111]:
# 3c
# Medians of the training feature (review length) and target (hours).
median_review_length = np.median([feat1(d) for d in dataTrain])
median_hours = np.median([d['hours'] for d in dataTrain])

# theta_0 is the offset learned in Q2: `mod` was fit with fit_intercept=False on
# [1, length] rows, so its first coefficient is the intercept. Read it directly;
# a silent fallback to 0 would produce a different line without any warning.
theta_0 = float(mod.coef_[0])

# Slope of the line through (0, theta_0) and (median length, median hours).
theta_1_custom = (median_hours - theta_0) / median_review_length

# Predictions on the test set from the hand-built line.
pred3c = [theta_0 + theta_1_custom * feat1(d) for d in dataTest]


In [112]:
# Tally Q3(c) test-set errors by direction; exact ties count toward neither.
under3c = sum(1 for actual, predicted in zip(y_test, pred3c) if predicted < actual)
over3c = sum(1 for actual, predicted in zip(y_test, pred3c) if predicted > actual)

In [113]:
answers['Q3'] = [under3a, over3a, under3b, over3b, under3c, over3c]

In [114]:
answers['Q3']

[13084, 21916, 15941, 19059, 20808, 14192]

In [115]:
assertFloatList(answers['Q3'], 6)

## Section 2: Classification

### Question 4
Following the training/test splits you used in Q2, solve the same problem using a classifier, 

i.e., use review length to predict whether the time played is above the median.

Use a `linear_model.LogisticRegression`

model with a regularization strength of C = 1. 

Report (a) the number of True Positives; (b) the number
of True Negatives; (c) the number of False Positives; (d) the number of False Negatives; and (e) the
Balanced Error Rate (BER) of the classifier on the test set

In [116]:
# use review length to predict whether the time played is above the median.
median_hours = np.median([d['hours'] for d in dataTrain])

In [117]:
# Binary labels: 1 when hours played is strictly above the training median, else 0.
y = [int(d['hours'] > median_hours) for d in dataTrain]      # Training target
ytest = [int(d['hours'] > median_hours) for d in dataTest]   # Test target


In [118]:
# Rebind X / Xtest for the classifier: review length only, with no explicit
# offset column (X previously held the Q1 design matrix over the whole dataset).
X = [[feat1(record)] for record in dataTrain]      # Training features
Xtest = [[feat1(record)] for record in dataTest]   # Test features

In [119]:
# Regularization strength is spelled out (C=1, as the question asks) instead of
# relying on the library default, matching how the Q6 models are constructed.
mod = linear_model.LogisticRegression(C=1)
mod.fit(X,y)
predictions = mod.predict(Xtest) # Binary vector of predictions

In [120]:
# Confusion-matrix cells on the test set. labels=[0, 1] pins the 2x2 layout so the
# four-way unpacking holds even if one class is absent from ytest/predictions.
# (confusion_matrix is imported in the notebook's top import cell.)
TN, FP, FN, TP = confusion_matrix(ytest, predictions, labels=[0, 1]).ravel()

# Balanced Error Rate: mean of the false-positive rate and the false-negative rate.
BER = float(0.5 * (FP / (FP + TN) + FN / (FN + TP)))

# Store the counts as built-in ints so they serialise cleanly in `answers`.
TN, FP, FN, TP = int(TN), int(FP), int(FN), int(TP)


In [121]:
# Cast to built-in types: confusion-matrix cells and BER are numpy scalars otherwise.
answers['Q4'] = [int(TP), int(TN), int(FP), int(FN), float(BER)]

In [122]:
answers['Q4']

[4525, 13909, 3641, 12925, 0.474076033273741]

In [123]:
assertFloatList(answers['Q4'], 5)

### Question 5
Using the same model, report

• How often the model overpredicts (i.e., predicts ‘above median’ when the target is below the median). Just report the number of overpredictions rather than e.g. the rate; the autograder will accept answers roughly in the correct
range

• How often the model underpredicts (i.e., predicts ‘below median’ when the target is above the
median). Hint: rather than computing these values manually, they can be expressed in terms of rates (true positives, true negatives,
etc.).

In [124]:
# Overpredictions are false positives and underpredictions are false negatives;
# cast to built-in int so no numpy scalar reaches the answers file.
answers['Q5'] = [int(FP), int(FN)] #[overpredicts, underpredicts]

In [125]:
answers['Q5']

[3641, 12925]

In [126]:
assertFloatList(answers['Q5'], 2)

### Question 6
The dataset spans years from 2010-2018 (the year of each entry can be found by taking `int(d['date'][:4])`).

Using the same type of model as in Q4, compute the BER for:

(a) those reviews written in 2014 or earlier.

(b) those reviews written in 2015 or later.

(c) How well does a model trained only on reviews written in 2014 or earlier (i.e., as a training set)
perform on reviews written in 2015 or later (i.e., as a test set)?

(d) How well does a model trained only on reviews written in 2015 or later perform on reviews written
in 2014 or earlier?

(all models should be trained using (an appropriate fraction of) the training set and evaluated using (an appropriate fraction of) the test set)

In [127]:
# Helper function to compute BER given a model and test data
def compute_ber(model, X_test, y_test):
    """Return the Balanced Error Rate of ``model`` on ``(X_test, y_test)``.

    The BER is the mean of the false-positive rate and the false-negative rate
    for binary labels 0/1. The result is a built-in ``float``.
    """
    predictions = model.predict(X_test)
    # labels=[0, 1] forces a 2x2 matrix so the unpacking below cannot fail when
    # one class is missing from both the labels and the predictions.
    TN, FP, FN, TP = confusion_matrix(y_test, predictions, labels=[0, 1]).ravel()
    BER = 0.5 * (FP / (FP + TN) + FN / (FN + TP))
    return float(BER)

In [128]:
# (a) those reviews written in 2014 or earlier.
train_2014 = [d for d in dataTrain if int(d['date'][:4]) <= 2014]
test_2014 = [d for d in dataTest if int(d['date'][:4]) <= 2014]

# Review-length feature and above-median label for each period subset.
X2014 = [[feat1(d)] for d in train_2014]
y2014 = [1 if d['hours'] > median_hours else 0 for d in train_2014]
X2014test = [[feat1(d)] for d in test_2014]
y2014test = [1 if d['hours'] > median_hours else 0 for d in test_2014]


In [129]:
# Logistic model trained only on training reviews written in 2014 or earlier,
# scored on the test reviews from the same period.
model_2014 = LogisticRegression(C=1, fit_intercept=True)
model_2014.fit(X2014, y2014)
BER_A = compute_ber(model_2014, X2014test, y2014test)

In [130]:
# (b) those reviews written in 2015 or later.
train_2015 = [d for d in dataTrain if int(d['date'][:4]) >= 2015]
test_2015 = [d for d in dataTest if int(d['date'][:4]) >= 2015]

# Review-length feature and above-median label for each period subset.
X2015 = [[feat1(d)] for d in train_2015]
y2015 = [1 if d['hours'] > median_hours else 0 for d in train_2015]
X2015test = [[feat1(d)] for d in test_2015]
y2015test = [1 if d['hours'] > median_hours else 0 for d in test_2015]

In [131]:
# Logistic model trained only on training reviews written in 2015 or later,
# scored on the test reviews from the same period.
model_2015 = LogisticRegression(C=1, fit_intercept=True)
model_2015.fit(X2015, y2015)
BER_B = compute_ber(model_2015, X2015test, y2015test)

In [132]:
# Part (c): Train on 2014 or earlier, test on 2015 or later
BER_C = compute_ber(model_2014, X2015test, y2015test)

# Part (d): Train on 2015 or later, test on 2014 or earlier
BER_D = compute_ber(model_2015, X2014test, y2014test)

In [133]:
# Cast to built-in float so no numpy scalar repr reaches the answers file.
answers['Q6'] = [float(BER_A), float(BER_B), float(BER_C), float(BER_D)]

In [134]:
answers['Q6']

[0.4799670470952742,
 0.47394608476712863,
 0.4820528229832485,
 0.4722496441821391]

In [135]:
assertFloatList(answers['Q6'], 4)

## Section 3: Recommendation
For these questions you should use the first 80% of samples as a training set and the last 20% as a test set,
i.e., the same splits as we constructed in previous questions.

### Question 7
Using the training set, compute the 10 users most similar to the first user in the dataset (i.e., the user from
the first entry) in terms of Jaccard similarity. Report the Jaccard similarity of the first and tenth most
similar users.

In [136]:
# Interaction indexes built from the training split only.
usersPerItem = defaultdict(set)     # item -> users who reviewed it
itemsPerUser = defaultdict(set)     # user -> items they reviewed
reviewsPerUser = defaultdict(list)
reviewsPerItem = defaultdict(list)

for d in dataTrain:
    user, item = d['userID'], d['gameID']
    itemsPerUser[user].add(item)
    usersPerItem[item].add(user)

In [137]:
# The reference user is whoever wrote the first training record.
first_user = dataTrain[0]['userID']
first_user_items = itemsPerUser[first_user]

# Score every other user against the reference user with Jaccard similarity.
jaccard_similarities = []
for user, user_items in itemsPerUser.items():
    if user == first_user:
        continue
    shared = first_user_items & user_items
    combined = first_user_items | user_items
    score = len(shared) / len(combined) if combined else 0
    jaccard_similarities.append((score, user))


In [138]:
# Rank candidates from most to least similar, comparing on the score only.
jaccard_similarities.sort(key=lambda pair: pair[0], reverse=True)

# The ten best matches.
top_10_similarities = jaccard_similarities[:10]

# Similarity scores of the best and the tenth-best match.
first, tenth = top_10_similarities[0][0], top_10_similarities[9][0]

In [139]:
answers['Q7'] = [first, tenth]

In [140]:
answers['Q7']

[0.10909090909090909, 0.08235294117647059]

In [141]:
assertFloatList(answers['Q7'], 2)

### Question 8
Implement a function to predict `hours_transformed` based on a weighted average of Jaccard similarities,

i.e.: $$\text{hours\_transformed}(u, i) = \frac{\sum_{v \in U_i} R_{v,i} \cdot \text{Sim}(u, v)}{\sum_{v \in U_i} \text{Sim}(u, v)}.$$

Implement two versions, one based on the similarity user-to-user similarity (i.e., as in the above equation)
and one based on item-to-item similarity. Handle ‘edge cases’ (unseen user/item or zero denominator)
by returning the global average (on the training set). Report the MSE of each predictor on the test set.

Note: a correct implementation should run in less than a minute.

In [142]:
usersPerItem = defaultdict(set)  # Maps item to set of users who reviewed it
itemsPerUser = defaultdict(set)  # Maps user to set of items they reviewed
hoursTransformedPerUserItem = {}  # Stores hours.transformed for each (user, item) pair

In [143]:
# Fill the interaction indexes and the (user, item) -> target lookup from training data.
for d in dataTrain:
    user, item = d['userID'], d['gameID']
    hours_transformed = d['hours_transformed']

    hoursTransformedPerUserItem[(user, item)] = hours_transformed
    itemsPerUser[user].add(item)
    usersPerItem[item].add(user)

In [144]:
# Compute the global average of hours.transformed in the training set (for edge cases)
global_avg = sum(hoursTransformedPerUserItem.values()) / len(hoursTransformedPerUserItem)

# Helper function to calculate Jaccard similarity
def jaccard_similarity(set1, set2):
    """Return |set1 ∩ set2| / |set1 ∪ set2|, or 0 when both sets are empty."""
    combined = set1 | set2
    if not combined:
        return 0
    return len(set1 & set2) / len(combined)

# User-to-User Prediction Function
def predict_user_based(user, item):
    """Predict the transformed hours for ``(user, item)`` from similar users.

    Averages the training values of other users who reviewed ``item``, weighted
    by their Jaccard similarity to ``user``. Returns ``global_avg`` when the user
    or item is unseen in training, or when the total weight is zero.
    """
    seen_user = user in itemsPerUser
    seen_item = item in usersPerItem
    if not (seen_user and seen_item):
        return global_avg

    own_items = itemsPerUser[user]
    weighted_sum = 0.0
    weight_total = 0.0
    for neighbour in usersPerItem[item]:
        if neighbour == user:
            continue
        weight = jaccard_similarity(own_items, itemsPerUser[neighbour])
        if weight > 0:
            weighted_sum += hoursTransformedPerUserItem[(neighbour, item)] * weight
            weight_total += weight

    if weight_total == 0:
        return global_avg
    return weighted_sum / weight_total


In [145]:
# Item-to-Item Prediction Function
def predict_item_based(user, item):
    """Predict the transformed hours for ``(user, item)`` from similar items.

    Averages the user's own training values on other items, weighted by each
    item's Jaccard similarity to ``item``. Returns ``global_avg`` when the user
    or item is unseen in training, or when the total weight is zero.
    """
    seen_user = user in itemsPerUser
    seen_item = item in usersPerItem
    if not (seen_user and seen_item):
        return global_avg

    target_users = usersPerItem[item]
    weighted_sum = 0.0
    weight_total = 0.0
    for candidate in itemsPerUser[user]:
        if candidate == item:
            continue
        weight = jaccard_similarity(target_users, usersPerItem[candidate])
        if weight > 0:
            weighted_sum += hoursTransformedPerUserItem.get((user, candidate), 0) * weight
            weight_total += weight

    if weight_total == 0:
        return global_avg
    return weighted_sum / weight_total

In [146]:
# Run both predictors over the test set, keeping predictions aligned with labels.
user_based_predictions = []
item_based_predictions = []
true_values = []

for d in dataTest:
    user, item = d['userID'], d['gameID']
    user_based_predictions.append(predict_user_based(user, item))
    item_based_predictions.append(predict_item_based(user, item))
    true_values.append(d['hours_transformed'])

In [147]:
# Calculate MSE for both approaches
MSEU = mean_squared_error(true_values, user_based_predictions)
MSEI = mean_squared_error(true_values, item_based_predictions)

In [148]:
# Cast to built-in float so no numpy scalar repr reaches the answers file.
answers['Q8'] = [float(MSEU), float(MSEI)]

In [149]:
answers['Q8']

[3.2810768459411728, 4.915274596519422]

In [150]:
assertFloatList(answers['Q8'], 2)

### Question 9
Adjust your definition above so that the similarity weights recent actions more highly. Use the similarity
function $$\text{hours\_transformed}(u, i) = \frac{\sum_{v \in U_i} R_{v,i} \cdot \text{Sim}(u, v) \cdot e^{-|y(u,i) - y(v,i)|}}{\sum_{v \in U_i} \text{Sim}(u, v) \cdot e^{-|y(u,i) - y(v,i)|}},$$
where $y(u, i)$ is the (integer) year in which the review occurred.

In [151]:
# Per-user / per-item indexes over the training split.
ht_PerUser = defaultdict(list)   # user -> full training records
ht_PerItem = defaultdict(list)   # game -> full training records
ht_ofUser = defaultdict(list)    # user -> hours_transformed values
ht_ofItem = defaultdict(list)    # game -> hours_transformed values
years = defaultdict(int)         # (user, game) -> review year
ht = {}  # To retrieve a rating for a specific user/item pair

for d in dataTrain:
    user, game = d['userID'], d['gameID']
    hours_t = d['hours_transformed']
    ht_PerUser[user].append(d)
    ht_PerItem[game].append(d)
    ht_ofUser[user].append(hours_t)
    ht_ofItem[game].append(hours_t)
    years[(user, game)] = int(d['date'][:4])
    ht[(user, game)] = hours_t

# Global mean of the training target, used as the fallback prediction.
htMean = sum(d['hours_transformed'] for d in dataTrain) / len(dataTrain)

In [152]:
def jaccard_similarity(set1, set2):
    """Return the Jaccard similarity of two sets; 0 when their union is empty."""
    union_size = len(set1.union(set2))
    if union_size == 0:
        return 0
    return len(set1.intersection(set2)) / union_size

In [153]:
# Prediction function for Question 9 with time decay
def predict_ht(user, game, year):
    """Predict transformed hours for ``(user, game)`` with time-decayed similarity.

    Each other user who reviewed ``game`` in training contributes their
    ``hours_transformed`` value, weighted by the Jaccard similarity to ``user``
    times ``exp(-|year - review_year|)``.

    Args:
        user: ID of the user to predict for.
        game: ID of the game to predict for.
        year: integer year of the review being predicted.

    Returns:
        The weighted average, or ``htMean`` when no neighbour has a positive weight.
    """
    # .get() rather than [] so that unseen users/games do not insert empty
    # entries into the defaultdict indexes as a side effect of predicting.
    own_items = itemsPerUser.get(user, set())

    numerator = 0.0
    denominator = 0.0
    for d in ht_PerItem.get(game, ()):
        u2 = d['userID']
        if u2 == user:
            continue

        sim = jaccard_similarity(own_items, itemsPerUser.get(u2, set()))
        if sim == 0:
            continue  # a zero weight contributes nothing to either sum

        # Reviews closer in time to the target year weigh more.
        decay = math.exp(-abs(year - int(d['date'][:4])))
        numerator += d['hours_transformed'] * sim * decay
        denominator += sim * decay

    return numerator / denominator if denominator > 0 else htMean

In [154]:
# Calculate predictions for the test set and MSE for Q9
sim_predict_set_user = [predict_ht(d['userID'], d['gameID'], int(d['date'][:4])) for d in dataTest]
labels = [d['hours_transformed'] for d in dataTest]
MSE9 = mean_squared_error(labels, sim_predict_set_user)
# Cast to built-in float so no numpy scalar repr reaches the answers file.
answers['Q9'] = float(MSE9)

In [155]:
answers['Q9']

3.310021092937094

In [156]:
assertFloat(answers['Q9'])

In [157]:
# Warn when a non-native scalar repr (one containing "float" or "int") leaked into answers.
answers_repr = str(answers)
if any(marker in answers_repr for marker in ("float", "int")):
    print("it seems that some of your answers are not native python ints/floats;")
    print("the autograder will not be able to read your solution unless you convert them to ints/floats")

In [158]:
# Write the answers dict as a single line; the context manager closes the file
# even if the write raises.
with open("answers_midterm.txt", 'w') as f:
    f.write(str(answers) + '\n')