In [218]:
import numpy as np
import urllib
import scipy.optimize
import random
from sklearn import linear_model
import gzip
from collections import defaultdict
from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error

In [219]:
import warnings
warnings.filterwarnings("ignore")

In [220]:
def assertFloat(x):
    """Assert that ``x`` can be converted to a float.

    The conversion happens inside the assertion, so an input that ``float()``
    rejects surfaces as the ``TypeError``/``ValueError`` raised by ``float()``.
    """
    assert type(float(x)) is float

def assertFloatList(items, N):
    """Assert that ``items`` has exactly ``N`` entries, each convertible to float.

    The length is checked first; every entry is then passed through ``float()``,
    which raises on its own for values it cannot convert.
    """
    assert len(items) == N
    assert all(type(float(entry)) is float for entry in items)

In [None]:
## Tasks — Model Pipelines and Diagnostics:
# In the first homework, we began to explore a couple of issues with the classifiers we built. Namely (1) the data were not shuffled, and (2) the labels were highly imbalanced. Both of these made it difficult to effectively build an accurate classifier. Here we’ll try to correct for those issues using the Bankruptcy dataset.

In [221]:
f = open("polish+companies+bankruptcy+data/5year.arff", 'r')

In [222]:
# Read and parse the data
# Skip the ARFF header, i.e. everything up to and including the '@data' line.
# Iterating over the file (instead of looping on readline()) stops at end-of-file,
# so a file without an '@data' line gives an empty dataset rather than an endless loop.
for header_line in f:
    if '@data' in header_line:
        break

dataset = []
for l in f:
    l = l.strip()
    if not l: # Blank line: nothing to parse (float('') would raise)
        continue
    if '?' in l: # Missing entry
        continue
    fields = l.split(',')
    values = [1] + [float(x) for x in fields] # Constant 1 is prepended to every row
    values[-1] = values[-1] > 0 # Convert to bool
    dataset.append(values)

f.close() # Every row has been read; release the file handle

In [223]:
X = [d[:-1] for d in dataset]
y = [d[-1] for d in dataset]

In [224]:
answers = {} # Your answers

In [225]:
def accuracy(predictions, y):
    """Score ``predictions`` against the true labels ``y`` with ``accuracy_score``.

    Note the argument order: this wrapper takes the predictions first, whereas
    ``accuracy_score`` expects the true labels first.
    """
    score = accuracy_score(y, predictions)
    return score

In [226]:
def BER(predictions, y):
    """Balanced error rate of ``predictions`` against the true labels ``y``.

    The flattened confusion matrix is unpacked as (tn, fp, fn, tp), so this only
    works when the matrix has exactly four cells (a two-class problem).  The
    result is the mean of the false-positive rate and the false-negative rate.
    """
    cells = confusion_matrix(y, predictions).ravel()
    tn, fp, fn, tp = cells
    fpr = fp / (fp + tn)
    fnr = fn / (fn + tp)
    return (fpr + fnr) / 2

In [227]:
### Question 1
# Code to read the data is available in the stub. Train a logistic regressor (e.g. sklearn.linear_model.LogisticRegression) with
# regularization coefficient C = 1.0. Report the accuracy and Balanced Error Rate (BER) of your classifier.

In [228]:
# Fit an unweighted logistic regressor (C=1) on the whole dataset
mod = linear_model.LogisticRegression(C=1).fit(X, y)

# Predict on the same rows the model was fitted on
pred = mod.predict(X)

In [229]:
# Calculate accuracy
acc1 = accuracy(pred, y)

# Calculate Balanced Error Rate (BER)
ber1 = BER(pred, y)

In [230]:
answers['Q1'] = [acc1, ber1] # Accuracy and balanced error rate

In [231]:
assertFloatList(answers['Q1'], 2)

In [232]:
answers['Q1']

[0.9656878917848895, 0.47668514315934635]

In [233]:
### Question 2
# Retrain the above model using the class_weight='balanced' option.
# Report the accuracy and BER of your new classifier.

In [234]:
# Same model as before, but with classes reweighted via class_weight='balanced'
mod = linear_model.LogisticRegression(C=1, class_weight='balanced').fit(X, y)

# Predict on the same rows the model was fitted on
pred = mod.predict(X)

In [235]:
# Calculate accuracy
acc2 = accuracy(pred, y)

# Calculate Balanced Error Rate (BER)
ber2 = BER(pred, y)

In [236]:
answers['Q2'] = [acc2, ber2]

In [237]:
assertFloatList(answers['Q2'], 2)

In [238]:
answers['Q2']

[0.6948201913559882, 0.304572597219154]

In [239]:
### Question 3
# Shuffle the data, and split it into training, validation, and test splits, with a 50/25/25% ratio. Use the
# code in the stub provided to ensure that your random split is the same as the reference
# solution. Using the class_weight='balanced' option, and training on the training set, report the
# training/validation/test BER.

In [240]:
# Seed the global RNG so the shuffle below gives the same order on every fresh run.
# NOTE: random.shuffle reorders `dataset` in place, so re-running this cell without
# re-parsing the data shuffles an already-shuffled list and produces a different order.
random.seed(3)
random.shuffle(dataset)

In [241]:
X = [d[:-1] for d in dataset]
y = [d[-1] for d in dataset]

In [242]:
# Positional 50/25/25 split: first half trains, next quarter validates, the rest tests
n_rows = len(X)
half_cut = n_rows // 2
three_quarter_cut = (3 * n_rows) // 4

Xtrain, Xvalid, Xtest = X[:half_cut], X[half_cut:three_quarter_cut], X[three_quarter_cut:]
ytrain, yvalid, ytest = y[:half_cut], y[half_cut:three_quarter_cut], y[three_quarter_cut:]

In [243]:
len(Xtrain), len(Xvalid), len(Xtest)

(1515, 758, 758)

In [244]:
# Train the logistic regression model with class_weight='balanced' on the training set
mod = linear_model.LogisticRegression(C=1, class_weight='balanced')
mod.fit(Xtrain, ytrain)

In [245]:
# Make predictions for training, validation, and test sets
predTrain = mod.predict(Xtrain)
predValid = mod.predict(Xvalid)
predTest = mod.predict(Xtest)

In [246]:
# Calculate BER for training, validation, and test sets
berTrain = BER(predTrain, ytrain)
berValid = BER(predValid, yvalid)
berTest = BER(predTest, ytest)

In [247]:
answers['Q3'] = [berTrain, berValid, berTest]

In [248]:
assertFloatList(answers['Q3'], 3)

In [249]:
answers['Q3']

[0.29287226079549855, 0.3159203980099502, 0.2585616438356164]

In [250]:
### Question 4
# Implement a complete regularization pipeline with the above classifier. Consider values of C in the range
# {10^-4, 10^-3, ..., 10^3, 10^4}. Report the validation BER for each value of C.

In [251]:
# Candidate regularization strengths: 10^-4, 10^-3, ..., 10^3, 10^4
C_values = [10**exponent for exponent in range(-4, 5)]

# Validation BER for each candidate, in the same order as C_values
berList = []

for C in C_values:
    # Fit a class-balanced logistic regressor on the training split with this C
    mod = linear_model.LogisticRegression(C=C, class_weight='balanced').fit(Xtrain, ytrain)

    # Score it on the validation split and record the result
    predValid = mod.predict(Xvalid)
    berValid = BER(predValid, yvalid)
    berList.append(berValid)

In [252]:
answers['Q4'] = berList

In [253]:
assertFloatList(answers['Q4'], 9)

In [254]:
answers['Q4']

[0.32881049298959747,
 0.31931252826775214,
 0.32881049298959747,
 0.3233830845771144,
 0.3159203980099502,
 0.3111714156490276,
 0.2955030044582283,
 0.29618143050978873,
 0.29618143050978873]

In [255]:
### Question 5
# Based on these values, which classifier would you select (in terms of generalization performance)? Report
# the best value of C and its performance (BER) on the test set.

In [256]:
# Select the C with the smallest validation BER
bestC_index = np.argmin(berList)
bestC = C_values[bestC_index]

# Refit on the training split with that C, then evaluate once on the test split
mod_best = linear_model.LogisticRegression(C=bestC, class_weight='balanced').fit(Xtrain, ytrain)
predTest = mod_best.predict(Xtest)
ber5 = BER(predTest, ytest)

In [257]:
answers['Q5'] = [bestC, ber5]

In [258]:
assertFloatList(answers['Q5'], 2)

In [259]:
answers['Q5']

[100, 0.26267123287671235]

In [None]:
## Tasks — Recommendation:
# For this question we’ll use the Goodreads book review data. The first 90% of the data should be used for training and the remaining 10% for evaluation (the stub shows how to split the data).

In [260]:
### Question 6
# Which 10 items have the highest Jaccard similarity compared to the first item (i.e., the item from the first
# review, ‘2767052’)? Report both similarities and item IDs (your answer should be a list of (similarity,
# item id) tuples). Note that the test data should not be used for this question.

In [261]:
# Load the review records, one per line of the gzip file.
# The with-block closes the handle even if a line fails to parse.
dataset = []
with gzip.open("young_adult_10000.json.gz") as f:
    for l in f:
        # SECURITY NOTE(review): eval() executes whatever expression the line contains.
        # Only run this on a trusted copy of the data file.
        dataset.append(eval(l))

In [262]:
dataTrain = dataset[:9000]
dataTest = dataset[9000:]

In [263]:
# Lookup tables built from the training reviews

usersPerItem = defaultdict(set)     # item -> users who rated it
itemsPerUser = defaultdict(set)     # user -> items they rated
reviewsPerUser = defaultdict(list)  # created here; the loop below does not fill it
reviewsPerItem = defaultdict(list)  # item -> ratings it received
ratingDict = {}                     # (user, item) -> rating

for d in dataTrain:
    user, item, rating = d['user_id'], d['book_id'], d['rating']

    usersPerItem[item].add(user)
    itemsPerUser[user].add(item)
    reviewsPerItem[item].append(rating)
    ratingDict[(user, item)] = rating

In [264]:
def Jaccard(s1, s2):
    """Jaccard similarity of two sets: |s1 & s2| / |s1 | s2|.

    Returns 0 when both sets are empty, where the ratio would be undefined.
    """
    union = s1 | s2
    if not union:
        return 0
    overlap = s1 & s2
    return len(overlap) / len(union)


In [265]:
def mostSimilar(i, N):
    """Return the ``N`` items most similar to item ``i``.

    Similarity is the Jaccard similarity between the sets of users who rated
    each item, read from the module-level ``usersPerItem`` mapping.

    The similarity is computed inline rather than through the global
    ``Jaccard``: that name is redefined later in this notebook with functions
    that take IDs instead of sets, and calling those with sets would fail.

    Returns a list of ``(similarity, item_id)`` tuples, highest similarity
    first; items with equal similarity keep the order in which they appear in
    ``usersPerItem``.
    """
    # .get() instead of indexing: usersPerItem is a defaultdict, and indexing an
    # unknown item would silently add an empty entry for it.
    users_i = usersPerItem.get(i, set())

    similarities = []
    for item, users_j in usersPerItem.items():
        if item == i:  # Skip comparing the item to itself
            continue
        union = users_i | users_j
        # An empty union means neither item has any users; score that as 0.
        sim = len(users_i & users_j) / len(union) if union else 0
        similarities.append((sim, item))

    # Sort by similarity in descending order and return the top N
    similarities.sort(reverse=True, key=lambda x: x[0])
    return similarities[:N]

In [266]:
answers['Q6'] = mostSimilar('2767052', 10)

In [267]:
assert len(answers['Q6']) == 10
assertFloatList([x[0] for x in answers['Q6']], 10)

In [268]:
answers['Q6']

[(0.4125, '6148028'),
 (0.3411764705882353, '7260188'),
 (0.1590909090909091, '256683'),
 (0.1375, '1162543'),
 (0.11494252873563218, '11735983'),
 (0.10989010989010989, '13335037'),
 (0.10810810810810811, '28187'),
 (0.10666666666666667, '428263'),
 (0.09876543209876543, '49041'),
 (0.09782608695652174, '41865')]

In [269]:
### Question 7
# Implement a rating prediction model based on the similarity function
# (there is already a prediction function similar to this in the provided example code, you can either start
# from scratch or modify an existing solution). Report the MSE (on the test set) of this rating prediction
# function when Sim(i, j) = Jaccard(i, j).

In [270]:
# training set

In [271]:
# Mean training rating overall; used as the fallback for items with no training reviews
globalAverage = np.mean([d['rating'] for d in dataTrain])

# Mean training rating of each item
itemAverages = {item: np.mean(ratings) for item, ratings in reviewsPerItem.items()}


In [272]:
def Jaccard(item1, item2):
    """Jaccard similarity between two items, given their item IDs.

    Each item is represented by the set of users who rated it, looked up in the
    module-level ``usersPerItem``.  Returns 0 when either item has no users.

    NOTE: this shadows the earlier ``Jaccard(s1, s2)``, which took the two sets
    directly; from this cell on, ``Jaccard`` expects item IDs.
    """
    # .get() instead of indexing: usersPerItem is a defaultdict, so indexing an
    # item that is not in it would add an empty entry as a side effect.
    users1 = usersPerItem.get(item1, ())
    users2 = usersPerItem.get(item2, ())
    if len(users1) == 0 or len(users2) == 0:
        return 0
    return len(users1 & users2) / len(users1 | users2)

In [273]:
def predictRating(user, item):
    """Predict ``user``'s rating of ``item`` from the other items the user rated.

    The prediction starts from the item's average rating (or the global average
    when the item has none) and adds a similarity-weighted mean of how far the
    user's other ratings sit from those items' averages.  Only items whose
    similarity to ``item`` is positive contribute; with no such items the
    baseline is returned unchanged.
    """
    baseline = itemAverages[item] if item in itemAverages else globalAverage

    weighted_deviation = 0.0
    weight_total = 0.0

    for j in itemsPerUser[user]:
        if j == item:  # The target item itself carries no information
            continue

        weight = Jaccard(item, j)
        if not weight > 0:  # Ignore items with no positive similarity
            continue

        deviation = ratingDict[(user, j)] - itemAverages.get(j, globalAverage)
        weighted_deviation += deviation * weight
        weight_total += weight

    if weight_total > 0:
        return baseline + (weighted_deviation / weight_total)
    return baseline

In [274]:
# test set

In [275]:
testPredictions = []
testTrueRatings = []

for d in dataTest:
    user, item, trueRating = d['user_id'], d['book_id'], d['rating']

    # Users with no training history (cold start) get the global average
    predictedRating = predictRating(user, item) if user in itemsPerUser else globalAverage

    testPredictions.append(predictedRating)
    testTrueRatings.append(trueRating)

In [276]:
mse7 = mean_squared_error(testTrueRatings, testPredictions)

In [277]:
answers['Q7'] = mse7

In [278]:
assertFloat(answers['Q7'])

In [279]:
answers['Q7']

1.2448714881369496

In [280]:
### Question 8
# Modify the similarity function from Question 7 to interchange users and items (i.e., in terms of the
# similarity between users Sim(u, v) rather than Sim(i, j)), and report its MSE on the test data.

In [281]:
# training set

In [282]:
usersPerItem = defaultdict(set)  # Maps an item to the users who rated it
itemsPerUser = defaultdict(set)  # Maps a user to the items that they rated
# The loop that fills these structures and the user-based predictRating both use
# the name `reviewsPerUser`, so that is the accumulator that has to be reset here.
reviewsPerUser = defaultdict(list)  # Stores ratings for each user
ratingsPerUser = reviewsPerUser  # Alias so the name this cell used to define still exists
ratingDict = {}

In [283]:
# Fill the lookup tables from the training reviews
for d in dataTrain:
    user, item, rating = d['user_id'], d['book_id'], d['rating']

    usersPerItem[item].add(user)
    itemsPerUser[user].add(item)
    reviewsPerUser[user].append(rating)  # every rating this user gave, in file order
    ratingDict[(user, item)] = rating

In [284]:
# Mean training rating overall; used as the fallback for items with no training reviews
globalAverage = np.mean([d['rating'] for d in dataTrain])

# Mean rating of each item, taken over the users recorded for it in usersPerItem
itemAverages = {
    item: np.mean([ratingDict[(user, item)] for user in raters])
    for item, raters in usersPerItem.items()
}

In [285]:
def Jaccard(user1, user2):
    """Jaccard similarity between two users, given their user IDs.

    Each user is represented by the set of items they rated, looked up in the
    module-level ``itemsPerUser``.  Returns 0 when either user has no items.
    This replaces the earlier definitions of ``Jaccard`` in the notebook.
    """
    items1, items2 = itemsPerUser[user1], itemsPerUser[user2]
    if not (items1 and items2):
        return 0
    shared = items1 & items2
    combined = items1 | items2
    return len(shared) / len(combined)

In [286]:
# Rating prediction function using user-user similarity
def predictRating(user, item):
    """Predict ``user``'s rating of ``item`` from other users who rated that item.

    The prediction starts from the item's average rating (or the global average
    when the item has none) and adds a similarity-weighted mean of how far each
    other rater's rating of ``item`` sits from that rater's own average.  Only
    raters with positive Jaccard similarity to ``user`` contribute; with no such
    raters the baseline is returned unchanged.

    NOTE(review): the baseline is the *item* average while the deviations are
    taken from each neighbour's *user* average.  Confirm that this mix is
    intended rather than a user-average baseline.
    """
    R_bar_i = itemAverages.get(item, globalAverage)  # Fall back to the global average

    numerator = 0.0
    denominator = 0.0

    # .get() instead of indexing: usersPerItem is a defaultdict, so indexing an
    # item that is not in it would add an empty entry as a side effect.
    for other_user in usersPerItem.get(item, ()):
        if other_user == user:  # Skip the target user
            continue

        Sim_uv = Jaccard(user, other_user)  # Jaccard similarity between users u and v
        if not Sim_uv > 0:  # Only consider positive similarities
            continue

        # A neighbour with no recorded ratings has no average; np.mean([]) would be
        # nan and turn the whole prediction into nan, so skip that neighbour.
        history = reviewsPerUser.get(other_user)
        if not history:
            continue

        R_vj = ratingDict[(other_user, item)]  # Rating of other user for the same item
        R_bar_v = np.mean(history)  # Average rating of other user

        numerator += (R_vj - R_bar_v) * Sim_uv
        denominator += Sim_uv

    if denominator > 0:
        return R_bar_i + (numerator / denominator)
    return R_bar_i

In [287]:
# test set

In [288]:
testPredictions = []
testTrueRatings = []

In [289]:
for d in dataTest:
    user, item, trueRating = d['user_id'], d['book_id'], d['rating']

    # Users with no training history (cold start) get the global average
    predictedRating = predictRating(user, item) if user in itemsPerUser else globalAverage

    testPredictions.append(predictedRating)
    testTrueRatings.append(trueRating)

In [290]:
mse8 = mean_squared_error(testTrueRatings, testPredictions)

In [291]:
answers['Q8'] = mse8

In [292]:
assertFloat(answers['Q8'])

In [293]:
answers['Q8']

1.2519439058517665

In [294]:
# Save the collected answers; the with-block closes the file even if the write fails
with open("answers_hw2.txt", 'w') as f:
    f.write(str(answers) + '\n')