In [321]:
import json
from collections import defaultdict
from sklearn import linear_model
import numpy
import random
import gzip
import dateutil.parser
import math
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [322]:
# Collects the result of every question, keyed by label ('Q1', 'Q2', ...).
answers = {}

In [323]:
def assertFloat(x):
    """Assert that ``x`` can be converted to a built-in ``float``.

    Any exception raised by ``float(x)`` itself (for example ``ValueError``
    for a non-numeric string) propagates to the caller.
    """
    converted = float(x)
    assert type(converted) is float

def assertFloatList(items, N):
    """Assert that ``items`` holds exactly ``N`` entries, each convertible to ``float``.

    A wrong length fails the assertion; an entry that ``float()`` cannot
    convert raises the conversion error instead.
    """
    assert len(items) == N
    for entry in items:
        assert type(float(entry)) is float

In [324]:
### Question 1

In [325]:
# Load the gzipped review file, one JSON object per line.
# The context manager closes the handle once reading is done. The former
# follow-up loop iterated the already exhausted file and never appended
# anything, so it has been dropped.
with gzip.open("fantasy_10000.json.gz") as f:
    dataset = [json.loads(l) for l in f]

In [326]:
# Length of the longest review in the corpus. Computed once here instead of
# inside feature(), where it was re-evaluated for every datum (quadratic work).
MAX_REVIEW_LENGTH = max(len(d['review_text']) for d in dataset)

def feature(datum):
    """Build the Question 1 feature vector for one review.

    Returns ``[1, length]`` where ``length`` is the number of characters in
    ``datum['review_text']`` divided by the longest review length in
    ``dataset``.
    """
    return [1, len(datum['review_text']) / MAX_REVIEW_LENGTH]


In [327]:
# Feature rows and rating targets for every review, in dataset order.
X = list(map(feature, dataset))
Y = [review['rating'] for review in dataset]


In [328]:
# Convert the Python lists into NumPy arrays before fitting.
X, Y = numpy.array(X), numpy.array(Y)

In [329]:
from sklearn.metrics import mean_squared_error

# Least-squares fit of rating on [1, normalized review length].
model = LinearRegression()
model.fit(X, Y)
# theta pairs the fitted intercept with the coefficient of the length column
# (index 1; index 0 belongs to the constant column of X).
theta = [model.intercept_, model.coef_[1]]
# Error is measured on the same data the model was fitted on.
predicted_ratings = model.predict(X)
MSE = mean_squared_error(Y, predicted_ratings)

In [330]:
# Q1 answer: fitted intercept, length coefficient, and training MSE.
answers['Q1'] = [theta[0], theta[1], MSE]

In [331]:
# Check that Q1 holds three float-convertible values, then display them.
assertFloatList(answers['Q1'], 3)

print(answers['Q1'])

[3.6856813550169516, 0.9833539181066179, 1.5522086622355378]


In [332]:
### Question 2

In [333]:
# Parse each review's 'date_added' string and cache the result on the record
# itself under 'parsed_date'.
for d in dataset:
    t = dateutil.parser.parse(d['date_added'])
    d['parsed_date'] = t

In [334]:
# Length of the longest review in the corpus. Computed once here instead of
# inside feature(), where it was re-evaluated for every datum (quadratic work).
MAX_REVIEW_LENGTH = max(len(d['review_text']) for d in dataset)

def feature(datum):
    """Build the Question 2 feature vector for one review.

    Layout (19 values): a constant 1, the review length divided by the
    longest review length in ``dataset``, six weekday indicators for
    ``weekday()`` values 1..6 (0 is the dropped reference level) and eleven
    month indicators for months 2..12 (month 1 is the dropped reference level).
    """
    # Reuse the timestamp cached under 'parsed_date' when it is available;
    # otherwise parse 'date_added' here.
    t = datum.get('parsed_date')
    if t is None:
        t = dateutil.parser.parse(datum['date_added'])
    weekday_features = [1 if t.weekday() == i else 0 for i in range(1,7)]
    month_features = [1 if t.month == i else 0 for i in range(2, 13)]

    length_feature = len(datum['review_text']) / MAX_REVIEW_LENGTH
    return [1, length_feature] + weekday_features + month_features

In [335]:
# Rebuild the design matrix with the one-hot weekday/month features.
X = list(map(feature, dataset))
Y = [review['rating'] for review in dataset]

In [336]:

# Q2 answer: the feature vectors of the first two reviews.
answers['Q2'] = [X[0], X[1]]

In [337]:
# Each of the two recorded feature vectors must have 19 float-convertible entries.
assertFloatList(answers['Q2'][0], 19)
assertFloatList(answers['Q2'][1], 19)

print(answers['Q2'])

[[1, 0.14581294561722355, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], [1, 0.10631902698168601, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]]


In [338]:
### Question 3

In [339]:
# Length of the longest review in the corpus. Computed once here instead of
# inside feature3(), where it was re-evaluated for every datum (quadratic work).
MAX_REVIEW_LENGTH = max(len(d['review_text']) for d in dataset)

def feature3(datum):
    """Build the Question 3 feature vector for one review.

    Returns ``[1, length, weekday, month]``: a constant, the review length
    divided by the longest review length in ``dataset``, and the weekday
    (``weekday()``, 0..6) and month (1..12) of ``date_added`` used directly
    as numeric values.
    """
    t = dateutil.parser.parse(datum['date_added'])
    length_feature = len(datum['review_text']) / MAX_REVIEW_LENGTH
    return [1, length_feature, t.weekday(), t.month]

In [340]:
# Design matrix and targets for the direct weekday/month encoding.
X3 = list(map(feature3, dataset))
Y3 = [review['rating'] for review in dataset]

In [341]:
# Model on the one-hot features (X, Y), scored on the data it was fitted on.
model2 = LinearRegression()
model2.fit(X, Y)
predicted_ratings2 = model2.predict(X)
mse2 = mean_squared_error(Y, predicted_ratings2)

# Same procedure for the features that use weekday and month as plain numbers.
model3 = LinearRegression()
model3.fit(X3, Y3)
predicted_ratings3 = model3.predict(X3)
mse3 = mean_squared_error(Y3, predicted_ratings3)

In [342]:
# Q3 answer: training MSE of the one-hot model, then of the direct-encoding model.
answers['Q3'] = [mse2, mse3]

In [343]:
# Check that Q3 holds two float-convertible values, then display them.
assertFloatList(answers['Q3'], 2)

print(answers['Q3'])

[1.5466315498487562, 1.5516353711453328]


In [344]:
### Question 4

In [345]:
# Seed the RNG, then shuffle the reviews in place.
# NOTE: the shuffle mutates `dataset`, so re-running this cell without
# reloading the data reshuffles an already shuffled list and gives a different order.
random.seed(0)
random.shuffle(dataset)

In [346]:
# Recompute both feature sets and the labels in the shuffled order.
X2 = list(map(feature, dataset))
X3 = list(map(feature3, dataset))
Y = [review['rating'] for review in dataset]

In [347]:
# Split each list at its midpoint: first half for training, second half for testing.
train2, test2 = X2[:len(X2)//2], X2[len(X2)//2:]
train3, test3 = X3[:len(X3)//2], X3[len(X3)//2:]
trainY, testY = Y[:len(Y)//2], Y[len(Y)//2:]

In [348]:
# One-hot model: fit on the training half, score on the held-out half.
model2 = LinearRegression()
model2.fit(train2, trainY)
predicted_test_ratings2 = model2.predict(test2)
test_mse2 = mean_squared_error(testY, predicted_test_ratings2)

# Direct-encoding model: same protocol.
model3 = LinearRegression()
model3.fit(train3, trainY)
predicted_test_ratings3 = model3.predict(test3)
test_mse3 = mean_squared_error(testY, predicted_test_ratings3)

In [349]:
# Q4 answer: test-set MSE of the one-hot model, then of the direct-encoding model.
answers['Q4'] = [test_mse2, test_mse3]

In [350]:
# Check that Q4 holds two float-convertible values, then display them.
assertFloatList(answers['Q4'], 2)

print(answers['Q4'])

[1.6264453676167938, 1.6282919476176057]


In [351]:
### Question 5

In [352]:
# Load the beer reviews, one record per line.
# NOTE(review): every line is passed to eval(), which executes arbitrary
# Python. Only run this on a trusted copy of the file. If each line is a plain
# literal, ast.literal_eval would be a safer parser - confirm before switching.
dataset = []
# The context manager closes the file even if a line fails to evaluate.
with open("beer_50000.json") as f:
    for l in f:
        dataset.append(eval(l))

In [353]:
# Make sure the handle used to read beer_50000.json is closed.
f.close()

In [354]:
# Build features and labels from the same filtered records. Previously only X
# skipped reviews without 'review/text', so a single missing text would have
# left X and y with different lengths and shifted every later label.
reviews_with_text = [d for d in dataset if 'review/text' in d]
# Single feature: review length in characters.
X = [[len(d['review/text'])] for d in reviews_with_text]
# Positive class: overall rating of at least 4.
y = [d['review/overall'] >= 4 for d in reviews_with_text]

In [355]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
# Class-weighted logistic regression, fitted and evaluated on the same data.
clf = LogisticRegression(class_weight='balanced')
clf.fit(X, y)
y_pred = clf.predict(X)
# Flatten the 2x2 confusion matrix into its four counts.
tn, fp, fn, tp = confusion_matrix(y, y_pred).ravel()

In [356]:
# Upper-case aliases for the four confusion-matrix counts.
TP, TN, FP, FN = tp, tn, fp, fn

In [357]:
# Balanced error rate: the mean of the false-positive rate fp/(tn+fp) and the
# false-negative rate fn/(tp+fn).
BER = 0.5 * (fp / (tn + fp) + fn / (tp + fn))

In [358]:
# Q5 answer: the four confusion-matrix counts followed by the balanced error rate.
answers['Q5'] = [TP, TN, FP, FN, BER]

In [359]:
# Check that Q5 holds five float-convertible values, then display them.
assertFloatList(answers['Q5'], 5)

print(answers['Q5'])

[14201, 10503, 5885, 19411, 0.46830315259572763]


In [360]:
### Question 6

In [361]:
# Column 1 of predict_proba is the score for the classifier's second class
# (presumably the positive label True - verify against clf.classes_).
probs = clf.predict_proba(X)[:, 1]

# Example indices ordered from highest to lowest score.
sorted_indices = numpy.argsort(probs)[::-1]

In [362]:
# Precision@k values, appended in the order the k values are evaluated.
precs = []

In [363]:
# Precision@k: the fraction of true positives among the k highest-scored examples.
# precs is reset here so that re-running this cell cannot append duplicates.
precs = []
# Reorder the labels by descending score once, not once per k.
labels_by_score = numpy.array(y)[sorted_indices]
for k in [1,100,1000,10000]:
    top_k_true_labels = labels_by_score[:k]
    precision_at_k = sum(top_k_true_labels) / k
    precs.append(precision_at_k)

In [364]:
# Q6 answer: precision@k for k = 1, 100, 1000, 10000.
answers['Q6'] = precs

In [365]:
# Check that Q6 holds four float-convertible values, then display them.
assertFloatList(answers['Q6'], 4)

print(answers['Q6'])

[1.0, 0.75, 0.71, 0.7146]


In [366]:
### Question 7

In [367]:
# Numerically encode beer styles.
# sorted() pins the style -> index mapping. Iterating a bare set of strings
# depends on per-process hash randomization, so the codes (and the BER below)
# could otherwise change between kernel restarts.
# NOTE(review): a single integer code imposes an arbitrary order on the
# styles; a one-hot encoding would avoid that - consider switching.
styles = sorted(set(d['beer/style'] for d in dataset))
style_dict = {style: i for i, style in enumerate(styles)}

def style_encoding(beer_style):
    """Return the integer code of ``beer_style`` as a one-element list."""
    return [style_dict[beer_style]]

# Simplified feature extraction
def simplified_feature(datum):
    """Return ``[1, review length in characters, style code]`` for one beer review."""
    feat = [1, len(datum['review/text'])]
    feat += style_encoding(datum['beer/style'])
    return feat

X_simplified = [simplified_feature(d) for d in dataset]
# Positive class: overall rating of at least 4.
y = [d['review/overall'] >= 4 for d in dataset]

# Split the data into training and test sets (50%/50%)
trainX_simplified, testX_simplified = X_simplified[:len(X_simplified)//2], X_simplified[len(X_simplified)//2:]
trainY, testY = y[:len(y)//2], y[len(y)//2:]

# Train a logistic regressor using the simplified features
clf_simplified = LogisticRegression(class_weight='balanced').fit(trainX_simplified, trainY)

# Predict the labels for the test set
y_pred_simplified = clf_simplified.predict(testX_simplified)

# Compute the confusion matrix
tn, fp, fn, tp = confusion_matrix(testY, y_pred_simplified).ravel()

# Calculate the Balanced Error Rate (BER)
BER_simplified = 0.5 * (fp / (tn + fp) + fn / (tp + fn))

description = "I used the beer styles and review length as features to simplify the predictor."

answers['Q7'] = [description, BER_simplified]

print(answers['Q7'])

['I used the beer styles and review length as features to simplify the predictor.', 0.42291530942357924]


In [368]:
# Display every recorded answer before it is written to disk.
print(answers)

{'Q1': [3.6856813550169516, 0.9833539181066179, 1.5522086622355378], 'Q2': [[1, 0.14581294561722355, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], [1, 0.10631902698168601, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]], 'Q3': [1.5466315498487562, 1.5516353711453328], 'Q4': [1.6264453676167938, 1.6282919476176057], 'Q5': [14201, 10503, 5885, 19411, 0.46830315259572763], 'Q6': [1.0, 0.75, 0.71, 0.7146], 'Q7': ['I used the beer styles and review length as features to simplify the predictor.', 0.42291530942357924]}


In [369]:
# Write the string form of the answers dict as a single line.
# The context manager closes the file even if the write raises.
with open("answers_hw1.txt", 'w') as f:
    f.write(str(answers) + '\n')
