In [1]:
import json
from matplotlib import pyplot as plt
from collections import defaultdict
from sklearn import linear_model
import numpy
import random
import gzip
import math

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
def assertFloat(x): # Checks that an answer is a float
    assert type(float(x)) == float

def assertFloatList(items, N):
    assert len(items) == N
    assert [type(float(x)) for x in items] == [float]*N

In [5]:
f = gzip.open("young_adult_10000.json.gz")
dataset = []
for l in f:
    dataset.append(json.loads(l))

In [6]:
len(dataset)

10000

In [7]:
answers = {} # Put your answers to each question in this dictionary

In [10]:
dataset[0]

{'user_id': '8842281e1d1347389f2ab93d60773d4d',
 'book_id': '2767052',
 'review_id': '248c011811e945eca861b5c31a549291',
 'rating': 5,
 'review_text': "I cracked and finally picked this up. Very enjoyable quick read - couldn't put it down - it was like crack. \n I'm a bit bothered by the lack of backstory of how Panem and the Hunger Games come about. It is just kind of explained away in a few paragraphs and we are left to accept this very strange world where teenagers are pitted into an arena each year to kill each other? I was expecting it because I've seen Battle Royale, but I would have appreciated knowing more of the backstory of how the world could have come into such a odd state. \n I suppose what makes a book like this interesting is thinking about the strategy of it all. The players are going to be statistically encouraged to band together because they will last longer that way, but by definition of course any partnership will be broken, and the drama of how that unfolds is alw

In [11]:
### Question 1

In [129]:
def feature(datum):
    # number of '!'
    b = datum.count('!')
    return [1]+[b]

In [130]:
X = [feature(b['review_text']) for b in dataset]
Y = [b['rating'] for b in dataset]

In [131]:
X[:10]

[[1, 0],
 [1, 1],
 [1, 0],
 [1, 0],
 [1, 0],
 [1, 0],
 [1, 4],
 [1, 2],
 [1, 1],
 [1, 0]]

In [132]:
theta, res, rank, s = numpy.linalg.lstsq(X,Y)

In [133]:
theta

array([3.68853304, 0.07109019])

In [134]:
theta0 = theta[0]
theta1 = theta[1]

In [135]:
X = numpy.matrix(X)
theta = numpy.matrix(theta)
test = numpy.matmul(theta,X.T)

In [136]:
test

matrix([[3.68853304, 3.75962323, 3.68853304, ..., 3.68853304, 3.68853304,
         3.68853304]])

In [137]:
mse = numpy.square(numpy.subtract(Y,test)).mean()

In [138]:
mse

1.5231747404538287

In [139]:
answers['Q1'] = [theta0, theta1, mse]

In [140]:
answers['Q1']

[3.688533040831994, 0.07109019019954266, 1.5231747404538287]

In [141]:
assertFloatList(answers['Q1'], 3) # Check the format of your answer (three floats)

In [142]:
### Question 2

In [143]:
def feature(datum):
    b = datum.count('!')
    c = len(datum)
    return [1]+[c]+[b]

In [144]:
X = [feature(b['review_text']) for b in dataset]
Y = [b['rating'] for b in dataset]

In [145]:
theta, res, rank, s = numpy.linalg.lstsq(X,Y)

In [146]:
theta

array([ 3.71751281e+00, -4.12150653e-05,  7.52759173e-02])

In [147]:
theta0 = theta[0]
theta1 = theta[1]
theta2 = theta[2]

In [148]:
X = numpy.matrix(X)
theta = numpy.matrix(theta)
test = theta*X.T

In [149]:
mse = numpy.square(numpy.subtract(Y,test)).mean()

In [150]:
answers['Q2'] = [theta0, theta1, theta2, mse]

In [151]:
answers['Q2']

[3.717512807797163,
 -4.1215065294880774e-05,
 0.07527591733232637,
 1.5214029246165832]

In [152]:
assertFloatList(answers['Q2'], 4)

In [153]:
### Question 3

In [166]:
def feature(datum, deg):
    # feature for a specific polynomial degree
    b = datum.count('!')
    arr = [1]
    for i in range(deg):
            arr += [b**(i+1)]
    return arr

In [167]:
## degree 1
X = [feature(b['review_text'], 1) for b in dataset]
Y = [b['rating'] for b in dataset]

In [168]:
theta, res, rank, s = numpy.linalg.lstsq(X,Y)
X = numpy.matrix(X)
theta = numpy.matrix(theta)
test = theta*X.T
mses = [numpy.square(numpy.subtract(Y,test)).mean()]

In [169]:
## degree 2
X = [feature(b['review_text'], 2) for b in dataset]
theta, res, rank, s = numpy.linalg.lstsq(X,Y)
X = numpy.matrix(X)
theta = numpy.matrix(theta)
test = theta*X.T
mses += [numpy.square(numpy.subtract(Y,test)).mean()]

In [170]:
## degree 3
X = [feature(b['review_text'], 3) for b in dataset]
theta, res, rank, s = numpy.linalg.lstsq(X,Y)
X = numpy.matrix(X)
theta = numpy.matrix(theta)
test = theta*X.T
mses += [numpy.square(numpy.subtract(Y,test)).mean()]

In [171]:
## degree 4
X = [feature(b['review_text'], 4) for b in dataset]
theta, res, rank, s = numpy.linalg.lstsq(X,Y)
X = numpy.matrix(X)
theta = numpy.matrix(theta)
test = theta*X.T
mses += [numpy.square(numpy.subtract(Y,test)).mean()]

In [172]:
## degree 5
X = [feature(b['review_text'], 5) for b in dataset]
theta, res, rank, s = numpy.linalg.lstsq(X,Y)
X = numpy.matrix(X)
theta = numpy.matrix(theta)
test = theta*X.T
mses += [numpy.square(numpy.subtract(Y,test)).mean()]

In [173]:
mses

[1.5231747404538287,
 1.5046686106250917,
 1.496684551517923,
 1.4904477302230692,
 1.4896106953961648]

In [174]:
answers['Q3'] = mses

In [175]:
assertFloatList(answers['Q3'], 5)# List of length 5

In [176]:
### Question 4

In [178]:
y_train = Y[:len(Y)//2]
y_test = Y[len(Y)//2:]
mses = []

In [179]:
## All 5 degrees
for i in range(5):
    X = [feature(b['review_text'], i+1) for b in dataset]
    X_train = X[:len(X)//2]
    X_test = X[len(X)//2:]
    theta, res, rank, s = numpy.linalg.lstsq(X_train,y_train)
    X_test = numpy.matrix(X_test)
    theta = numpy.matrix(theta)
    test = theta*X_test.T
    mses += [numpy.square(numpy.subtract(y_test,test)).mean()]

In [180]:
answers['Q4'] = mses

In [181]:
mses

[1.5248743859866296,
 1.497719925932244,
 1.4856632190311243,
 1.4767337440076922,
 1.480957727253333]

In [182]:
assertFloatList(answers['Q4'], 5)

In [183]:
### Question 5

In [184]:
X = [[1] for b in dataset]
X_train = X[:len(X)//2]
X_test = X[len(X)//2:]
theta, res, rank, s = numpy.linalg.lstsq(X_train,y_train)
X_test = numpy.matrix(X_test)
theta = numpy.matrix(theta)
test = theta*X_test.T

In [185]:
mae = numpy.absolute(numpy.subtract(y_test, test)).mean()

In [186]:
answers['Q5'] = mae

In [187]:
answers['Q5']

0.9787841600000015

In [188]:
assertFloat(answers['Q5'])

In [189]:
### Question 6

In [191]:
f = open("beer_50000.json")
dataset = []
for l in f:
    if 'user/gender' in l:
        dataset.append(eval(l))

In [192]:
dataset[0]

{'review/appearance': 4.0,
 'beer/style': 'American Double / Imperial IPA',
 'review/palate': 4.0,
 'review/taste': 4.5,
 'beer/name': 'Cauldron DIPA',
 'review/timeUnix': 1293735206,
 'user/gender': 'Male',
 'user/birthdayRaw': 'Jun 16, 1901',
 'beer/ABV': 7.7,
 'beer/beerId': '64883',
 'user/birthdayUnix': -2163081600,
 'beer/brewerId': '1075',
 'review/timeStruct': {'isdst': 0,
  'mday': 30,
  'hour': 18,
  'min': 53,
  'sec': 26,
  'mon': 12,
  'year': 2010,
  'yday': 364,
  'wday': 3},
 'user/ageInSeconds': 3581417047,
 'review/overall': 4.0,
 'review/text': "According to the website, the style for the Caldera Cauldron changes every year. The current release is a DIPA, which frankly is the only cauldron I'm familiar with (it was an IPA/DIPA the last time I ordered a cauldron at the horsebrass several years back). In any event... at the Horse Brass yesterday.\t\tThe beer pours an orange copper color with good head retention and lacing. The nose is all hoppy IPA goodness, showcasing

In [270]:
def feature(datum):
    # number of '!'
    b = datum.count('!')
    return [1]+[b]

In [271]:
X = [feature(b['review/text']) for b in dataset]
y = [b['user/gender'] for b in dataset]

In [272]:
females = sum(f=='Female' for f in y)
females

308

In [273]:
mod = linear_model.LogisticRegression(C=1.0)
mod.fit(X,y)
test = mod.predict(X)

In [274]:
test

array(['Male', 'Male', 'Male', ..., 'Male', 'Male', 'Male'], dtype='<U6')

In [275]:
#count num of 'male' predictions and stuff
TP =0
FP=0
TN=0
FN=0
##correct = predictions == y
for (Ypred, Ytest) in zip(test, y):
    if Ypred == Ytest:
        if Ytest == 'Female':
            TP = TP+1
        else:
            TN = TN+1
    else:
        if Ytest == 'Female':
            FN = FN+1
        else:
            FP = FP+1
TPR = TP/(TP+FN)
TNR = TN/(TN+FP)
BER = 1-0.5*(TPR+TNR)

In [276]:
answers['Q6'] = [TP, TN, FP, FN, BER]

In [277]:
answers['Q6']

[0, 20095, 0, 308, 0.5]

In [278]:
assertFloatList(answers['Q6'], 5)

In [279]:
### Question 7

In [280]:
mod = linear_model.LogisticRegression(C=1.0, class_weight = 'balanced')
mod.fit(X,y)
test = mod.predict(X)

In [281]:
TP =0
FP=0
TN=0
FN=0
for (Ypred, Ytest) in zip(test, y):
    if Ypred == Ytest:
        if Ytest == 'Female':
            TP = TP+1
        else:
            TN = TN+1
    else:
        if Ytest == 'Female':
            FN = FN+1
        else:
            FP = FP+1
TPR = TP/(TP+FN)
TNR = TN/(TN+FP)
BER = 1-0.5*(TPR+TNR)

In [282]:
answers["Q7"] = [TP, TN, FP, FN, BER]

In [283]:
answers["Q7"]

[88, 16332, 3763, 220, 0.4507731134255145]

In [284]:
assertFloatList(answers['Q7'], 5)

In [285]:
### Question 8

In [309]:
## rewrite X&y to be easier to calculate metrics
X = [[1,b['review/text'].count('!')] for b in dataset]
y = [b['user/gender']=='Female' for b in dataset]
mod = linear_model.LogisticRegression(C=1.0, class_weight = 'balanced')
mod.fit(X,y)

LogisticRegression(class_weight='balanced')

In [310]:
## K most confident predictions; precision@k only interested in which LABELS are true/'female'
## code referenced from textook 
confidences = mod.decision_function(X)
sortedByConfidences = list(zip(confidences, y))
sortedByConfidences.sort(reverse=True)

precisionList = []
trues = [b[1] for b in sortedByConfidences]
ks = [1, 10, 100, 1000, 10000]
for k in ks:
    precisionList += [sum(trues[:k])/k]

In [312]:
answers['Q8'] = precisionList

In [313]:
answers['Q8']

[0.0, 0.0, 0.03, 0.033, 0.0308]

In [250]:
assertFloatList(answers['Q8'], 5) #List of five floats

In [None]:
f = open("answers_hw1.txt", 'w') # Write your answers to a file
f.write(str(answers) + '\n')
f.close()