Week 1 quiz

In [None]:
def MSE(pred, actual):
    """Return the mean squared error between predictions and observed values.

    Args:
        pred: Sequence of predicted numeric values.
        actual: Sequence of observed numeric values, aligned with ``pred``.

    Returns:
        The mean of ``(actual[i] - pred[i]) ** 2`` over all positions.

    Raises:
        ValueError: If the two sequences differ in length or are empty.
    """
    length = len(pred)
    if length != len(actual):
        # Indexing by len(pred) used to ignore extra `actual` values silently
        # and to fail with IndexError when `actual` was the shorter one.
        raise ValueError(
            'pred and actual must have the same length, got '
            + str(length) + ' and ' + str(len(actual)))
    if length == 0:
        # The mean of zero terms is undefined; fail with a clear message
        # instead of a bare ZeroDivisionError.
        raise ValueError('MSE is undefined for empty input')

    summation = 0
    for predicted, observed in zip(pred, actual):
        summation += (observed - predicted) ** 2
    return summation / length

In [None]:
# Sample predictions (p) and observed values (a) used to try out MSE
p = [1, 222, 55, 77, 33, 41]
a = [1, 21, 5, 7, 3.2, 4.2]

In [None]:
MSE(p,a)

Week 2 Notes

In [None]:
import gzip
from collections import defaultdict
import string
import random
from nltk.stem.porter import PorterStemmer # Stemming
import numpy

In [None]:
# Gzipped, tab-separated data file; the path is relative to the working directory
path = 'amazon_reviews_us_Gift_Card_v1_00.tsv.gz'

In [None]:
# Open the gzip file in text mode ('rt'), decoding it as UTF-8.
# The handle stays open until it is explicitly closed.
f = gzip.open(path, 'rt', encoding='utf8')

In [None]:
# The first line of the file holds the column names:
# strip surrounding whitespace, then split on tabs to get a list of names.
header = f.readline()
header = header.strip().split('\t')

In [None]:
# Will hold one dict per data row (filled by the parsing loop)
dataset = []

In [None]:
# Parse every remaining line of the file into a dict keyed by column name.
# The handle is closed once parsing finishes (or fails), so the gzip file is
# not left open for the rest of the session. Re-running this cell therefore
# requires re-opening the file first.
try:
    for line in f:
        fields = line.strip().split('\t')
        d = dict(zip(header,fields))
        # These columns arrive as text; convert them to integers
        d['star_rating'] = int(d['star_rating'])
        d['helpful_votes'] = int(d['helpful_votes'])
        d['total_votes'] = int(d['total_votes'])
        dataset.append(d)
finally:
    f.close()

In [None]:
# how many unique words are there?

In [None]:
# Count every whitespace-separated token across all review bodies.
# Tokens are taken as-is, so differences in case or attached punctuation
# produce distinct entries.
wordCount = defaultdict(int)
for d in dataset:
    for w in d['review_body'].split():
        wordCount[w] += 1

# Number of distinct tokens
print(len(wordCount))

In [None]:
# What if we ignore capitalization and punctuation?

In [None]:
# Recount after normalizing the text: lowercase everything and drop every
# character listed in string.punctuation before splitting on whitespace.
wordCount = defaultdict(int)
punctuation = set(string.punctuation)
for d in dataset:
    # r is the review body lowercased with punctuation characters removed
    r = ''.join([c for c in d['review_body'].lower() if not c in punctuation])
    for w in r.split():
        wordCount[w] += 1

# Number of distinct normalized tokens
print(len(wordCount))

In [None]:
# what if we apply stemming?

In [None]:
# Recount once more, additionally mapping each normalized token to its
# Porter stem so that tokens sharing a stem are counted together.
wordCount = defaultdict(int)
punctuation = set(string.punctuation)
stemmer = PorterStemmer()
for d in dataset:
    # r is the review body lowercased with punctuation characters removed
    r = ''.join([c for c in d['review_body'].lower() if not c in punctuation])
    for w in r.split():
        w = stemmer.stem(w) # replace the token by its stem before counting
        wordCount[w] += 1

# Number of distinct stems
print(len(wordCount))

In [None]:
# extract and build features from the most common words

In [None]:
# Rebuild the counts without stemming (lowercased, punctuation removed);
# these counts are used to pick the vocabulary for the features.
wordCount = defaultdict(int)
punctuation = set(string.punctuation)
for d in dataset:
    # r is the review body lowercased with punctuation characters removed
    r = ''.join([c for c in d['review_body'].lower() if not c in punctuation])
    for w in r.split():
        wordCount[w] += 1

In [None]:
# Rank tokens by frequency: sort (count, word) pairs ascending, then reverse
# so the most frequent tokens come first.
counts = [(wordCount[w], w) for w in wordCount]
counts.sort()
counts.reverse()

# Vocabulary: the 1000 most frequent words (fewer if there are fewer tokens)
words = [x[1] for x in counts[:1000]]

# wordId maps each vocabulary word to its position in `words`
wordId = dict(zip(words, range(len(words))))
# Set form of the vocabulary, suitable for fast membership tests
wordSet = set(words)

In [None]:
def feature(datum):
    """Build a bag-of-words count vector for one review.

    Reads the notebook-level names ``words``, ``wordId`` and ``punctuation``.

    Args:
        datum: Mapping with a ``'review_body'`` string.

    Returns:
        A list of ``len(words) + 1`` numbers: the count of each vocabulary
        word in the lowercased, punctuation-stripped review body, followed
        by a constant 1 that serves as the offset term.
    """
    feat = [0]*len(words)
    r = ''.join([c for c in datum['review_body'].lower() if not c in punctuation])
    for w in r.split():
        # Dict lookup is O(1); the previous `w in words` test scanned the
        # whole vocabulary list for every token. wordId has exactly the
        # entries of `words` as keys, so the result is unchanged.
        idx = wordId.get(w)
        if idx is not None:
            feat[idx] += 1
    feat.append(1) #offset
    return feat

In [None]:
# Shuffle the rows in place. The generator is seeded first so that a fresh
# run of the notebook reproduces the same ordering (and the same results).
random.seed(0)
random.shuffle(dataset)

In [None]:
# Feature matrix: one vector from feature() per review, in dataset order
X = [feature(d) for d in dataset]

In [None]:
# Regression target: the integer star rating of each review, aligned with X
y = [d['star_rating'] for d in dataset]

In [None]:
# Ordinary least squares fit of y on X. rcond=None is passed explicitly:
# it selects NumPy's current default cutoff for small singular values and
# avoids the FutureWarning older NumPy versions emit when rcond is omitted.
theta,residuals,rank,s = numpy.linalg.lstsq(X,y,rcond=None)

In [None]:
# Pair each coefficient with its word; the final coefficient belongs to the
# constant feature appended by feature(), labelled 'offset' here.
wordWeights = list(zip(theta, words + ['offset']))
# Sort ascending by weight: most negative first, most positive last
wordWeights.sort()

In [None]:
wordWeights[:10]

In [None]:
wordWeights[-10:]

Adding a regularizer to our model

In [None]:
from sklearn import linear_model

In [None]:
help(linear_model.Ridge)

In [None]:
# Ridge regression; the first positional argument (1.0) is the regularization
# strength alpha. fit_intercept=False because feature() already appends a
# constant offset term to every vector.
model = linear_model.Ridge(1.0, fit_intercept=False)
model.fit(X,y)

In [None]:
# Coefficients learned by the fitted model (replaces the lstsq theta)
theta = model.coef_

In [None]:
# Pair each Ridge coefficient with its word ('offset' labels the constant
# feature) and sort ascending by weight. The result must be bound to
# `wordWeights`: the earlier `wordWeight` typo left the stale least-squares
# list in place, so that stale list was the one being sorted.
wordWeights = list(zip(theta, words + ['offset']))
wordWeights.sort()

In [None]:
wordWeights[:10]

In [None]:
wordWeights[-10:]

In [None]:
# Predictions on X, the same data the model was fit on
predictions = model.predict(X)

In [None]:
# Squared error of each prediction. The x, y inside the comprehension are
# local to it (Python 3) and do not rebind the notebook-level y.
differences = [(x-y)**2 for (x,y) in zip(predictions, y)]

In [None]:
# NOTE: this rebinds the name MSE to a number, shadowing the MSE function
# defined in the Week 1 quiz cell for the rest of the session.
MSE = sum(differences) / len(differences) # mean square error
print('MSE = ' + str(MSE))

In [None]:
# Compare the model's error with the variance of the labels themselves
FVU = MSE / numpy.var(y) # Fraction of Variance Unexplained
R2 = 1 - FVU # R^2 is the complementary fraction
print('R2 = ' + str(R2))

Convert the problem to a classification problem and solve using logistic regression

In [None]:
# Binary labels: True when the star rating is greater than 3, else False
y_class = [(rating > 3) for rating in y]

In [None]:
# Logistic regression with scikit-learn's default settings, fit on the
# binary labels (this rebinds `model`, replacing the Ridge model)
model = linear_model.LogisticRegression()
model.fit(X, y_class)

In [None]:
# Predicted class for every row of X (the data the model was fit on)
predictions = model.predict(X)

In [None]:
# Compare predictions with labels position by position.
# NOTE(review): this relies on `predictions` being a NumPy array so that ==
# is element-wise; with two plain lists it would yield a single bool.
correct = predictions == y_class

In [None]:
# Accuracy: fraction of examples whose prediction matches the label
accuracy = sum(correct) / len(correct)
print('Accuracy = ' + str(accuracy))

In [None]:
# Confusion-matrix counts (p = prediction, l = true label)
TP = sum([(p and l) for (p,l) in zip(predictions, y_class)])  # predicted True, label True
FP = sum([(p and not l) for (p,l) in zip(predictions, y_class)])  # predicted True, label False
TN = sum([(not p and not l) for (p,l) in zip(predictions, y_class)])  # predicted False, label False
FN = sum([(not p and l) for (p,l) in zip(predictions, y_class)])  # predicted False, label True

In [None]:
# Report the four confusion-matrix counts
print('TP = ' + str(TP))
print('FP = ' + str(FP))
print('TN = ' + str(TN))
print('FN = ' + str(FN))

In [None]:
# compute accuracy from the confusion-matrix counts:
# correct predictions (TP + TN) over all predictions
(TP + TN) / (TP + FP + TN + FN)

In [None]:
# True Positive Rate and True Negative Rate
TPR = TP / (TP + FN)  # fraction of truly positive examples predicted positive
TNR = TN / (TN + FP)  # fraction of truly negative examples predicted negative

In [None]:
# balanced error Rate: one minus the average of TPR and TNR, so both
# classes contribute equally regardless of how many examples each has
BER = 1 - 1/2 * (TPR + TNR)
print("Balanced error rate = " + str(BER))

In [None]:
# Precision: of the examples predicted positive, the fraction that are positive
precision = TP / (TP + FP)

In [None]:
# Recall: of the truly positive examples, the fraction predicted positive
# (same formula as TPR)
recall = TP / (TP + FN)

In [None]:
precision, recall

In [None]:
# F1 score: harmonic mean of precision and recall
F1 = 2 * (precision*recall) / (precision + recall)

In [None]:
F1

In [None]:
help(model)

In [None]:
# Per-example confidence scores from the classifier's decision_function
confidences = model.decision_function(X)

In [None]:
confidences

In [None]:
# Pair every confidence score with its true label and order the pairs from
# most to least confident. Without this sort the labels extracted from the
# pairs stay in dataset order, and precision/recall "at K" would not measure
# the K most confident predictions.
confidencesAndLabels = sorted(zip(confidences, y_class), reverse=True)

In [None]:
confidencesAndLabels

In [None]:
# Drop the scores and keep only the labels, in the same order as the pairs
# in confidencesAndLabels
labelsRankedByConfidence = [z[1] for z in confidencesAndLabels]

In [None]:
labelsRankedByConfidence

In [None]:
# Precision at K: share of positive labels among the first K entries
def precisionAtK(K, y_sorted):
    """Return the precision over the first ``K`` entries of ``y_sorted``.

    Args:
        K: Number of leading entries to consider; also the denominator.
        y_sorted: Sequence of truthy/falsy (or 0/1) labels, already ordered.

    Returns:
        The sum of the first ``K`` labels divided by ``K``.
    """
    top_k = y_sorted[:K]
    positives_in_top_k = sum(top_k)
    return positives_in_top_k / K

In [None]:
# Recall at K: share of all positive labels found within the first K entries
def recallAtK(K, y_sorted):
    """Return the recall achieved by the first ``K`` entries of ``y_sorted``.

    Args:
        K: Number of leading entries to consider.
        y_sorted: Sequence of truthy/falsy (or 0/1) labels, already ordered.

    Returns:
        The sum of the first ``K`` labels divided by the sum of all labels.
    """
    retrieved_positives = sum(y_sorted[:K])
    all_positives = sum(y_sorted)
    return retrieved_positives / all_positives

In [None]:
precisionAtK(50, labelsRankedByConfidence)

In [None]:
precisionAtK(1000,labelsRankedByConfidence)

In [None]:
precisionAtK(10000,labelsRankedByConfidence)

In [None]:
recallAtK(50, labelsRankedByConfidence)

In [None]:
recallAtK(1000,labelsRankedByConfidence)

In [None]:
recallAtK(10000,labelsRankedByConfidence)

Week 2 quiz

In [None]:
from collections import defaultdict
import string

def extract_popular(textstring):
    """Count characters in ``textstring``, ignoring case and punctuation.

    Every item yielded by iterating ``textstring`` is lowercased, and each
    resulting character that is not in ``string.punctuation`` is counted.
    Whitespace is not punctuation, so it is counted too.

    Args:
        textstring: A string (or any iterable of strings).

    Returns:
        A list of ``(count, character)`` tuples sorted in descending order.
    """
    excluded = set(string.punctuation)
    tally = defaultdict(int)
    for chunk in textstring:
        for ch in chunk.lower():
            if ch not in excluded:
                tally[ch] += 1
    return sorted(((n, ch) for ch, n in tally.items()), reverse=True)

In [None]:
print(extract_popular("A man, a plan, a canal: Panama"))

# Week 3 notes

### Data validation

Our data set is now split into 3 sections: training data, validation data, and testing data.
Our training data helps us find our theta value.
The validation data helps us find our lambda value.
The testing data is used to evaluate the model.

Theorems:
1. Training error should increase as lambda increases.
    - larger values of lambda penalize complexity more
    - for large lambda the model will generally behave like a "trivial" model
2. Validation and test error will be larger than training error
    - new data will not test as well as training data
    - validation error can be used to identify underfitting and overfitting
3. There will usually be a "sweet spot" between under- and over- fitting; this is the model we ultimately select


Due to the randomness of real datasets, these theorems may not always hold precisely.
However, they are good guidelines - if these theorems are badly violated it could be a sign of a bug.


### Validation pipeline

* Split the data into train/validation/test fractions
* Consider several different values of our hyperparameters (e.g. lambda)
* For each of these values, train a model on the training set
* Evaluate each model's performance on the validation set
* For the model that performs best on the validation set, evaluate its performance on the test set

In [1]:
# Training / Validation / test pipeline

In [2]:
import gzip
from collections import defaultdict
import string
import random

In [3]:
# Gzipped, tab-separated data file; the path is relative to the working directory
path = 'amazon_reviews_us_Gift_Card_v1_00.tsv.gz'

In [4]:
# Open the gzip file in text mode ('rt'), decoding it as UTF-8.
# The handle stays open until it is explicitly closed.
f = gzip.open(path, 'rt', encoding='utf8')

In [5]:
# The first line of the file holds the column names:
# strip surrounding whitespace, then split on tabs to get a list of names.
header = f.readline()
header = header.strip().split('\t')

In [6]:
# Will hold one dict per data row (filled by the parsing loop)
dataset = []

In [7]:
# Parse every remaining line of the file into a dict keyed by column name.
# The handle is closed once parsing finishes (or fails), so the gzip file is
# not left open for the rest of the session. Re-running this cell therefore
# requires re-opening the file first.
try:
    for line in f:
        fields = line.strip().split('\t')
        d = dict(zip(header, fields))
        # These columns arrive as text; convert them to integers
        d['star_rating'] = int(d['star_rating'])
        d['helpful_votes'] = int(d['helpful_votes'])
        d['total_votes'] = int(d['total_votes'])
        dataset.append(d)
finally:
    f.close()

In [10]:
# Count lowercased, punctuation-stripped tokens across all review bodies
wordCount = defaultdict(int)
punctuation = set(string.punctuation)

for d in dataset:
    # r is the review body lowercased with punctuation characters removed
    r = ''.join([c for c in d['review_body'].lower() if not c in punctuation])
    for w in r.split():
        wordCount[w] += 1

# Rank tokens by frequency: sort (count, word) pairs ascending, then reverse
# so the most frequent tokens come first.
counts = [(wordCount[w], w) for w in wordCount]
counts.sort()
counts.reverse()

# Vocabulary: the 1000 most frequent words (fewer if there are fewer tokens)
words = [x[1] for x in counts[:1000]]

# wordId maps each vocabulary word to its position in `words`
wordId = dict(zip(words, range(len(words))))
# Set form of the vocabulary, suitable for fast membership tests
wordSet = set(words)

In [12]:
def feature(datum):
    """Build a bag-of-words count vector for one review.

    Reads the notebook-level names ``words``, ``wordId`` and ``punctuation``.

    Args:
        datum: Mapping with a ``'review_body'`` string.

    Returns:
        A list of ``len(words) + 1`` numbers: the count of each vocabulary
        word in the lowercased, punctuation-stripped review body, followed
        by a constant 1 that serves as the offset term.
    """
    feat = [0] * len(words)
    r = ''.join([c for c in datum['review_body'].lower() if not c in punctuation])
    for w in r.split():
        # Dict lookup is O(1); the previous `w in words` test scanned the
        # whole vocabulary list for every token. wordId has exactly the
        # entries of `words` as keys, so the result is unchanged.
        idx = wordId.get(w)
        if idx is not None:
            feat[idx] += 1
    feat.append(1) # offset
    return feat

In [13]:
# Shuffle the rows in place so that positional slices of the data are random
# samples. The generator is seeded first so that a fresh run of the notebook
# reproduces the same ordering, and therefore the same split and errors.
random.seed(0)
random.shuffle(dataset)

In [15]:
# Feature matrix: one vector from feature() per review, in dataset order
X = [feature(d) for d in dataset]

In [16]:
# Regression target: the integer star rating of each review, aligned with X
y = [d['star_rating'] for d in dataset]

In [19]:
# splits the data 50%/25%/25% into train/validation/test by position.
# The slices are contiguous, so the split is only random if the rows were
# shuffled before X and y were built.

N = len(X)
X_train = X[:N//2]  # from beginning to halfway
X_valid = X[N//2:3*N//4] # from halfway to 3/4
X_test = X[3*N//4:] # from 3/4 to end

# the labels are cut at the same positions so they stay aligned with the features
y_train = y[:N//2]  
y_valid = y[N//2:3*N//4] 
y_test = y[3*N//4:]

In [18]:
len(X), len(X_train), len(X_valid), len(X_test)

(149086, 74543, 37271, 37272)

In [20]:
from sklearn import linear_model

In [21]:
help(linear_model.Ridge)

Help on class Ridge in module sklearn.linear_model._ridge:

class Ridge(sklearn.base.MultiOutputMixin, sklearn.base.RegressorMixin, _BaseRidge)
 |  Ridge(alpha=1.0, fit_intercept=True, normalize=False, copy_X=True, max_iter=None, tol=0.001, solver='auto', random_state=None)
 |  
 |  Linear least squares with l2 regularization.
 |  
 |  Minimizes the objective function::
 |  
 |  ||y - Xw||^2_2 + alpha * ||w||^2_2
 |  
 |  This model solves a regression model where the loss function is
 |  the linear least squares function and regularization is given by
 |  the l2-norm. Also known as Ridge Regression or Tikhonov regularization.
 |  This estimator has built-in support for multi-variate regression
 |  (i.e., when y is a 2d-array of shape (n_samples, n_targets)).
 |  
 |  Read more in the :ref:`User Guide <ridge_regression>`.
 |  
 |  Parameters
 |  ----------
 |  alpha : {float, ndarray of shape (n_targets,)}, default=1.0
 |      Regularization strength; must be a positive float. Regulari

In [23]:
# Mean squared error of a fitted model on a labelled data set
def MSE(model, X, y):
    """Return the mean squared error of ``model``'s predictions on ``X``.

    Args:
        model: Object exposing ``predict(X)`` that returns one prediction per row.
        X: Feature rows passed to ``model.predict``.
        y: True target values, aligned with ``X``.

    Returns:
        The average of the squared differences between predictions and targets.
    """
    squared_errors = []
    for predicted, target in zip(model.predict(X), y):
        squared_errors.append((predicted - target) ** 2)
    return sum(squared_errors) / len(squared_errors)

In [27]:
# keep track of which model works the best:
# both stay None until the first candidate has been evaluated
bestModel = None  # model with the lowest validation error so far
bestMSE = None  # validation MSE achieved by bestModel

In [28]:
# fit a model for each value of lambda and keep the one with the lowest
# validation error
for lamb in [0.01, 0.1, 1, 10, 100]:
    # fit_intercept=False: the feature vectors already carry a constant offset term
    model = linear_model.Ridge(lamb, fit_intercept=False)
    model.fit(X_train, y_train)
    
    mseTrain = MSE(model, X_train, y_train)
    mseValid = MSE(model, X_valid, y_valid)
    
    # report the training and validation error
    print('lambda = ' + str(lamb) + ', training/validation error = ' +
            str(mseTrain) + '/' + str(mseValid))
    # Compare against None explicitly: "no model selected yet" must not
    # depend on the truthiness of an estimator object.
    if bestModel is None or mseValid < bestMSE:
        bestModel = model
        bestMSE = mseValid

lambda = 0.01, training/validation error = 0.41378131575626664/0.43752825052808
lambda = 0.1, training/validation error = 0.4137813237671389/0.4375226453192556
lambda = 1, training/validation error = 0.413782116531864/0.43746766804696896
lambda = 10, training/validation error = 0.41385386504743615/0.43701512869734976
lambda = 100, training/validation error = 0.4173667981294458/0.4371620201405547


In [29]:
# Finally report the test error for the model with the best performance on the validation set.
# The test set is used only here, after model selection is finished.
mseTest = MSE(bestModel, X_test, y_test)
print('test error = ' + str(mseTest))

test error = 0.4630262123790468


Week 3 quiz

In [50]:
def split_dataset(X, y):
    """Split two parallel sequences 60% / 20% / 20% by position.

    The cut points are computed from ``len(X)`` with integer division and
    applied to both ``X`` and ``y``.

    Args:
        X: Sliceable sequence of examples.
        y: Sliceable sequence of labels, aligned with ``X``.

    Returns:
        ``[X_train, X_valid, X_test, y_train, y_valid, y_test]``.
    """
    total = len(X)
    train_end = 6 * total // 10  # end of the first 60%
    valid_end = 8 * total // 10  # end of the next 20%
    bounds = [
        slice(None, train_end),       # train
        slice(train_end, valid_end),  # validation
        slice(valid_end, None),       # test
    ]
    return [X[b] for b in bounds] + [y[b] for b in bounds]

In [51]:
# Try split_dataset on ten items: expect 6 / 2 / 2 elements per part.
# NOTE: this rebinds the notebook-level X and y used by the earlier cells.
X = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"]
y = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
split_dataset(X,y)

[['a', 'b', 'c', 'd', 'e', 'f'],
 ['g', 'h'],
 ['i', 'j'],
 [1, 2, 3, 4, 5, 6],
 [7, 8],
 [9, 10]]