# Meaningful Predictive Modeling Final Project

For this project I used the Amazon gift card review data. This data set has good variety and is large enough to provide reliable training, validation, and test sets.
I changed a few of the parameters from the lesson to see how they affect the results. I added several more candidate values for lambda to see whether the model improves with higher or lower regularization. I also increased the word bank (vocabulary size) from 1,000 to 10,000 words.

In [1]:
import gzip
from collections import defaultdict
import string
import random
from nltk.stem.porter import PorterStemmer # Stemming
import numpy
from sklearn import linear_model

In [2]:
# Path to the gzip-compressed, tab-separated Amazon gift card review file.
path = 'amazon_reviews_us_Gift_Card_v1_00.tsv.gz'

In [3]:
# Open the gzip file in text mode, decoding its contents as UTF-8.
f = gzip.open(path, 'rt', encoding='utf8')

In [4]:
# The first line of the file holds the tab-separated column names.
header = f.readline().strip().split('\t')

In [5]:
# Accumulates one dict per review row.
dataset = []

In [6]:
# Parse every remaining line of the file into a dict keyed by column name,
# converting the three numeric columns to int.
try:
    for line in f:
        # Strip only the line terminator: a bare strip() would also remove
        # trailing tabs and so silently drop empty trailing columns.
        fields = line.rstrip('\n').split('\t')
        d = dict(zip(header, fields))
        d['star_rating'] = int(d['star_rating'])
        d['helpful_votes'] = int(d['helpful_votes'])
        d['total_votes'] = int(d['total_votes'])
        dataset.append(d)
finally:
    # The handle is not read again after this loop; close it so the file
    # is not left open.
    f.close()

In [7]:
# Count how often each word occurs across all review bodies, after
# lower-casing the text and dropping punctuation characters.
wordCount = defaultdict(int)
punctuation = set(string.punctuation)
for d in dataset:
    r = ''.join(c for c in d['review_body'].lower() if c not in punctuation)
    for w in r.split():
        wordCount[w] += 1

# (count, word) pairs, most frequent first; ties fall in descending word order.
counts = sorted(((wordCount[w], w) for w in wordCount), reverse=True)

# Vocabulary: the 10,000 most frequent words.
words = [word for _, word in counts[:10_000]]

# Position of each vocabulary word in the feature vector.
wordId = {word: index for index, word in enumerate(words)}
wordSet = set(words)

In [8]:
def feature(datum):
    """Build the bag-of-words feature vector for one review.

    The review text is lower-cased, punctuation characters are removed and
    the result is split on whitespace. Each word found in the vocabulary
    increments the count at that word's index.

    Args:
        datum: Mapping with a 'review_body' string.

    Returns:
        A list of len(words) + 1 ints: the per-word counts followed by a
        constant 1 that serves as the offset term.
    """
    feat = [0] * len(words)
    r = ''.join([c for c in datum['review_body'].lower() if not c in punctuation])
    for w in r.split():
        # Look the word up in the wordId dict (constant time) instead of
        # scanning the words list once per token.
        index = wordId.get(w)
        if index is not None:
            feat[index] += 1
    feat.append(1)  # offset
    return feat

In [9]:
# Fix the seed so the shuffle, and therefore the train/validation/test
# split, is the same on every run.
random.seed(0)
random.shuffle(dataset)

In [10]:
# One bag-of-words feature vector per review, in dataset order.
X = list(map(feature, dataset))

In [11]:
# Prediction target: the star rating of each review, in dataset order.
y = [review['star_rating'] for review in dataset]

In [12]:
# Split features and labels at the same indices:
# first 50% for training, next 25% for validation, final 25% for testing.
N = len(X)
X_train, X_valid, X_test = X[:N//2], X[N//2:3*N//4], X[3*N//4:]
y_train, y_valid, y_test = y[:N//2], y[N//2:3*N//4], y[3*N//4:]

In [13]:
def MSE(model, X, y):
    """Return the mean squared error of model.predict(X) against y."""
    total = 0
    count = 0
    # Accumulate squared residuals in a single pass over the paired values.
    for predicted, actual in zip(model.predict(X), y):
        total += (predicted - actual) ** 2
        count += 1
    return total / count

In [14]:
# Track the model with the lowest validation error so far, and that error.
bestModel = bestMSE = None

In [15]:
# Fit one ridge regression per regularization strength and keep the model
# with the lowest validation error.
for lamb in [0.001, 0.01, 0.1, 1, 10, 100, 1000]:
    # The feature vectors already end in a constant offset column, so the
    # model's own intercept is switched off.
    model = linear_model.Ridge(alpha=lamb, fit_intercept=False)
    model.fit(X_train, y_train)

    mseTrain = MSE(model, X_train, y_train)
    mseValid = MSE(model, X_valid, y_valid)

    # report the training and validation error
    print(f'lambda = {lamb!s}, training/validation error = {mseTrain!s}/{mseValid!s}')

    # Compare against None explicitly rather than relying on the truthiness
    # of the estimator object.
    if bestModel is None or mseValid < bestMSE:
        bestModel = model
        bestMSE = mseValid

lambda = 0.001, training/validation error = 0.2918175240699372/0.8186774790509076
lambda = 0.01, training/validation error = 0.29189094015512734/0.7088928432030138
lambda = 0.1, training/validation error = 0.29271204029047565/0.5992015854861438
lambda = 1, training/validation error = 0.3007277044142932/0.4990569421463033
lambda = 10, training/validation error = 0.3344172580344908/0.43391730979735676
lambda = 100, training/validation error = 0.3872563395468438/0.4242473175453558
lambda = 1000, training/validation error = 0.4540654698496216/0.4628953474708325


In [16]:
# Finally, report the test error of the model that performed best on the validation set
mseTest = MSE(bestModel, X_test, y_test)
print(f'test error = {mseTest!s}')

test error = 0.43646316326319934


In [17]:
# Fraction of Variance Unexplained: test MSE divided by the variance of the test labels.
FVU = mseTest / numpy.var(y_test)
# R^2 is the complement of the unexplained fraction.
R2 = 1 - FVU
print(f'R2 = {R2!s}')

R2 = 0.3730957691813146
