# Amazon Video Game Rating - Classification Analysis

This notebook walks through step-by-step improvements on a naive bag-of-words model. For the introductory data analysis and a 
regression-based comparison for this task, please see the attached notebook.

In [1]:
import gzip
import sklearn
from collections import defaultdict
import random
import math
import numpy as np
from sklearn.metrics import jaccard_score as jaccard
import json
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
import scipy.optimize
import string
from nltk.stem.porter import *

In [2]:
def parse(path):
    """Lazily yield one decoded JSON object per line of a gzip-compressed file.

    Parameters
    ----------
    path : str
        Location of a gzip file holding one JSON document per line.

    Yields
    ------
    object
        The result of ``json.loads`` applied to each line, in file order.

    The file is opened inside a ``with`` block so the handle is released as
    soon as the generator is exhausted, closed, or garbage-collected, instead
    of being left open for the lifetime of the kernel.
    """
    with gzip.open(path, 'r') as g:
        for l in g:
            yield json.loads(l)

In [3]:
# Read every record of the gzipped JSON-lines dump into an in-memory list.
data = list(parse("./Video_Games_5.json.gz"))

In [4]:
# Tabular view of the same records, used below for exploratory summaries.
df = pd.DataFrame(data)

In [5]:
df['overall'].value_counts()

5.0    299759
4.0     93654
3.0     49146
1.0     30883
2.0     24135
Name: overall, dtype: int64

In [6]:
# Inputs are the raw review records; targets are their star ratings.
X = list(data)
y = [record['overall'] for record in data]


In [7]:
#shuffle data
# Pair each record with its label so both are permuted together.
Xy = list(zip(X,y))
# A dedicated, seeded generator makes the permutation (and therefore the
# train/validation split and every reported metric) identical on each run,
# without touching the state of the module-level `random` generator.
random.Random(0).shuffle(Xy)

In [8]:
# Split the shuffled (record, label) pairs back into parallel numpy arrays;
# labels are cast to integers so they act as class ids.
X = np.array([record for record, _ in Xy])
y = np.array([label for _, label in Xy]).astype('int')

In [9]:
# df['len_rev'] = df['reviewText'].str.len()

In [10]:
data[0]

{'overall': 5.0,
 'verified': True,
 'reviewTime': '10 17, 2015',
 'reviewerID': 'A1HP7NVNPFMA4N',
 'asin': '0700026657',
 'reviewerName': 'Ambrosia075',
 'reviewText': "This game is a bit hard to get the hang of, but when you do it's great.",
 'summary': "but when you do it's great.",
 'unixReviewTime': 1445040000}

In [11]:
df.groupby(['reviewerID']).size()
# ['overall'].value_counts()

reviewerID
A0059486XI1Z0P98KP35     5
A0220159ZRNBTRKLG08H     6
A0266076X6KPZ6CCHGVS    14
A0277912HT4JSJKVSL3E    10
A02836981FYG9912C66F     7
                        ..
AZZNK89PXD006            7
AZZQCK9ZAKMFR           11
AZZT1ERHBSNQ8            7
AZZTC2OYVNE2Q            6
AZZTOUKVTUMVM            6
Length: 55223, dtype: int64

In [12]:
df[df['reviewerID'].isnull()]

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote,style,image


In [13]:
len(data)

497577

In [14]:
len(Xy)

497577

In [56]:
# Hold-out split over the shuffled arrays: the first 50,000 reviews are used
# for training and the following 10,000 for validation. No test split is
# created in this notebook.
train_rows = slice(None, 50000)
valid_rows = slice(50000, 60000)

Xtrain, ytrain = X[train_rows], y[train_rows]
Xvalid, yvalid = X[valid_rows], y[valid_rows]

In [16]:
# Candidate values for LogisticRegression's C (inverse regularization strength).
C = [.01, .1, 1, 10, 100]

# Unigrams,  punc,  tf-idf

In [58]:
# Unigram vocabulary (case and punctuation preserved), counted over the
# training split only.
unigrams = defaultdict(int)
for review in Xtrain:
    # some records carry no review body
    if 'reviewText' not in review:
        continue
    tokens = " ".join(review['reviewText'].splitlines()).strip().split()
    for tok in tokens:
        unigrams[tok] += 1

# Keep the 500 most frequent tokens and assign each one a column index.
mostCommonUni = sorted(unigrams.items(), key=lambda kv: kv[1], reverse=True)[:500]
unigram_words = [word for word, _ in mostCommonUni]
unigramId = {word: col for col, word in enumerate(unigram_words)}
unigramSet = set(unigram_words)

In [15]:
#docFreq and tf
#training data
# docFreq maps each token to the set of training reviews that contain it, so
# len(docFreq[token]) is the token's document frequency.
# Reviews are identified by their position in Xtrain rather than by
# reviewerID: one reviewer can author several reviews, and keying on the
# reviewer would merge those into a single "document" and under-count.
docFreq = defaultdict(set)
for doc_id, d in enumerate(Xtrain):
    if 'reviewText' in d:
        t = d['reviewText']
        text = " ".join(t.splitlines())
        # set(): a token only needs to be registered once per review
        for u in set(text.strip().split()):
            docFreq[u].add(doc_id)

#term freq
# Corpus-wide token counts from the vocabulary pass (not per-review counts).
tf = unigrams


In [14]:
def feature_uni_punc_tfidf(datum):
    """Build the tf-idf unigram feature vector for one review (punctuation kept).

    Parameters
    ----------
    datum : dict
        A review record; its 'reviewText' entry, when present, is tokenized
        on whitespace with case and punctuation preserved.

    Returns
    -------
    list
        ``len(unigramSet) + 1`` numbers: one tf-idf weight per vocabulary
        token (0 when absent from the review) followed by a constant 1 that
        acts as the bias term.

    Notes
    -----
    Reads the notebook-level ``unigramSet``, ``unigramId``, ``docFreq`` and
    ``Xtrain``. ``docFreq`` must be the unigram table when this runs; a later
    cell rebinds that name to the bigram table.

    The term-frequency factor is the token's count inside *this* review.
    Multiplying by the corpus-wide count instead would give every review the
    same value for a token regardless of how often it occurs in that review.
    """
    feat = [0]*len(unigramSet)
    if 'reviewText' in datum:
        tokens = " ".join(datum['reviewText'].splitlines()).strip().split()

        # in-review occurrence count of each vocabulary token
        counts = defaultdict(int)
        for u in tokens:
            if u in unigramSet:
                counts[u] += 1

        n_docs = len(Xtrain)
        for u, count in counts.items():
            idf = np.log(n_docs / len(docFreq[u]))
            feat[unigramId[u]] = count * idf

    feat.append(1)
    return feat

In [15]:
# Feature matrices for model 1 (unigrams, keep punctuation, tf-idf).
Xtrain_1 = list(map(feature_uni_punc_tfidf, Xtrain))
Xvalid_1 = list(map(feature_uni_punc_tfidf, Xvalid))

In [20]:
# Characters stripped by the "discard punc" feature extractors; a set gives
# constant-time membership checks per character.
punctuation = set(string.punctuation)

# Unigrams, No punc, tf-idf

In [15]:
#unigrams, discard punc, tfidf
def feature_uni_nopunc_tfidf(datum):
    """Build the tf-idf unigram feature vector for one review (punctuation dropped).

    The review text is lower-cased and stripped of every character in the
    notebook-level ``punctuation`` set before whitespace tokenization.

    Parameters
    ----------
    datum : dict
        A review record; 'reviewText' may be missing.

    Returns
    -------
    list
        ``len(unigramSet) + 1`` numbers: one tf-idf weight per vocabulary
        token (0 when absent) followed by a constant 1 bias term.

    Notes
    -----
    Reads the notebook-level ``unigramSet``, ``unigramId``, ``docFreq``,
    ``Xtrain`` and ``punctuation``.

    The term-frequency factor is the token's count inside *this* review;
    the corpus-wide count would make the weight independent of the review.

    NOTE(review): the vocabulary and ``docFreq`` are built from raw,
    case-preserved tokens, so only vocabulary entries that are already
    lower-case and punctuation-free can match here -- confirm this is intended.
    """
    feat = [0]*len(unigramSet)
    if 'reviewText' in datum:
        lowered = datum['reviewText'].lower()
        cleaned = ''.join([c for c in lowered if c not in punctuation])
        tokens = " ".join(cleaned.splitlines()).strip().split()

        # in-review occurrence count of each vocabulary token
        counts = defaultdict(int)
        for u in tokens:
            if u in unigramSet:
                counts[u] += 1

        n_docs = len(Xtrain)
        for u, count in counts.items():
            idf = np.log(n_docs / len(docFreq[u]))
            feat[unigramId[u]] = count * idf

    feat.append(1)
    return feat

In [16]:
# Feature matrices for model 2 (unigrams, discard punctuation, tf-idf).
Xtrain_2 = list(map(feature_uni_nopunc_tfidf, Xtrain))
Xvalid_2 = list(map(feature_uni_nopunc_tfidf, Xvalid))

In [14]:
len(unigramSet)

1000

# Unigrams, punc

In [15]:
#unigrams, keep punc, counts
def feature_uni_punc_wc(datum):
    """Count vocabulary unigrams in one review (case and punctuation kept).

    Returns a list with one occurrence count per entry of the notebook-level
    ``unigramSet`` (positions given by ``unigramId``), followed by a
    constant 1. A record without 'reviewText' yields all-zero counts.
    """
    vector = [0] * len(unigramSet)
    if 'reviewText' in datum:
        tokens = " ".join(datum['reviewText'].splitlines()).strip().split()
        for column in (unigramId[tok] for tok in tokens if tok in unigramSet):
            vector[column] += 1
    return vector + [1]

In [16]:
# Feature matrices for model 3 (unigrams, keep punctuation, counts).
Xtrain_3 = list(map(feature_uni_punc_wc, Xtrain))
Xvalid_3 = list(map(feature_uni_punc_wc, Xvalid))

# Unigrams, No punc

In [18]:
#unigrams, discard punc, counts
def feature_uni_nopunc_wc(datum):
    """Count vocabulary unigrams in one review after lower-casing and
    removing every character found in the notebook-level ``punctuation`` set.

    Returns a list with one occurrence count per entry of ``unigramSet``
    (positions given by ``unigramId``), followed by a constant 1. A record
    without 'reviewText' yields all-zero counts.
    """
    vector = [0] * len(unigramSet)
    if 'reviewText' in datum:
        lowered = datum['reviewText'].lower()
        cleaned = ''.join(ch for ch in lowered if ch not in punctuation)
        tokens = " ".join(cleaned.splitlines()).strip().split()
        for column in (unigramId[tok] for tok in tokens if tok in unigramSet):
            vector[column] += 1
    return vector + [1]

In [59]:
# Feature matrices for model 4 (unigrams, discard punctuation, counts).
Xtrain_4 = list(map(feature_uni_nopunc_wc, Xtrain))
Xvalid_4 = list(map(feature_uni_nopunc_wc, Xvalid))

In [30]:
# Start of the bigram models: bigram vocabulary (case and punctuation
# preserved), counted over the training split only.
bigrams = defaultdict(int)

for review in Xtrain:
    if 'reviewText' not in review:
        continue
    # split on single spaces, so consecutive spaces produce empty-string tokens
    words = " ".join(review['reviewText'].splitlines()).split(" ")
    for pair in zip(words, words[1:]):
        bigrams[pair] += 1

# Keep the 5000 most frequent bigrams and assign each one a column index.
mostCommonBi = sorted(bigrams.items(), key=lambda kv: kv[1], reverse=True)[:5000]
bigram_words = [pair for pair, _ in mostCommonBi]
bigramId = {pair: col for col, pair in enumerate(bigram_words)}
bigramSet = set(bigram_words)

In [22]:
#docFreq and tf
#training data
# docFreq maps each bigram to the set of training reviews that contain it, so
# len(docFreq[bigram]) is the bigram's document frequency.
# Reviews are identified by their position in Xtrain rather than by
# reviewerID: one reviewer can author several reviews, and keying on the
# reviewer would merge those into a single "document" and under-count.
# NOTE: this rebinds the notebook-level `docFreq` and `tf` that the unigram
# tf-idf extractors also read, so those must be evaluated before this cell.
docFreq = defaultdict(set)
for doc_id, d in enumerate(Xtrain):
    if 'reviewText' in d:
        words = " ".join(d['reviewText'].splitlines()).split(" ")
        # set(): a bigram only needs to be registered once per review
        for b in set(zip(words, words[1:])):
            docFreq[b].add(doc_id)

#term freq
# Corpus-wide bigram counts from the vocabulary pass (not per-review counts).
tf = bigrams

# Bigrams,  punc, tf-idf

In [23]:
#bigrams, keep punc, tfidf
def feature_bi_punc_tfidf(datum):
    """Build the tf-idf bigram feature vector for one review (punctuation kept).

    Parameters
    ----------
    datum : dict
        A review record; its 'reviewText' entry, when present, is split on
        single spaces and adjacent tokens are paired into bigrams.

    Returns
    -------
    list
        ``len(bigramSet) + 1`` numbers: one tf-idf weight per vocabulary
        bigram (0 when absent from the review) followed by a constant 1 that
        acts as the bias term.

    Notes
    -----
    Reads the notebook-level ``bigramSet``, ``bigramId``, ``docFreq`` and
    ``Xtrain``; ``docFreq`` must be the bigram table when this runs.

    The term-frequency factor is the bigram's count inside *this* review.
    Multiplying by the corpus-wide count instead would give every review the
    same value for a bigram regardless of how often it occurs in that review.
    """
    feat = [0]*len(bigramSet)
    if 'reviewText' in datum:
        words = " ".join(datum['reviewText'].splitlines()).split(" ")

        # in-review occurrence count of each vocabulary bigram
        counts = defaultdict(int)
        for b in zip(words, words[1:]):
            if b in bigramSet:
                counts[b] += 1

        n_docs = len(Xtrain)
        for b, count in counts.items():
            idf = np.log(n_docs / len(docFreq[b]))
            feat[bigramId[b]] = count * idf

    feat.append(1)
    return feat

In [24]:
# Feature matrices for model 5 (bigrams, keep punctuation, tf-idf).
Xtrain_5 = list(map(feature_bi_punc_tfidf, Xtrain))
Xvalid_5 = list(map(feature_bi_punc_tfidf, Xvalid))

# Bigrams, No punc, tf-idf

In [28]:
#bigrams, discard punc, tfidf
def feature_bi_nopunc_tfidf(datum):
    """Build the tf-idf bigram feature vector for one review (punctuation dropped).

    The review text is lower-cased and stripped of every character in the
    notebook-level ``punctuation`` set, then split on single spaces and
    paired into bigrams.

    Parameters
    ----------
    datum : dict
        A review record; 'reviewText' may be missing.

    Returns
    -------
    list
        ``len(bigramSet) + 1`` numbers: one tf-idf weight per vocabulary
        bigram (0 when absent) followed by a constant 1 bias term.

    Notes
    -----
    Reads the notebook-level ``bigramSet``, ``bigramId``, ``docFreq``,
    ``Xtrain`` and ``punctuation``.

    The term-frequency factor is the bigram's count inside *this* review;
    the corpus-wide count would make the weight independent of the review.

    NOTE(review): the bigram vocabulary and ``docFreq`` are built from raw,
    case-preserved text, so only vocabulary bigrams that are already
    lower-case and punctuation-free can match here -- confirm this is intended.
    """
    feat = [0]*len(bigramSet)
    if 'reviewText' in datum:
        lowered = datum['reviewText'].lower()
        cleaned = ''.join([c for c in lowered if c not in punctuation])
        words = " ".join(cleaned.splitlines()).split(" ")

        # in-review occurrence count of each vocabulary bigram
        counts = defaultdict(int)
        for b in zip(words, words[1:]):
            if b in bigramSet:
                counts[b] += 1

        n_docs = len(Xtrain)
        for b, count in counts.items():
            idf = np.log(n_docs / len(docFreq[b]))
            feat[bigramId[b]] = count * idf

    feat.append(1)
    return feat

In [26]:
# Feature matrices for model 6 (bigrams, discard punctuation, tf-idf).
Xtrain_6 = list(map(feature_bi_nopunc_tfidf, Xtrain))
Xvalid_6 = list(map(feature_bi_nopunc_tfidf, Xvalid))

# Bigrams, punc

In [27]:
#bigrams, keep punc, counts
def feature_bi_punc_wc(datum):
    """Count vocabulary bigrams in one review (case and punctuation kept).

    The text is split on single spaces and adjacent tokens are paired.
    Returns a list with one occurrence count per entry of the notebook-level
    ``bigramSet`` (positions given by ``bigramId``), followed by a
    constant 1. A record without 'reviewText' yields all-zero counts.
    """
    vector = [0] * len(bigramSet)
    if 'reviewText' in datum:
        words = " ".join(datum['reviewText'].splitlines()).split(" ")
        for pair in zip(words, words[1:]):
            if pair in bigramSet:
                vector[bigramId[pair]] += 1
    return vector + [1]

In [28]:
# Feature matrices for model 7 (bigrams, keep punctuation, counts).
Xtrain_7 = list(map(feature_bi_punc_wc, Xtrain))
Xvalid_7 = list(map(feature_bi_punc_wc, Xvalid))

# Bigrams, No punc

In [31]:
#bigrams, discard punc, counts
def feature_bi_nopunc_wc(datum):
    """Count vocabulary bigrams in one review after lower-casing and removing
    every character found in the notebook-level ``punctuation`` set.

    The cleaned text is split on single spaces and adjacent tokens are
    paired. Returns a list with one occurrence count per entry of
    ``bigramSet`` (positions given by ``bigramId``), followed by a
    constant 1. A record without 'reviewText' yields all-zero counts.
    """
    vector = [0] * len(bigramSet)
    if 'reviewText' in datum:
        lowered = datum['reviewText'].lower()
        cleaned = ''.join(ch for ch in lowered if ch not in punctuation)
        words = " ".join(cleaned.splitlines()).split(" ")
        for pair in zip(words, words[1:]):
            if pair in bigramSet:
                vector[bigramId[pair]] += 1
    return vector + [1]

In [32]:
# Feature matrices for model 8 (bigrams, discard punctuation, counts).
Xtrain_8 = list(map(feature_bi_nopunc_wc, Xtrain))
Xvalid_8 = list(map(feature_bi_nopunc_wc, Xvalid))

# Pipeline

In [60]:
# pipeline: fit one logistic-regression classifier per feature matrix and
# print its accuracy on the validation split.
# Full sweep over all eight feature sets:
# to_fit = [Xtrain_1, Xtrain_2, Xtrain_3, Xtrain_4, Xtrain_5, Xtrain_6, Xtrain_7, Xtrain_8]
# to_pred = [Xvalid_1, Xvalid_2, Xvalid_3, Xvalid_4, Xvalid_5, Xvalid_6, Xvalid_7, Xvalid_8]
to_fit = [Xtrain_4]
to_pred = [Xvalid_4]
model_performances = []
for i, train_features in enumerate(to_fit):
    # regularization strength is pinned here; iterate over C to sweep it
    c = 0.1
    # feature vectors already end with a constant 1, so no intercept is fitted
    clf = LogisticRegression(C=c, fit_intercept=False, max_iter=10000)
    clf.fit(train_features, ytrain)
    theta = clf.coef_
    predictions = clf.predict(to_pred[i])
    correct = (predictions == yvalid)
    # fraction of validation reviews whose rating was predicted exactly
    acc = correct.sum() / len(correct)
    print(acc)
    model_performances.append(acc)

0.6283


In [25]:
len([d for d in Xvalid if d['overall'] == 5])/len(Xvalid)

0.5963

In [21]:
len([d for d in Xvalid if d['overall'] == 5])/len(Xvalid)

0.6035

In [None]:
# Human-readable labels for the eight feature sets, in the same order as
# Xtrain_1 .. Xtrain_8.
model_names = ["unigrams, keep punc, tfidf",
"unigrams, discard punc, tfidf",
"unigrams, keep punc, counts",
"unigrams, discard punc, counts",
"bigrams, keep punc, tfidf",
"bigrams, discard punc, tfidf",
"bigrams, keep punc, counts",
"bigrams, discard punc, counts"]

# One (model, regularization value) pair per combination. The regularization
# grid is the list `C` defined earlier in the notebook; the name `A` used
# here previously is not defined anywhere and raised a NameError.
index_names = []
for model in model_names:
    for a in C:
        index_names.append((model,a))

In [None]:
# Two-level index with levels named 'model' and 'regularization param', built from the index_names tuples.
index = pd.MultiIndex.from_tuples(index_names, names=['model','regularization param'])