In [2]:
import numpy as np
import sys

In [106]:
import random 

class Sentiment:
    NEGATIVE = 'NEGATIVE'
    NEUTRAL = 'NEUTRAL'
    POSITIVE = 'POSITIVE'

class Review:
    def __init__(self, text, score):
        self.text = text
        self.score = score
        self.sentiment = self.getSentiment()
    
    def getSentiment(self):
        if self.score <= 2:
            return Sentiment.NEGATIVE
        elif self.score == 3:
            return Sentiment.NEUTRAL
        else: #4 and 5
            return Sentiment.POSITIVE

class ReviewContainer:
    def __init__(self, reviews):
        self.reviews = reviews
        
    def get_text(self):
        return [x.text for x in self.reviews]
    def get_sentiment(self):
        return [x.sentiment for x in self.reviews]

    def evenly_distribute(self):
        negative = list(filter(lambda x: x.sentiment == Sentiment.NEGATIVE, self.reviews))
        positive = list(filter(lambda x: x.sentiment == Sentiment.POSITIVE, self.reviews))
        positive_shrunk = positive[:len(negative)]
        self.reviews = negative + positive_shrunk
        random.shuffle(self.reviews)

# load data

In [50]:
import json

file = '/Users/sarahdoctor/Desktop/Books_small.json'

reviews = []

with open(file) as f:
    for line in f:
        review = json.loads(line)
        reviews.append(Review(review['reviewText'], review['overall']))

In [51]:
reviews[5].sentiment

'POSITIVE'

# Prep data

In [122]:
from sklearn.model_selection import train_test_split

train,test = train_test_split(reviews, test_size = 0.33, random_state=42)

trainContainer = ReviewContainer(train)
testContainer = ReviewContainer(test)


In [123]:
len(train)

670

In [124]:
trainContainer.evenly_distribute()


train_x = trainContainer.get_text()
train_y = trainContainer.get_sentiment()

testContainer.evenly_distribute()


test_x = testContainer.get_text()
test_y = testContainer.get_sentiment()

In [135]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

vectorizer = TfidfVectorizer()
train_x_vectors = vectorizer.fit_transform(train_x)
test_x_vectors = vectorizer.transform(test_x)


# Classification

In [136]:
# Linear SVM

In [137]:
from sklearn import svm

clf_svm = svm.SVC(kernel = 'linear')
clf_svm.fit(train_x_vectors, train_y)

test_x[0]
clf_svm.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

# Decision Trees

In [138]:
from sklearn.tree import DecisionTreeClassifier

clf_dec = DecisionTreeClassifier()
clf_dec = clf_dec.fit(train_x_vectors, train_y)
clf_dec.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

In [139]:
#Naive Bayes

In [140]:
from sklearn.naive_bayes import GaussianNB
clf_gnb = DecisionTreeClassifier()
clf_gnb = clf_gnb.fit(train_x_vectors, train_y)
clf_gnb.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

In [141]:
#Logistic Regression

In [142]:
from sklearn.linear_model import LogisticRegression
clf_log = DecisionTreeClassifier()
clf_log = clf_log.fit(train_x_vectors, train_y)
clf_log.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

# evaluation

In [143]:
# mean accuracy
print(clf_svm.score(test_x_vectors, test_y))
print(clf_dec.score(test_x_vectors, test_y))
print(clf_gnb.score(test_x_vectors, test_y))
print(clf_log.score(test_x_vectors, test_y))

0.7
0.7333333333333333
0.7
0.7333333333333333


In [144]:
# F1 scores
from sklearn.metrics import f1_score

f1_score(test_y, clf_svm.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEGATIVE])


array([0.70967742, 0.68965517])

# Tune the model using Grid Search

In [146]:
from sklearn.model_selection import GridSearchCV
parameters = {'kernel':('linear','rbf'), 'C':(1,4,8,16,32)}
svc = svm.SVC()
clf = GridSearchCV(svc, parameters, cv=5)
clf.fit(train_x_vectors, train_y)

GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': (1, 4, 8, 16, 32), 'kernel': ('linear', 'rbf')})

In [147]:
print(clf_svm.score(test_x_vectors, test_y))


0.7
