In [1]:
import json

file_name = 'Books_small_10000.json'

with open(file_name) as f:
    for line in f:
        review = json.loads(line)
        print(review['reviewText'])
        print(review['overall'])
        
        break

I bought both boxed sets, books 1-5.  Really a great series!  Start book 1 three weeks ago and just finished book 5.  Sloane Monroe is a great character and being able to follow her through both private life and her PI life gets a reader very involved!  Although clues may be right in front of the reader, there are twists and turns that keep one guessing until the last page!  These are books you won't be disappointed with.
5.0


In [29]:
import random

class Sentiment:
    NEGATIVE = "NEGATIVE"
    NEUTRAL = "NEUTRAL"
    POSITIVE = "POSITIVE"

class Review:
    def __init__(self, text, score):
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()
        
    def get_sentiment(self):
        if self.score <= 2:
            return Sentiment.NEGATIVE
        elif self.score == 3:
            return Sentiment.NEUTRAL
        else: #Score of 4 or 5
            return Sentiment.POSITIVE

class ReviewContainer:        
    def __init__(self, reviews):
        self.reviews = reviews
        
    def get_text(self):
        return [x.text for x in self.reviews]
    
    def get_sentiment(self):
        return [x.sentiment for x in self.reviews]
        
    def evenly_distribute(self):
        negative = list(filter(lambda x: x.sentiment == Sentiment.NEGATIVE, self.reviews))
        positive = list(filter(lambda x: x.sentiment == Sentiment.POSITIVE, self.reviews))
        neutral = list(filter(lambda x: x.sentiment == Sentiment.NEUTRAL, self.reviews))
        
        positive_shrunk = positive[:len(negative)]
        neutral_shrunk = neutral[:len(negative)]
        self.reviews = negative + positive_shrunk + neutral_shrunk
        random.shuffle(self.reviews)

In [30]:
reviews = []

with open(file_name) as f:
    for line in f:
        review = json.loads(line) # reading it as a dictionary this time
        reviews.append(Review(review['reviewText'], review['overall']))

In [31]:
from sklearn.model_selection import train_test_split

training, test = train_test_split(reviews, test_size=0.33, random_state=42)

train_container = ReviewContainer(training)
test_container = ReviewContainer(test)

train_container.evenly_distribute()
test_container.evenly_distribute()

len(train_container.reviews)

1308

In [32]:
train_x = train_container.get_text()
train_y = train_container.get_sentiment()

test_x = test_container.get_text()
test_y = test_container.get_sentiment()

print(train_y.count(Sentiment.POSITIVE))
print(train_y.count(Sentiment.NEUTRAL))
print(train_y.count(Sentiment.NEGATIVE))

436
436
436


In [33]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

vectorizer = TfidfVectorizer()
train_x_vectors = vectorizer.fit_transform(train_x)
test_x_vectors = vectorizer.transform(test_x)

In [45]:
from sklearn import svm

clf_svm = svm.SVC(kernel='linear')
clf_svm.fit(train_x_vectors, train_y)


SVC(kernel='linear')

In [35]:
from sklearn.tree import DecisionTreeClassifier

clf_dec = DecisionTreeClassifier()
clf_dec.fit(train_x_vectors, train_y)

DecisionTreeClassifier()

In [36]:
from sklearn.naive_bayes import GaussianNB

clf_gnb = DecisionTreeClassifier()
clf_gnb.fit(train_x_vectors, train_y)

DecisionTreeClassifier()

In [37]:
from sklearn.linear_model import LogisticRegression

clf_log = DecisionTreeClassifier()
clf_log.fit(train_x_vectors, train_y)

DecisionTreeClassifier()

In [46]:
print("SVM Score = ", round(clf_svm.score(test_x_vectors, test_y)*100),"%")
print("Decision Tree Score = ", round(clf_dec.score(test_x_vectors, test_y)*100),"%")
print("Naive Bayes Score = ", round(clf_gnb.score(test_x_vectors, test_y)*100),"%")
print("Logistic Regression Score = ", round(clf_log.score(test_x_vectors, test_y)*100),"%")

SVM Score =  62 %
Decision Tree Score =  45 %
Naive Bayes Score =  46 %
Logistic Regression Score =  43 %


In [39]:
from sklearn.metrics import f1_score

print("SVM Scores[Positive, Neutral, Negative] = ", f1_score(test_y, clf_svm.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE]))
print("Decision Tree Scores[Positive, Neutral, Negative] = ", f1_score(test_y, clf_dec.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE]))
print("Naive Bayes Scores[Positive, Neutral, Negative] = ", f1_score(test_y, clf_gnb.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE]))
print("Logistic Regression Scores[Positive, Neutral, Negative] = ", f1_score(test_y, clf_log.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE]))

SVM Scores[Positive, Neutral, Negative] =  [0.69417476 0.5450237  0.61352657]
Decision Tree Scores[Positive, Neutral, Negative] =  [0.49537037 0.41148325 0.44221106]
Naive Bayes Scores[Positive, Neutral, Negative] =  [0.49765258 0.42335766 0.47201946]
Logistic Regression Scores[Positive, Neutral, Negative] =  [0.46746988 0.37825059 0.45365854]


In [40]:
test_set = ['Liked it but not that great', 'bad book do not buy', 'horrible waste of time']
new_test = vectorizer.transform(test_set)

clf_svm.predict(new_test)

array(['NEUTRAL', 'NEGATIVE', 'NEGATIVE'], dtype='<U8')

### Tuning our model (with Grid Search)

In [53]:
from sklearn.model_selection import GridSearchCV

parameters = {'kernel': ('linear', 'rbf', 'poly'), 'C': (1,2,3), 'gamma': (1,2,3)}

svc = svm.SVC()
clf = GridSearchCV(svc, parameters, cv=5)
clf.fit(train_x_vectors, train_y)

GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': (1, 2, 3), 'gamma': (1, 2, 3),
                         'kernel': ('linear', 'rbf', 'poly')})

In [54]:
print(clf.score(test_x_vectors,test_y))
print(clf_svm.score(test_x_vectors,test_y))

0.5993589743589743
0.6169871794871795


In [55]:
clf.get_params()

{'cv': 5,
 'error_score': nan,
 'estimator__C': 1.0,
 'estimator__break_ties': False,
 'estimator__cache_size': 200,
 'estimator__class_weight': None,
 'estimator__coef0': 0.0,
 'estimator__decision_function_shape': 'ovr',
 'estimator__degree': 3,
 'estimator__gamma': 'scale',
 'estimator__kernel': 'rbf',
 'estimator__max_iter': -1,
 'estimator__probability': False,
 'estimator__random_state': None,
 'estimator__shrinking': True,
 'estimator__tol': 0.001,
 'estimator__verbose': False,
 'estimator': SVC(),
 'iid': 'deprecated',
 'n_jobs': None,
 'param_grid': {'kernel': ('linear', 'rbf', 'poly'),
  'C': (1, 2, 3),
  'gamma': (1, 2, 3)},
 'pre_dispatch': '2*n_jobs',
 'refit': True,
 'return_train_score': False,
 'scoring': None,
 'verbose': 0}

In [57]:
clf.best_params_

{'C': 1, 'gamma': 1, 'kernel': 'rbf'}

### Saving Model

In [52]:
import pickle

with open ('sentiment_classifier.pkl', 'wb') as f:
    pickle.dump(clf, f)

In [None]:
print(test_x[0])

loaded_clf.predict()