In [1]:
import random
import numpy as np
import pandas as pd
import seaborn as sns
sns.set()
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [2]:
class Review:
    def __init__(self, text, score):
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()
        
    def get_sentiment(self):
        if self.score <= 2:
            return 'NEGATIVE'
        elif self.score == 3:
            return 'NEUTRAL'
        else:
            return 'POSITIVE'
        
class ReviewContainer:
    def __init__(self, reviews):
        self.reviews = reviews
        
    def get_text(self):
        return [x.text for x in self.reviews]
    
    def get_sentiment(self):
        return [x.sentiment for x in self.reviews]
        
    def evenly_distribute(self):
        negative = list(filter(lambda x: x.sentiment == 'NEGATIVE', self.reviews))
        positive = list(filter(lambda x: x.sentiment == 'POSITIVE', self.reviews))
        positive_shrunk = positive[:len(negative)]
        self.reviews = negative + positive_shrunk
        random.shuffle(self.reviews)

## Load Data

In [3]:
import json 

file_name = 'Books_small_10000.json'

reviews = []
with open(file_name) as f:
    for line in f:
        review = json.loads(line)
        reviews.append(Review(review['reviewText'],review['overall']))
        
reviews[4].text

'It was a decent read.. typical story line. Nothing unsavory as so many are. Just a slice of life, plausible.'

In [4]:
review

{'reviewerID': 'A1EZD7IJOOAF6S',
 'asin': '0956998569',
 'reviewerName': 'Amazon Customer',
 'helpful': [1, 1],
 'reviewText': "Highly recommend this entire trilogy. It is very well written and held me in suspense and kept me reading.  Even with the same old young girl heroine who goes head strong and hell bent on saving the new world, id tecommend this book to dystopian fiction fans!  Not overdone, thankfully! A fesw situations made it feel like I've read this same plot before....but these were well thought out and much better written!  This authr has a gift a d I will be looking forward to reading more of ber work.",
 'overall': 4.0,
 'summary': 'truly enjoyed',
 'unixReviewTime': 1402358400,
 'reviewTime': '06 10, 2014'}

## Prepare Data

In [6]:
from sklearn.model_selection import train_test_split
train , test = train_test_split(reviews, test_size=0.33, random_state=42)

train_container = ReviewContainer(train)
test_container = ReviewContainer(test)


In [16]:
train_container.evenly_distribute()

train_x = train_container.get_text()
train_y = train_container.get_sentiment()

test_container.evenly_distribute()

test_x = test_container.get_text()
test_y = test_container.get_sentiment()

print(train_y.count('POSITIVE'))
print(train_y.count('NEGATIVE'))

436
436


#### Bag of Word Vectorization

In [40]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

vectorizer = TfidfVectorizer()
train_x_vector = vectorizer.fit_transform(train_x)

test_x_vector = vectorizer.transform(test_x)

## Classification

#### SVM

In [41]:
from sklearn import svm
clf_svm = svm.SVC(kernel='linear')

clf_svm.fit(train_x_vector, train_y)

clf_svm.predict(test_x_vector[0])

array(['POSITIVE'], dtype='<U8')

#### Decision Tree

In [42]:
from sklearn.tree import DecisionTreeClassifier
clf_dec = DecisionTreeClassifier()

clf_dec.fit(train_x_vector, train_y)

clf_dec.predict(test_x_vector[0])

array(['POSITIVE'], dtype='<U8')

#### Random Forest

In [43]:
from sklearn.ensemble import RandomForestClassifier
clf_rf = RandomForestClassifier()

clf_rf.fit(train_x_vector, train_y)

clf_rf.predict(test_x_vector[0])

array(['POSITIVE'], dtype='<U8')

#### Light GBM

In [44]:
from lightgbm import LGBMClassifier
clf_lgbm=LGBMClassifier()

clf_lgbm.fit(train_x_vector.astype('float32'), train_y)

clf_lgbm.predict(test_x_vector[0].astype('float32'))

array(['POSITIVE'], dtype='<U8')

#### Logistic Regression

In [45]:
from sklearn.linear_model import LogisticRegression
clf_log = LogisticRegression()

clf_log.fit(train_x_vector, train_y)

clf_log.predict(test_x_vector[0])

array(['POSITIVE'], dtype='<U8')

## Evaluation

In [46]:
# Mean Accuracy
print(clf_svm.score(test_x_vector, test_y))
print(clf_dec.score(test_x_vector, test_y))
print(clf_log.score(test_x_vector, test_y))
print(clf_rf.score(test_x_vector, test_y))
print(clf_lgbm.score(test_x_vector.astype('float32'), test_y))

0.8076923076923077
0.6706730769230769
0.8052884615384616
0.7908653846153846
0.7692307692307693


In [47]:
# F1 score
from sklearn.metrics import f1_score

print(f1_score(test_y, clf_svm.predict(test_x_vector), average=None, labels=['POSITIVE', 'NEGATIVE']))
print(f1_score(test_y, clf_dec.predict(test_x_vector), average=None, labels=['POSITIVE', 'NEGATIVE']))
print(f1_score(test_y, clf_log.predict(test_x_vector), average=None, labels=['POSITIVE', 'NEGATIVE']))
print(f1_score(test_y, clf_rf.predict(test_x_vector), average=None, labels=['POSITIVE', 'NEGATIVE']))

[0.80582524 0.80952381]
[0.6617284  0.67915691]
[0.80291971 0.80760095]
[0.78518519 0.79625293]


## Evaluation

In [48]:
test_set = ['I like this very good', 'bad book do not buy', 'horrible waste of time']
new_test = vectorizer.transform(test_set)

clf_svm.predict(new_test)

array(['POSITIVE', 'NEGATIVE', 'NEGATIVE'], dtype='<U8')

## Saving Model

In [51]:
import pickle

with open('sentiment_classifier.pkl', 'wb') as f:
    pickle.dump(clf_svm, f)

In [54]:
with open('sentiment_classifier.pkl', 'rb') as f:
    loaded_clf = pickle.load(f)

In [63]:
print(test_x[5])

loaded_clf.predict(test_x_vector[5])

This was OK. I had problems finishing it and actually read the last chapter at about 65% and decided that it wasn't worth it to finish. That's very unusual for me, but it just wasn't my cup of tea. It started out pretty good and the concept was good, but I became uncomfortable reading it at about 1/4 of the way through.


array(['NEGATIVE'], dtype='<U8')