In [2]:
import json

file_name = 'Books_small_10000.json'

with open(file_name) as f:
    for line in f:
        review = json.loads(line)
        print(review['reviewText'])
        print(review['overall'])
        
        break

I bought both boxed sets, books 1-5.  Really a great series!  Start book 1 three weeks ago and just finished book 5.  Sloane Monroe is a great character and being able to follow her through both private life and her PI life gets a reader very involved!  Although clues may be right in front of the reader, there are twists and turns that keep one guessing until the last page!  These are books you won't be disappointed with.
5.0


### This is to evenly distribute the postive, neutral and negative reviews accross the sample we will use to train        


#### Will also include a method for getting the text and sentiment, to keep things neat!

In [24]:
import random

class Sentiment:
    NEGATIVE = "NEGATIVE"
    NEUTRAL = "NEUTRAL"
    POSITIVE = "POSITIVE"

class Review:
    def __init__(self, text, score):
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()
        
    def get_sentiment(self):
        if self.score <= 2:
            return Sentiment.NEGATIVE
        elif self.score == 3:
            return Sentiment.NEUTRAL
        else: #Score of 4 or 5
            return Sentiment.POSITIVE

class ReviewContainer:        
    def __init__(self, reviews):
        self.reviews = reviews
        
    def get_text(self):
        return [x.text for x in self.reviews]
    
    def get_sentiment(self):
        return [x.sentiment for x in self.reviews]
        
    def evenly_distribute(self):
        negative = list(filter(lambda x: x.sentiment == Sentiment.NEGATIVE, self.reviews))
        positive = list(filter(lambda x: x.sentiment == Sentiment.POSITIVE, self.reviews))
        neutral = list(filter(lambda x: x.sentiment == Sentiment.NEUTRAL, self.reviews))
        
        positive_shrunk = positive[:len(negative)]
        neutral_shrunk = neutral[:len(negative)]
        self.reviews = negative + positive_shrunk + neutral_shrunk
        random.shuffle(self.reviews)
        

In [4]:
reviews = []

with open(file_name) as f:
    for line in f:
        review = json.loads(line) # reading it as a dictionary this time
        reviews.append(Review(review['reviewText'], review['overall']))

### This is where we will insert the evenly distributed data

In [43]:
from sklearn.model_selection import train_test_split

training, test = train_test_split(reviews, test_size=0.33, random_state=42)

train_container = ReviewContainer(training)
test_container = ReviewContainer(test)

train_container.evenly_distribute()
test_container.evenly_distribute()

len(train_container.reviews)

1308

In [44]:
train_x = train_container.get_text()
train_y = train_container.get_sentiment()

test_x = test_container.get_text()
test_y = test_container.get_sentiment()

print(train_y.count(Sentiment.POSITIVE))
print(train_y.count(Sentiment.NEUTRAL))
print(train_y.count(Sentiment.NEGATIVE))

436
436
436


In [45]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
train_x_vectors = vectorizer.fit_transform(train_x)
test_x_vectors = vectorizer.transform(test_x)

In [46]:
from sklearn import svm

clf_svm = svm.SVC(kernel='linear')
clf_svm.fit(train_x_vectors, train_y)

SVC(kernel='linear')

In [47]:
from sklearn.tree import DecisionTreeClassifier

clf_dec = DecisionTreeClassifier()
clf_dec.fit(train_x_vectors, train_y)

DecisionTreeClassifier()

In [48]:
from sklearn.naive_bayes import GaussianNB

clf_gnb = DecisionTreeClassifier()
clf_gnb.fit(train_x_vectors, train_y)

DecisionTreeClassifier()

In [49]:
from sklearn.linear_model import LogisticRegression

clf_log = DecisionTreeClassifier()
clf_log.fit(train_x_vectors, train_y)

DecisionTreeClassifier()

In [50]:
print("SVM Score = ", round(clf_svm.score(test_x_vectors, test_y)*100),"%")
print("Decision Tree Score = ", round(clf_dec.score(test_x_vectors, test_y)*100),"%")
print("Naive Bayes Score = ", round(clf_gnb.score(test_x_vectors, test_y)*100),"%")
print("Logistic Regression Score = ", round(clf_log.score(test_x_vectors, test_y)*100),"%")

SVM Score =  56 %
Decision Tree Score =  42 %
Naive Bayes Score =  42 %
Logistic Regression Score =  44 %


In [51]:
from sklearn.metrics import f1_score

print("SVM Scores[Positive, Neutral, Negative] = ", f1_score(test_y, clf_svm.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE]))
print("Decision Tree Scores[Positive, Neutral, Negative] = ", f1_score(test_y, clf_dec.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE]))
print("Naive Bayes Scores[Positive, Neutral, Negative] = ", f1_score(test_y, clf_gnb.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE]))
print("Logistic Regression Scores[Positive, Neutral, Negative] = ", f1_score(test_y, clf_log.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE]))

SVM Scores[Positive, Neutral, Negative] =  [0.61881188 0.51748252 0.54457831]
Decision Tree Scores[Positive, Neutral, Negative] =  [0.47087379 0.35322196 0.43645084]
Naive Bayes Scores[Positive, Neutral, Negative] =  [0.4824356  0.35436893 0.43031785]
Logistic Regression Scores[Positive, Neutral, Negative] =  [0.5106383  0.38004751 0.43564356]


### Let's do some qualitative analysis

In [74]:
test_set = ['I would recommend it', 'bad book do not buy', 'horrible waste of time']
new_test = vectorizer.transform(test_set)

clf_svm.predict(new_test)

array(['POSITIVE', 'NEGATIVE', 'NEGATIVE'], dtype='<U8')