In [2]:
import random

class Sentiment:
    NEGATIVE = "NEGATIVE"
    NEUTRAL = "NEUTRAL"
    POSITIVE = "POSITIVE"

class Review:
    def __init__(self, text, score):
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()
        
    def get_sentiment(self):
        if self.score <= 2:
            return Sentiment.NEGATIVE
        elif self.score == 3:
            return Sentiment.NEUTRAL
        else: #Score of 4 or 5
            return Sentiment.POSITIVE

In [6]:
import json


file_name = 'books.json'
reviews = []
with open(file_name) as f:
    for line in f:
        review = json.loads(line)
        reviews.append(Review(review['reviewText'], review['overall']))
        
reviews[4].sentiment

'POSITIVE'

In [4]:
from sklearn import *
from sklearn.model_selection import train_test_split

training, test = train_test_split(reviews, test_size=0.33, random_state=42)

In [7]:
print(training[1].text)

Oh this is an awful bookA delightful awful bookJust the sort of awful bookYour whole family will adoreMy daughters requested it so often from the library, I finally just bought a copy.


In [8]:
train_x = [x.text for x in training]
train_y = [x.sentiment for x in training]


test_x = [x.text for x in test]
test_y = [x.sentiment for x in test]

print(train_y.count(Sentiment.POSITIVE))
print(train_y.count(Sentiment.NEGATIVE))

552
47


In [9]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

vectorizer = TfidfVectorizer()
train_x_vectors = vectorizer.fit_transform(train_x)

test_x_vectors = vectorizer.transform(test_x)

print(train_x[1])
print(train_x_vectors[1].toarray())

Oh this is an awful bookA delightful awful bookJust the sort of awful bookYour whole family will adoreMy daughters requested it so often from the library, I finally just bought a copy.
[[0. 0. 0. ... 0. 0. 0.]]


In [10]:
from sklearn import svm

clf_svm = svm.SVC(kernel='linear')

clf_svm.fit(train_x_vectors, train_y)

test_x[0]

clf_svm.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

In [11]:
from sklearn.tree import DecisionTreeClassifier

clf_dec = DecisionTreeClassifier()
clf_dec.fit(train_x_vectors, train_y)

clf_dec.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

In [12]:
from sklearn.naive_bayes import GaussianNB

clf_gnb = DecisionTreeClassifier()
clf_gnb.fit(train_x_vectors, train_y)

clf_gnb.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

In [13]:
from sklearn.linear_model import LogisticRegression

clf_log = LogisticRegression()
clf_log.fit(train_x_vectors, train_y)

clf_log.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

In [22]:
#Mean Accuracy
print("Linear SVM:", round(clf_svm.score(test_x_vectors, test_y)*100, 2))
print("Decision Tree:", round(clf_dec.score(test_x_vectors, test_y)*100, 2))
print("GaussianNB:", round(clf_gnb.score(test_x_vectors, test_y)*100, 2))
print("Logistic Regression:", round(clf_log.score(test_x_vectors, test_y)*100, 2))

Linear SVM: 85.76
Decision Tree: 70.91
GaussianNB: 73.64
Logistic Regression: 85.76


In [21]:
# F1 Scores
from sklearn.metrics import f1_score

f1_score(test_y, clf_svm.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEGATIVE])

array([0.9233279, 0.       , 0.       ])

In [23]:
test_set = ['very fun', "bad book do not buy", 'horrible waste of time']
new_test = vectorizer.transform(test_set)

clf_svm.predict(new_test)

array(['POSITIVE', 'POSITIVE', 'POSITIVE'], dtype='<U8')

In [24]:
from sklearn.model_selection import GridSearchCV

parameters = {'kernel': ('linear', 'rbf'), 'C': (1,4,8,16,32)}

svc = svm.SVC()
clf = GridSearchCV(svc, parameters, cv=5)
clf.fit(train_x_vectors, train_y)

GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': (1, 4, 8, 16, 32), 'kernel': ('linear', 'rbf')})

In [25]:
print(clf.score(test_x_vectors, test_y))

0.8575757575757575


In [27]:
import pickle

with open('./sentiment_classifier.pkl', 'wb') as f:
    pickle.dump(clf, f)

In [31]:
with open('./sentiment_classifier.pkl', 'rb') as f:
    load_clf = pickle.load(f)

In [34]:
print(test_x[3])

load_clf.predict(test_x_vectors[3])

After all of William Lane Craig's book on philosophy and apologetics, I expected more depth.  The book is short and has some very interesting points but left me saying there has to be more.


array(['POSITIVE'], dtype='<U8')