In [114]:
import json
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
import pickle

Load file

In [95]:
# file_name = './data/sentiment/Books_small.json'
# loaded in new dataset see not below

#Review the dataset 
file_name = './data/sentiment/Books_small_10000.json'

Open file and inspect the data

In [96]:

reviews = []
with open(file_name) as f:
    for line in f:
        review = json.loads(line)
        # print(review['reviewText'])
        # print(review['overall'])
        # break
        reviews.append((review['reviewText'], review['overall']))
        

In [97]:
#Check a sample

# reviews[5]
reviews[5][1]

5.0

To keep data neat and easy to read - create a data class

In [98]:
#Class Sentiment is a "enum class"
class Sentiment:
    NEGATIVE = "NEGATIVE"
    NETURAL = "NETURAL"
    POSITIVE = "POSITIVE"

class Review:
    def __init__(self, text, score):
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()
        
    def get_sentiment(self):
        if self.score <= 2:
            return Sentiment.NEGATIVE
        elif self.score == 3:
            return Sentiment.NETURAL
        else: #Score of 4 or 5
            return Sentiment.POSITIVE

class ReviewContainer:
    def __init__(self, reviews):
        self.reviews = reviews
        
    def get_text(self):
        return [x.text for x in self.reviews]
    
    def get_sentiment(self):
        return [x.sentiment for x in self.reviews]
    
    def evenly_distribute(self):
        negative = list(filter(lambda x: x.sentiment == Sentiment.NEGATIVE, self.reviews))
        postitive = list(filter(lambda x: x.sentiment == Sentiment.POSITIVE, self.reviews))
        netural = list(filter(lambda x: x.sentiment == Sentiment.NETURAL, self.reviews))
        postitive_shruck = postitive[:len(negative)]
        self.reviews = negative + postitive_shruck
        random.shuffle(self.reviews)
            

Load data

In [99]:
reviews = []
with open(file_name) as f:
    for line in f:
        review = json.loads(line)
        reviews.append(Review(review['reviewText'], review['overall']))
# reviews[5].score
reviews[5].text
# reviews[5].sentiment


'I hoped for Mia to have some peace in this book, but her story is so real and raw.  Broken World was so touching and emotional because you go from Mia\'s trauma to her trying to cope.  I love the way the story displays how there is no "just bouncing back" from being sexually assaulted.  Mia showed us how those demons come for you every day and how sometimes they best you. I was so in the moment with Broken World and hurt with Mia because she was surrounded by people but so alone and I understood her feelings.  I found myself wishing I could give her some of my courage and strength or even just to be there for her.  Thank you Lizzy for putting a great character\'s voice on a strong subject and making it so that other peoples story may be heard through Mia\'s.'

Prep data

In [100]:
len(reviews) #10000 records

10000

In [101]:
training, test = train_test_split(reviews, test_size=0.33, random_state=42)

train_container = ReviewContainer(training)

test_container = ReviewContainer(test)


In [102]:
train_container.evenly_distribute()
train_x = train_container.get_text()
train_y = train_container.get_sentiment()

test_container.evenly_distribute()
test_x = test_container.get_text()
test_y = test_container.get_sentiment()

print(train_y.count(Sentiment.POSITIVE))
print(train_y.count(Sentiment.NEGATIVE))

436
436


Bag of words vectorization

In [103]:
# vectorizer = CountVectorizer()
vectorizer = TfidfVectorizer() #TfidVectorizer weighs less freq words higher
train_x_vectors = vectorizer.fit_transform(train_x) #this step fits and transforms your dataset

#These are the two seperate steps which are completed in the one step above
# vectorizer.fit(train_x)
# train_x_vectors = vectorizer.transform(train_x)

test_x_vectors = vectorizer.transform(test_x)

print(train_x[0])
print(train_x_vectors[0].toarray())



The first book I understood all the drama and everything that led up to the second book but she you get to the second I understand the hard feeling between the characters in the beginning but it just keeps going and it seems to be unnecessary it never calms down at any point I liked most of the book in a whole but I feel like the continued drama was crazy and the way it ended I hated it,it just doesn't seem to fit
[[0. 0. 0. ... 0. 0. 0.]]


Classification

Linear SVM

In [104]:
clf_svm = svm.SVC(kernel='linear')

clf_svm.fit(train_x_vectors, train_y)

test_x[0]
# test_x_vectors[0]

clf_svm.predict(test_x_vectors[0])


array(['NEGATIVE'], dtype='<U8')

Decision tree

In [105]:
clf_dec = DecisionTreeClassifier()
clf_dec.fit(train_x_vectors, train_y)

clf_dec.predict(test_x_vectors[0])

array(['NEGATIVE'], dtype='<U8')

Naive bayes

In [106]:
clf_gnb = DecisionTreeClassifier()
clf_gnb.fit(train_x_vectors, train_y)

clf_gnb.predict(test_x_vectors[0])

array(['NEGATIVE'], dtype='<U8')

Logistic regression

In [107]:
clf_log = LogisticRegression()
clf_log.fit(train_x_vectors, train_y)

clf_log.predict(test_x_vectors[0])

array(['NEGATIVE'], dtype='<U8')

Evaluation

In [108]:
#Mean accuracy

print(clf_svm.score(test_x_vectors, test_y))
print(clf_dec.score(test_x_vectors, test_y))
print(clf_gnb.score(test_x_vectors, test_y))
print(clf_log.score(test_x_vectors, test_y))



0.8076923076923077
0.6442307692307693
0.6225961538461539
0.8028846153846154


In [109]:
#F1 scores

print(f1_score(test_y, clf_svm.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE, Sentiment.NETURAL, Sentiment.NEGATIVE]))
print(f1_score(test_y, clf_dec.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE, Sentiment.NETURAL, Sentiment.NEGATIVE]))
print(f1_score(test_y, clf_gnb.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE, Sentiment.NETURAL, Sentiment.NEGATIVE]))
print(f1_score(test_y, clf_log.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE, Sentiment.NETURAL, Sentiment.NEGATIVE]))

#results identify an issue with the data. all good for positive but poor for the others

[0.80582524 0.         0.80952381]
[0.63366337 0.         0.65420561]
[0.61425061 0.         0.63058824]
[0.80097087 0.         0.8047619 ]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [110]:
#Review the dataset 
train_x[0]
train_y[0:5]

train_y.count(Sentiment.POSITIVE) #552 5611
train_y.count(Sentiment.NEGATIVE) #47 436
train_y.count(Sentiment.NETURAL) #71 653


0

Qualitative testing

In [111]:
test_set = ['it was good','such an amazing book, 5 out of 5', 'the book is garbage, do not buy', 'boring, dont waste your time']
new_test = vectorizer.transform(test_set)

clf_svm.predict(new_test)

array(['POSITIVE', 'POSITIVE', 'NEGATIVE', 'NEGATIVE'], dtype='<U8')

Tuning our model (with Grid Search)

In [112]:
parameters = {'kernel' : ('linear', 'rbf'), 'C' : (1,4,8,16,32)}

svc = svm.SVC()
clf = GridSearchCV(svc, parameters, cv=5)
clf.fit(train_x_vectors, train_y)

  _data = np.array(data, dtype=dtype, copy=copy,


In [113]:
print(clf_svm.score(test_x_vectors, test_y))
print(clf_dec.score(test_x_vectors, test_y))
print(clf_gnb.score(test_x_vectors, test_y))
print(clf_log.score(test_x_vectors, test_y))

0.8076923076923077
0.6442307692307693
0.6225961538461539
0.8028846153846154


Saving model

In [115]:
with open('./models/sentiment_classifier_stu.pk1', 'wb') as f:
    pickle.dump(clf, f)

Load in pickle model

In [117]:
with open('./models/sentiment_classifier_stu.pk1', 'rb') as f:
    loaded_clf = pickle.load(f)

In [119]:
print(test_x[0])
loaded_clf.predict(test_x_vectors[0])

Do not waste your time on this book. It goes no where and has a ridiculous ending. I did not enjoy the read, however I made myself finish it.


array(['NEGATIVE'], dtype='<U8')