## Loading Libraries


In [1]:
import json
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer #<-- to vectorize bag of words
import random

## Data Class


In [2]:
class Sentiment:
    """String constants naming the sentiment classes assigned to reviews."""
    NEGATIVE = 'NEGATIVE'
    NEUTRAL = 'NEUTRAL'
    POSITIVE = 'POSITIVE'
    
class Review:
    """A single review: its text, its numeric score and the derived sentiment."""

    def __init__(self, text, score):
        self.text = text
        self.score = score
        # The label is derived once, at construction time, from the score.
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        """Map ``self.score`` to a ``Sentiment`` label.

        A score of 2 or lower is NEGATIVE, a score of exactly 3 is NEUTRAL,
        and every other score is POSITIVE.
        """
        rating = self.score
        if rating <= 2:
            return Sentiment.NEGATIVE
        if rating == 3:
            return Sentiment.NEUTRAL
        return Sentiment.POSITIVE
        
class ReviewContainer:
    """Wraps a list of reviews and exposes their texts and sentiment labels.

    The items only need ``text`` and ``sentiment`` attributes.
    """

    def __init__(self, reviews):
        self.reviews = reviews

    def get_text(self):
        """Return the text of every review, in the container's current order."""
        return [review.text for review in self.reviews]

    def get_sentiment(self):
        """Return the sentiment label of every review, in the current order."""
        return [review.sentiment for review in self.reviews]

    def evenly_distribute(self):
        """Rebalance the container to equal NEGATIVE and POSITIVE counts.

        Both classes are truncated to the size of the smaller one (keeping the
        earliest reviews of each class), NEUTRAL reviews are dropped, and the
        result is shuffled in place with the module-level ``random`` generator.
        ``self.reviews`` is replaced by the new list.
        """
        negative = [r for r in self.reviews if r.sentiment == Sentiment.NEGATIVE]
        positive = [r for r in self.reviews if r.sentiment == Sentiment.POSITIVE]
        # Truncate BOTH classes: trimming only the positives leaves the data
        # unbalanced whenever there are fewer positives than negatives.
        keep = min(len(negative), len(positive))
        self.reviews = negative[:keep] + positive[:keep]
        random.shuffle(self.reviews)

## Load Data

In [3]:
file_name = './data/sentiments/books_small_10000.json'

# The file is read as JSON Lines: one review object per line.
reviews = []
# Explicit encoding so decoding does not depend on the platform's default locale.
with open(file_name, encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines, e.g. a trailing newline at end of file
        review = json.loads(line)
        reviews.append(Review(review['reviewText'], review['overall']))

print(reviews[5].sentiment)
        

POSITIVE


## Prep Data

In [45]:
# A fixed random_state makes train_test_split return the same partition on every run.
training, test = train_test_split(reviews, test_size=0.33, random_state=42)

# evenly_distribute() shuffles with the global `random` module, so seed that too;
# otherwise the order of the balanced sets changes from run to run.
random.seed(42)

train_cont = ReviewContainer(training)
train_cont.evenly_distribute()

test_cont = ReviewContainer(test)
test_cont.evenly_distribute()
len(train_cont.reviews)

872

In [46]:
# Features are the raw review texts; targets are the sentiment labels.
train_x, train_y = train_cont.get_text(), train_cont.get_sentiment()
test_x, test_y = test_cont.get_text(), test_cont.get_sentiment()

# Sanity check: class counts in the balanced test set, then one training sample.
for label in (Sentiment.POSITIVE, Sentiment.NEGATIVE):
    print(test_y.count(label))
print(train_x[0])

208
208
The Pope is probably the best known figure in the Catholic religion, but no one ever considers the fact that the Pope is more than a figure, he is a person. A man with emotions and thoughts, and John Donald Carlucci did an incredible job leading us into his head. I am not a Catholic myself, but this was an amazing story of true human emotion and faith, and I recommend it for Catholics as well as anyone who has ever thought about who the Pope really was. He touched many people's lives, and no one probably even considers it. A five star book I am loaning to my mother, who is a devout Catholic and probably loves the Pope more than me sometimes! Great book, Carlucci is a great story teller and I hope there is more from this author for me to read!


### Bag-of-words vectorization

In [48]:
# Bag-of-words encoder; each review becomes a vector of token counts.
vectorizer = CountVectorizer()
# Learn the vocabulary from the training texts and encode them in one call
# (fit_transform is fit() followed by transform()).
train_x_vectors = vectorizer.fit_transform(train_x)

# Encode the test texts with the vocabulary learned from the training texts;
# the vectorizer is deliberately not re-fitted here.
test_x_vectors = vectorizer.transform(test_x)

# Show one review next to its (mostly zero) count vector.
print(train_x[0])
print(train_x_vectors[0].toarray())

The Pope is probably the best known figure in the Catholic religion, but no one ever considers the fact that the Pope is more than a figure, he is a person. A man with emotions and thoughts, and John Donald Carlucci did an incredible job leading us into his head. I am not a Catholic myself, but this was an amazing story of true human emotion and faith, and I recommend it for Catholics as well as anyone who has ever thought about who the Pope really was. He touched many people's lives, and no one probably even considers it. A five star book I am loaning to my mother, who is a devout Catholic and probably loves the Pope more than me sometimes! Great book, Carlucci is a great story teller and I hope there is more from this author for me to read!
[[0 0 0 ... 0 0 0]]


## Classification

### Linear SVM

In [49]:
from sklearn import svm

# Support vector classifier with a linear kernel.
clf_svm = svm.SVC(kernel = 'linear')

In [50]:
# Train the linear SVM on the bag-of-words vectors.
clf_svm.fit(train_x_vectors, train_y)

# Predict the sentiment of the first test review. Only the last expression of a
# cell is displayed, so no other bare expressions are kept in this cell.
clf_svm.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

### Decision Tree

In [51]:
from sklearn.tree import DecisionTreeClassifier

# Fix random_state so the fitted tree (and the accuracy reported for it) is the
# same on every run; without it, tie-breaking between splits is randomized.
clf_dec = DecisionTreeClassifier(random_state=42)

In [52]:
# Train the decision tree on the bag-of-words vectors.
clf_dec.fit(train_x_vectors, train_y)

# Predict the sentiment of the first test review.
clf_dec.predict(test_x_vectors[0])

array(['NEGATIVE'], dtype='<U8')

### Naive Bayes classifier

In [53]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes classifier with default settings.
# NOTE(review): the features here are word counts; MultinomialNB may be a better
# fit than a Gaussian model — worth comparing.
clf_bayes = GaussianNB()

In [54]:
# The sparse count matrix is converted to a dense array before fitting GaussianNB.
clf_bayes.fit(train_x_vectors.toarray(), train_y)

# Predict the sentiment of the fourth test review (index 3), densified the same way.
clf_bayes.predict(test_x_vectors[3].toarray())

array(['NEGATIVE'], dtype='<U8')

### Logistic Regression

In [55]:
from sklearn.linear_model import LogisticRegression

# `multi_class` is deprecated in recent scikit-learn releases. The target here is
# binary (NEGATIVE / POSITIVE), so with the liblinear solver the one-vs-rest
# setting had no effect; it is omitted to keep the cell warning-free.
clf_log = LogisticRegression(solver='liblinear')

In [56]:
# Train logistic regression on the bag-of-words vectors.
clf_log.fit(train_x_vectors, train_y)

# Predict the sentiment of the fourth test review (index 3).
clf_log.predict(test_x_vectors[3])

array(['NEGATIVE'], dtype='<U8')





### KNN

In [57]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier that votes among the 5 closest training samples.
clf_knn = KNeighborsClassifier(n_neighbors = 5)

In [58]:
# Fit KNN on the bag-of-words training vectors.
clf_knn.fit(train_x_vectors, train_y)

# Predict the sentiment of the fourth test review (index 3).
clf_knn.predict(test_x_vectors[3])

array(['NEGATIVE'], dtype='<U8')

## Evaluation

In [59]:
# Mean accuracy of each classifier on the test set, one value per line.
# GaussianNB is scored on a dense array, the same form it was fitted on.
scoring_inputs = [
    (clf_svm, test_x_vectors),
    (clf_dec, test_x_vectors),
    (clf_bayes, test_x_vectors.toarray()),
    (clf_log, test_x_vectors),
    (clf_knn, test_x_vectors),
]
for model, features in scoring_inputs:
    print(model.score(features, test_y))

0.7980769230769231
0.6298076923076923
0.6346153846153846
0.8173076923076923
0.5985576923076923


In [63]:
# Per-class F1 scores for the linear SVM, reported in the order given by `labels`.
from sklearn.metrics import f1_score

svm_predictions = clf_svm.predict(test_x_vectors)
label_order = [Sentiment.POSITIVE, Sentiment.NEGATIVE]
f1_score(test_y, svm_predictions, average=None, labels=label_order)

array([0.8028169 , 0.79310345])

## Testing

In [78]:
# Hand-written sentences to spot-check the trained SVM.
test_set = ['this book is fun', 'this book sucks', "brilliant"]
# Encode with the already-fitted vectorizer so the columns match the training data.
test_set_vectors = vectorizer.transform(test_set)
# NOTE(review): the recorded output labels "brilliant" as NEGATIVE — a reminder
# of this model's limits on very short inputs.
clf_svm.predict(test_set_vectors)

array(['POSITIVE', 'NEGATIVE', 'NEGATIVE'], dtype='<U8')