## Classes

In [59]:
import random
class Sentiment:
    """String constants naming the sentiment labels used throughout the notebook."""

    # Each constant's value is simply its own name as a string.
    NEGATIVE, NEUTRAL, POSITIVE = 'NEGATIVE', 'NEUTRAL', 'POSITIVE'


class Review:
    """A single review: its text, its numeric score, and the sentiment derived from the score."""

    def __init__(self, text, score):
        """Store ``text`` and ``score`` and compute ``sentiment`` once, at construction time."""
        self.text, self.score = text, score
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        """Return the Sentiment label for ``self.score``.

        Scores of 2 or less are NEGATIVE, a score equal to 3 is NEUTRAL, and
        every other score is POSITIVE.
        """
        if self.score <= 2:
            return Sentiment.NEGATIVE
        if self.score == 3:
            return Sentiment.NEUTRAL
        # Anything that is neither <= 2 nor exactly 3 falls through to POSITIVE.
        return Sentiment.POSITIVE

class ReviewContainer:
    """Wraps a list of review objects and exposes their texts and sentiment labels.

    Each contained object is expected to provide ``text`` and ``sentiment``
    attributes (as ``Review`` does).
    """

    def __init__(self, reviews):
        """Keep a reference to ``reviews``, the list of review objects to manage."""
        self.reviews = reviews

    def evenly_distribute(self):
        """Rebalance ``self.reviews`` to equal numbers of NEGATIVE and POSITIVE reviews.

        Reviews with any other sentiment (e.g. NEUTRAL) are dropped. Both
        classes are truncated to the size of the smaller one, then the combined
        list is shuffled in place with the global ``random`` module, so call
        ``random.seed(...)`` beforehand if a repeatable order is needed.
        """
        negative = [r for r in self.reviews if r.sentiment == Sentiment.NEGATIVE]
        positive = [r for r in self.reviews if r.sentiment == Sentiment.POSITIVE]
        # Truncate BOTH classes: cutting only the positives leaves the data
        # unbalanced whenever negatives outnumber positives.
        keep = min(len(negative), len(positive))
        self.reviews = negative[:keep] + positive[:keep]
        random.shuffle(self.reviews)

    def get_text(self):
        """Return the review texts, in the current order of ``self.reviews``."""
        return [x.text for x in self.reviews]

    def get_sentiment(self):
        """Return the sentiment labels, in the current order of ``self.reviews``."""
        return [x.sentiment for x in self.reviews]

        

## Preprocessing

In [60]:
import json
# Input file: parsed below as one JSON object per line.
file_name = 'Books_small.json'

reviews = []

# Pin the encoding so the load does not depend on the platform's default
# locale encoding (JSON text is UTF-8).
with open(file_name, encoding='utf-8') as f:
    for line in f:
        # Skip blank lines (e.g. a trailing empty line), which json.loads rejects.
        if not line.strip():
            continue
        review = json.loads(line)
        # Only the review body and its overall rating are kept.
        reviews.append(Review(review['reviewText'], review['overall']))
        

In [61]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the reviews for testing; random_state makes the split repeatable.
training, testing = train_test_split(reviews, test_size=0.33, random_state=42)

train_container = ReviewContainer(training)
test_container = ReviewContainer(testing)

# evenly_distribute() shuffles with the global `random` module. Seed it so the
# row order (and therefore every `test_X_vectors[0]` spot-check below) is the
# same on each Restart & Run All.
random.seed(42)
train_container.evenly_distribute()
test_container.evenly_distribute()

In [62]:
# Parallel lists: review texts (model input) and their sentiment labels (target).
train_X, train_y = train_container.get_text(), train_container.get_sentiment()
test_X, test_y = test_container.get_text(), test_container.get_sentiment()

# Sanity check: number of NEGATIVE examples in the balanced training set.
train_y.count(Sentiment.NEGATIVE)

436

In [63]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
vectorizer = TfidfVectorizer()

# Fit on the training texts only; the test texts are transformed with the
# already-fitted vectorizer so both matrices share the same feature columns.
train_X_vectors = vectorizer.fit_transform(train_X)
test_X_vectors = vectorizer.transform(test_X)


## Classification

### Linear SVM

In [64]:
from sklearn import svm
# Linear-kernel SVM trained on the vectorized training reviews
# (fit() returns the estimator itself, so construction and fitting are chained).
clf_svm = svm.SVC(kernel='linear').fit(train_X_vectors, train_y)

# Spot-check: predicted label for the first test review.
clf_svm.predict(test_X_vectors[0])

array(['POSITIVE'], dtype='<U8')

### Decision Tree

In [65]:
from sklearn.tree import DecisionTreeClassifier

# Fix random_state: without it the tree's randomised feature selection gives a
# different fitted tree, and different scores, on each run.
clf_dec = DecisionTreeClassifier(random_state=42)
clf_dec.fit(train_X_vectors, train_y)

# Spot-check: predicted label for the first test review.
clf_dec.predict(test_X_vectors[0])

array(['POSITIVE'], dtype='<U8')

### Naive Bayes

In [66]:
from sklearn.naive_bayes import GaussianNB

# The vectorized reviews are converted with .toarray() before being handed to
# GaussianNB, both for fitting and for prediction
# (fit() returns the estimator itself, so construction and fitting are chained).
clf_nb = GaussianNB().fit(train_X_vectors.toarray(), train_y)

# Spot-check: predicted label for the first test review.
clf_nb.predict(test_X_vectors[0].toarray())

array(['NEGATIVE'], dtype='<U8')

### Logistic Regression

In [67]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with max_iter raised to 1000
# (fit() returns the estimator itself, so construction and fitting are chained).
clf_lr = LogisticRegression(max_iter=1000).fit(train_X_vectors, train_y)

# Spot-check: predicted label for the first test review.
clf_lr.predict(test_X_vectors[0])

array(['POSITIVE'], dtype='<U8')

## Evaluation

### Mean Accuracy

In [68]:
# Mean accuracy on the test set, printed in the order: SVM, decision tree,
# naive Bayes, logistic regression. clf_nb was fitted on dense arrays, so it
# is scored on a dense copy of the test matrix.
for model, features in (
    (clf_svm, test_X_vectors),
    (clf_dec, test_X_vectors),
    (clf_nb, test_X_vectors.toarray()),
    (clf_lr, test_X_vectors),
):
    print(model.score(features, test_y))

0.8076923076923077
0.6490384615384616
0.6610576923076923
0.8052884615384616


### F1 Score

In [69]:
from sklearn.metrics import f1_score

# Per-class F1 (average=None), reported as [POSITIVE, NEGATIVE], printed in the
# order: SVM, decision tree, naive Bayes, logistic regression. clf_nb was
# fitted on dense arrays, so it predicts on a dense copy of the test matrix.
f1_labels = [Sentiment.POSITIVE, Sentiment.NEGATIVE]
for model, features in (
    (clf_svm, test_X_vectors),
    (clf_dec, test_X_vectors),
    (clf_nb, test_X_vectors.toarray()),
    (clf_lr, test_X_vectors),
):
    print(f1_score(test_y, model.predict(features), average=None, labels=f1_labels))





[0.80582524 0.80952381]
[0.647343  0.6507177]
[0.65693431 0.66508314]
[0.80291971 0.80760095]


## Tuning with GridSearch 

In [70]:
from sklearn.model_selection import GridSearchCV


## Saving the model

In [73]:
import os
import pickle

# open(..., 'wb') does not create missing parent directories, so make sure the
# output folder exists before writing (no-op when it is already there).
os.makedirs('./models', exist_ok=True)

# Persist the trained SVM together with the fitted vectorizer: new text has to
# be transformed by the same vectorizer before the classifier can score it.
with open('./models/sentiment_classifier.pkl', 'wb') as f:
    pickle.dump(clf_svm, f)
with open('./models/sentiment_classifier_vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

## Loading the Model

In [None]:
# Restore the classifier from the pickle file written in the "Saving the model" section.
# NOTE(review): pickle.load can execute arbitrary code from the file; only load
# pickles produced by this notebook or another trusted source.
# NOTE(review): the vectorizer pickled alongside the classifier is not loaded in
# the visible part of this cell; raw text presumably needs it before
# loaded_clf.predict can be used - confirm it is loaded elsewhere.
with open('./models/sentiment_classifier.pkl', 'rb') as f:
    loaded_clf = pickle.load(f)