In [44]:
import random

class Sentiment:
    """String constants naming the three sentiment classes.

    A plain class used as a lightweight enum: the value of each attribute is
    the label string itself.
    """

    NEGATIVE, NEUTRAL, POSITIVE = "NEGATIVE", "NEUTRAL", "POSITIVE"

class Review:
    """One review: its text, its numeric score, and the sentiment derived from it."""

    def __init__(self, text, score):
        """Store ``text`` and ``score``, then derive ``sentiment`` from the score."""
        self.text, self.score = text, score
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        """Translate ``self.score`` into one of the ``Sentiment`` labels.

        A score above 3 is positive, a score below 3 is negative, and anything
        else (e.g. exactly 3) is neutral.
        """
        rating = self.score
        if rating > 3:
            return Sentiment.POSITIVE
        if rating < 3:
            return Sentiment.NEGATIVE
        return Sentiment.NEUTRAL
        
class ReviewContainer:
    """Holds a list of reviews and exposes their texts and sentiment labels."""

    def __init__(self, reviews):
        """Wrap ``reviews``; each item is read through its ``text`` and ``sentiment`` attributes."""
        self.reviews = reviews

    def get_text(self):
        """Return the ``text`` of every review, in container order."""
        return [x.text for x in self.reviews]

    def get_sentiment(self):
        """Return the ``sentiment`` of every review, in container order."""
        return [x.sentiment for x in self.reviews]

    # to avoid bias in training data
    def equalize_distribution(self, seed=None):
        """Down-sample so every sentiment class keeps the same number of reviews.

        The reviews are bucketed by sentiment, each bucket is cut to the size
        of the smallest one (keeping the earliest reviews), and the survivors
        are shuffled together. ``self.reviews`` is replaced with the result.
        If any of the three classes is absent, the result is an empty list.

        Args:
            seed: Optional seed for the shuffle. When given, a private
                ``random.Random(seed)`` is used so the resulting order is
                reproducible and the global ``random`` state is untouched.
                When ``None`` (the default) the global ``random.shuffle`` is
                used, exactly as before.
        """
        # Single pass over the reviews; dict insertion order fixes the
        # pre-shuffle order as NEGATIVE, NEUTRAL, POSITIVE.
        buckets = {
            Sentiment.NEGATIVE: [],
            Sentiment.NEUTRAL: [],
            Sentiment.POSITIVE: [],
        }
        for review in self.reviews:
            bucket = buckets.get(review.sentiment)
            if bucket is not None:  # reviews with any other label are dropped
                bucket.append(review)

        # common length is the smallest per-class sample count in the dataset
        common_len = min(len(bucket) for bucket in buckets.values())

        combined = [
            review
            for bucket in buckets.values()
            for review in bucket[:common_len]
        ]

        if seed is None:
            random.shuffle(combined)
        else:
            random.Random(seed).shuffle(combined)
        self.reviews = combined


#### Reading dataset from json file

In [45]:
import json

file_name = 'Books_small_10000.json'

# Each non-blank line of the file is parsed as one standalone JSON object;
# its 'reviewText' and 'overall' fields are passed to the Review constructor.
reviews = []

# Read explicitly as UTF-8 so the result does not depend on the platform's
# default text encoding.
with open(file_name, encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            # tolerate blank lines (e.g. a trailing newline at end of file)
            continue
        review = json.loads(line)
        reviews.append(Review(review['reviewText'], review['overall']))

        


#### Preparation of dataset: vectorization of text data

In [46]:
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(reviews, test_size=0.33, random_state=42)

train_data_cont, test_data_cont = ReviewContainer(train_data), ReviewContainer(test_data)

# balance the sentiment classes in both splits (training data first, then test data)
for split_cont in (train_data_cont, test_data_cont):
    split_cont.equalize_distribution()

# preparation of the inputs for the models: review texts (x) and sentiment labels (y)
train_data_x, train_data_y = train_data_cont.get_text(), train_data_cont.get_sentiment()
test_data_x, test_data_y = test_data_cont.get_text(), test_data_cont.get_sentiment()



In [47]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# TF-IDF features: the vectorizer is fitted on the training texts only and
# the test texts are then transformed with that same fitted vectorizer.
vectorizer = TfidfVectorizer()
train_tfidf = vectorizer.fit_transform(train_data_x)
test_tfidf = vectorizer.transform(test_data_x)

# dense copies of the matrices, which the model cells below consume
vectorized_train_x = train_tfidf.toarray()
vectorized_test_x = test_tfidf.toarray()


#### Classification models training

In [48]:
#GAUSSIAN NAIVE BAYES

from sklearn.naive_bayes import GaussianNB

# fit() returns the fitted estimator, so construction and training are chained
clf_gnb = GaussianNB().fit(vectorized_train_x, train_data_y)

# score on the test vectors (shown as the cell output)
clf_gnb.score(vectorized_test_x, test_data_y)

0.4342948717948718

In [49]:
#SUPPORT VECTOR CLASSIFICATION

from sklearn.svm import SVC

# fit() returns the fitted estimator, so construction and training are chained
clf_svc = SVC().fit(vectorized_train_x, train_data_y)

# score on the test vectors (shown as the cell output)
clf_svc.score(vectorized_test_x, test_data_y)

0.5993589743589743

In [50]:
#DECISION TREE CLASSIFIER

from sklearn.tree import DecisionTreeClassifier

# fit() returns the fitted estimator, so construction and training are chained
clf_dtc = DecisionTreeClassifier().fit(vectorized_train_x, train_data_y)

# score on the test vectors (shown as the cell output)
clf_dtc.score(vectorized_test_x, test_data_y)

0.44711538461538464

#### Evaluation of models: f1 score

In [51]:
from sklearn.metrics import f1_score

# per-class F1 scores, reported in the order POSITIVE, NEUTRAL, NEGATIVE
f1_label_order = [Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE]

for heading, fitted_clf in (
    ("GaussianNB: ", clf_gnb),
    ("SVC: ", clf_svc),
    ("Decision Tree Classifier: ", clf_dtc),
):
    predicted = fitted_clf.predict(vectorized_test_x)
    print(heading, f1_score(test_data_y, predicted, average=None, labels=f1_label_order))

GaussianNB:  [0.46786632 0.40089087 0.43902439]
SVC:  [0.6888361  0.51551313 0.59313725]
Decision Tree Classifier:  [0.49308756 0.39902676 0.44665012]


#### SVC seems to be the best model here

#### Hyperparameter tuning:

In [None]:
from sklearn.model_selection import GridSearchCV

# candidate settings: every kernel/C combination below is tried with cv=5
svc_param_grid = {'kernel': ['linear', 'rbf'], 'C': [1, 2, 4, 8, 16]}

svc = SVC()
clf = GridSearchCV(estimator=svc, param_grid=svc_param_grid, cv=5)
clf.fit(vectorized_train_x, train_data_y)

# evaluate the tuned search object on the test vectors
tuned_predictions = clf.predict(vectorized_test_x)
print("Mean score: ", clf.score(vectorized_test_x, test_data_y))
print("F1 score: ", f1_score(test_data_y, tuned_predictions, average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE]))

Mean score:  0.5993589743589743
F1 score:  [0.6888361  0.51551313 0.59313725]


#### Saving model

In [60]:
import pickle

# Serialise the fitted search object to disk.
# NOTE(review): only `clf` is written here; the fitted `vectorizer` is not
# part of this file, so it must be available separately to transform new text.
with open('clf_save.pkl', 'wb') as model_file:
    pickle.Pickler(model_file).dump(clf)

#### Loading model

In [None]:
import pickle

# Restore the classifier from 'clf_save.pkl'.
# Unpickling can execute arbitrary code, so only load files you trust.
with open('clf_save.pkl', 'rb') as model_file:
    saved_clf = pickle.Unpickler(model_file).load()


Mean score:  0.5993589743589743
