In [35]:
import random

class Sentiment:
    NEGATIVE = 'NEGATIVE'
    NEUTRAL = 'NEUTRAL'
    POSITIVE = 'POSITIVE'

class Review:
    def __init__(self, text, score):
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()
        
        
    def get_sentiment(self):
        if self.score <= 2:
            return Sentiment.NEGATIVE
        elif self.score == 3:
            return Sentiment.NEUTRAL
        else:
            return Sentiment.POSITIVE


class ReviewContainer:
    def __init__(self, reviews):
        self.reviews = reviews

    
    def get_text(self):
        return [x.text for x in self.reviews]
    
    def get_sentiment(self):
        return [x.sentiment for x in self.reviews]
        
    def evnely_distribute(self):
        negative = list(filter(lambda x: x.sentiment == Sentiment.NEGATIVE, self.reviews))
        positive = list(filter(lambda x: x.sentiment == Sentiment.POSITIVE, self.reviews))
        
        positive_shrunk = positive[:len(negative)]
        self.reviews = negative + positive_shrunk
        random.shuffle(self.reviews)
        
        
#         print(negative[0].text)
#         print(len(negative))
#         print(len(positive))

# Loading of Data in json

In [36]:
import json 

file_name = 'Books_small_10000.json'

reviews = []
with open(file_name) as f:
    for line in f: 
        review = json.loads(line)
        reviews.append(Review(review['reviewText'], review['overall']))
        
reviews[50].sentiment

'NEGATIVE'

# Data Prep

In [37]:
from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(reviews, test_size=0.33, random_state = 42)
training, test = train_test_split(reviews, test_size=0.33, random_state = 42)


train_container = ReviewContainer(training)
# cont = ReviewContainer(training)

test_container = ReviewContainer(test)

# cont.evnely_distribute()

cont.evnely_distribute() 

len(cont.reviews)

872

In [38]:
len(training)

6700

In [39]:
len(test)

3300

In [59]:
train_container.evnely_distribute()

train_x = train_container.get_text()
train_y = train_container.get_sentiment()


test_container.evnely_distribute()
test_x = test_container.get_text()
test_y = test_container.get_sentiment()

# train_x[0]
# train_y[0]

print(train_y.count(Sentiment.POSITIVE))
print(train_y.count(Sentiment.NEGATIVE))

436
436


# Bag of Words Vectorization

In [72]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# vectorizer = CountVectorizer()
vectorizer = TfidfVectorizer()
train_x_vectors = vectorizer.fit_transform(train_x)

test_x_vectors = vectorizer.transform(test_x)

# vectorizer.fit(train_x)
# train_x_vectors = vectorizer.transform(train_x)

print(train_x[0])
print(train_x_vectors[0].toarray())

I could barely put this down, actually woke up in the middle of the night and read for an hour till I could no longer keep my eyes open.  Thank you Anne for another gift of your talent!
[[0. 0. 0. ... 0. 0. 0.]]


# Classification

In [73]:
from sklearn import svm

clf_svm = svm.SVC(kernel='linear')

clf_svm.fit(train_x_vectors, train_y)

test_x[0]
# test_x_vectors[0]

'did not serve my need.'

## Prediction

In [74]:
clf_svm.predict(test_x_vectors[0])

array(['NEGATIVE'], dtype='<U8')

# Decision Tree

In [75]:
from sklearn.tree import DecisionTreeClassifier

clf_dec = DecisionTreeClassifier()
clf_dec.fit(train_x_vectors, train_y)

clf_dec.predict(test_x_vectors[0])

array(['NEGATIVE'], dtype='<U8')

# Multinomial Naive Bayes

In [76]:
from sklearn.naive_bayes import MultinomialNB
clf_mnb = MultinomialNB()
clf_mnb.fit(train_x_vectors, train_y)

clf_mnb.predict(test_x_vectors[0])

array(['NEGATIVE'], dtype='<U8')

# Gaussian Naive Bayes (Not working for some reason though)

# Logistic Regression

In [77]:
import warnings
warnings.filterwarnings('ignore')

from sklearn.linear_model import LogisticRegression

clf_log = LogisticRegression()
clf_log.fit(train_x_vectors, train_y)

clf_log.predict(test_x_vectors[0])

array(['NEGATIVE'], dtype='<U8')

## Evaluatin with score() method

In [78]:
print(clf_svm.score(test_x_vectors, test_y))
print(clf_dec.score(test_x_vectors, test_y))
print(clf_mnb.score(test_x_vectors, test_y))
print(clf_log.score(test_x_vectors, test_y))

0.8076923076923077
0.6514423076923077
0.8125
0.8052884615384616


# F1 Score

In [79]:
from sklearn.metrics import f1_score

f1_score(test_y, clf_svm.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE])
f1_score(test_y, clf_dec.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE])
f1_score(test_y, clf_mnb.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE])
f1_score(test_y, clf_log.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE])



array([0.80291971, 0.        , 0.80760095])

### the above score suggests that the model is good for picking Positive sentiments but not that great in picking negative and neutral

In [80]:
train_y.count(Sentiment.POSITIVE)

436

In [81]:
train_y.count(Sentiment.NEGATIVE)

436

In [82]:
train_y.count(Sentiment.NEUTRAL)

0

# Checking the model

In [71]:
test_set = ['sabir is the best', 'she is the worst' , 'they are the best people overall']

new_test = vectorizer.transform(test_set)

clf_svm.predict(new_test)

array(['POSITIVE', 'NEGATIVE', 'POSITIVE'], dtype='<U8')

# Tuning our model with GridSearchCV

In [83]:
from sklearn.model_selection import GridSearchCV

In [85]:
parameters = {'kernel': ('linear', 'rbf'), 'C': (1,4,8,16,32)}

svc = svm.SVC()
clf = GridSearchCV(svc, parameters, cv = 5)
clf.fit(train_x_vectors, train_y)

In [86]:
clf.best_params_

{'C': 1, 'kernel': 'linear'}

In [87]:
clf.best_score_

0.8337011494252874

# before it was 0.80 now it's 0.83. 

# Saving the model 


In [89]:
import pickle

with open('Sentiment_Classifier.pkl', 'wb') as f:
    pickle.dump(clf, f)

# Loading the model and using it

In [90]:
with open('Sentiment_Classifier.pkl', 'rb') as f:
    loaded_clf = pickle.load(f)

In [91]:
print(test_x[0])

loaded_clf.predict(test_x_vectors[0])

did not serve my need.


array(['NEGATIVE'], dtype='<U8')