In [1]:
class Sentiment:
    NEGATIVE = 'NEGATIVE'
    NEUTRAL = 'NEUTRAL'
    POSITIVE = 'POSITIVE'

class Review:
    def __init__(self, text, score):
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()
        
        
    def get_sentiment(self):
        if self.score <= 2:
            return Sentiment.NEGATIVE
        elif self.score == 3:
            return Sentiment.NEUTRAL
        else:
            return Sentiment.POSITIVE
        

# Loading of Data in json

In [2]:
import json 

file_name = 'Books_small.json'

reviews = []
with open(file_name) as f:
    for line in f: 
        review = json.loads(line)
        reviews.append(Review(review['reviewText'], review['overall']))
        
reviews[50].sentiment

'POSITIVE'

# Data Prep

In [3]:
from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(reviews, test_size=0.33, random_state = 42)
training, test = train_test_split(reviews, test_size=0.33, random_state = 42)

In [4]:
len(training)

670

In [5]:
len(test)

330

In [6]:
train_x = [x.text for x in training]
train_y = [x.sentiment for x in training]

test_x = [x.text for x in test]
test_y = [x.sentiment for x in test]

train_x[0]
train_y[0]

'POSITIVE'

# Bag of Words Vectorization

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
train_x_vectors = vectorizer.fit_transform(train_x)

test_x_vectors = vectorizer.transform(test_x)

# vectorizer.fit(train_x)
# train_x_vectors = vectorizer.transform(train_x)

print(train_x[0])
print(train_x_vectors[0].toarray())

Vivid characters and descriptions. The author has created a tale that grabs your attention and I couldn't put it down.
[[0 0 0 ... 0 0 0]]


# Classification

In [13]:
from sklearn import svm

clf_svm = svm.SVC(kernel='linear')

clf_svm.fit(train_x_vectors, train_y)

test_x[0]
# test_x_vectors[0]

"Every new Myke Cole book is better than the last, and this is no exception. If you haven't read the Shadow Ops series before start with Control Point, but go ahead and order Fortress Frontier and Breach Zone as well - you're going to want them."

## Prediction

In [14]:
clf_svm.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

# Decision Tree

In [17]:
from sklearn.tree import DecisionTreeClassifier

clf_dec = DecisionTreeClassifier()
clf_dec.fit(train_x_vectors, train_y)

clf_dec.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

# Multinomial Naive Bayes

In [21]:
from sklearn.naive_bayes import MultinomialNB
clf_mnb = MultinomialNB()
clf_mnb.fit(train_x_vectors, train_y)

clf_mnb.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

# Gaussian Naive Bayes (Not working for some reason though)

# Logistic Regression

In [25]:
import warnings
warnings.filterwarnings('ignore')

from sklearn.linear_model import LogisticRegression

clf_log = LogisticRegression()
clf_log.fit(train_x_vectors, train_y)

clf_log.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

## Evaluatin with score() method

In [27]:
print(clf_svm.score(test_x_vectors, test_y))
print(clf_dec.score(test_x_vectors, test_y))
print(clf_mnb.score(test_x_vectors, test_y))
print(clf_log.score(test_x_vectors, test_y))

0.8242424242424242
0.7454545454545455
0.8575757575757575
0.8303030303030303


# F1 Score

In [29]:
from sklearn.metrics import f1_score

f1_score(test_y, clf_svm.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE])
f1_score(test_y, clf_dec.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE])
f1_score(test_y, clf_mnb.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE])
f1_score(test_y, clf_log.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE])



array([0.91370558, 0.12244898, 0.1       ])

### the above score suggests that the model is good for picking Positive sentiments but not that great in picking negative and neutral

In [33]:
train_y.count(Sentiment.POSITIVE)

552

In [34]:
train_y.count(Sentiment.NEGATIVE)

47

In [35]:
train_y.count(Sentiment.NEUTRAL)

71

### So the problem may not be of these models but the data.

## We'll do the whole process with a bigger data set and see if it works or we'll do some more tweaking in that