# Data Class

In [1]:
import random


class Sentiment:
    NEGATIVE = "NEGATIVE"
    NEUTRAL = "NEUTRAL"
    POSITIVE = "POSITIVE"


class Review:
    def __init__(self, text, score):
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        if self.score <= 2:
            return Sentiment.NEGATIVE
        elif self.score == 3:
            return Sentiment.NEUTRAL
        else:  #score of 4 or 5
            return Sentiment.POSITIVE


class ReviewContainer:
    def __init__(self, reviews):
        self.reviews = reviews

    def get_text(self):
        return [x.text for x in self.reviews]

    def get_sentiment(self):
        return [x.sentiment for x in self.reviews]

    def evenly_distribution(self):
        negative = list(filter(lambda x: x.sentiment == Sentiment.NEGATIVE, self.reviews))
        positive = list(filter(lambda x: x.sentiment == Sentiment.POSITIVE, self.reviews))

        positive_shrunk = positive[:len(negative)]
        self.reviews = negative + positive_shrunk
        random.shuffle(self.reviews)



# Load Data 

In [3]:
import json

file_name = r'ignore/dataBooks_5_new.json'

reviews = []
with open(file_name) as f:
    for line in f:
        review = json.loads(line)
        #         print(review['reviewText'])
        #         print(review['overall'])
        reviews.append(Review(review['reviewText'], review['overall']))

reviews[5].text


"Boy, what a whiney person Adrianna is, demanding too. Muscling Adam into going to a party just to make her happy. Each has lusted after the other for 4years. Surprise, surprise she gets them lost in the woods and he has no backbone to stand up to her. I will read the next story, but I'm not sold yet.  Only time will tell."

# Prep Data

In [4]:
from sklearn.model_selection import train_test_split

training, test = train_test_split(reviews, test_size=0.33, random_state=42)

train_container = ReviewContainer(training)
test_container = ReviewContainer(test)


In [5]:
train_container.evenly_distribution()
train_x = train_container.get_text()
train_y = train_container.get_sentiment()

test_container.evenly_distribution()
test_x = test_container.get_text()
test_y = test_container.get_sentiment()

print(train_y.count(Sentiment.POSITIVE))
print(train_y.count(Sentiment.NEGATIVE))

21779
21779


## Bag of words vectorization 

In [6]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

vectorizer = TfidfVectorizer()
train_x_vectors = vectorizer.fit_transform(train_x)

test_x_vectors = vectorizer.transform(test_x)

print(train_x[0])
print(train_x_vectors[0].toarray())



Even with it being free, it was an entire waste of time -- albeit a short one.  I can summarize the whole thing for you in three little sentences:1.  Write what you know and do a lot of observing.2.  Place yourself in the minds of your characters.3.  Read ALL of my books, as I don't make a lot of money.She deliberately won't even mention which piece of inspiration went with which book, apparently in the hopes we'll buy all of them.  (Not kidding.  She says that in as many words, beginning this article with a gripe about how "criminally little" she earns as a writer.)There you go.  That's the wisdom, given with the hopes it will inspire you to read the author's books.  It's a promotional article with little else, written as if we're all in junior high.
[[0. 0. 0. ... 0. 0. 0.]]


# Classification

### Linear SVM

In [7]:
from sklearn import svm

clf_svm = svm.SVC(kernel='linear')

clf_svm.fit(train_x_vectors, train_y)

test_x[0]

clf_svm.predict(test_x_vectors[0])


array(['POSITIVE'], dtype='<U8')

### Decision Tree

In [8]:
from sklearn.tree import DecisionTreeClassifier

clf_dec = DecisionTreeClassifier()
clf_dec.fit(train_x_vectors, train_y)

clf_dec.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

### Naive Bayes

In [9]:
from sklearn.naive_bayes import GaussianNB

clf_gnb = DecisionTreeClassifier()
clf_gnb.fit(train_x_vectors, train_y)

clf_gnb.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

### Logistic Regression

In [10]:
from sklearn.linear_model import LogisticRegression

clf_log = LogisticRegression()

clf_log.fit(train_x_vectors, train_y)

clf_log.predict(test_x_vectors[0])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


array(['POSITIVE'], dtype='<U8')

# Evaluation

In [11]:
# Mean Accuracy
print(clf_svm.score(test_x_vectors, test_y))
print(clf_dec.score(test_x_vectors, test_y))
print(clf_gnb.score(test_x_vectors, test_y))
print(clf_log.score(test_x_vectors, test_y))

0.9041268950028074
0.7357757813962194
0.7352142990829122
0.9027231892195395


In [12]:
# F1 scores
from sklearn.metrics import f1_score

# f1_score(test_y, clf_svm.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEGATIVE])
f1_score(test_y, clf_svm.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEGATIVE])



array([0.9040326 , 0.90422101])

In [13]:
test_set = ['not great', 'i dont like it', 'relax', 'trash', 'absolute junk']
new_test = vectorizer.transform(test_set)

clf_svm.predict(new_test)

array(['POSITIVE', 'NEGATIVE', 'POSITIVE', 'NEGATIVE', 'NEGATIVE'],
      dtype='<U8')

## Tuning our model (with grid search)

In [14]:
# from sklearn.model_selection import GridSearchCV
#
# parameters = {'kernel': ('linear', 'rbf'), 'C': (1, 4, 8, 16, 32)}
#
# svc = svm.SVC()
# clf = GridSearchCV(svc, parameters, cv=5)
# clf.fit(train_x_vectors, train_y)




In [15]:
# print(clf.score(test_x_vectors, test_y))

## Saving Model

In [22]:
import pickle

with open(r'./models/sentiment_classifier.pkl', 'wb') as f:
    pickle.dump(clf_svm, f)

with open(r'./models/vectorizer.pkl', 'wb') as g:
    pickle.dump(vectorizer, g)

### load model

In [23]:
import pickle

with open(r'./models/sentiment_classifier.pkl', 'rb') as f:
    loaded_clf = pickle.load(f)

with open(r'./models/vectorizer.pkl', 'rb') as g:
    loaded_vectorizer = pickle.load(g)

In [28]:
# print(test_x[56])
# from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

t1 = ['there was nothing interesting in the book, waste of time', 'absolute disgusting', 'loved it', 'amazing content', 'not a good one']
test_case = loaded_vectorizer.transform(t1)
loaded_clf.predict(test_case)

array(['NEGATIVE', 'NEGATIVE', 'POSITIVE', 'POSITIVE', 'NEGATIVE'],
      dtype='<U8')