In [189]:
import numpy as np
import random

class Sentiment:
    NEGATIVE = 'NEGATIVE'
    NEUTRAL = 'NEUTRAL'
    POSITIVE = 'POSITIVE'

class Review:
    def __init__(self, text, score):
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()
        
    def get_sentiment(self):
        if self.score <= 2:
            return Sentiment.NEGATIVE
        elif self.score == 3:
            return Sentiment.NEUTRAL
        else:
            return Sentiment.POSITIVE # Score of 4 or 5
         
class ReviewContainer:
    def __init__(self,reviews):
        self.reviews = reviews
        
    def get_text(self):
        return [x.text for x in self.reviews]
    
    def get_sentiment(self):
        return [x.sentiment for x in self.reviews]
       
    def evenly_distribute(self):
        negative = list(filter(lambda x: x.sentiment == Sentiment.NEGATIVE, self.reviews))
        positive = list(filter(lambda x: x.sentiment == Sentiment.POSITIVE, self.reviews))
        neutral = list(filter(lambda x: x.sentiment == Sentiment.NEUTRAL, self.reviews))
        
        positive_shrunk = positive[:len(negative)]
        
        self.reviews = negative + positive_shrunk
        random.shuffle(self.reviews)
        
#         print(len(positive))
#         print(len(negative))
        
        

### Load Data

In [190]:
# Import json to process json files
import json

reviews = []
with open('Books_small_2.json') as f:
    for line in f:
        review = json.loads(line)
        reviews.append(Review(review['reviewText'],review['overall']))
        

In [191]:
len(reviews)

10000

In [192]:
reviews[0].sentiment, reviews[0].score

('POSITIVE', 5.0)

### Prep Data

In [193]:
from sklearn.model_selection import train_test_split

# Splitting into train and test sets
train, test = train_test_split(reviews,test_size=0.2)

train_container = ReviewContainer(train)

test_container = ReviewContainer(test)

train_container.evenly_distribute()
test_container.evenly_distribute()

len(train_container.reviews), len(test_container.reviews)

(1014, 274)

In [199]:
x_train = train_container.get_text()
y_train = train_container.get_sentiment()

x_test = test_container.get_text()
y_test = test_container.get_sentiment()

y_train.count(Sentiment.POSITIVE), y_train.count(Sentiment.NEGATIVE)

(507, 507)

In [195]:
x_train[0], y_train[0]

("When I first started reading H.M. Ward's books, I was definitely hooked. But now, I am seeing that it's more of a money making game than it is writing to please your fan base. Don't get me wrong, the story lines are amazing. But the more I pay out for ALL of the series she has going on with all of them tying together, the less and less story I am getting! There are too many series going at once, and this most recent release Propisition 2, was short..very short. And all over the place. Very hard to follow. This series, I am willing to give up on. I am too invested in the Arrangement to let go of that. Also, has Trystan Scott been forgotten about? I feel that the author is getting so wrapped up in the money being made off the Ferro's that she completely lost sight of the fans waiting patiently for the next Trytan Scott book. Little by little, I am giving up on this author...my wallet isn't very happy either.",
 'NEGATIVE')

### Bag Of Words Vectorization
[Bag Of Words](https://medium.com/greyatom/an-introduction-to-bag-of-words-in-nlp-ac967d43b428)

In [237]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# This book is great!
# This book was so bad

# Tfidf allows us to weight (great,bad) higher than stuff like this,book,is,was etc.

vectorizer = TfidfVectorizer()
x_train_vector = vectorizer.fit_transform(x_train) 

x_test_vector = vectorizer.transform(x_test)
# vectorizer.fit(train_x)
# x_train_vector = vectorizer.transform(train_x)

In [238]:
import pandas as pd

print(x_train[0])
pd.DataFrame(x_train_vector[0].toarray())


When I first started reading H.M. Ward's books, I was definitely hooked. But now, I am seeing that it's more of a money making game than it is writing to please your fan base. Don't get me wrong, the story lines are amazing. But the more I pay out for ALL of the series she has going on with all of them tying together, the less and less story I am getting! There are too many series going at once, and this most recent release Propisition 2, was short..very short. And all over the place. Very hard to follow. This series, I am willing to give up on. I am too invested in the Arrangement to let go of that. Also, has Trystan Scott been forgotten about? I feel that the author is getting so wrapped up in the money being made off the Ferro's that she completely lost sight of the fans waiting patiently for the next Trytan Scott book. Little by little, I am giving up on this author...my wallet isn't very happy either.


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9866,9867,9868,9869,9870,9871,9872,9873,9874,9875
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Classification

**Linear SVM**

In [239]:
from sklearn.svm import SVC

# Set up a random seed
np.random.seed(50)

# Instatiate the model 
svm_model = SVC(kernel='linear')

# Fit the model to data
svm_model.fit(x_train_vector,y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [240]:
# Evaluating model on training data
svm_model.score(x_train_vector,y_train)

0.9812623274161736

In [241]:
# Evaluating model on test data
svm_model.score(x_test_vector,y_test)

0.8540145985401459

In [242]:
y_test[0],
# predicting
svm_model.predict(x_test_vector[0])

array(['NEGATIVE'], dtype='<U8')

**Decision Tree**

In [243]:
from sklearn.tree import DecisionTreeClassifier

# Set up a random seed
np.random.seed(50)

# Instantiate the model
dec_model = DecisionTreeClassifier()

# Fit the model to data
dec_model.fit(x_train_vector,y_train)

# Evaluate the model on test data
dec_model.score(x_test_vector,y_test)

0.6642335766423357

In [244]:
# predicting
dec_model.predict(x_test_vector[0])

array(['NEGATIVE'], dtype='<U8')

**Naive Bayes**

In [245]:
from sklearn.naive_bayes import GaussianNB

# Set up random seed
np.random.seed(50)

# Instatiate the model
nb_model = GaussianNB()

# Fit the model to data
nb_model.fit(x_train_vector.toarray(),y_train)

# Evaluate the model on test data
nb_model.score(x_test_vector.toarray(),y_test)

0.5620437956204379

In [246]:
# predicting
dec_model.predict(x_test_vector[0])

array(['NEGATIVE'], dtype='<U8')

**Logistic Regression**

In [247]:
from sklearn.linear_model import LogisticRegression

# Set up random seed
np.random.seed(50)

# Instatiate the model
logis_model = LogisticRegression()

# Fit the model to data
logis_model.fit(x_train_vector,y_train)

# Evaluate the model on test data
logis_model.score(x_test_vector,y_test)



0.843065693430657

In [248]:
# predicting
logis_model.predict(x_test_vector[0])

array(['NEGATIVE'], dtype='<U8')

**RandomForestClassifier**

In [249]:
from sklearn.ensemble import RandomForestClassifier

# Set up random seed
np.random.seed(50)

# Instatiate the model
rfc_model = RandomForestClassifier(n_estimators=100)

# Fit the model to data
rfc_model.fit(x_train_vector,y_train)


# Evaluate the model on test data
rfc_model.score(x_test_vector,y_test)

0.791970802919708

In [250]:
# predicting
logis_model.predict(x_test_vector[0])

array(['NEGATIVE'], dtype='<U8')

### Evaluation

In [251]:
# Mean Accuracy
print(f'Linear SVM: {svm_model.score(x_test_vector,y_test)}')
print(f'Dicision Tree: {dec_model.score(x_test_vector,y_test)}')
print(f'Naive Bayes: {nb_model.score(x_test_vector.toarray(),y_test)}')
print(f'Logistic Regression: {logis_model.score(x_test_vector,y_test)}')
print(f'RandomForestClassifier: {rfc_model.score(x_test_vector,y_test)}')

Linear SVM: 0.8540145985401459
Dicision Tree: 0.6642335766423357
Naive Bayes: 0.5620437956204379
Logistic Regression: 0.843065693430657
RandomForestClassifier: 0.791970802919708


In [252]:
# F1 score
from sklearn.metrics import f1_score

f1_score(y_test,svm_model.predict(x_test_vector),average=None, 
         labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE])

f1_score(y_test,svm_model.predict(x_test_vector),average=None, 
         labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE])

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


array([0.85074627, 0.        , 0.85714286])

In [222]:
y_test.count(Sentiment.POSITIVE)

137

In [256]:
test_set = ['I hate this','5 stars','do not buy','horrible']
new_test = vectorizer.transform(test_set)

rfc_model.predict(new_test)

array(['POSITIVE', 'POSITIVE', 'NEGATIVE', 'POSITIVE'], dtype='<U8')

In [257]:
svm_model.predict(new_test)

array(['NEGATIVE', 'POSITIVE', 'NEGATIVE', 'NEGATIVE'], dtype='<U8')

### Tuning Our Model 

In [260]:
from sklearn.model_selection import GridSearchCV

grid = {'kernel':('linear','rbf'),
        'C': (1,4,8,16,32)}

clf = GridSearchCV(estimator=svm_model,
                  cv=5,
                  param_grid=grid)

clf.fit(x_train_vector, y_train)



GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='linear',
                           max_iter=-1, probability=False, random_state=None,
                           shrinking=True, tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'C': (1, 4, 8, 16, 32), 'kernel': ('linear', 'rbf')},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [261]:
clf.best_params_

{'C': 1, 'kernel': 'linear'}

In [262]:
clf.score(x_test_vector,y_test)

0.8540145985401459

In [263]:
f1_score(y_test,svm_model.predict(x_test_vector),average=None, 
         labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE])

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


array([0.85074627, 0.        , 0.85714286])

In [278]:
test_set = ['I hate this','5 stars','do not buy','was not good']
new_test = vectorizer.transform(test_set)

svm_model.predict(new_test)

array(['NEGATIVE', 'POSITIVE', 'NEGATIVE', 'NEGATIVE'], dtype='<U8')

### Saving Model

In [279]:
import pickle

pickle.dump(svm_model,open('nlp_classifier','wb'))

loaded_svm_model = pickle.load(open('nlp_classifier','rb'))

In [280]:
loaded_svm_model.score(x_test_vector,y_test)

0.8540145985401459