# Learnt from

In [84]:
### https://youtu.be/M9Itm95JzL0

In [216]:
import random
import numpy as np

class Sentiment:
    NEGATIVE = "NEGATIVE"
    NEUTRAL = "NEUTRAL"
    POSITIVE = "POSITIVE"

class Review:
    def __init__(self,text,score):
        self.text=text
        self.score=score
        self.sentiment=self.getSentiment()
    def getSentiment(self):
        if self.score<=2:
            return Sentiment.NEGATIVE
        elif self.score>2 and self.score<4:
            return Sentiment.NEUTRAL
        else:
            return Sentiment.POSITIVE
        
class ReviewContainer:
    def __init__(self,reviews):
        self.reviews=reviews ########## "reviews" collection of "Review" objects
    def get_text(self):
        return [x.text for x in self.reviews]
    def get_sentiment(self):
        return [x.sentiment for x in self.reviews]
    def evenly_distribute(self):
        negative=list(filter(lambda x:x.sentiment==Sentiment.NEGATIVE,self.reviews))
        positive=list(filter(lambda x:x.sentiment==Sentiment.POSITIVE,self.reviews))
        nutral=list(filter(lambda x:x.sentiment==Sentiment.NEUTRAL,self.reviews))
        positive_chunk=positive[:len(negative)+1]
        self.reviews=negative+positive_chunk+nutral
        random.shuffle(self.reviews)

In [217]:
import json
file_name = './data/sentiment/books_small_10000.json'
reviews=[]

with open(file_name) as f:
    for line in f: ###### go line by line
        review=json.loads(line)
        reviews.append(Review(review['reviewText'],review['overall']))
        


In [218]:
from sklearn.model_selection import train_test_split
training, test = train_test_split(reviews, test_size=.2, random_state=42)
train_container = ReviewContainer(training)
test_container = ReviewContainer(test)

In [219]:
train_container.evenly_distribute()
test_container.evenly_distribute()

train_x=train_container.get_text()
train_y=train_container.get_sentiment()

test_x=test_container.get_text()
test_y=test_container.get_sentiment()

print(train_y.count(Sentiment.POSITIVE))
print(train_y.count(Sentiment.NEGATIVE))
print(train_y.count(Sentiment.NEUTRAL))

print(test_y.count(Sentiment.POSITIVE))
print(test_y.count(Sentiment.NEGATIVE))
print(test_y.count(Sentiment.NEUTRAL))

514
513
783
132
131
195


In [227]:
from sklearn.feature_extraction.text import CountVectorizer ### unweighted not so much better
from sklearn.feature_extraction.text import TfidfVectorizer ### weighted and better than CountVectorizer

vectorizer=TfidfVectorizer()


##vectorizer.fit(train_x)
##train_x_vectors=vectorizer.transform(train_x)
##test_x_vectors=vectorizer.transform(test_x)

## OR

train_x_vectors=vectorizer.fit_transform(train_x)
test_x_vectors=vectorizer.transform(test_x)  ###<<<<< test data fitted according to train data

print(train_x[0])
print(train_x_vectors[0].toarray())


A sweet regency love story with a little twist . I liked it and thought the trilogy had potential .
[[0. 0. 0. ... 0. 0. 0.]]


In [228]:
##########classifier amplifiers ##############
from sklearn.model_selection import GridSearchCV
############classifiers ############
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

In [229]:
clf_svm = svm.SVC()
clf_dec = DecisionTreeClassifier()
clf_gnb = GaussianNB()
clf_log = LogisticRegression(penalty='l2',dual=False)

In [230]:
param_svm={'kernel':('linear', 'rbf'),'C':(1,4,8,16,32),'degree':(3,4,5),'gamma':('scale','auto')}
param_dec={'criterion':('gini','entropy'),'splitter':('best','random')
           ,'max_depth':(2,3,4),'min_samples_leaf':(2,3,4),'max_features':('auto','sqrt','log2',2,3,4)} 
param_gnb={'var_smoothing':(1e-09,1e-08,1e-07)}
param_log={'tol':(0.0001,0.0002,0.0003,0.001),'C':(1,4,8,16,32),
          'solver':('newton-cg','lbfgs','sag','saga'),
          'multi_class':('auto','ovr','multinomial'),
          'max_iter':(100,150,200)}

In [231]:
G_clf_svm=GridSearchCV(clf_svm,param_svm,cv=5)
G_clf_dec=GridSearchCV(clf_dec,param_dec,cv=5)
G_clf_gnb=GridSearchCV(clf_gnb,param_gnb,cv=5)
G_clf_log=GridSearchCV(clf_log,param_log,cv=5)

In [233]:
G_clf_svm.fit(train_x_vectors,train_y)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'C': (1, 4, 8, 16, 32), 'degree': (3, 4, 5),
                         'gamma': ('scale', 'auto'),
                         'kernel': ('linear', 'rbf')},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [234]:
G_clf_dec.fit(train_x_vectors,train_y)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=None,
                                              splitter='best'),
             iid='warn', n_jobs=None,
             param_grid={'criterion': ('gini', 'entropy'),
                         'max_depth': (2, 3, 4),
                         'ma

In [235]:
G_clf_log.fit(train_x_vectors,train_y)









GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'C': (1, 4, 8, 16, 32), 'max_iter': (100, 150, 200),
                         'multi_class': ('auto', 'ovr', 'multinomial'),
                         'solver': ('newton-cg', 'lbfgs', 'sag', 'saga'),
                         'tol': (0.0001, 0.0002, 0.0003, 0.001)},
             pre_dispatch='2*n_jobs', refit=True, ret

In [243]:
import numpy as np
G_clf_gnb.fit(train_x_vectors.toarray(),np.array(train_y))



GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=GaussianNB(priors=None, var_smoothing=1e-09), iid='warn',
             n_jobs=None, param_grid={'var_smoothing': (1e-09, 1e-08, 1e-07)},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [245]:
# Mean Accuracy
print(G_clf_svm.score(test_x_vectors, test_y))
print(G_clf_dec.score(test_x_vectors, test_y))
print(G_clf_log.score(test_x_vectors, test_y))
print(G_clf_gnb.score(test_x_vectors.toarray(), np.array(test_y)))

0.6331877729257642
0.4388646288209607
0.62882096069869
0.462882096069869


In [247]:
'''
F1 score: 2(precision*recall)/(precision+recall)
'''
from sklearn.metrics import f1_score
##### (actual,predicted,avg,labels=[...])
print(f1_score(test_y,G_clf_svm.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE,Sentiment.NEUTRAL,Sentiment.NEGATIVE]))
print(f1_score(test_y,G_clf_dec.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE,Sentiment.NEUTRAL,Sentiment.NEGATIVE]))
print(f1_score(test_y,G_clf_gnb.predict(test_x_vectors.toarray()), average=None, labels=[Sentiment.POSITIVE,Sentiment.NEUTRAL,Sentiment.NEGATIVE]))
print(f1_score(test_y,G_clf_log.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE,Sentiment.NEUTRAL,Sentiment.NEGATIVE]))

[0.66129032 0.65429234 0.56540084]
[0.06896552 0.60031596 0.08695652]
[0.47014925 0.51707317 0.36134454]
[0.66666667 0.64319249 0.56170213]


In [257]:
import joblib

classifiers={'G_clf_svm':G_clf_svm,'G_clf_dec':G_clf_dec,'G_clf_gnb':G_clf_gnb,'G_clf_log':G_clf_log}

for classifier in classifiers.keys():
    filename = f'{classifier}.sav'
    joblib.dump(classifiers[classifier],filename)


In [258]:
loaded_clf=[]
for classifier in classifiers.keys():
    loaded_clf.append(joblib.load(filename))

In [260]:
test_set = ['very fun', "bad book do not buy", 'horrible waste of time','quite good']
new_test = vectorizer.transform(test_set)
for classifier in loaded_clf:
    if classifier!=G_clf_gnb:
        print(classifier.predict(new_test))
    else:
        print(classifier.predict(new_test.toarray()))

['POSITIVE' 'NEGATIVE' 'NEGATIVE' 'NEUTRAL']
['POSITIVE' 'NEGATIVE' 'NEGATIVE' 'NEUTRAL']
['POSITIVE' 'NEGATIVE' 'NEGATIVE' 'NEUTRAL']
['POSITIVE' 'NEGATIVE' 'NEGATIVE' 'NEUTRAL']


'5 porn'