# Sentiments Analysis Model


### Author Shipra

In [55]:
import random

In [56]:
class Sentiment:
    NEGATIVE="NEGATIVE"
    NEUTRAL="NEUTRAL"
    POSITIVE="POSITIVE"

In [57]:
class Review:
    def __init__(self,text,score):
        self.text=text
        self.score=score
        self.sentiment=self.get_sentiment()
        
    def get_sentiment(self):
        if self.score <=2:
            return Sentiment.NEGATIVE
        elif self.score == 3:
            return Sentiment.NEUTRAL
        else :
            return Sentiment.POSITIVE
        
        
        
class ReviewContainer:
    def __init__ (self,reviews):
        self.reviews=reviews
        
    def get_text(self):
        return [x.text  for x in self.reviews]
    
    def get_sentiment(self):
        return [x.sentiment for x in self.reviews]
    def evenly_distribute(self):
        negative=list(filter(lambda x : x.sentiment==Sentiment.NEGATIVE,self.reviews))
        positive=list(filter(lambda x : x.sentiment==Sentiment.POSITIVE,self.reviews))
        positive_shrunk=positive[:len(negative)]
        self.reviews=negative+positive_shrunk
        random.shuffle(self.reviews)
    
                
        

In [58]:
import json


In [59]:
file_name= 'C:/Users/asus/Desktop/web development/Books_small_10000b.json'


In [60]:

        
reviews=[]
with open(file_name) as f:
    for line in f:
        review = json.loads(line)
        reviews.append(Review(review['reviewText'],review['overall']))
        
        
reviews[3].text

    
        

'I really enjoyed this adventure and look forward to reading more of Robert Spire. I especially liked all the info on global warming. You did a good job on the research.'

## Prep Data

In [61]:
len(reviews)

10000

In [62]:
from sklearn.model_selection import train_test_split

In [63]:
training,test=train_test_split(reviews,test_size=0.33,random_state=42)

train_container=ReviewContainer(training)
test_container=ReviewContainer(test)
train_container.evenly_distribute()
train_x=train_container.get_text()
train_y=train_container.get_sentiment()
test_container.evenly_distribute()
test_x=test_container.get_text()
test_y=test_container.get_sentiment()

print(train_y.count(Sentiment.POSITIVE))
print(train_y.count(Sentiment.NEGATIVE))
print(test_y.count(Sentiment.POSITIVE))
print(test_y.count(Sentiment.NEGATIVE))


436
436
208
208


In [64]:
print(training[0].sentiment)

POSITIVE


In [65]:
##train_x=[x.text for x in training]
##train_y=[x.sentiment for x in training]

##test_x=[x.text for x in test]
##test_y=[x.sentiment for x in test]

In [66]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

In [67]:
vectorizer=TfidfVectorizer()

train_x_vectors = vectorizer.fit_transform(train_x)
test_x_vectors=vectorizer.transform(test_x)

In [68]:
print(train_x_vectors[0].toarray())

[[0. 0. 0. ... 0. 0. 0.]]


## Classification

In [69]:
## linear svm
from sklearn import svm
clf_svm=svm.SVC(kernel='linear')
clf_svm.fit(train_x_vectors,train_y)
clf_svm.predict(test_x_vectors[0])

array(['NEGATIVE'], dtype='<U8')

In [70]:
## Decision Tree
from sklearn.tree import DecisionTreeClassifier
clf_dec=DecisionTreeClassifier()
clf_dec.fit(train_x_vectors,train_y)
clf_dec.predict(test_x_vectors[0])

array(['NEGATIVE'], dtype='<U8')

In [71]:
## Naive Bayes
from sklearn.naive_bayes import GaussianNB
clf_gnb=DecisionTreeClassifier()
clf_gnb.fit(train_x_vectors,train_y)
clf_gnb.predict(test_x_vectors[0])


array(['NEGATIVE'], dtype='<U8')

In [72]:

## logistic Regression
from sklearn.linear_model import LogisticRegression
clf_log=LogisticRegression()
clf_log.fit(train_x_vectors,train_y)
clf_log.predict(test_x_vectors[0])

array(['NEGATIVE'], dtype='<U8')

## Evaluation

In [73]:
## Mean Accuracy
print(clf_svm.score(test_x_vectors,test_y))
print(clf_dec.score(test_x_vectors,test_y))
print(clf_gnb.score(test_x_vectors,test_y))
print(clf_log.score(test_x_vectors,test_y))

0.8076923076923077
0.6370192307692307
0.6225961538461539
0.8052884615384616


In [74]:
## F1 Score
from sklearn.metrics import f1_score

In [75]:
f1_score(test_y,clf_svm.predict(test_x_vectors),average=None,labels=[Sentiment.POSITIVE,Sentiment.NEUTRAL,Sentiment.NEGATIVE])


  average, "true nor predicted", 'F-score is', len(true_sum)


array([0.80582524, 0.        , 0.80952381])

In [76]:
f1_score(test_y,clf_dec.predict(test_x_vectors),average=None,labels=[Sentiment.POSITIVE,Sentiment.NEUTRAL,Sentiment.NEGATIVE])

array([0.63260341, 0.        , 0.64133017])

In [77]:
f1_score(test_y,clf_gnb.predict(test_x_vectors),average=None,labels=[Sentiment.POSITIVE,Sentiment.NEUTRAL,Sentiment.NEGATIVE])

array([0.61425061, 0.        , 0.63058824])

In [78]:
f1_score(test_y,clf_log.predict(test_x_vectors),average=None,labels=[Sentiment.POSITIVE,Sentiment.NEUTRAL,Sentiment.NEGATIVE])

array([0.80291971, 0.        , 0.80760095])

In [79]:
train_y.count(Sentiment.NEGATIVE)

436

In [80]:
train_y.count(Sentiment.POSITIVE)

436

In [81]:
test_y.count(Sentiment.POSITIVE)

208

In [82]:
test_y.count(Sentiment.NEGATIVE)

208

In [95]:
test_set=['this restraunt is not reviewed place']
new_test=vectorizer.transform(test_set)
clf_svm.predict(new_test)

array(['NEGATIVE'], dtype='<U8')

# Tunning our model with grid search

In [84]:
from sklearn.model_selection import GridSearchCV
parameters={'kernel':('linear','rbf'),'C':(1,4,8,16,32)}
svc=svm.SVC()
clf=GridSearchCV(svc,parameters,cv=5)
clf.fit(train_x_vectors,train_y)

GridSearchCV(cv=5, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': (1, 4, 8, 16, 32), 'kernel': ('linear', 'rbf')},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [85]:
print(clf.score(test_x_vectors,test_y))

0.8197115384615384


# Saving Model

In [86]:
pip install pickle-mixin

Note: you may need to restart the kernel to use updated packages.


In [87]:
import pickle

In [88]:
with open('C:/Users/asus/Desktop/web development/model_sentiment.pkl','wb') as f:
    pickle.dump(clf,f)

# Load model

In [89]:
with open('C:/Users/asus/Desktop/web development/model_sentiment.pkl','rb') as f:
    loaded_clf=pickle.load(f)

In [90]:
print(test_x[0])

I read all the reviews on this one and really sounded like it would be a good read, but unfortunately, this book is really bad. The plot is very confusing. I could not understand half the writing. The story just did not make any sense to me, it kept jumping around. I really tried to keep reading it,  but I gave up not even half way thru. Just didn't get it.


In [91]:
loaded_clf.predict(test_x_vectors[0])

array(['NEGATIVE'], dtype='<U8')

In [100]:
test_set=['Very unstable service, on both phone page and browser']
new_test=vectorizer.transform(test_set)
clf_log.predict(new_test)

array(['POSITIVE'], dtype='<U8')