## Data classes

In [1]:
import random

class Sentiment:
    """String labels used for a review's sentiment throughout the notebook."""

    # Each constant's value is simply its own name.
    NEGATIVE, NEUTRAL, POSITIVE = "NEGATIVE", "NEUTRAL", "POSITIVE"

class Review:
    """One review: its text, its numeric score, and the sentiment label derived from the score."""

    def __init__(self, text, score):
        self.text = text
        self.score = score
        # The label is computed once, at construction time, from the score.
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        """Return the Sentiment constant that corresponds to ``self.score``.

        A score of 2 or less is NEGATIVE, a score equal to 3 is NEUTRAL,
        and every other score is POSITIVE.
        """
        rating = self.score
        if rating <= 2:
            return Sentiment.NEGATIVE
        if rating == 3:
            return Sentiment.NEUTRAL
        return Sentiment.POSITIVE
        
# Wraps a list of reviews and can rebalance it to equal POSITIVE / NEGATIVE counts.
class ReviewContainer:
    """Holds a list of review objects and exposes their texts and sentiment labels."""

    def __init__(self, reviews):
        self.reviews = reviews

    def get_text(self):
        """Return the ``text`` of every held review, in order."""
        return [review.text for review in self.reviews]

    def get_sentiment(self):
        """Return the ``sentiment`` of every held review, in order."""
        return [review.sentiment for review in self.reviews]

    def evenly_distribute(self):
        """Keep equally many NEGATIVE and POSITIVE reviews, then shuffle them.

        Reviews with any other sentiment (e.g. NEUTRAL) are dropped. Both
        classes are trimmed to the size of the smaller one, so the result is
        balanced regardless of which class is the majority. ``self.reviews``
        is rebound to a new list; the list passed to the constructor is not
        modified.
        """
        negative = [r for r in self.reviews if r.sentiment == Sentiment.NEGATIVE]
        positive = [r for r in self.reviews if r.sentiment == Sentiment.POSITIVE]

        # Trimming only the positives would leave the set unbalanced whenever
        # negatives outnumber positives, so cap both at the smaller count.
        keep = min(len(negative), len(positive))
        self.reviews = negative[:keep] + positive[:keep]
        random.shuffle(self.reviews)
        
        

## Load data

In [2]:
import json

file_name = './data/sentiment/Books_small_10000.json'

# `reviews` is the list of all Review objects; `review` is the single JSON
# record parsed from the current line.
reviews = []
# Each line is parsed as its own JSON document. Decode explicitly as UTF-8 so
# the result does not depend on the platform's default text encoding.
with open(file_name, encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines instead of failing in json.loads
        review = json.loads(line)
        # 'reviewText' becomes Review.text, 'overall' becomes Review.score.
        reviews.append(Review(review['reviewText'], review['overall']))

reviews[5].text

'I hoped for Mia to have some peace in this book, but her story is so real and raw.  Broken World was so touching and emotional because you go from Mia\'s trauma to her trying to cope.  I love the way the story displays how there is no "just bouncing back" from being sexually assaulted.  Mia showed us how those demons come for you every day and how sometimes they best you. I was so in the moment with Broken World and hurt with Mia because she was surrounded by people but so alone and I understood her feelings.  I found myself wishing I could give her some of my courage and strength or even just to be there for her.  Thank you Lizzy for putting a great character\'s voice on a strong subject and making it so that other peoples story may be heard through Mia\'s.'

## Prep data

In [39]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the reviews for testing; the fixed random_state makes the split repeatable.
training, test = train_test_split(
    reviews,
    test_size=0.33,
    random_state=42,
)

# Wrap both subsets in ReviewContainer so they can be balanced later.
train_container, test_container = ReviewContainer(training), ReviewContainer(test)

In [40]:
# Balance each container, then pull out parallel lists of texts and sentiment
# labels for the training and test sets.

# evenly_distribute() shuffles with the global `random` generator; seed it so a
# fresh Restart & Run All produces the same ordering every time.
random.seed(42)

train_container.evenly_distribute()
train_x = train_container.get_text()
train_y = train_container.get_sentiment()

test_container.evenly_distribute()
test_x = test_container.get_text()
test_y = test_container.get_sentiment()

# Sanity check: the two counts should be equal after balancing.
print(train_y.count(Sentiment.POSITIVE))
print(train_y.count(Sentiment.NEGATIVE))


436
436


### Bag-of-words vectorization

In [41]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer 


# TF-IDF = term frequency - inverse document frequency.
# CountVectorizer() (imported above) can be swapped in here as an alternative.
vectorizer = TfidfVectorizer()

# Fit the vectorizer on the training text only; the test text is merely
# transformed with the already-fitted vectorizer.
train_x_vectors = vectorizer.fit_transform(train_x)
test_x_vectors = vectorizer.transform(test_x)

# Show the first training review followed by its dense feature row.
print(train_x[0], train_x_vectors[0].toarray(), sep="\n")

# The models below are trained on (train_x_vectors, train_y).



This was our book club pick this month, so I chose to finish it.  I would not have gone much past half way otherwise.  It became so disjointed and boring that I scanned the last third just to get through to the end.  This author needed a good editor.
[[0. 0. 0. ... 0. 0. 0.]]


 ## Classification
 
 ### Linear SVM

In [42]:
from sklearn import svm
# Linear-kernel support vector classifier fitted on the training vectors
# (fit() returns the estimator itself, so it can be chained).
clf_svm = svm.SVC(kernel='linear').fit(train_x_vectors, train_y)

# Predict the label of the first test review as a quick check.
clf_svm.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

### Decision Tree

In [68]:
from sklearn.tree import DecisionTreeClassifier

# Fix random_state: without it the fitted tree, and therefore the scores
# reported below, can differ from one run to the next.
clf_dec = DecisionTreeClassifier(random_state=42)
clf_dec.fit(train_x_vectors, train_y)

# Predict the label of the first test review as a quick check.
clf_dec.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

### Naive Bayes

In [44]:
from sklearn.naive_bayes import GaussianNB


clf_gnb = GaussianNB()

# GaussianNB requires dense input, so the sparse TF-IDF matrices are densified.
# .toarray() yields a plain ndarray; .todense() yields np.matrix, which recent
# scikit-learn releases reject as estimator input.
train_x_vectors_dense = train_x_vectors.toarray()
test_x_vectors_dense = test_x_vectors.toarray()

clf_gnb.fit(train_x_vectors_dense, train_y)

# Slice with [:1] so the single sample stays 2-D, as predict() requires.
clf_gnb.predict(test_x_vectors_dense[:1])

array(['NEGATIVE'], dtype='<U8')

### LogisticRegression

In [45]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings, fitted on the training vectors
# (fit() returns the estimator itself, so it can be chained).
clf_log = LogisticRegression().fit(train_x_vectors, train_y)

# Predict the label of the first test review as a quick check.
clf_log.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

## Evaluation

### Mean accuracy

In [71]:
# Mean accuracy on the test set, printed in this order:
# SVM, decision tree, Gaussian naive Bayes, logistic regression.
print(clf_svm.score(test_x_vectors, test_y))
print(clf_dec.score(test_x_vectors, test_y))
# GaussianNB needs dense input; .toarray() gives an ndarray, whereas
# .todense() gives np.matrix, which recent scikit-learn releases reject.
print(clf_gnb.score(test_x_vectors.toarray(), test_y))
print(clf_log.score(test_x_vectors, test_y))

0.8076923076923077
0.6514423076923077
0.6610576923076923
0.8052884615384616


### F1 score

In [47]:
from sklearn.metrics import f1_score

# f1_score(y_true, y_pred, average=None, labels=[...]) returns one score per label.
# The data sets were balanced over POSITIVE and NEGATIVE only, so only those
# two labels are requested.
f1_labels = [Sentiment.POSITIVE, Sentiment.NEGATIVE]

# (classifier, test features it expects); GaussianNB is given the dense features.
for fitted_model, model_inputs in (
    (clf_svm, test_x_vectors),
    (clf_dec, test_x_vectors),
    (clf_gnb, test_x_vectors_dense),
    (clf_log, test_x_vectors),
):
    print(f1_score(test_y, fitted_model.predict(model_inputs),
                   average=None,
                   labels=f1_labels))

# Each printed array is ordered like `labels`: [POSITIVE, NEGATIVE].

[0.80582524 0.80952381]
[0.63942308 0.63942308]
[0.65693431 0.66508314]
[0.80291971 0.80760095]


In [48]:
# Quick manual check on a few hand-written snippets.
# The strings are kept exactly as they were when the recorded output was produced.
test_set = [
    "very fun",
    "bad book do not buy",
    "horible waste of time",
]
new_test = vectorizer.transform(test_set)

clf_svm.predict(new_test)

array(['POSITIVE', 'NEGATIVE', 'NEGATIVE'], dtype='<U8')

## Tuning our model (with grid search)

In [72]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Candidate SVC settings: both kernels crossed with five values of C.
tuned_parameters = [
    {
        'kernel': ['linear', 'rbf'],
        'C': [1, 4, 8, 16, 32],
    }
]

# Grid search with cv=5 over the candidates, fitted on the training vectors.
# fit() returns the search object itself, so the assignment can be chained.
clf = GridSearchCV(SVC(), tuned_parameters, cv=5).fit(train_x_vectors, train_y)
clf


GridSearchCV(cv=5, estimator=SVC(),
             param_grid=[{'C': [1, 4, 8, 16, 32], 'kernel': ['linear', 'rbf']}])

### Check performance of the tuned model (clf) on mean accuracy

In [79]:
# Mean accuracy of the grid-searched model on the test vectors.
tuned_accuracy = clf.score(test_x_vectors, test_y)
print(tuned_accuracy)

# In the recorded run this is slightly higher than the untuned linear SVM's accuracy.

0.8197115384615384


### Check performance of the tuned model (clf) on f1_score

In [81]:
# Per-label F1 of the grid-searched model, ordered [POSITIVE, NEGATIVE].
tuned_predictions = clf.predict(test_x_vectors)
print(f1_score(test_y, tuned_predictions,
               average=None,
               labels=[Sentiment.POSITIVE, Sentiment.NEGATIVE]))


[0.82269504 0.81662592]
