## Data set class

In [60]:
class Sentiment:
    NEGATIVE = "NEGATIVE"
    NEUTRAL = "NEUTRAL"
    POSITIVE = "POSITIVE"

class Review:
    def __init__(self, text, score):
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()
        
    def get_sentiment(self):
        if self.score <= 2:
            return Sentiment.NEGATIVE
        elif self.score == 3:
            return Sentiment.NEUTRAL
        else: #score of 4 or 5
            return Sentiment.POSITIVE
        
        

## loading the dataset

In [2]:
import json

file_name = './data/sentiment/Books_small.json'

#take of the differece(reviews,review)
reviews = []
with open(file_name) as f:
    for line in f:
        review = json.loads(line)
        reviews.append(Review(review['reviewText'], review['overall'])) #reviewText(text) and overallRating(score)
        
reviews[5].text

'Love the book, great story line, keeps you entertained.for a first novel from this author she did a great job,  Would definitely recommend!'

## Prep data

In [5]:
from sklearn.model_selection import train_test_split

training, test = train_test_split(reviews,test_size=0.33,random_state=42,)

In [61]:
#arrays of text and sentiment for trainning and testing set
train_x = [x.text for x in training]
train_y = [y.sentiment for y in training]

test_x = [x.text for x in test]
test_y = [y.sentiment for y in test]

### Bags of words Vectorization 

In [62]:
from sklearn.feature_extraction.text import CountVectorizer


vectorizer = CountVectorizer()
train_x_vectors = vectorizer.fit_transform(train_x)
test_x_vectors = vectorizer.transform(test_x) ## because we dont want to fit another model so we just transform


print(train_x[0])
print(train_x_vectors[0].toarray())

#we are going to create models for our training set (train_x_vector, train_y)



Vivid characters and descriptions. The author has created a tale that grabs your attention and I couldn't put it down.
[[0 0 0 ... 0 0 0]]


 ## Classification
 
 ### Lear SVM

In [63]:
from sklearn import svm

clf_svm = svm.SVC(kernel='linear')

clf_svm.fit(train_x_vectors, train_y)

 

clf_svm.predict(test_x_vectors[0]) 

array(['POSITIVE'], dtype='<U8')

### Decision Tree

In [64]:
from sklearn.tree import DecisionTreeClassifier

clf_dec = DecisionTreeClassifier()
clf_dec.fit(train_x_vectors, train_y)

clf_dec.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

### Naive Bayes

In [65]:
from sklearn.naive_bayes import GaussianNB


clf_gnb = GaussianNB()


#A sparse matrix was passed, but dense data is required. Use .todense() to convert to a dense numpy array.
train_x_vectors_dense = train_x_vectors.todense()
test_x_vectors_dense= test_x_vectors.todense()


clf_gnb.fit(train_x_vectors_dense, train_y)

clf_gnb.predict(test_x_vectors_dense[0])

array(['POSITIVE'], dtype='<U8')

### LogisticRegression

In [66]:
from sklearn.linear_model import LogisticRegression

clf_log = LogisticRegression()
clf_log.fit(train_x_vectors, train_y)

 

clf_log.predict(test_x_vectors[0])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


array(['POSITIVE'], dtype='<U8')

## Evaluation

### mean Accuracy

In [67]:
print(clf_svm.score(test_x_vectors, test_y))
print(clf_dec.score(test_x_vectors, test_y))
print(clf_gnb.score(test_x_vectors.todense(), test_y))
print(clf_log.score(test_x_vectors, test_y))

0.8242424242424242
0.7545454545454545
0.8121212121212121
0.8303030303030303


### F1 score

In [77]:
from sklearn.metrics import f1_score

#[y_true, x_predict,average=None, Pos_lables]

print(f1_score(test_y, clf_svm.predict(test_x_vectors),
               average=None, 
               labels=[Sentiment.POSITIVE,Sentiment.NEUTRAL,Sentiment.NEGATIVE]))

print(f1_score(test_y,
               clf_dec.predict(test_x_vectors), 
               average=None, labels=[Sentiment.POSITIVE,Sentiment.NEUTRAL,Sentiment.NEGATIVE]))

print(f1_score(test_y, clf_gnb.predict(test_x_vectors_dense),
               average=None, 
               labels=[Sentiment.POSITIVE,Sentiment.NEUTRAL,Sentiment.NEGATIVE]))

print(f1_score(test_y,
               clf_log.predict(test_x_vectors), 
               average=None, labels=[Sentiment.POSITIVE,Sentiment.NEUTRAL,Sentiment.NEGATIVE]))

#Results arrage according to pos_labels

[0.91319444 0.21052632 0.22222222]
[0.86219081 0.12698413 0.06451613]
[0.89678511 0.08510638 0.09090909]
[0.91370558 0.12244898 0.1       ]
