## Loading Libraries


In [26]:
import json
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer #<-- to vectorize bag of words
from sklearn.feature_extraction.text import TfidfVectorizer #<-- term frequency inverse document frequency
import random

## Data Class


In [2]:
class Sentiment:
    NEGATIVE = 'NEGATIVE'
    NEUTRAL = 'NEUTRAL'
    POSITIVE = 'POSITIVE'
    
class Review:
    def __init__(self, text, score):
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()
        
    def get_sentiment(self):
        if self.score <= 2:
            return Sentiment.NEGATIVE
        
        elif self.score == 3:
            return Sentiment.NEUTRAL
        else: #score of 4 or 5
            return Sentiment.POSITIVE
        
## to evenly distribute the data to increase the f1 score
class ReviewContainer:
    def __init__(self, reviews):
        self.reviews = reviews
        
    def get_text(self):
        return [x.text for x in self.reviews]
    
    def get_sentiment(self):
        return [x.sentiment for x in self.reviews]
        
    def evenly_distribute(self):
        negative = list(filter(lambda x: x.sentiment == Sentiment.NEGATIVE, self.reviews))
        positive = list(filter(lambda x: x.sentiment == Sentiment.POSITIVE, self.reviews))
        positive_shrunk = positive[:len(negative)]
        self.reviews = negative + positive_shrunk
        random.shuffle(self.reviews)

## Load Data

In [3]:
file_name = './data/sentiments/books_small_10000.json'

reviews = []
with open(file_name) as f:
    for line in f:
        review = json.loads(line)
        reviews.append(Review(review['reviewText'], review['overall']))

print(reviews[5].sentiment)
        

POSITIVE


## Prep Data

In [4]:
training, test = train_test_split(reviews, test_size = 0.33, random_state = 42) #random state will give the same split
                                                                # when random_state = 42 it will give the same result you got when you ran it the first time
#training[0].text

train_cont = ReviewContainer(training)
train_cont.evenly_distribute()

test_cont = ReviewContainer(test)
test_cont.evenly_distribute() # the model tends to determin data with 50-50 positive and negative data 
                              # this step is really unnecessary if the model good
len(train_cont.reviews)

872

In [5]:
train_x = train_cont.get_text() 
train_y = train_cont.get_sentiment()

# test_x = [x.text for x in test]
# test_y = [x.sentiment for x in test]

test_x = test_cont.get_text()
test_y = test_cont.get_sentiment()

print(test_y.count(Sentiment.POSITIVE))
print(test_y.count(Sentiment.NEGATIVE))
print(train_x[0])

208
208
I wasn't sure I would like this story.  But the story turned into a delightful feeling of hope for lovers


#### Bag of words vectorization

In [53]:
vectorizer = TfidfVectorizer()
# vectorizer = CountVectorizer()
train_x_vectors = vectorizer.fit_transform(train_x) #vectorize the train_x

# vectorizer.fit(train_x)
# train_x_vectors = vectorizer.transform(train_x) <-- can do the same thing fit_transform but in two steps

test_x_vectors = vectorizer.transform(test_x)

print(train_x[0])
print(train_x_vectors[0].toarray())

I wasn't sure I would like this story.  But the story turned into a delightful feeling of hope for lovers
[[0. 0. 0. ... 0. 0. 0.]]


## Classification

### Linear svm

In [54]:
from sklearn import svm

clf_svm = svm.SVC(kernel = 'linear')

In [55]:
clf_svm.fit(train_x_vectors, train_y)

test_x[0]

clf_svm.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

### Decision Tree

In [56]:
from sklearn.tree import DecisionTreeClassifier

clf_dec = DecisionTreeClassifier()

In [57]:
clf_dec.fit(train_x_vectors, train_y)

clf_dec.predict(test_x_vectors[0])

array(['NEGATIVE'], dtype='<U8')

### Naive Bayes classifier

In [58]:
from sklearn.naive_bayes import GaussianNB

clf_bayes = GaussianNB()

In [59]:
clf_bayes.fit(train_x_vectors.toarray(), train_y)

clf_bayes.predict(test_x_vectors[3].toarray())

array(['NEGATIVE'], dtype='<U8')

### Logistic Regression

In [60]:
from sklearn.linear_model import LogisticRegression

clf_log = LogisticRegression(solver='liblinear', multi_class='ovr')

In [61]:
clf_log.fit(train_x_vectors, train_y)

clf_log.predict(test_x_vectors[3])

array(['NEGATIVE'], dtype='<U8')





### KNN

In [62]:
from sklearn.neighbors import KNeighborsClassifier

clf_knn = KNeighborsClassifier(n_neighbors = 5)

In [63]:
clf_knn.fit(train_x_vectors, train_y)

clf_knn.predict(test_x_vectors[3])

array(['NEGATIVE'], dtype='<U8')

## Evaluation

In [64]:
# Mean Accuracy
print(clf_svm.score(test_x_vectors, test_y))
print(clf_dec.score(test_x_vectors, test_y))
print(clf_bayes.score(test_x_vectors.toarray(), test_y))
print(clf_log.score(test_x_vectors, test_y))
print(clf_knn.score(test_x_vectors, test_y))

0.8076923076923077
0.6274038461538461
0.6610576923076923
0.8028846153846154
0.6634615384615384


In [39]:
# F1 scores
from sklearn.metrics import f1_score

f1_score(test_y, clf_svm.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEGATIVE])

array([0.80582524, 0.80952381])

## Testing

In [40]:
#qualitative analysis
test_set = ['this book is fun', 'this book sucks', "very good"]
test_set_vectors = vectorizer.transform(test_set)
clf_svm.predict(test_set_vectors)

array(['POSITIVE', 'POSITIVE', 'POSITIVE'], dtype='<U8')

## Tuning the model (with grid search)

In [71]:
from sklearn.model_selection import GridSearchCV

parameters = {'kernel': ('linear', 'rbf'), 'C': (1, 4, 8, 16, 32), 'gamma': ('scale', 'auto')}

svc = svm.SVC()

clf = GridSearchCV(svc, parameters, cv=5) #how many times the data should be split and cross validate with specific parametes??
clf.fit(train_x_vectors, train_y)


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'C': (1, 4, 8, 16, 32), 'gamma': ('scale', 'auto'),
                         'kernel': ('linear', 'rbf')},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [72]:
clf.score(test_x_vectors, test_y)

0.8197115384615384

## Saving the model

In [73]:
import pickle

with open('./sentiment_classifier.pkl', 'wb') as f:
    pickle.dump(clf, f)

### Load model

In [78]:
with open('./sentiment_classifier.pkl', 'rb') as f:
    loaded_clf = pickle.load(f)

In [76]:
loaded_clf.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')