# DATA CLASS

In [15]:
import random

class Sentiment:
    """String labels for the sentiment classes a review can be assigned."""

    NEGATIVE, NEUTRAL, POSITIVE = 'NEGATIVE', 'NEUTRAL', 'POSITIVE'

class Review:
    """A single review: its text, its numeric score and a derived sentiment label.

    Attributes (all set in the constructor):
        text:      the review text passed in by the caller.
        score:     the numeric rating passed in by the caller.
        sentiment: a ``Sentiment`` label computed from ``score``.
    """

    def __init__(self, text, score):
        self.text = text
        self.score = score
        # Derived once at construction; it is not refreshed if ``score`` is
        # reassigned afterwards.
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        """Map ``self.score`` to a ``Sentiment`` label.

        Returns:
            ``Sentiment.NEGATIVE`` when score <= 2,
            ``Sentiment.NEUTRAL``  when 2 < score <= 3,
            ``Sentiment.POSITIVE`` otherwise.
        """
        if self.score <= 2:
            return Sentiment.NEGATIVE
        # Range check instead of ``== 3``: with the equality test a fractional
        # score such as 2.5 fell through to the final branch and was labelled
        # POSITIVE. Integer scores behave exactly as before.
        if self.score <= 3:
            return Sentiment.NEUTRAL
        return Sentiment.POSITIVE
        

class ReviewContainer:
    """Holds a list of review objects and exposes their texts and sentiment labels."""

    def __init__(self, reviews):
        self.reviews = reviews

    def evenly_distribute(self):
        """Rebalance ``self.reviews`` to equal numbers of NEGATIVE and POSITIVE reviews.

        ``self.reviews`` is replaced by a new, shuffled list; the list object
        originally passed in is not modified. Reviews whose sentiment is
        neither NEGATIVE nor POSITIVE (e.g. NEUTRAL) are dropped.
        Shuffling uses the module-level ``random`` generator.
        """
        negative = [r for r in self.reviews if r.sentiment == Sentiment.NEGATIVE]
        positive = [r for r in self.reviews if r.sentiment == Sentiment.POSITIVE]
        # Trim BOTH classes to the size of the smaller one. Previously only the
        # positives were trimmed, so a set with more negatives than positives
        # stayed unbalanced.
        keep = min(len(negative), len(positive))
        self.reviews = negative[:keep] + positive[:keep]
        random.shuffle(self.reviews)

    def get_text(self):
        """Return the ``text`` of every review, in the current order."""
        return [x.text for x in self.reviews]

    def get_sentiment(self):
        """Return the ``sentiment`` of every review, in the current order."""
        return [x.sentiment for x in self.reviews]


# LOAD DATA

In [2]:
import json

file_name = 'amazon_reviews.json'

# The file is read as JSON Lines: every non-empty line is parsed on its own
# and must provide the 'reviewText' and 'overall' keys.
reviews = []
# Explicit encoding so decoding does not depend on the platform's default
# locale encoding. Assumes the file is UTF-8 (the standard JSON encoding) —
# TODO confirm for this dataset.
with open(file_name, encoding='utf-8') as f:
    for line in f:
        # A blank line (e.g. stray trailing newline) would make json.loads raise.
        if not line.strip():
            continue
        review = json.loads(line)
        reviews.append(Review(review['reviewText'], review['overall']))

# PREP DATA

In [17]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the reviews for testing; the fixed random_state keeps the
# split the same on every run.
training, test = train_test_split(
    reviews,
    test_size=0.33,
    random_state=42,
)

# Wrap each split in a container so it can be balanced and unpacked into
# texts / labels.
training_cont, test_cont = map(ReviewContainer, (training, test))


In [26]:
# evenly_distribute() shuffles with the global `random` generator. Seed it
# first so the sample order (and everything derived from it below) is the
# same on every Restart & Run All.
random.seed(42)

# Balance the training split, then pull out the texts (x) and labels (y).
training_cont.evenly_distribute()
train_x = training_cont.get_text()
train_y = training_cont.get_sentiment()

# Same treatment for the test split.
test_cont.evenly_distribute()
test_x = test_cont.get_text()
test_y = test_cont.get_sentiment()

#### Bag-of-Words Vectorization (TF-IDF)

In [31]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

vectorizer = TfidfVectorizer()

# Tuple elements are evaluated left to right, so the vectorizer is fitted on
# the training texts before the test texts are transformed with it.
train_x_vectors, test_x_vectors = (
    vectorizer.fit_transform(train_x),
    vectorizer.transform(test_x),
)



# CLASSIFICATION


#### Linear SVM

In [32]:
from sklearn import svm

# Support-vector classifier with a linear kernel, trained on the vectorized
# training texts. fit() returns the estimator, so it can be chained.
clf_svm = svm.SVC(kernel='linear').fit(train_x_vectors, train_y)

# Bare name as the last expression so the cell still displays the estimator.
clf_svm



SVC(kernel='linear')

#### Decision Tree

In [33]:
from sklearn.tree import DecisionTreeClassifier

# random_state is pinned because DecisionTreeClassifier randomly permutes the
# features at each split; without a fixed seed the fitted tree, and therefore
# the prediction below, can differ from run to run.
clf_dec = DecisionTreeClassifier(random_state=42)
clf_dec.fit(train_x_vectors, train_y)

# Sanity check: predicted label for the first test vector.
clf_dec.predict(test_x_vectors[0])


array(['NEGATIVE'], dtype='<U8')

# Evaluation

In [34]:
# Mean accuracy of the linear SVM on the test vectors / labels.
print(
    clf_svm.score(
        X=test_x_vectors,
        y=test_y,
    )
)


0.8076923076923077


In [35]:
# F1 Scores
from sklearn.metrics import f1_score

# average=None yields one F1 value per class, ordered as listed in `labels`
# (POSITIVE first, then NEGATIVE).
f1_score(
    y_true=test_y,
    y_pred=clf_svm.predict(test_x_vectors),
    labels=[Sentiment.POSITIVE, Sentiment.NEGATIVE],
    average=None,
)


array([0.80582524, 0.80952381])

#### Tuning Model Using Grid Search

In [37]:
from sklearn.model_selection import GridSearchCV

# Search space: each kernel is combined with each value of C.
parameters = dict(
    kernel=('linear', 'rbf'),
    C=(1, 4, 8, 16, 32),
)

# 5-fold cross-validated grid search over a default-constructed SVC.
svc = svm.SVC()
clf = GridSearchCV(estimator=svc, param_grid=parameters, cv=5)
clf.fit(train_x_vectors, train_y)

GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': (1, 4, 8, 16, 32), 'kernel': ('linear', 'rbf')})

# Saving Model

In [39]:
import pickle

# Serialize the fitted grid-search object to disk. Only `clf` is written; the
# fitted `vectorizer` is not part of this file.
with open('sentiment_classifer.pkl', 'wb') as f:
    pickle.Pickler(f).dump(clf)

In [40]:
# Read the classifier back from disk.
# NOTE: unpickling executes code embedded in the file — only load pickle
# files that come from a trusted source.
with open('sentiment_classifer.pkl', 'rb') as f:
    loaded_clf = pickle.Unpickler(f).load()

In [46]:
# Sanity check: the reloaded model predicts a label for the first test vector.
loaded_clf.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')