# Data Science Basics - Scikit Learn - Sentiments Model

## Predicting Positive or Negative Comments

In [None]:
class Sentiment:
    """String labels for the three sentiment classes used as prediction targets."""

    # Listed from most to least favourable. The values are plain strings, so
    # they can be compared, counted and printed directly.
    POSITIVE = "POSITIVE"
    NEUTRAL = "NEUTRAL"
    NEGATIVE = "NEGATIVE"

In [None]:
class Review:
    """A single review: its raw text, its numeric score and the derived sentiment."""

    def __init__(self, text, score):
        """Store ``text`` and ``score`` and derive ``sentiment`` from the score."""
        self.text, self.score = text, score
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        """Map ``self.score`` to a ``Sentiment`` label.

        A score of 2 or less is NEGATIVE, a score of exactly 3 is NEUTRAL and
        every other score is POSITIVE.
        """
        rating = self.score
        if rating <= 2:
            return Sentiment.NEGATIVE
        if rating == 3:
            return Sentiment.NEUTRAL
        return Sentiment.POSITIVE

In [None]:
# Not used yet, but intended for evenly distributing the training data so that no sentiment class
# outweighs the others and our model performs better.


class ReviewContainer:
    """Hold a collection of reviews and inspect them grouped by sentiment."""

    def __init__(self, reviews):
        """Keep a reference to ``reviews``, an iterable of ``Review`` objects."""
        self.reviews = reviews

    def evenly_distribute(self):
        """Print the text of the first review found for each sentiment class.

        NOTE: despite its name, this method does not rebalance
        ``self.reviews`` yet; it only groups the reviews by sentiment and
        prints one sample per group (negative, positive, neutral, in that
        order). A class with no reviews is skipped.
        """
        # Build real lists: in Python 3 ``filter`` returns a lazy iterator,
        # which cannot be indexed with ``[0]``.
        negative = [r for r in self.reviews if r.sentiment == Sentiment.NEGATIVE]
        positive = [r for r in self.reviews if r.sentiment == Sentiment.POSITIVE]
        neutral = [r for r in self.reviews if r.sentiment == Sentiment.NEUTRAL]

        # Print samples
        for group in (negative, positive, neutral):
            if group:
                print(group[0].text)

In [None]:
import json

In [None]:
# Run this first and check the models: on the small data set they do well on
# POSITIVE reviews but not on the other classes, as the scores below show.
# file_name = "books_small.json"

# Then run with this larger, more varied data set, on which all three
# sentiment classes perform well.

file_name = "data/books_big.json"

reviews = []

# Each line is parsed as one JSON object. Read the file explicitly as UTF-8 so
# the result does not depend on the platform's default encoding.
with open(file_name, encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines, e.g. a trailing newline at end of file
        review = json.loads(line)
        reviews.append(Review(review["reviewText"], review["overall"]))

# Sanity-check one parsed record.
print(reviews[1].text)
print(reviews[1].score)
print(reviews[1].sentiment)

## Prep Data

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 33% of the reviews for testing and train on the remaining 67%.
# The fixed random_state keeps the split the same from run to run.
training, test = train_test_split(
    reviews,
    random_state=42,
    test_size=0.33,
)

# Size of each split, followed by the first review text of each split.
print(len(training), len(test), sep="\n")
print(training[0].text, test[0].text, sep="\n")

## Training Data

In [None]:
# Training features (X) are the review texts; training targets (Y) are the
# sentiment labels: Positive, Negative or Neutral.
train_x = [review.text for review in training]
train_y = [review.sentiment for review in training]

# Show the first text/label pair.
print(train_x[0], train_y[0], sep="\n")

## Test Data

In [None]:
# Same split into texts (X) and sentiment labels (Y), this time for the test set.
test_x = [review.text for review in test]
test_y = [review.sentiment for review in test]

# Show the first text/label pair.
print(test_x[0], test_y[0], sep="\n")

## Bag of Words Vectorization (converts each text into a vector of word counts)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

### Training and Test Data Vectorization

In [None]:
vectorizer = CountVectorizer()

# fit_transform fits the vectorizer on the training texts and vectorizes them
# in a single call. The two-step equivalent would be:
#   vectorizer.fit(train_x)
#   train_x_vectors = vectorizer.transform(train_x)
train_x_vectors = vectorizer.fit_transform(train_x)

# The test texts are only transformed, never fitted.
test_x_vectors = vectorizer.transform(test_x)

# First training text, its sparse vector, and the same vector as a dense array.
print(
    train_x[0],
    train_x_vectors[0],
    train_x_vectors[0].toarray(),
    sep="\n",
)

## Classification

##### Linear SVM (Support Vector Machine)

In [None]:
from sklearn import svm

# Build a support vector classifier with a linear kernel and train it on the
# vectorized training texts; fit() returns the estimator, so the two steps
# can be chained.
clf_svm = svm.SVC(kernel="linear").fit(train_x_vectors, train_y)

# First test review and its bag-of-words vector.
print(test_x[0], test_x_vectors[0], sep="\n")

# Predicted sentiment for that review (shown as the cell's output).
clf_svm.predict(test_x_vectors[0])

#### SVM Accuracy

In [None]:
# Accuracy of the linear SVM on the held-out test set (shown as the cell's output).
clf_svm.score(test_x_vectors, test_y)

##### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Build a decision tree with default settings and train it on the vectorized
# training texts; fit() returns the estimator, so the two steps can be chained.
clf_dec = DecisionTreeClassifier().fit(train_x_vectors, train_y)

# Predicted sentiment for the first test review (shown as the cell's output).
clf_dec.predict(test_x_vectors[0])

#### Decision Tree Accuracy

In [None]:
# Accuracy of the decision tree on the held-out test set (shown as the cell's output).
clf_dec.score(test_x_vectors, test_y)

#### Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

clf_gnb = GaussianNB()

# GaussianNB needs a dense array. Use .toarray() (a plain ndarray) rather than
# .todense(): the latter returns np.matrix, which recent scikit-learn releases
# reject with a TypeError.
clf_gnb.fit(train_x_vectors.toarray(), train_y)

# Densify only the first test row. Slicing the sparse matrix keeps the row
# two-dimensional, which is the shape predict() expects.
clf_gnb.predict(test_x_vectors[0].toarray())

#### Naive Bayes Accuracy

In [None]:
# Accuracy of Gaussian naive Bayes on the held-out test set. The features are
# densified with .toarray() because .todense() returns np.matrix, which recent
# scikit-learn releases reject with a TypeError.
clf_gnb.score(test_x_vectors.toarray(), test_y)

#### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Build a logistic regression model with default settings and train it on the
# vectorized training texts; fit() returns the estimator, so the two steps
# can be chained.
clf_log = LogisticRegression().fit(train_x_vectors, train_y)

# Predicted sentiment for the first test review (shown as the cell's output).
clf_log.predict(test_x_vectors[0])

#### Logistic Regression Accuracy

In [None]:
# Accuracy of logistic regression on the held-out test set (shown as the cell's output).
clf_log.score(test_x_vectors, test_y)

## Accuracy

In [None]:
# Mean accuracy of each classifier on the test set, printed in the order:
# SVM, decision tree, naive Bayes, logistic regression.

print(clf_svm.score(test_x_vectors, test_y))
print(clf_dec.score(test_x_vectors, test_y))
# GaussianNB needs dense input; .toarray() yields a plain ndarray, whereas
# .todense() yields np.matrix, which recent scikit-learn releases reject.
print(clf_gnb.score(test_x_vectors.toarray(), test_y))
print(clf_log.score(test_x_vectors, test_y))

## F1 Score

In [None]:
# F1 Score

from sklearn.metrics import f1_score

# Per-class F1 for the SVM, with the label order left to scikit-learn.
print(f1_score(test_y, clf_svm.predict(test_x_vectors), average=None))

# Per-class F1 for every classifier, reported in the fixed order
# POSITIVE, NEUTRAL, NEGATIVE. With books_small.json each model is good on
# POSITIVE but bad on the other classes; it gets better with books_big.json.
sentiment_labels = [Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE]

# Printed in the order: SVM, decision tree, naive Bayes, logistic regression.
# Only GaussianNB needs dense input; .toarray() yields a plain ndarray, whereas
# .todense() yields np.matrix, which recent scikit-learn releases reject.
# Logistic regression is evaluated on the sparse matrix, as it is elsewhere.
for clf, features in (
    (clf_svm, test_x_vectors),
    (clf_dec, test_x_vectors),
    (clf_gnb, test_x_vectors.toarray()),
    (clf_log, test_x_vectors),
):
    print(
        f1_score(
            test_y,
            clf.predict(features),
            average=None,
            labels=sentiment_labels,
        )
    )

# The models seem to predict POSITIVE well but the other classes very badly,
# which could be a model issue or a data issue.
# The models perform better after switching to books_big.json.

## Looking at training data to fix the model

In [None]:
# Count how many training examples fall into each sentiment class.
print(f"TOTAL: {len(train_y)}")
print(f"POSITIVE : {train_y.count(Sentiment.POSITIVE)}")
print(f"NEUTRAL : {train_y.count(Sentiment.NEUTRAL)}")
print(f"NEGATIVE : {train_y.count(Sentiment.NEGATIVE)}")

# As the counts show, the training data contains more POSITIVE cases than the other classes.

## Sample Testing

In [None]:
# Hand-written examples for a quick check of every classifier.
test_set = [
    "I thoroughly enjoyed this, 5 stars",
    "bad book do not buy",
    "horrible waste of time",
]
# Vectorize the new texts with the already-fitted vectorizer.
new_test = vectorizer.transform(test_set)

print(clf_svm.predict(new_test))
print(clf_log.predict(new_test))
print(clf_dec.predict(new_test))
# GaussianNB needs dense input; .toarray() yields a plain ndarray, whereas
# .todense() yields np.matrix, which recent scikit-learn releases reject.
print(clf_gnb.predict(new_test.toarray()))

## Tuning model with Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate hyper-parameters for the search: two kernels and five values of C.
parameters = dict(
    kernel=("linear", "rbf"),
    C=(1, 4, 8, 16, 32),
)

svc = svm.SVC()

# Grid search over the candidates using 5-fold cross-validation.
clf_grd = GridSearchCV(estimator=svc, param_grid=parameters, cv=5)

clf_grd.fit(train_x_vectors, train_y)

## Grid Search Accuracy

In [None]:
# Evaluate the grid-searched classifier on the held-out test set.
# NOTE(review): no `scoring` argument was passed to GridSearchCV, so this is
# presumably the SVC's mean accuracy rather than an F1 score; confirm.
print(clf_grd.score(test_x_vectors, test_y))

## Saving Model

In [None]:
import pickle
from pathlib import Path

# Create the target directory first so the dump does not fail with
# FileNotFoundError when ./models does not exist yet.
Path("./models").mkdir(parents=True, exist_ok=True)

# Persist the fitted grid-search classifier. Only the classifier is written to
# this file; the vectorizer is not included.
with open("./models/Scikit_Learn_Sentiment_Classifier.pkl", "wb") as f:
    pickle.dump(clf_grd, f)

## Importing the Model

In [None]:
# Restore the pickled classifier from disk.
# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# come from a trusted source.
with open("./models/Scikit_Learn_Sentiment_Classifier.pkl", mode="rb") as f:
    loaded_clf = pickle.load(f)

# First test review, followed by the restored model's prediction for it.
print(test_x[0], loaded_clf.predict(test_x_vectors[0]), sep="\n")