### Import Json and Load File

In [1]:
from collections import Counter

import pandas as pd
import numpy as np
import json
import warnings
warnings.filterwarnings('ignore')

file = 'Books_small_10000.json'

reviews = []
with open(file) as f:
    for line in f:
        review = json.loads(line)
        reviews.append(review)
        
books_df = pd.DataFrame(reviews)
books_df = books_df[['reviewText', 'overall']]
books_df.head()

Unnamed: 0,reviewText,overall
0,"I bought both boxed sets, books 1-5. Really a...",5.0
1,I enjoyed this short book. But it was way way ...,3.0
2,I love Nicholas Sparks. I&#8217;ve read everyt...,4.0
3,I really enjoyed this adventure and look forwa...,4.0
4,It was a decent read.. typical story line. Not...,3.0


### Preprocessing Data

In [2]:
# Change columns name
books_df.columns = ['text', 'sentiment']
books_df.head()

Unnamed: 0,text,sentiment
0,"I bought both boxed sets, books 1-5. Really a...",5.0
1,I enjoyed this short book. But it was way way ...,3.0
2,I love Nicholas Sparks. I&#8217;ve read everyt...,4.0
3,I really enjoyed this adventure and look forwa...,4.0
4,It was a decent read.. typical story line. Not...,3.0


In [3]:
for i in range(len(books_df)):
    if books_df.iloc[i, 1] <= 2:
        books_df.iloc[i, 1] = 'Negative'
    elif 2 < books_df.iloc[i, 1] < 4:
        books_df.iloc[i, 1] = 'Neutral'
    else:
        books_df.iloc[i, 1] = 'Positive'
        
books_df.head()

Unnamed: 0,text,sentiment
0,"I bought both boxed sets, books 1-5. Really a...",Positive
1,I enjoyed this short book. But it was way way ...,Neutral
2,I love Nicholas Sparks. I&#8217;ve read everyt...,Positive
3,I really enjoyed this adventure and look forwa...,Positive
4,It was a decent read.. typical story line. Not...,Neutral


In [4]:
books_df = books_df[books_df.sentiment != 'Neutral']

In [5]:
X = books_df.text
y = books_df.sentiment

In [6]:
print(Counter(y).items())

dict_items([('Positive', 8378), ('Negative', 644)])


### Bag of Words Vectorization

In [7]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(X)
y_tfidf = tfidf.fit_transform(y)

### UnderSampling Data

In [8]:
from imblearn.under_sampling import NearMiss

nm = NearMiss(sampling_strategy='all')
X_res, y_res = nm.fit_resample(X_tfidf, y_tfidf)

In [9]:
print(Counter(y_res[:,0]).items())

dict_items([(0, 644), (1, 644)])


### Preparation Data

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.3, random_state=22)

### Classification

#### Linear SVM

In [11]:
from sklearn.svm import SVC

books_svc = SVC(kernel='linear')
books_svc.fit(X_train, y_train)

y_pred_svc = books_svc.predict(X_test)

print(confusion_matrix(y_test, y_pred_svc))
print(classification_report(y_test, y_pred_svc))

[[175  24]
 [ 17 171]]
              precision    recall  f1-score   support

           0       0.91      0.88      0.90       199
           1       0.88      0.91      0.89       188

    accuracy                           0.89       387
   macro avg       0.89      0.89      0.89       387
weighted avg       0.89      0.89      0.89       387



#### Decision Tree

In [12]:
from sklearn.tree import DecisionTreeClassifier

books_tree = DecisionTreeClassifier()
books_tree.fit(X_train, y_train)

y_pred_tree = books_tree.predict(X_test)

print(confusion_matrix(y_test, y_pred_tree))
print(classification_report(y_test, y_pred_tree))

[[149  50]
 [ 53 135]]
              precision    recall  f1-score   support

           0       0.74      0.75      0.74       199
           1       0.73      0.72      0.72       188

    accuracy                           0.73       387
   macro avg       0.73      0.73      0.73       387
weighted avg       0.73      0.73      0.73       387



#### Logistic Regression

In [13]:
from sklearn.linear_model import LogisticRegression

books_reg = LogisticRegression()
books_reg.fit(X_train, y_train)

y_pred_reg = books_reg.predict(X_test)

print(confusion_matrix(y_test, y_pred_reg))
print(classification_report(y_test, y_pred_reg))

[[168  31]
 [ 22 166]]
              precision    recall  f1-score   support

           0       0.88      0.84      0.86       199
           1       0.84      0.88      0.86       188

    accuracy                           0.86       387
   macro avg       0.86      0.86      0.86       387
weighted avg       0.86      0.86      0.86       387



#### Naive Bayes

In [14]:
from sklearn.naive_bayes import GaussianNB

books_nb = LogisticRegression()
books_nb.fit(X_train, y_train)

y_pred_nb = books_nb.predict(X_test)

print(confusion_matrix(y_test, y_pred_nb))
print(classification_report(y_test, y_pred_nb))

[[168  31]
 [ 22 166]]
              precision    recall  f1-score   support

           0       0.88      0.84      0.86       199
           1       0.84      0.88      0.86       188

    accuracy                           0.86       387
   macro avg       0.86      0.86      0.86       387
weighted avg       0.86      0.86      0.86       387



### Evaluation

#### Mean Accuracy

In [15]:
print(books_svc.score(X_test, y_test))
print(books_tree.score(X_test, y_test))
print(books_reg.score(X_test, y_test))
print(books_nb.score(X_test, y_test))

0.8940568475452196
0.7338501291989664
0.8630490956072352
0.8630490956072352


#### F1 Score

In [16]:
from sklearn.metrics import f1_score

print(f1_score(y_test, y_pred_svc, average=None))
print(f1_score(y_test, y_pred_tree, average=None))
print(f1_score(y_test, y_pred_reg, average=None))
print(f1_score(y_test, y_pred_nb, average=None))

[0.89514066 0.89295039]
[0.74314214 0.72386059]
[0.86375321 0.86233766]
[0.86375321 0.86233766]


### Tuning Model With Grid Search

In [17]:
from sklearn.model_selection import GridSearchCV

parameters = {'kernel': ['linear', 'rbf'], 'C': [0.1, 1, 10, 100]}

grid = GridSearchCV(SVC(), parameters, cv=5)
grid.fit(X_train, y_train)

print(grid.best_params_)

{'C': 1, 'kernel': 'linear'}


### Saving Model

In [19]:
import joblib

joblib.dump(books_svc, filename='model_sentiment.pkl')

['model_sentiment.pkl']

### Load Model

In [20]:
loaded_books = joblib.load(filename='model_sentiment.pkl')