In [1]:
import pandas as pd

In [2]:
import json

# Getting Data

In [3]:
reviews = []
rating = []
with open('Books_small_10000.json') as f:
    for line in f:
        review = json.loads(line)
        reviews.append(review['reviewText'])
        rating.append(review['overall'])

In [4]:
reviews = pd.DataFrame(reviews)
rating = pd.DataFrame(rating)

In [5]:
df = pd.concat([reviews, rating], axis=1)

In [6]:
df.columns = ['reviews', 'ratings']

In [7]:
df.head(5)

Unnamed: 0,reviews,ratings
0,"I bought both boxed sets, books 1-5. Really a...",5.0
1,I enjoyed this short book. But it was way way ...,3.0
2,I love Nicholas Sparks. I&#8217;ve read everyt...,4.0
3,I really enjoyed this adventure and look forwa...,4.0
4,It was a decent read.. typical story line. Not...,3.0


* If review is less than 2, it will be considered negative while review greater than 3 will be considered positive.

In [9]:
def get_sentiment(col):
    if col <=2:
        return 'Negative'
    elif col > 3:
        return 'Positive'

In [10]:
df['sentiment'] = df['ratings'].apply(get_sentiment)

In [11]:
df = df.dropna()

In [12]:
positive = df[df['sentiment'] == 'Positive'][:1000]
# since data is not equally distributed we'll take 1000 positive values only

In [13]:
not_positive = df[df['sentiment'] != 'Positive']

In [14]:
final_df = pd.concat([not_positive, positive])

In [287]:
# shuffle the dataframe and reset the index

In [15]:
final_df = final_df.sample(frac=1).reset_index(drop=True)

In [16]:
final_df = pd.get_dummies(final_df, columns=['sentiment'], drop_first=True)

In [294]:
# sum(final_df['ratings'] == 4)

279

# Train test Split

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
X = final_df['reviews']
y = final_df['sentiment_Positive']

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Text preprocessing

In [20]:
import string
from nltk.corpus import stopwords

In [21]:
def text_process(mess):
    no_punc = ''.join([char for char in mess if char not in string.punctuation])
    return [char for char in no_punc.split() if char.lower() not in stopwords.words('english')]

In [22]:
final_df['reviews'][1:5].apply(text_process)

1    [finished, Cast, Sorrows, cant, wait, Cast, Fl...
2    [love, Pauls, heart, way, shares, messages, bo...
3    [Damn, book, beginning, end, since, first, boo...
4    [book, took, really, long, time, go, anywhere,...
Name: reviews, dtype: object

In [23]:
from sklearn.feature_extraction.text import CountVectorizer

In [24]:
cv = CountVectorizer(analyzer=text_process)

In [25]:
X_train = cv.fit_transform(X_train)

In [26]:
X_test = cv.transform(X_test)

In [27]:
from sklearn.feature_extraction.text import TfidfTransformer

In [28]:
tf = TfidfTransformer()

In [29]:
X_train = tf.fit_transform(X_train)

In [30]:
X_test = tf.transform(X_test)

# Creating Model using naive Bayes

In [31]:
from sklearn.naive_bayes import MultinomialNB

In [32]:
model = MultinomialNB()

In [54]:
model.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [55]:
predict = model.predict(X_test)

In [56]:
from sklearn.metrics import classification_report

In [57]:
print(classification_report(predict, y_test))

              precision    recall  f1-score   support

           0       0.18      0.90      0.30        39
           1       0.99      0.65      0.79       455

    accuracy                           0.67       494
   macro avg       0.58      0.78      0.55       494
weighted avg       0.92      0.67      0.75       494



# Creating Model using SVC

In [58]:
from sklearn.svm import SVC

In [59]:
svc = SVC().fit(X_train, y_train)

In [60]:
predict = svc.predict(X_test)

In [61]:
print(classification_report(predict, y_test))

              precision    recall  f1-score   support

           0       0.53      0.83      0.65       121
           1       0.93      0.76      0.84       373

    accuracy                           0.78       494
   macro avg       0.73      0.80      0.74       494
weighted avg       0.83      0.78      0.79       494



# Grid Search

In [41]:
from sklearn.model_selection import GridSearchCV

In [42]:
parameter = {'C': [0.1, 1, 10, 100, 1000], 'gamma': [1, 0.1, 0.01, 0.001, 0.0001]}

In [43]:
grid = GridSearchCV(SVC(), parameter, refit=True, verbose=3)

In [44]:
grid.fit(X_train, y_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV] C=0.1, gamma=1 ..................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ...................... C=0.1, gamma=1, score=0.604, total=   0.4s
[CV] C=0.1, gamma=1 ..................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s


[CV] ...................... C=0.1, gamma=1, score=0.604, total=   0.4s
[CV] C=0.1, gamma=1 ..................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.7s remaining:    0.0s


[CV] ...................... C=0.1, gamma=1, score=0.609, total=   0.3s
[CV] C=0.1, gamma=1 ..................................................
[CV] ...................... C=0.1, gamma=1, score=0.609, total=   0.3s
[CV] C=0.1, gamma=1 ..................................................
[CV] ...................... C=0.1, gamma=1, score=0.609, total=   0.3s
[CV] C=0.1, gamma=0.1 ................................................
[CV] .................... C=0.1, gamma=0.1, score=0.604, total=   0.3s
[CV] C=0.1, gamma=0.1 ................................................
[CV] .................... C=0.1, gamma=0.1, score=0.604, total=   0.3s
[CV] C=0.1, gamma=0.1 ................................................
[CV] .................... C=0.1, gamma=0.1, score=0.609, total=   0.3s
[CV] C=0.1, gamma=0.1 ................................................
[CV] .................... C=0.1, gamma=0.1, score=0.609, total=   0.3s
[CV] C=0.1, gamma=0.1 ................................................
[CV] .

[CV] .................... C=10, gamma=0.01, score=0.604, total=   0.3s
[CV] C=10, gamma=0.01 ................................................
[CV] .................... C=10, gamma=0.01, score=0.604, total=   0.3s
[CV] C=10, gamma=0.01 ................................................
[CV] .................... C=10, gamma=0.01, score=0.609, total=   0.3s
[CV] C=10, gamma=0.01 ................................................
[CV] .................... C=10, gamma=0.01, score=0.613, total=   0.3s
[CV] C=10, gamma=0.01 ................................................
[CV] .................... C=10, gamma=0.01, score=0.609, total=   0.3s
[CV] C=10, gamma=0.001 ...............................................
[CV] ................... C=10, gamma=0.001, score=0.604, total=   0.3s
[CV] C=10, gamma=0.001 ...............................................
[CV] ................... C=10, gamma=0.001, score=0.604, total=   0.3s
[CV] C=10, gamma=0.001 ...............................................
[CV] .

[CV] ................. C=1000, gamma=0.001, score=0.857, total=   0.3s
[CV] C=1000, gamma=0.001 .............................................
[CV] ................. C=1000, gamma=0.001, score=0.839, total=   0.3s
[CV] C=1000, gamma=0.0001 ............................................
[CV] ................ C=1000, gamma=0.0001, score=0.604, total=   0.3s
[CV] C=1000, gamma=0.0001 ............................................
[CV] ................ C=1000, gamma=0.0001, score=0.604, total=   0.3s
[CV] C=1000, gamma=0.0001 ............................................
[CV] ................ C=1000, gamma=0.0001, score=0.609, total=   0.3s
[CV] C=1000, gamma=0.0001 ............................................
[CV] ................ C=1000, gamma=0.0001, score=0.613, total=   0.3s
[CV] C=1000, gamma=0.0001 ............................................
[CV] ................ C=1000, gamma=0.0001, score=0.609, total=   0.3s


[Parallel(n_jobs=1)]: Done 125 out of 125 | elapsed:   39.8s finished


GridSearchCV(cv=None, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [0.1, 1, 10, 100, 1000],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

In [45]:
grid.best_params_

{'C': 10, 'gamma': 0.1}

* Evaluating the model

In [46]:
grid_pred = grid.predict(X_test)

In [47]:
print(classification_report(grid_pred, y_test))

              precision    recall  f1-score   support

           0       0.72      0.78      0.75       176
           1       0.87      0.83      0.85       318

    accuracy                           0.81       494
   macro avg       0.80      0.81      0.80       494
weighted avg       0.82      0.81      0.82       494

