# Importing JSON File 

In [1]:
import json
import numpy as np
import pandas as pd
# Path to the review dataset, relative to the notebook's working directory.
# The loading cell reads it line by line and parses each line as one JSON record.
file_name='books_small.json'

# Creating a Review class which takes a review's text and score and derives its sentiment

In [2]:
class Review:
    """A single review: its text, its numeric score, and the derived sentiment.

    Attributes:
        text: the review body as passed in.
        score: the numeric rating as passed in.
        sentiment: label computed once at construction time by ``get_sent``.
    """

    def __init__(self, text, score):
        self.text = text
        self.score = score
        # Derived eagerly so callers can read `.sentiment` without recomputing.
        self.sentiment = self.get_sent()

    def get_sent(self):
        """Return the sentiment label for ``self.score``.

        Scores above 3 are "POSITIVE", exactly 3 is "NEUTRAL", and anything
        else is "NEGETIVE" (spelling kept as-is: other cells match on it).
        """
        rating = self.score
        if rating > 3:
            return "POSITIVE"
        return "NEUTRAL" if rating == 3 else "NEGETIVE"

        

In [3]:
# Parse the dataset one JSON record per line, collecting each review's text and
# the sentiment label derived from its 'overall' rating.
# Rows are gathered in plain Python lists and converted to arrays once at the
# end: np.append returns a fresh copy of the array on every call, so using it
# inside the loop makes loading quadratic in the number of reviews.
text_rows = []
sentiment_rows = []
with open(file_name) as f:
    for line in f:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file),
            # which json.loads would otherwise reject.
            continue
        review = json.loads(line)
        text_rows.append(review['reviewText'])
        sentiment_rows.append(Review(review['reviewText'], review['overall']).sentiment)

# `texts` holds the raw review strings; `reviews` holds the matching sentiment
# labels (strings, not Review objects), both in file order.
texts = np.array(text_rows)
reviews = np.array(sentiment_rows)
               

In [4]:

# Sanity check: how many sentiment labels were loaded (one per parsed record).
len(reviews)

1000

# Preparing data: train-test split

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
# Hold out 33% of the reviews for evaluation. Stratifying on the sentiment
# labels keeps the class proportions alike in both partitions, and the fixed
# random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    reviews,
    test_size=.33,
    random_state=42,
    stratify=reviews,
)

In [7]:
from sklearn import preprocessing
# Map the sentiment strings to integer class ids. LabelEncoder sorts its
# classes, so 'NEGETIVE' -> 0, 'NEUTRAL' -> 1, 'POSITIVE' -> 2. The spelling
# 'NEGETIVE' has to match the label produced by Review.get_sent().
# NOTE: this cell overwrites y_train / y_test, so it can only run once per
# split; re-running it would pass integers to an encoder fitted on strings.
le = preprocessing.LabelEncoder().fit(['NEGETIVE', 'NEUTRAL', 'POSITIVE'])
y_train, y_test = le.transform(y_train), le.transform(y_test)

In [8]:
# Confirm the container type of the training texts after the split.
type(X_train)

numpy.ndarray

In [9]:
# Number of reviews that ended up in the training partition.
len(X_train)

670

In [10]:
# Inspect the encoded training labels (integer class ids produced by `le`).
y_train

array([2, 1, 2, 1, 1, 2, 2, 2, 0, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1,
       2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 0, 0,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0,
       2, 2, 2, 2, 2, 0, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 0, 2, 1, 2, 2, 2, 1, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2,
       0, 2, 2, 1, 2, 1, 2, 1, 1, 2, 2, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1,
       2, 2, 2, 2, 2, 0, 2, 2, 2, 1, 1, 1, 2, 2, 2, 2, 2, 1, 2, 2, 1, 2,
       1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       0, 2, 2, 1, 2, 2, 2, 2, 0, 2, 1, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 1,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 0, 2, 2, 0, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 1, 2, 1, 2, 1, 1, 2, 0,

# Converting Text data into numerical data 

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
# NOTE: despite the name `count_vect`, this is a TF-IDF vectorizer, not a CountVectorizer.
count_vect = TfidfVectorizer()
# Fit the vectorizer on the training texts only and turn them into features ...
train_X_num=count_vect.fit_transform(X_train)
# ... then reuse that fitted vectorizer for the test texts (transform only, no refit),
# so nothing is learned from the held-out reviews.
test_X_num=count_vect.transform(X_test)

In [12]:
# NOTE(review): train_X_num is presumably a SciPy sparse matrix (test_X_num is displayed
# as one), in which case `.size` counts stored entries, not rows x columns - confirm.
train_X_num.size

42503

# Classification ML model

In [13]:
from sklearn import svm
# Fit a linear-kernel support vector classifier on the TF-IDF features, then
# spot-check it by predicting the class id of the first held-out review.
clf_svc = svm.SVC(kernel='linear').fit(train_X_num, y_train)
clf_svc.predict(test_X_num[0])

array([2])

In [14]:
# Display the test feature matrix summary (shape, dtype, storage format).
test_X_num

<330x7576 sparse matrix of type '<class 'numpy.float64'>'
	with 17548 stored elements in Compressed Sparse Row format>

In [15]:
# Score the SVM on the held-out set.
# NOTE(review): read this together with the per-class F1 scores computed a few
# cells later - a single overall number can hide classes that are never predicted.
clf_svc.score(test_X_num,y_test)

0.8363636363636363

In [16]:
# Raw text of the first held-out review, to read alongside its predicted class.
X_test[0]

'I would like anyone that loves to read know this series is wonderful. I have enjoyed everyone so far and looking forward to the rest.'

In [17]:
# Predicted class id for that first held-out review (same call as the spot-check after fitting).
clf_svc.predict(test_X_num[0])

array([2])

In [18]:
from sklearn.metrics import f1_score
# Per-class F1 for the SVM; average=None yields one score per class id
# instead of a single aggregate.
f1_score(
    y_true=y_test,
    y_pred=clf_svc.predict(test_X_num),
    average=None,
)

array([0.        , 0.        , 0.91089109])

# Logistic Regression Model

In [19]:
from sklearn.linear_model import LogisticRegression
# Fit a logistic-regression classifier (seeded for repeatability) on the
# TF-IDF features and report its score on the held-out set.
clf_lgc = LogisticRegression(random_state=0).fit(train_X_num, y_train)
clf_lgc.score(test_X_num, y_test)

0.8363636363636363

In [20]:
from sklearn.metrics import confusion_matrix
# Confusion matrix for logistic regression: rows are true class ids,
# columns are predicted class ids.
confusion_matrix(
    y_true=y_test,
    y_pred=clf_lgc.predict(test_X_num),
)

array([[  0,   0,  20],
       [  0,   0,  34],
       [  0,   0, 276]], dtype=int64)

In [21]:
# Per-class F1 for logistic regression (one score per class id).
f1_score(
    y_true=y_test,
    y_pred=clf_lgc.predict(test_X_num),
    average=None,
)

array([0.        , 0.        , 0.91089109])

# Decision Tree ML model

In [22]:
from sklearn.tree import DecisionTreeClassifier
# Fit a decision tree on the TF-IDF features and report its score on the
# held-out set.
# random_state is pinned because scikit-learn permutes the candidate features
# at every split, so an unseeded tree (and the scores derived from it) can
# change from one run to the next. 42 matches the seed of the train/test split.
clf_dec=DecisionTreeClassifier(random_state=42)
clf_dec.fit(train_X_num,y_train)
clf_dec.score(test_X_num,y_test)

0.7424242424242424

In [23]:
# Per-class F1 for the decision tree (one score per class id).
f1_score(
    y_true=y_test,
    y_pred=clf_dec.predict(test_X_num),
    average=None,
)

array([0.04761905, 0.23880597, 0.85662432])

# Model Tuning

In [24]:
from sklearn.model_selection import GridSearchCV
from sklearn import svm
# Exhaustive 5-fold cross-validated search over the SVM kernel and the
# regularisation strength C, fitted on the TF-IDF training features.
# No `scoring` argument is given, so candidates are ranked by the
# estimator's own score method.
parameters = dict(
    kernel=('linear', 'rbf'),
    C=(1, 4, 8, 16, 32),
)
svc = svm.SVC()
clf = GridSearchCV(estimator=svc, param_grid=parameters, cv=5)
clf.fit(train_X_num, y_train)

GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': (1, 4, 8, 16, 32), 'kernel': ('linear', 'rbf')})