In [6]:
import pandas as pd
import json

# Importing all the files

In [52]:
books = []
with open('Books_small.json') as book:
    for line in book:
        rb = json.loads(line)
        books.append(rb['reviewText'])
        
books = pd.DataFrame(books, columns = ['Review'])
books['code'] = 1

In [51]:
patio = []
with open('Patio_small.json') as pat:
    for line in pat:
        rb = json.loads(line)
        patio.append(rb['reviewText'])

patio = pd.DataFrame(patio, columns = ['Review'])
patio['code'] = 2

In [50]:
clothings = []
with open('Clothing_small.json') as clothing:
    for line in clothing:
        rb = json.loads(line)
        clothings.append(rb['reviewText'])

clothings = pd.DataFrame(clothings, columns = ['Review'])
clothings['code'] = 3

In [49]:
electronics = []
with open('Electronics_small.json') as electronic:
    for line in electronic:
        rb = json.loads(line)
        electronics.append(rb['reviewText'])

electronics = pd.DataFrame(electronics, columns = ['Review'])
electronics['code'] = 4

In [47]:
groceries = []
with open('Grocery_small.json') as grocery:
    for line in grocery:
        rb = json.loads(line)
        groceries.append(rb['reviewText'])

groceries = pd.DataFrame(groceries, columns = ['Review'])
groceries['code'] = 5

In [54]:
df = pd.concat([books, patio, clothings, electronics, groceries])

In [56]:
df = df.sample(frac=1).reset_index(drop=True)

# Text Preprocessing

In [58]:
import string
from nltk.corpus import stopwords

In [59]:
def text_process(mess):
    no_punc = ''.join([char for char in mess if char not in string.punctuation])
    return [char for char in no_punc.split() if char.lower() not in stopwords.words('english')]

In [72]:
X = df['Review']
y = df['code']

* Train Test Split

In [73]:
from sklearn.model_selection import train_test_split

In [74]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

* Count Vectorizer

In [75]:
from sklearn.feature_extraction.text import CountVectorizer

In [76]:
cv = CountVectorizer(analyzer=text_process)

In [77]:
X_train = cv.fit_transform(X_train)

In [78]:
X_test = cv.transform(X_test)

* TFIDF

In [79]:
from sklearn.feature_extraction.text import TfidfTransformer

tf = TfidfTransformer()

X_train = tf.fit_transform(X_train)


In [80]:
X_test = tf.transform(X_test)

# Creating Model

In [81]:
from sklearn.naive_bayes import MultinomialNB

In [82]:
model = MultinomialNB()

In [83]:
model.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [84]:
predict = model.predict(X_test)

* Evaluating the model

In [85]:
from sklearn.metrics import classification_report

In [86]:
print(classification_report(predict, y_test))

              precision    recall  f1-score   support

           1       0.96      0.98      0.97       292
           2       0.96      0.84      0.90       335
           3       0.94      0.93      0.94       311
           4       0.84      0.97      0.90       261
           5       0.93      0.95      0.94       301

    accuracy                           0.93      1500
   macro avg       0.93      0.93      0.93      1500
weighted avg       0.93      0.93      0.93      1500



In [87]:
from sklearn.svm import SVC

In [88]:
svc = SVC().fit(X_train, y_train)

In [89]:
s_predict = svc.predict(X_test)

In [90]:
print(classification_report(s_predict, y_test))

              precision    recall  f1-score   support

           1       0.93      0.99      0.96       278
           2       0.92      0.95      0.93       283
           3       0.92      0.90      0.91       314
           4       0.89      0.87      0.88       306
           5       0.94      0.91      0.92       319

    accuracy                           0.92      1500
   macro avg       0.92      0.92      0.92      1500
weighted avg       0.92      0.92      0.92      1500



* Trying grid search to improve the model

In [91]:
from sklearn.model_selection import GridSearchCV

In [92]:
parameter = {'C': [0.1, 1, 10, 100, 1000], 'gamma': [1, 0.1, 0.01, 0.001, 0.0001]}

In [93]:
grid = GridSearchCV(SVC(), parameter, refit=True, verbose=3)

In [94]:
grid.fit(X_train, y_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV] C=0.1, gamma=1 ..................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ...................... C=0.1, gamma=1, score=0.254, total=   4.4s
[CV] C=0.1, gamma=1 ..................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    4.3s remaining:    0.0s


[CV] ...................... C=0.1, gamma=1, score=0.293, total=   4.3s
[CV] C=0.1, gamma=1 ..................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    8.6s remaining:    0.0s


[CV] ...................... C=0.1, gamma=1, score=0.263, total=   4.4s
[CV] C=0.1, gamma=1 ..................................................
[CV] ...................... C=0.1, gamma=1, score=0.266, total=   4.4s
[CV] C=0.1, gamma=1 ..................................................
[CV] ...................... C=0.1, gamma=1, score=0.256, total=   4.3s
[CV] C=0.1, gamma=0.1 ................................................
[CV] .................... C=0.1, gamma=0.1, score=0.203, total=   4.2s
[CV] C=0.1, gamma=0.1 ................................................
[CV] .................... C=0.1, gamma=0.1, score=0.203, total=   4.3s
[CV] C=0.1, gamma=0.1 ................................................
[CV] .................... C=0.1, gamma=0.1, score=0.201, total=   4.3s
[CV] C=0.1, gamma=0.1 ................................................
[CV] .................... C=0.1, gamma=0.1, score=0.201, total=   4.4s
[CV] C=0.1, gamma=0.1 ................................................
[CV] .

[CV] .................... C=10, gamma=0.01, score=0.889, total=   4.0s
[CV] C=10, gamma=0.01 ................................................
[CV] .................... C=10, gamma=0.01, score=0.897, total=   4.1s
[CV] C=10, gamma=0.01 ................................................
[CV] .................... C=10, gamma=0.01, score=0.874, total=   3.9s
[CV] C=10, gamma=0.01 ................................................
[CV] .................... C=10, gamma=0.01, score=0.884, total=   3.8s
[CV] C=10, gamma=0.01 ................................................
[CV] .................... C=10, gamma=0.01, score=0.889, total=   4.0s
[CV] C=10, gamma=0.001 ...............................................
[CV] ................... C=10, gamma=0.001, score=0.206, total=   5.2s
[CV] C=10, gamma=0.001 ...............................................
[CV] ................... C=10, gamma=0.001, score=0.204, total=   4.9s
[CV] C=10, gamma=0.001 ...............................................
[CV] .

[CV] ................. C=1000, gamma=0.001, score=0.929, total=   4.0s
[CV] C=1000, gamma=0.001 .............................................
[CV] ................. C=1000, gamma=0.001, score=0.904, total=   4.0s
[CV] C=1000, gamma=0.0001 ............................................
[CV] ................ C=1000, gamma=0.0001, score=0.889, total=   4.2s
[CV] C=1000, gamma=0.0001 ............................................
[CV] ................ C=1000, gamma=0.0001, score=0.900, total=   3.9s
[CV] C=1000, gamma=0.0001 ............................................
[CV] ................ C=1000, gamma=0.0001, score=0.873, total=   3.8s
[CV] C=1000, gamma=0.0001 ............................................
[CV] ................ C=1000, gamma=0.0001, score=0.887, total=   4.5s
[CV] C=1000, gamma=0.0001 ............................................
[CV] ................ C=1000, gamma=0.0001, score=0.890, total=   4.0s


[Parallel(n_jobs=1)]: Done 125 out of 125 | elapsed:  9.2min finished


GridSearchCV(cv=None, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [0.1, 1, 10, 100, 1000],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

In [95]:
grid.best_params_

{'C': 10, 'gamma': 1}

In [96]:
grid_pred = grid.predict(X_test)

In [97]:
print(classification_report(grid_pred, y_test))

              precision    recall  f1-score   support

           1       0.94      0.99      0.97       281
           2       0.92      0.93      0.93       288
           3       0.93      0.90      0.91       318
           4       0.88      0.89      0.89       293
           5       0.94      0.90      0.92       320

    accuracy                           0.92      1500
   macro avg       0.92      0.92      0.92      1500
weighted avg       0.92      0.92      0.92      1500



* Evaluating the model with completely new data

In [108]:
text = ['The shirt is of amazing quality', 'every page is amazingly written']

In [109]:
text = cv.transform(text)

In [110]:
text = tf.transform(text)

In [111]:
grid.predict(text)

array([3, 1], dtype=int64)