# Mounting drive

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive. The later
# cells read the dataset from, and write the trained model to, paths under
# /content/drive/MyDrive, so this cell has to run first.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Importing necessary libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn import model_selection
from sklearn.utils import class_weight
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier

# Loading data

In [None]:
# Stream the gzipped JSON-lines file in fixed-size chunks rather than parsing it
# in one go, then stitch the chunks back together into a single frame.
chunk_value = 5000
data_frames = []
temp = pd.read_json(
    '/content/drive/MyDrive/Cell_Phones_and_Accessories.json.gz',
    lines=True,
    chunksize=chunk_value,
)
# The reader yields one DataFrame per chunk; collect all of them.
data_frames.extend(temp)

# sort=True orders the columns alphabetically; ignore_index=True renumbers rows 0..n-1.
df = pd.concat(data_frames, sort=True, ignore_index=True)

In [None]:
# Preview the first rows of the loaded reviews to check the columns and their values.
df.head()

Unnamed: 0,asin,image,overall,reviewText,reviewTime,reviewerID,reviewerName,style,summary,unixReviewTime,verified,vote
0,098949232X,,5,If your into space this is the Calendar for you.,"11 19, 2014",A1GG51FWU0XQYH,Paul Williams,,Five Stars,1416355200,False,
1,098949232X,,5,Awesome pictures!,"11 19, 2014",AVFIDS9RK38E0,Sean Powell,,Five Stars,1416355200,False,
2,098949232X,,5,Great wall art and information for space explo...,"11 19, 2014",A2S4AVR5SJ7KMI,Tom Davis,,Five Stars,1416355200,False,
3,098949232X,,5,"As always, it is a quality calendar full of ve...","11 19, 2014",AEMMMVOR9BFLI,Kwajmeck,,I love it. I buy a new one every year,1416355200,False,
4,098949232X,,5,This is a fantastic calendar. This is my third...,"11 19, 2014",A2DZXMBTY7KLYP,ScottG43,,Great Calendar.,1416355200,False,


In [None]:
# (rows, columns) of the frame right after loading.
df.shape

(10063255, 12)

# Data preprocessing

Extracting only reviews from 2017

In [None]:
# Keep only the reviews written in the target year.
#
# `reviewTime` is a string such as "11 19, 2014". The year is matched at the END
# of the string instead of at a fixed character offset: a fixed slice like [7:11]
# only lands on the year when both month and day are written with two digits, so
# dates with a one-digit day (or month) would be silently dropped.
# The column is addressed by name rather than by position so the filter does not
# depend on column order, and the comparison is vectorised instead of looping
# over every row with iloc.
target_year = '2017'
review_dates = df['reviewTime'].astype('string').str.strip()

# na=False treats a missing date as "not in the target year".
is_target_year = review_dates.str.endswith(target_year, na=False).astype(bool)

df = df[is_target_year].reset_index(drop=True)

In [None]:
# (rows, columns) of the frame after the year filter.
df.shape

(975705, 12)

Removing unnecessary columns and sampling data

In [None]:
# Keep only the review text and the star rating, then draw a fixed-size sample.
#
# random_state pins the sample so that re-running the notebook trains and
# evaluates on the same rows (without it every run produces different metrics).
# The sample size is capped at the number of available rows because
# DataFrame.sample raises when n exceeds the frame length (with replace=False).
sample_size = min(100000, len(df))
df = df[['reviewText', 'overall']].sample(n=sample_size, random_state=0)

Assigning a sentiment label based on the star rating received.


Positive (rating >= 4) -> 2


Neutral (2 <= rating < 4) -> 1


Negative (rating < 2) -> 0

In [None]:
# Map the star rating (`overall`) to a sentiment class:
#   rating >= 4       -> 2 (positive)
#   2 <= rating < 4   -> 1 (neutral)
#   anything else     -> 0 (negative)
#
# np.select evaluates the conditions in order and takes the first one that
# matches, so the second condition only applies to ratings below 4. This is the
# same mapping as a row-by-row if/elif/else, computed for the whole column at once.
ratings = df['overall']
sentiments = np.select([ratings >= 4, ratings >= 2], [2, 1], default=0)

# assign() returns a new frame with the label column attached; only the text and
# the label are needed from here on.
df = df.assign(Sentiment=sentiments)[["reviewText", "Sentiment"]]

In [None]:
# Display the labelled frame (review text plus sentiment class) as a sanity check.
df

Unnamed: 0,reviewText,Sentiment
309737,I am sure it would have worked well with the p...,2
601976,Good quality,2
750632,Complete garbage. stay away if you don't want ...,0
772847,A glass one works A lot nicer,0
677184,These are great hangers. Nothing will slide o...,2
...,...,...
279465,"I needed a better phone case for my Note 5, be...",2
941336,Awesome!,2
436057,"I bought this SIM card for my wife, who has a ...",1
592940,Works great,2


In [None]:
# Drop every row that contains a missing value. At this point the frame holds
# only `reviewText` and `Sentiment`, so this removes reviews without text.
df = df.dropna()

# Formatting data as required for training

In [None]:
from sklearn.model_selection import train_test_split

# Plain Python lists: review texts are the features, sentiment classes the targets.
X = df['reviewText'].tolist()
y = df['Sentiment'].tolist()

# 80/20 split; stratifying on y keeps the class proportions the same in both
# parts, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    stratify=y,
    random_state=0,
)

# Training the model

In [None]:
# NOTE: disabled experiment. Everything between the triple quotes is a string
# literal, not executed code. It holds a loop that fits a TF-IDF pipeline for
# each candidate model (SVM, XGBoost) and prints a classification report.
# To re-run it, remove the surrounding triple quotes.
"""
dfs = []
models = [
          ('SVM', SVC()),
          ('XGB', XGBClassifier())
        ]

results = []

names = []
scoring = ['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted', 'roc_auc']
target_names = ['negative', 'neutral', 'positive']

for name, model in models:

        clf = Pipeline([('tfidf',TfidfVectorizer()),('clf',model)])
        clf.fit(X_train,y_train)
        y_pred = clf.predict(X_test)
        print(name)
        print(classification_report(y_test, y_pred, target_names=target_names))

"""


In [None]:
from sklearn.pipeline import Pipeline

# Two-step pipeline: TF-IDF turns each raw review string into a sparse feature
# vector, and the SVC classifies it into one of the three sentiment classes.
# Keeping both steps in one Pipeline means the vectoriser is fitted on the
# training texts only and the fitted object can be applied directly to raw strings.
text_clf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', SVC(C=1, gamma=1.0)),
])

text_clf.fit(X_train, y_train)

predictions = text_clf.predict(X_test)

# Compute the confusion matrix once and reuse it for both the printout and the
# `cm` variable (rows are the true classes, columns the predicted classes).
cm = confusion_matrix(y_test, predictions)
print(cm)
print(classification_report(y_test, predictions))

[[ 2120   411   408]
 [  701  1042  1070]
 [  156   353 13717]]
              precision    recall  f1-score   support

           0       0.71      0.72      0.72      2939
           1       0.58      0.37      0.45      2813
           2       0.90      0.96      0.93     14226

    accuracy                           0.84     19978
   macro avg       0.73      0.69      0.70     19978
weighted avg       0.83      0.84      0.83     19978



In [None]:
# Show the confusion matrix stored by the evaluation cell
# (rows: true class, columns: predicted class).
cm

array([[ 2120,   411,   408],
       [  701,  1042,  1070],
       [  156,   353, 13717]])

# Saving the trained model

In [None]:
# Persist the fitted pipeline (TF-IDF vectoriser + SVC) to Google Drive so it
# can be reused without retraining. `load` is imported alongside `dump` but is
# not used in this cell.
# NOTE(review): joblib files are pickle-based — only load model files you trust.
from joblib import dump, load
model_file = '/content/drive/MyDrive/finalized_model.joblib'
dump(text_clf, model_file) 

['/content/drive/MyDrive/finalized_model.joblib']