# Importing dependencies

Importing the libraries

In [None]:
import numpy as np
import pandas as pd
np.set_printoptions(threshold=np.inf)

Importing dataset

In [None]:
df = pd.read_csv("Dataset-SA.csv")

In [None]:
df.head()

Unnamed: 0,product_name,product_price,Rate,Review,Summary,Sentiment
0,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,5,super!,great cooler excellent air flow and for this p...,positive
1,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,5,awesome,best budget 2 fit cooler nice cooling,positive
2,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,3,fair,the quality is good but the power of air is de...,positive
3,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,1,useless product,very bad product its a only a fan,negative
4,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,3,fair,ok ok product,neutral


Data Visualization

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205052 entries, 0 to 205051
Data columns (total 6 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   product_name   205052 non-null  object
 1   product_price  205052 non-null  object
 2   Rate           205052 non-null  object
 3   Review         180388 non-null  object
 4   Summary        205041 non-null  object
 5   Sentiment      205052 non-null  object
dtypes: object(6)
memory usage: 9.4+ MB


# Feature Engineering

First we shall categorise the Rate column.

In [None]:
rating_words = {
    '1': 'Awful',
    '2': 'Poor',
    '3': 'Average',
    '4': 'Good',
    '5': 'Awesome'
}

# Replace the entire column with words
df['ratings'] = df['Rate'].map(rating_words)

In [None]:
df.head()

Unnamed: 0,product_name,product_price,Rate,Review,Summary,Sentiment,ratings
0,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,5,super!,great cooler excellent air flow and for this p...,positive,Awesome
1,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,5,awesome,best budget 2 fit cooler nice cooling,positive,Awesome
2,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,3,fair,the quality is good but the power of air is de...,positive,Average
3,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,1,useless product,very bad product its a only a fan,negative,Awful
4,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,3,fair,ok ok product,neutral,Average


We should combine the Rate , Review and Summary attributes.

In [None]:
df['FullReview']  = df['ratings'] +  " " + df['Review'] + " " + df['Summary']
df  = df.drop(['ratings' , 'Review' , 'Summary' , 'Rate'] , axis = 1)

In [None]:
df.head()

Unnamed: 0,product_name,product_price,Sentiment,FullReview
0,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,positive,Awesome super! great cooler excellent air flow...
1,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,positive,Awesome awesome best budget 2 fit cooler nice ...
2,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,positive,Average fair the quality is good but the power...
3,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,negative,Awful useless product very bad product its a o...
4,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,neutral,Average fair ok ok product


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205052 entries, 0 to 205051
Data columns (total 4 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   product_name   205052 non-null  object
 1   product_price  205052 non-null  object
 2   Sentiment      205052 non-null  object
 3   FullReview     180376 non-null  object
dtypes: object(4)
memory usage: 6.3+ MB


We shall drop the null values.

In [None]:
df = df.dropna(axis = 0)

# Word Processing

In [None]:
import nltk
import re
nltk.download('stopwords')
stemmer = nltk.SnowballStemmer("english")
from nltk.corpus import stopwords
import string
stopword=set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Function to clean the text

In [None]:
def clean(text):
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = [word for word in text.split(' ') if word not in stopword]
    text=" ".join(text)
    text = [stemmer.stem(word) for word in text.split(' ')]
    text=" ".join(text)
    return text

Applying the method to the text

In [None]:
df['FullReview'] = df['FullReview'].apply(clean)

In [None]:
df.head()

Unnamed: 0,product_name,product_price,Sentiment,FullReview
0,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,positive,awesom super great cooler excel air flow price...
1,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,positive,awesom awesom best budget fit cooler nice cool
2,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,positive,averag fair qualiti good power air decent
3,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,negative,aw useless product bad product fan
4,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,neutral,averag fair ok ok product


Word Vectorizing

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 1000 , stop_words = 'english')

In [None]:
vector = cv.fit_transform(df['FullReview'].astype(str)).toarray()

In [None]:
cv.get_feature_names_out()

array(['abl', 'absolut', 'ac', 'accept', 'accessori', 'accord', 'accur',
       'actual', 'ad', 'adapt', 'adaptor', 'add', 'addit', 'adjust',
       'afford', 'ago', 'ahead', 'air', 'alexa', 'alreadi', 'alway',
       'amaz', 'android', 'anoth', 'answer', 'anyon', 'anyth', 'apart',
       'app', 'appl', 'appli', 'appreci', 'area', 'arrang', 'arriv',
       'ask', 'aspect', 'assembl', 'assist', 'atleast', 'attach',
       'attract', 'audio', 'auto', 'automat', 'aux', 'avail', 'avarag',
       'averag', 'avoid', 'aw', 'away', 'awesom', 'awsm', 'awsom', 'babi',
       'backup', 'bad', 'bag', 'bajaj', 'bake', 'balanc', 'ball', 'band',
       'bank', 'bar', 'base', 'basic', 'bass', 'bat', 'batteri', 'beast',
       'beat', 'beauti', 'becom', 'bed', 'beginn', 'behaviour', 'believ',
       'bend', 'best', 'better', 'bicycl', 'big', 'bigger', 'billion',
       'bit', 'black', 'blade', 'blanket', 'blind', 'blow', 'blue',
       'bluetooth', 'board', 'boat', 'bodi', 'boil', 'book', 'bottl',
    

In [None]:
vector[0]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

# Data Preprocessing

Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
x = df['FullReview']
y = df['Sentiment']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [None]:
x_train = cv.fit_transform(x_train)
x_test = cv.transform(x_test)

# Model Building

Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
lrc = LogisticRegression(random_state = 0)
lrc.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
rfc.fit(x_train, y_train)

Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
dtc.fit(x_train, y_train)

Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(x_train.toarray(), y_train)

KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
knn.fit(x_train, y_train)

Predictions

In [None]:
y_pred3 = lrc.predict(x_test)
y_pred4 = rfc.predict(x_test)
y_pred5 = dtc.predict(x_test)
y_pred6 = gnb.predict(x_test.toarray())
y_pred7 = knn.predict(x_test)

Accuracy SCores

In [None]:
from sklearn.metrics import accuracy_score
print("Accuracy Scores: \n")

Accuracy Scores: 



In [None]:
print("Logistic Regression : " , accuracy_score(y_pred3 , y_test))
print("Random Forest : " , accuracy_score(y_pred4 , y_test))
print("Decision Tree : " , accuracy_score(y_pred5 , y_test))
print("Naive-Bayes : " , accuracy_score(y_pred6 , y_test))
print("K-NN : " , accuracy_score(y_pred7 , y_test))

Logistic Regression :  0.9300365894223306
Random Forest :  0.9316720257234726
Decision Tree :  0.9165095908637321
Naive-Bayes :  0.2061758509812618
K-NN :  0.9250471227408804


Since Random Forest performed the best, we shall improve the model using GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
param_grid = {'criterion' : ['gini' , 'entropy'] ,
              'n_estimators' :[10 , 30 , 50] }
grid_model = GridSearchCV(estimator = rfc , param_grid = param_grid , scoring = 'neg_mean_squared_error' , cv=3 , verbose=1)
grid_model.fit(x_train,y_train)
grid_model.best_params_

Fitting 3 folds for each of 6 candidates, totalling 18 fits


Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_scorer.py", line 282, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_regression.py", line 442, in mean_squared_error
    y_type, y_true, y_pred, multioutput = _check_reg_targets(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_regression.py", line 101, in _check_reg_targets
    y_true = check_array(y_true, ensure_2d=False, dtype=dtype)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/validation.py", line 879, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtyp

{'criterion': 'gini', 'n_estimators': 10}

Predicting using the new model

In [None]:
rfc2 = RandomForestClassifier(n_estimators = 10, criterion = 'gini', random_state = 0)
rfc2.fit(x_train, y_train)
y_pred = rfc2.predict(x_test)

In [None]:
accuracy_score(y_pred , y_test)

0.9316720257234726

We see that for test set, the accuracy is lower with the gini criterion, as opposed to the entropy criterion. Thus y_pred4 is our most accurate model.

In [None]:
y_test.shape

(36076, 1)

In [None]:
y_pred4.shape

(36076, 1)

In [None]:
y_pred4 = y_pred4.reshape(len(y_pred4), 1)
y_test = np.asarray(y_test).reshape(len(y_test), 1)

print(np.concatenate((y_pred4, y_test), axis=1))

[['positive' 'positive']
 ['negative' 'negative']
 ['positive' 'positive']
 ['positive' 'positive']
 ['positive' 'positive']
 ['positive' 'positive']
 ['positive' 'negative']
 ['positive' 'positive']
 ['positive' 'positive']
 ['positive' 'positive']
 ['positive' 'positive']
 ['positive' 'positive']
 ['negative' 'negative']
 ['positive' 'positive']
 ['neutral' 'neutral']
 ['positive' 'positive']
 ['positive' 'positive']
 ['positive' 'positive']
 ['positive' 'positive']
 ['positive' 'positive']
 ['positive' 'positive']
 ['neutral' 'positive']
 ['positive' 'positive']
 ['positive' 'positive']
 ['positive' 'positive']
 ['positive' 'positive']
 ['positive' 'positive']
 ['positive' 'positive']
 ['positive' 'positive']
 ['positive' 'positive']
 ['positive' 'positive']
 ['positive' 'positive']
 ['negative' 'neutral']
 ['positive' 'positive']
 ['positive' 'positive']
 ['positive' 'positive']
 ['positive' 'positive']
 ['positive' 'positive']
 ['positive' 'positive']
 ['positive' 'positive']
 ['p