### Importing all required libraries

In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import nltk
import re
import string
import sklearn 
import seaborn as sns
import collections
from collections import Counter
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
from textblob import TextBlob
from sklearn.model_selection import train_test_split


### Data Loading

In [2]:
# Load the musical-instrument reviews from the project-relative CSV file.
df = pd.read_csv('../Project/Musical_Instruments_Reviews.csv')

# Report missing values per column. A bare `df.isnull().sum()` in the middle of
# a cell is evaluated and then thrown away, so it has to be printed explicitly.
print(df.isnull().sum())

# Replace missing review bodies with a placeholder so that the later text
# processing always receives a string.
df['reviewText'] = df['reviewText'].fillna('Missing')

# Preview the first rows as the cell's displayed result.
df.head()

In [3]:
# Number of rows in the loaded DataFrame.
len(df)

10261

The dataset contains 10261 rows (one per review).

### Defined a function `f` that classifies each review into one of three classes (Positive, Neutral, or Negative) based on its `overall` rating, using the conditions below

In [4]:
def f(row):
    """Map a review row's star rating to a sentiment label.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by ``'overall'`` (for example a DataFrame row).

    Returns
    -------
    str or int
        ``'Neutral'`` for a rating of 3.0, ``'Negative'`` for 1.0 or 2.0,
        ``'Positive'`` for 4.0 or 5.0, and the integer ``-1`` for any
        other value.
    """
    rating = row['overall']
    if rating == 3.0:
        return 'Neutral'
    if rating in (1.0, 2.0):
        return 'Negative'
    if rating in (4.0, 5.0):
        return 'Positive'
    # Sentinel for any rating that matches none of the five star values.
    return -1
# axis=1 passes each row to f, which reads row['overall'].
df['sentiment'] = df.apply(f, axis=1) # new 'sentiment' column holding the label returned by f for each row
df.head()


Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,sentiment
0,A2IBPI20UZIR0U,1384719342,"cassandra tu ""Yeah, well, that's just like, u...","[0, 0]","Not much to write about here, but it does exac...",5.0,good,1393545600,"02 28, 2014",Positive
1,A14VAT5EAX3D9S,1384719342,Jake,"[13, 14]",The product does exactly as it should and is q...,5.0,Jake,1363392000,"03 16, 2013",Positive
2,A195EZSQDW3E21,1384719342,"Rick Bennette ""Rick Bennette""","[1, 1]",The primary job of this device is to block the...,5.0,It Does The Job Well,1377648000,"08 28, 2013",Positive
3,A2C00NNG1ZQQG2,1384719342,"RustyBill ""Sunday Rocker""","[0, 0]",Nice windscreen protects my MXL mic and preven...,5.0,GOOD WINDSCREEN FOR THE MONEY,1392336000,"02 14, 2014",Positive
4,A94QU4C90B1AX,1384719342,SEAN MASLANKA,"[0, 0]",This pop filter is great. It looks and perform...,5.0,No more pops when I record my vocals.,1392940800,"02 21, 2014",Positive


### Pre-processed the data  
+ Checking for Duplicates
+ Checking for Null Values
+ Making text lowercase
+ Removing hyperlinks
+ Removing punctuation marks
+ Eliminating Stop Words
+ Data Resampling


### Defined a function review_cleaning

In [5]:
def review_cleaning(text):
    """Normalise a raw review into lowercase text without URLs or punctuation.

    Steps, in order: cast to ``str`` and lowercase, strip ``http(s)://`` and
    ``www.`` links, delete every ``string.punctuation`` character, then turn
    line breaks into spaces.

    Parameters
    ----------
    text : Any
        Raw review; non-string values are converted with ``str()``.

    Returns
    -------
    str
        The cleaned review text.
    """
    text = str(text).lower()
    # Raw string: `\S` and `\.` are regex escapes, not Python string escapes.
    # In a plain literal they are invalid escape sequences and trigger a warning.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Replace line breaks with a space. Deleting them outright glues the last
    # word of one line to the first word of the next ("great\nbut" -> "greatbut").
    text = re.sub(r'\n', ' ', text)
    return text


### Created new column reviews to store processed reviewText

In [6]:
# 'reviewText' keeps the raw text; the cleaned version goes into a new 'reviews' column.
df['reviews']=df['reviewText'].apply(review_cleaning)
df.head()


Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,sentiment,reviews
0,A2IBPI20UZIR0U,1384719342,"cassandra tu ""Yeah, well, that's just like, u...","[0, 0]","Not much to write about here, but it does exac...",5.0,good,1393545600,"02 28, 2014",Positive,not much to write about here but it does exact...
1,A14VAT5EAX3D9S,1384719342,Jake,"[13, 14]",The product does exactly as it should and is q...,5.0,Jake,1363392000,"03 16, 2013",Positive,the product does exactly as it should and is q...
2,A195EZSQDW3E21,1384719342,"Rick Bennette ""Rick Bennette""","[1, 1]",The primary job of this device is to block the...,5.0,It Does The Job Well,1377648000,"08 28, 2013",Positive,the primary job of this device is to block the...
3,A2C00NNG1ZQQG2,1384719342,"RustyBill ""Sunday Rocker""","[0, 0]",Nice windscreen protects my MXL mic and preven...,5.0,GOOD WINDSCREEN FOR THE MONEY,1392336000,"02 14, 2014",Positive,nice windscreen protects my mxl mic and preven...
4,A94QU4C90B1AX,1384719342,SEAN MASLANKA,"[0, 0]",This pop filter is great. It looks and perform...,5.0,No more pops when I record my vocals.,1392940800,"02 21, 2014",Positive,this pop filter is great it looks and performs...


### Created new column polarity to store polarity value using TextBlob
### Created new column subjectivity to store subjectivity value using TextBlob
### Created new column review_len to store the length of reviews
### Created new column word_count to store number of words present in reviews

In [7]:
# Analyse each review with TextBlob once and reuse the result. Building one
# TextBlob for polarity and another for subjectivity analyses every review twice.
review_sentiments = [TextBlob(text).sentiment for text in df['reviews']]
df['polarity'] = [sentiment.polarity for sentiment in review_sentiments]
df['subjectivity'] = [sentiment.subjectivity for sentiment in review_sentiments]

# Character count and whitespace-delimited word count of the cleaned review.
df['review_len'] = df['reviews'].astype(str).apply(len)
df['word_count'] = df['reviews'].apply(lambda x: len(str(x).split()))
df.head()


Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,sentiment,reviews,polarity,subjectivity,review_len,word_count
0,A2IBPI20UZIR0U,1384719342,"cassandra tu ""Yeah, well, that's just like, u...","[0, 0]","Not much to write about here, but it does exac...",5.0,good,1393545600,"02 28, 2014",Positive,not much to write about here but it does exact...,0.25,0.398611,261,51
1,A14VAT5EAX3D9S,1384719342,Jake,"[13, 14]",The product does exactly as it should and is q...,5.0,Jake,1363392000,"03 16, 2013",Positive,the product does exactly as it should and is q...,0.014286,0.292857,529,103
2,A195EZSQDW3E21,1384719342,"Rick Bennette ""Rick Bennette""","[1, 1]",The primary job of this device is to block the...,5.0,It Does The Job Well,1377648000,"08 28, 2013",Positive,the primary job of this device is to block the...,0.1675,0.434286,431,77
3,A2C00NNG1ZQQG2,1384719342,"RustyBill ""Sunday Rocker""","[0, 0]",Nice windscreen protects my MXL mic and preven...,5.0,GOOD WINDSCREEN FOR THE MONEY,1392336000,"02 14, 2014",Positive,nice windscreen protects my mxl mic and preven...,0.2,0.925,204,35
4,A94QU4C90B1AX,1384719342,SEAN MASLANKA,"[0, 0]",This pop filter is great. It looks and perform...,5.0,No more pops when I record my vocals.,1392940800,"02 21, 2014",Positive,this pop filter is great it looks and performs...,0.8,0.75,155,28


### Used scikit-learn's LabelEncoder to encode the sentiment labels as integers

In [8]:
# Replace the string labels in 'sentiment' with integer codes, overwriting the column.
# LabelEncoder numbers the classes in sorted order, which for the labels produced
# by f gives Negative -> 0, Neutral -> 1, Positive -> 2.
label_encoder = preprocessing.LabelEncoder()   
df['sentiment']= label_encoder.fit_transform(df['sentiment']) 
print(df['sentiment'].unique())
# Random 10-row preview; no random_state is passed, so the rows differ on every run.
df.sample(10)

[2 1 0]


Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,sentiment,reviews,polarity,subjectivity,review_len,word_count
8756,A1VFUMY02HJIFY,B004PG1A4A,D. Dreifus,"[0, 0]","The odor of adhesive lingered for some time, b...",3.0,Works well for what it is,1392940800,"02 21, 2014",1,the odor of adhesive lingered for some time bu...,0.171354,0.595312,732,147
5226,A1FO10Z93GKN33,B000OR88JE,Fancy,"[0, 0]",these are wonderful strings for acoustic playi...,5.0,wonderful!!,1365379200,"04 8, 2013",2,these are wonderful strings for acoustic playi...,0.66,0.63,135,24
9963,A3G5BIFX6VS1OP,B00A716FB0,jeff h,"[0, 0]",Great gig bag for acoustic guitar. Very well m...,5.0,Great gig bag,1388793600,"01 4, 2014",2,great gig bag for acoustic guitar very well ma...,0.616667,0.733333,290,55
5899,A37A41GWQFMK3D,B000XPPURU,Dr. Freud,"[1, 1]",this strap works fine on my Fender Strat. it ...,4.0,good for the money but does not fit all guitars,1322438400,"11 28, 2011",2,this strap works fine on my fender strat it i...,-0.033333,0.6125,205,37
2493,A3GAP455S8YH0M,B0002GWFEQ,"Joe's Gadgets ""JOE H.""","[0, 0]",My new guitar had only the back strap connecto...,5.0,Exactly what I needed,1369872000,"05 30, 2013",2,my new guitar had only the back strap connecto...,0.131385,0.450649,359,64
1329,AXU9VX024GPSS,B0002E1G5C,"RazzSpew10 ""Rev. Neo Gnostic""","[0, 0]",This tool has high marks for one reason: It wo...,5.0,Rated high for a reason.,1336262400,"05 6, 2012",2,this tool has high marks for one reason it wor...,0.455,0.495,146,30
6508,A2CJVLER896Q7L,B001FSZR4U,Diego A. Umana,"[0, 1]",This is a very useful little device. I use thi...,5.0,Very useful little device,1372723200,"07 2, 2013",2,this is a very useful little device i use this...,0.1325,0.208,368,66
7447,A3APKXMNJAEDQM,B002R2IUEW,hargy15,"[1, 1]",Can't get your banjo to sound right without a ...,5.0,Banjo essentials,1352505600,"11 10, 2012",2,cant get your banjo to sound right without a g...,0.495238,0.561905,116,23
9441,A2WYAHJGST6AOT,B005VLWHP4,Matt,"[0, 0]","For 3 dollars this is a great quality capo, it...",5.0,Good Cheap Capo,1393113600,"02 23, 2014",2,for 3 dollars this is a great quality capo it ...,0.533333,0.583333,150,30
5407,A2XL6TZM34HFV3,B000RN53LQ,"Review Man ""Review Man""","[2, 2]",I wanted a portable headphone amp but bought t...,4.0,Amazingly versatile practice tool,1298937600,"03 1, 2011",2,i wanted a portable headphone amp but bought t...,0.199575,0.484539,2045,392


### Used scikit-learn's TfidfVectorizer to transform the review text into a numerical (TF-IDF) representation

In [9]:
# Bigram-only TF-IDF features (ngram_range=(2,2)), capped at 5000 features.
# NOTE(review): the vectorizer is fitted on every review before any train/test
# split, so the vocabulary and IDF weights are learned from rows that later
# land in the test set.
tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(2,2))
x= tfidf_vectorizer.fit_transform(df['reviews'])


In [10]:
# Target vector: the integer-encoded 'sentiment' column.
y=df['sentiment']


### Used SMOTE (from imbalanced-learn) to oversample the minority classes and balance the data

In [11]:
# Class counts before resampling.
print(f'Original dataset shape : {Counter(y)}')
# Oversample with SMOTE; the recorded output shows every class raised to the
# majority-class count.
# NOTE(review): this runs on the full feature matrix, before train_test_split,
# so synthetic rows derived from reviews that end up in the test set can appear
# in the training set (and vice versa). The test scores reported further down
# are therefore likely optimistic.
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(x, y)
print(f'Resampled dataset shape {Counter(y_res)}')


Original dataset shape : Counter({2: 9022, 1: 772, 0: 467})
Resampled dataset shape Counter({2: 9022, 1: 9022, 0: 9022})


### Split the data into training and testing sets using train_test_split (75% training, 25% testing)

In [12]:
# Hold out 25% of the resampled rows for testing; random_state=0 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.25, random_state=0)
# Printing the sparse matrix lists its stored (row, column) value entries.
print(X_train)

  (0, 2465)	0.25192015245002997
  (0, 238)	0.25192015245002997
  (0, 301)	0.22601006503751472
  (0, 41)	0.24184282621116868
  (0, 770)	0.24311676244127614
  (0, 287)	0.21024975349790787
  (0, 2981)	0.21633307899127588
  (0, 1631)	0.23207966697122637
  (0, 4602)	0.21883624259337975
  (0, 4520)	0.21949101652787728
  (0, 3199)	0.22296210996355365
  (0, 1425)	0.19297041214894337
  (0, 3266)	0.20550882923916944
  (0, 2223)	0.18673914124797528
  (0, 260)	0.20420323517990788
  (0, 4668)	0.20335911275423715
  (0, 1440)	0.14834202705783137
  (0, 4699)	0.16822795964347834
  (0, 1625)	0.17027520289758696
  (0, 358)	0.18943503923216817
  (0, 1850)	0.1757020457779255
  (0, 4375)	0.11846407608644513
  (0, 1915)	0.14975754835716712
  (0, 4865)	0.15397381419189665
  (0, 1962)	0.10065896278437624
  :	:
  (20298, 2784)	0.22535697427877155
  (20298, 1173)	0.2194631915760093
  (20298, 207)	0.1730908092705323
  (20298, 3502)	0.22724716516002513
  (20298, 3654)	0.20948505875846263
  (20298, 1733)	0.21126014

In [13]:
# Same sparse-entry listing for the held-out feature matrix.
print(X_test)

  (0, 691)	0.06072250343055784
  (0, 4159)	0.07593726081305335
  (0, 2273)	0.06883930389353445
  (0, 4670)	0.10443893938815348
  (0, 2909)	0.08380504247555592
  (0, 1139)	0.1007309001359896
  (0, 4272)	0.06051664321306361
  (0, 4605)	0.06630455367039684
  (0, 4011)	0.07348282443361831
  (0, 1892)	0.06341500508674673
  (0, 3132)	0.07769445621542241
  (0, 3911)	0.06366038368041288
  (0, 3341)	0.0898644198147258
  (0, 4595)	0.07294594925452581
  (0, 1256)	0.07973177353894506
  (0, 3089)	0.08717044522377833
  (0, 4891)	0.08717044522377833
  (0, 2609)	0.11357921431787195
  (0, 4965)	0.07477163146205029
  (0, 2870)	0.08343981279110894
  (0, 2616)	0.0934071116160749
  (0, 4012)	0.10412241016641159
  (0, 4154)	0.06865280045766331
  (0, 407)	0.11950618233733247
  (0, 1951)	0.11054952564828699
  :	:
  (6766, 544)	0.08640462169291022
  (6766, 1053)	0.0916027882022906
  (6766, 4600)	0.36879674685859803
  (6766, 2685)	0.06774706983946305
  (6766, 4527)	0.11449699874447025
  (6766, 2899)	0.000571628

In [14]:
# Training labels; the index shows the row labels carried over from the resampled data.
y_train

9249     2
5929     2
14753    0
19966    1
18975    1
        ..
13123    0
19648    1
9845     2
10799    0
2732     2
Name: sentiment, Length: 20299, dtype: int32

In [15]:
# Random 30-label preview of the test targets; no random_state is passed, so it changes on every run.
y_test.sample(30)

8694     2
24229    1
19104    1
14982    0
18769    0
1474     1
17861    0
15003    0
17654    0
11211    0
14663    0
23600    1
4115     2
25929    1
12854    0
26839    1
7046     2
17786    0
21381    1
16932    0
21366    1
25155    1
1560     2
4507     2
21213    1
19929    1
13863    0
2894     2
25792    1
10894    0
Name: sentiment, dtype: int32

In [16]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

### Implemented Logistic Regression Algorithm

In [17]:
from sklearn.linear_model import LogisticRegression

# Fit the logistic-regression model on the training split and predict the test split.
logreg = LogisticRegression(
    C=10000.0,
    random_state=0,
    max_iter=1000,
)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

# Report accuracy, confusion matrix and per-class metrics, in that order.
logreg_accuracy = logreg.score(X_test, y_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg_accuracy))
logreg_matrix = confusion_matrix(y_test, y_pred)
print(f'\nConfusion Matrix for Logistic Regression:\n {logreg_matrix}')
logreg_report = classification_report(y_test, y_pred)
print(f'\nClassification Report for Logistic Regression:\n {logreg_report}')

Accuracy of logistic regression classifier on test set: 0.95

Confusion Matrix for Logistic Regression:
 [[2326    0    0]
 [   0 2232    0]
 [ 173  169 1867]]

Classification Report for Logistic Regression:
               precision    recall  f1-score   support

           0       0.93      1.00      0.96      2326
           1       0.93      1.00      0.96      2232
           2       1.00      0.85      0.92      2209

    accuracy                           0.95      6767
   macro avg       0.95      0.95      0.95      6767
weighted avg       0.95      0.95      0.95      6767



In [18]:
# NOTE(review): this plotting cell is disabled. As written it would not run at
# this point in the notebook: it refers to `clf`, which is first assigned in the
# SVC cell further down, and to `plot_confusion_matrix`, which none of the
# visible import lines bring into scope.
#import matplotlib.pyplot as plt

#matrix = plot_confusion_matrix(clf, X_test, y_test,
#                                 cmap=plt.cm.Blues,
#                                 normalize='true')
#plt.title('Confusion matrix for our classifier')
#plt.show(matrix)
#plt.show()

### Predicted the output using logistic regression

In [19]:
# Positional index of the single test row to inspect.
sample_idx = 5741

# Predict the class of that one row; `predict` returns a length-1 array.
pred = logreg.predict(X_test[sample_idx])
print(pred)

# y_test is a pandas Series that still carries its shuffled index labels, so
# y_test[5741] is a label lookup and does not correspond to row 5741 of X_test
# (it can also raise KeyError when that label is in the training split).
# .iloc selects the row at the same position as the feature row above.
actual = y_test.iloc[sample_idx]

# One-hot encode the predicted class over the three sentiment codes (0, 1, 2).
a = [0, 0, 0]
a[int(pred[0])] = 1
print(a)

[1]
[0, 1, 0]


### Implemented Decision Tree Classifier Algorithm (both gini and entropy)

In [20]:
from sklearn.tree import DecisionTreeClassifier

# Depth-3 decision tree using the Gini criterion; fit() returns the fitted estimator itself.
clf_gini = DecisionTreeClassifier(
    criterion="gini",
    random_state=100,
    max_depth=3,
    min_samples_leaf=5,
).fit(X_train, y_train)
y_pred_clf_gini = clf_gini.predict(X_test)

In [21]:
# Confusion matrix, accuracy (as a percentage) and per-class metrics for the Gini tree.
gini_matrix = confusion_matrix(y_test, y_pred_clf_gini)
print(f'Confusion Matrix for Decision Tree(gini):\n {gini_matrix}')
gini_accuracy_pct = accuracy_score(y_test, y_pred_clf_gini)*100
print(f'\nAccuracy for Decision Tree(gini): {gini_accuracy_pct}')
gini_report = classification_report(y_test, y_pred_clf_gini)
print(f'\nClassification Report for Decision Tree(gini):\n {gini_report}')

Confusion Matrix for Decision Tree(gini):
 [[ 725  766  835]
 [ 495  890  847]
 [ 129  592 1488]]

Accuracy for Decision Tree(gini): 45.854883995862274

Classification Report for Decision Tree(gini):
               precision    recall  f1-score   support

           0       0.54      0.31      0.39      2326
           1       0.40      0.40      0.40      2232
           2       0.47      0.67      0.55      2209

    accuracy                           0.46      6767
   macro avg       0.47      0.46      0.45      6767
weighted avg       0.47      0.46      0.45      6767



In [22]:
# Same depth-3 tree as the Gini variant, but split on the entropy criterion.
entropy_params = dict(criterion="entropy", random_state=100, max_depth=3, min_samples_leaf=5)
clf_entropy = DecisionTreeClassifier(**entropy_params)

clf_entropy.fit(X_train, y_train)
y_pred_clf_entropy = clf_entropy.predict(X_test)

In [23]:
# Confusion matrix, accuracy (as a percentage) and per-class metrics for the entropy tree.
entropy_matrix = confusion_matrix(y_test, y_pred_clf_entropy)
print(f'Confusion Matrix for Decision Tree(entropy):\n {entropy_matrix}')
entropy_accuracy_pct = accuracy_score(y_test, y_pred_clf_entropy)*100
print(f'\nAccuracy for Decision Tree(entropy): {entropy_accuracy_pct}')
entropy_report = classification_report(y_test, y_pred_clf_entropy)
print(f'\nClassification Report for Decision Tree(entropy):\n {entropy_report}')

Confusion Matrix for Decision Tree(entropy):
 [[ 708  783  835]
 [ 449  936  847]
 [ 116  605 1488]]

Accuracy for Decision Tree(entropy): 46.283434313580614

Classification Report for Decision Tree(entropy):
               precision    recall  f1-score   support

           0       0.56      0.30      0.39      2326
           1       0.40      0.42      0.41      2232
           2       0.47      0.67      0.55      2209

    accuracy                           0.46      6767
   macro avg       0.48      0.47      0.45      6767
weighted avg       0.48      0.46      0.45      6767



### Implemented SVC algorithm

In [24]:
from sklearn.svm import SVC

# Linear-kernel support vector classifier; fit() returns the fitted estimator itself.
clf = SVC(kernel='linear').fit(X_train, y_train)
clf_predictions = clf.predict(X_test)

In [25]:
# Confusion matrix, accuracy (as a percentage) and per-class metrics for the SVC.
svc_matrix = confusion_matrix(y_test, clf_predictions)
print(f'Confusion Matrix for Support Vector Classifier:\n {svc_matrix}')
svc_accuracy_pct = accuracy_score(y_test, clf_predictions)*100
print(f'\nAccuracy for Support Vector Classifier: {svc_accuracy_pct}')
svc_report = classification_report(y_test, clf_predictions)
print(f'\nClassification Report for Support Vector Classifier:\n {svc_report}')

Confusion Matrix for Support Vector Classifier:
 [[2326    0    0]
 [   5 2218    9]
 [ 128  212 1869]]

Accuracy for Support Vector Classifier: 94.76873060440373

Classification Report for Support Vector Classifier:
               precision    recall  f1-score   support

           0       0.95      1.00      0.97      2326
           1       0.91      0.99      0.95      2232
           2       1.00      0.85      0.91      2209

    accuracy                           0.95      6767
   macro avg       0.95      0.95      0.95      6767
weighted avg       0.95      0.95      0.95      6767

