In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import re

In [2]:
#importing library of Machine learning 
from sklearn.linear_model import LogisticRegression as lr
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [3]:
# Importing the library of NLP
from nltk.corpus import stopwords
from string import punctuation
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [4]:
#importing the library for the Bag of words
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
#For Reading the Data 
data=pd.read_csv('data_review.csv')

In [6]:
#First Five Value
data.head()

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes
0,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,I feel so LUCKY to have found this used (phone...,1.0
1,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,"nice phone, nice up grade from my pantach revu...",0.0
2,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,Very pleased,0.0
3,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,It works good but it goes slow sometimes but i...,0.0
4,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,Great phone to replace my lost phone. The only...,0.0


In [7]:
#Printing the datatype of the column
df=data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 413840 entries, 0 to 413839
Data columns (total 6 columns):
Product Name    413840 non-null object
Brand Name      348669 non-null object
Price           407907 non-null float64
Rating          413840 non-null int64
Reviews         413778 non-null object
Review Votes    401544 non-null float64
dtypes: float64(2), int64(1), object(3)
memory usage: 18.9+ MB


In [8]:
#Printing the statistical details
df.describe()

Unnamed: 0,Price,Rating,Review Votes
count,407907.0,413840.0,401544.0
mean,226.867155,3.819578,1.507237
std,273.006259,1.548216,9.163853
min,1.73,1.0,0.0
25%,79.99,3.0,0.0
50%,144.71,5.0,0.0
75%,269.99,5.0,1.0
max,2598.0,5.0,645.0


In [9]:
#Taking only the Review and Rating Column
df=df[['Reviews','Rating']]

In [10]:
df.head

<bound method NDFrame.head of                                                   Reviews  Rating
0       I feel so LUCKY to have found this used (phone...       5
1       nice phone, nice up grade from my pantach revu...       4
2                                            Very pleased       5
3       It works good but it goes slow sometimes but i...       4
4       Great phone to replace my lost phone. The only...       4
5       I already had a phone with problems... I know ...       1
6       The charging port was loose. I got that solder...       2
7       Phone looks good but wouldn't stay charged, ha...       2
8       I originally was using the Samsung S2 Galaxy f...       5
9       It's battery life is great. It's very responsi...       3
10      My fiance had this phone previously, but cause...       3
11      This is a great product it came after two days...       5
12      These guys are the best! I had a little situat...       5
13      I'm really disappointed about my phone

In [11]:
#Printing the datatype of the column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 413840 entries, 0 to 413839
Data columns (total 2 columns):
Reviews    413778 non-null object
Rating     413840 non-null int64
dtypes: int64(1), object(1)
memory usage: 6.3+ MB


In [12]:
#Removing the NaN Rows and Column
df=df.dropna()
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 413778 entries, 0 to 413839
Data columns (total 2 columns):
Reviews    413778 non-null object
Rating     413778 non-null int64
dtypes: int64(1), object(1)
memory usage: 9.5+ MB


In [13]:
#Removing the row that has the Rating 3 as it is the neutral Sentiment
df=df[df['Rating']!=3]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 382015 entries, 0 to 413839
Data columns (total 2 columns):
Reviews    382015 non-null object
Rating     382015 non-null int64
dtypes: int64(1), object(1)
memory usage: 8.7+ MB


In [14]:
df=df.reset_index(drop=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 382015 entries, 0 to 382014
Data columns (total 2 columns):
Reviews    382015 non-null object
Rating     382015 non-null int64
dtypes: int64(1), object(1)
memory usage: 5.8+ MB


In [15]:
#Here we have Created the Sentiment Column and we have to put the value 1 if rating greater than 3 and ratings 0 if less than 3.
df['sentiment']=np.where(df['Rating'] > 3, 1, 0)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 382015 entries, 0 to 382014
Data columns (total 3 columns):
Reviews      382015 non-null object
Rating       382015 non-null int64
sentiment    382015 non-null int32
dtypes: int32(1), int64(1), object(1)
memory usage: 7.3+ MB


In [16]:
#cleaning the dataset 
#stopword are is am the and etc
#punction are added as to remove the stopwords and the punctuation from the text
Cstopwords=set(stopwords.words('english')+list(punctuation))

In [17]:
#Gives the  root word
lemma=WordNetLemmatizer()

In [18]:
#Defining the Function to Clean the review
def clean_review(review_column):
    review_corpus=[]
    for i in range(0, len(review_column)):
        review=review_column[i]
        #want ro keep the 
        review=re.sub('[^a-zA-Z]',' ',review)
        #Convert it to string and convert it to lower case
        review=str(review).lower()
        #seperate every word
        review=word_tokenize(review)
        #storing the root word
        review=[lemma.lemmatize(w) for w in review ]
        #joining the white space and storing it on the review
        review=' '.join(review)
        #Storing the review data on the clean_corpus
        review_corpus.append(review)
    return review_corpus

In [19]:
#taking the value of the  df and storing.
review_column=df['Reviews']

In [20]:
#sending the argument(revie column) to the clean_review()  
review_corpus=clean_review(review_column)

In [21]:
#making new column in the df and storing it to the clean_review column
df['clean_review']=review_corpus
df.tail()

Unnamed: 0,Reviews,Rating,sentiment,clean_review
382010,good rugged phone that has a long-lasting batt...,4,1,good rugged phone that ha a long lasting batte...
382011,used hard,1,0,used hard
382012,another great deal great price,5,1,another great deal great price
382013,Passes every drop test onto porcelain tile!,5,1,pass every drop test onto porcelain tile
382014,Only downside is that apparently Verizon no lo...,4,1,only downside is that apparently verizon no lo...


In [22]:
#create a bag of words max_features is the word  min_df means ignore the term that appear less than 5 percent in the document ng_gram means it will contain minmum 1 and maximum 2 word in a senstence
cv=CountVectorizer(max_features=20000,min_df=5,ngram_range=(1,2))
#Storing it to the X1
X1=cv.fit_transform(df['clean_review'])
#Print the dimension of the array
X1.shape

(382015, 20000)

In [23]:
print(df['clean_review'])

0         i feel so lucky to have found this used phone ...
1         nice phone nice up grade from my pantach revue...
2                                              very pleased
3         it work good but it go slow sometimes but it a...
4         great phone to replace my lost phone the only ...
5         i already had a phone with problem i know it s...
6         the charging port wa loose i got that soldered...
7         phone look good but wouldn t stay charged had ...
8         i originally wa using the samsung s galaxy for...
9         this is a great product it came after two day ...
10        these guy are the best i had a little situatio...
11        i m really disappointed about my phone and ser...
12        ordered this phone a a replacement for the sam...
13        had this phone before and loved it but wa not ...
14        i wa able to get the phone i previously owned ...
15        i brought this phone a a replacement for my da...
16        i love the phone it doe everyt

In [24]:
#Increase the importance of the word which are more rare 
tfidf=TfidfVectorizer(min_df=5, max_df=0.95, max_features = 20000, ngram_range = ( 1, 2 ),
                              sublinear_tf = True)
#Applying the tfdif to the clean_review.
tfidf=tfidf.fit(df['clean_review'])

#Storing it in the X2
X2=tfidf.transform(df['clean_review'])
X2.shape
print(X2)

  (0, 19947)	0.06867839370678101
  (0, 19834)	0.049633832159609134
  (0, 19799)	0.10529254355048036
  (0, 19797)	0.07356584384736188
  (0, 19697)	0.05663879009287084
  (0, 19138)	0.07894124360907383
  (0, 18776)	0.07710511247287204
  (0, 18654)	0.1351945519010737
  (0, 18653)	0.07108481245100842
  (0, 18239)	0.0449991961826796
  (0, 18108)	0.18729277287194954
  (0, 18094)	0.13946111730652627
  (0, 17990)	0.1064252613337285
  (0, 17980)	0.08941580108764781
  (0, 17339)	0.11991693837456786
  (0, 17270)	0.08299514619835606
  (0, 17129)	0.058311210372695675
  (0, 16953)	0.14395657528590275
  (0, 16934)	0.0973412423288398
  (0, 16911)	0.08139136364159605
  (0, 16832)	0.07501701071785949
  (0, 16610)	0.12672927307870027
  (0, 16608)	0.07299836454687054
  (0, 15728)	0.044374720771756665
  (0, 15716)	0.08344067186478418
  :	:
  (382012, 6477)	0.2829396020731201
  (382012, 3984)	0.32606945430654727
  (382012, 1233)	0.5953948194958951
  (382012, 1226)	0.29224362986948443
  (382013, 17048)	0.5524

In [25]:
#Storing the value of sentiment of df column to the y greater than 3 assign 1 amd less than 3 assign 0
y=df['sentiment'].values
y.shape

(382015,)

In [26]:
#Machine Learning Implementation 

# storing the tfidf value X2 to the X
X=X2

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [35]:
#Logistic Regression
model_lr=lr(random_state=0)
model_lr=lr(penalty='l2',C=1.0,random_state=0)
model_lr.fit(X_train,y_train)
#pridicting the result by giving the input X_test
y_pred_lr=model_lr.predict(X_test)

In [36]:
y_pred_lr

array([1, 1, 0, ..., 1, 1, 1])

In [37]:
# Printing the Confusion Matrix
cm = confusion_matrix(y_test, y_pred_lr)
cm

array([[17833,  1632],
       [ 1245, 55693]], dtype=int64)

In [38]:
print('F1 score for Logistic Regression :',f1_score(y_test,y_pred_lr))
print('Precision score for Logistic Regression :',precision_score(y_test,y_pred_lr))
print('recall score for Logistic Regression :',recall_score(y_test,y_pred_lr))
print('AUC: ', roc_auc_score(y_test, y_pred_lr))

F1 score for Logistic Regression : 0.9748212457225874
Precision score for Logistic Regression : 0.9715307457479285
recall score for Logistic Regression : 0.978134110787172
AUC:  0.9471456580136735


In [39]:
# get the feature names as numpy array
feature_names = np.array(cv.get_feature_names())

# Sort the coefficients from the model
sorted_coef_index = model_lr.coef_[0].argsort()

# Find the 10 smallest and 10 largest coefficients
# The 10 largest coefficients are being indexed using [:-11:-1] 
# so the list returned is in order of largest to smallest
print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

Smallest Coefs:
['not' 'worst' 'disappointed' 'not happy' 'poor' 'terrible' 'doesn'
 'horrible' 'useless' 'return']

Largest Coefs: 
['great' 'love' 'excellent' 'perfect' 'no problem' 'amazing' 'awesome'
 'best' 'love this' 'not bad']


In [40]:
#NAIVE BAYES
from sklearn.naive_bayes import MultinomialNB
model_nb=MultinomialNB()
model_nb.fit(X_train,y_train)
y_pred_nb=model_nb.predict(X_test)
print('accuracy for Naive Bayes Classifier :',accuracy_score(y_test,y_pred_nb))
print('confusion matrix for Naive Bayes Classifier:\n',confusion_matrix(y_test,y_pred_nb))
print('F1 score for Logistic Regression :',f1_score(y_test,y_pred_nb))
print('Precision score for Logistic Regression :',precision_score(y_test,y_pred_nb))
print('recall score for Logistic Regression :',recall_score(y_test,y_pred_nb))
print('AUC: ', roc_auc_score(y_test, y_pred_nb))

accuracy for Naive Bayes Classifier : 0.9368480295276364
confusion matrix for Naive Bayes Classifier:
 [[16687  2778]
 [ 2047 54891]]
F1 score for Logistic Regression : 0.9578996047361854
Precision score for Logistic Regression : 0.9518285387296468
recall score for Logistic Regression : 0.9640486142822017
AUC:  0.9106654579245582


In [41]:
#RANDOM FOREST
from sklearn.ensemble import RandomForestClassifier
model_rf=RandomForestClassifier()
model_rf.fit(X_train,y_train)
y_pred_rf=model_rf.predict(X_test)
print('accuracy for Random Forest Classifier :',accuracy_score(y_test,y_pred_rf))
print('confusion matrix for Random Forest Classifier:\n',confusion_matrix(y_test,y_pred_rf))

accuracy for Random Forest Classifier : 0.9704854521419316
confusion matrix for Random Forest Classifier:
 [[18279  1186]
 [ 1069 55869]]
