In [0]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [45]:
from google.colab import drive
drive.mount('/content/drive',force_remount=True)

Mounted at /content/drive


In [0]:
import tensorflow as tf
import keras
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense

In [0]:
import re

In [0]:
import nltk

In [0]:
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [0]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [0]:
df=pd.read_csv('/content/drive/My Drive/Colab Notebooks/amazon_baby.csv')

In [52]:
df.head()

Unnamed: 0,name,review,rating
0,Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5


In [53]:
df.shape

(183531, 3)

In [54]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 183531 entries, 0 to 183530
Data columns (total 3 columns):
name      183213 non-null object
review    182702 non-null object
rating    183531 non-null int64
dtypes: int64(1), object(2)
memory usage: 4.2+ MB


### Data Cleaning and Text Preprocessing 

In [55]:
df.isnull().sum()

name      318
review    829
rating      0
dtype: int64

In [0]:
df.drop('name',axis=1,inplace=True)

In [0]:
df.dropna(inplace=True)

In [58]:
df.shape

(182702, 2)

In [59]:
df.head()

Unnamed: 0,review,rating
0,"These flannel wipes are OK, but in my opinion ...",3
1,it came early and was not disappointed. i love...,5
2,Very soft and comfortable and warmer than it l...,5
3,This is a product well worth the purchase. I ...,5
4,All of my kids have cried non-stop when I trie...,5


In [60]:
df['rating'].value_counts()

5    106483
4     33099
3     16727
1     15116
2     11277
Name: rating, dtype: int64

### Removing the rating which are 3 as they are neutral sentiment

In [0]:
df.drop(df[df['rating']==3].index,inplace=True)

In [62]:
df.head()

Unnamed: 0,review,rating
1,it came early and was not disappointed. i love...,5
2,Very soft and comfortable and warmer than it l...,5
3,This is a product well worth the purchase. I ...,5
4,All of my kids have cried non-stop when I trie...,5
5,"When the Binky Fairy came to our house, we did...",5


In [63]:
df.shape

(165975, 2)

### For rating >=4 we consider it positive and value=1
### For rating <4 we consider it negative and value=0

In [0]:
def add_sentiment(a):
    if a>=4:
        return 1
    else:
        return 0

In [0]:
df['sentiment']=df['rating'].apply(add_sentiment)

In [66]:
df.head()

Unnamed: 0,review,rating,sentiment
1,it came early and was not disappointed. i love...,5,1
2,Very soft and comfortable and warmer than it l...,5,1
3,This is a product well worth the purchase. I ...,5,1
4,All of my kids have cried non-stop when I trie...,5,1
5,"When the Binky Fairy came to our house, we did...",5,1


In [67]:
df['rating'].value_counts()

5    106483
4     33099
1     15116
2     11277
Name: rating, dtype: int64

In [68]:
df['sentiment'].value_counts()

1    139582
0     26393
Name: sentiment, dtype: int64

In [0]:
wordnet=WordNetLemmatizer()

### Cleaning Text

In [0]:
def clean_text(a):
    text=re.sub('[^A-za-z0-9]',' ',a)
    text=text.lower()
    text=text.split(' ')
    text = [wordnet.lemmatize(word) for word in text if word not in (stopwords.words('english'))]
    text = ' '.join(text)
    return text

In [0]:
df['review']=df['review'].apply(clean_text)

In [72]:
df.head()

Unnamed: 0,review,rating,sentiment
1,came early disappointed love planet wise bag ...,5,1
2,soft comfortable warmer look fit full size b...,5,1
3,product well worth purchase found anything e...,5,1
4,kid cried non stop tried ween pacifier found ...,5,1
5,binky fairy came house special gift book help...,5,1


In [108]:
df['sentiment'].value_counts()

1    139582
0     26393
Name: sentiment, dtype: int64

In [0]:
from imblearn.over_sampling import SMOTE

###  Vectorizer



In [0]:
cov=CountVectorizer()
tf=TfidfVectorizer()

In [75]:
x1=cov.fit_transform(df['review'])
x1.shape

(165975, 59022)

In [76]:
x2=tf.fit_transform(df['review'])
x2.shape

(165975, 59022)

In [0]:
y=df['sentiment']

In [0]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit

In [0]:
cv=ShuffleSplit(n_splits=6,test_size=0.3,random_state=42)

### Trying different classification algorithms

In [0]:
from sklearn.naive_bayes import MultinomialNB

In [0]:
from xgboost import XGBClassifier

### Checking accuracy with cross val score

In [84]:
print(cross_val_score(MultinomialNB(),x1,y,cv=cv,scoring='accuracy'))

[0.89954411 0.89982528 0.89590906 0.89596931 0.89801779 0.89773663]


In [85]:
print(cross_val_score(XGBClassifier(),x1,y,cv=cv,scoring='accuracy'))

[0.88042496 0.88281485 0.88046513 0.87889864 0.88036471 0.88028438]


In [86]:
print(cross_val_score(MultinomialNB(),x2,y,cv=cv,scoring='accuracy'))

[0.84783002 0.85180648 0.84921575 0.84640411 0.84857309 0.84971783]


In [87]:
print(cross_val_score(XGBClassifier(),x2,y,cv=cv,scoring='accuracy'))

[0.88028438 0.88189103 0.88006346 0.87893881 0.88034463 0.87984255]


### Count Vectorizer with MultinomialNB provides better accuracy of 89%


In [0]:
x_train,x_test,y_train,y_test=train_test_split(x1,y,test_size=0.3,random_state=42)

In [0]:
model=MultinomialNB()

In [103]:
model.fit(x_train,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [0]:
prediction=model.predict(x_test)

In [0]:
from sklearn.metrics import classification_report,confusion_matrix

In [106]:
print(classification_report(prediction,y_test))

              precision    recall  f1-score   support

           0       0.65      0.70      0.67      7306
           1       0.95      0.93      0.94     42487

    accuracy                           0.90     49793
   macro avg       0.80      0.82      0.81     49793
weighted avg       0.90      0.90      0.90     49793



In [107]:
print(confusion_matrix(prediction,y_test))

[[ 5113  2193]
 [ 2809 39678]]


### As it is a imbalanced dataset trying for oversampling


In [0]:
sm=SMOTE()

In [112]:
x1_new,y1_new=sm.fit_resample(x1,y)



In [113]:
x2_new,y2_new=sm.fit_resample(x2,y)



### Trying ML algorithms and checking accuracy


In [114]:
print(cross_val_score(MultinomialNB(),x1_new,y1_new,cv=cv,scoring='accuracy'))

[0.81915224 0.81684776 0.82032239 0.82047761 0.86993433 0.86967164]


In [115]:
print(cross_val_score(MultinomialNB(),x2_new,y2_new,cv=cv,scoring='accuracy'))

[0.87873433 0.87467463 0.87917612 0.87830448 0.87882985 0.88007164]


In [116]:
print(cross_val_score(XGBClassifier(),x1_new,y1_new,cv=cv,scoring='accuracy'))

[0.91628657 0.91682388 0.91676418 0.91609552 0.91789851 0.91515224]


In [117]:
print(cross_val_score(XGBClassifier(),x2_new,y2_new,cv=cv,scoring='accuracy'))

[0.87305075 0.87151045 0.87436418 0.87478209 0.8726806  0.87357612]


### Building a model,Checking its accuracy and saving it.


In [0]:
model=XGBClassifier()

In [0]:
x_train,x_test,y_train,y_test=train_test_split(x1_new,y1_new,test_size=0.3,random_state=42)

In [122]:
model.fit(x_train,y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [0]:
prediction=model.predict(x_test)

In [124]:
print(confusion_matrix(prediction,y_test))

[[35427   391]
 [ 6620 41312]]


In [125]:
print(classification_report(prediction,y_test))

              precision    recall  f1-score   support

           0       0.84      0.99      0.91     35818
           1       0.99      0.86      0.92     47932

    accuracy                           0.92     83750
   macro avg       0.92      0.93      0.92     83750
weighted avg       0.93      0.92      0.92     83750



### Accuracy of model increased to 92%


In [0]:
import pickle

In [0]:
with open('amazon_sentiment','wb') as f:
  pickle.dump(model,f)

In [0]:
def predict(a):
  a=clean_text(a)
  x=cov.transform([a])
  if model.predict(x):
    return "Positive"
  else:
    return "Negative"

In [149]:
ans=predict("I have returned this item 15 days back but no one came to pick up the return order")
print(ans)

Negative


In [155]:
ans=predict("Awesome device, and hats off to amazon for such speedy delivery.I ordered it on 16th May 12PM and it got delivered by 17th May 10AM And so far I m absolutely loving this device, buttery smooth and blazing fast. But its a bit big, afterall 6.7 inches is quite a number and super slippery if used without the case.")
print(ans)

Positive


In [156]:
ans=predict("Waste of money this time one plus 7 pro heavy weight not disply gud compare to samsung, also inbuilt screen guard getting with bubles and in youtube show wireless headphone and 2 axtra cover with phone but not recived this time really waste of money")
print(ans)

Negative
