In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

import nltk
from nltk.corpus import stopwords
from nltk import PorterStemmer as stemmer
from bs4 import BeautifulSoup 
import string
import re

import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv("C:/Users/khist/AmazonProductReview/reviews/Roborock_reviews.csv")

In [3]:
data.head()

Unnamed: 0,Rating,Review_title,Product_review
0,5.0 out of 5 stars,BEST ROB VAC (in this price range),I purchased the Roborock E4 over a month ago.
1,4.0 out of 5 stars,A True LIFE SAVER!!!!,I've only noticed a few minor issues with this...
2,5.0 out of 5 stars,Great value robot vac,We have had this little vac for almost two mo...
3,5.0 out of 5 stars,My new helper,"First of all, I used to be a person who saw a ..."
4,1.0 out of 5 stars,Gets stuck on EVERYTHING. Constant supervision...,Grabbed this on Cyber Monday and was really e...


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1378 entries, 0 to 1377
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Rating          1367 non-null   object
 1   Review_title    1378 non-null   object
 2   Product_review  1378 non-null   object
dtypes: object(3)
memory usage: 32.4+ KB


In [5]:
data.dropna(inplace = True)

In [6]:
def strip_rating(r):
    return str(r).split('.')[0]

In [7]:
data['Rating'] = data['Rating'].apply(strip_rating)

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1367 entries, 0 to 1366
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Rating          1367 non-null   object
 1   Review_title    1367 non-null   object
 2   Product_review  1367 non-null   object
dtypes: object(3)
memory usage: 42.7+ KB


In [9]:
data.head()

Unnamed: 0,Rating,Review_title,Product_review
0,5,BEST ROB VAC (in this price range),I purchased the Roborock E4 over a month ago.
1,4,A True LIFE SAVER!!!!,I've only noticed a few minor issues with this...
2,5,Great value robot vac,We have had this little vac for almost two mo...
3,5,My new helper,"First of all, I used to be a person who saw a ..."
4,1,Gets stuck on EVERYTHING. Constant supervision...,Grabbed this on Cyber Monday and was really e...


In [10]:
nltk.download('stopwords')
STOPWORDS=stopwords.words("english") #is, he, that, etc.

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\khist\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
def Remove_Emojify(review):
    return review.encode('ascii', 'ignore').decode('ascii') 

def clean_text(text):
    ps=stemmer()
    
    text = Remove_Emojify(text) # remove Emojis
    
    text_cleaned = "".join([x for x in text if x not in string.punctuation]) # remove punctuation
    
    text_cleaned = re.sub(' +', ' ', text_cleaned) # remove extra white spaces
    
    text_cleaned = text_cleaned.lower() # converting to lowercase
    
    tokens = text_cleaned.split(" ")
    tokens = [token for token in tokens if token not in STOPWORDS] # Taking only those words which are not stopwords
    text_cleaned = " ".join([ps.stem(token) for token in tokens])
    
    return text_cleaned

data['cleaned_review']=data['Product_review'].apply(lambda x:clean_text(x))

In [12]:
data.head()

Unnamed: 0,Rating,Review_title,Product_review,cleaned_review
0,5,BEST ROB VAC (in this price range),I purchased the Roborock E4 over a month ago.,purchas roborock e4 month ago
1,4,A True LIFE SAVER!!!!,I've only noticed a few minor issues with this...,ive notic minor issu vacuum sometim doesnt ent...
2,5,Great value robot vac,We have had this little vac for almost two mo...,littl vac almost two month thought time review...
3,5,My new helper,"First of all, I used to be a person who saw a ...",first use person saw robot vacuum worth coupl ...
4,1,Gets stuck on EVERYTHING. Constant supervision...,Grabbed this on Cyber Monday and was really e...,grab cyber monday realli excit first clean ses...


In [13]:
df = data[['Rating','cleaned_review']] 
df['Rating'] = df['Rating'].apply(lambda x: 'neg' if int(x) <= 3 else 'pos')
df.columns = ['label', 'text']
df = pd.concat([df, df.label.astype('str').str.get_dummies()], axis=1, sort=False)
df = df[['text', 'neg','pos']]
df.head()

Unnamed: 0,text,neg,pos
0,purchas roborock e4 month ago,0,1
1,ive notic minor issu vacuum sometim doesnt ent...,0,1
2,littl vac almost two month thought time review...,0,1
3,first use person saw robot vacuum worth coupl ...,0,1
4,grab cyber monday realli excit first clean ses...,1,0


In [14]:
import ktrain
from ktrain import text

In [15]:
trn, val, preproc = text.texts_from_df(df, 
                                       'text', # name of column containing review text
                                       label_columns=['neg','pos'],
                                       maxlen=75, 
                                       max_features=100000,
                                       preprocess_mode='bert',
                                       val_pct=0.1)

['neg', 'pos']
      neg  pos
646     0    1
704     0    1
1192    0    1
894     0    1
1082    0    1
['neg', 'pos']
      neg  pos
1009    0    1
151     0    1
41      0    1
990     1    0
1188    0    1
preprocessing train...
language: en


Is Multi-Label? False
preprocessing test...
language: en


In [16]:
model = text.text_classifier('bert', train_data=trn, preproc=preproc)

Is Multi-Label? False
maxlen is 75
done.


In [26]:
learner = ktrain.get_learner(model, train_data=trn, batch_size=15)

In [21]:
#learner.lr_find(max_epochs=5)
#learner.lr_plot()

In [23]:
#learner.validate(val_data=val)

In [27]:
learner.fit_onecycle(lr = 2e-5, epochs = 5)



begin training using onecycle policy with max lr of 2e-05...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1ce3e59c148>

In [28]:
learner.validate(val_data=val)

              precision    recall  f1-score   support

           0       0.69      0.59      0.63        41
           1       0.83      0.89      0.86        96

    accuracy                           0.80       137
   macro avg       0.76      0.74      0.75       137
weighted avg       0.79      0.80      0.79       137



array([[24, 17],
       [11, 85]], dtype=int64)

In [29]:
predictor = ktrain.get_predictor(learner.model, preproc)

In [34]:
data = ['waste of money', 
        'beautiful as a gift',
        'I use it all day, everyday',
       'Worst product']

In [35]:
predictor.predict(data)

['neg', 'pos', 'pos', 'neg']

In [38]:
predictor.save('C:/Users/khist/Documents/GitHub/Amazon-Product-Review/my_predictor')

In [49]:
# save Predictor (i.e., model and Preprocessor instance) after partially training
predictor.save('C:/Users/khist/Documents/GitHub/Amazon-Product-Review/my_predictor')



In [50]:
# reload Predictor and extract model
model = ktrain.load_predictor('C:/Users/khist/Documents/GitHub/Amazon-Product-Review/my_predictor').model
preproc = ktrain.load_predictor('C:/Users/khist/Documents/GitHub/Amazon-Product-Review/my_predictor').preproc

predictor = ktrain.get_predictor(model, preproc)

In [51]:
predictor.predict(data)

['neg', 'pos', 'pos', 'neg']

In [52]:
import ktrain
from ktrain import text