In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, SimpleRNN, Dropout, LSTM, Embedding, GRU
import nltk
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to C:\Users\Roshni
[nltk_data]     Gupta\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Roshni
[nltk_data]     Gupta\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Roshni
[nltk_data]     Gupta\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
df=pd.read_csv("amazonreviews.tsv",sep="\t")
df.head(10)

Unnamed: 0,label,review
0,pos,Stuning even for the non-gamer: This sound tra...
1,pos,The best soundtrack ever to anything.: I'm rea...
2,pos,Amazing!: This soundtrack is my favorite music...
3,pos,Excellent Soundtrack: I truly like this soundt...
4,pos,"Remember, Pull Your Jaw Off The Floor After He..."
5,pos,an absolute masterpiece: I am quite sure any o...
6,neg,"Buyer beware: This is a self-published book, a..."
7,pos,Glorious story: I loved Whisper of the wicked ...
8,pos,A FIVE STAR BOOK: I just finished reading Whis...
9,pos,Whispers of the Wicked Saints: This was a easy...


In [3]:
df.label.value_counts()

neg    5097
pos    4903
Name: label, dtype: int64

In [4]:
df["label"].replace({"neg":0,"pos":1},inplace=True)

In [5]:
def cleantext(text):
    tokens=word_tokenize(text.lower())
    wordtoken=[t for t in tokens if t.isalpha()]
    stop=stopwords.words("english")
    cleantoken=[t for t in wordtoken if(t not in stop)]
    lemma=WordNetLemmatizer()
    lemmatoken=[lemma.lemmatize(t) for t in cleantoken]
    
    return " ".join(lemmatoken)

In [6]:
df["review"]

0       Stuning even for the non-gamer: This sound tra...
1       The best soundtrack ever to anything.: I'm rea...
2       Amazing!: This soundtrack is my favorite music...
3       Excellent Soundtrack: I truly like this soundt...
4       Remember, Pull Your Jaw Off The Floor After He...
                              ...                        
9995    A revelation of life in small town America in ...
9996    Great biography of a very interesting journali...
9997    Interesting Subject; Poor Presentation: You'd ...
9998    Don't buy: The box looked used and it is obvio...
9999    Beautiful Pen and Fast Delivery.: The pen was ...
Name: review, Length: 10000, dtype: object

In [7]:
df["review"]=df["review"].apply(cleantext)

In [8]:
x=df["review"]
y=df["label"]

In [9]:
xtrain,xtest,ytrain,ytest=train_test_split(x,y,test_size=0.3,random_state=1,stratify=y)

In [10]:
sentlen=[]
for sent in df["review"]:
    sentlen.append(len(word_tokenize(sent)))
    
df["sent_len"]=sentlen
df.head()


Unnamed: 0,label,review,sent_len
0,1,stuning even sound track beautiful paint sener...,42
1,1,best soundtrack ever anything reading lot revi...,44
2,1,amazing soundtrack favorite music time hand in...,67
3,1,excellent soundtrack truly like soundtrack enj...,68
4,1,remember pull jaw floor hearing played game kn...,46


In [11]:
max(sentlen)

140

In [12]:
max_len=np.quantile(sentlen,0.95)
max_len

79.0

In [13]:
tok=Tokenizer(char_level=False,split=" ")
tok.fit_on_texts(xtrain)
tok.index_word

{1: 'book',
 2: 'one',
 3: 'movie',
 4: 'like',
 5: 'read',
 6: 'good',
 7: 'great',
 8: 'would',
 9: 'time',
 10: 'get',
 11: 'story',
 12: 'really',
 13: 'make',
 14: 'much',
 15: 'work',
 16: 'could',
 17: 'even',
 18: 'first',
 19: 'love',
 20: 'well',
 21: 'buy',
 22: 'year',
 23: 'best',
 24: 'product',
 25: 'thing',
 26: 'better',
 27: 'character',
 28: 'way',
 29: 'think',
 30: 'also',
 31: 'film',
 32: 'people',
 33: 'cd',
 34: 'little',
 35: 'know',
 36: 'ever',
 37: 'many',
 38: 'see',
 39: 'want',
 40: 'go',
 41: 'never',
 42: 'bad',
 43: 'life',
 44: 'say',
 45: 'new',
 46: 'song',
 47: 'reading',
 48: 'album',
 49: 'back',
 50: 'money',
 51: 'got',
 52: 'music',
 53: 'use',
 54: 'find',
 55: 'still',
 56: 'made',
 57: 'bought',
 58: 'recommend',
 59: 'two',
 60: 'game',
 61: 'give',
 62: 'review',
 63: 'found',
 64: 'dvd',
 65: 'thought',
 66: 'look',
 67: 'take',
 68: 'old',
 69: 'lot',
 70: 'day',
 71: 'ca',
 72: 'star',
 73: 'version',
 74: 'another',
 75: 'need',
 76:

In [14]:
vocab_len=len(tok.index_word)
vocab_len

22174

In [15]:
seq_train=tok.texts_to_sequences(xtrain)
seq_train

[[519,
  225,
  1,
  272,
  881,
  1137,
  95,
  268,
  519,
  881,
  11630,
  268,
  1,
  17,
  76,
  193,
  11631,
  95,
  41,
  4155],
 [39,
  118,
  620,
  7,
  257,
  7,
  231,
  115,
  231,
  3057,
  5115,
  64,
  30,
  1090,
  255,
  1138,
  3483,
  5116,
  1352,
  1673,
  8543,
  8544,
  1718,
  88],
 [312,
  345,
  478,
  457,
  62,
  2886,
  83,
  1,
  1396,
  603,
  457,
  6,
  62,
  78,
  1,
  10,
  37,
  316,
  115,
  13,
  2358,
  88,
  105,
  495,
  399,
  2707,
  1781,
  345,
  51,
  184,
  882,
  27,
  1165,
  4554,
  5877,
  1674,
  387,
  4555,
  1907,
  101,
  305,
  387,
  4555,
  12,
  101,
  305,
  2260,
  731,
  6915,
  2260,
  279,
  560,
  5117,
  6916,
  747,
  2590,
  5878,
  11632,
  664,
  1191,
  440,
  1433,
  3484,
  368,
  17,
  11633,
  6917,
  11634,
  5118,
  664,
  1575,
  204,
  1985,
  2591,
  5879],
 [11635,
  2708,
  4556,
  11636,
  11637,
  11638,
  8545,
  2709,
  11639,
  2708,
  832,
  2476,
  2168,
  11640,
  11641,
  11642,
  11643,
  11

In [16]:
# 272 , 881, 1137
# ordered introduction social
tok.index_word.get(1137)

'social'

In [17]:
seq_mat_train=sequence.pad_sequences(seq_train,maxlen=int(max_len))
seq_mat_train

array([[   0,    0,    0, ...,   95,   41, 4155],
       [   0,    0,    0, ..., 8544, 1718,   88],
       [   0,    0,    0, ..., 1985, 2591, 5879],
       ...,
       [   0,    0,    0, ...,  373,    9,  585],
       [   0,    0,    0, ..., 4476,    1,   28],
       [   0,    0,    0, ..., 1014,  373,  331]])

In [18]:
rnn = Sequential()

rnn.add(Embedding(vocab_len+1,700,input_length=int(max_len),mask_zero=True))
rnn.add(GRU(units=32,activation="tanh"))
rnn.add(Dense(units=32,activation="relu"))
rnn.add(Dropout(0.2))
rnn.add(Dense(units=1,activation="sigmoid"))
rnn.compile(optimizer="adam",loss="binary_crossentropy")
rnn.fit(seq_mat_train,ytrain,batch_size=50,epochs=50)

seq_mat_test=sequence.pad_sequences(tok.texts_to_sequences(xtest),maxlen=int(max_len))
ypred=rnn.predict(seq_mat_test)
ypred=np.where(ypred>0.5,1,0)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [19]:
from sklearn.metrics import classification_report
print(classification_report(ytest,ypred))

              precision    recall  f1-score   support

           0       0.83      0.82      0.83      1529
           1       0.82      0.83      0.82      1471

    accuracy                           0.83      3000
   macro avg       0.83      0.83      0.83      3000
weighted avg       0.83      0.83      0.83      3000



In [23]:
# new observation test
def checkreview(review):
    vect=cleantext(review)
    newseq=tok.texts_to_sequences(vect)
    vect1=sequence.pad_sequences(newseq,maxlen=int(max_len))
    ypred=rnn.predict(vect1)
    ypreds=np.where(ypred>0.5,1,0)

In [30]:
new_amazon_review=("Very good service and good collection in stock. Almost all varities shown about daily need products. So afordable. We are satisfied.")

In [31]:
checkreview(new_amazon_review)

