In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# NN 
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout,Embedding,SimpleRNN,LSTM, GRU,Bidirectional

# preprocessing
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence

In [2]:
df = pd.read_csv("/kaggle/input/review/review.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,review,sentiment
0,0,Not sure who was more lost - the flat characte...,0
1,1,Attempting artiness with black & white and cle...,0
2,2,Very little music or anything to speak of.,0
3,3,The best scene in the movie was when Gerardo i...,1
4,4,"The rest of the movie lacks art, charm, meanin...",0


In [3]:
# feature target split
x = df["review"]
y = df["sentiment"]

In [4]:
xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size=0.3,random_state=1)

In [5]:
# Keras tokenization

tok = Tokenizer()
tok.fit_on_texts(xtrain)

In [6]:
vocabulary = tok.index_word
vocab_length = len(vocabulary)
vocab_length

2486

In [7]:
print(vocabulary)



In [8]:
# Sequence
train_sequence = tok.texts_to_sequences(xtrain)
print(train_sequence)

[[7, 912, 60, 913, 529, 16, 914, 2, 530, 2, 4, 531, 44, 915, 532, 102], [6, 5, 3, 377, 12], [18, 7, 5, 378, 916], [1, 533, 31, 917, 47, 918, 2, 66, 5, 29, 132, 7, 919, 534, 379, 30, 81, 535, 920, 3, 921, 922, 1, 923, 9, 244, 380, 381, 8, 23, 69, 297, 30, 107, 4, 6, 13, 2, 382, 21, 9, 245, 924, 213, 3, 925, 536, 9, 926, 537, 927], [1, 12, 44, 188, 62, 246, 71, 10, 7, 2, 32, 115, 214], [1, 928, 71, 19, 133, 929, 189, 34, 383, 930, 134, 247, 931, 932, 31, 384, 14, 19, 76, 933, 934, 4, 1, 935, 385], [936, 77, 3, 538, 18, 8, 91, 6, 5, 24, 4, 1, 63, 67, 78], [8, 116, 21, 937, 9, 42, 117, 7, 15, 32, 215, 15, 539, 4, 938, 8, 939], [22, 29, 26, 22, 147, 135, 216, 23, 15, 14, 540], [6, 12, 5, 82, 541], [6, 12, 5, 29, 386], [542, 940, 11, 387, 388], [22, 136, 543, 14, 6, 72, 13, 11, 65, 10, 941, 17, 544, 298, 108, 73, 942, 943, 10, 1, 944, 945, 217, 2, 299, 529, 31, 14, 39], [10, 248, 8, 148, 7, 74, 61, 946, 16, 3, 947, 2, 8, 148, 6, 948, 949, 950, 74, 61, 951, 389], [8, 300, 952, 1, 162, 39, 8, 

In [9]:
# Length of all documents
doc_length = []
for doc in train_sequence:
  doc_length.append(len(doc))

In [10]:
max(doc_length)

69

In [11]:
# 90% quantile
# 90% document length is less than or equal to 30
np.quantile(doc_length, 0.9)

27.0

In [12]:
# 99% quantile
# 99% document length is less than or equal to 45
np.quantile(doc_length, 0.99)

45.0

In [13]:
max_length = 45

In [14]:
# Padding
train_matrix = sequence.pad_sequences(train_sequence,maxlen=max_length)
train_matrix

array([[  0,   0,   0, ..., 915, 532, 102],
       [  0,   0,   0, ...,   3, 377,  12],
       [  0,   0,   0, ...,   5, 378, 916],
       ...,
       [  0,   0,   0, ...,   3,  85,  12],
       [  0,   0,   0, ...,  20,   2,  20],
       [  0,   0,   0, ...,  22, 454, 840]], dtype=int32)

In [15]:
# Testing data
test_sequence = tok.texts_to_sequences(xtest)
test_matrix = sequence.pad_sequences(test_sequence,maxlen=max_length)
test_matrix

array([[  0,   0,   0, ...,   3, 388, 111],
       [  0,   0,   0, ...,   9, 101,  77],
       [  0,   0,   0, ...,  42,  35,  15],
       ...,
       [  0,   0,   0, ...,   3,  35, 222],
       [  0,   0,   0, ...,  22,  23, 551],
       [  0,   0,   0, ...,  12,   5, 128]], dtype=int32)

## Simple RNN

In [16]:
# model
model = Sequential()
model.add(Embedding(input_dim=vocab_length+1, # vocabulary length, +1 -> 0 tokens
                    input_length=max_length,  # max length of document -> 45
                    output_dim=100,           # hyperparameter -> vector length of each token
                    mask_zero=True))          # do not learn for 0 token
model.add(SimpleRNN(32))
model.add(Dense(32,activation="relu"))
model.add(Dense(16,activation="relu"))
model.add(Dense(1,activation="sigmoid"))

In [17]:
model.compile(optimizer="adam",loss="binary_crossentropy")
model.fit(train_matrix,ytrain,epochs=20,batch_size=32)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f96b45dea90>

In [18]:
# prediction
y_pred = model.predict(test_matrix)
y_pred = np.where(y_pred >= 0.5, 1, 0)
print(classification_report(ytest,y_pred))

              precision    recall  f1-score   support

           0       0.61      0.54      0.57       151
           1       0.58      0.64      0.61       149

    accuracy                           0.59       300
   macro avg       0.59      0.59      0.59       300
weighted avg       0.59      0.59      0.59       300



## Bidirectional RNN

In [19]:
# model
model2 = Sequential()
model2.add(Embedding(input_dim=vocab_length+1, # vocabulary length, +1 -> 0 tokens
                    input_length=max_length,  # max length of document -> 45
                    output_dim=100,           # hyperparameter -> vector length of each token
                    mask_zero=True))          # do not learn for 0 token
model2.add(Bidirectional(SimpleRNN(32)))
model2.add(Dense(32,activation="relu"))
model2.add(Dense(16,activation="relu"))
model2.add(Dense(1,activation="sigmoid"))

In [20]:
model2.compile(optimizer="adam",loss="binary_crossentropy")
model2.fit(train_matrix,ytrain,epochs=20,batch_size=32)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f964678e410>

In [21]:
# prediction
y_pred = model2.predict(test_matrix)
y_pred = np.where(y_pred >= 0.5, 1, 0)
print(classification_report(ytest,y_pred))

              precision    recall  f1-score   support

           0       0.74      0.66      0.70       151
           1       0.69      0.77      0.73       149

    accuracy                           0.71       300
   macro avg       0.72      0.71      0.71       300
weighted avg       0.72      0.71      0.71       300



## Multi-layer RNN

In [22]:
# model
model3 = Sequential()
model3.add(Embedding(input_dim=vocab_length+1, # vocabulary length, +1 -> 0 tokens
                    input_length=max_length,  # max length of document -> 45
                    output_dim=100,           # hyperparameter -> vector length of each token
                    mask_zero=True))          # do not learn for 0 token
model3.add(SimpleRNN(32, return_sequences=True))
model3.add(SimpleRNN(32, return_sequences=True))
model3.add(SimpleRNN(32))
model3.add(Dense(32,activation="relu"))
model3.add(Dense(16,activation="relu"))
model3.add(Dense(1,activation="sigmoid"))

In [23]:
model3.compile(optimizer="adam",loss="binary_crossentropy")
model3.fit(train_matrix,ytrain,epochs=20,batch_size=32)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f9645dcdd10>

In [24]:
# prediction
y_pred = model3.predict(test_matrix)
y_pred = np.where(y_pred >= 0.5, 1, 0)
print(classification_report(ytest,y_pred))

              precision    recall  f1-score   support

           0       0.53      0.59      0.56       151
           1       0.53      0.47      0.50       149

    accuracy                           0.53       300
   macro avg       0.53      0.53      0.53       300
weighted avg       0.53      0.53      0.53       300



## Bidirectional Multilayer

In [25]:
# model
model4 = Sequential()
model4.add(Embedding(input_dim=vocab_length+1, # vocabulary length, +1 -> 0 tokens
                    input_length=max_length,  # max length of document -> 45
                    output_dim=100,           # hyperparameter -> vector length of each token
                    mask_zero=True))          # do not learn for 0 token
model4.add(Bidirectional(SimpleRNN(32, return_sequences=True)))
model4.add(Bidirectional(SimpleRNN(32)))
model4.add(Dense(32,activation="relu"))
model4.add(Dense(16,activation="relu"))
model4.add(Dense(1,activation="sigmoid"))

In [26]:
model4.compile(optimizer="adam",loss="binary_crossentropy")
model4.fit(train_matrix,ytrain,epochs=20,batch_size=32)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f96455d4ad0>

In [27]:
# prediction
y_pred = model4.predict(test_matrix)
y_pred = np.where(y_pred >= 0.5, 1, 0)
print(classification_report(ytest,y_pred))

              precision    recall  f1-score   support

           0       0.72      0.62      0.66       151
           1       0.66      0.75      0.70       149

    accuracy                           0.68       300
   macro avg       0.69      0.68      0.68       300
weighted avg       0.69      0.68      0.68       300



## LSTM

In [28]:
# model
model5 = Sequential()
model5.add(Embedding(input_dim=vocab_length+1, # vocabulary length, +1 -> 0 tokens
                    input_length=max_length,  # max length of document -> 45
                    output_dim=100,           # hyperparameter -> vector length of each token
                    mask_zero=True))          # do not learn for 0 token
model5.add(LSTM(64))
model5.add(Dense(64,activation="relu"))
model5.add(Dense(32,activation="relu"))
model5.add(Dense(1,activation="sigmoid"))

In [29]:
model5.compile(optimizer="adam",loss="binary_crossentropy")
model5.fit(train_matrix,ytrain,epochs=20,batch_size=32)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f960ae051d0>

In [30]:
# prediction
y_pred = model5.predict(test_matrix)
y_pred = np.where(y_pred >= 0.5, 1, 0)
print(classification_report(ytest,y_pred))

              precision    recall  f1-score   support

           0       0.77      0.76      0.76       151
           1       0.76      0.77      0.76       149

    accuracy                           0.76       300
   macro avg       0.76      0.76      0.76       300
weighted avg       0.76      0.76      0.76       300



## Bidirectional LSTM

In [31]:
# model
model6 = Sequential()
model6.add(Embedding(input_dim=vocab_length+1, # vocabulary length, +1 -> 0 tokens
                    input_length=max_length,  # max length of document -> 45
                    output_dim=100,           # hyperparameter -> vector length of each token
                    mask_zero=True))          # do not learn for 0 token
model6.add(Bidirectional(LSTM(64)))
model6.add(Dense(64,activation="relu"))
model6.add(Dense(64,activation="relu"))
model6.add(Dense(1,activation="sigmoid"))

In [32]:
model6.compile(optimizer="adam",loss="binary_crossentropy")
model6.fit(train_matrix,ytrain,epochs=20,batch_size=32)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f960b2f0690>

In [33]:
# prediction
y_pred = model6.predict(test_matrix)
y_pred = np.where(y_pred >= 0.5, 1, 0)
print(classification_report(ytest,y_pred))

              precision    recall  f1-score   support

           0       0.78      0.72      0.75       151
           1       0.74      0.80      0.77       149

    accuracy                           0.76       300
   macro avg       0.76      0.76      0.76       300
weighted avg       0.76      0.76      0.76       300



## Multi layer LSTM

In [34]:
# model
model7 = Sequential()
model7.add(Embedding(input_dim=vocab_length+1, # vocabulary length, +1 -> 0 tokens
                    input_length=max_length,  # max length of document -> 45
                    output_dim=100,           # hyperparameter -> vector length of each token
                    mask_zero=True))          # do not learn for 0 token
model7.add(LSTM(64, return_sequences=True))
model7.add(LSTM(64))
model7.add(Dense(64,activation="relu"))
model7.add(Dense(64,activation="relu"))
model7.add(Dense(1,activation="sigmoid"))

In [35]:
model7.compile(optimizer="adam",loss="binary_crossentropy")
model7.fit(train_matrix,ytrain,epochs=20,batch_size=32)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f9609db7a90>

In [36]:
# prediction
y_pred = model7.predict(test_matrix)
y_pred = np.where(y_pred >= 0.5, 1, 0)
print(classification_report(ytest,y_pred))

              precision    recall  f1-score   support

           0       0.75      0.74      0.74       151
           1       0.74      0.74      0.74       149

    accuracy                           0.74       300
   macro avg       0.74      0.74      0.74       300
weighted avg       0.74      0.74      0.74       300



## GRU

In [37]:
# model
model8 = Sequential()
model8.add(Embedding(input_dim=vocab_length+1, # vocabulary length, +1 -> 0 tokens
                    input_length=max_length,  # max length of document -> 45
                    output_dim=100,           # hyperparameter -> vector length of each token
                    mask_zero=True))          # do not learn for 0 token
model8.add(GRU(32))
model8.add(Dense(32,activation="relu"))
model8.add(Dense(16,activation="relu"))
model8.add(Dense(1,activation="sigmoid"))

In [38]:
model8.compile(optimizer="adam",loss="binary_crossentropy")
model8.fit(train_matrix,ytrain,epochs=20,batch_size=32)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f95f74bafd0>

In [39]:
# prediction
y_pred = model8.predict(test_matrix)
y_pred = np.where(y_pred >= 0.5, 1, 0)
print(classification_report(ytest,y_pred))

              precision    recall  f1-score   support

           0       0.71      0.75      0.73       151
           1       0.73      0.68      0.71       149

    accuracy                           0.72       300
   macro avg       0.72      0.72      0.72       300
weighted avg       0.72      0.72      0.72       300

