In [1]:
from tensorflow.keras.layers import SimpleRNN, LSTM, GRU, Bidirectional, Dense, Embedding
from tensorflow.keras.datasets import imdb
from tensorflow.keras.models import Sequential
import numpy as np

In [15]:
# Getting reviews with words that come under 5000, most occuring words in the entire corpus

vocab_size = 5000
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=vocab_size)

print(x_train[0])

[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 2, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 2, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 2, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 2, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 2, 19, 178, 32]


In [16]:
# getting the index of all the words
word_idx = imdb.get_word_index()

# Originally the index number of a value and not a key,
# hence converting the index as key and the words as values
word_idx = {i: word for word, i in word_idx.items()}

# again printing the review
print([word_idx[i] for i in x_train[0]])

['the', 'as', 'you', 'with', 'out', 'themselves', 'powerful', 'lets', 'loves', 'their', 'becomes', 'reaching', 'had', 'journalist', 'of', 'lot', 'from', 'anyone', 'to', 'have', 'after', 'out', 'atmosphere', 'never', 'more', 'room', 'and', 'it', 'so', 'heart', 'shows', 'to', 'years', 'of', 'every', 'never', 'going', 'and', 'help', 'moments', 'or', 'of', 'every', 'chest', 'visual', 'movie', 'except', 'her', 'was', 'several', 'of', 'enough', 'more', 'with', 'is', 'now', 'current', 'film', 'as', 'you', 'of', 'mine', 'potentially', 'unfortunately', 'of', 'you', 'than', 'him', 'that', 'with', 'out', 'themselves', 'her', 'get', 'for', 'was', 'camp', 'of', 'you', 'movie', 'sometimes', 'movie', 'that', 'with', 'scary', 'but', 'and', 'to', 'story', 'wonderful', 'that', 'in', 'seeing', 'in', 'character', 'to', 'of', '70s', 'and', 'with', 'heart', 'had', 'shadows', 'they', 'of', 'here', 'that', 'with', 'her', 'serious', 'to', 'have', 'does', 'when', 'from', 'why', 'what', 'have', 'critics', 'they'

In [17]:
# Get the minimum and the maximum length of reviews
print("Max length of a review:: ", len(max((x_train+x_test), key=len)))
print("Min length of a review:: ", len(min((x_train+x_test), key=len)))

Max length of a review::  2697
Min length of a review::  70


Since now we know the minimum and maximum word count of a reivew, we need to pad the words with less number of reviews since a NN expects all the input in a fixed size.

In [18]:
from tensorflow.keras.preprocessing import sequence

# Keeping a fixed length of all reviews to max 400 words
max_words = 400

x_train = sequence.pad_sequences(x_train, maxlen=max_words)
x_test = sequence.pad_sequences(x_test, maxlen=max_words)

x_valid, y_valid = x_train[:64], y_train[:64]
x_train_, y_train_ = x_train[64:], y_train[64:]

In [19]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense

# Define parameters
embd_len = 32

# Create the RNN model
model = Sequential(name='Simple_RNN')
model.add(Embedding(vocab_size, embd_len, input_length=max_words))
model.add(SimpleRNN(128, activation='tanh', return_sequences=False))
model.add(Dense(1, activation='sigmoid'))

# build the model
model.build(input_shape=(None, max_words))  # Batch size is None, sequence length is max_words

# Print the model summary
print(model.summary())

None


In [20]:
# compile the model
model.compile(loss='binary_crossentropy',
              optimizer = 'adam',
              metrics = ['accuracy'])

In [21]:
# Train the model
history = model.fit(
    x_train_, y_train_,
    batch_size = 64,
    epochs = 5,
    verbose=1,
    validation_data = (x_valid, y_valid)
    )

Epoch 1/5
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 114ms/step - accuracy: 0.5155 - loss: 0.6960 - val_accuracy: 0.6406 - val_loss: 0.6739
Epoch 2/5
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 113ms/step - accuracy: 0.6218 - loss: 0.6486 - val_accuracy: 0.6094 - val_loss: 0.6525
Epoch 3/5
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 117ms/step - accuracy: 0.6870 - loss: 0.5842 - val_accuracy: 0.7344 - val_loss: 0.6352
Epoch 4/5
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 113ms/step - accuracy: 0.7347 - loss: 0.5197 - val_accuracy: 0.7969 - val_loss: 0.4906
Epoch 5/5
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 117ms/step - accuracy: 0.8013 - loss: 0.4365 - val_accuracy: 0.5938 - val_loss: 0.6717


In [22]:
print("Simple RNN Score: ", model.evaluate(x_test, y_test, verbose=0))

Simple RNN Score:  [0.6127382516860962, 0.6550400257110596]


In [23]:
# Defining GRU model
gru_model = Sequential(name="GRU_Model")
gru_model.add(Embedding(vocab_size,
						embd_len,
						input_length=max_words))
gru_model.add(GRU(128,
				activation='tanh',
				return_sequences=False))
gru_model.add(Dense(1, activation='sigmoid'))

# Printing the Summary
print(gru_model.summary())

# Compiling the model
gru_model.compile(
	loss="binary_crossentropy",
	optimizer='adam',
	metrics=['accuracy']
)

# Training the GRU model
history2 = gru_model.fit(x_train_, y_train_,
						batch_size=64,
						epochs=5,
						verbose=1,
						validation_data=(x_valid, y_valid))

# Printing model score on test data
print()
print("GRU model Score---> ", gru_model.evaluate(x_test, y_test, verbose=0))



None
Epoch 1/5
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m175s[0m 443ms/step - accuracy: 0.6254 - loss: 0.6193 - val_accuracy: 0.7031 - val_loss: 0.5371
Epoch 2/5
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m201s[0m 441ms/step - accuracy: 0.8313 - loss: 0.3795 - val_accuracy: 0.8906 - val_loss: 0.2177
Epoch 3/5
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m202s[0m 442ms/step - accuracy: 0.8936 - loss: 0.2692 - val_accuracy: 0.9219 - val_loss: 0.2014
Epoch 4/5
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m203s[0m 444ms/step - accuracy: 0.9199 - loss: 0.2130 - val_accuracy: 0.9219 - val_loss: 0.2394
Epoch 5/5
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m173s[0m 443ms/step - accuracy: 0.9421 - loss: 0.1614 - val_accuracy: 0.9375 - val_loss: 0.1901

GRU model Score--->  [0.29444757103919983, 0.8869600296020508]
