# Simple RNN project using IMDB dataset to classify the reviews as positive or negative.

## Importing the Libraries

In [1]:
import pandas as pd
import numpy as np
import sklearn as sk
import tensorflow as tf
from tensorflow.keras.datasets import imdb
from tensorflow.keras.models import Sequential
import tensorflow.keras.layers as Embedding
from tensorflow.keras.layers import SimpleRNN, Dense
from tensorflow.keras.preprocessing import sequence


## Loading the IMDB dataset

In [2]:
max_features = 10000 # we will select 10000 most frequent words from the dataset to train the model 
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)

# Print the shape of the data
print(f"Training data shape: {x_train.shape}"), print(f"Traning label shape: {y_test.shape}")
print(f"Test data shape: {x_test.shape}"), print(f"Test label shape: {y_test.shape}")
# The 25000 division is not defined by us, it is the default division of the dataset
# That 25000 samples will have more than 10000 words but we will only consider the 10000 most frequent words from x_train and x_test

Training data shape: (25000,)
Traning label shape: (25000,)
Test data shape: (25000,)
Test label shape: (25000,)


(None, None)

In [3]:
# Sample review
sample_review = x_train[0]
sample_label = y_train[0]
print(f"Sample review: {sample_review}")
print(f"Sample label: {sample_label}")

Sample review: [1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]
Sample label: 1


In [4]:
# Decode the review back to English
word_index = imdb.get_word_index() # it gets the word corresponding to the index in the dataset as it is encoded
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()]) # reverse the key-value pairs in the dictionary
decoded_review = ' '.join([reverse_word_index.get(i - 3, '?') for i in sample_review]) # get the word corresponding to the index in sample_review[0] and join them to form a sentence for first movie review
# this comes from tensorflow documentation
print(f"Decoded review: {decoded_review}")

Decoded review: ? this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert ? is an amazing actor and now the same being director ? father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for ? and would recommend it to everyone to watch and the fly fishing was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also ? to the two little boy's that played the ? of norman and paul they were just brilliant children are often left out of the ? list i think because the stars that play them all grown up are such a big profile for the whole film but these children are amazing and should be praised for what they have do

## Preprocessing the data

### Adding padding to the data to make it of same length

In [5]:
from tensorflow.keras.preprocessing import sequence
maxlen = 500 # we will truncate the reviews to 500 words , if exceeds 500 words we will truncate it and if less than 500 words we will pad it with zeros
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
x_test

array([[   0,    0,    0, ...,   14,    6,  717],
       [   0,    0,    0, ...,  125,    4, 3077],
       [  33,    6,   58, ...,    9,   57,  975],
       ...,
       [   0,    0,    0, ...,   21,  846, 5518],
       [   0,    0,    0, ..., 2302,    7,  470],
       [   0,    0,    0, ...,   34, 2005, 2643]], dtype=int32)

## Building the model

In [6]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
model = Sequential()
model.add(Embedding(10000, 128, input_length=500)) # for converting the words to vectors , each word will be mapped to a 128 dimensional vector
# these 128 dim vectors have words which are decided by the model itself and are not predefined
model.add(SimpleRNN(128, activation='relu')) # adding a simple RNN layer with 128 units
model.add(Dense(1, activation='sigmoid')) # adding a dense layer with 1 unit and sigmoid activation function as it is a binary classification problem



In [7]:
model.summary()

In [8]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

In [9]:
## Create an instance of EarlyStoppping Callback
from tensorflow.keras.callbacks import EarlyStopping
earlystopping=EarlyStopping(monitor='val_loss',patience=5,restore_best_weights=True)
earlystopping

<keras.src.callbacks.early_stopping.EarlyStopping at 0x1ec9fb8aba0>

In [10]:
history=model.fit(
    x_train, y_train,epochs=10,batch_size=32,
    validation_split=0.2,
    callbacks=[earlystopping]
)

Epoch 1/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m149s[0m 230ms/step - acc: 0.5845 - loss: 1181.8484 - val_acc: 0.6572 - val_loss: 0.6012
Epoch 2/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 224ms/step - acc: 0.7148 - loss: 147703.7969 - val_acc: 0.6118 - val_loss: 0.6386
Epoch 3/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m167s[0m 267ms/step - acc: 0.7047 - loss: 0.5492 - val_acc: 0.7136 - val_loss: 0.5593
Epoch 4/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m148s[0m 236ms/step - acc: 0.8239 - loss: 0.4173 - val_acc: 0.7912 - val_loss: 0.4629
Epoch 5/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m130s[0m 208ms/step - acc: 0.8954 - loss: 0.3082 - val_acc: 0.8120 - val_loss: 0.4249
Epoch 6/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m148s[0m 236ms/step - acc: 0.9278 - loss: 0.2373 - val_acc: 0.8240 - val_loss: 0.4146
Epoch 7/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

In [11]:
model.save('rnn_model.h5')

