# Text classification using simple rnn with IMDB dataset

In [6]:
import numpy as np
import tensorflow as tf 
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing import sequence 
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,SimpleRNN,Dense 

In [7]:
# loading the dataset
max_features=10000 # vocab_size

(x_train,y_train),(x_test,y_test)=imdb.load_data(num_words=max_features)


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
[1m17464789/17464789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 1us/step


In [8]:
print(f"Training data shape: {x_train.shape}")
print(f"Training labels shape: {y_train.shape}")
print(f"Testing data shape: {x_test.shape}")
print(f"Testing labels shape: {y_test.shape}")


Training data shape: (25000,)
Training labels shape: (25000,)
Testing data shape: (25000,)
Testing labels shape: (25000,)


In [16]:
sample_review=x_train[0]
sample_label=y_train[0]

print(f"Sample review: {sample_review}")
print(f"Sample label: {sample_label}")

# this is one hot representation based on vocab size
# and lable is 1 and 0 for positive and negative review


Sample review: [1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]
Sample label: 1


In [25]:
word_index = imdb.get_word_index()
word_index
# trying to decode the review data

reversed_word_index = dict([(value, key) for key, value in imdb.get_word_index().items()])
decoded_review = " ".join([reversed_word_index.get(i-3, "?") for i in sample_review])

In [26]:
decoded_review

"? this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert ? is an amazing actor and now the same being director ? father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for ? and would recommend it to everyone to watch and the fly fishing was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also ? to the two little boy's that played the ? of norman and paul they were just brilliant children are often left out of the ? list i think because the stars that play them all grown up are such a big profile for the whole film but these children are amazing and should be praised for what they have done don't you th

In [32]:
from tensorflow.keras.preprocessing import sequence

max_len=500

x_train=sequence.pad_sequences(x_train,maxlen=max_len)
x_test=sequence.pad_sequences(x_test,maxlen=max_len)

# padding add 0 and when we pass it through 0 then we 
# will converts them to our feature representation


In [33]:
x_train[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

In [45]:
# make model now 
# we already prep data first it will pas through embedding layer]

feature_dimension=128 
#max_features=10000 # vocab_size
#max_length=500 padding 

model=Sequential()
model.add(Embedding(max_features,feature_dimension,input_length=max_len))
# embedding layer 


model.add(SimpleRNN(128,activation="relu"))
model.add(Dense(1,activation="sigmoid"))
# binary classifcation hence using sigmoid

model.summary()



In [47]:
model.compile(optimizer="adam",loss="binary_crossentropy",metrics=["accuracy"])


In [50]:
# we have to do early stopping

from tensorflow.keras.callbacks import EarlyStopping

callback=EarlyStopping(monitor="val_loss",patience=10,restore_best_weights=True)
# keep checking change in validation loss if it does not change till 5 epochs stop 
# the operation and restore the weight value to min loss instance

In [None]:
# train here with x_train

model.fit(
    x_train,y_train,epochs=10,batch_size=32,
    validation_split=0.2,
    callbacks=[callback]
)

Epoch 1/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 79ms/step - accuracy: 0.7018 - loss: nan - val_accuracy: 0.5062 - val_loss: nan
Epoch 2/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 77ms/step - accuracy: 0.4973 - loss: nan - val_accuracy: 0.5062 - val_loss: nan
Epoch 3/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 74ms/step - accuracy: 0.5027 - loss: nan - val_accuracy: 0.5062 - val_loss: nan
Epoch 4/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 73ms/step - accuracy: 0.5035 - loss: nan - val_accuracy: 0.5062 - val_loss: nan
Epoch 5/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 74ms/step - accuracy: 0.5020 - loss: nan - val_accuracy: 0.5062 - val_loss: nan
Epoch 6/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 72ms/step - accuracy: 0.4996 - loss: nan - val_accuracy: 0.5062 - val_loss: nan
Epoch 7/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━

In [None]:
# saving the model

model.save("simple_rnn_imdb.h5")
