# IMDB Review Semantic Analysis - Simple RNN Implementation

In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
from tensorflow.keras.optimizers import Adam

In [30]:
## Load the imdb dataset

max_features = 10000 #Vocabulary size
(X_train,y_train), (X_test,y_test) = imdb.load_data(num_words=max_features)

# Print the shape of the data

print(f"Training data shape: {X_train.shape}, Training labels shape: {y_train.shape}")
print(f"Testing data shape: {X_test.shape}, Testing labels shape: {y_test.shape}")

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
[1m17464789/17464789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 0us/step
Training data shape: (25000,), Training labels shape: (25000,)
Testing data shape: (25000,), Testing labels shape: (25000,)


In [31]:
## Inspect a sample review and its label

sample_review=X_train[0]
sample_label=y_train[0]
print(f"Sample review: {sample_review}")
print(f"Sample label: {sample_label}")

Sample review: [1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]
Sample label: 1


In [33]:
## Mapping of word index back to word

word_index = imdb.get_word_index()
reverse_word_index = {value: key for key,value in word_index.items()}
decoded_review = ' '.join([reverse_word_index.get(i-3,'?') for i in sample_review])
print(f"Decoded review: {decoded_review}")


Decoded review: ? this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert ? is an amazing actor and now the same being director ? father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for ? and would recommend it to everyone to watch and the fly fishing was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also ? to the two little boy's that played the ? of norman and paul they were just brilliant children are often left out of the ? list i think because the stars that play them all grown up are such a big profile for the whole film but these children are amazing and should be praised for what they have do

In [17]:
# Checking max length of a sentence

max_len_train = max([len(x) for x in X_train])
max_len_test = max([len(x) for x in X_test])

print(f"Max length of sentence in X_train: {max_len_train}")
print(f"Max length of sentence in X_test: {max_len_test}")

max_len = max(max_len_train, max_len_test)
print(f"Overall max length: {max_len}")

Max length of sentence in X_train: 2494
Max length of sentence in X_test: 2315
Overall max length: 2494


In [18]:
#Checking the median of the length of sentences

median_len_train = np.median([len(x) for x in X_train])
median_len_test = np.median([len(x) for x in X_test])

print(f"Median length of sentence in X_train: {median_len_train}")
print(f"Median length of sentence in X_test: {median_len_test}")

Median length of sentence in X_train: 178.0
Median length of sentence in X_test: 174.0


In [34]:
# Padding the data

max_len = 500
X_train = sequence.pad_sequences(X_train, maxlen=max_len)
X_test = sequence.pad_sequences(X_test, maxlen=max_len)

In [29]:
# Designing the model
model = Sequential()
model.add(Embedding(max_features, 128))
model.add(SimpleRNN(128, dropout=0.2, recurrent_dropout=0.2, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.summary()

In [30]:
# Create an instance of Early Stopping Callback
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True, verbose=1)

In [31]:
# Train the model with Early Stopping
adam = Adam(learning_rate=0.0001)
model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])
model.fit(X_train, y_train, batch_size=32, epochs=10, validation_split=0.2, callbacks=[early_stopping])

Epoch 1/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 46ms/step - accuracy: 0.5168 - loss: 0.6917 - val_accuracy: 0.5866 - val_loss: 0.6825
Epoch 2/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 45ms/step - accuracy: 0.6173 - loss: 0.6808 - val_accuracy: 0.6608 - val_loss: 0.6445
Epoch 3/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 45ms/step - accuracy: 0.6810 - loss: 0.6310 - val_accuracy: 0.7026 - val_loss: 0.5922
Epoch 4/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 46ms/step - accuracy: 0.7443 - loss: 0.5542 - val_accuracy: 0.7342 - val_loss: 0.5415
Epoch 5/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 45ms/step - accuracy: 0.7787 - loss: 0.4884 - val_accuracy: 0.7438 - val_loss: 0.5176
Epoch 6/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 46ms/step - accuracy: 0.8041 - loss: 0.4390 - val_accuracy: 0.7452 - val_loss: 0.5093
Epoch 7/10
[1m6

<keras.src.callbacks.history.History at 0x7b4825dfef50>

In [32]:
# Save the model
model.save('simple_rnn_imdb.keras')

In [15]:
# Load the pre-trained model with ReLU activation
model = load_model('simple_rnn_imdb.keras')
model.summary()

In [16]:
model.get_weights()

[array([[-0.02976437, -0.00882436,  0.02724012, ...,  0.00207404,
         -0.06450394, -0.04821667],
        [ 0.01621924, -0.01513571, -0.00691114, ...,  0.01250886,
         -0.0340037 , -0.03810706],
        [ 0.05296165,  0.03147747,  0.04964292, ..., -0.01572418,
         -0.02564909,  0.0015419 ],
        ...,
        [-0.02629832, -0.04608593,  0.04625484, ...,  0.02282253,
          0.04504054,  0.0302557 ],
        [ 0.03650793,  0.03244345, -0.03043529, ...,  0.02510871,
          0.01200504, -0.02785615],
        [ 0.04848333, -0.03077552,  0.04196044, ..., -0.06245155,
         -0.02178361,  0.04568968]], dtype=float32),
 array([[ 0.02221957,  0.09532358, -0.01882246, ..., -0.12888099,
         -0.04765936, -0.04894518],
        [ 0.04578915, -0.01204052,  0.11601876, ..., -0.02319083,
         -0.17668724,  0.05178858],
        [ 0.01153003, -0.06662818, -0.00932592, ..., -0.09111843,
          0.1890512 ,  0.06115425],
        ...,
        [ 0.07180934,  0.13473895, -0.1

In [25]:
# Helper functions
## Decode reviews
def decode_review(encoded_review):
  return ''.join([reverse_word_index.get(i-3,'?') for i in encoded_review])
## Preprocess user input
def preprocess_text(text):
  words = text.lower().split()
  encoded_review = [word_index.get(word, 2) + 3 for word in words]
  padded_review = sequence.pad_sequences([encoded_review], maxlen=500)
  return padded_review

In [26]:
# Prediction function
def predict_sentiment(review):
  preprocessed_input = preprocess_text(review)
  prediction = model.predict(preprocessed_input)
  sentiment = 'Positive' if prediction[0][0] > 0.5 else 'Negative'
  return sentiment, prediction[0][0]


In [None]:
# User input and prediction
example_review = "This movie was fantastic! I loved every minute of it."
sentiment, confidence = predict_sentiment(example_review)
print(f"Sentiment: {sentiment}, Confidence: {confidence}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
Sentiment: Positive, Confidence: 0.7643303275108337 with 75.50% accuracy.
