# **IMDB Movie Review Using Simple RNN**

### Import required libraries

In [1]:
import tensorflow
import numpy as np
import pandas as pd

from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.layers import Dense, Embedding, SimpleRNN,GlobalAveragePooling1D
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping


## Get the dataset

In [2]:
# dataset
data = imdb

## Define Vocabulary Size

In [3]:
voc_size = 10000

## Split the train and test data

In [4]:
#split train and test data
(X_train, y_train), (X_test, y_test) = data.load_data(num_words=voc_size)
print("X_train shape: ", X_train.shape)
print("X_test shape: ", X_test.shape)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
[1m17464789/17464789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 0us/step
X_train shape:  (25000,)
X_test shape:  (25000,)


## Preprocess the Data



*   Here max words is 500 as it is effcient
*   pad sequence is used to ensure all input sequence has same length





In [5]:
#preprocessing the data
max_words = 500
X_train = sequence.pad_sequences(X_train,maxlen=max_words)
X_test = sequence.pad_sequences(X_test,maxlen=max_words)

print(f"X_train shape: {X_train.shape}\nX_train type: {type(X_train)}")
print(f"y_test shape: {y_test.shape}\ny_test type: {type(y_test)}")

X_train shape: (25000, 500)
X_train type: <class 'numpy.ndarray'>
y_test shape: (25000,)
y_test type: <class 'numpy.ndarray'>


In [52]:

!pip install keras-tuner --upgrade


Collecting keras-tuner
  Downloading keras_tuner-1.4.7-py3-none-any.whl.metadata (5.4 kB)
Collecting kt-legacy (from keras-tuner)
  Downloading kt_legacy-1.0.5-py3-none-any.whl.metadata (221 bytes)
Downloading keras_tuner-1.4.7-py3-none-any.whl (129 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.1/129.1 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Installing collected packages: kt-legacy, keras-tuner
Successfully installed keras-tuner-1.4.7 kt-legacy-1.0.5


### Defining the model



*   Using Keras Tuner to optimize the performance of the model by tuning hyperparameters like units by using Hyperband strategy




In [54]:
import keras_tuner as kt

def build_model(hp):
  model = Sequential()
  model.add(Embedding(input_dim=voc_size,output_dim=hp.Int('embedding_dim',min_value=32,max_value=512,step=32)))
  model.add(GlobalAveragePooling1D())
  model.add(Dense(units=hp.Int('dense_units',min_value=32, max_value=512, step=32),activation='relu'))
  model.add(Dense(units=1,activation='sigmoid'))

  model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

  return model

In [55]:
#tune the hyperparameter
tuner = kt.Hyperband(
    build_model,
    objective='val_accuracy',
    max_epochs=15,
    factor=3,
    directory='my_dir',
    project_name='imdb'
)

Early Stopping to prevent overfitting

In [56]:
#early_stopping = EarlyStopping(monitor='val_loss',patience=2, restore_best_weights=True)
early_stopping = EarlyStopping(monitor='val_loss',patience=5)

In [57]:
#search for best hp's
tuner.search(X_train,y_train,epochs=15,validation_data=(X_test,y_test),callbacks=[early_stopping])

Trial 30 Complete [00h 00m 53s]
val_accuracy: 0.888480007648468

Best val_accuracy So Far: 0.888480007648468
Total elapsed time: 00h 14m 06s


In [60]:
best_hps=tuner.get_best_hyperparameters(num_trials=1)[0]

print(f"""
The hyperparameter search is complete. The optimal number of embedding dimenssion in the first densely-connected
layer is {best_hps.get('embedding_dim')} and the dense units for the optimizer
is {best_hps.get('dense_units')}.
""")


The hyperparameter search is complete. The optimal number of units in the first densely-connected
layer is 64 and the optimal learning rate for the optimizer
is 128.



## Train the Model

In [61]:
#train the model

model = tuner.hypermodel.build(best_hps)
history=model.fit(
    X_train,y_train,
    epochs=15,
    validation_data=(X_test,y_test),
    callbacks=[early_stopping]
)
val_acc_per_epoch = history.history['val_accuracy']
best_epoch = val_acc_per_epoch.index(max(val_acc_per_epoch)) + 1
print('Best epoch: %d' % (best_epoch,))

Epoch 1/15
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 6ms/step - accuracy: 0.5880 - loss: 0.6456 - val_accuracy: 0.8561 - val_loss: 0.3673
Epoch 2/15
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 6ms/step - accuracy: 0.8329 - loss: 0.3714 - val_accuracy: 0.8777 - val_loss: 0.3057
Epoch 3/15
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.8657 - loss: 0.3092 - val_accuracy: 0.8746 - val_loss: 0.3013
Epoch 4/15
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.8838 - loss: 0.2792 - val_accuracy: 0.8851 - val_loss: 0.2835
Epoch 5/15
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.9040 - loss: 0.2403 - val_accuracy: 0.8857 - val_loss: 0.2845
Epoch 6/15
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - accuracy: 0.9046 - loss: 0.2401 - val_accuracy: 0.8689 - val_loss: 0.3123
Epoch 7/15
[1m782/782[0m 

Re intitate the hypermodel by considering the est epoch

In [62]:
#re instatiating the hypermodel
hypermodel = tuner.hypermodel.build(best_hps)
hypermodel.fit(
    X_train,y_train,
    epochs=best_epoch,
    validation_data=(X_test,y_test),
    callbacks=[early_stopping]
)

Epoch 1/8
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 6ms/step - accuracy: 0.5990 - loss: 0.6390 - val_accuracy: 0.6376 - val_loss: 0.8217
Epoch 2/8
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 5ms/step - accuracy: 0.8168 - loss: 0.3936 - val_accuracy: 0.7448 - val_loss: 0.5601
Epoch 3/8
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.8932 - loss: 0.2727 - val_accuracy: 0.7396 - val_loss: 0.6374
Epoch 4/8
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.8896 - loss: 0.2772 - val_accuracy: 0.8824 - val_loss: 0.2841
Epoch 5/8
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - accuracy: 0.9036 - loss: 0.2321 - val_accuracy: 0.8892 - val_loss: 0.2780
Epoch 6/8
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 6ms/step - accuracy: 0.9147 - loss: 0.2123 - val_accuracy: 0.8803 - val_loss: 0.2933
Epoch 7/8
[1m782/782[0m [32m━━━

<keras.src.callbacks.history.History at 0x7cd86196d650>

In [43]:
history = model.fit(
    X_train,y_train,
    epochs=20,
    validation_data=(X_test,y_test),
    callbacks=[early_stopping]
)

Epoch 1/20
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 6ms/step - accuracy: 0.6088 - loss: 0.6264 - val_accuracy: 0.8521 - val_loss: 0.3589
Epoch 2/20
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 6ms/step - accuracy: 0.8072 - loss: 0.4255 - val_accuracy: 0.7336 - val_loss: 0.5211
Epoch 3/20
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.8712 - loss: 0.3007 - val_accuracy: 0.8827 - val_loss: 0.2879
Epoch 4/20
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 6ms/step - accuracy: 0.8900 - loss: 0.2654 - val_accuracy: 0.8810 - val_loss: 0.2888
Epoch 5/20
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.9087 - loss: 0.2291 - val_accuracy: 0.8772 - val_loss: 0.2970


Validate on test set

In [63]:
eval_result = hypermodel.evaluate(X_test,y_test)
print("[test loss, test accuracy]:", eval_result)

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8849 - loss: 0.2872
[test loss, test accuracy]: [0.28737059235572815, 0.8845599889755249]


In [68]:
hypermodel.save('best_model_hypermodel.h5')



## Prediction

In [64]:
word_index = imdb.get_word_index()


In [65]:
def preprocess_text(text):
    words = text.lower().split()
    encoded_review = [word_index.get(word, 2) + 3 for word in words]
    padded_review = sequence.pad_sequences([encoded_review], maxlen=500)
    return padded_review

In [66]:
def predict_sentiment(review):
    preprocessed_input=preprocess_text(review)

    prediction=hypermodel.predict(preprocessed_input)

    sentiment = 'Positive' if prediction[0][0] > 0.5 else 'Negative'

    return sentiment, prediction[0][0]



In [73]:
sample_review = "total crap movie. worst cinematography"

sentiment,score=predict_sentiment(sample_review)

print(f'Review: {sample_review}')
print(f'Sentiment: {sentiment}')
print(f'Prediction Score: {score}')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
Review: total crap movie. worst cinematography
Sentiment: Negative
Prediction Score: 0.18349485099315643
