In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [2]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


In [3]:
path = "C:\\Users\\dhuma\\Desktop\\datasets\\IMDB Dataset.csv"
df = pd.read_csv(path)

In [4]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
sentences = df['review'].values
labels = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0).values

In [6]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [7]:
# Tokenization and Padding
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)

In [8]:
maxlen = 200
X = pad_sequences(sequences, maxlen=maxlen)
y = np.array(labels)

In [9]:
# splitting the data
X_train , X_test, y_train , y_test= train_test_split(X,y, test_size=0.2, random_state=21)

In [10]:
# train_test_split: Splits the data into training (80%) and testing (20%) sets. 
# The random_state=42 ensures reproducibility.

In [11]:
# Building the RNN model
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128))
model.add(SimpleRNN(64, return_sequences=False, 
               kernel_regularizer=regularizers.l2(0.01)))
model.add(Dropout(0.7))
model.add(Dense(32, activation='relu', 
                kernel_regularizer=regularizers.l2(0.01)))
model.add(Dropout(0.7))
model.add(Dense(1, activation='sigmoid'))

In [12]:
# Compiling the model
model.compile(optimizer=Adam(learning_rate=0.0001), loss = 'binary_crossentropy', metrics=['accuracy'])

In [13]:
# Training the model
early_stopping = EarlyStopping(monitor='val_loss',
                               patience=3, restore_best_weights=True)

history = model.fit(X_train, y_train, 
                    epochs=10, batch_size=128, 
                    validation_data=(X_test, y_test),
                    callbacks=[early_stopping])

Epoch 1/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 71ms/step - accuracy: 0.5006 - loss: 1.9141 - val_accuracy: 0.5698 - val_loss: 1.6257
Epoch 2/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 71ms/step - accuracy: 0.5605 - loss: 1.5514 - val_accuracy: 0.7641 - val_loss: 1.2500
Epoch 3/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 71ms/step - accuracy: 0.7228 - loss: 1.2164 - val_accuracy: 0.8431 - val_loss: 0.9640
Epoch 4/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 74ms/step - accuracy: 0.8395 - loss: 0.9485 - val_accuracy: 0.8497 - val_loss: 0.7886
Epoch 5/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 72ms/step - accuracy: 0.8705 - loss: 0.7842 - val_accuracy: 0.8649 - val_loss: 0.6750
Epoch 6/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 73ms/step - accuracy: 0.8858 - loss: 0.6774 - val_accuracy: 0.8699 - val_loss: 0.6015
Epoch 7/10
[1m3

In [14]:
# Evalutating the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy:.4f}')


[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - accuracy: 0.8733 - loss: 0.4470
Test Accuracy: 0.8715


In [15]:
y_pred = (model.predict(X_test) > 0.5).astype("int32")
print(f'Accuracy Score: {accuracy_score(y_test, y_pred):.4f}')

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 10ms/step
Accuracy Score: 0.8715
