# IMDB Data LSTM Model Test

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import set_random_seed


### Run on Google Colab rather than a local Jupyter Notebook



### Importing the IMDB dataset manually



In [None]:
# Colab-only: mount Google Drive at /content/drive.
# NOTE(review): no cell visible in this notebook reads from the mount; the CSV is
# loaded from the working directory after the manual upload below. Confirm whether
# this mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Colab-only: manually upload the dataset file into the runtime.
# The recorded output shows IMDB_Dataset.csv being saved under that same name,
# which is the filename the loading cell reads.
# `uploaded` holds the return value of files.upload(); it is not used by any
# other cell visible in this notebook.
from google.colab import files
uploaded = files.upload()

Saving IMDB_Dataset.csv to IMDB_Dataset.csv


### Loading the IMDB dataset and mapping sentiment labels to binary integers before the split:

In [None]:
# Read the uploaded reviews and replace the text labels with integers
# (positive -> 1, negative -> 0) so they can be used as binary targets.
df = pd.read_csv("IMDB_Dataset.csv").assign(
    sentiment=lambda frame: frame['sentiment'].map({'positive': 1, 'negative': 0})
)

# Stratify on the label so the train and test partitions keep the same class
# balance; random_state pins the shuffle so the split is repeatable.
# NOTE(review): train_size=3000 puts only 3000 rows in train_data and leaves the
# rest in test_data (the recorded prediction run shows 1469 batches) - confirm
# that such a small training set is intended.
train_data, test_data = train_test_split(
    df,
    train_size=3000,
    stratify=df['sentiment'],
    random_state=42,
)


### Tokenization and sequence padding

In [None]:
# Every review is encoded as exactly `maxlen` token ids.
maxlen = 200

# The vocabulary is learned from the training reviews only, so nothing from the
# test partition leaks into it; words outside the vocabulary map to "<OOV>".
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(train_data['review'])

# Encode each partition with the same fitted tokenizer, then pad at the end
# (padding='post') up to `maxlen`.
train_sequences = tokenizer.texts_to_sequences(train_data['review'])
test_sequences = tokenizer.texts_to_sequences(test_data['review'])

X_train = pad_sequences(train_sequences, maxlen=maxlen, padding='post')
X_test = pad_sequences(test_sequences, maxlen=maxlen, padding='post')

# Integer label arrays aligned row-for-row with X_train / X_test.
y_train = train_data['sentiment'].astype(int).to_numpy()
y_test = test_data['sentiment'].astype(int).to_numpy()


### Building the LSTM model

In [None]:
# Seed Python, NumPy and TensorFlow so weight initialisation, dropout and batch
# shuffling are repeatable when this cell and the training cell are re-run.
set_random_seed(42)

# Binary sentiment classifier: embedding -> LSTM -> dropout -> sigmoid probability.
model = Sequential([
    # mask_zero=True marks token id 0 as padding. The sequences are padded at the
    # end (padding='post'), so without masking the LSTM keeps stepping through a
    # run of padding zeros after the last real token and its final state mostly
    # reflects padding rather than the review text.
    # `input_length` is intentionally not passed: it is deprecated in Keras 3 and
    # the input shape is supplied through model.build() below instead.
    Embedding(input_dim=10000, output_dim=64, mask_zero=True),
    LSTM(64),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

# Build explicitly so model.summary() reports real output shapes and parameter
# counts instead of an unbuilt model.
model.build(input_shape=(None, maxlen))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()




### Training the LSTM model


In [None]:
# Early stopping watches the validation loss with a patience of 2 epochs and is
# configured to restore the best weights it has seen.
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=2,
    restore_best_weights=True,
)

# Train for at most 10 epochs in batches of 64, holding out 20% of the training
# arrays for validation (validation_split=0.2).
model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=64,
    validation_split=0.2,
    callbacks=[early_stop],
)


Epoch 1/10
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 172ms/step - accuracy: 0.4900 - loss: 0.6935 - val_accuracy: 0.5050 - val_loss: 0.6926
Epoch 2/10
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 184ms/step - accuracy: 0.5427 - loss: 0.6885 - val_accuracy: 0.5750 - val_loss: 0.6777
Epoch 3/10
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 168ms/step - accuracy: 0.6225 - loss: 0.6550 - val_accuracy: 0.5583 - val_loss: 0.6894
Epoch 4/10
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 149ms/step - accuracy: 0.6712 - loss: 0.6454 - val_accuracy: 0.5583 - val_loss: 0.6604
Epoch 5/10
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 183ms/step - accuracy: 0.7055 - loss: 0.5611 - val_accuracy: 0.5933 - val_loss: 0.6307
Epoch 6/10
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 150ms/step - accuracy: 0.7202 - loss: 0.4760 - val_accuracy: 0.6017 - val_loss: 0.6641
Epoch 7/10
[1m38/38[0

<keras.src.callbacks.history.History at 0x799aab934150>

### Prediction and evaluation of the F1-Score

In [None]:
# Predicted probability of the positive class for every test review.
y_pred_prob = model.predict(X_test)

# Turn probabilities into hard 0/1 labels: strictly above 0.5 counts as positive.
is_positive = y_pred_prob > 0.5
y_pred = is_positive.astype(int)

# F1 score of the predicted labels against the true test labels.
f1 = f1_score(y_test, y_pred)
print(f"F1 Score: {f1:.4f}")


[1m1469/1469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 29ms/step
F1 Score: 0.5059
