In [105]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense
from sklearn.model_selection import train_test_split


In [107]:
# 1. Load the IMDB dataset (assumes CSV format)
df = pd.read_csv('IMDB Dataset - IMDB Dataset.csv')

In [109]:
# 2. Map the 'sentiment' column to binary values (0 = negative, 1 = positive)
df['sentiment'] = df['sentiment'].map({'negative': 0, 'positive': 1})


In [111]:
# 3. Split the dataset into training and testing sets
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['review'], df['sentiment'], test_size=0.2, random_state=42)

In [113]:
# 4. Ensure labels are integers
train_labels = train_labels.astype(int)
test_labels = test_labels.astype(int)

In [115]:
# 5. Tokenization and padding
max_words = 10000
max_len = 200

In [117]:
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(train_texts)


In [118]:
x_train_seq = tokenizer.texts_to_sequences(train_texts)
x_test_seq = tokenizer.texts_to_sequences(test_texts)


In [119]:
x_train_pad = pad_sequences(x_train_seq, maxlen=max_len)
x_test_pad = pad_sequences(x_test_seq, maxlen=max_len)

In [120]:
train_labels = np.array(train_labels)
test_labels = np.array(test_labels)


In [121]:
# 4. Build the model
model = Sequential([
    Embedding(input_dim=max_words, output_dim=32, input_length=max_len),
    Flatten(),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')  # Binary classification
])



In [123]:
# 5. Compile and train the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
#model.fit(x_train_pad, train_labels, epochs=5, batch_size=128, validation_split=0.2)

In [124]:
model.fit(x_train_pad, train_labels, epochs=5, batch_size=128, validation_split=0.2)

Epoch 1/5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 21ms/step - accuracy: 0.6561 - loss: 0.5693 - val_accuracy: 0.8676 - val_loss: 0.3049
Epoch 2/5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 20ms/step - accuracy: 0.9388 - loss: 0.1708 - val_accuracy: 0.8759 - val_loss: 0.3061
Epoch 3/5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 19ms/step - accuracy: 0.9894 - loss: 0.0498 - val_accuracy: 0.8658 - val_loss: 0.3801
Epoch 4/5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 20ms/step - accuracy: 0.9981 - loss: 0.0109 - val_accuracy: 0.8655 - val_loss: 0.4357
Epoch 5/5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 20ms/step - accuracy: 0.9999 - loss: 0.0029 - val_accuracy: 0.8681 - val_loss: 0.4700


<keras.src.callbacks.history.History at 0x2ac235f3b60>

In [131]:
loss, accuracy = model.evaluate(x_test_pad, test_labels)
print(f'Test Accuracy: {accuracy:.4f}')

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.8698 - loss: 0.4438
Test Accuracy: 0.8711
