In [1]:
import nltk
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# Download required NLTK data
nltk.download('stopwords')
nltk.download('punkt')

# Load dataset
df = pd.read_csv('data/IMDB Dataset.csv')

# Set of English stop words
stop_words = set(stopwords.words('english'))

# Function to clean text data
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r"\'", " ", text)  # Remove apostrophes
    text = re.sub(r"[^\w\s]", " ", text)  # Remove punctuation
    text = ' '.join([word for word in text.split() if word not in stop_words])  # Remove stopwords
    text = ' '.join([w for w in text.split() if len(w) > 2])  # Remove short words
    text = re.sub(r"<.*?>", "", text)  # Remove HTML tags
    return text

# Clean the reviews
df["review"] = df["review"].astype(str)
df["review"] = df["review"].apply(clean_text)

# Tokenize the reviews
tokenized_reviews = df["review"].apply(nltk.word_tokenize)
df["tokenized_reviews"] = tokenized_reviews

# Create a tokenizer (which internally uses a hash table)
tokenizer = Tokenizer(num_words=5000)  # Limit the vocabulary size
tokenizer.fit_on_texts(tokenized_reviews)

# Convert texts to sequences
sequences = tokenizer.texts_to_sequences(tokenized_reviews)

# Use a hash table for word index
word_index = tokenizer.word_index  # This is already a dictionary (hash table)
vocab_size = min(tokenizer.num_words, len(word_index)) + 1

# Pad sequences
X = pad_sequences(sequences, maxlen=500)

# Label encoding
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df["sentiment"])

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, shuffle=True)

# Early stopping and learning rate decay
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3, min_lr=1e-5)

# Build LSTM model
model_lstm = Sequential([
    Embedding(input_dim=vocab_size, output_dim=128, input_length=500),
    LSTM(128, return_sequences=True),
    Dropout(0.2),
    LSTM(64),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])

# Compile the model
model_lstm.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Fit the model
model_lstm.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=32, callbacks=[early_stopping, reduce_lr])


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/camille/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/camille/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
2024-06-12 10:27:00.007732: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1 Pro
2024-06-12 10:27:00.007756: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2024-06-12 10:27:00.007765: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2024-06-12 10:27:00.007786: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-06-12 10:27:00.007837: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memo

Epoch 1/10


2024-06-12 10:27:00.797460: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


[1m 784/1250[0m [32m━━━━━━━━━━━━[0m[37m━━━━━━━━[0m [1m1:34[0m 204ms/step - accuracy: 0.7640 - loss: 0.4746