In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, Bidirectional, LSTM, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler
import keras_tuner as kt

# Load the dataset
data = pd.read_csv('cleaned_balanced_dataset_FINAL.csv')

# Handle missing values in the 'comment' column.
# Assign the result back rather than calling fillna(inplace=True) on the column
# selection: the chained in-place form is deprecated and does not modify `data`
# when pandas copy-on-write is active.
data['comment'] = data['comment'].fillna('')

# Reduce dataset size for memory efficiency (sample at most 10,000 records).
# Capping at len(data) avoids a ValueError when the CSV holds fewer rows.
data = data.sample(n=min(10000, len(data)), random_state=42)

# Encode target labels as integers
label_column = 'label'
label_encoder = LabelEncoder()
data[label_column] = label_encoder.fit_transform(data[label_column])

# Target vector
y = data[label_column]

# Train-test split on the raw text, BEFORE any fitting, so the vectorizer and
# scaler below only ever see training rows (no test-set leakage into the
# vocabulary / IDF weights).
comments_train, comments_test, y_train, y_test = train_test_split(
    data['comment'], y, test_size=0.2, random_state=42
)

# Text vectorization using TF-IDF with a small vocabulary, fitted on the
# training comments only and then applied to the test comments.
tfidf = TfidfVectorizer(max_features=1000)
X_train = tfidf.fit_transform(comments_train).toarray()
X_test = tfidf.transform(comments_test).toarray()

# Standardize the data (scale only; with_mean=False leaves features uncentered)
scaler = StandardScaler(with_mean=False)
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define the hypermodel
def build_model(hp):
    """Build and compile a stacked bidirectional-LSTM classifier for Keras Tuner.

    The input shape is ``(X_train.shape[1], 1)``, read from the module-level
    ``X_train``. The stack is: one bidirectional LSTM, then ``num_layers``
    (1-3) blocks of bidirectional LSTM + dropout + batch normalization, then a
    final bidirectional LSTM that returns only its last output, and a single
    sigmoid unit.

    Args:
        hp: Keras Tuner hyperparameter container used to sample layer widths
            (32-128 in steps of 32), the number of extra blocks, and the
            per-block dropout rate (0.2-0.5 in steps of 0.1).

    Returns:
        The compiled ``Sequential`` model (Adam optimizer, binary
        cross-entropy loss, accuracy metric).
    """

    def lstm_width(name):
        # Every recurrent layer shares the same search range for its width.
        return hp.Int(name, min_value=32, max_value=128, step=32)

    net = Sequential()
    net.add(Input(shape=(X_train.shape[1], 1)))

    # First recurrent layer keeps the full sequence for the layers stacked on top.
    net.add(Bidirectional(LSTM(units=lstm_width('units'), return_sequences=True)))

    # Tunable number of intermediate recurrent blocks.
    extra_blocks = hp.Int('num_layers', 1, 3)
    for i in range(extra_blocks):
        net.add(Bidirectional(LSTM(units=lstm_width(f'lstm_units_{i}'), return_sequences=True)))
        drop_rate = hp.Float(f'dropout_{i}', min_value=0.2, max_value=0.5, step=0.1)
        net.add(Dropout(drop_rate))
        net.add(BatchNormalization())

    # Last recurrent layer uses the default return_sequences=False.
    net.add(Bidirectional(LSTM(units=lstm_width('final_lstm_units'))))

    # Single sigmoid unit for the binary decision.
    net.add(Dense(1, activation='sigmoid'))

    net.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return net

# Initialize the tuner (random search over the space defined in build_model)
tuner = kt.RandomSearch(
    build_model,
    objective='val_accuracy',
    max_trials=10,
    executions_per_trial=3,
    directory='hyperband',
    project_name='bidirectional_lstm_optimization'
)

# Search for the best hyperparameters.
# build_model declares a 3-D input of shape (features, 1), so pass the data with
# an explicit trailing axis instead of relying on Keras to adjust the rank of a
# 2-D array implicitly.
tuner.search(
    X_train.reshape(X_train.shape[0], X_train.shape[1], 1),
    y_train,
    epochs=10,
    validation_split=0.2,
)

# Get the optimal hyperparameters
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

# Build a fresh (untrained) model with the optimal hyperparameters
model = build_model(best_hps)

# Early stopping guards against overfitting: monitor validation loss with a
# patience of 3 epochs and restore the best weights when training stops.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

# Define learning rate scheduler
def scheduler(epoch, lr):
    """Return the learning rate to use for the given epoch.

    The rate is held constant for the first 10 epochs (indices 0-9) and then
    multiplied by ``exp(-0.1)`` on every subsequent epoch.

    Args:
        epoch: Zero-based epoch index supplied by the callback.
        lr: Current learning rate.

    Returns:
        The new learning rate as a plain Python ``float``. Keras 3's
        ``LearningRateScheduler`` rejects tensor return values, so the decay
        factor is computed with ``math.exp`` instead of ``tf.math.exp``.
    """
    import math  # function-scope import keeps the block self-contained

    current = float(lr)
    if epoch < 10:
        return current
    return current * math.exp(-0.1)

# Wrap the schedule function in a Keras callback
lr_scheduler = LearningRateScheduler(scheduler)

# Append a trailing axis of size 1 so each sample has the
# (features, 1) shape declared by the model's input layer
X_train_rnn = X_train.reshape(X_train.shape + (1,))
X_test_rnn = X_test.reshape(X_test.shape + (1,))

# Fit the model, holding out 20% of the training rows for validation
history = model.fit(
    x=X_train_rnn,
    y=y_train,
    batch_size=32,
    epochs=50,
    validation_split=0.2,
    callbacks=[early_stopping, lr_scheduler],
)

# Threshold the predicted probabilities at 0.5 to obtain 0/1 class labels
y_pred = (model.predict(X_test_rnn) > 0.5).astype("int32")

# Report the F1 score on the held-out test set
f1 = f1_score(y_test, y_pred)
print(f"F1 Score: {f1}")

# Function to preprocess and predict new input
def preprocess_and_predict(comment):
    """Classify a single raw comment with the trained model.

    The comment goes through the same pipeline as the training data: the
    fitted module-level ``tfidf`` vectorizer, the fitted ``scaler``, and a
    reshape that appends a trailing axis of size 1.

    Args:
        comment: The text to classify.

    Returns:
        The int32 array produced by thresholding ``model.predict`` at 0.5 for
        this one-sample batch.
    """
    # Vectorize and scale a one-element batch containing the comment.
    features = scaler.transform(tfidf.transform([comment]).toarray())

    # Append the trailing axis expected by the LSTM input layer.
    batch = features.reshape(features.shape + (1,))

    # Threshold the predicted probability into a hard 0/1 label.
    scores = model.predict(batch)
    return (scores > 0.5).astype("int32")

# Example usage: classify one hand-written comment and print the single
# predicted label (first row, first column of the returned array).
new_comment = "This is a sample comment for prediction."
prediction = preprocess_and_predict(new_comment)
print(f"Prediction for new comment: {prediction[0][0]}")



Search: Running Trial #1

Value             |Best Value So Far |Hyperparameter
96                |96                |units
2                 |2                 |num_layers
128               |128               |lstm_units_0
0.4               |0.4               |dropout_0
64                |64                |final_lstm_units


Epoch 1/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1442s[0m 7s/step - accuracy: 0.5104 - loss: 0.7075 - val_accuracy: 0.5163 - val_loss: 0.6925
Epoch 2/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8581s[0m 43s/step - accuracy: 0.5134 - loss: 0.6968 - val_accuracy: 0.5300 - val_loss: 0.6916
Epoch 3/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1248s[0m 6s/step - accuracy: 0.5040 - loss: 0.6952 - val_accuracy: 0.5325 - val_loss: 0.6872
Epoch 4/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1127s[0m 6s/step - accuracy: 0.5037 - loss: 0.6960 - val_accuracy: 0.5375 - val_loss: 0.6837
Epoch 5/10
[1