In [4]:
import joblib

X_train, y_train, X_test, y_test = joblib.load('/kaggle/input/bert-emb/bert_train_test_data.pkl')

In [2]:
import tensorflow as tf
from tensorflow.keras.layers import MultiHeadAttention, Dense, Dropout, Bidirectional, LSTM, Input, LayerNormalization, GlobalAveragePooling1D
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers.schedules import CosineDecayRestarts
import numpy as np
import gc

# Best model parameters
max_sequence_length, embedding_dim = 128, 768
fixed_lstm_units = 256  # LSTM units fixed at 256
attention_heads = 16  # Best attention heads
first_decay_steps = 40  # Best first decay steps
num_classes = len(np.unique(y_train))  # Assuming y_train is defined with labels

# Define the model with the fixed best parameters
inputs = Input(shape=(max_sequence_length, embedding_dim))
x = Bidirectional(LSTM(fixed_lstm_units, return_sequences=True))(inputs)
x = Dropout(0.2)(x)

# Multi-head attention layer
attention_output = MultiHeadAttention(num_heads=attention_heads, key_dim=64)(x, x)
attention_output = LayerNormalization()(attention_output + x)

# Global Average Pooling
pooled_output = GlobalAveragePooling1D()(attention_output)

# Dense layers
x = Dense(64, activation='relu')(pooled_output)
x = Dropout(0.2)(x)
x = Dense(32, activation='relu')(x)
x = Dropout(0.2)(x)
outputs = Dense(num_classes, activation='softmax')(x)

# Define the model
model = Model(inputs=inputs, outputs=outputs)

# Cosine annealing learning rate schedule
cosine_annealing = CosineDecayRestarts(
    initial_learning_rate=0.0005,
    first_decay_steps=first_decay_steps,
    t_mul=2,
    alpha=0.01
)
optimizer = Adam(learning_rate=cosine_annealing)

# Compile the model
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Early stopping
early_stopping = EarlyStopping(
    monitor='val_accuracy',
    patience=10,
    restore_best_weights=True
)

# Train the model
history = model.fit(
    X_train, y_train,
    epochs=100,  # Limit epochs to 100 to reduce time and memory usage
    validation_data=(X_test, y_test),
    callbacks=[early_stopping],
    verbose=1
)

# Clear memory after training
tf.keras.backend.clear_session()
gc.collect()  # Explicit garbage collection

# Print completion message
print("Completed training with the best model configuration: LSTM Units=256, Attention Heads=16, First Decay Steps=40")


Epoch 1/100
[1m472/472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 58ms/step - accuracy: 0.1595 - loss: 2.6940 - val_accuracy: 0.5419 - val_loss: 1.4889
Epoch 2/100
[1m472/472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 50ms/step - accuracy: 0.4948 - loss: 1.6183 - val_accuracy: 0.6340 - val_loss: 1.2000
Epoch 3/100
[1m472/472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 50ms/step - accuracy: 0.6116 - loss: 1.2631 - val_accuracy: 0.6257 - val_loss: 1.2078
Epoch 4/100
[1m472/472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 50ms/step - accuracy: 0.6266 - loss: 1.2214 - val_accuracy: 0.6687 - val_loss: 1.0729
Epoch 5/100
[1m472/472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 50ms/step - accuracy: 0.7020 - loss: 1.0054 - val_accuracy: 0.6889 - val_loss: 1.0195
Epoch 6/100
[1m472/472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 50ms/step - accuracy: 0.7110 - loss: 0.9436 - val_accuracy: 0.6740 - val_loss: 1.1156
Epoch 7/10

In [3]:
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")

[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 21ms/step - accuracy: 0.7183 - loss: 1.7675
Test Loss: 1.7180156707763672
Test Accuracy: 0.7222811579704285


# CNN-LSTM

In [6]:
import tensorflow as tf
from tensorflow.keras.layers import Conv1D, MultiHeadAttention, Dense, Dropout, Bidirectional, LSTM, Input, LayerNormalization, GlobalAveragePooling1D
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers.schedules import CosineDecayRestarts
from tensorflow.keras.regularizers import l2
import numpy as np
import gc

# Model parameters
max_sequence_length, embedding_dim = 128, 768
fixed_lstm_units = 256  # Best LSTM units
attention_heads = 16  # Best attention heads
first_decay_steps = 40  # Best first decay steps
num_classes = len(np.unique(y_train))  # Assuming y_train is defined with labels

# Define the model with CNN layer before LSTM and added regularization
inputs = Input(shape=(max_sequence_length, embedding_dim))

# CNN layer with L2 regularization
x = Conv1D(filters=128, kernel_size=3, padding="same", activation="relu", kernel_regularizer=l2(0.01))(inputs)
x = Dropout(0.3)(x)  # Dropout after CNN to prevent overfitting

# Bidirectional LSTM with L2 regularization
x = Bidirectional(LSTM(fixed_lstm_units, return_sequences=True, kernel_regularizer=l2(0.01), recurrent_regularizer=l2(0.01)))(x)
x = Dropout(0.3)(x)  # Dropout after LSTM

# Multi-head attention layer with LayerNormalization
attention_output = MultiHeadAttention(num_heads=attention_heads, key_dim=64)(x, x)
attention_output = LayerNormalization()(attention_output + x)

# Global Average Pooling
pooled_output = GlobalAveragePooling1D()(attention_output)

# Dense layers with L2 regularization and Dropout
x = Dense(64, activation='relu', kernel_regularizer=l2(0.01))(pooled_output)
x = Dropout(0.3)(x)
x = Dense(32, activation='relu', kernel_regularizer=l2(0.01))(x)
x = Dropout(0.3)(x)
outputs = Dense(num_classes, activation='softmax', kernel_regularizer=l2(0.01))(x)

# Define the model
model = Model(inputs=inputs, outputs=outputs)

# Cosine annealing learning rate schedule
cosine_annealing = CosineDecayRestarts(
    initial_learning_rate=0.0005,
    first_decay_steps=first_decay_steps,
    t_mul=2,
    alpha=0.01
)
optimizer = Adam(learning_rate=cosine_annealing)

# Compile the model
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Early stopping
early_stopping = EarlyStopping(
    monitor='val_accuracy',
    patience=10,
    restore_best_weights=True
)

# Train the model
history = model.fit(
    X_train, y_train,
    epochs=100,  # Limit epochs to 100 to reduce time and memory usage
    validation_data=(X_test, y_test),
    callbacks=[early_stopping],
    verbose=1
)

# Clear memory after training
tf.keras.backend.clear_session()
gc.collect()  # Explicit garbage collection

# Print completion message
print("Completed training with CNN + LSTM + Attention model with regularization")


Epoch 1/100
[1m472/472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 54ms/step - accuracy: 0.0522 - loss: 9.5758 - val_accuracy: 0.1546 - val_loss: 3.1628
Epoch 2/100
[1m472/472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 46ms/step - accuracy: 0.1748 - loss: 2.9743 - val_accuracy: 0.4520 - val_loss: 1.9822
Epoch 3/100
[1m472/472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 46ms/step - accuracy: 0.4087 - loss: 2.1039 - val_accuracy: 0.4859 - val_loss: 1.8979
Epoch 4/100
[1m472/472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 46ms/step - accuracy: 0.4515 - loss: 2.0071 - val_accuracy: 0.5310 - val_loss: 1.7241
Epoch 5/100
[1m472/472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 46ms/step - accuracy: 0.5268 - loss: 1.7485 - val_accuracy: 0.5708 - val_loss: 1.6081
Epoch 6/100
[1m472/472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 46ms/step - accuracy: 0.5410 - loss: 1.7018 - val_accuracy: 0.5130 - val_loss: 1.8023
Epoch 7/10

In [7]:
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")

[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 18ms/step - accuracy: 0.6377 - loss: 1.4929
Test Loss: 1.463086724281311
Test Accuracy: 0.6458885669708252


In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Conv1D, MultiHeadAttention, Dense, Dropout, Bidirectional, LSTM, Input, LayerNormalization, GlobalAveragePooling1D
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers.schedules import CosineDecayRestarts
from tensorflow.keras.regularizers import l2
import numpy as np
import itertools
import json
import gc
import os

# Model parameters
max_sequence_length, embedding_dim = 128, 768
num_classes = len(np.unique(y_train))  # Assuming y_train is defined with labels

# Fixed values
dropout_rate = 0.3
learning_rate = 0.0005

# Define hyperparameter options for grid search
lstm_units_options = [128, 256]
cnn_filters_options = [64, 128, 256]
kernel_size_options = [3, 5]
attention_heads_options = [8, 16]

# File to store results incrementally
results_file = "/kaggle/working/model_results.json"

# Check if results file exists and load existing results if present
if os.path.exists(results_file):
    with open(results_file, 'r') as f:
        results = json.load(f)
else:
    results = []

# Grid search with memory management
for lstm_units, cnn_filters, kernel_size, attention_heads in itertools.product(
        lstm_units_options, cnn_filters_options, kernel_size_options, attention_heads_options):

    # Define the model with current hyperparameters
    inputs = Input(shape=(max_sequence_length, embedding_dim))
    
    # LSTM layer with bidirectional configuration
    x = Bidirectional(LSTM(lstm_units, return_sequences=True, kernel_regularizer=l2(0.01), recurrent_regularizer=l2(0.01)))(inputs)
    x = Dropout(dropout_rate)(x)

    # CNN layer with current filter and kernel size
    x = Conv1D(filters=cnn_filters, kernel_size=kernel_size, padding="same", activation="relu", kernel_regularizer=l2(0.01))(x)
    x = Dropout(dropout_rate)(x)

    # Multi-head attention layer
    attention_output = MultiHeadAttention(num_heads=attention_heads, key_dim=64)(x, x)
    attention_output = LayerNormalization()(attention_output + x)

    # Global Average Pooling
    pooled_output = GlobalAveragePooling1D()(attention_output)

    # Dense layers with dropout
    x = Dense(64, activation='relu', kernel_regularizer=l2(0.01))(pooled_output)
    x = Dropout(dropout_rate)(x)
    x = Dense(32, activation='relu', kernel_regularizer=l2(0.01))(x)
    x = Dropout(dropout_rate)(x)
    outputs = Dense(num_classes, activation='softmax', kernel_regularizer=l2(0.01))(x)

    # Define the model
    model = Model(inputs=inputs, outputs=outputs)

    # Cosine annealing learning rate schedule
    cosine_annealing = CosineDecayRestarts(
        initial_learning_rate=learning_rate,
        first_decay_steps=40,
        t_mul=2,
        alpha=0.01
    )
    optimizer = Adam(learning_rate=cosine_annealing)

    # Compile the model
    model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    # Early stopping
    early_stopping = EarlyStopping(
        monitor='val_accuracy',
        patience=10,
        restore_best_weights=True
    )

    # Train the model
    history = model.fit(
        X_train, y_train,
        epochs=100,
        validation_data=(X_test, y_test),
        callbacks=[early_stopping],
        verbose=1
    )

    # Convert history values to 32-bit floats to save memory
    history_data = {
        'train_loss': [float(np.float32(loss)) for loss in history.history['loss']],
        'train_accuracy': [float(np.float32(acc)) for acc in history.history['accuracy']],
        'val_loss': [float(np.float32(val_loss)) for val_loss in history.history['val_loss']],
        'val_accuracy': [float(np.float32(val_acc)) for val_acc in history.history['val_accuracy']]
    }

    # Store model parameters and performance in a dictionary
    model_result = {
        'lstm_units': lstm_units,
        'cnn_filters': cnn_filters,
        'kernel_size': kernel_size,
        'attention_heads': attention_heads,
        'history': history_data
    }

    # Append to results list and immediately save it to the file
    results.append(model_result)
    with open(results_file, 'w') as f:
        json.dump(results, f)

    # Clear the model from memory
    tf.keras.backend.clear_session()
    gc.collect()  # Explicit garbage collection

    # Print the progress to monitor during execution
    print(f"Completed training with LSTM units={lstm_units}, CNN filters={cnn_filters}, kernel size={kernel_size}, attention heads={attention_heads}")

# After all configurations have run, results will contain the complete list of experiments.


Epoch 1/100
[1m472/472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 43ms/step - accuracy: 0.1376 - loss: 11.1102 - val_accuracy: 0.4546 - val_loss: 3.2949
Epoch 2/100
[1m472/472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 35ms/step - accuracy: 0.3759 - loss: 3.3066 - val_accuracy: 0.5358 - val_loss: 2.3942
Epoch 3/100
[1m472/472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 35ms/step - accuracy: 0.4736 - loss: 2.5208 - val_accuracy: 0.5451 - val_loss: 2.1742
Epoch 4/100
[1m472/472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 35ms/step - accuracy: 0.4775 - loss: 2.3339 - val_accuracy: 0.5676 - val_loss: 1.9104
Epoch 5/100
[1m472/472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 35ms/step - accuracy: 0.5313 - loss: 2.0475 - val_accuracy: 0.5846 - val_loss: 1.8073
Epoch 6/100
[1m472/472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 35ms/step - accuracy: 0.5389 - loss: 1.9873 - val_accuracy: 0.5679 - val_loss: 1.8710
Epoch 7/1

In [None]:
import json

# Load results from file (assuming results were stored in 'model_results.json')
results_file = "/kaggle/working/model_results.json"

with open(results_file, 'r') as f:
    results = json.load(f)

# Find the best model based on highest validation accuracy
best_model = max(results, key=lambda x: max(x['history']['val_accuracy']))

# Extract best model stats
best_stats = {
    "LSTM Units": best_model['lstm_units'],
    "CNN Filters": best_model['cnn_filters'],
    "Kernel Size": best_model['kernel_size'],
    "Attention Heads": best_model['attention_heads'],
    "Best Validation Accuracy": max(best_model['history']['val_accuracy']),
    "Validation Loss at Best Accuracy": best_model['history']['val_loss'][best_model['history']['val_accuracy'].index(max(best_model['history']['val_accuracy']))]
}

best_stats


# LSTM-CNN

In [2]:
import tensorflow as tf
from tensorflow.keras.layers import Conv1D, MultiHeadAttention, Dense, Dropout, Bidirectional, LSTM, Input, LayerNormalization, GlobalAveragePooling1D
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers.schedules import CosineDecayRestarts
from tensorflow.keras.regularizers import l2
import numpy as np
import gc

# Model parameters
max_sequence_length, embedding_dim = 128, 768
fixed_lstm_units = 256  # Best LSTM units
attention_heads = 16  # Best attention heads
first_decay_steps = 40  # Best first decay steps
num_classes = len(np.unique(y_train))  # Assuming y_train is defined with labels

# Define the LSTM-CNN model
inputs = Input(shape=(max_sequence_length, embedding_dim))

# LSTM layer with bidirectional configuration
x = Bidirectional(LSTM(fixed_lstm_units, return_sequences=True, kernel_regularizer=l2(0.01), recurrent_regularizer=l2(0.01)))(inputs)
x = Dropout(0.3)(x)  # Dropout after LSTM to prevent overfitting

# CNN layer to capture spatial patterns from the LSTM output
x = Conv1D(filters=128, kernel_size=3, padding="same", activation="relu", kernel_regularizer=l2(0.01))(x)
x = Dropout(0.3)(x)  # Dropout after CNN

# Multi-head attention layer with LayerNormalization
attention_output = MultiHeadAttention(num_heads=attention_heads, key_dim=64)(x, x)
attention_output = LayerNormalization()(attention_output + x)

# Global Average Pooling
pooled_output = GlobalAveragePooling1D()(attention_output)

# Dense layers with L2 regularization and Dropout
x = Dense(64, activation='relu', kernel_regularizer=l2(0.01))(pooled_output)
x = Dropout(0.3)(x)
x = Dense(32, activation='relu', kernel_regularizer=l2(0.01))(x)
x = Dropout(0.3)(x)
outputs = Dense(num_classes, activation='softmax', kernel_regularizer=l2(0.01))(x)

# Define the model
model = Model(inputs=inputs, outputs=outputs)

# Cosine annealing learning rate schedule
cosine_annealing = CosineDecayRestarts(
    initial_learning_rate=0.0005,
    first_decay_steps=first_decay_steps,
    t_mul=2,
    alpha=0.01
)
optimizer = Adam(learning_rate=cosine_annealing)

# Compile the model
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Early stopping
early_stopping = EarlyStopping(
    monitor='val_accuracy',
    patience=10,
    restore_best_weights=True
)

# Train the model
history = model.fit(
    X_train, y_train,
    epochs=100,  # Limit epochs to 100 to reduce time and memory usage
    validation_data=(X_test, y_test),
    callbacks=[early_stopping],
    verbose=1
)

# Clear memory after training
tf.keras.backend.clear_session()
gc.collect()  # Explicit garbage collection

# Print completion message
print("Completed training with LSTM-CNN model configuration")


Epoch 1/100
[1m472/472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 54ms/step - accuracy: 0.1369 - loss: 13.6376 - val_accuracy: 0.4777 - val_loss: 2.9797
Epoch 2/100
[1m472/472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 46ms/step - accuracy: 0.4135 - loss: 3.0448 - val_accuracy: 0.5316 - val_loss: 2.2504
Epoch 3/100
[1m472/472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 46ms/step - accuracy: 0.4800 - loss: 2.3734 - val_accuracy: 0.5472 - val_loss: 2.1288
Epoch 4/100
[1m472/472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 46ms/step - accuracy: 0.4852 - loss: 2.2898 - val_accuracy: 0.5695 - val_loss: 1.8949
Epoch 5/100
[1m472/472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 46ms/step - accuracy: 0.5448 - loss: 1.9684 - val_accuracy: 0.6040 - val_loss: 1.7494
Epoch 6/100
[1m472/472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 46ms/step - accuracy: 0.5585 - loss: 1.9007 - val_accuracy: 0.5668 - val_loss: 1.8668
Epoch 7/1

In [3]:
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")

[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 19ms/step - accuracy: 0.6308 - loss: 1.5375
Test Loss: 1.4952224493026733
Test Accuracy: 0.6445623636245728


# CNN Attention

In [3]:
import tensorflow as tf
from tensorflow.keras.layers import MultiHeadAttention, Dense, Dropout, Conv1D, GlobalMaxPooling1D, Input, LayerNormalization, Concatenate, Flatten, Reshape
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers.schedules import CosineDecayRestarts
import numpy as np
import gc

# Model parameters
max_sequence_length, embedding_dim = 128, 768
nb_filters = 128  # Number of filters for each Conv1D layer
attention_heads = 16  # Best attention heads
first_decay_steps = 40  # Best first decay steps
num_classes = len(np.unique(y_train))  # Assuming y_train is defined with labels

# Define the model with the fixed best parameters
inputs = Input(shape=(max_sequence_length, embedding_dim))

# Multiple Conv1D layers with different kernel sizes for bi-gram, tri-gram, and four-gram features
x_bigram = Conv1D(filters=nb_filters, kernel_size=2, padding="same", activation="relu")(inputs)
x_bigram = GlobalMaxPooling1D()(x_bigram)

x_trigram = Conv1D(filters=nb_filters, kernel_size=3, padding="same", activation="relu")(inputs)
x_trigram = GlobalMaxPooling1D()(x_trigram)

x_fourgram = Conv1D(filters=nb_filters, kernel_size=4, padding="same", activation="relu")(inputs)
x_fourgram = GlobalMaxPooling1D()(x_fourgram)

# Concatenate pooled features from different kernel sizes
merged = Concatenate(axis=-1)([x_bigram, x_trigram, x_fourgram])  # Shape: (batch_size, 3 * nb_filters)

# Dense layer for feature processing after concatenation
merged = Dense(256, activation="relu")(merged)
merged = Dropout(rate=0.2)(merged)

# Reshape to add an extra dimension for MultiHeadAttention compatibility
merged_expanded = Reshape((1, 256))(merged)  # Shape: (batch_size, 1, 256)

# Multi-head attention layer
attention_output = MultiHeadAttention(num_heads=attention_heads, key_dim=64)(merged_expanded, merged_expanded)
attention_output = LayerNormalization()(attention_output + merged_expanded)  # Residual connection

# Flatten the attention output
attention_output = Flatten()(attention_output)

# Final Dense layers for classification
x = Dense(64, activation='relu')(attention_output)
x = Dropout(0.2)(x)
x = Dense(32, activation='relu')(x)
x = Dropout(0.2)(x)

# Output layer for multi-class classification
outputs = Dense(num_classes, activation="softmax")(x)  # Softmax for multi-class

# Define the model
model = Model(inputs=inputs, outputs=outputs)

# Cosine annealing learning rate schedule
cosine_annealing = CosineDecayRestarts(
    initial_learning_rate=0.0005,
    first_decay_steps=first_decay_steps,
    t_mul=2,
    alpha=0.01
)
optimizer = Adam(learning_rate=cosine_annealing)

# Compile the model for multi-class classification
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Early stopping
early_stopping = EarlyStopping(
    monitor='val_accuracy',
    patience=10,
    restore_best_weights=True
)

# Train the model
history = model.fit(
    X_train, y_train,
    epochs=100,  # Limit epochs to 100 to reduce time and memory usage
    validation_data=(X_test, y_test),
#     callbacks=[early_stopping],
    callbacks=[],
    verbose=1
)

# Clear memory after training
tf.keras.backend.clear_session()
gc.collect()  # Explicit garbage collection

# Print completion message
print("Completed training with the multi-kernel CNN-Attention model configuration")


Epoch 1/100


I0000 00:00:1731390575.203451      97 service.cc:145] XLA service 0x7de564088f00 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1731390575.203513      97 service.cc:153]   StreamExecutor device (0): Tesla P100-PCIE-16GB, Compute Capability 6.0


[1m 16/472[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m5s[0m 11ms/step - accuracy: 0.0542 - loss: 3.1382  

I0000 00:00:1731390582.345081      97 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m472/472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 35ms/step - accuracy: 0.0841 - loss: 2.9082 - val_accuracy: 0.4599 - val_loss: 1.6604
Epoch 2/100
[1m472/472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 13ms/step - accuracy: 0.4121 - loss: 1.7939 - val_accuracy: 0.5387 - val_loss: 1.3419
Epoch 3/100
[1m472/472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 13ms/step - accuracy: 0.5434 - loss: 1.3731 - val_accuracy: 0.5777 - val_loss: 1.2810
Epoch 4/100
[1m472/472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 13ms/step - accuracy: 0.5794 - loss: 1.2971 - val_accuracy: 0.6432 - val_loss: 1.1291
Epoch 5/100
[1m472/472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 13ms/step - accuracy: 0.6756 - loss: 0.9829 - val_accuracy: 0.6570 - val_loss: 1.1078
Epoch 6/100
[1m472/472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 13ms/step - accuracy: 0.7227 - loss: 0.8738 - val_accuracy: 0.6427 - val_loss: 1.1340
Epoch 7/100
[1m472/472[0

In [4]:
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")

[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.6713 - loss: 2.9162
Test Loss: 2.793219804763794
Test Accuracy: 0.6806365847587585


In [8]:
import tensorflow as tf
from tensorflow.keras.layers import MultiHeadAttention, Dense, Dropout, Conv1D, GlobalMaxPooling1D, Input, LayerNormalization, Concatenate, BatchNormalization, Reshape, Flatten
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers.schedules import CosineDecayRestarts
import numpy as np
import gc

# Model parameters
max_sequence_length, embedding_dim = 128, 768
nb_filters = 128  # Number of filters for each Conv1D layer
attention_heads = 16  # Best attention heads
first_decay_steps = 40  # Best first decay steps
num_classes = len(np.unique(y_train))  # Assuming y_train is defined with labels

# Define the model with the fixed best parameters
inputs = Input(shape=(max_sequence_length, embedding_dim))

# Multiple Conv1D layers with different kernel sizes for bi-gram, tri-gram, and four-gram features
x_bigram = Conv1D(filters=nb_filters, kernel_size=2, padding="same", activation="relu")(inputs)
x_bigram = BatchNormalization()(x_bigram)
x_bigram = GlobalMaxPooling1D()(x_bigram)  # Max pooling over bigrams

x_trigram = Conv1D(filters=nb_filters, kernel_size=3, padding="same", activation="relu")(inputs)
x_trigram = BatchNormalization()(x_trigram)
x_trigram = GlobalMaxPooling1D()(x_trigram)  # Max pooling over trigrams

x_fourgram = Conv1D(filters=nb_filters, kernel_size=4, padding="same", activation="relu")(inputs)
x_fourgram = BatchNormalization()(x_fourgram)
x_fourgram = GlobalMaxPooling1D()(x_fourgram)  # Max pooling over fourgrams

# Concatenate pooled features from different kernel sizes
merged = Concatenate(axis=-1)([x_bigram, x_trigram, x_fourgram])  # Shape: (batch_size, 3 * nb_filters)

# Dense layer for feature processing after concatenation
merged = Dense(256, activation="relu", kernel_regularizer=tf.keras.regularizers.l2(0.001))(merged)
merged = Dropout(rate=0.3)(merged)

# Reshape for MultiHeadAttention compatibility
merged_expanded = Reshape((1, 256))(merged)  # Shape: (batch_size, 1, 256)

# Multi-head attention layer
attention_output = MultiHeadAttention(num_heads=attention_heads, key_dim=64)(merged_expanded, merged_expanded)
attention_output = LayerNormalization()(attention_output + merged_expanded)  # Residual connection

# Flatten the attention output
attention_output = Flatten()(attention_output)

# Final Dense layers for classification
x = Dense(128, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.001))(attention_output)
x = Dropout(0.3)(x)
x = Dense(64, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.001))(x)
x = Dropout(0.3)(x)

# Output layer for multi-class classification
outputs = Dense(num_classes, activation="softmax")(x)  # Softmax for multi-class

# Define the model
model = Model(inputs=inputs, outputs=outputs)

# Cosine annealing learning rate schedule
cosine_annealing = CosineDecayRestarts(
    initial_learning_rate=0.0005,
    first_decay_steps=first_decay_steps,
    t_mul=2,
    alpha=0.01
)
optimizer = Adam(learning_rate=cosine_annealing)

# Compile the model for multi-class classification
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Early stopping
early_stopping = EarlyStopping(
    monitor='val_accuracy',
    patience=10,
    restore_best_weights=True
)

# Train the model
history = model.fit(
    X_train, y_train,
    epochs=100,
    validation_data=(X_test, y_test),
    callbacks=[early_stopping],
    verbose=1
)

# Clear memory after training
tf.keras.backend.clear_session()
gc.collect()  # Explicit garbage collection

# Print completion message
print("Completed training with the optimized multi-kernel CNN-Attention model configuration including max pooling")


Epoch 1/100
[1m472/472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 36ms/step - accuracy: 0.0984 - loss: 3.4263 - val_accuracy: 0.4944 - val_loss: 2.0955
Epoch 2/100
[1m472/472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 14ms/step - accuracy: 0.4347 - loss: 2.1976 - val_accuracy: 0.5732 - val_loss: 1.6766
Epoch 3/100
[1m472/472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 13ms/step - accuracy: 0.5609 - loss: 1.7557 - val_accuracy: 0.5867 - val_loss: 1.6224
Epoch 4/100
[1m472/472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 14ms/step - accuracy: 0.5933 - loss: 1.6516 - val_accuracy: 0.6377 - val_loss: 1.4665
Epoch 5/100
[1m472/472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 13ms/step - accuracy: 0.6702 - loss: 1.3518 - val_accuracy: 0.6549 - val_loss: 1.4411
Epoch 6/100
[1m472/472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 14ms/step - accuracy: 0.7005 - loss: 1.2498 - val_accuracy: 0.6406 - val_loss: 1.4862
Epoch 7/100
[1

In [None]:
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")

# CNN-LSTM-Attention

In [4]:
import tensorflow as tf
from tensorflow.keras.layers import MultiHeadAttention, Dense, Dropout, Conv1D, GlobalMaxPooling1D, LSTM, Input, LayerNormalization, Concatenate, BatchNormalization, Reshape, Flatten
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers.schedules import CosineDecayRestarts
import numpy as np
import gc

# Model parameters
max_sequence_length, embedding_dim = 128, 768
nb_filters = 128  # Number of filters for each Conv1D layer
attention_heads = 16  # Best attention heads
first_decay_steps = 40  # Best first decay steps
num_classes = len(np.unique(y_train))  # Assuming y_train is defined with labels

# Define the model with the fixed best parameters
inputs = Input(shape=(max_sequence_length, embedding_dim))

# Multiple Conv1D layers with different kernel sizes for bi-gram, tri-gram, and four-gram features
x_bigram = Conv1D(filters=nb_filters, kernel_size=2, padding="same", activation="relu")(inputs)
x_bigram = BatchNormalization()(x_bigram)
x_bigram = GlobalMaxPooling1D()(x_bigram)  # Max pooling over bigrams

x_trigram = Conv1D(filters=nb_filters, kernel_size=3, padding="same", activation="relu")(inputs)
x_trigram = BatchNormalization()(x_trigram)
x_trigram = GlobalMaxPooling1D()(x_trigram)  # Max pooling over trigrams

x_fourgram = Conv1D(filters=nb_filters, kernel_size=4, padding="same", activation="relu")(inputs)
x_fourgram = BatchNormalization()(x_fourgram)
x_fourgram = GlobalMaxPooling1D()(x_fourgram)  # Max pooling over fourgrams

# Concatenate pooled features from different kernel sizes
merged = Concatenate(axis=-1)([x_bigram, x_trigram, x_fourgram])  # Shape: (batch_size, 3 * nb_filters)

# Reshape for LSTM compatibility
merged_reshaped = Reshape((3, nb_filters))(merged)  # Shape: (batch_size, 3, nb_filters)

# LSTM layer for sequential feature processing
lstm_output = LSTM(128, return_sequences=True)(merged_reshaped)

# Flatten LSTM output to connect with Dense layers and attention
flattened_lstm_output = Flatten()(lstm_output)

# Dense layer for feature processing after LSTM
x = Dense(256, activation="relu", kernel_regularizer=tf.keras.regularizers.l2(0.001))(flattened_lstm_output)
x = Dropout(rate=0.3)(x)

# Reshape for MultiHeadAttention compatibility
x_expanded = Reshape((1, 256))(x)  # Shape: (batch_size, 1, 256)

# Multi-head attention layer
attention_output = MultiHeadAttention(num_heads=attention_heads, key_dim=64)(x_expanded, x_expanded)
attention_output = LayerNormalization()(attention_output + x_expanded)  # Residual connection

# Flatten the attention output
attention_output = Flatten()(attention_output)

# Final Dense layers for classification
x = Dense(128, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.001))(attention_output)
x = Dropout(0.3)(x)
x = Dense(64, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.001))(x)
x = Dropout(0.3)(x)

# Output layer for multi-class classification
outputs = Dense(num_classes, activation="softmax")(x)  # Softmax for multi-class

# Define the model
model = Model(inputs=inputs, outputs=outputs)

# Cosine annealing learning rate schedule
cosine_annealing = CosineDecayRestarts(
    initial_learning_rate=0.0005,
    first_decay_steps=first_decay_steps,
    t_mul=2,
    alpha=0.01
)
optimizer = Adam(learning_rate=cosine_annealing)

# Compile the model for multi-class classification
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Early stopping
early_stopping = EarlyStopping(
    monitor='val_accuracy',
    patience=10,
    restore_best_weights=True
)

# Train the model
history = model.fit(
    X_train, y_train,
    epochs=100,
    validation_data=(X_test, y_test),
    callbacks=[],
    verbose=1
)

# Clear memory after training
tf.keras.backend.clear_session()
gc.collect()  # Explicit garbage collection

# Print completion message
print("Completed training with the multi-kernel CNN-Attention-LSTM model configuration")



Epoch 1/100
[1m472/472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 28ms/step - accuracy: 0.0966 - loss: 3.4476 - val_accuracy: 0.4218 - val_loss: 2.2213
Epoch 2/100
[1m472/472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 20ms/step - accuracy: 0.4049 - loss: 2.2545 - val_accuracy: 0.5520 - val_loss: 1.7308
Epoch 3/100
[1m472/472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 20ms/step - accuracy: 0.5361 - loss: 1.7663 - val_accuracy: 0.5759 - val_loss: 1.6686
Epoch 4/100
[1m472/472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 20ms/step - accuracy: 0.5574 - loss: 1.6934 - val_accuracy: 0.6196 - val_loss: 1.4823
Epoch 5/100
[1m472/472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 20ms/step - accuracy: 0.6504 - loss: 1.3677 - val_accuracy: 0.6263 - val_loss: 1.4541
Epoch 6/100
[1m472/472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 20ms/step - accuracy: 0.6796 - loss: 1.2637 - val_accuracy: 0.6228 - val_loss: 1.4459
Epoch 7/100
[1

In [5]:
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")

[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.6646 - loss: 1.9677
Test Loss: 1.8799532651901245
Test Accuracy: 0.673209547996521


# LSTM-CNN

In [3]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, Dropout, Conv1D, GlobalMaxPooling1D, Bidirectional, LSTM, Input, LayerNormalization, Flatten
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers.schedules import CosineDecayRestarts
import numpy as np
import gc

# Model parameters
max_sequence_length, embedding_dim = 128, 768
nb_filters = 128  # Number of filters for the Conv1D layer
attention_heads = 16  # Best attention heads
first_decay_steps = 40  # Best first decay steps
num_classes = len(np.unique(y_train))  # Assuming y_train is defined with labels

# Define the model with the BiLSTM and CNN
inputs = Input(shape=(max_sequence_length, embedding_dim))

# BiLSTM layer for sequential feature processing
bilstm_output = Bidirectional(LSTM(128, return_sequences=True))(inputs)

# Conv1D layer after BiLSTM to extract local features
conv_output = Conv1D(filters=nb_filters, kernel_size=3, padding="same", activation="relu")(bilstm_output)
conv_output = GlobalMaxPooling1D()(conv_output)  # Global max pooling to reduce sequence dimension

# Dense layer for feature processing after convolution
x = Dense(256, activation="relu", kernel_regularizer=tf.keras.regularizers.l2(0.001))(conv_output)
x = Dropout(rate=0.3)(x)

# Final Dense layers for classification
x = Dense(128, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.001))(x)
x = Dropout(0.3)(x)
x = Dense(64, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.001))(x)
x = Dropout(0.3)(x)

# Output layer for multi-class classification
outputs = Dense(num_classes, activation="softmax")(x)  # Softmax for multi-class

# Define the model
model = Model(inputs=inputs, outputs=outputs)

# Cosine annealing learning rate schedule
cosine_annealing = CosineDecayRestarts(
    initial_learning_rate=0.0005,
    first_decay_steps=first_decay_steps,
    t_mul=2,
    alpha=0.01
)
optimizer = Adam(learning_rate=cosine_annealing)

# Compile the model for multi-class classification
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Early stopping
early_stopping = EarlyStopping(
    monitor='val_accuracy',
    patience=10,
    restore_best_weights=True
)

# Train the model
history = model.fit(
    X_train, y_train,
    epochs=100,
    validation_data=(X_test, y_test),
    callbacks=[early_stopping],
    verbose=1
)

# Clear memory after training
tf.keras.backend.clear_session()
gc.collect()  # Explicit garbage collection

# Print completion message
print("Completed training with the BiLSTM-CNN model configuration")

Epoch 1/100
[1m472/472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 37ms/step - accuracy: 0.1498 - loss: 3.0878 - val_accuracy: 0.5268 - val_loss: 1.7044
Epoch 2/100
[1m472/472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 29ms/step - accuracy: 0.4791 - loss: 1.8461 - val_accuracy: 0.6114 - val_loss: 1.4515
Epoch 3/100
[1m472/472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 29ms/step - accuracy: 0.5962 - loss: 1.5152 - val_accuracy: 0.6294 - val_loss: 1.3984
Epoch 4/100
[1m472/472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 29ms/step - accuracy: 0.6219 - loss: 1.4262 - val_accuracy: 0.6454 - val_loss: 1.3290
Epoch 5/100
[1m472/472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 29ms/step - accuracy: 0.6862 - loss: 1.2133 - val_accuracy: 0.6605 - val_loss: 1.3035
Epoch 6/100
[1m472/472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 29ms/step - accuracy: 0.7047 - loss: 1.1434 - val_accuracy: 0.6406 - val_loss: 1.4369
Epoch 7/10

In [4]:
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")

[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - accuracy: 0.7037 - loss: 1.9077
Test Loss: 1.8400700092315674
Test Accuracy: 0.7047745585441589


# LSTM-CNN-Attention

In [3]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, Dropout, Conv1D, GlobalMaxPooling1D, Bidirectional, LSTM, Input, LayerNormalization, Flatten, Attention
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers.schedules import CosineDecayRestarts
import numpy as np
import gc

# Model parameters
max_sequence_length, embedding_dim = 128, 768
nb_filters = 128  # Number of filters for the Conv1D layer
attention_heads = 16  # Best attention heads
first_decay_steps = 40  # Best first decay steps
num_classes = len(np.unique(y_train))  # Assuming y_train is defined with labels

# Define the model with the BiLSTM, CNN, Attention, and Layer Normalization
inputs = Input(shape=(max_sequence_length, embedding_dim))

# BiLSTM layer for sequential feature processing
bilstm_output = Bidirectional(LSTM(128, return_sequences=True))(inputs)
bilstm_output = LayerNormalization()(bilstm_output)  # Apply Layer Normalization after BiLSTM

# Add Attention layer after BiLSTM to focus on relevant sequence information
attention_output = Attention()([bilstm_output, bilstm_output])

# Conv1D layer after Attention to extract local features
conv_output = Conv1D(filters=nb_filters, kernel_size=3, padding="same", activation="relu")(attention_output)
conv_output = GlobalMaxPooling1D()(conv_output)  # Global max pooling to reduce sequence dimension
conv_output = LayerNormalization()(conv_output)  # Apply Layer Normalization after Conv1D

# Dense layer for feature processing after convolution
x = Dense(256, activation="relu", kernel_regularizer=tf.keras.regularizers.l2(0.001))(conv_output)
x = Dropout(rate=0.3)(x)

# Additional Dense layers for classification with regularization and Dropout
x = Dense(128, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.001))(x)
x = Dropout(0.3)(x)
x = Dense(64, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.001))(x)
x = Dropout(0.3)(x)

# Output layer for multi-class classification
outputs = Dense(num_classes, activation="softmax")(x)  # Softmax for multi-class

# Define the model
model = Model(inputs=inputs, outputs=outputs)

# Cosine annealing learning rate schedule
cosine_annealing = CosineDecayRestarts(
    initial_learning_rate=0.001,
    first_decay_steps=first_decay_steps,
    t_mul=2,
    alpha=0.01
)
optimizer = Adam(learning_rate=cosine_annealing)

# Compile the model for multi-class classification
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Early stopping
early_stopping = EarlyStopping(
    monitor='val_accuracy',
    patience=10,
    restore_best_weights=True
)

# Train the model
history = model.fit(
    X_train, y_train,
    epochs=100,
    validation_data=(X_test, y_test),
    callbacks=[early_stopping],
    verbose=1
)

# Clear memory after training
tf.keras.backend.clear_session()
gc.collect()  # Explicit garbage collection

# Print completion message
print("Completed training with the BiLSTM-CNN model configuration with Attention and Layer Normalization")


Epoch 1/100
[1m472/472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 39ms/step - accuracy: 0.1295 - loss: 3.1364 - val_accuracy: 0.5276 - val_loss: 1.6934
Epoch 2/100
[1m472/472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 31ms/step - accuracy: 0.5030 - loss: 1.7870 - val_accuracy: 0.6332 - val_loss: 1.4125
Epoch 3/100
[1m472/472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 31ms/step - accuracy: 0.6294 - loss: 1.4138 - val_accuracy: 0.6186 - val_loss: 1.4244
Epoch 4/100
[1m472/472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 31ms/step - accuracy: 0.6381 - loss: 1.3692 - val_accuracy: 0.6538 - val_loss: 1.2750
Epoch 5/100
[1m472/472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 31ms/step - accuracy: 0.7169 - loss: 1.0964 - val_accuracy: 0.6841 - val_loss: 1.2351
Epoch 6/100
[1m472/472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 31ms/step - accuracy: 0.7524 - loss: 0.9814 - val_accuracy: 0.6377 - val_loss: 1.3558
Epoch 7/10

In [4]:
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")

[1m118/118[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - accuracy: 0.7098 - loss: 1.8359
Test Loss: 1.7726138830184937
Test Accuracy: 0.7161803841590881


# LSTM-CNN-Multihead Attention

In [3]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, Dropout, Conv1D, GlobalMaxPooling1D, Bidirectional, LSTM, Input, LayerNormalization, Flatten, MultiHeadAttention
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers.schedules import CosineDecayRestarts
import numpy as np
import gc

# Model parameters
max_sequence_length, embedding_dim = 128, 768
nb_filters = 128  # Number of filters for the Conv1D layer
attention_heads = 16  # Best attention heads
first_decay_steps = 40  # Best first decay steps
num_classes = len(np.unique(y_train))  # Assuming y_train is defined with labels

# Define the model with the BiLSTM, CNN, Multi-Head Attention, and Layer Normalization
inputs = Input(shape=(max_sequence_length, embedding_dim))

# BiLSTM layer for sequential feature processing
bilstm_output = Bidirectional(LSTM(128, return_sequences=True))(inputs)
bilstm_output = LayerNormalization()(bilstm_output)  # Apply Layer Normalization after BiLSTM

# Add Multi-Head Attention after BiLSTM to focus on relevant sequence information
multihead_attention_output = MultiHeadAttention(num_heads=attention_heads, key_dim=embedding_dim)(bilstm_output, bilstm_output)
multihead_attention_output = LayerNormalization()(multihead_attention_output)  # Layer Normalization after Multi-Head Attention

# Conv1D layer after Multi-Head Attention to extract local features
conv_output = Conv1D(filters=nb_filters, kernel_size=3, padding="same", activation="relu")(multihead_attention_output)
conv_output = GlobalMaxPooling1D()(conv_output)  # Global max pooling to reduce sequence dimension
conv_output = LayerNormalization()(conv_output)  # Apply Layer Normalization after Conv1D

# Dense layer for feature processing after convolution
x = Dense(256, activation="relu", kernel_regularizer=tf.keras.regularizers.l2(0.001))(conv_output)
x = Dropout(rate=0.3)(x)

# Additional Dense layers for classification with regularization and Dropout
x = Dense(128, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.001))(x)
x = Dropout(0.3)(x)
x = Dense(64, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.001))(x)
x = Dropout(0.3)(x)

# Output layer for multi-class classification
outputs = Dense(num_classes, activation="softmax")(x)  # Softmax for multi-class

# Define the model
model = Model(inputs=inputs, outputs=outputs)

# Cosine annealing learning rate schedule
cosine_annealing = CosineDecayRestarts(
    initial_learning_rate=0.001,
    first_decay_steps=first_decay_steps,
    t_mul=2,
    alpha=0.01
)
optimizer = Adam(learning_rate=cosine_annealing)

# Compile the model for multi-class classification
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Early stopping
early_stopping = EarlyStopping(
    monitor='val_accuracy',
    patience=10,
    restore_best_weights=True
)

# Train the model
history = model.fit(
    X_train, y_train,
    epochs=100,
    validation_data=(X_test, y_test),
    callbacks=[early_stopping],
    verbose=1
)

# Clear memory after training
tf.keras.backend.clear_session()
gc.collect()  # Explicit garbage collection

# Print completion message
print("Completed training with the BiLSTM-CNN model configuration with Multi-Head Attention and Layer Normalization")

Epoch 1/100
[1m472/472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 142ms/step - accuracy: 0.0492 - loss: 3.3747 - val_accuracy: 0.0546 - val_loss: 3.2093
Epoch 2/100
[1m472/472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 134ms/step - accuracy: 0.0522 - loss: 3.1948 - val_accuracy: 0.0584 - val_loss: 3.1163
Epoch 3/100
[1m472/472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 134ms/step - accuracy: 0.0667 - loss: 3.0785 - val_accuracy: 0.1515 - val_loss: 2.6216
Epoch 4/100
[1m472/472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 134ms/step - accuracy: 0.1849 - loss: 2.3768 - val_accuracy: 0.3241 - val_loss: 1.8826
Epoch 5/100
[1m472/472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 134ms/step - accuracy: 0.3108 - loss: 1.8985 - val_accuracy: 0.3902 - val_loss: 1.7397
Epoch 6/100
[1m472/472[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 134ms/step - accuracy: 0.3570 - loss: 1.7851 - val_accuracy: 0.4199 - val_loss: 1.6895
Epoc

In [None]:
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")