In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, GRU, Conv1D, MaxPooling1D, GlobalMaxPooling1D
from sklearn.metrics import classification_report, accuracy_score


### **Load and Preprocess Data**

In [2]:
# Load the data
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Map original labels to binary values
label_mapping = {'__label__0': 0, '__label__1': 1}
train_data['label'] = train_data['label'].map(label_mapping)
test_data['label'] = test_data['label'].map(label_mapping)

# Ensure text column is string type and handle missing values
train_data['text'] = train_data['text'].fillna('').astype(str)
test_data['text'] = test_data['text'].fillna('').astype(str)



In [3]:
# Define a simple text preprocessing function
def preprocess_text(text):
    return text.lower()

# Apply preprocessing
train_data['text'] = train_data['text'].apply(preprocess_text)
test_data['text'] = test_data['text'].apply(preprocess_text)

In [4]:
# Tokenize the text data
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_data['text'])

X_train_seq = tokenizer.texts_to_sequences(train_data['text'])
X_test_seq = tokenizer.texts_to_sequences(test_data['text'])

# Pad the sequences
max_sequence_length = 100
X_train_padded = pad_sequences(X_train_seq, maxlen=max_sequence_length)
X_test_padded = pad_sequences(X_test_seq, maxlen=max_sequence_length)

# Extract labels
y_train = train_data['label'].values
y_test = test_data['label'].values

## ***Deep Learning Models***

**LSTM Model**

In [5]:
def create_lstm_model(input_length, vocab_size):
    model = Sequential([
        Embedding(input_dim=vocab_size, output_dim=128, input_length=input_length),
        LSTM(128, return_sequences=False),
        Dropout(0.5),
        Dense(64, activation='relu'),
        Dropout(0.5),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

**GRU Model**

In [6]:
def create_gru_model(input_length, vocab_size):
    model = Sequential([
        Embedding(input_dim=vocab_size, output_dim=128, input_length=input_length),
        GRU(128, return_sequences=False),
        Dropout(0.5),
        Dense(64, activation='relu'),
        Dropout(0.5),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

**CNN Model**

In [7]:
def create_cnn_model(input_length, vocab_size):
    model = Sequential([
        Embedding(input_dim=vocab_size, output_dim=128, input_length=input_length),
        Conv1D(128, 5, activation='relu'),
        MaxPooling1D(pool_size=2),
        GlobalMaxPooling1D(),
        Dense(64, activation='relu'),
        Dropout(0.5),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

### **Train and Evaluate Models**

In [8]:
# Define model parameters
vocab_size = len(tokenizer.word_index) + 1
input_length = max_sequence_length
batch_size = 32
epochs = 5

In [9]:
# Train and evaluate LSTM model
print("Training LSTM model...")
lstm_model = create_lstm_model(input_length, vocab_size)
lstm_model.fit(X_train_padded, y_train, batch_size=batch_size, epochs=epochs, validation_split=0.2)
lstm_y_pred = (lstm_model.predict(X_test_padded) > 0.5).astype(int)
lstm_accuracy = accuracy_score(y_test, lstm_y_pred)
print(f"LSTM Accuracy: {lstm_accuracy:.4f}")
print("LSTM Classification Report:")
print(classification_report(y_test, lstm_y_pred))


Training LSTM model...




Epoch 1/5
[1m441/441[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 145ms/step - accuracy: 0.7833 - loss: 0.5260 - val_accuracy: 0.8612 - val_loss: 0.3793
Epoch 2/5
[1m441/441[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 143ms/step - accuracy: 0.8770 - loss: 0.3401 - val_accuracy: 0.9015 - val_loss: 0.3025
Epoch 3/5
[1m441/441[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 138ms/step - accuracy: 0.9064 - loss: 0.2715 - val_accuracy: 0.8967 - val_loss: 0.3111
Epoch 4/5
[1m441/441[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 139ms/step - accuracy: 0.9142 - loss: 0.2305 - val_accuracy: 0.8780 - val_loss: 0.3453
Epoch 5/5
[1m441/441[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 139ms/step - accuracy: 0.9320 - loss: 0.1871 - val_accuracy: 0.8754 - val_loss: 0.4003
[1m138/138[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 38ms/step
LSTM Accuracy: 0.8705
LSTM Classification Report:
              precision    recall  f1-score   support


In [10]:
# Train and evaluate GRU model
print("\nTraining GRU model...")
gru_model = create_gru_model(input_length, vocab_size)
gru_model.fit(X_train_padded, y_train, batch_size=batch_size, epochs=epochs, validation_split=0.2)
gru_y_pred = (gru_model.predict(X_test_padded) > 0.5).astype(int)
gru_accuracy = accuracy_score(y_test, gru_y_pred)
print(f"GRU Accuracy: {gru_accuracy:.4f}")
print("GRU Classification Report:")
print(classification_report(y_test, gru_y_pred))


Training GRU model...
Epoch 1/5




[1m441/441[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 121ms/step - accuracy: 0.7922 - loss: 0.5170 - val_accuracy: 0.8978 - val_loss: 0.3136
Epoch 2/5
[1m441/441[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 122ms/step - accuracy: 0.8848 - loss: 0.3177 - val_accuracy: 0.9032 - val_loss: 0.3067
Epoch 3/5
[1m441/441[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 121ms/step - accuracy: 0.8934 - loss: 0.2919 - val_accuracy: 0.8953 - val_loss: 0.3297
Epoch 4/5
[1m441/441[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m86s[0m 130ms/step - accuracy: 0.9080 - loss: 0.2481 - val_accuracy: 0.8916 - val_loss: 0.3455
Epoch 5/5
[1m441/441[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 127ms/step - accuracy: 0.9106 - loss: 0.2278 - val_accuracy: 0.8709 - val_loss: 0.3674
[1m138/138[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 31ms/step
GRU Accuracy: 0.8632
GRU Classification Report:
              precision    recall  f1-score   support

           

In [11]:
# Train and evaluate CNN model
print("\nTraining CNN model...")
cnn_model = create_cnn_model(input_length, vocab_size)
cnn_model.fit(X_train_padded, y_train, batch_size=batch_size, epochs=epochs, validation_split=0.2)
cnn_y_pred = (cnn_model.predict(X_test_padded) > 0.5).astype(int)
cnn_accuracy = accuracy_score(y_test, cnn_y_pred)
print(f"CNN Accuracy: {cnn_accuracy:.4f}")
print("CNN Classification Report:")
print(classification_report(y_test, cnn_y_pred))


Training CNN model...
Epoch 1/5




[1m441/441[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 65ms/step - accuracy: 0.7961 - loss: 0.4869 - val_accuracy: 0.9047 - val_loss: 0.2814
Epoch 2/5
[1m441/441[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 65ms/step - accuracy: 0.9028 - loss: 0.2764 - val_accuracy: 0.9030 - val_loss: 0.2859
Epoch 3/5
[1m441/441[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 64ms/step - accuracy: 0.9303 - loss: 0.1914 - val_accuracy: 0.8885 - val_loss: 0.3365
Epoch 4/5
[1m441/441[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 63ms/step - accuracy: 0.9683 - loss: 0.1041 - val_accuracy: 0.8785 - val_loss: 0.4106
Epoch 5/5
[1m441/441[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 63ms/step - accuracy: 0.9887 - loss: 0.0469 - val_accuracy: 0.8686 - val_loss: 0.5453
[1m138/138[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step
CNN Accuracy: 0.8648
CNN Classification Report:
              precision    recall  f1-score   support

           0     

### Hyperparameter Tuning

In [12]:
import keras_tuner as kt
from tensorflow.keras.optimizers import Adam

In [13]:
# Define a hypermodel for LSTM with Keras Tuner
def build_lstm_model(hp):
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=hp.Int('embedding_dim', 64, 256, step=64), input_length=input_length))
    model.add(LSTM(units=hp.Int('lstm_units', 32, 256, step=32), return_sequences=False))
    model.add(Dropout(rate=hp.Float('dropout_rate', 0.2, 0.5, step=0.1)))
    model.add(Dense(units=hp.Int('dense_units', 32, 128, step=32), activation='relu'))
    model.add(Dropout(rate=hp.Float('dropout_rate_2', 0.2, 0.5, step=0.1)))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(
        optimizer=Adam(hp.Float('learning_rate', 1e-4, 1e-2, sampling='log')),
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    return model



# Initialize the tuner
tuner = kt.Hyperband(
    build_lstm_model,
    objective='val_accuracy',
    max_epochs=10,
    factor=3,
    directory='hyperparam_tuning',
    project_name='lstm_tuning'
)
# Search for the best hyperparameters
tuner.search(X_train_padded, y_train, epochs=10, validation_split=0.2)

# Get the best hyperparameters
best_hyperparameters = tuner.get_best_hyperparameters(1)[0]
print(best_hyperparameters.values)

# Get the best model
best_lstm_model = tuner.hypermodel.build(best_hyperparameters)

# Train the best model
best_lstm_model.fit(X_train_padded, y_train, epochs=10, validation_split=0.2)

Reloading Tuner from hyperparam_tuning\lstm_tuning\tuner0.json
{'embedding_dim': 256, 'lstm_units': 160, 'dropout_rate': 0.30000000000000004, 'dense_units': 32, 'dropout_rate_2': 0.30000000000000004, 'learning_rate': 0.0007940243226892147, 'tuner/epochs': 10, 'tuner/initial_epoch': 4, 'tuner/bracket': 1, 'tuner/round': 1, 'tuner/trial_id': '0023'}
Epoch 1/10




[1m441/441[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m101s[0m 219ms/step - accuracy: 0.7950 - loss: 0.4988 - val_accuracy: 0.9010 - val_loss: 0.3002
Epoch 2/10
[1m441/441[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m144s[0m 224ms/step - accuracy: 0.8835 - loss: 0.3213 - val_accuracy: 0.8961 - val_loss: 0.3026
Epoch 3/10
[1m441/441[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 226ms/step - accuracy: 0.9024 - loss: 0.2713 - val_accuracy: 0.8973 - val_loss: 0.3093
Epoch 4/10
[1m441/441[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m99s[0m 226ms/step - accuracy: 0.9034 - loss: 0.2461 - val_accuracy: 0.8990 - val_loss: 0.3339
Epoch 5/10
[1m441/441[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m99s[0m 225ms/step - accuracy: 0.9190 - loss: 0.1991 - val_accuracy: 0.8873 - val_loss: 0.3849
Epoch 6/10
[1m441/441[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m99s[0m 225ms/step - accuracy: 0.9340 - loss: 0.1562 - val_accuracy: 0.8777 - val_loss: 0.4402
Epoch 7/10
[1m441

<keras.src.callbacks.history.History at 0x1c3fdb12790>

In [14]:
# Define hypermodel for GRU
def build_gru_model(hp):
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=hp.Int('embedding_dim', 64, 256, step=64), input_length=input_length))
    model.add(GRU(units=hp.Int('gru_units', 32, 256, step=32), return_sequences=False))
    model.add(Dropout(rate=hp.Float('dropout_rate', 0.2, 0.5, step=0.1)))
    model.add(Dense(units=hp.Int('dense_units', 32, 128, step=32), activation='relu'))
    model.add(Dropout(rate=hp.Float('dropout_rate_2', 0.2, 0.5, step=0.1)))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(
        optimizer=Adam(hp.Float('learning_rate', 1e-4, 1e-2, sampling='log')),
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    return model

In [15]:
# Define hypermodel for CNN
def build_cnn_model(hp):
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=hp.Int('embedding_dim', 64, 256, step=64), input_length=input_length))
    model.add(Conv1D(filters=hp.Int('conv_filters', 32, 128, step=32), kernel_size=hp.Choice('kernel_size', [3, 5, 7]), activation='relu'))
    model.add(MaxPooling1D(pool_size=hp.Choice('pool_size', [2, 3])))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(units=hp.Int('dense_units', 32, 128, step=32), activation='relu'))
    model.add(Dropout(rate=hp.Float('dropout_rate', 0.2, 0.5, step=0.1)))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(
        optimizer=Adam(hp.Float('learning_rate', 1e-4, 1e-2, sampling='log')),
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    return model


In [16]:
# Tune and train models
tuners = {
    'LSTM': kt.Hyperband(build_lstm_model, objective='val_accuracy', max_epochs=10, factor=3, directory='hyperparam_tuning', project_name='lstm_tuning'),
    'GRU': kt.Hyperband(build_gru_model, objective='val_accuracy', max_epochs=10, factor=3, directory='hyperparam_tuning', project_name='gru_tuning'),
    'CNN': kt.Hyperband(build_cnn_model, objective='val_accuracy', max_epochs=10, factor=3, directory='hyperparam_tuning', project_name='cnn_tuning')
}

Reloading Tuner from hyperparam_tuning\lstm_tuning\tuner0.json
Reloading Tuner from hyperparam_tuning\gru_tuning\tuner0.json
Reloading Tuner from hyperparam_tuning\cnn_tuning\tuner0.json


In [17]:
best_models = {}

for model_name, tuner in tuners.items():
    print(f"Tuning and training {model_name}...")
    tuner.search(X_train_padded, y_train, epochs=10, validation_split=0.2)
    best_hyperparameters = tuner.get_best_hyperparameters(1)[0]
    best_model = tuner.hypermodel.build(best_hyperparameters)
    best_model.fit(X_train_padded, y_train, epochs=10, validation_split=0.2)
    best_models[model_name] = best_model


Trial 30 Complete [00h 06m 38s]
val_accuracy: 0.9026674032211304

Best val_accuracy So Far: 0.9060726165771484
Total elapsed time: 03h 41m 23s
Epoch 1/10
[1m441/441[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 28ms/step - accuracy: 0.7627 - loss: 0.5670 - val_accuracy: 0.8306 - val_loss: 0.4250
Epoch 2/10
[1m441/441[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 27ms/step - accuracy: 0.8831 - loss: 0.3291 - val_accuracy: 0.9049 - val_loss: 0.2826
Epoch 3/10
[1m441/441[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 27ms/step - accuracy: 0.9118 - loss: 0.2336 - val_accuracy: 0.8930 - val_loss: 0.3062
Epoch 4/10
[1m441/441[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 29ms/step - accuracy: 0.9462 - loss: 0.1580 - val_accuracy: 0.8851 - val_loss: 0.3569
Epoch 5/10
[1m441/441[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 28ms/step - accuracy: 0.9769 - loss: 0.0864 - val_accuracy: 0.8785 - val_loss: 0.3973
Epoch 6/10
[1m441/441[0m [32m━━━━━

In [20]:
# Evaluate models
results = {}
for model_name, model in best_models.items():
    print(f"\nEvaluating {model_name}...")
    y_pred = (model.predict(X_test_padded) > 0.5).astype(int)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model_name} Accuracy: {accuracy:.4f}")
    print(f"{model_name} Classification Report:")
    print(classification_report(y_test, y_pred))
    results[model_name] = accuracy


Evaluating LSTM...
[1m138/138[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 49ms/step
LSTM Accuracy: 0.8337
LSTM Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.91      0.89      3298
           1       0.69      0.61      0.65      1103

    accuracy                           0.83      4401
   macro avg       0.78      0.76      0.77      4401
weighted avg       0.83      0.83      0.83      4401


Evaluating GRU...
[1m138/138[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 57ms/step
GRU Accuracy: 0.8355
GRU Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.90      0.89      3298
           1       0.68      0.65      0.67      1103

    accuracy                           0.84      4401
   macro avg       0.78      0.77      0.78      4401
weighted avg       0.83      0.84      0.83      4401


Evaluating CNN...
[1m138/138[0m [32m━━━━━━━━━━━━━━━━━

In [19]:
# Print comparison
print("\nModel Comparison:")
for model_name, accuracy in results.items():
    print(f"{model_name}: {accuracy:.4f}")


Model Comparison:
LSTM: 0.8337
GRU: 0.8355
CNN: 0.8680
