Models Used:
Multi-Layer Perceptron (MLP)
Convolutional Neural Network (CNN)
Long Short-Term Memory (LSTM)
Simple Recurrent Neural Network (RNN, implemented with Keras SimpleRNN)
Steps:
Data Loading and Preprocessing:

Load the dataset.
Handle missing values in the 'comment' column by filling them with an empty string.
Encode the target labels using LabelEncoder.
Vectorize the text data using TF-IDF with a maximum of 5000 features.
Split the data into training and testing sets.
Standardize the features (scale each feature to unit variance without centering, using StandardScaler with with_mean=False).
Model Building:

Define a function to build and compile models based on the specified type (MLP, CNN, LSTM, RNN).
Add appropriate layers for each model type, with specific configurations for each (e.g., Dense layers for MLP, Conv1D for CNN).
Model Training and Evaluation:

Train each model on the training data for 10 epochs with a batch size of 32.
Predict labels for the test data (thresholding the sigmoid output at 0.5) and compute the F1 score for each model.
Print and compare the F1 scores for each model.
Comparison:
Models are compared based on their F1 scores, which measure the balance between precision and recall.
The performance of each model is evaluated to determine which one handles the classification task most effectively.

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, Flatten, LSTM, SimpleRNN

# Load the dataset
data = pd.read_csv('cleaned_balanced_dataset_FINAL.csv')

# Replace missing comments with an empty string. The result is assigned back
# to the column instead of calling fillna(..., inplace=True) on the selected
# column: the in-place form is a chained assignment that may operate on a
# temporary copy and leave `data` unchanged.
data['comment'] = data['comment'].fillna('')

# Encode target labels as integers
label_column = 'label'
label_encoder = LabelEncoder()
data[label_column] = label_encoder.fit_transform(data[label_column])

# Raw text features and encoded target.
# NOTE: X holds the raw comment text here; vectorization happens after the
# split so that the test set never influences the fitted transformers.
X = data['comment']
y = data[label_column]

# Train-test split (done BEFORE vectorization to avoid data leakage)
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Text vectorization using TF-IDF: vocabulary and IDF weights are learned from
# the training split only, then applied unchanged to the test split.
tfidf = TfidfVectorizer(max_features=5000)
X_train = tfidf.fit_transform(X_train_text)
X_test = tfidf.transform(X_test_text)

# Standardize the data. with_mean=False scales to unit variance without
# centering, so the scaler can work directly on the sparse TF-IDF matrices;
# the result is converted to dense arrays only at the very end.
scaler = StandardScaler(with_mean=False)
X_train = scaler.fit_transform(X_train).toarray()
X_test = scaler.transform(X_test).toarray()

# Define a function to build and compile the model
def build_and_compile_model(model_type, input_shape):
    """Build and compile a binary classifier of the requested architecture.

    Args:
        model_type: Architecture name; one of 'MLP', 'CNN', 'LSTM' or 'RNN'.
        input_shape: Shape of a single sample, excluding the batch dimension.

    Returns:
        A compiled Sequential model that ends in a single sigmoid unit and is
        compiled with the Adam optimizer, binary cross-entropy loss and the
        accuracy metric.

    Raises:
        ValueError: If model_type is not one of the supported architectures.
    """
    if model_type == 'MLP':
        hidden_layers = [
            Dense(64, activation='relu'),
            Dense(32, activation='relu'),
        ]
    elif model_type == 'CNN':
        hidden_layers = [Conv1D(32, 3, activation='relu'), Flatten()]
    elif model_type == 'LSTM':
        hidden_layers = [LSTM(64)]
    elif model_type == 'RNN':
        hidden_layers = [SimpleRNN(64)]
    else:
        # Without this guard an unknown name would silently produce a model
        # consisting of the output layer only.
        raise ValueError(
            f"Unknown model_type {model_type!r}; "
            "expected one of 'MLP', 'CNN', 'LSTM', 'RNN'"
        )

    model = Sequential()
    # Declare the input explicitly instead of passing input_shape= to the
    # first layer, which newer Keras versions warn about.
    model.add(tf.keras.Input(shape=input_shape))
    for layer in hidden_layers:
        model.add(layer)
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Add a trailing axis of size 1 so each sample has shape (n_features, 1),
# the 3-D layout fed to the LSTM/RNN models.
X_train_rnn = np.expand_dims(X_train, axis=-1)
X_test_rnn = np.expand_dims(X_test, axis=-1)

# Per-sample input shapes (batch dimension excluded) for each model family
n_features = X_train.shape[1]
input_shape_mlp = (n_features,)
input_shape_cnn = (n_features, 1)
input_shape_rnn = tuple(X_train_rnn.shape[1:])

# Build one compiled model per architecture, in a fixed order
model_input_shapes = [
    ('MLP', input_shape_mlp),
    ('CNN', input_shape_cnn),
    ('LSTM', input_shape_rnn),
    ('RNN', input_shape_rnn),
]
models = {
    architecture: build_and_compile_model(architecture, shape)
    for architecture, shape in model_input_shapes
}

# Fit every model, predict on the held-out data and record its F1 score
f1_scores = {}
for model_name, model in models.items():
    print(f"Training {model_name} model...")

    # CNN, LSTM and RNN consume the 3-D (samples, features, 1) arrays;
    # any other model is trained on the flat 2-D feature matrix.
    if model_name in ('CNN', 'LSTM', 'RNN'):
        train_inputs, test_inputs = X_train_rnn, X_test_rnn
    else:
        train_inputs, test_inputs = X_train, X_test

    model.fit(train_inputs, y_train, epochs=10, batch_size=32, verbose=1)

    # Turn sigmoid probabilities into hard 0/1 labels at a 0.5 threshold
    probabilities = model.predict(test_inputs)
    y_pred = (probabilities > 0.5).astype("int32")

    f1 = f1_score(y_test, y_pred)
    f1_scores[model_name] = f1
    print(f"{model_name} F1 Score: {f1}")

# Summary of the F1 score obtained by each model
print("F1 Scores for different models:")
for model_name, score in f1_scores.items():
    print(f"{model_name}: {score}")


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(**kwargs)


Training MLP model...
Epoch 1/10
[1m3251/3251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 7ms/step - accuracy: 0.5985 - loss: 0.6741
Epoch 2/10
[1m3251/3251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 5ms/step - accuracy: 0.6922 - loss: 0.5813
Epoch 3/10
[1m3251/3251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 6ms/step - accuracy: 0.7302 - loss: 0.5340
Epoch 4/10
[1m3251/3251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 5ms/step - accuracy: 0.7739 - loss: 0.4731
Epoch 5/10
[1m3251/3251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 6ms/step - accuracy: 0.8187 - loss: 0.3980
Epoch 6/10
[1m3251/3251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 6ms/step - accuracy: 0.8550 - loss: 0.3311
Epoch 7/10
[1m3251/3251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 6ms/step - accuracy: 0.8794 - loss: 0.2821
Epoch 8/10
[1m3251/3251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 6ms/step - accuracy: 0.8956 - loss: 0