#  __Deep Learning Models__



## Import necessary libraries

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, SpatialDropout1D, Conv1D, GlobalMaxPooling1D, Bidirectional
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.regularizers import l2

In [5]:
#Load the cleaned dataset
df = pd.read_csv(r'C:\group-1-main\Model-Evaluvation\cleaned_data.csv')
df['tweet'] = df['tweet'].astype(str)


## Encoding and Handling Data Imbalance

In [6]:
# Split the data into training and testing sets
X = df['tweet']
y = df['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and fit the tokenizer on training data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

# Transform training and test data
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad sequences to the same length
X_train_pad = pad_sequences(X_train_seq, maxlen=100)
X_test_pad = pad_sequences(X_test_seq, maxlen=100)

vocab_size = len(tokenizer.word_index) + 1

# Address class imbalance using SMOTE on training data
smote = SMOTE(random_state=42)
X_train_pad_res, y_train_pad_res = smote.fit_resample(X_train_pad, y_train)


# Define early stopping callback
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

## 1.  __LSTM__ model

In [7]:
# Define LSTM model
lstm_model = Sequential()
lstm_model.add(Embedding(input_dim=vocab_size, output_dim=128))
lstm_model.add(SpatialDropout1D(0.2))  # Dropout layer to prevent overfitting
lstm_model.add(Bidirectional(LSTM(100, dropout=0.2, recurrent_dropout=0.2, kernel_regularizer=l2(0.01))))
lstm_model.add(Dense(3, activation='softmax'))  # Assuming 3 classes

# Compile the model with Adam optimizer
lstm_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the LSTM model with early stopping
lstm_model.fit(X_train_pad_res, y_train_pad_res, epochs=20, batch_size=64, validation_data=(X_test_pad, y_test), callbacks=[early_stopping])

# Evaluate LSTM model
lstm_loss, lstm_accuracy = lstm_model.evaluate(X_test_pad, y_test)
lstm_y_pred = lstm_model.predict(X_test_pad)
lstm_y_pred_classes = np.argmax(lstm_y_pred, axis=1)
lstm_report = classification_report(y_test, lstm_y_pred_classes, target_names=['Normal', 'Hate', 'Offensive'])

print("LSTM Model Accuracy:", lstm_accuracy)
print("LSTM Model Classification Report:\n", lstm_report)

Epoch 1/20
[1m722/722[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m104s[0m 137ms/step - accuracy: 0.5552 - loss: 1.5461 - val_accuracy: 0.7420 - val_loss: 0.6746
Epoch 2/20
[1m722/722[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m101s[0m 140ms/step - accuracy: 0.6849 - loss: 0.7347 - val_accuracy: 0.8082 - val_loss: 0.5626
Epoch 3/20
[1m722/722[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m99s[0m 137ms/step - accuracy: 0.7223 - loss: 0.6692 - val_accuracy: 0.7903 - val_loss: 0.6082
Epoch 4/20
[1m722/722[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m103s[0m 143ms/step - accuracy: 0.7520 - loss: 0.6125 - val_accuracy: 0.7826 - val_loss: 0.6527
Epoch 5/20
[1m722/722[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m103s[0m 143ms/step - accuracy: 0.7727 - loss: 0.5722 - val_accuracy: 0.7915 - val_loss: 0.6560
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 24ms/step - accuracy: 0.8024 - loss: 0.5715
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s

## 2.  __CNN__ 

In [8]:
# Define CNN model
cnn_model = Sequential()
cnn_model.add(Embedding(input_dim=vocab_size, output_dim=128))
cnn_model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
cnn_model.add(GlobalMaxPooling1D())
cnn_model.add(Dense(3, activation='softmax'))  # Assuming 3 classes

# Compile the model with Adam optimizer
cnn_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the CNN model with early stopping
cnn_model.fit(X_train_pad_res, y_train_pad_res, epochs=20, batch_size=64, validation_data=(X_test_pad, y_test), callbacks=[early_stopping])

# Evaluate CNN model
cnn_loss, cnn_accuracy = cnn_model.evaluate(X_test_pad, y_test)
cnn_y_pred = cnn_model.predict(X_test_pad)
cnn_y_pred_classes = np.argmax(cnn_y_pred, axis=1)
cnn_report = classification_report(y_test, cnn_y_pred_classes, target_names=['Normal', 'Hate', 'Offensive'])

print("CNN Model Accuracy:", cnn_accuracy)
print("CNN Model Classification Report:\n", cnn_report)

Epoch 1/20
[1m722/722[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 40ms/step - accuracy: 0.6113 - loss: 0.8376 - val_accuracy: 0.8339 - val_loss: 0.4827
Epoch 2/20
[1m722/722[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 40ms/step - accuracy: 0.7740 - loss: 0.5429 - val_accuracy: 0.7868 - val_loss: 0.5988
Epoch 3/20
[1m722/722[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 40ms/step - accuracy: 0.8895 - loss: 0.3135 - val_accuracy: 0.8000 - val_loss: 0.5983
Epoch 4/20
[1m722/722[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 39ms/step - accuracy: 0.9497 - loss: 0.1628 - val_accuracy: 0.7778 - val_loss: 0.7348
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.8313 - loss: 0.4881
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step
CNN Model Accuracy: 0.833871603012085
CNN Model Classification Report:
               precision    recall  f1-score   support

      Normal       0.23      0.45

## 3.  __Bidirectional LSTM__ model

In [9]:
# Define Bidirectional LSTM model
bi_lstm_model = Sequential()
bi_lstm_model.add(Embedding(input_dim=vocab_size, output_dim=128))
bi_lstm_model.add(SpatialDropout1D(0.2))  # Dropout layer to prevent overfitting
bi_lstm_model.add(Bidirectional(LSTM(100, dropout=0.2, recurrent_dropout=0.2, kernel_regularizer=l2(0.01))))
bi_lstm_model.add(Dense(3, activation='softmax'))  # Assuming 3 classes

# Compile the model with Adam optimizer
bi_lstm_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the Bidirectional LSTM model with early stopping
bi_lstm_model.fit(X_train_pad_res, y_train_pad_res, epochs=20, batch_size=64, validation_data=(X_test_pad, y_test), callbacks=[early_stopping])

# Evaluate Bidirectional LSTM model
bi_lstm_loss, bi_lstm_accuracy = bi_lstm_model.evaluate(X_test_pad, y_test)
bi_lstm_y_pred = bi_lstm_model.predict(X_test_pad)
bi_lstm_y_pred_classes = np.argmax(bi_lstm_y_pred, axis=1)
bi_lstm_report = classification_report(y_test, bi_lstm_y_pred_classes, target_names=['Normal', 'Hate', 'Offensive'])

print("Bidirectional LSTM Model Accuracy:", bi_lstm_accuracy)
print("Bidirectional LSTM Model Classification Report:\n", bi_lstm_report)

Epoch 1/20
[1m722/722[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m102s[0m 134ms/step - accuracy: 0.5566 - loss: 1.5559 - val_accuracy: 0.7951 - val_loss: 0.5978
Epoch 2/20
[1m722/722[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m91s[0m 126ms/step - accuracy: 0.6838 - loss: 0.7385 - val_accuracy: 0.8066 - val_loss: 0.5818
Epoch 3/20
[1m722/722[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m92s[0m 128ms/step - accuracy: 0.7224 - loss: 0.6699 - val_accuracy: 0.7840 - val_loss: 0.6472
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 22ms/step - accuracy: 0.7918 - loss: 0.6058
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 25ms/step
Bidirectional LSTM Model Accuracy: 0.795115053653717
Bidirectional LSTM Model Classification Report:
               precision    recall  f1-score   support

      Normal       0.19      0.50      0.27       282
        Hate       0.97      0.79      0.87      3798
   Offensive       0.71      0.91      0.80       874