#  __Deep Learning Models__



## Import necessary libraries

In [32]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, f1_score
from imblearn.over_sampling import SMOTE
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Dense, Dropout, Input, Layer
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf
from tensorflow.keras.layers import Attention

## Loading and Preprocessing the Data

In [45]:
df = pd.read_csv('C:\\group-1-main\\Model-Evaluvation\\cleaned_data.csv')

# Ensure all entries in the text column are strings
df['tweet'] = df['tweet'].astype(str).fillna('')

# Filter the dataset to include only hate speech (0), offensive language (1), and non-hate speech (2)
df = df[df['class'].isin([0, 1, 2])]

# Encode the labels (0 and 1 for hate speech and offensive language, 2 for non-hate speech)
label_mapping = {0: 0, 1: 0, 2: 1}
df['label'] = df['class'].map(label_mapping)
label_distribution = df['label'].value_counts()
print(label_distribution)

# Split the data
X = df['tweet'].values
y = df['label'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Ensure all training and test data are strings
X_train = [str(text) for text in X_train]
X_test = [str(text) for text in X_test]


label
0    20608
1     4158
Name: count, dtype: int64


## Tokenizing and Padding the Sequences

In [34]:
# Tokenize the text
tokenizer = Tokenizer(num_words=10000)  # Adjust num_words as needed
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad the sequences
max_length = 100  # Adjust max_length as needed
X_train_pad = pad_sequences(X_train_seq, maxlen=max_length)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_length)

## Loading GloVe Embeddings

In [35]:
# Load GloVe embeddings
def load_glove_embeddings(filepath, word_index, embedding_dim=100):
    embeddings_index = {}
    with open(filepath, 'r', encoding='utf8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs

    embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

embedding_dim = 100
glove_filepath = 'C:\\group-1-main\\Model-Evaluvation\\DL_Models\\glove.6B.100d.txt'  # Update the path to your GloVe file
embedding_matrix = load_glove_embeddings(glove_filepath, tokenizer.word_index, embedding_dim)

print("Shape of training data:", X_train_pad_res.shape)
print("Shape of test data:", X_test_pad.shape)

Shape of training data: (33056, 100)
Shape of test data: (4954, 100)


## Defining Custom Layer
    -This cell defines a custom layer to compute the mean of the input tensor along the specified axis.

In [36]:
# Custom layer to wrap tf.reduce_mean
class ReduceMeanLayer(Layer):
    def call(self, inputs):
        return tf.reduce_mean(inputs, axis=1)


## 1.  __LSTM__ model with Attention Model
    -LSTM model with an Attention mechanism. The model includes an embedding layer, LSTM layer, attention layer, dense layer, and output layer.

In [37]:
# Define LSTM model with Attention
def create_lstm_attention_model():
    inputs = Input(shape=(max_length,))
    embedding_layer = Embedding(input_dim=len(tokenizer.word_index) + 1,
                                output_dim=embedding_dim,
                                weights=[embedding_matrix],
                                input_length=max_length,
                                trainable=False)(inputs)
    lstm_layer = LSTM(128, return_sequences=True)(embedding_layer)
    attention_layer = Attention()([lstm_layer, lstm_layer])
    attention_output = ReduceMeanLayer()(attention_layer)
    dense_layer = Dense(64, activation='relu')(attention_output)
    dropout_layer = Dropout(0.5)(dense_layer)
    outputs = Dense(2, activation='softmax')(dropout_layer)
    model = Model(inputs=inputs, outputs=outputs)
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

## 2.  __CNN__ 

In [38]:
# Define CNN model
def create_cnn_model():
    model = Sequential()
    model.add(Embedding(input_dim=len(tokenizer.word_index) + 1,
                        output_dim=embedding_dim,
                        weights=[embedding_matrix],
                        input_length=max_length,
                        trainable=False))
    model.add(Conv1D(128, 5, activation='relu'))
    model.add(MaxPooling1D(pool_size=4))
    model.add(GlobalMaxPooling1D())
    model.add(Dropout(0.5))
    model.add(Dense(2, activation='softmax'))
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

## 3.  __Bidirectional LSTM__ model

In [39]:
# Define Bidirectional LSTM model
def create_bidirectional_lstm_model():
    model = Sequential()
    model.add(Embedding(input_dim=len(tokenizer.word_index) + 1,
                        output_dim=embedding_dim,
                        weights=[embedding_matrix],
                        input_length=max_length,
                        trainable=False))
    model.add(Bidirectional(LSTM(128, activation='tanh', recurrent_activation='sigmoid')))
    model.add(Dropout(0.5))
    model.add(Dense(2, activation='softmax'))
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

## Setting Up Early Stopping

In [40]:
# Early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

### Train and Evaluate LSTM with Attention

In [41]:
# Train and evaluate LSTM with Attention model
lstm_attention_model = create_lstm_attention_model()
history_lstm_attention = lstm_attention_model.fit(X_train_pad, y_train, validation_split=0.2, epochs=20, batch_size=32, callbacks=[early_stopping], verbose=2)
y_pred_lstm_attention = np.argmax(lstm_attention_model.predict(X_test_pad), axis=1)
accuracy_lstm_attention = accuracy_score(y_test, y_pred_lstm_attention)
f1_lstm_attention = f1_score(y_test, y_pred_lstm_attention, average='weighted')
report_lstm_attention = classification_report(y_test, y_pred_lstm_attention, target_names=['Hate Speech', 'Non-Hate Speech'])

# Save LSTM with Attention model
lstm_attention_model.save('lstm_attention_model.h5')


Epoch 1/20




496/496 - 32s - 64ms/step - accuracy: 0.8879 - loss: 0.2472 - val_accuracy: 0.9140 - val_loss: 0.1872
Epoch 2/20
496/496 - 28s - 57ms/step - accuracy: 0.9162 - loss: 0.1694 - val_accuracy: 0.9187 - val_loss: 0.1731
Epoch 3/20
496/496 - 29s - 58ms/step - accuracy: 0.9231 - loss: 0.1574 - val_accuracy: 0.9266 - val_loss: 0.1676
Epoch 4/20
496/496 - 29s - 58ms/step - accuracy: 0.9281 - loss: 0.1483 - val_accuracy: 0.9263 - val_loss: 0.1643
Epoch 5/20
496/496 - 28s - 57ms/step - accuracy: 0.9330 - loss: 0.1394 - val_accuracy: 0.9230 - val_loss: 0.1772
Epoch 6/20
496/496 - 28s - 56ms/step - accuracy: 0.9377 - loss: 0.1333 - val_accuracy: 0.9251 - val_loss: 0.1690
Epoch 7/20
496/496 - 28s - 56ms/step - accuracy: 0.9442 - loss: 0.1236 - val_accuracy: 0.9326 - val_loss: 0.1653
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 23ms/step




### Train and Evaluate Bidirectional LSTM

In [42]:
# Train and evaluate Bidirectional LSTM model
bidirectional_lstm_model = create_bidirectional_lstm_model()
history_bidirectional_lstm = bidirectional_lstm_model.fit(X_train_pad, y_train, validation_split=0.2, epochs=20, batch_size=32, callbacks=[early_stopping], verbose=2)
y_pred_bidirectional_lstm = np.argmax(bidirectional_lstm_model.predict(X_test_pad), axis=1)
accuracy_bidirectional_lstm = accuracy_score(y_test, y_pred_bidirectional_lstm)
f1_bidirectional_lstm = f1_score(y_test, y_pred_bidirectional_lstm, average='weighted')
report_bidirectional_lstm = classification_report(y_test, y_pred_bidirectional_lstm, target_names=['Hate Speech', 'Non-Hate Speech'])

# Save Bidirectional LSTM model
bidirectional_lstm_model.save('bidirectional_lstm_model.h5')


Epoch 1/20




496/496 - 41s - 82ms/step - accuracy: 0.9017 - loss: 0.2173 - val_accuracy: 0.9165 - val_loss: 0.1778
Epoch 2/20
496/496 - 37s - 74ms/step - accuracy: 0.9264 - loss: 0.1583 - val_accuracy: 0.9238 - val_loss: 0.1665
Epoch 3/20
496/496 - 36s - 72ms/step - accuracy: 0.9346 - loss: 0.1445 - val_accuracy: 0.9246 - val_loss: 0.1657
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 29ms/step




### Train and Evaluate CNN Model

In [43]:
# Train and evaluate CNN model
cnn_model = create_cnn_model()
history_cnn = cnn_model.fit(X_train_pad, y_train, validation_split=0.2, epochs=20, batch_size=32, callbacks=[early_stopping], verbose=2)
y_pred_cnn = np.argmax(cnn_model.predict(X_test_pad), axis=1)
accuracy_cnn = accuracy_score(y_test, y_pred_cnn)
f1_cnn = f1_score(y_test, y_pred_cnn, average='weighted')
report_cnn = classification_report(y_test, y_pred_cnn, target_names=['Hate Speech', 'Non-Hate Speech'])

# Save CNN model
cnn_model.save('cnn_model.h5')


Epoch 1/20




496/496 - 6s - 12ms/step - accuracy: 0.8930 - loss: 0.2396 - val_accuracy: 0.9172 - val_loss: 0.1772
Epoch 2/20
496/496 - 5s - 10ms/step - accuracy: 0.9282 - loss: 0.1597 - val_accuracy: 0.9215 - val_loss: 0.1690
Epoch 3/20
496/496 - 5s - 9ms/step - accuracy: 0.9420 - loss: 0.1369 - val_accuracy: 0.9208 - val_loss: 0.1758
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step




## Comparing Models and Saving the Best One

In [44]:
# Compare model performances
model_performance = {
    'Model': ['LSTM with Attention', 'Bidirectional LSTM', 'CNN'],
    'Accuracy': [accuracy_lstm_attention, accuracy_bidirectional_lstm, accuracy_cnn],
    'F1 Score': [f1_lstm_attention, f1_bidirectional_lstm, f1_cnn]
}

performance_df = pd.DataFrame(model_performance)

# Display the performance of each model
print(performance_df)

# Determine the best model based on F1 Score
best_model_index = performance_df['F1 Score'].idxmax()
best_model_name = performance_df.iloc[best_model_index]['Model']

# Save the best model
if best_model_name == 'LSTM with Attention':
    best_model = lstm_attention_model
elif best_model_name == 'Bidirectional LSTM':
    best_model = bidirectional_lstm_model
else:
    best_model = cnn_model

best_model.save('best_model.h5')

print(f"The best model is {best_model_name} with an F1 Score of {performance_df.iloc[best_model_index]['F1 Score']:.4f}")




                 Model  Accuracy  F1 Score
0  LSTM with Attention  0.923496  0.925898
1   Bidirectional LSTM  0.916027  0.917746
2                  CNN  0.921074  0.920877
The best model is LSTM with Attention with an F1 Score of 0.9259
