In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Input, Layer
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import tensorflow as tf
from tensorflow.keras.layers import Attention

In [11]:
# Load the dataset
df = pd.read_csv('C:\\group-1-main\\Model-Evaluvation\\cleaned_data.csv')

# Ensure all entries in the text column are strings.
# Missing values must be replaced BEFORE the cast: casting first can turn NaN
# into the literal text 'nan', which fillna('') no longer recognises as missing.
df['tweet'] = df['tweet'].fillna('').astype(str)

# Keep only hate speech (0), offensive language (1), and non-hate speech (2).
# .copy() makes the filtered frame independent, so adding the 'label' column
# below does not write into a slice of the original frame.
df = df[df['class'].isin([0, 1, 2])].copy()

# Collapse the three classes into a binary target:
# hate speech (0) and offensive language (1) -> 0, non-hate speech (2) -> 1
label_mapping = {0: 0, 1: 0, 2: 1}
df['label'] = df['class'].map(label_mapping)

# Split the data (80% train / 20% test, fixed seed for a repeatable split)
X = df['tweet'].values
y = df['label'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Hand plain Python lists of str to the downstream text-processing cells
X_train = [str(text) for text in X_train]
X_test = [str(text) for text in X_test]

In [12]:
# Build the vocabulary from the training split only, then map both splits
# to integer sequences with that same vocabulary.
tokenizer = Tokenizer(num_words=10000)  # Adjust num_words as needed
tokenizer.fit_on_texts(X_train)
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

# Bring every sequence to a common length of max_length
max_length = 100  # Adjust max_length as needed
X_train_pad, X_test_pad = (
    pad_sequences(sequences, maxlen=max_length)
    for sequences in (X_train_seq, X_test_seq)
)

In [13]:
# Load GloVe embeddings
def load_glove_embeddings(filepath, word_index, embedding_dim=100):
    """Build an embedding matrix for ``word_index`` from a GloVe text file.

    Each non-blank line of the file is expected to hold a word followed by
    its vector components, separated by whitespace.

    Args:
        filepath: Path to the GloVe text file (read as UTF-8).
        word_index: Mapping of word -> integer row index.
        embedding_dim: Number of components each vector must have.

    Returns:
        A NumPy array of shape ``(len(word_index) + 1, embedding_dim)``.
        Row ``i`` holds the vector of the word mapped to ``i``; rows for
        words absent from the file (and row 0, if unused) stay all-zero.

    Raises:
        ValueError: If a vector for a word in ``word_index`` does not have
            exactly ``embedding_dim`` components or is not numeric.
    """
    embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))

    with open(filepath, 'r', encoding='utf8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank line (e.g. trailing newline at end of file): nothing to parse.
                continue

            # Only vectors for words we actually index are needed, so skip
            # everything else instead of holding the whole file in memory.
            row = word_index.get(values[0])
            if row is None:
                continue

            coefs = np.asarray(values[1:], dtype='float32')
            if coefs.shape[0] != embedding_dim:
                raise ValueError(
                    f"Vector for {values[0]!r} has {coefs.shape[0]} components, "
                    f"expected embedding_dim={embedding_dim}"
                )
            embedding_matrix[row] = coefs

    return embedding_matrix

glove_filepath = 'C:\\group-1-main\\Model-Evaluvation\\DL_Models\\glove.6B.100d.txt'  # Update the path to your GloVe file
embedding_dim = 100

# One row per tokenizer word index, filled from the GloVe file
embedding_matrix = load_glove_embeddings(
    glove_filepath,
    tokenizer.word_index,
    embedding_dim=embedding_dim,
)

# Sanity check: report the padded input shapes
print("Shape of training data:", X_train_pad.shape)
print("Shape of test data:", X_test_pad.shape)

Shape of training data: (19812, 100)
Shape of test data: (4954, 100)


In [14]:
# Custom layer that averages its input over axis 1
class ReduceMeanLayer(Layer):
    """Layer wrapper around a mean reduction over axis 1.

    Packaging the reduction as a Layer lets it be applied to the output of
    another layer when assembling a model.
    """

    def call(self, inputs):
        """Return ``inputs`` averaged along axis 1 (that axis is dropped)."""
        averaged = tf.math.reduce_mean(inputs, axis=1)
        return averaged

In [15]:
# Define LSTM model with Attention
def create_lstm_attention_model():
    """Build and compile the LSTM + self-attention classifier.

    Reads the notebook-level names ``max_length``, ``embedding_dim``,
    ``embedding_matrix`` and ``tokenizer``.

    Pipeline: frozen pretrained embedding -> LSTM(128, full sequence) ->
    Attention with the LSTM output as both query and value -> mean over
    axis 1 -> Dense(64, relu) -> Dropout(0.5) -> Dense(2, softmax).

    Returns:
        A Model compiled with the Adam optimizer, sparse categorical
        cross-entropy loss and an accuracy metric.
    """
    vocab_size = len(tokenizer.word_index) + 1

    inputs = Input(shape=(max_length,))

    # The deprecated `input_length` argument is omitted: the sequence length
    # is already fixed by the Input shape above.
    embedding = Embedding(input_dim=vocab_size,
                          output_dim=embedding_dim,
                          trainable=False)
    embedding_layer = embedding(inputs)
    # Calling the layer above created its weight; load the pretrained vectors
    # explicitly rather than through the legacy `weights=` constructor argument.
    embedding.set_weights([embedding_matrix])

    lstm_layer = LSTM(128, return_sequences=True)(embedding_layer)
    # Self-attention: the LSTM output is passed as both query and value.
    attention_layer = Attention()([lstm_layer, lstm_layer])
    # Collapse axis 1 by averaging so the Dense head receives one vector per sample.
    attention_output = ReduceMeanLayer()(attention_layer)
    dense_layer = Dense(64, activation='relu')(attention_output)
    dropout_layer = Dropout(0.5)(dense_layer)
    outputs = Dense(2, activation='softmax')(dropout_layer)

    model = Model(inputs=inputs, outputs=outputs)
    # Integer labels with a 2-way softmax output -> sparse categorical cross-entropy.
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

# Create the compiled LSTM + attention model that the cells below train,
# evaluate and save
lstm_attention_model = create_lstm_attention_model()



In [16]:
# Callbacks: both watch the validation loss.
checkpoint_filepath = 'best_lstm_model_checkpoint.keras'

# Stop after 5 epochs without a val_loss improvement of at least 0.001,
# then roll the model back to its best weights.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    min_delta=0.001,
    restore_best_weights=True,
)

# Write the model to checkpoint_filepath only when val_loss reaches a new minimum.
model_checkpoint = ModelCheckpoint(
    filepath=checkpoint_filepath,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
)

In [17]:
# Fit on the padded training sequences; 20% of them are held out for
# validation, which drives the early-stopping and checkpoint callbacks.
history = lstm_attention_model.fit(
    X_train_pad,
    y_train,
    batch_size=32,
    epochs=20,
    validation_split=0.2,
    callbacks=[early_stopping, model_checkpoint],
    verbose=2,
)


Epoch 1/20
496/496 - 37s - 74ms/step - accuracy: 0.8917 - loss: 0.2415 - val_accuracy: 0.9142 - val_loss: 0.1760
Epoch 2/20
496/496 - 35s - 71ms/step - accuracy: 0.9202 - loss: 0.1649 - val_accuracy: 0.9215 - val_loss: 0.1713
Epoch 3/20
496/496 - 34s - 68ms/step - accuracy: 0.9280 - loss: 0.1528 - val_accuracy: 0.9266 - val_loss: 0.1642
Epoch 4/20
496/496 - 34s - 69ms/step - accuracy: 0.9339 - loss: 0.1420 - val_accuracy: 0.9326 - val_loss: 0.1576
Epoch 5/20
496/496 - 35s - 70ms/step - accuracy: 0.9399 - loss: 0.1339 - val_accuracy: 0.9299 - val_loss: 0.1659
Epoch 6/20
496/496 - 34s - 68ms/step - accuracy: 0.9404 - loss: 0.1272 - val_accuracy: 0.9349 - val_loss: 0.1633
Epoch 7/20
496/496 - 35s - 70ms/step - accuracy: 0.9462 - loss: 0.1164 - val_accuracy: 0.9341 - val_loss: 0.1573
Epoch 8/20
496/496 - 33s - 67ms/step - accuracy: 0.9519 - loss: 0.1087 - val_accuracy: 0.9293 - val_loss: 0.1800
Epoch 9/20
496/496 - 35s - 70ms/step - accuracy: 0.9570 - loss: 0.0981 - val_accuracy: 0.9301 - 

In [18]:
# Score the held-out test split: take the highest-scoring class per sample
class_scores = lstm_attention_model.predict(X_test_pad)
y_pred = np.argmax(class_scores, axis=1)

# Overall accuracy, support-weighted F1, and the per-class breakdown
report = classification_report(
    y_test,
    y_pred,
    target_names=['Hate Speech', 'Non-Hate Speech'],
    digits=4,
)
f1 = f1_score(y_test, y_pred, average='weighted')
accuracy = accuracy_score(y_test, y_pred)

# Print accuracy, F1 score, and classification report
print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")
print(f"Classification Report:\n{report}")

[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 25ms/step
Accuracy: 0.9269277351635042
F1 Score: 0.9262851762002624
Classification Report:
                 precision    recall  f1-score   support

    Hate Speech     0.9514    0.9603    0.9558      4080
Non-Hate Speech     0.8062    0.7712    0.7883       874

       accuracy                         0.9269      4954
      macro avg     0.8788    0.8657    0.8721      4954
   weighted avg     0.9258    0.9269    0.9263      4954



In [19]:
# Save the trained model to model_save_path and report where it was written.
# This file is separate from the checkpoint_filepath file written during training.
model_save_path = 'best_lstm_model.keras'
lstm_attention_model.save(model_save_path)
print(f"Model saved to {model_save_path}")

Model saved to best_lstm_model.keras
