In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential

# Load the dataset
data = pd.read_csv('HateSpeechDetection.csv')

# Preprocess the data
texts = data['Text'].values
labels = data['Label'].values

# Encode labels
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels)

# Tokenize the text
tokenizer = Tokenizer(num_words=15000)
tokenizer.fit_on_texts(texts)
X = tokenizer.texts_to_sequences(texts)
X = pad_sequences(X, maxlen=200)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)


In [6]:
# Load GloVe embeddings
def glove_embeddings(filepath, word_index, embedding_dim):
    embeddings_index = {}
    with open(filepath, 'r', encoding='utf8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs

    embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

embedding_dim = 200
glove_filepath = 'glove.6B.200d.txt' 
embedding_matrix = glove_embeddings(glove_filepath, tokenizer.word_index, embedding_dim)


In [7]:
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import LSTM
from tensorflow.keras.callbacks import EarlyStopping

# Build the model
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=embedding_dim, weights=[embedding_matrix], input_length=200))
model.add(Bidirectional(LSTM(128, return_sequences=True)))
model.add(Dropout(0.3))
model.add(Dense(256,activation='relu'))
model.add(LSTM(512, return_sequences=True))
model.add(Dropout(0.4))
model.add(Dense(256,activation='sigmoid'))
model.add(Bidirectional(LSTM(64)))
model.add(Dropout(0.2))
model.add(Dense(32,activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

early_stopping = EarlyStopping(monitor='loss', patience=3, restore_best_weights=True)
# Train the model
model.fit(X_train, y_train, epochs=15, batch_size=64,callbacks=[early_stopping])




Epoch 1/15
[1m220/220[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m92s[0m 365ms/step - accuracy: 0.5740 - loss: 0.6981
Epoch 2/15
[1m220/220[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m85s[0m 387ms/step - accuracy: 0.5901 - loss: 0.6786
Epoch 3/15
[1m220/220[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 376ms/step - accuracy: 0.6049 - loss: 0.6666
Epoch 4/15
[1m220/220[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 372ms/step - accuracy: 0.7347 - loss: 0.5231
Epoch 5/15
[1m220/220[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 376ms/step - accuracy: 0.8020 - loss: 0.4086
Epoch 6/15
[1m220/220[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m87s[0m 396ms/step - accuracy: 0.8465 - loss: 0.3403
Epoch 7/15
[1m220/220[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m88s[0m 400ms/step - accuracy: 0.8812 - loss: 0.2682
Epoch 8/15
[1m220/220[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m87s[0m 392ms/step - accuracy: 0.8954 - loss: 0.2430
Epoch 9/15
[1m

<keras.src.callbacks.history.History at 0x1b092781d50>

In [15]:
from sklearn.metrics import classification_report
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype("int32")
y_pred = y_pred.flatten()
print(classification_report(y_test,y_pred))


[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 92ms/step
              precision    recall  f1-score   support

           0       0.96      0.87      0.79      2094
           1       0.85      0.81      0.84      1426

    accuracy                           0.91      3520
   macro avg       0.93      0.82      0.82      3520
weighted avg       0.88      0.86      0.78      3520


In [17]:
from sklearn.metrics import roc_auc_score

roc_auc=roc_auc_score(y_test,y_pred)
print('ROC-AUC SCORE:',roc_auc)

ROC-AUC SCORE: 0.8528366628221151
