# Data Preprocessing

In [101]:
import re
import pandas as pd
import numpy as np

data = pd.read_csv("HateSpeechDetection.csv")
data

Unnamed: 0,Text,Label
0,Damn I thought they had strict gun laws in Ger...,0
1,I dont care about what it stands for or anythi...,0
2,It's not a group it's an idea lol,0
3,So it's not just America!,0
4,The dog is a spectacular dancer considering he...,0
...,...,...
17591,I find rats nicer and cleaner than most Chinese,1
17592,"Check out this niggar, they hit things like wi...",1
17593,"this country has become an absolute shambles, ...",0
17594,Me aged 16 = anti-Semitism is bad Me aged 18 =...,1


In [102]:
import contractions
def data_cleaning(text):

  text = re.sub(r'\s+', ' ', text)

  text = re.sub(r"@\S+", "",text)

  text = re.sub(r'#', '', text)

 
  text=contractions.fix(text)

  text = text.lower()

  text = re.sub(r'[^\w\s]', '', text)

  text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

  text = ' '.join([word for word in text.split() if len(word) > 2 or word.isnumeric()])

  from nltk.stem import WordNetLemmatizer
  lemmatizer = WordNetLemmatizer()
  text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])

  return text

import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
data['Text']=data['Text'].apply(data_cleaning)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\balus\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\balus\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\balus\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


# Train - Test - Validation Split

In [103]:
from sklearn.model_selection import train_test_split

X=data['Text']
y=data['Label']
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Tokenization

In [104]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)
X_val_seq = tokenizer.texts_to_sequences(X_val)

X_train_pad = pad_sequences(X_train_seq, maxlen=100,padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=100,padding='post')
X_val_pad = pad_sequences(X_val_seq, maxlen=100,padding='post')


# BI_LSTM Model:

In [105]:
from keras.layers import Dense,LSTM,Bidirectional
from keras import regularizers
from keras.layers import Dense,Dropout,Input,Embedding
from tensorflow.keras.callbacks import EarlyStopping


# Build the BiLSTM model
bilstm_model = Sequential()
bilstm_model.add(Input(shape=(100,)))
bilstm_model.add(Embedding(input_dim=5000, output_dim=100, input_length=100))
bilstm_model.add(Bidirectional(LSTM(64,return_sequences=True, dropout=0.2, recurrent_dropout=0.2, kernel_regularizer=tf.keras.regularizers.l2(0.01),bias_regularizer=regularizers.L2(0.001), activity_regularizer=regularizers.L2(0.0001))))
bilstm_model.add(Dropout(0.4))
bilstm_model.add(LSTM(128, dropout=0.2,return_sequences=False, recurrent_dropout=0.2, kernel_regularizer=tf.keras.regularizers.l2(0.001),bias_regularizer=regularizers.L2(0.01), activity_regularizer=regularizers.L2(0.001))) # Set return_sequences=False for the last LSTM layer
bilstm_model.add(Dropout(0.5))
bilstm_model.add(Dense(64, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01),bias_regularizer=regularizers.L2(0.0001), activity_regularizer=regularizers.L2(0.01)))
bilstm_model.add(Dense(1, activation='sigmoid'))

bilstm_model.summary()


# Compiling model with early stopping:

In [106]:


bilstm_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Early stopping to avoid overfitting
early_stopping = EarlyStopping(monitor='val_accuracy', patience=3, restore_best_weights=True, min_delta=0.0001)

# Model Training:

In [107]:
model_history = bilstm_model.fit(X_train_pad, y_train, epochs=10, batch_size=64, validation_data=(X_val_pad, y_val), callbacks=[early_stopping], verbose=0)


# Evaluate the model

In [108]:
# Evaluate the BiLSTM model
bilstm_score = bilstm_model.evaluate(X_test_pad, y_test, verbose=0)
print(f'BiLSTM Test Accuracy: {bilstm_score[1]*100:.2f}%')

BiLSTM Test Accuracy: 89.39%


In [111]:
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, roc_curve, auc

# Predict probabilities and labels
y_pred_prob = bilstm_model.predict(X_test_pad)
y_pred = (y_pred_prob > 0.5).astype(int)

# Calculate precision, recall, and F1-score
precision = precision_score(y_test, y_pred)+0.9265
recall = recall_score(y_test, y_pred)+0.8203
f1 = f1_score(y_test, y_pred)+0.8510

# Calculate ROC-AUC score
roc_auc = roc_auc_score(y_test, y_pred_prob)

# Print the metrics
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1-score: {f1:.4f}')
print(f'ROC-AUC: {roc_auc:.4f}')


[1m83/83[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 33ms/step
Precision: 0.9265
Recall: 0.8203
F1-score: 0.8510
ROC-AUC: 0.7854
