# Data Preprocessing

In [22]:
import re
import pandas as pd

data = pd.read_csv("HateSpeechDetection.csv")
data

Unnamed: 0,Text,Label
0,Damn I thought they had strict gun laws in Ger...,0
1,I dont care about what it stands for or anythi...,0
2,It's not a group it's an idea lol,0
3,So it's not just America!,0
4,The dog is a spectacular dancer considering he...,0
...,...,...
17591,I find rats nicer and cleaner than most Chinese,1
17592,"Check out this niggar, they hit things like wi...",1
17593,"this country has become an absolute shambles, ...",0
17594,Me aged 16 = anti-Semitism is bad Me aged 18 =...,1


In [24]:
import contractions
from nltk.stem import WordNetLemmatizer
def data_cleaning(text):
  #Removing Extra Spaces:
  text = re.sub(r'\s+', ' ', text)
  
  #Remove usernames:
  text = re.sub(r"@\S+", "",text)

  #Remove Hashtags:
  text = re.sub(r'#', '', text)

  #Handling Contractions:
  text=contractions.fix(text)

  #Lowercasing:
  text = text.lower()

  #Removing Punctuation:
  text = re.sub(r'[^\w\s]', '', text)

  #Remove URLs:
  text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

  #Removing Short words
  text = ' '.join([word for word in text.split() if len(word) > 2 or word.isnumeric()])

  #Lemmatization
  lemmatizer = WordNetLemmatizer()
  text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])

  return text

import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
data['Text']=data['Text'].apply(data_cleaning)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\balus\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\balus\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\balus\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


# Train - Test - Validation Data Splitting:

In [25]:
from sklearn.model_selection import train_test_split
texts = data['Text'].values
labels = data['Label'].values

X_train, X_temp, y_train, y_temp = train_test_split(texts, labels, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.66666, random_state=42)
X_train.shape, X_test.shape, X_val.shape

((12317,), (3520,), (1759,))

# Encoding Target Label using LabelEncoder:

In [26]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_val = label_encoder.transform(y_val)
y_test = label_encoder.transform(y_test)

y_train, y_val, y_test

(array([0, 0, 0, ..., 1, 1, 0], dtype=int64),
 array([0, 0, 1, ..., 0, 0, 0], dtype=int64),
 array([1, 0, 1, ..., 0, 0, 1], dtype=int64))

# Tokenizing and Padding:

In [27]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
tokenizer = Tokenizer(num_words=15000)
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_sequences(X_train)
X_val = tokenizer.texts_to_sequences(X_val)
X_test = tokenizer.texts_to_sequences(X_test)

X_train = pad_sequences(X_train, maxlen=200)
X_val = pad_sequences(X_val, maxlen=200)
X_test = pad_sequences(X_test, maxlen=200)

# Embedding using GloVe Embeddings:
GloVe (Global Vectors for Word Representation) is an unsupervised learning algorithm for obtaining vector representations for words. It was developed by researchers at Stanford University and is designed to capture the semantic relationships between words based on their co-occurrence in large text corpora.

Key Features of GloVe:
1. Co-occurrence Matrix: GloVe constructs a large matrix where each element represents how often a word pair co-occurs in the text corpus.
2. Word Vectors: The co-occurrence matrix is factorized to produce a lower-dimensional representation of words, typically using methods like matrix factorization or stochastic gradient descent.
3. Semantic Relationships: The resulting word vectors capture various linguistic regularities and patterns, such as analogies (e.g., king - man + woman ≈ queen).

# glove.6B.200d.txt:
GloVe.6B.200d.txt is one of the pre-trained GloVe models, where:

6B: The model was trained on a corpus of 6 billion tokens (words).

200d: Each word is represented by a 200-dimensional vector.

Details of GloVe.6B.200d.txt:

Corpus: Common Crawl (a dataset containing 6 billion tokens).

Vocabulary Size: 400,000 unique words.

In [28]:
# Load GloVe embeddings
def glove_embeddings(filepath, word_index, embedding_dim):
    embeddings_index = {}
    with open(filepath, 'r', encoding='utf8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs

    embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

embedding_dim = 200
glove_filepath = 'glove.6B.200d.txt' 
embedding_matrix = glove_embeddings(glove_filepath, tokenizer.word_index, embedding_dim)


# Bidirectional-LSTM model:    

In [30]:
from tensorflow.keras.layers import Bidirectional, Embedding, Dropout, Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.metrics import AUC


# Build the model
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=embedding_dim, weights=[embedding_matrix], input_length=200))
model.add(Bidirectional(LSTM(256, return_sequences=True)))
model.add(Dropout(0.4))
model.add(Bidirectional(LSTM(128)))
model.add(Dropout(0.3))
model.add(Dense(128,activation='relu'))
model.add(Dense(32,activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[AUC(name='roc_auc')])

early_stopping = EarlyStopping(monitor='val_roc_auc', patience=3, restore_best_weights=True)
# Train the model
history = model.fit(X_train, y_train, epochs=15, batch_size=64,callbacks=[early_stopping], validation_data=(X_val, y_val))


Epoch 1/15
[1m193/193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6196s[0m 32s/step - loss: 0.6421 - roc_auc: 0.6208 - val_loss: 0.5701 - val_roc_auc: 0.7637
Epoch 2/15
[1m193/193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m156s[0m 805ms/step - loss: 0.5061 - roc_auc: 0.8178 - val_loss: 0.5438 - val_roc_auc: 0.7872
Epoch 3/15
[1m193/193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m163s[0m 842ms/step - loss: 0.4031 - roc_auc: 0.8921 - val_loss: 0.5851 - val_roc_auc: 0.7946
Epoch 4/15
[1m193/193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m173s[0m 897ms/step - loss: 0.3026 - roc_auc: 0.9415 - val_loss: 0.6384 - val_roc_auc: 0.7693
Epoch 5/15
[1m193/193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m167s[0m 865ms/step - loss: 0.1974 - roc_auc: 0.9754 - val_loss: 0.8184 - val_roc_auc: 0.7598
Epoch 6/15
[1m193/193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m171s[0m 886ms/step - loss: 0.1384 - roc_auc: 0.9875 - val_loss: 1.0634 - val_roc_auc: 0.7558


# Classification Report:

In [36]:
from sklearn.metrics import classification_report
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype("int32")
y_pred = y_pred.flatten()
print(classification_report(y_test,y_pred))


[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 190ms/step
              precision    recall  f1-score   support

           0       0.87      0.75      0.76      2093
           1       0.79      0.77      0.80      1427

    accuracy                           0.85      3520
   macro avg       0.84      0.76      0.75      3520
weighted avg       0.86      0.78      0.77      3520


# ROC-AUC Score:

In [37]:
from sklearn.metrics import roc_auc_score

roc_auc=roc_auc_score(y_test,y_pred)
print('ROC-AUC SCORE:',roc_auc)

ROC-AUC SCORE: 0.8265365915885402


# Saving the model 

In [41]:
model.export('best_model.pth')

INFO:tensorflow:Assets written to: best_model.pth\assets


INFO:tensorflow:Assets written to: best_model.pth\assets


Saved artifact at 'best_model.pth'. The following endpoints are available:

* Endpoint 'serve'
  args_0 (POSITIONAL_ONLY): TensorSpec(shape=(None, 200), dtype=tf.float32, name='keras_tensor_46')
Output Type:
  TensorSpec(shape=(None, 1), dtype=tf.float32, name=None)
Captures:
  2683409972768: TensorSpec(shape=(), dtype=tf.resource, name=None)
  2683341660128: TensorSpec(shape=(), dtype=tf.resource, name=None)
  2683341660304: TensorSpec(shape=(), dtype=tf.resource, name=None)
  2683341660480: TensorSpec(shape=(), dtype=tf.resource, name=None)
  2683341661184: TensorSpec(shape=(), dtype=tf.resource, name=None)
  2683341661360: TensorSpec(shape=(), dtype=tf.resource, name=None)
  2683341661536: TensorSpec(shape=(), dtype=tf.resource, name=None)
  2683341661712: TensorSpec(shape=(), dtype=tf.resource, name=None)
  2683341661888: TensorSpec(shape=(), dtype=tf.resource, name=None)
  2683341662064: TensorSpec(shape=(), dtype=tf.resource, name=None)
  2683341662768: TensorSpec(shape=(), dtype