# Hate Speech Identification

`Pipeline for developing hate speech identification task`

- Data Acquisition - https://www.kaggle.com/datasets/mrmorj/hate-speech-and-offensive-language-dataset
- Data Preprocessing (Text Preparation)
- Feature Representation
- Model Selection
- Model Training
- Testing the model
- Deployment

### Import necessary libraries and load the dataset

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import chardet
import seaborn as sns
import nltk
import regex as re 
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import remove_stopwords
from nltk.stem.wordnet import WordNetLemmatizer

In [2]:
data = pd.read_csv(r'labeled_data.csv')

### Data Exploration

In [3]:
data = data[['tweet','class']]

In [4]:
data.head()

Unnamed: 0,tweet,class
0,!!! RT @mayasolovely: As a woman you shouldn't...,2
1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,1
2,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,1
3,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,1
4,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,1


In [5]:
data['class'].value_counts() # Class Imbalance - This could potentially lead to a bias in your model towards predicting class 1, as it has significantly more instances than the other classes

class
1    19190
2     4163
0     1430
Name: count, dtype: int64

In [4]:
x = data['tweet']
y = data['class']

In [5]:
# train test split the data
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y) # stratify=y ensures that the class distribution is the same in both the training and testing sets

### Text Preprocessing



In [8]:
# nltk.download('wordnet')

In [6]:
lemma = WordNetLemmatizer()

In [7]:
def preprocess(text): # basic and simple preprocessing techniques
    text = re.sub('@[\w]+','',text)
    text = simple_preprocess(remove_stopwords(text))
    return [lemma.lemmatize(str(word)) for word in text if word != 'rt']

In [8]:
train_sentences = [preprocess(text) for text in x_train] 
test_sentences = [preprocess(text) for text in x_test] 

In [9]:
train_sentences[:2]

[['talking', 'angela', 'hoe'], ['lol', 'tricking', 'niccas', 'left', 'right']]

In [10]:
print(test_sentences[:2])

[['there', 'gift', 'shop', 'museum', 'and', 'charge', 'admission', 'it', 'mass', 'grave', 'fucking', 'cunt', 'what', 'disgrace'], ['need', 'red', 'green', 'amp', 'yellow', 'pant', 'amp', 'helly', 'shirt', 'express']]


In [None]:
'''
from nltk.tokenize import TweetTokenizer
tokenizer = TweetTokenizer()
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

data['tweet'] = data['tweet'].str.lower() # convert to lowercase
data['tweet'] = data['tweet'].apply(tokenizer.tokenize)  # tokenize
data['tweet'] = data['tweet'].apply(lambda x: [item for item in x if item not in stop_words]) # remove stopwords

# find where urls is present 
print(data['tweet'][data['tweet'].apply(lambda x: 'http' in x)][:2]) # printing first 2 instances
pattern = r'https?://\S+|www\.\S+'
data['tweet'] = data['tweet'].apply(lambda x: [re.sub(pattern, '', item) for item in x]) # remove URLs

# remove unwanted symbols and punctutations
unwanted_symbols = r'[^\w\s]'
data['tweet'] = data['tweet'].apply(lambda x: [re.sub(unwanted_symbols, '', item) for item in x])

data['tweet'] = data['tweet'].apply(lambda x: [item for item in x if item != '']) # remove empty strings 

data['tweet'] = data['tweet'].apply(lambda x: [item for item in x if item != 'rt']) # remove 'rt'

data['tweet'] = data['tweet'].apply(lambda x: [lemma.lemmatize(item) for item in x]) # lemmatize
 
x = data['tweet']
y = data['class']

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)

# convert into a list 
train_sentences = x_train.tolist()
test_sentences = x_test.tolist()
'''

```word_tokenize``` splits ```#dummysmiley as '#' and 'dummysmiley'```, while ```TweetTokenizer``` splits as ```'#dummysmiley'```. TweetTokenizer is built mainly for analyzing tweets.

### Word Embeddings


- Word2Vec
- FastText
- CNN
- RNN

In [15]:
from gensim.models.word2vec import Word2Vec 
from gensim.models import FastText

`Word2Vec`

In [16]:
cbow_train = Word2Vec(train_sentences, vector_size=100, window=5, min_count=1, workers=2, sg=0)
cbow_test = Word2Vec(test_sentences, vector_size=100, window=5, min_count=1, workers=2, sg=0)

In [17]:
train_vocab_cbow = cbow_train.wv.index_to_key 
test_vocab_cbow = cbow_test.wv.index_to_key 

In [18]:
def get_mean_vector(model, sentence, vocab):
    words = [word for word in sentence if word in vocab] 
    if len(words) >= 1: 
        return np.mean(model.wv[words], axis=0) 
    return np.zeros((100,)) 

In [19]:
cbow_array_train = []

for sentence in train_sentences:
    mean_vec = get_mean_vector(cbow_train, sentence, train_vocab_cbow)
    cbow_array_train.append(mean_vec)
    
cbow_array_train = np.array(cbow_array_train)

In [20]:
cbow_array_test = []

for sentence in test_sentences:
    mean_vec = get_mean_vector(cbow_test, sentence, test_vocab_cbow)
    cbow_array_test.append(mean_vec)
    
cbow_array_test = np.array(cbow_array_test)

`FastText`

In [21]:
fasttext_train = FastText(train_sentences, vector_size=100, window=5, min_count=1, workers=2, sg=0)
fasttext_test = FastText(test_sentences, vector_size=100, window=5, min_count=1, workers=2, sg=0)

In [22]:
train_vocab_ft = fasttext_train.wv.index_to_key 
test_vocab_ft = fasttext_test.wv.index_to_key 

In [23]:
fasttext_array_train = []

for sentence in train_sentences:
    mean_vec = get_mean_vector(fasttext_train, sentence, train_vocab_ft)
    fasttext_array_train.append(mean_vec)
    
fasttext_array_train = np.array(fasttext_array_train)

In [24]:
fasttext_array_test = []

for sentence in test_sentences:
    mean_vec = get_mean_vector(fasttext_test, sentence, test_vocab_ft)
    fasttext_array_test.append(mean_vec)
    
fasttext_array_test = np.array(fasttext_array_test)

### Model Building for Word2Vec and FastText

In [25]:
from sklearn.svm import SVC 
from sklearn.metrics import accuracy_score

`SVM`

In [26]:
cbow_svm = SVC()
cbow_svm.fit(cbow_array_train, y_train) 

In [27]:
cbow_svm_pred = cbow_svm.predict(cbow_array_test) 

In [28]:
accuracy_score(y_test, cbow_svm_pred)

0.7845471051038935

In [29]:
fasttext_svm = SVC()
fasttext_svm.fit(fasttext_array_train, y_train) 

In [30]:
fasttext_svm_pred = fasttext_svm.predict(fasttext_array_test) 

In [31]:
accuracy_score(y_test, fasttext_svm_pred)

0.7744603590881581

### CNN and RNN

In [11]:
import keras 
import tensorflow
from keras.models import Sequential 
from keras.layers import Dense, Flatten, Conv1D, MaxPooling1D, Embedding, Bidirectional, LSTM
from keras.preprocessing import sequence

In [13]:
tokenizer = tensorflow.keras.preprocessing.text.Tokenizer()

In [17]:
tokenizer.fit_on_texts(train_sentences) # fit the tokenizer on the training data
print(tokenizer.word_index) # print the word index



The tokenizer should be fit only on the training data to learn the word-to-integer mapping, and then used to transform both the training and test data to ensure consistency.

In [18]:
print(train_sentences[:2])

[['talking', 'angela', 'hoe'], ['lol', 'tricking', 'niccas', 'left', 'right']]


In [19]:
# Convert the text data to sequences of integers
train_seq = tokenizer.texts_to_sequences(train_sentences)
train_seq_pad = sequence.pad_sequences(train_seq, maxlen=100)

test_seq = tokenizer.texts_to_sequences(test_sentences)
test_seq_pad = sequence.pad_sequences(test_seq, maxlen=100)

In [21]:
# Check if there is a 1 in the train sequences
is_one_in_train = any(1 in seq for seq in train_seq_pad)

# Check if there is a 1 in the test sequences
is_one_in_test = any(1 in seq for seq in test_seq_pad)

print("Is there a 1 in the train sequences? ", is_one_in_train)
print("Is there a 1 in the test sequences? ", is_one_in_test)

Is there a 1 in the train sequences?  True
Is there a 1 in the test sequences?  True


In [20]:
train_seq[0], train_seq_pad[0], test_seq[0], test_seq_pad[0]

([105, 5362, 2],
 array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,  105, 5362,
           2]),
 [144, 1744, 2730, 75, 1177, 13, 2251, 4347, 30, 89, 60, 2643],
 array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,  

`CNN`

In [23]:
top_words = 10000
cnn = Sequential([
    Embedding(top_words,32),
    Conv1D(32,3,padding='same',activation='relu'),
    MaxPooling1D(), 
    Flatten(),
    Dense(250,activation='relu'),
    Dense(1,activation='sigmoid')    
])

In [24]:
cnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [25]:
cnn.fit(train_seq_pad, y_train, epochs=2, batch_size=64, verbose=2, validation_data=(test_seq_pad, y_test))

Epoch 1/2
310/310 - 7s - 22ms/step - accuracy: 0.7743 - loss: -6.4433e+05 - val_accuracy: 0.7743 - val_loss: -3.8321e+06
Epoch 2/2
310/310 - 5s - 15ms/step - accuracy: 0.7743 - loss: -2.7732e+07 - val_accuracy: 0.7743 - val_loss: -7.6033e+07


<keras.src.callbacks.history.History at 0x20d080e9ad0>

In [26]:
cnn.evaluate(test_seq_pad, y_test)[1]

[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.7710 - loss: -81095352.0000


0.7742586135864258

`RNN`

In [27]:
rnn = Sequential([
    Embedding(top_words,64),
    Bidirectional(LSTM(64)),
    Dense(1,activation='sigmoid')
])

In [28]:
rnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [29]:
rnn.fit(train_seq_pad, y_train, epochs=2, batch_size=64, verbose=2, validation_data=(test_seq_pad, y_test))

Epoch 1/2


310/310 - 42s - 134ms/step - accuracy: 0.7721 - loss: -3.3364e+00 - val_accuracy: 0.7743 - val_loss: -5.5780e+00
Epoch 2/2
310/310 - 50s - 161ms/step - accuracy: 0.7743 - loss: -7.5773e+00 - val_accuracy: 0.7743 - val_loss: -9.5782e+00


<keras.src.callbacks.history.History at 0x20d7ea55350>

In [30]:
rnn.evaluate(test_seq_pad,y_test)[1]

[1m  1/155[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m12s[0m 78ms/step - accuracy: 0.8125 - loss: -10.8623

[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 21ms/step - accuracy: 0.7710 - loss: -10.1965


0.7742586135864258