In [30]:
import re
import pandas as pd
import contractions
import numpy as np
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords


# Load the labelled corpus; the preview below shows a 'Text' and a 'Label' column.
data = pd.read_csv("HateSpeechDetection.csv")
print(data.head(15))
print(data.tail(10))
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() would only append a stray "None" to the output.
data.info()


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\balui\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\balui\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\balui\AppData\Roaming\nltk_data...


                                                 Text  Label
0   Damn I thought they had strict gun laws in Ger...      0
1   I dont care about what it stands for or anythi...      0
2                   It's not a group it's an idea lol      0
3                           So it's not just America!      0
4   The dog is a spectacular dancer considering he...      0
5   If ppl dont wear masks you complain... If ppl ...      0
6             We should send them All back  to africa      1
7   Checking to see if it's whataboutism from the ...      0
8                      As a european, I approve this.      0
9   Idk which of these groups to join, they're all...      0
10  Sometimes the messenger is just as important a...      0
11                    This is the future we all want.      0
12     Wow the monkey calls someone else a monkey lol      1
13                   They are the gypsies of America.      1
14                                   I know that guy!      0
                        

[nltk_data]   Package omw-1.4 is already up-to-date!


In [31]:
def data_cleaning(data):
    """Normalise the 'Text' column of ``data`` and return the frame.

    Every entry of ``data['Text']`` goes through the following steps, in order:

    1. collapse runs of whitespace into a single space,
    2. drop ``@mentions``,
    3. drop the ``#`` character (the tag word itself is kept),
    4. expand contractions via ``contractions.fix``,
    5. lower-case,
    6. strip every character that is neither a word character nor whitespace,
    7. drop tokens starting at ``http`` / ``www``,
    8. drop words shorter than three characters unless they are numeric,
    9. lemmatise every token that is not an English stop word.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a 'Text' column of strings.

    Returns
    -------
    pandas.DataFrame
        The *same* object that was passed in; its 'Text' column is
        overwritten in place.
    """
    lemmatizer = WordNetLemmatizer()

    # Build the stop-word set once: stopwords.words() re-reads the corpus on
    # every call, and membership tests on a list are linear.
    try:
        stop_words = set(stopwords.words('english'))
    except LookupError:
        # The corpus has not been fetched on this machine yet.
        nltk.download('stopwords')
        stop_words = set(stopwords.words('english'))

    def remove_extra_spaces(text):
        # One or more whitespace characters (\s+) become a single space.
        return re.sub(r'\s+', ' ', text)

    def remove_username(text):
        # '@' followed by one or more non-whitespace characters (\S+).
        return re.sub(r"@\S+", "", text)

    def remove_hashtags(text):
        # Only the '#' character is removed; the tagged term stays.
        return re.sub(r'#', '', text)

    def text_lower(text):
        return text.lower()

    def remove_punctuation(text):
        # [^\w\s] matches any single character that is neither a word
        # character (\w) nor whitespace (\s).
        return re.sub(r'[^\w\s]', '', text)

    def remove_url(text):
        # Removes from 'http' / 'www' up to the next whitespace. This runs
        # after punctuation stripping, so a URL has already been squashed into
        # one token such as 'httpstcoabc' by the time it is matched.
        return re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    def remove_short_words(text):
        # Keep words of length >= 3, plus purely numeric words of any length.
        return ' '.join(word for word in text.split() if len(word) >= 3 or word.isnumeric())

    def lemmatizers(text):
        # Stop words are passed through untouched; everything else is lemmatised.
        return ' '.join(
            token if token in stop_words else lemmatizer.lemmatize(token)
            for token in nltk.word_tokenize(text)
        )

    steps = (
        remove_extra_spaces,
        remove_username,
        remove_hashtags,
        contractions.fix,
        text_lower,
        remove_punctuation,
        remove_url,
        remove_short_words,
        lemmatizers,
    )

    def clean_text(text):
        for step in steps:
            text = step(text)
        return text

    # Assign back into the caller's frame: later cells read `data` directly.
    data['Text'] = data['Text'].apply(clean_text)

    return data
cleaned_data=data_cleaning(data)

In [32]:
# Show the cleaned frame (data_cleaning returns the frame it was given, with 'Text' rewritten).
cleaned_data

Unnamed: 0,Text,Label
0,damn thought they had strict gun law germany,0
1,not care about what stand for anything its con...,0
2,not group idea lol,0
3,not just america,0
4,the dog spectacular dancer considering has two...,0
...,...,...
17591,find rat nicer and cleaner than most chinese,1
17592,check out this niggar they hit thing like wild...,1
17593,this country has become absolute shamble the a...,0
17594,aged 16 antisemitism bad aged 18 antisemitism ...,1


In [33]:
from nltk.tokenize import word_tokenize
def tokenization(data):
    """Add a 'Tokens' column with the word tokens of each 'Text' entry.

    The frame is modified in place and the same object is returned.
    """
    token_lists = data['Text'].map(word_tokenize)
    data['Tokens'] = token_lists
    return data
tokenized_data = tokenization(data)

In [34]:
# Show the frame with the new 'Tokens' column next to 'Text' and 'Label'.
tokenized_data

Unnamed: 0,Text,Label,Tokens
0,damn thought they had strict gun law germany,0,"[damn, thought, they, had, strict, gun, law, g..."
1,not care about what stand for anything its con...,0,"[not, care, about, what, stand, for, anything,..."
2,not group idea lol,0,"[not, group, idea, lol]"
3,not just america,0,"[not, just, america]"
4,the dog spectacular dancer considering has two...,0,"[the, dog, spectacular, dancer, considering, h..."
...,...,...,...
17591,find rat nicer and cleaner than most chinese,1,"[find, rat, nicer, and, cleaner, than, most, c..."
17592,check out this niggar they hit thing like wild...,1,"[check, out, this, niggar, they, hit, thing, l..."
17593,this country has become absolute shamble the a...,0,"[this, country, has, become, absolute, shamble..."
17594,aged 16 antisemitism bad aged 18 antisemitism ...,1,"[aged, 16, antisemitism, bad, aged, 18, antise..."


In [35]:
from gensim.models import Word2Vec

def word2vec_embedding_sg(texts, vector_size=200, window=6, min_count=1, workers=4):
    """Train a skip-gram Word2Vec model on ``texts`` and embed each document.

    A document vector is the mean of the vectors of its in-vocabulary tokens.
    A document with no in-vocabulary tokens (e.g. an empty token list) maps to
    a zero vector of the same size and dtype as the word vectors, so every row
    of the result has the same shape and the result dtype does not depend on
    whether such documents occur.

    Parameters
    ----------
    texts : iterable of list of str
        Tokenised documents; iterated once for training and once for embedding.
    vector_size, window, min_count, workers :
        Forwarded to ``gensim.models.Word2Vec``. The defaults are the values
        this notebook originally hard-coded.

    Returns
    -------
    numpy.ndarray
        One row per document, ``vector_size`` columns.
    """
    # NOTE(review): the call below trains on the full corpus before the
    # train/test split, so test documents influence the embeddings - confirm
    # this is intended.
    model = Word2Vec(texts, vector_size=vector_size, window=window,
                     min_count=min_count, workers=workers, sg=1)
    word_vectors = model.wv

    def get_word2vec_embeddings(text, word_vectors):
        embeddings = [word_vectors[word] for word in text if word in word_vectors]
        if embeddings:
            return np.mean(embeddings, axis=0)
        # Fallback follows the trained model instead of repeating a literal 200.
        return np.zeros(word_vectors.vector_size, dtype=word_vectors.vectors.dtype)

    embeddings = np.array([get_word2vec_embeddings(text, word_vectors) for text in texts])
    return embeddings

embeddings_w2v_sg = word2vec_embedding_sg(tokenized_data['Tokens'])


In [36]:
from sklearn.model_selection import train_test_split 
# Hold out 20% of the rows for testing; random_state fixes the shuffle.
# No `stratify` argument is passed, so class proportions in the two parts are
# left to the random draw.
X_train, X_test, y_train, y_test = train_test_split(embeddings_w2v_sg, data['Label'], test_size=0.2, random_state=42)

# Per-class counts of the training labels, to check how balanced they are.
print(y_train.value_counts())


0    8402
1    5674
Name: Label, dtype: int64


# Imbalanced data:
From the above output, we can observe that after splitting the dataset into training and test data, the 'Label' column of the training data is imbalanced: there are 8402 instances of label '0', whereas there are only 5674 instances of label '1'. This could significantly affect model training, because models trained on this dataset might be biased towards the majority class (non-hate speech) and may not perform as well in identifying hate speech instances.

The training data can be balanced using resampling techniques like undersampling and oversampling.

We can randomly delete rows from the majority class until it matches the size of the minority class; this is called undersampling.

Alternatively, when working with an imbalanced dataset, we can oversample the minority class by sampling from it with replacement. This technique for handling imbalanced data is called oversampling.



In [65]:
from sklearn.utils import resample
# Put the training embeddings in a frame whose index carries the class label,
# so the majority and minority classes can be selected through the index.
df_train = pd.DataFrame(X_train, index=y_train)
#print(df_train.info())
df_majority = df_train[df_train.index == 0]
df_minority = df_train[df_train.index == 1]
# Upsample the minority class: draw with replacement until it is as large as
# the majority class (size derived from the data rather than hard-coded).
df_minority_upsampled = resample(df_minority,
                                 replace=True,                 # sample with replacement
                                 n_samples=len(df_majority),   # match the majority class size
                                 random_state=42)

df_upsampled = pd.concat([df_minority_upsampled, df_majority])


However, random oversampling is generally not preferred, because it merely duplicates minority instances until their number equals that of the majority instances.

# SMOTE:
Synthetic Minority Oversampling Technique (SMOTE) is another technique to oversample the minority class. Simply adding duplicate records of the minority class often doesn't add any new information to the model. In SMOTE, new instances are synthesized from the existing data.

In [24]:
from imblearn.over_sampling import SMOTE

# Oversample the training split only (X_test / y_test are not passed in);
# random_state fixes the seed used by the sampler.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Compare class counts and array shapes before and after resampling.
print(y_train_resampled.value_counts())
print("Original dataset shape:", X_train.shape)
print("Resampled dataset shape:", X_train_resampled.shape)

1    8402
0    8402
Name: Label, dtype: int64
Original dataset shape: (14076, 200)
Resampled dataset shape: (16804, 200)
