# Imports and Loading SpaCy Model:

In [1]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob
from spellchecker import SpellChecker
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
import spacy
import numpy as np

# Load Data:

In [2]:
data = pd.read_csv('hatspeech dataset.csv', encoding='ISO-8859-1')  
data

Unnamed: 0.1,Unnamed: 0,comment,label
0,1,dalits are lowlives,N
1,2,gay people are a burden to our society,N
2,3,Arabs are not welcome,N
3,4,I'm not saying we should actually eliminate he...,N
4,5,bananas are for black people,N
...,...,...,...
41139,117100,Indian culture is so backwards,N
41140,118100,"Women don't want equality, they just want to b...",N
41141,119100,fuck covid,P
41142,1205,This computer is garbage,P


# Data Exploration:

In [3]:
print(data.head())  # Check the first few rows of the dataset

   Unnamed: 0                                            comment label
0           1                                dalits are lowlives     N
1           2             gay people are a burden to our society     N
2           3                              Arabs are not welcome     N
3           4  I'm not saying we should actually eliminate he...     N
4           5                       bananas are for black people     N


In [4]:
print(data.info())  # Get information about the dataset, like column names and data types

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41144 entries, 0 to 41143
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  41144 non-null  int64 
 1   comment     41144 non-null  object
 2   label       41144 non-null  object
dtypes: int64(1), object(2)
memory usage: 964.4+ KB
None


In [5]:
print(data['label'].value_counts())  # Check the distribution of labels

label
N    22158
P    18950
O       36
Name: count, dtype: int64


In [6]:
data = data[data['label'] != 'O'].copy()

In [7]:
print(data['label'].value_counts())

label
N    22158
P    18950
Name: count, dtype: int64


In [8]:
data.isnull().any()

Unnamed: 0    False
comment       False
label         False
dtype: bool

In [9]:
#Find the number of duplicate rows
num_duplicates = data.duplicated().sum()
print(f'Number of duplicate rows: {num_duplicates}')

Number of duplicate rows: 0


# Text Cleaning:

In [10]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\maddi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\maddi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\maddi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [11]:
chatword_dictionary = {
    'u': 'you',
    'ur': 'your',
    'r': 'are',
    'y': 'why',
    'b4': 'before',
    'gr8': 'great',
    'l8r': 'later',
    'w8': 'wait',
    'bff': 'best friend forever',
    'brb': 'be right back',
    'btw': 'by the way',
    'cuz': 'because',
    'idk': 'i do not know',
    'ikr': 'i know right',
    'imo': 'in my opinion',
    'lmao': 'laughing my ass off',
    'lol': 'laugh out loud',
    'omg': 'oh my god',
    'omw': 'on my way',
    'pls': 'please',
    'thx': 'thanks',
    'ttyl': 'talk to you later',
    'wth': 'what the hell',
    'wyd': 'what you doing',
    'smh': 'shaking my head'
}

In [12]:
slang_dict = {
    "kinda": "kind of",
    "wanna": "want to",
    "gonna": "going to",
    "gotta": "got to",
    "ain't": "is not",
    "y'all": "you all",
    "can't": "cannot",
    "won't": "will not",
    "don't": "do not",
    "idk": "I do not know",
    "tbh": "to be honest",
    "brb": "be right back",
    "bbl": "be back later",
    "btw": "by the way",
    "omg": "oh my god",
    "lol": "laughing out loud",
    "i'm": "i am",
    "you're": "you are",
    "he's": "he is",
    "she's": "she is",
    "it's": "it is",
    "we're": "we are",
    "they're": "they are",
    "i'll": "i will",
    "you'll": "you will",
    "he'll": "he will",
    "she'll": "she will",
    "we'll": "we will",
    "they'll": "they will",
    "i've": "i have",
    "you've": "you have",
    "we've": "we have",
    "they've": "they have",
    "isn't": "is not",
    "aren't": "are not",
    "wasn't": "was not",
    "weren't": "were not",
    "haven't": "have not",
    "hasn't": "has not",
    "hadn't": "had not",
    "doesn't": "does not",
    "didn't": "did not",
    "won't": "will not",
    "wouldn't": "would not",
    "don't": "do not",
    "can't": "cannot",
    "couldn't": "could not",
    "shouldn't": "should not",
    "mightn't": "might not",
    "mustn't": "must not"
}

In [13]:
def clean_text(text):
  text = text.lower()  # Convert text to lowercase
  text = re.sub(r'\d+', '', text)  # Remove digits
  for chatword, full_form in chatword_dictionary.items():
      text = re.sub(rf"\b{chatword}\b", full_form, text)
  for slang, expanded in slang_dict.items():
      text = re.sub(r'\b' + re.escape(slang) + r'\b', expanded, text)
  text = re.sub(r'\W', ' ', text)  # Remove non-word characters
  text = re.sub(r'\s+', ' ', text)  # Remove extra whitespace
  text = re.sub(r'(.)\1{2,}', r'\1', text)  # Replace sequences of the same character > 2 with 1 occurrence
  text = re.sub(r'(\w)[^\w\s](\w)', r'\1\2', text)  # Remove special characters within words like 'b@ckward' to 'backward'
  return text

In [14]:
data['clean_text'] = data['comment'].apply(clean_text)

In [15]:
print(data['clean_text'])

0                                      dalits are lowlives
1                   gay people are a burden to our society
2                                    arabs are not welcome
3        i am not saying we should actually eliminate h...
4                             bananas are for black people
                               ...                        
41139                       indian culture is so backwards
41140    women do not want equality they just want to b...
41141                                           fuck covid
41142                             this computer is garbage
41143                   the only good muslim is a dead one
Name: clean_text, Length: 41108, dtype: object


# Tokenization

In [16]:
from nltk.tokenize import word_tokenize

data['tokens'] = data['clean_text'].apply(word_tokenize)
data['tokens']

0                                  [dalits, are, lowlives]
1          [gay, people, are, a, burden, to, our, society]
2                               [arabs, are, not, welcome]
3        [i, am, not, saying, we, should, actually, eli...
4                       [bananas, are, for, black, people]
                               ...                        
41139                 [indian, culture, is, so, backwards]
41140    [women, do, not, want, equality, they, just, w...
41141                                        [fuck, covid]
41142                        [this, computer, is, garbage]
41143          [the, only, good, muslim, is, a, dead, one]
Name: tokens, Length: 41108, dtype: object

# Stopwords Removal

In [17]:
stop_words = set(stopwords.words('english'))
# Function to remove stop words
def remove_stop_words(tokens):
    return [word for word in tokens if word.lower() not in stop_words]
data['remove_stopwords'] = data['tokens'].apply(remove_stop_words)
data['remove_stopwords']

0                                       [dalits, lowlives]
1                           [gay, people, burden, society]
2                                         [arabs, welcome]
3        [saying, actually, eliminate, heebs, wish, nat...
4                                 [bananas, black, people]
                               ...                        
41139                         [indian, culture, backwards]
41140                [women, want, equality, want, charge]
41141                                        [fuck, covid]
41142                                  [computer, garbage]
41143                            [good, muslim, dead, one]
Name: remove_stopwords, Length: 41108, dtype: object

# stemming

In [18]:
#stemming
ps= PorterStemmer()
# Function to perform stemming
def perform_stemming(tokens):
    return [ps.stem(word) for word in tokens]
data['stemmed'] = data['remove_stopwords'].apply(perform_stemming)
data['stemmed']

0                                          [dalit, lowliv]
1                            [gay, peopl, burden, societi]
2                                           [arab, welcom]
3        [say, actual, elimin, heeb, wish, natur, becam...
4                                   [banana, black, peopl]
                               ...                        
41139                           [indian, cultur, backward]
41140                    [women, want, equal, want, charg]
41141                                        [fuck, covid]
41142                                     [comput, garbag]
41143                            [good, muslim, dead, one]
Name: stemmed, Length: 41108, dtype: object

# Lemmatization:

In [19]:
lemmatizer = WordNetLemmatizer()
# Function to perform lemmatization
def perform_lemmatization(tokens):
    return [lemmatizer.lemmatize(word) for word in tokens]
data['lemmatized'] = data['remove_stopwords'].apply(perform_lemmatization)
data['lemmatized']

0                                       [dalits, lowlives]
1                           [gay, people, burden, society]
2                                          [arab, welcome]
3        [saying, actually, eliminate, heebs, wish, nat...
4                                  [banana, black, people]
                               ...                        
41139                         [indian, culture, backwards]
41140                [woman, want, equality, want, charge]
41141                                        [fuck, covid]
41142                                  [computer, garbage]
41143                            [good, muslim, dead, one]
Name: lemmatized, Length: 41108, dtype: object

# Load spaCy model

In [30]:
# Load spaCy model
nlp = spacy.load('en_core_web_md')

# Function to get word2vec embeddings using SpaCy

In [33]:
# Function to get word2vec embeddings using SpaCy
def get_word2vec_embeddings(text_series):
    embeddings = []
    for doc in nlp.pipe(text_series, disable=["parser", "ner"]):
        if doc.has_vector:
            embeddings.append(doc.vector)
        else:
            embeddings.append(None)  # In case the doc has no vectors
    return embeddings

# Apply get_word2vec_embeddings to lemmatized text

In [34]:
# Apply get_word2vec_embeddings to lemmatized text
data['word2vec'] = get_word2vec_embeddings(data['lemmatized'].apply(lambda x: ' '.join(x)))

In [35]:
data['word2vec']

0        [-1.906, 1.2487, 1.9823, 1.9465, 1.79555, 1.50...
1        [0.19075249, 2.0418324, -3.4573777, 0.86299753...
2        [1.9986349, -5.041985, -0.76663, -1.51065, 1.4...
3        [-1.7351625, -0.36887124, -0.23621875, -1.3883...
4        [-2.5482068, 0.2703433, -3.9575834, 2.7313, 1....
                               ...                        
41139    [-0.96657497, -0.061333258, -0.79939, -0.07666...
41140    [0.40465587, 2.970246, -3.5896618, -1.3439579,...
41141    [3.7351, 1.0318251, -0.8066, 2.11059, -1.93250...
41142    [0.08542502, -0.652035, 1.139975, 0.37980497, ...
41143    [-2.26612, 1.6801, -1.407895, -0.5243, 3.7953,...
Name: word2vec, Length: 41108, dtype: object

# Balance the data

In [36]:
# Balance the data
X = np.vstack(data['word2vec'].dropna().values)
y = data.loc[data['word2vec'].notna(), 'label']

# Using RandomUnderSampler

In [37]:
# Using RandomUnderSampler
rus = RandomUnderSampler(random_state=42)
X_res, y_res = rus.fit_resample(X, y)

In [None]:
# Convert back to DataFrame

In [38]:
# Convert back to DataFrame
balanced_data = pd.DataFrame(X_res)
balanced_data['label'] = y_res

In [39]:
print("Balanced dataset label distribution:")
print(balanced_data['label'].value_counts())

Balanced dataset label distribution:
label
N    18943
P    18943
Name: count, dtype: int64


In [40]:
# Save balanced data to a new CSV file
balanced_data.to_csv('balanced_hatespeech_dataset.csv', index=False)