# Importing Necessary Libraries:

In [1]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob
from spellchecker import SpellChecker
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
import numpy as np
import nltk

# Load the dataset

In [2]:
# Load the dataset
# The CSV is decoded as ISO-8859-1 (Latin-1) rather than pandas' default UTF-8.
data = pd.read_csv(
    'hatspeech dataset.csv',
    encoding='ISO-8859-1',
)

# Basic data exploration

In [3]:
# Basic data exploration
# Show the first five rows to see the available columns and a sample of comments.
print(data.head(n=5))

   Unnamed: 0                                            comment label
0           1                                dalits are lowlives     N
1           2             gay people are a burden to our society     N
2           3                              Arabs are not welcome     N
3           4  I'm not saying we should actually eliminate he...     N
4           5                       bananas are for black people     N


In [4]:
# DataFrame.info() writes its summary to stdout itself and returns None, so it is
# called directly; wrapping it in print() only appended a stray "None" line.
data.info()  # Column names, non-null counts and dtypes

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41144 entries, 0 to 41143
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  41144 non-null  int64 
 1   comment     41144 non-null  object
 2   label       41144 non-null  object
dtypes: int64(1), object(2)
memory usage: 964.4+ KB
None


In [5]:
# Class balance: number of comments per label, most frequent first.
print(data.loc[:, 'label'].value_counts())

label
N    22158
P    18950
O       36
Name: count, dtype: int64


# Filter out rows with label 'O'

In [6]:
# Filter out rows with label 'O'
# Only the 'N' and 'P' classes are kept. .copy() detaches the result from the
# original frame so that columns can be added to it later without chained-assignment warnings.
data = data.loc[data['label'].ne('O')].copy()
print(data.loc[:, 'label'].value_counts())

label
N    22158
P    18950
Name: count, dtype: int64


# Check for missing values

In [7]:
# Check for missing values
# One boolean per column: True if that column holds at least one missing entry.
print(data.isna().any(axis=0))

Unnamed: 0    False
comment       False
label         False
dtype: bool


# Check for duplicates

In [8]:
# Check for duplicates
# Compare only the content columns. The 'Unnamed: 0' column looks like a running
# row id (1, 2, 3, ...), so comparing whole rows could never flag a repeated
# comment -- the count would be 0 regardless of the data.
num_duplicates = data.duplicated(subset=['comment', 'label']).sum()
print(f'Number of duplicate rows: {num_duplicates}')

Number of duplicate rows: 0


# Download necessary NLTK data

In [9]:
# Download necessary NLTK data
# - 'punkt' / 'punkt_tab': sentence-tokenizer models used by word_tokenize
#   (recent NLTK releases look up 'punkt_tab' instead of 'punkt', so both are fetched)
# - 'stopwords': word lists behind stopwords.words('english')
# - 'wordnet': lexical database behind WordNetLemmatizer
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\maddi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\maddi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\maddi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Define chatword and slang dictionaries

In [10]:
# Define chatword and slang dictionaries
# Chat abbreviations mapped to their spelled-out form. Keys are lowercase because
# clean_text lowercases the text before looking them up.
chatword_dictionary = {
    # single letters / short pronoun forms
    "u": "you",
    "ur": "your",
    "r": "are",
    "y": "why",
    # letter + digit spellings
    "b4": "before",
    "gr8": "great",
    "l8r": "later",
    "w8": "wait",
    # acronyms and clipped words
    "bff": "best friend forever",
    "brb": "be right back",
    "btw": "by the way",
    "cuz": "because",
    "idk": "i do not know",
    "ikr": "i know right",
    "imo": "in my opinion",
    "lmao": "laughing my ass off",
    "lol": "laugh out loud",
    "omg": "oh my god",
    "omw": "on my way",
    "pls": "please",
    "thx": "thanks",
    "ttyl": "talk to you later",
    "wth": "what the hell",
    "wyd": "what you doing",
    "smh": "shaking my head",
}

In [11]:
# Informal spellings and English contractions mapped to their expanded form.
# Every key appears exactly once (the original listed "can't", "won't" and
# "don't" twice) and every expansion is lowercase, so applying the table to
# already-lowercased text cannot re-introduce capitals.
# "idk", "brb", "btw", "omg" and "lol" also exist in chatword_dictionary; they
# are kept here so the table stays usable on its own.
slang_dict = {
    # informal spellings and acronyms
    "kinda": "kind of",
    "wanna": "want to",
    "gonna": "going to",
    "gotta": "got to",
    "ain't": "is not",
    "y'all": "you all",
    "can't": "cannot",
    "won't": "will not",
    "don't": "do not",
    "idk": "i do not know",
    "tbh": "to be honest",
    "brb": "be right back",
    "bbl": "be back later",
    "btw": "by the way",
    "omg": "oh my god",
    "lol": "laughing out loud",
    # pronoun + "be"
    "i'm": "i am",
    "you're": "you are",
    "he's": "he is",
    "she's": "she is",
    "it's": "it is",
    "we're": "we are",
    "they're": "they are",
    # pronoun + "will"
    "i'll": "i will",
    "you'll": "you will",
    "he'll": "he will",
    "she'll": "she will",
    "we'll": "we will",
    "they'll": "they will",
    # pronoun + "have"
    "i've": "i have",
    "you've": "you have",
    "we've": "we have",
    "they've": "they have",
    # negations
    "isn't": "is not",
    "aren't": "are not",
    "wasn't": "was not",
    "weren't": "were not",
    "haven't": "have not",
    "hasn't": "has not",
    "hadn't": "had not",
    "doesn't": "does not",
    "didn't": "did not",
    "wouldn't": "would not",
    "couldn't": "could not",
    "shouldn't": "should not",
    "mightn't": "might not",
    "mustn't": "must not",
}

# Define text cleaning function

In [12]:
# Define text cleaning function
def clean_text(text, chatwords=None, slang=None):
    """Normalise one raw comment into lowercase, punctuation-free text.

    Steps, in order:

    1. lowercase the text;
    2. expand chat words and slang/contractions in one pass;
    3. remove digits, then run the expansion once more;
    4. delete masking symbols (``@ # $ *``) that sit between two word characters;
    5. replace every remaining non-word character with a space and collapse
       runs of whitespace;
    6. squeeze any character repeated three or more times down to one;
    7. strip leading and trailing whitespace.

    Parameters
    ----------
    text : str
        The raw comment.
    chatwords : dict[str, str], optional
        Chat-abbreviation table; defaults to the module-level
        ``chatword_dictionary``.
    slang : dict[str, str], optional
        Slang/contraction table; defaults to the module-level ``slang_dict``.
        When a key exists in both tables the chat-word expansion is used.

    Returns
    -------
    str
        The cleaned text.
    """
    if chatwords is None:
        chatwords = chatword_dictionary
    if slang is None:
        slang = slang_dict

    # Chat words are merged last so they win on shared keys ("lol", "idk", ...).
    expansions = {**slang, **chatwords}
    if expansions:
        # Longest keys first: "y'all" must be tried before the chat word "y",
        # otherwise the text becomes "why'all" and the contraction is lost.
        longest_first = sorted(expansions, key=len, reverse=True)
        expansion_re = re.compile(
            r'\b(?:' + '|'.join(re.escape(key) for key in longest_first) + r')\b'
        )
    else:
        # An empty alternation would match the empty string at every boundary.
        expansion_re = None

    def _expand(fragment):
        """Replace every whole-word dictionary key in ``fragment``."""
        if expansion_re is None:
            return fragment
        return expansion_re.sub(lambda match: expansions[match.group(0)], fragment)

    text = text.lower()  # Convert text to lowercase

    # Expand BEFORE removing digits, or 'b4', 'gr8', 'l8r' and 'w8' lose their
    # digit and can never match.
    text = _expand(text)
    text = re.sub(r'\d+', '', text)  # Remove digits
    # Removing digits can expose a new whole-word key ('4u' -> 'u' -> 'you').
    text = _expand(text)

    # Join words whose letters are masked with a symbol ('b@ckward' -> 'bckward').
    # This must run before the \W pass, which would otherwise already have turned
    # the symbol into a space. It is limited to typical masking symbols so that
    # hyphens, apostrophes and sentence punctuation still separate words.
    # Lookarounds keep the neighbouring letters out of the match, so runs such
    # as 'f**k' are handled in one go.
    text = re.sub(r'(?<=\w)[@#$*]+(?=\w)', '', text)

    text = re.sub(r'\W', ' ', text)  # Remove non-word characters
    text = re.sub(r'\s+', ' ', text)  # Remove extra whitespace
    text = re.sub(r'(.)\1{2,}', r'\1', text)  # 3+ repeats of a character -> 1 occurrence
    return text.strip()

In [13]:
# Run the cleaner over every raw comment and store the result in a new column.
data['clean_text'] = data['comment'].map(clean_text)

In [14]:
# Preview the cleaned comments (pandas truncates the display).
data.loc[:, 'clean_text']

0                                      dalits are lowlives
1                   gay people are a burden to our society
2                                    arabs are not welcome
3        i am not saying we should actually eliminate h...
4                             bananas are for black people
                               ...                        
41139                       indian culture is so backwards
41140    women do not want equality they just want to b...
41141                                           fuck covid
41142                             this computer is garbage
41143                   the only good muslim is a dead one
Name: clean_text, Length: 41108, dtype: object

# Tokenize text

In [15]:
# Tokenize text
# Each cleaned comment becomes a list of tokens produced by NLTK's word_tokenize.
data['tokens'] = data['clean_text'].map(word_tokenize)

In [16]:
# Preview the token lists.
data.loc[:, 'tokens']

0                                  [dalits, are, lowlives]
1          [gay, people, are, a, burden, to, our, society]
2                               [arabs, are, not, welcome]
3        [i, am, not, saying, we, should, actually, eli...
4                       [bananas, are, for, black, people]
                               ...                        
41139                 [indian, culture, is, so, backwards]
41140    [women, do, not, want, equality, they, just, w...
41141                                        [fuck, covid]
41142                        [this, computer, is, garbage]
41143          [the, only, good, muslim, is, a, dead, one]
Name: tokens, Length: 41108, dtype: object

# Remove stop words

In [17]:
# Remove stop words
# NOTE(review): NLTK's English list includes negations such as 'not', so
# "arabs are not welcome" is reduced to ['arabs', 'welcome'] -- confirm that
# dropping negation is acceptable for this classification task.
stop_words = set(stopwords.words('english'))


def remove_stop_words(tokens):
    """Return the tokens whose lowercase form is not in ``stop_words``.

    The original token order and casing are preserved.
    """
    kept = []
    for word in tokens:
        if word.lower() in stop_words:
            continue
        kept.append(word)
    return kept


data['remove_stopwords'] = data['tokens'].map(remove_stop_words)

In [18]:
# Preview the token lists after stop-word removal.
data.loc[:, 'remove_stopwords']

0                                       [dalits, lowlives]
1                           [gay, people, burden, society]
2                                         [arabs, welcome]
3        [saying, actually, eliminate, heebs, wish, nat...
4                                 [bananas, black, people]
                               ...                        
41139                         [indian, culture, backwards]
41140                [women, want, equality, want, charge]
41141                                        [fuck, covid]
41142                                  [computer, garbage]
41143                            [good, muslim, dead, one]
Name: remove_stopwords, Length: 41108, dtype: object

# Stemming

In [19]:
# Stemming
# NOTE(review): no later cell shown here reads the 'stemmed' column; the TF-IDF
# step is fed from 'lemmatized'.
ps = PorterStemmer()


def perform_stemming(tokens):
    """Return a list with the Porter stem of every token, in the same order."""
    return list(map(ps.stem, tokens))


data['stemmed'] = data['remove_stopwords'].map(perform_stemming)

In [20]:
# Preview the stemmed tokens.
data.loc[:, 'stemmed']

0                                          [dalit, lowliv]
1                            [gay, peopl, burden, societi]
2                                           [arab, welcom]
3        [say, actual, elimin, heeb, wish, natur, becam...
4                                   [banana, black, peopl]
                               ...                        
41139                           [indian, cultur, backward]
41140                    [women, want, equal, want, charg]
41141                                        [fuck, covid]
41142                                     [comput, garbag]
41143                            [good, muslim, dead, one]
Name: stemmed, Length: 41108, dtype: object

# Lemmatization

In [21]:
# Lemmatization
lemmatizer = WordNetLemmatizer()


def perform_lemmatization(tokens):
    """Return a list with the WordNet lemma of every token, in the same order.

    ``lemmatize`` is called without a part-of-speech argument, so its default
    is used for every token.
    """
    lemmas = []
    for word in tokens:
        lemmas.append(lemmatizer.lemmatize(word))
    return lemmas


data['lemmatized'] = data['remove_stopwords'].map(perform_lemmatization)

In [22]:
# Preview the lemmatized tokens.
data.loc[:, 'lemmatized']

0                                       [dalits, lowlives]
1                           [gay, people, burden, society]
2                                          [arab, welcome]
3        [saying, actually, eliminate, heebs, wish, nat...
4                                  [banana, black, people]
                               ...                        
41139                         [indian, culture, backwards]
41140                [woman, want, equality, want, charge]
41141                                        [fuck, covid]
41142                                  [computer, garbage]
41143                            [good, muslim, dead, one]
Name: lemmatized, Length: 41108, dtype: object

# Perform TF-IDF embedding

In [23]:
# Perform TF-IDF embedding
# TfidfVectorizer works on strings, so each token list is joined back into one
# space-separated document. The vocabulary is capped at 1000 features.
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_tfidf = tfidf_vectorizer.fit_transform(
    data['lemmatized'].map(' '.join)
)

# Initialize RandomOverSampler

In [29]:
# Initialize RandomOverSampler
# The fixed random_state makes the resampling repeatable from run to run.
ros = RandomOverSampler(
    random_state=42,
)

# Resample the data

In [30]:
# Resample the data
# NOTE(review): the whole data set is oversampled here; if a train/test split
# happens downstream, copies of one comment can land on both sides -- confirm.
X_resampled, y_resampled = ros.fit_resample(
    X=X_tfidf,
    y=data['label'],
)

# Convert back to DataFrame

In [31]:
# Convert back to DataFrame if necessary
# .toarray() densifies the sparse TF-IDF matrix: one float column per vocabulary term.
# NOTE(review): if 'label' is itself one of the vocabulary terms, the assignment
# below overwrites that feature column -- verify against the fitted vocabulary.
balanced_data = pd.DataFrame(
    data=X_resampled.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)
balanced_data['label'] = y_resampled

# Save balanced data to a new CSV file

In [32]:
# Save balanced data to a new CSV file
# index=False keeps the DataFrame's row index out of the file.
balanced_data.to_csv(
    'tfidf balanced_hatespeech_dataset.csv',
    index=False,
)