# Importing Libraries

In [1]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob
from sklearn.preprocessing import LabelEncoder

# Loading the Dataset

In [2]:
data = pd.read_csv('hatspeech dataset.csv', encoding='ISO-8859-1')  

In [3]:
data

Unnamed: 0.1,Unnamed: 0,comment,label
0,1,dalits are lowlives,N
1,2,gay people are a burden to our society,N
2,3,Arabs are not welcome,N
3,4,I'm not saying we should actually eliminate he...,N
4,5,bananas are for black people,N
...,...,...,...
41139,117100,Indian culture is so backwards,N
41140,118100,"Women don't want equality, they just want to b...",N
41141,119100,fuck covid,P
41142,1205,This computer is garbage,P


# Exploring the data

In [4]:
print(data.head())  # Check the first few rows of the dataset

   Unnamed: 0                                            comment label
0           1                                dalits are lowlives     N
1           2             gay people are a burden to our society     N
2           3                              Arabs are not welcome     N
3           4  I'm not saying we should actually eliminate he...     N
4           5                       bananas are for black people     N


In [5]:
print(data.info())  # Get information about the dataset, like column names and data types

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41144 entries, 0 to 41143
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  41144 non-null  int64 
 1   comment     41144 non-null  object
 2   label       41144 non-null  object
dtypes: int64(1), object(2)
memory usage: 964.4+ KB
None


# Checking the distribution of labels

In [6]:
print(data['label'].value_counts())  # Check the distribution of labels

label
N    22158
P    18950
O       36
Name: count, dtype: int64


In [7]:
data = data[data['label'] != 'O'].copy()

In [8]:
print(data['label'].value_counts())

label
N    22158
P    18950
Name: count, dtype: int64


In [9]:
data.isnull().any()

Unnamed: 0    False
comment       False
label         False
dtype: bool

# Checking for Duplicates

In [10]:
#Find the number of duplicate rows
num_duplicates = data.duplicated().sum()
print(f'Number of duplicate rows: {num_duplicates}')

Number of duplicate rows: 0


# Importing necessary nltk packages

In [11]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\maddi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\maddi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\maddi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Text Cleaning

In [12]:
chatword_dictionary = {
    'u': 'you',
    'ur': 'your',
    'r': 'are',
    'y': 'why',
    'b4': 'before',
    'gr8': 'great',
    'l8r': 'later',
    'w8': 'wait',
    'bff': 'best friend forever',
    'brb': 'be right back',
    'btw': 'by the way',
    'cuz': 'because',
    'idk': 'i do not know',
    'ikr': 'i know right',
    'imo': 'in my opinion',
    'lmao': 'laughing my ass off',
    'lol': 'laugh out loud',
    'omg': 'oh my god',
    'omw': 'on my way',
    'pls': 'please',
    'thx': 'thanks',
    'ttyl': 'talk to you later',
    'wth': 'what the hell',
    'wyd': 'what you doing',
    'smh': 'shaking my head'
}


In [13]:
slang_dict = {
    "kinda": "kind of",
    "wanna": "want to",
    "gonna": "going to",
    "gotta": "got to",
    "ain't": "is not",
    "y'all": "you all",
    "can't": "cannot",
    "won't": "will not",
    "don't": "do not",
    "idk": "I do not know",
    "tbh": "to be honest",
    "brb": "be right back",
    "bbl": "be back later",
    "btw": "by the way",
    "omg": "oh my god",
    "lol": "laughing out loud",
    "i'm": "i am",
    "you're": "you are",
    "he's": "he is",
    "she's": "she is",
    "it's": "it is",
    "we're": "we are",
    "they're": "they are",
    "i'll": "i will",
    "you'll": "you will",
    "he'll": "he will",
    "she'll": "she will",
    "we'll": "we will",
    "they'll": "they will",
    "i've": "i have",
    "you've": "you have",
    "we've": "we have",
    "they've": "they have",
    "isn't": "is not",
    "aren't": "are not",
    "wasn't": "was not",
    "weren't": "were not",
    "haven't": "have not",
    "hasn't": "has not",
    "hadn't": "had not",
    "doesn't": "does not",
    "didn't": "did not",
    "won't": "will not",
    "wouldn't": "would not",
    "don't": "do not",
    "can't": "cannot",
    "couldn't": "could not",
    "shouldn't": "should not",
    "mightn't": "might not",
    "mustn't": "must not"
}


In [14]:
def clean_text(text):
  text = text.lower()  # Convert text to lowercase
  text = re.sub(r'\d+', '', text)  # Remove digits
  for chatword, full_form in chatword_dictionary.items():
      text = re.sub(rf"\b{chatword}\b", full_form, text)
  for slang, expanded in slang_dict.items():
      text = re.sub(r'\b' + re.escape(slang) + r'\b', expanded, text)

  text = re.sub(r'\b\w\b', '', text)  # Remove single letters
  text = re.sub(r'\W', ' ', text)  # Remove non-word characters
  text = re.sub(r'\s+', ' ', text)  # Remove extra whitespace
  text = re.sub(r'(.)\1{2,}', r'\1', text)  # Replace sequences of the same character > 2 with 1 occurrence
  text = re.sub(r'(\w)[^\w\s](\w)', r'\1\2', text)  # Remove special characters within words like 'b@ckward' to 'backward'
  return text

In [15]:
data['clean_text'] = data['comment'].apply(clean_text)

In [16]:
print(data['clean_text'])

0                                      dalits are lowlives
1                     gay people are burden to our society
2                                    arabs are not welcome
3         am not saying we should actually eliminate he...
4                             bananas are for black people
                               ...                        
41139                       indian culture is so backwards
41140    women do not want equality they just want to b...
41141                                           fuck covid
41142                             this computer is garbage
41143                     the only good muslim is dead one
Name: clean_text, Length: 41108, dtype: object


# Tokenization

Tokenization is a way of separating a piece of text into smaller units called tokens. Here, tokens can be either words, characters, or subwords.For example, tokenizing the sentence “I love ice cream” would result in three tokens: “I,” “love,” and “ice cream.” It’s a fundamental step in natural language processing and text analysis tasks.

In [17]:
from nltk.tokenize import word_tokenize

data['tokens'] = data['clean_text'].apply(word_tokenize)

In [18]:
data['tokens']

0                                  [dalits, are, lowlives]
1             [gay, people, are, burden, to, our, society]
2                               [arabs, are, not, welcome]
3        [am, not, saying, we, should, actually, elimin...
4                       [bananas, are, for, black, people]
                               ...                        
41139                 [indian, culture, is, so, backwards]
41140    [women, do, not, want, equality, they, just, w...
41141                                        [fuck, covid]
41142                        [this, computer, is, garbage]
41143             [the, only, good, muslim, is, dead, one]
Name: tokens, Length: 41108, dtype: object

# Stopword removal

In [19]:
stop_words = set(stopwords.words('english'))

In [20]:
# Function to remove stop words
def remove_stop_words(tokens):
    return [word for word in tokens if word.lower() not in stop_words]

In [21]:
data['remove_stopwords'] = data['tokens'].apply(remove_stop_words)

In [22]:
data['remove_stopwords']

0                                       [dalits, lowlives]
1                           [gay, people, burden, society]
2                                         [arabs, welcome]
3        [saying, actually, eliminate, heebs, wish, nat...
4                                 [bananas, black, people]
                               ...                        
41139                         [indian, culture, backwards]
41140                [women, want, equality, want, charge]
41141                                        [fuck, covid]
41142                                  [computer, garbage]
41143                            [good, muslim, dead, one]
Name: remove_stopwords, Length: 41108, dtype: object

# stemming

In [23]:
#stemming
ps= PorterStemmer()

In [24]:
# Function to perform stemming
def perform_stemming(tokens):
    return [ps.stem(word) for word in tokens]

In [25]:
data['stemmed'] = data['remove_stopwords'].apply(perform_stemming)

In [26]:
data['stemmed']

0                                          [dalit, lowliv]
1                            [gay, peopl, burden, societi]
2                                           [arab, welcom]
3        [say, actual, elimin, heeb, wish, natur, becam...
4                                   [banana, black, peopl]
                               ...                        
41139                           [indian, cultur, backward]
41140                    [women, want, equal, want, charg]
41141                                        [fuck, covid]
41142                                     [comput, garbag]
41143                            [good, muslim, dead, one]
Name: stemmed, Length: 41108, dtype: object

# Lemmatization

Lemmatization is the process of grouping together the different inflected forms of a word so they can be analyzed as a single item. Lemmatization is similar to stemming but it brings context to the words. So, it links words with similar meanings to one word. The practical distinction between stemming and lemmatization is that, where stemming merely removes common suffixes from the end of word tokens, lemmatization ensures the output word is an existing normalized form of the word (for example, lemma) that can be found in the dictionary.

In [27]:
lemmatizer = WordNetLemmatizer()

In [28]:
# Function to perform lemmatization
def perform_lemmatization(tokens):
    return [lemmatizer.lemmatize(word) for word in tokens]

In [29]:
data['lemmatized'] = data['remove_stopwords'].apply(perform_lemmatization)

In [30]:
data['lemmatized']

0                                       [dalits, lowlives]
1                           [gay, people, burden, society]
2                                          [arab, welcome]
3        [saying, actually, eliminate, heebs, wish, nat...
4                                  [banana, black, people]
                               ...                        
41139                         [indian, culture, backwards]
41140                [woman, want, equality, want, charge]
41141                                        [fuck, covid]
41142                                  [computer, garbage]
41143                            [good, muslim, dead, one]
Name: lemmatized, Length: 41108, dtype: object

In [31]:
# Label Encoding
label_encoder = LabelEncoder()
data['label_encoded'] = label_encoder.fit_transform(data['label'])

In [32]:
data.to_csv('label_encoded.csv', index=False)
print("File saved")

File saved
