In [1]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
data = pd.read_csv('hatspeech dataset.csv', encoding='ISO-8859-1')  

In [3]:
data

Unnamed: 0.1,Unnamed: 0,comment,label
0,1,dalits are lowlives,N
1,2,gay people are a burden to our society,N
2,3,Arabs are not welcome,N
3,4,I'm not saying we should actually eliminate he...,N
4,5,bananas are for black people,N
...,...,...,...
41139,117100,Indian culture is so backwards,N
41140,118100,"Women don't want equality, they just want to b...",N
41141,119100,fuck covid,P
41142,1205,This computer is garbage,P


In [4]:
print(data.info())  # Get information about the dataset, like column names and data types

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41144 entries, 0 to 41143
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  41144 non-null  int64 
 1   comment     41144 non-null  object
 2   label       41144 non-null  object
dtypes: int64(1), object(2)
memory usage: 964.4+ KB
None


In [5]:
print(data['label'].value_counts())  # Check the distribution of labels

label
N    22158
P    18950
O       36
Name: count, dtype: int64


In [6]:
data = data[data['label'] != 'O'].copy()

In [7]:
print(data['label'].value_counts())

label
N    22158
P    18950
Name: count, dtype: int64


In [8]:
data.isnull().any()

Unnamed: 0    False
comment       False
label         False
dtype: bool

In [9]:
#Find the number of duplicate rows
num_duplicates = data.duplicated().sum()
print(f'Number of duplicate rows: {num_duplicates}')

Number of duplicate rows: 0


In [10]:
chatword_dictionary = {
    'u': 'you', 'ur': 'your', 'r': 'are', 'y': 'why', 'b4': 'before', 'gr8': 'great', 'l8r': 'later',
    'w8': 'wait', 'bff': 'best friend forever', 'brb': 'be right back', 'btw': 'by the way', 'cuz': 'because',
    'idk': 'i do not know', 'ikr': 'i know right', 'imo': 'in my opinion', 'lmao': 'laughing my ass off',
    'lol': 'laugh out loud', 'omg': 'oh my god', 'omw': 'on my way', 'pls': 'please', 'thx': 'thanks',
    'ttyl': 'talk to you later', 'wth': 'what the hell', 'wyd': 'what you doing', 'smh': 'shaking my head'}

In [11]:
slang_dict = {
    "kinda": "kind of", "wanna": "want to", "gonna": "going to", "gotta": "got to", "ain't": "is not",
    "y'all": "you all", "can't": "cannot", "won't": "will not", "don't": "do not", "idk": "I do not know",
    "tbh": "to be honest", "brb": "be right back", "bbl": "be back later", "btw": "by the way",
    "omg": "oh my god", "lol": "laughing out loud", "i'm": "i am", "you're": "you are", "he's": "he is",
    "she's": "she is", "it's": "it is", "we're": "we are", "they're": "they are", "i'll": "i will",
    "you'll": "you will", "he'll": "he will", "she'll": "she will", "we'll": "we will", "they'll": "they will",
    "i've": "i have", "you've": "you have", "we've": "we have", "they've": "they have", "isn't": "is not",
    "aren't": "are not", "wasn't": "was not", "weren't": "were not", "haven't": "have not", "hasn't": "has not",
    "hadn't": "had not", "doesn't": "does not", "didn't": "did not", "won't": "will not", "wouldn't": "would not",
    "don't": "do not", "can't": "cannot", "couldn't": "could not", "shouldn't": "should not", "mightn't": "might not",
    "mustn't": "must not"
}

In [12]:
def clean_text(text):
    text = text.lower()  # Convert text to lowercase
    text = re.sub(r'\d+', '', text)  # Remove digits
    for chatword, full_form in chatword_dictionary.items():
        text = re.sub(rf"\b{chatword}\b", full_form, text)
    for slang, expanded in slang_dict.items():
        text = re.sub(r'\b' + re.escape(slang) + r'\b', expanded, text)
    text = re.sub(r'\b\w\b', '', text)  # Remove single letters
    text = re.sub(r'\W', ' ', text)  # Remove non-word characters
    text = re.sub(r'\s+', ' ', text)  # Remove extra whitespace
    text = re.sub(r'(.)\1{2,}', r'\1', text)  # Replace sequences of the same character > 2 with 1 occurrence
    return text

In [13]:
data['clean_text'] = data['comment'].apply(clean_text)

In [14]:
# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(data['clean_text'])

In [15]:
# Convert TF-IDF matrix to DataFrame for better readability
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

In [16]:
# Concatenate the original data (with clean text) and TF-IDF DataFrame
result_df = pd.concat([data.reset_index(drop=True), tfidf_df], axis=1)
print("Concatenated DataFrame preview:")
print(result_df.head())

Concatenated DataFrame preview:
   Unnamed: 0                                            comment label  \
0           1                                dalits are lowlives     N   
1           2             gay people are a burden to our society     N   
2           3                              Arabs are not welcome     N   
3           4  I'm not saying we should actually eliminate he...     N   
4           5                       bananas are for black people     N   

                                          clean_text  _as  _spread   aa  aah  \
0                                dalits are lowlives  0.0      0.0  0.0  0.0   
1               gay people are burden to our society  0.0      0.0  0.0  0.0   
2                              arabs are not welcome  0.0      0.0  0.0  0.0   
3   am not saying we should actually eliminate he...  0.0      0.0  0.0  0.0   
4                       bananas are for black people  0.0      0.0  0.0  0.0   

   aan  aand  ...  zounderkites   zs  zube

In [None]:
tfidf_df.to_csv('tf-idf encoded.csv', index=False)
print("File saved")