In [17]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [18]:
# Read the raw comment dataset into a DataFrame, decoding the file as latin1.
data = pd.read_csv(
    'hatspeech dataset.csv',
    encoding='latin1',
)

In [19]:
# Check the first few rows of the dataset. Left as the cell's final expression so the
# notebook renders the frame itself, matching the later preview cells.
data.head()

   Unnamed: 0                                            comment label
0           1                                dalits are lowlives     N
1           2             gay people are a burden to our society     N
2           3                              Arabs are not welcome     N
3           4  I'm not saying we should actually eliminate he...     N
4           5                       bananas are for black people     N


In [20]:
# Column names, non-null counts and dtypes. info() writes the report itself and
# returns None, so wrapping it in print() only appended a stray "None" line.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41144 entries, 0 to 41143
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  41144 non-null  int64 
 1   comment     41144 non-null  object
 2   label       41144 non-null  object
dtypes: int64(1), object(2)
memory usage: 964.4+ KB
None


In [21]:
# Check the distribution of labels. Left as the cell's final expression so the
# notebook renders the result, matching the later preview cells.
# NOTE(review): the recorded output lists a third label 'O' with only 36 rows —
# confirm whether it is a genuine class or a data-entry error before modelling.
data['label'].value_counts()

label
N    22158
P    18950
O       36
Name: count, dtype: int64


In [22]:
# Find the number of duplicate rows.
# Only the content columns are compared: 'Unnamed: 0' appears to be a running row
# id (1, 2, 3, ... in the preview — TODO confirm), and including a unique id in the
# comparison means no two rows can ever be reported as duplicates.
num_duplicates = data.duplicated(subset=['comment', 'label']).sum()
print(f'Number of duplicate rows: {num_duplicates}')

Number of duplicate rows: 0


In [39]:
def clean_text(text):
    """Normalise a raw comment into lowercase words separated by single spaces.

    Steps, in order:
      1. lowercase the text;
      2. replace every non-word character (punctuation, symbols) with a space;
      3. collapse each run of whitespace into one space;
      4. collapse any run of three or more identical characters into a single
         occurrence ("soooo" -> "so");
      5. trim leading and trailing spaces left behind by step 2.

    Args:
        text: The raw comment. ``None`` and float NaN (a missing value in a
            pandas column) are treated as an empty comment; any other
            non-string value is converted with ``str()``.

    Returns:
        str: The cleaned text, or ``''`` when nothing remains.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).lower()
    text = re.sub(r'\W', ' ', text)  # punctuation/symbols -> space
    text = re.sub(r'\s+', ' ', text)  # squeeze whitespace runs
    text = re.sub(r'(.)\1{2,}', r'\1', text)  # 3+ repeats of a character -> 1
    # The former final substitution (drop a punctuation mark sitting between two
    # word characters) was removed: after the \W replacement above no punctuation
    # is left, so it could never match. Contractions therefore keep splitting
    # ("don't" -> "don t"), which is what the downstream stop-word filter
    # currently relies on to discard the fragments.
    return text.strip()

In [40]:
# Store the cleaned form of every comment in a new 'clean_text' column.
data['clean_text'] = data['comment'].map(clean_text)

In [41]:
# Inspect the cleaned comments (the notebook abbreviates the long Series).
data['clean_text']

0                                      dalits are lowlives
1                   gay people are a burden to our society
2                                    arabs are not welcome
3        i m not saying we should actually eliminate he...
4                             bananas are for black people
                               ...                        
41139                       indian culture is so backwards
41140    women don t want equality they just want to be...
41141                                           fuck covid
41142                             this computer is garbage
41143                   the only good muslim is a dead one
Name: clean_text, Length: 41144, dtype: object

In [42]:
# Step 4: whitespace tokenization — break each cleaned comment into a list of words.
data['tokens'] = data['clean_text'].map(str.split)

In [43]:
# Inspect the token lists produced by the whitespace split.
data['tokens']

0                                  [dalits, are, lowlives]
1          [gay, people, are, a, burden, to, our, society]
2                               [arabs, are, not, welcome]
3        [i, m, not, saying, we, should, actually, elim...
4                       [bananas, are, for, black, people]
                               ...                        
41139                 [indian, culture, is, so, backwards]
41140    [women, don, t, want, equality, they, just, wa...
41141                                        [fuck, covid]
41142                        [this, computer, is, garbage]
41143          [the, only, good, muslim, is, a, dead, one]
Name: tokens, Length: 41144, dtype: object

In [44]:
# English stop words from NLTK, held in a set so membership checks are cheap.
# NOTE(review): the recorded outputs show "not" being filtered out
# ("arabs are not welcome" ends up as [arabs, welcome]) — confirm that dropping
# negations is acceptable for this task.
stop_words = {*stopwords.words('english')}

In [45]:
def remove_stopwords(tokens):
    """Return the tokens that are not in the module-level ``stop_words`` set.

    Args:
        tokens: Iterable of word strings.

    Returns:
        list: The surviving tokens, in their original order.
    """
    kept = []
    for token in tokens:
        if token in stop_words:
            continue
        kept.append(token)
    return kept

In [46]:
# Overwrite 'tokens' with the stop-word-filtered version of each list.
data['tokens'] = data['tokens'].map(remove_stopwords)

In [47]:
# Inspect the token lists again now that stop words have been filtered out.
data['tokens']

0                                       [dalits, lowlives]
1                           [gay, people, burden, society]
2                                         [arabs, welcome]
3        [saying, actually, eliminate, heebs, wish, nat...
4                                 [bananas, black, people]
                               ...                        
41139                         [indian, culture, backwards]
41140                [women, want, equality, want, charge]
41141                                        [fuck, covid]
41142                                  [computer, garbage]
41143                            [good, muslim, dead, one]
Name: tokens, Length: 41144, dtype: object

In [48]:
# Single WordNet lemmatizer instance, reused by lemmatize_words for every row.
lemmatizer = WordNetLemmatizer()

In [49]:
def lemmatize_words(tokens):
    """Run every token through the module-level ``lemmatizer``.

    Args:
        tokens: Iterable of word strings.

    Returns:
        list: One lemmatized entry per input token, in the original order.
    """
    # lemmatize() receives only the word, so it runs with its default settings.
    return list(map(lemmatizer.lemmatize, tokens))

In [50]:
# Keep the lemmatized lists in their own column next to the filtered tokens.
data['lemmatized_tokens'] = data['tokens'].map(lemmatize_words)

In [51]:
# Inspect the lemmatized token lists.
data['lemmatized_tokens']

0                                       [dalits, lowlives]
1                           [gay, people, burden, society]
2                                          [arab, welcome]
3        [saying, actually, eliminate, heebs, wish, nat...
4                                  [banana, black, people]
                               ...                        
41139                         [indian, culture, backwards]
41140                [woman, want, equality, want, charge]
41141                                        [fuck, covid]
41142                                  [computer, garbage]
41143                            [good, muslim, dead, one]
Name: lemmatized_tokens, Length: 41144, dtype: object

In [52]:
# Glue each row's lemmatized tokens back into one space-separated string.
data['final_text'] = data['lemmatized_tokens'].map(' '.join)

In [53]:
# Inspect the re-joined, fully preprocessed text.
data['final_text']

0                                          dalits lowlives
1                                gay people burden society
2                                             arab welcome
3        saying actually eliminate heebs wish naturally...
4                                      banana black people
                               ...                        
41139                             indian culture backwards
41140                      woman want equality want charge
41141                                           fuck covid
41142                                     computer garbage
41143                                 good muslim dead one
Name: final_text, Length: 41144, dtype: object

In [54]:
# Write the whole frame (original columns plus the derived ones) to disk,
# leaving the DataFrame index out of the file.
data.to_csv(
    path_or_buf='whitespace tokenization.csv',
    index=False,
)
print("File saved")

File saved
