In [13]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, processors

In [14]:
data = pd.read_csv('hatspeech dataset.csv',encoding='latin1')

In [15]:
print(data.head())  # Check the first few rows of the dataset

   Unnamed: 0                                            comment label
0           1                                dalits are lowlives     N
1           2             gay people are a burden to our society     N
2           3                              Arabs are not welcome     N
3           4  I'm not saying we should actually eliminate he...     N
4           5                       bananas are for black people     N


In [4]:
print(data.info())  # Get information about the dataset, like column names and data types

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41144 entries, 0 to 41143
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  41144 non-null  int64 
 1   comment     41144 non-null  object
 2   label       41144 non-null  object
dtypes: int64(1), object(2)
memory usage: 964.4+ KB
None


In [5]:
print(data['label'].value_counts())  # Check the distribution of labels

label
N    22158
P    18950
O       36
Name: count, dtype: int64


In [6]:
#Find the number of duplicate rows
num_duplicates = data.duplicated().sum()
print(f'Number of duplicate rows: {num_duplicates}')

Number of duplicate rows: 0


In [7]:
def clean_text(text):
    text = text.lower()  # Convert text to lowercase
    text = re.sub(r'\W', ' ', text)  # Remove non-word characters
    text = re.sub(r'\s+', ' ', text) # Remove extra whitespace
    text = re.sub(r'(.)\1{2,}', r'\1', text)  # Replace sequences of the same character > 2 with 1 occurrence
    text = re.sub(r'(\w)[^\w\s](\w)', r'\1\2', text)
    return text

In [8]:
data['clean_text'] = data['comment'].apply(clean_text)

In [9]:
data['clean_text']

0                                      dalits are lowlives
1                   gay people are a burden to our society
2                                    arabs are not welcome
3        i m not saying we should actually eliminate he...
4                             bananas are for black people
                               ...                        
41139                       indian culture is so backwards
41140    women don t want equality they just want to be...
41141                                           fuck covid
41142                             this computer is garbage
41143                   the only good muslim is a dead one
Name: clean_text, Length: 41144, dtype: object

In [17]:
def train_bpe_tokenizer(texts, vocab_size=10000):
    tokenizer = Tokenizer(models.BPE())
    
    # Customize pre-tokenizer and post-processor if needed
    tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
    
    # Create a trainer for the tokenizer
    trainer = trainers.BpeTrainer(vocab_size=vocab_size)
    
    # Train the tokenizer on the provided texts
    tokenizer.train_from_iterator(texts, trainer=trainer)
    
    return tokenizer

In [20]:
 #Train the tokenizer on the clean text data
tokenizer = train_bpe_tokenizer(data['comment'].tolist())

In [21]:
# Tokenize the text data
def tokenize_with_bpe(text, tokenizer):
    encoded = tokenizer.encode(text)
    return encoded.tokens

In [23]:
data['tokens'] = data['comment'].apply(lambda x: tokenize_with_bpe(x, tokenizer))

In [24]:
data['tokens']

0                              [d, alits, are, low, lives]
1          [gay, people, are, a, burden, to, our, society]
2                               [Arabs, are, not, welcome]
3        [I, ', m, not, saying, we, should, actually, e...
4                       [bananas, are, for, black, people]
                               ...                        
41139                 [Indian, culture, is, so, backwards]
41140    [Women, don, ', t, want, equality, ,, they, ju...
41141                                        [fuck, covid]
41142                        [This, computer, is, garbage]
41143          [The, only, good, Muslim, is, a, dead, one]
Name: tokens, Length: 41144, dtype: object

In [10]:
stop_words = set(stopwords.words('english'))

In [11]:
def remove_stopwords(tokens):
    return [word for word in tokens if word not in stop_words]

In [12]:
data['tokens'] = data['tokens'].apply(remove_stopwords)

KeyError: 'tokens'

In [None]:
data['tokens']

In [None]:
lemmatizer = WordNetLemmatizer()

In [None]:
def lemmatize_words(tokens):
    return [lemmatizer.lemmatize(word) for word in tokens]

In [None]:
data['lemmatized_tokens'] = data['tokens'].apply(lemmatize_words)

In [None]:
data['lemmatized_tokens']

In [None]:
data['final_text'] = data['lemmatized_tokens'].apply(lambda x: ' '.join(x))

In [None]:
data['final_text']

In [None]:
data.to_csv('byte pair encoded.csv', index=False)
print("File saved")