In [1]:
import pandas as pd
import re
import string
from tqdm.auto import tqdm
import os
import platform
import numpy as np

## Preprocessing the testset

Now we will prepare the testset.

1. concatenate the labels to one column
2. remove all rows with the label -1, which means the text is not labeled
3. clean the text
4. save the preprocessed testset

In [None]:
data = pd.read_csv("../data/test/test.csv")
data.head()

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


In [None]:
labels = pd.read_csv("../data/test_labels/test_labels.csv")
labels.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,-1,-1,-1,-1,-1,-1
1,0000247867823ef7,-1,-1,-1,-1,-1,-1
2,00013b17ad220c46,-1,-1,-1,-1,-1,-1
3,00017563c3f7919a,-1,-1,-1,-1,-1,-1
4,00017695ad8997eb,-1,-1,-1,-1,-1,-1


The value -1 means the text is not labeled. We will concatenate the labels to one column and remove all rows with the label -1. If the sum of the labels is greater than 0, the text is labeled as hate speech. If the sum is 0, the text is not labeled as hate speech.

In [None]:
def fun(x):
    if x.sum() > 0:
        return 1
    elif x.sum() == 0:
        return 0
    return -1

In [None]:
rows = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

data['label'] = labels[rows].apply(lambda x: fun(x), axis=1)

for i in rows:
    del labels[i]

data.head()

Unnamed: 0,id,comment_text,label
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...,-1
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...,-1
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap...",-1
3,00017563c3f7919a,":If you have a look back at the source, the in...",-1
4,00017695ad8997eb,I don't anonymously edit articles at all.,-1


Now we remove all rows with the label -1.

In [None]:
data = data[data['label'] != -1]
data.head()

Unnamed: 0,id,comment_text,label
5,0001ea8717f6de06,Thank you for understanding. I think very high...,0
7,000247e83dcc1211,:Dear god this site is horrible.,0
11,0002f87b16116a7f,"""::: Somebody will invariably try to add Relig...",0
13,0003e1cccfd5a40a,""" \n\n It says it right there that it IS a typ...",0
14,00059ace3e3e9a53,""" \n\n == Before adding a new product to the l...",0


For text cleaning, we only remove special characters, links, and punctuation, because we want to keep the text as close to the original as possible and BERT does not require any further preprocessing. Stopwords are not removed because they can be important for the model to understand the context of the text.

In [None]:
def clean_text(text):

    text = re.sub('\[.*?\]', '', text)
    #pattern = [zero or more character]

    text = re.sub('https?://\S+|www\.\S+', '', text)
    #pattern = removes (http),://, 'and' www.

    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    #pattern = any punctionation

    text = re.sub('\n', ' ', text)
    #pattern = any new line

    text = re.sub('\w*\d\w*', '', text)
    #pattern = any from[a-zA-Z0-9_], any from[0-9], any from [a-zA-Z0-9_]

    return text

In [None]:
data['comment_text'] = data['comment_text'].apply(lambda x: clean_text(x))
data.head()

Unnamed: 0,id,comment_text,label
5,0001ea8717f6de06,Thank you for understanding I think very highl...,0
7,000247e83dcc1211,Dear god this site is horrible,0
11,0002f87b16116a7f,Somebody will invariably try to add Religion ...,0
13,0003e1cccfd5a40a,It says it right there that it IS a type T...,0
14,00059ace3e3e9a53,Before adding a new product to the list m...,0


save the preprocessed testset

In [None]:
data.to_csv("../data/test/clean_test.csv", index=False)

## Preprocessing the trainset for the model

Now we will prepare the trainset for the model.

1. clean the text
2. concatenate the labels to one column
3. save the preprocessed trainset

First we will load the trainset and check the operating system to load the data from the correct path.

In [None]:
def check_os_and_load_data(filename):
    if platform.system() == "Windows":
        print("This is a Windows system. Running Windows-specific code.")
        # Assuming the directory path for Windows is '../data/train/'
        path = os.path.join('..', 'data', 'train', filename)
        data = pd.read_csv(path, encoding='utf-8')

    elif platform.system() == "Linux":
        print("This is a Linux system. Running Linux-specific code.")
        # Assuming the directory path for Linux is the home directory
        path = os.path.join(os.path.expanduser('~'), 'ownCloud - Michael Saxer (zhaw.ch)@drive.switch.ch', '4 Semester', 'NLP', 'workbench', filename)
        data = pd.read_csv(path, encoding='utf-8')

    else:
        print("This is neither a Windows nor a Linux system. You're on your own, sorry.")
        data = None  # Or handle other operating systems as needed

    return data

# Pass only the filename, not the path.
data = check_os_and_load_data('train.csv')

This is a Windows system. Running Windows-specific code.


In [None]:
data.head(8)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
5,00025465d4725e87,"""\n\nCongratulations from me as well, use the ...",0,0,0,0,0,0
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
7,00031b1e95af7921,Your vandalism to the Matt Shirvington article...,0,0,0,0,0,0


Now clean the trainset

In [None]:
data['comment_text'] = data['comment_text'].apply(lambda x: clean_text(x))
data.head(8)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation Why the edits made under my userna...,0,0,0,0,0,0
1,000103f0d9cfb60f,Daww He matches this background colour Im seem...,0,0,0,0,0,0
2,000113f07ec002fd,Hey man Im really not trying to edit war Its j...,0,0,0,0,0,0
3,0001b41b1c6bb37e,More I cant make any real suggestions on impr...,0,0,0,0,0,0
4,0001d958c54c6e35,You sir are my hero Any chance you remember wh...,0,0,0,0,0,0
5,00025465d4725e87,Congratulations from me as well use the tool...,0,0,0,0,0,0
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
7,00031b1e95af7921,Your vandalism to the Matt Shirvington article...,0,0,0,0,0,0


Now we will concatenate the labels to one column. If the sum of the labels is greater than 0, the text is labeled as hate speech. If the sum is 0, the text is not labeled as hate speech.

In [None]:
def concat(x):
    if x.sum() != 0:
        return 1
    return 0

rows = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

data['hate'] = data[rows].apply(lambda x: concat(x), axis=1)

for i in rows:
    del data[i]

In [None]:
data.head(10)

Unnamed: 0,id,comment_text,hate
0,0000997932d777bf,Explanation Why the edits made under my userna...,0
1,000103f0d9cfb60f,Daww He matches this background colour Im seem...,0
2,000113f07ec002fd,Hey man Im really not trying to edit war Its j...,0
3,0001b41b1c6bb37e,More I cant make any real suggestions on impr...,0
4,0001d958c54c6e35,You sir are my hero Any chance you remember wh...,0
5,00025465d4725e87,Congratulations from me as well use the tool...,0
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1
7,00031b1e95af7921,Your vandalism to the Matt Shirvington article...,0
8,00037261f536c51d,Sorry if the word nonsense was offensive to yo...,0
9,00040093b2687caa,alignment on this subject and which are contra...,0


save the preprocessed trainset

In [None]:
data.to_csv("data/train/clean_train.csv", index=False)

NameError: name 'data' is not defined

## Balancing training dataset 

In [None]:
data = pd.read_csv("data/train/clean_train.csv")

In [None]:
# Indizes für jede Klasse extrahieren
class_0_indices = [i for i, label in enumerate(data['hate']) if label == 0]
class_1_indices = [i for i, label in enumerate(data['hate']) if label == 1]

# Undersampling für Klasse 0
undersampled_class_0_indices = np.random.choice(class_0_indices, size=len(class_1_indices), replace=False)

# Zusammenführen der Indizes für ein ausgeglichenes Dataset
balanced_indices = list(undersampled_class_0_indices) + class_1_indices

data_balanced = data.iloc[balanced_indices]

data_balanced

Unnamed: 0,id,comment_text,hate
138485,e4dd8c9c1ba982a2,Lol looks like UserReliableBen has decided t...,0
138248,e3cb52163b78e814,History needs updating The History section nee...,0
66015,b084b4ce7d3dda79,Please remember that i am a human being before,0
93998,fb5634abb59d370b,I was wondering if I could keep the bridge ...,0
140790,f160f6e62e366201,Thanks Rlevse WMC also deleted Abds article ...,0
...,...,...,...
159494,fef4cf7ba0012866,our previous conversation you fucking shi...,1
159514,ff39a2895fc3b40e,YOU ARE A MISCHIEVIOUS PUBIC HAIR,1
159541,ffa33d3122b599d6,Your absurd edits Your absurd edits on great...,1
159546,ffb47123b2d82762,Hey listen dont you ever Delete my edits eve...,1


In [21]:
data_balanced.to_csv("data/train/balanced_train.csv", index=False)

OSError: Cannot save file into a non-existent directory: 'data/train'

In [None]:
## Balancing test dataset 

In [40]:
#data = pd.read_csv("data/train/clean_train.csv")
filenames = 'test_labels.csv'
path = os.path.join(os.path.expanduser('~'), 'ownCloud - Michael Saxer (zhaw.ch)@drive.switch.ch', '4 Semester', 'NLP', 'workbench', filenames)

data_labels = pd.read_csv(path)
data_labels.head(100)

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,-1,-1,-1,-1,-1,-1
1,0000247867823ef7,-1,-1,-1,-1,-1,-1
2,00013b17ad220c46,-1,-1,-1,-1,-1,-1
3,00017563c3f7919a,-1,-1,-1,-1,-1,-1
4,00017695ad8997eb,-1,-1,-1,-1,-1,-1
...,...,...,...,...,...,...,...
95,0023f3f84f353bce,0,0,0,0,0,0
96,002586bdf3280356,-1,-1,-1,-1,-1,-1
97,0025a91b6955f1a5,0,0,0,0,0,0
98,0025c49d87d9a18f,0,0,0,0,0,0


In [39]:
#data = pd.read_csv("data/train/clean_train.csv")
filenames = 'test.csv'
path = os.path.join(os.path.expanduser('~'), 'ownCloud - Michael Saxer (zhaw.ch)@drive.switch.ch', '4 Semester', 'NLP', 'workbench', filenames)

data_comments = pd.read_csv(path)
data_comments.head(100)

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.
...,...,...
95,0023f3f84f353bce,""" \n\n == Main towns that are not so main == \..."
96,002586bdf3280356,""" \n\n my comments follow, bluewillow991967 -..."
97,0025a91b6955f1a5,""" \n\n == Halliday == \n\n Good to see another..."
98,0025c49d87d9a18f,""" \n ::: That Stephen Barrett is not Board Cer..."


In [41]:
data = pd.merge(data_labels, data_comments, on='id', how='inner')
data

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate,comment_text
0,00001cee341fdb12,-1,-1,-1,-1,-1,-1,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,-1,-1,-1,-1,-1,-1,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,-1,-1,-1,-1,-1,-1,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,-1,-1,-1,-1,-1,-1,":If you have a look back at the source, the in..."
4,00017695ad8997eb,-1,-1,-1,-1,-1,-1,I don't anonymously edit articles at all.
...,...,...,...,...,...,...,...,...
153159,fffcd0960ee309b5,-1,-1,-1,-1,-1,-1,". \n i totally agree, this stuff is nothing bu..."
153160,fffd7a9a6eb32c16,-1,-1,-1,-1,-1,-1,== Throw from out field to home plate. == \n\n...
153161,fffda9e8d6fafa9e,-1,-1,-1,-1,-1,-1,""" \n\n == Okinotorishima categories == \n\n I ..."
153162,fffe8f1340a79fc2,-1,-1,-1,-1,-1,-1,""" \n\n == """"One of the founding nations of the..."


In [42]:
def fun(x):
    if x.sum() < 0:
        return 0
    return 1

rows = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

data['was_used'] = data[rows].apply(lambda x: fun(x), axis=1)

data.head(100)

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate,comment_text,was_used
0,00001cee341fdb12,-1,-1,-1,-1,-1,-1,Yo bitch Ja Rule is more succesful then you'll...,0
1,0000247867823ef7,-1,-1,-1,-1,-1,-1,== From RfC == \n\n The title is fine as it is...,0
2,00013b17ad220c46,-1,-1,-1,-1,-1,-1,""" \n\n == Sources == \n\n * Zawe Ashton on Lap...",0
3,00017563c3f7919a,-1,-1,-1,-1,-1,-1,":If you have a look back at the source, the in...",0
4,00017695ad8997eb,-1,-1,-1,-1,-1,-1,I don't anonymously edit articles at all.,0
...,...,...,...,...,...,...,...,...,...
95,0023f3f84f353bce,0,0,0,0,0,0,""" \n\n == Main towns that are not so main == \...",1
96,002586bdf3280356,-1,-1,-1,-1,-1,-1,""" \n\n my comments follow, bluewillow991967 -...",0
97,0025a91b6955f1a5,0,0,0,0,0,0,""" \n\n == Halliday == \n\n Good to see another...",1
98,0025c49d87d9a18f,0,0,0,0,0,0,""" \n ::: That Stephen Barrett is not Board Cer...",1


In [43]:
data = data[data['was_used'] != 0]

# Step 2: Drop column A from the DataFrame
data = data.drop(columns=['was_used'])

data.head(100)

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate,comment_text
5,0001ea8717f6de06,0,0,0,0,0,0,Thank you for understanding. I think very high...
7,000247e83dcc1211,0,0,0,0,0,0,:Dear god this site is horrible.
11,0002f87b16116a7f,0,0,0,0,0,0,"""::: Somebody will invariably try to add Relig..."
13,0003e1cccfd5a40a,0,0,0,0,0,0,""" \n\n It says it right there that it IS a typ..."
14,00059ace3e3e9a53,0,0,0,0,0,0,""" \n\n == Before adding a new product to the l..."
...,...,...,...,...,...,...,...,...
230,0065324079670f4d,0,0,0,0,0,0,""" \n\n P.S. IMHO, this all falls under the cat..."
233,006622a9f635ace3,0,0,0,0,0,0,REDIRECT Talk:Ponhook Lake 10
241,0069e74a5302bf10,0,0,0,0,0,0,""" \n\n ===Use of """"Nepal Bhasa""""=== \n The ter..."
244,006a703e116e7ee8,0,0,0,0,0,0,If the indigenous population is so low why do ...


In [44]:
def fun(x):
    if x.sum() != 0:
        return 1
    return 0

rows = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

data['hate'] = data[rows].apply(lambda x: fun(x), axis=1)

for i in rows:
    del data[i]
    
data

#print(data['hate'].sum())

Unnamed: 0,id,comment_text,hate
5,0001ea8717f6de06,Thank you for understanding. I think very high...,0
7,000247e83dcc1211,:Dear god this site is horrible.,0
11,0002f87b16116a7f,"""::: Somebody will invariably try to add Relig...",0
13,0003e1cccfd5a40a,""" \n\n It says it right there that it IS a typ...",0
14,00059ace3e3e9a53,""" \n\n == Before adding a new product to the l...",0
...,...,...,...
153150,fff8f64043129fa2,":Jerome, I see you never got around to this…! ...",0
153151,fff9d70fe0722906,==Lucky bastard== \n http://wikimediafoundatio...,0
153154,fffa8a11c4378854,==shame on you all!!!== \n\n You want to speak...,0
153155,fffac2a094c8e0e2,MEL GIBSON IS A NAZI BITCH WHO MAKES SHITTY MO...,1


In [45]:
print(data['hate'].sum())
print(data['hate'].value_counts().get(0, 0))


6243
57735


In [46]:
# Indizes für jede Klasse extrahieren
class_0_indices = [i for i, label in enumerate(data['hate']) if label == 0]
class_1_indices = [i for i, label in enumerate(data['hate']) if label == 1]

# Undersampling für Klasse 0
undersampled_class_0_indices = np.random.choice(class_0_indices, size=len(class_1_indices), replace=False)

# Zusammenführen der Indizes für ein ausgeglichenes Dataset
balanced_indices = list(undersampled_class_0_indices) + class_1_indices

data_balanced = data.iloc[balanced_indices]

data_balanced

Unnamed: 0,id,comment_text,hate
115004,bff0f2d9b01fd658,"== Help Required!! == \n\n Hey Strikeeagle, I ...",0
5550,095a0427914bd340,""" \n\n == Criteria for inclusion == \n\n A wel...",0
123874,ceddfb0e2fc4e18f,"== Category:UK Wikipedians == \n\n Hi, just to...",0
68753,726fb99e482ce463,== Talk:History of Sesame Street/ image review...,0
130587,da382d46fef6da26,REDIRECT Talk:Giraffe given to Charles X of Fr...,0
...,...,...,...
152908,ff91c3d8a3e34398,NIGEL IS A CRAZY IDIOT!!!,1
153049,ffd49b8defd069d0,""" \n ::Well, now don't I feel stupid.... · """,1
153078,ffdf6854b41d9102,==Fourth Baldrick possibly being cleverer than...,1
153119,ffebe90c8d5acaba,""" \n\n == IRAN == \n That’s right, Iran. It wa...",1


In [47]:
print(data_balanced['hate'].sum())
print(data_balanced['hate'].value_counts().get(0, 0))


6243
6243


In [None]:
data_balanced.to_csv("data/train/balanced_test.csv", index=False)

# Tokenization

Now we will tokenize the trainset. We will use the BERT tokenizer to tokenize the text. The tokenization will be done in the following way:

1. Tokenize the text
2. Add special tokens
3. Set the max length to 512
4. Truncate the text if it is longer than 512 tokens
5. Pad the text if it is shorter than 512 tokens
6. Return the tokenized text as PyTorch tensors
7. Save the tokenized trainset as a pickle file

In [None]:
from transformers import BertTokenizer
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)

In [None]:
def tokenize_data(text):
    try:
        return tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=512,
            truncation=True,
            padding='max_length',
            return_tensors='pt'
        )
    except Exception as e:
        print(f"Error tokenizing text: {e}")
        return None

apply the tokenization to the trainset and save the tokenized trainset

In [None]:
data_balanced = pd.read_csv("data/train/balanced_train.csv")
# This could take a while depending on the size of your dataset
tqdm.pandas(desc="Tokenizing data")
data_balanced['tokenized'] = data_balanced['comment_text'].progress_apply(tokenize_data)

# save dataframe as pickle
data_balanced.to_pickle('data/train/tokenized_train_balanced_data.pkl')

Tokenizing data:   0%|          | 0/32450 [00:00<?, ?it/s]

Error tokenizing text: Input nan is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers.
Error tokenizing text: Input nan is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers.
Error tokenizing text: Input nan is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers.


Now we have a preprocessed and tokenized trainset. We can use this trainset to train the model.

In [10]:
from google.colab import drive
drive.mount('/content/drive')
base_path = '/content/drive/My Drive/'
data_path = os.path.join(base_path, 'NLP/hate_speech_detection_pipeline/data/train/tokenized_train_balanced_data.pkl')

if os.path.exists(data_path):
    data = pd.read_pickle(data_path)
else:
    print("File not found. Please ensure the file path is correct and run the previous cell to create the file.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
