In [9]:
import pandas as pd
import re
import string
from tqdm.auto import tqdm
import os
import platform

## Preprocessing the testset

Now we will prepare the testset.

1. concatenate the labels to one column
2. remove all rows with the label -1, which means the text is not labeled
3. clean the text
4. save the preprocessed testset

In [19]:
# Load the raw test comments and preview the first rows.
data = pd.read_csv("../data/test/test.csv")
data.head()

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


In [20]:
# Load the per-category labels that belong to the test comments and preview them.
labels = pd.read_csv("../data/test_labels/test_labels.csv")
labels.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,-1,-1,-1,-1,-1,-1
1,0000247867823ef7,-1,-1,-1,-1,-1,-1
2,00013b17ad220c46,-1,-1,-1,-1,-1,-1
3,00017563c3f7919a,-1,-1,-1,-1,-1,-1
4,00017695ad8997eb,-1,-1,-1,-1,-1,-1


The value -1 means the text is not labeled. We will combine the six label columns into a single column and then remove all rows marked -1. If the sum of a row's labels is greater than 0, the text is labeled as hate speech (1); if the sum is 0, the text is labeled as not hate speech (0); otherwise the row is marked as unlabeled (-1).

In [2]:
def fun(x):
    """Collapse one row of category labels into a single marker.

    Parameters
    ----------
    x : pandas.Series
        The category flags of one comment (values -1, 0 or 1).

    Returns
    -------
    int
        -1 if any flag is negative (the comment is unlabeled),
         1 if at least one flag is positive (hate speech),
         0 otherwise (not hate speech).
    """
    # Check for the "unlabeled" marker first: relying on the row sum would let
    # a mix of -1 and 1 cancel out to 0 and be reported as "not hate speech".
    if (x < 0).any():
        return -1
    if (x > 0).any():
        return 1
    return 0

In [3]:
rows = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# The labels are attached by row index, so both files must list the same ids
# in the same order; fail loudly instead of silently mislabelling comments.
assert data['id'].equals(labels['id']), (
    "test.csv and test_labels.csv are not aligned on 'id'"
)

# One marker per comment: 1 = hate speech, 0 = not hate speech, -1 = unlabeled (see `fun`).
data['label'] = labels[rows].apply(fun, axis=1)

# The per-category columns are no longer needed once they are collapsed.
labels = labels.drop(columns=rows)

data.head()

Unnamed: 0,id,comment_text,label
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...,-1
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...,-1
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap...",-1
3,00017563c3f7919a,":If you have a look back at the source, the in...",-1
4,00017695ad8997eb,I don't anonymously edit articles at all.,-1


Now we remove all rows with the label -1.

In [4]:
# Keep only the comments that were actually labeled. `.copy()` makes the
# result an independent frame, so the column assignment performed during text
# cleaning does not operate on a slice of the unfiltered data
# (avoids pandas' SettingWithCopyWarning).
data = data[data['label'] != -1].copy()
data.head()

Unnamed: 0,id,comment_text,label
5,0001ea8717f6de06,Thank you for understanding. I think very high...,0
7,000247e83dcc1211,:Dear god this site is horrible.,0
11,0002f87b16116a7f,"""::: Somebody will invariably try to add Relig...",0
13,0003e1cccfd5a40a,""" \n\n It says it right there that it IS a typ...",0
14,00059ace3e3e9a53,""" \n\n == Before adding a new product to the l...",0


For text cleaning, we remove text in square brackets, links, HTML tags, punctuation, line breaks, and words that contain digits. Apart from that, we keep the text as close to the original as possible, because BERT does not require any further preprocessing. Stopwords are not removed because they can be important for the model to understand the context of the text.

In [5]:
# Patterns are compiled once and written as raw strings so that `\[`, `\S`,
# `\w` and `\d` reach the regex engine as intended instead of being parsed as
# (invalid) Python string escapes.
_BRACKETED_RE = re.compile(r'\[.*?\]')
_URL_RE = re.compile(r'https?://\S+|www\.\S+')
_TAG_RE = re.compile(r'<.*?>+')
_PUNCTUATION_RE = re.compile('[%s]' % re.escape(string.punctuation))
_DIGIT_WORD_RE = re.compile(r'\w*\d\w*')


def clean_text(text):
    """Strip markup, links, punctuation and digit-bearing tokens from a comment.

    The steps run in this order:

    1. remove anything enclosed in square brackets (on a single line),
    2. remove URLs starting with ``http://``, ``https://`` or ``www.``,
    3. remove ``<...>`` tags,
    4. remove every character in ``string.punctuation``,
    5. replace newlines with a single space,
    6. remove every word that contains at least one digit.

    Whitespace left behind by removed pieces is not collapsed.

    Parameters
    ----------
    text : str
        The raw comment.

    Returns
    -------
    str
        The cleaned comment.
    """
    text = _BRACKETED_RE.sub('', text)

    # URLs must go before punctuation is stripped; otherwise "://" and "."
    # would vanish and the remaining fragments could no longer be recognised.
    text = _URL_RE.sub('', text)

    text = _TAG_RE.sub('', text)
    text = _PUNCTUATION_RE.sub('', text)

    text = text.replace('\n', ' ')

    # Drops the whole token (letters included) as soon as it contains a digit.
    text = _DIGIT_WORD_RE.sub('', text)

    return text

In [6]:
# Run every test comment through `clean_text` and preview the result.
data['comment_text'] = data['comment_text'].map(clean_text)
data.head()

Unnamed: 0,id,comment_text,label
5,0001ea8717f6de06,Thank you for understanding I think very highl...,0
7,000247e83dcc1211,Dear god this site is horrible,0
11,0002f87b16116a7f,Somebody will invariably try to add Religion ...,0
13,0003e1cccfd5a40a,It says it right there that it IS a type T...,0
14,00059ace3e3e9a53,Before adding a new product to the list m...,0


Save the preprocessed test set.

In [7]:
# Persist the cleaned, labeled test set; index=False keeps the row index out of the CSV.
data.to_csv("../data/test/clean_test.csv", index=False)

## Preprocessing the trainset for the model

Now we will prepare the trainset for the model.

1. clean the text
2. concatenate the labels to one column
3. save the preprocessed trainset

First we will load the trainset and check the operating system to load the data from the correct path.

In [10]:
def check_os_and_load_data(filename, data_dir=None):
    """Load a CSV file from the machine-specific training-data directory.

    Parameters
    ----------
    filename : str
        Name of the CSV file (no directory part).
    data_dir : str or os.PathLike, optional
        Directory that contains ``filename``. When given, it is used on every
        operating system and the OS-specific defaults below are skipped.

    Returns
    -------
    pandas.DataFrame or None
        The loaded data. ``None`` is returned only when ``data_dir`` is not
        given and the operating system is neither Windows nor Linux.

    Notes
    -----
    Without ``data_dir`` the location depends on ``platform.system()``:
    ``../data/train`` on Windows and a fixed ownCloud folder below the user's
    home directory on Linux.
    """
    system = platform.system()

    if data_dir is not None:
        # Explicit location: works regardless of the operating system.
        path = os.path.join(data_dir, filename)

    elif system == "Windows":
        print("This is a Windows system. Running Windows-specific code.")
        # Assuming the directory path for Windows is '../data/train/'
        path = os.path.join('..', 'data', 'train', filename)

    elif system == "Linux":
        print("This is a Linux system. Running Linux-specific code.")
        # Assuming the directory path for Linux is the home directory
        path = os.path.join(os.path.expanduser('~'), 'ownCloud - Michael Saxer (zhaw.ch)@drive.switch.ch', '4 Semester', 'NLP', 'workbench', filename)

    else:
        print("This is neither a Windows nor a Linux system. You're on your own, sorry.")
        # No default location is known here; pass `data_dir` to load the file.
        return None

    return pd.read_csv(path, encoding='utf-8')

# Pass only the filename, not the path; check_os_and_load_data picks the
# directory based on the operating system.
# NOTE: this rebinds `data`, which held the test set above, to the training set.
data = check_os_and_load_data('train.csv')

This is a Windows system. Running Windows-specific code.


In [11]:
# Preview the first eight raw training rows.
data.head(8)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
5,00025465d4725e87,"""\n\nCongratulations from me as well, use the ...",0,0,0,0,0,0
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
7,00031b1e95af7921,Your vandalism to the Matt Shirvington article...,0,0,0,0,0,0


Now clean the trainset

In [12]:
# Run every training comment through `clean_text` and preview the result.
data['comment_text'] = data['comment_text'].map(clean_text)
data.head(8)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation Why the edits made under my userna...,0,0,0,0,0,0
1,000103f0d9cfb60f,Daww He matches this background colour Im seem...,0,0,0,0,0,0
2,000113f07ec002fd,Hey man Im really not trying to edit war Its j...,0,0,0,0,0,0
3,0001b41b1c6bb37e,More I cant make any real suggestions on impr...,0,0,0,0,0,0
4,0001d958c54c6e35,You sir are my hero Any chance you remember wh...,0,0,0,0,0,0
5,00025465d4725e87,Congratulations from me as well use the tool...,0,0,0,0,0,0
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
7,00031b1e95af7921,Your vandalism to the Matt Shirvington article...,0,0,0,0,0,0


Now we will combine the six label columns into one column. If the sum of the labels is greater than 0, the text is labeled as hate speech (1). If the sum is 0, the text is labeled as not hate speech (0).

In [13]:
def concat(x):
    """Collapse one row of category flags into a binary target.

    Returns 1 when the sum of the row is non-zero and 0 when it is zero.
    """
    return 1 if x.sum() != 0 else 0

rows = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Fold the six category flags of each comment into a single `hate` column.
data['hate'] = data[rows].apply(concat, axis=1)

# The individual category columns are redundant once `hate` exists.
data.drop(columns=rows, inplace=True)

In [14]:
# Preview the cleaned training rows together with the combined `hate` target.
data.head(10)

Unnamed: 0,id,comment_text,hate
0,0000997932d777bf,Explanation Why the edits made under my userna...,0
1,000103f0d9cfb60f,Daww He matches this background colour Im seem...,0
2,000113f07ec002fd,Hey man Im really not trying to edit war Its j...,0
3,0001b41b1c6bb37e,More I cant make any real suggestions on impr...,0
4,0001d958c54c6e35,You sir are my hero Any chance you remember wh...,0
5,00025465d4725e87,Congratulations from me as well use the tool...,0
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1
7,00031b1e95af7921,Your vandalism to the Matt Shirvington article...,0
8,00037261f536c51d,Sorry if the word nonsense was offensive to yo...,0
9,00040093b2687caa,alignment on this subject and which are contra...,0


Save the preprocessed training set.

In [15]:
# Persist the cleaned training set; index=False keeps the row index out of the CSV.
data.to_csv("../data/train/clean_train.csv", index=False)

# Tokenization

Now we will tokenize the trainset. We will use the BERT tokenizer to tokenize the text. The tokenization will be done in the following way:

1. Tokenize the text
2. Add special tokens
3. Set the max length to 512
4. Truncate the text if it is longer than 512 tokens
5. Pad the text if it is shorter than 512 tokens
6. Return the tokenized text as PyTorch tensors
7. Save the tokenized trainset as a pickle file

In [16]:
from transformers import BertTokenizer
# Name of the pretrained checkpoint whose tokenizer is loaded below.
model_name = 'bert-base-uncased'
# Module-level tokenizer; tokenize_data reads it as a global.
tokenizer = BertTokenizer.from_pretrained(model_name)

In [17]:
def tokenize_data(text, max_length=512):
    """Tokenize one comment with the module-level ``tokenizer``.

    The text is wrapped in special tokens, truncated to ``max_length`` tokens
    and padded up to ``max_length``.

    Parameters
    ----------
    text : str
        The (cleaned) comment to tokenize.
    max_length : int, optional
        Length every encoded sequence is truncated/padded to (default 512).

    Returns
    -------
    The tokenizer's encoding with PyTorch tensors (``return_tensors='pt'``),
    or ``None`` if tokenization raised an exception.

    Notes
    -----
    Record these settings in the model info file before saving hidden states.
    """
    try:
        # Calling the tokenizer directly replaces the deprecated `encode_plus`.
        return tokenizer(
            text,
            add_special_tokens=True,
            max_length=max_length,
            truncation=True,
            padding='max_length',
            return_tensors='pt'
        )
    except Exception as e:
        # Best effort: report the failure and mark the row with None so one bad
        # comment does not abort tokenization of the whole dataset.
        print(f"Error tokenizing text: {e}")
        return None

Apply the tokenization to the training set and save the tokenized training set.

In [18]:
# This could take a while depending on the size of your dataset
# tqdm.pandas() enables `progress_apply`, which behaves like `apply` but shows a progress bar.
tqdm.pandas(desc="Tokenizing data")
# Rows whose tokenization failed hold None (see tokenize_data).
data['tokenized'] = data['comment_text'].progress_apply(tokenize_data)

# save dataframe as pickle
data.to_pickle('../data/train/tokenized_train_data.pkl')

Tokenizing data:   0%|          | 0/159571 [00:00<?, ?it/s]