In [1]:
import pandas as pd
import re
import string
from tqdm.auto import tqdm

## Preprocessing the testset

Now we will prepare the testset.

1. collapse the six per-category labels into a single `label` column (1 if any category is flagged, 0 if none, -1 if unlabeled)
2. remove all rows with the label -1, which means the text is not labeled
3. clean the text
4. save the preprocessed testset

In [2]:
def fun(x):
    """Collapse a group of per-category labels into one value.

    Args:
        x: Any object exposing ``.sum()`` (here: one row of label columns).

    Returns:
        1 if the sum is positive, 0 if the sum is exactly zero, and -1
        in every other case (e.g. a negative sum).
    """
    total = x.sum()
    if total > 0:
        return 1
    return 0 if total == 0 else -1

In [3]:
# Load the raw test comments and the separately shipped label file.
data = pd.read_csv("../data/test/test.csv")
labels = pd.read_csv("../data/test_labels/test_labels.csv")

# The six per-category label columns (these are column names, despite the variable name).
rows = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Reduce each row of per-category labels to a single value via `fun`.
# NOTE(review): the result is attached to `data` by row index, not by `id` --
# confirm both files list the comments in the same order.
data['label'] = labels[rows].apply(fun, axis=1)

# The per-category columns are no longer needed on `labels`; drop them in place.
labels.drop(columns=rows, inplace=True)

data.head()

Unnamed: 0,id,comment_text,label
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...,-1
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...,-1
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap...",-1
3,00017563c3f7919a,":If you have a look back at the source, the in...",-1
4,00017695ad8997eb,I don't anonymously edit articles at all.,-1


In [4]:
# Keep only the labeled rows (a label of -1 marks text that was not labeled).
# .copy() makes the filtered result an independent DataFrame, so the later
# assignment to data['comment_text'] does not trigger SettingWithCopyWarning.
data = data[data['label'] != -1].copy()
data.head()

Unnamed: 0,id,comment_text,label
5,0001ea8717f6de06,Thank you for understanding. I think very high...,0
7,000247e83dcc1211,:Dear god this site is horrible.,0
11,0002f87b16116a7f,"""::: Somebody will invariably try to add Relig...",0
13,0003e1cccfd5a40a,""" \n\n It says it right there that it IS a typ...",0
14,00059ace3e3e9a53,""" \n\n == Before adding a new product to the l...",0


In [5]:
def clean_text(text):
    """Strip markup, URLs, punctuation and digit-bearing tokens from a string.

    The substitutions run in this order, each on the result of the previous:

    1. Remove square-bracketed spans such as ``[citation]`` (non-greedy).
    2. Remove URLs starting with ``http://``, ``https://`` or ``www.``.
    3. Remove angle-bracketed spans such as ``<b>``.
    4. Remove every character in ``string.punctuation``.
    5. Replace each newline with a single space.
    6. Remove every word that contains at least one digit.

    Case is left untouched and runs of whitespace are not collapsed.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # Raw strings keep the regex escapes (\[, \S, \w, \d) from being treated
    # as invalid Python string escapes, which warn on recent Python versions.

    # A '[' followed by as few characters as possible up to the next ']'.
    text = re.sub(r'\[.*?\]', '', text)

    # 'http://' or 'https://' plus trailing non-space chars, or 'www.' likewise.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    # A '<', as few characters as possible, then one or more '>'.
    text = re.sub(r'<.*?>+', '', text)

    # Any single ASCII punctuation character.
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)

    # Newline -> space, so lines do not get glued together.
    text = re.sub(r'\n', ' ', text)

    # Any run of word characters that contains at least one digit.
    text = re.sub(r'\w*\d\w*', '', text)

    return text

In [6]:
# Run every comment through clean_text and write the result back to the same column.
cleaned_comments = data['comment_text'].apply(clean_text)
data['comment_text'] = cleaned_comments
data.head()

Unnamed: 0,id,comment_text,label
5,0001ea8717f6de06,Thank you for understanding I think very highl...,0
7,000247e83dcc1211,Dear god this site is horrible,0
11,0002f87b16116a7f,Somebody will invariably try to add Religion ...,0
13,0003e1cccfd5a40a,It says it right there that it IS a type T...,0
14,00059ace3e3e9a53,Before adding a new product to the list m...,0


In [7]:
# Persist the cleaned, labeled test set; the DataFrame index is not written.
clean_test_path = "../data/test/clean_test.csv"
data.to_csv(clean_test_path, index=False)