In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import nltk
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import StandardScaler

In [2]:
# Root of the project's data directory, relative to this notebook.
DATA_PATH = '../data/'

# Load the raw tab-separated dataset into a DataFrame.
# NOTE(review): the file appears to carry an unnamed leading index column,
# which pandas exposes as 'Unnamed: 0' (see the preview below).
raw_file = DATA_PATH + 'raw/filtered.tsv'
df = pd.read_csv(raw_file, sep='\t')

# Preview the first rows as the cell output.
df.head()

Unnamed: 0,id,reference,translation,similarity,lenght_diff,ref_tox,trn_tox
0,0,"If Alkar is flooding her with psychic waste, t...","if Alkar floods her with her mental waste, it ...",0.785171,0.010309,0.014195,0.981983
1,1,Now you're getting nasty.,you're becoming disgusting.,0.749687,0.071429,0.065473,0.999039
2,2,"Well, we could spare your life, for one.","well, we can spare your life.",0.919051,0.268293,0.213313,0.985068
3,3,"Ah! Monkey, you've got to snap out of it.","monkey, you have to wake up.",0.664333,0.309524,0.053362,0.994215
4,4,I've got orders to put her down.,I have orders to kill her.,0.726639,0.181818,0.009402,0.999348


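The columns most relevant for training the text de-toxification model are 'reference', 'translation', 'ref_tox', and 'trn_tox'. The next cell standardizes the numerical columns ('ref_tox', 'trn_tox', 'similarity', and 'lenght_diff') with `StandardScaler`.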


In [3]:
# Standardize the numerical columns in place: each column is rescaled with
# StandardScaler, so after this cell these columns hold standardized values
# rather than the raw scores read from the file.
# NOTE(review): the scaler is fit on the full dataset before the train/test
# split, so test rows contribute to the scaling statistics - confirm intended.
NUMERIC_COLUMNS = ['ref_tox', 'trn_tox', 'similarity', 'lenght_diff']

scaler = StandardScaler()
scaler.fit(df[NUMERIC_COLUMNS])
df[NUMERIC_COLUMNS] = scaler.transform(df[NUMERIC_COLUMNS])

# Preview the standardized frame as the cell output.
df.head()

Unnamed: 0,id,reference,translation,similarity,lenght_diff,ref_tox,trn_tox
0,0,"If Alkar is flooding her with psychic waste, t...","if Alkar floods her with her mental waste, it ...",0.288055,-1.363574,-1.152122,1.193047
1,1,Now you're getting nasty.,you're becoming disgusting.,-0.094744,-0.797951,-1.040057,1.230213
2,2,"Well, we could spare your life, for one.","well, we can spare your life.",1.732364,1.023913,-0.716958,1.199768
3,3,"Ah! Monkey, you've got to snap out of it.","monkey, you have to wake up.",-1.015544,1.405483,-1.066524,1.219701
4,4,I've got orders to put her down.,I have orders to kill her.,-0.343382,0.223641,-1.162597,1.230887


In [8]:
# Text preprocessing for both sentence columns:
#   1. lowercase the text;
#   2. strip every run of characters that is neither a word character nor
#      whitespace (punctuation, apostrophes, ...).
NON_WORD_PATTERN = r'[^\w\s]+'

for text_column in ('reference', 'translation'):
    lowered = df[text_column].str.lower()
    df[text_column] = lowered.str.replace(NON_WORD_PATTERN, '', regex=True)

# Preview the cleaned text as the cell output.
df.head()

Unnamed: 0,id,reference,translation,similarity,lenght_diff,ref_tox,trn_tox
0,0,if alkar is flooding her with psychic waste th...,if alkar floods her with her mental waste it w...,0.288055,-1.363574,-1.152122,1.193047
1,1,now youre getting nasty,youre becoming disgusting,-0.094744,-0.797951,-1.040057,1.230213
2,2,well we could spare your life for one,well we can spare your life,1.732364,1.023913,-0.716958,1.199768
3,3,ah monkey youve got to snap out of it,monkey you have to wake up,-1.015544,1.405483,-1.066524,1.219701
4,4,ive got orders to put her down,i have orders to kill her,-0.343382,0.223641,-1.162597,1.230887


Unnamed: 0,id,reference,translation,similarity,lenght_diff,ref_tox,trn_tox
0,0,if alkar is flooding her with psychic waste th...,if alkar floods her with her mental waste it w...,0.288055,-1.363574,-1.152122,1.193047
1,1,now youre getting nasty,youre becoming disgusting,-0.094744,-0.797951,-1.040057,1.230213
2,2,well we could spare your life for one,well we can spare your life,1.732364,1.023913,-0.716958,1.199768
3,3,ah monkey youve got to snap out of it,monkey you have to wake up,-1.015544,1.405483,-1.066524,1.219701
4,4,ive got orders to put her down,i have orders to kill her,-0.343382,0.223641,-1.162597,1.230887


Split the data into 80% for training and 20% for testing

In [11]:
# Split the data into a training set (80%) and a test set (20%).
# The 'id' column is dropped before splitting; the fixed random_state makes
# the shuffle reproducible across runs.
# NOTE(review): 'Unnamed: 0' is kept and will travel into both splits -
# confirm whether downstream code needs it.
df_without_id = df.drop(columns=['id'])
train_data, test_data = train_test_split(
    df_without_id,
    test_size=0.2,
    random_state=42,
)


In [12]:
# Write each split to its own CSV file under the interim data directory.
# index=False keeps the DataFrame's row index out of the files.
interim_outputs = {
    'interim/train.csv': train_data,
    'interim/test.csv': test_data,
}
for relative_path, split_frame in interim_outputs.items():
    split_frame.to_csv(DATA_PATH + relative_path, index=False)


In [13]:
# Cut-off applied to the raw (un-standardized) reference toxicity score.
TOXICITY_THRESHOLD = 0.9

# df['ref_tox'] was overwritten with standardized values earlier in the
# notebook, so comparing it with the threshold directly would compare a
# z-score against a raw-score cut-off and select the wrong rows.
# Re-read only the untouched 'ref_tox' column from the raw file and build the
# row mask from it instead.
raw_ref_tox = pd.read_csv(
    DATA_PATH + 'raw/filtered.tsv', delimiter='\t', usecols=['ref_tox']
)['ref_tox']

# Align the mask with df's index explicitly; rows without a raw score compare
# as False and are therefore excluded.
is_most_toxic = raw_ref_tox.reindex(df.index) > TOXICITY_THRESHOLD

# Keep all columns of df (still in their preprocessed form) for the selected
# rows and save them without the row index.
filtered_df = df[is_most_toxic]
filtered_df.to_csv(DATA_PATH + 'interim/most_toxic_data.csv', index=False)