# Download ParaNMT Dataset and transform it

## Downloading data

### We download the zip file to our local directory

In [1]:
import os
import zipfile

import pandas as pd
import requests
from sklearn.model_selection import train_test_split

In [2]:
# Remote location of the filtered ParaNMT archive and the local path it is saved to.
zip_filename = "../data/raw/filtered_paranmt.zip"
url = "https://github.com/skoltech-nlp/detox/releases/download/emnlp2021/filtered_paranmt.zip"

# Make sure the target directory exists before anything is written into it.
os.makedirs(os.path.dirname(zip_filename), exist_ok=True)

The `response` variable holds the HTTP response returned for the URL above. Its body is the ZIP archive, which is saved to `zip_filename` before `filtered.tsv` is read from it.

In [3]:
# Download the archive only when a valid copy is not already on disk.
# zipfile.is_zipfile() returns False for a missing, truncated or non-zip file,
# so a broken earlier download is fetched again instead of being reused.
if not zipfile.is_zipfile(zip_filename):
    # Stream into a temporary name so an interrupted download never leaves a
    # half-written file at zip_filename.
    partial_filename = zip_filename + ".part"
    with requests.get(url, stream=True, timeout=60) as response:
        # Fail loudly on 4xx/5xx instead of saving an error page as a ".zip".
        response.raise_for_status()
        with open(partial_filename, "wb") as zip_file:
            # Write in 1 MiB chunks rather than holding the whole body in memory.
            for chunk in response.iter_content(chunk_size=1 << 20):
                zip_file.write(chunk)
    os.replace(partial_filename, zip_filename)

# Read the TSV straight out of the archive without extracting it to disk.
with zipfile.ZipFile(zip_filename, "r") as zip_ref:
    with zip_ref.open("filtered.tsv") as file:
        df = pd.read_csv(file, sep='\t')

In [4]:
# Preview the first five rows to confirm the TSV was parsed into columns.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,reference,translation,similarity,lenght_diff,ref_tox,trn_tox
0,0,"If Alkar is flooding her with psychic waste, t...","if Alkar floods her with her mental waste, it ...",0.785171,0.010309,0.014195,0.981983
1,1,Now you're getting nasty.,you're becoming disgusting.,0.749687,0.071429,0.065473,0.999039
2,2,"Well, we could spare your life, for one.","well, we can spare your life.",0.919051,0.268293,0.213313,0.985068
3,3,"Ah! Monkey, you've got to snap out of it.","monkey, you have to wake up.",0.664333,0.309524,0.053362,0.994215
4,4,I've got orders to put her down.,I have orders to kill her.,0.726639,0.181818,0.009402,0.999348


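Let's save our `df` untouched in a .csv file. (Note: the cell below does not write a file; it only selects the rows where `ref_tox` is greater than `trn_tox`.)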

In [5]:
# Keep only the rows where ref_tox is strictly greater than trn_tox.
tox_gap = df['ref_tox'] - df['trn_tox']
train_data = df[tox_gap > 0]

In [6]:
# Print a summary of train_data: number of rows, column dtypes, non-null counts
# and memory usage.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 319142 entries, 5 to 577775
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   Unnamed: 0   319142 non-null  int64  
 1   reference    319142 non-null  object 
 2   translation  319142 non-null  object 
 3   similarity   319142 non-null  float64
 4   lenght_diff  319142 non-null  float64
 5   ref_tox      319142 non-null  float64
 6   trn_tox      319142 non-null  float64
dtypes: float64(4), int64(1), object(2)
memory usage: 19.5+ MB


We need to check whether there are any empty fields in `df`.

In [7]:
# Give the 'Unnamed: 0' column a meaningful name, then print the per-column
# non-null counts and dtypes of the full frame.
column_renames = {'Unnamed: 0': 'id'}
df = df.rename(columns=column_renames)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 577777 entries, 0 to 577776
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   id           577777 non-null  int64  
 1   reference    577777 non-null  object 
 2   translation  577777 non-null  object 
 3   similarity   577777 non-null  float64
 4   lenght_diff  577777 non-null  float64
 5   ref_tox      577777 non-null  float64
 6   trn_tox      577777 non-null  float64
dtypes: float64(4), int64(1), object(2)
memory usage: 30.9+ MB


In [8]:
# Hold out 30% of the rows, then cut that hold-out in half, giving roughly
# 70% train / 15% validation / 15% test.
# random_state is fixed so the split is reproducible across kernel restarts.
train_data, temp_data = train_test_split(df, test_size=0.3, random_state=42)
valid_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

# Sanity check: the three splits together must account for every row of df.
assert len(train_data) + len(valid_data) + len(test_data) == len(df)


In [9]:
# Display the test split without the listed columns. The result is only shown,
# not assigned, so test_data itself keeps all of its columns.
# "lenght_diff" is the column name exactly as it appears in the data.
hidden_columns = ["translation", "similarity", "lenght_diff", "trn_tox"]
test_data.drop(hidden_columns, axis="columns")

Unnamed: 0,id,reference,ref_tox
383388,383388,"Sucker, I thought you was dead!",0.987744
540158,540158,They'll kill Immortals dead.,0.999228
440764,440764,75% of these big railroad men have bad-hearted...,0.014735
525024,525024,Shut up and eat it.,0.999146
553451,553451,Here's a tip from an evil New Yorker.,0.671870
...,...,...,...
506501,506501,This long flight of stairs is killing me.,0.001772
545661,545661,Kill him!' Saruman looked round at their hosti...,0.997867
373160,373160,And they will seek to destroy him.,0.288646
441522,441522,"Everyone's signed this petition, even the ones...",0.028304


In [10]:
# Print a summary of the training split: number of rows, column dtypes,
# non-null counts and memory usage.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 404443 entries, 182993 to 121958
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   id           404443 non-null  int64  
 1   reference    404443 non-null  object 
 2   translation  404443 non-null  object 
 3   similarity   404443 non-null  float64
 4   lenght_diff  404443 non-null  float64
 5   ref_tox      404443 non-null  float64
 6   trn_tox      404443 non-null  float64
dtypes: float64(4), int64(1), object(2)
memory usage: 24.7+ MB


In [11]:
# Restrict the training split to rows whose ref_tox is strictly greater than
# trn_tox. Only train_data is reassigned in this cell.
tox_drop = train_data['ref_tox'] - train_data['trn_tox']
train_data = train_data.loc[tox_drop > 0]


In [12]:
# Print the summary again to see how many training rows remain after filtering.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 223638 entries, 182993 to 121958
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   id           223638 non-null  int64  
 1   reference    223638 non-null  object 
 2   translation  223638 non-null  object 
 3   similarity   223638 non-null  float64
 4   lenght_diff  223638 non-null  float64
 5   ref_tox      223638 non-null  float64
 6   trn_tox      223638 non-null  float64
dtypes: float64(4), int64(1), object(2)
memory usage: 13.6+ MB
