# Main Dataset
It is a filtered ParaNMT-detox corpus (500K sentence pairs). The data is given in .tsv format, meaning that columns are separated by the `\t` (tab) character.

### Basic setups

In [1]:
# Install the third-party packages used by this notebook (shell commands run from the notebook).
# NOTE(review): no versions are pinned, so a re-run may resolve different releases than the ones in the log below.
!pip install transformers
!pip install spacy
# Fetch the spaCy pipeline package `en_core_web_md`
!python -m spacy download en_core_web_md
!pip install annoy
!pip install sentence_transformers

Collecting transformers
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m47.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m39.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m61.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m91.4 MB/s[0m eta [36m0:00:00[0m
Col

In [27]:
# visualization library
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random

# pytorch library
import torch # the main pytorch library
import torch.nn.functional as f # the sub-library containing different functions for manipulating with tensors

# huggingface's transformers library
from transformers import BertModel, BertTokenizer

import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

import pandas as pd
import zipfile
from urllib.request import urlretrieve

from nltk.tokenize import word_tokenize
import string
import tqdm
import pickle

In [49]:
from sentence_transformers import SentenceTransformer, util
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re
import string

import nltk
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
def set_seed(seed):
  """Seed the `random`, NumPy and PyTorch generators with the same value.

  When CUDA is available, the generators of all CUDA devices are seeded too.
  """
  for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(seed)
  if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

set_seed(42)


### Download data


In [4]:
# Download data from url to file
# NOTE(review): `fname` is an absolute path under /content — presumably a Google Colab runtime; adjust when running elsewhere.
fname = "/content/filtered_paranmt.zip"

# Location of the zipped corpus (GitHub release asset)
url = "https://github.com/skoltech-nlp/detox/releases/download/emnlp2021/filtered_paranmt.zip"
# Saves the archive to `fname`; the returned (filename, headers) tuple is shown as the cell output
urlretrieve(url, fname)

('/content/filtered_paranmt.zip', <http.client.HTTPMessage at 0x7b5477940730>)

In [5]:
# Open the downloaded archive and pull out only the "filtered.tsv" member
with zipfile.ZipFile(fname, mode='r') as archive:
    archive.extract("filtered.tsv")

In [6]:
# Load the extracted tab-separated file; the first column becomes the index
tsv_options = {"sep": "\t", "index_col": 0}
data = pd.read_csv("filtered.tsv", **tsv_options)
data

Unnamed: 0,reference,translation,similarity,lenght_diff,ref_tox,trn_tox
0,"If Alkar is flooding her with psychic waste, t...","if Alkar floods her with her mental waste, it ...",0.785171,0.010309,0.014195,0.981983
1,Now you're getting nasty.,you're becoming disgusting.,0.749687,0.071429,0.065473,0.999039
2,"Well, we could spare your life, for one.","well, we can spare your life.",0.919051,0.268293,0.213313,0.985068
3,"Ah! Monkey, you've got to snap out of it.","monkey, you have to wake up.",0.664333,0.309524,0.053362,0.994215
4,I've got orders to put her down.,I have orders to kill her.,0.726639,0.181818,0.009402,0.999348
...,...,...,...,...,...,...
577772,You didn't know that Estelle had stolen some f...,you didn't know that Estelle stole your fish f...,0.870322,0.030769,0.000121,0.949143
577773,It'il suck the life out of you!,you'd be sucked out of your life!,0.722897,0.058824,0.996124,0.215794
577774,"I can't fuckin' take that, bruv.",I really can't take this.,0.617511,0.212121,0.984538,0.000049
577775,They called me a fucking hero. The truth is I ...,"they said I was a hero, but I didn't care.",0.679613,0.358209,0.991945,0.000124


Correct the misspelled column name (`lenght_diff` → `length_diff`)

In [15]:
# The source file spells the column "lenght_diff"; rename it in place to "length_diff"
column_fixes = {"lenght_diff": "length_diff"}
data.rename(columns=column_fixes, inplace=True)
data

Unnamed: 0,reference,translation,similarity,length_diff,ref_tox,trn_tox
0,"If Alkar is flooding her with psychic waste, t...","if Alkar floods her with her mental waste, it ...",0.785171,0.010309,0.014195,0.981983
1,Now you're getting nasty.,you're becoming disgusting.,0.749687,0.071429,0.065473,0.999039
2,"Well, we could spare your life, for one.","well, we can spare your life.",0.919051,0.268293,0.213313,0.985068
3,"Ah! Monkey, you've got to snap out of it.","monkey, you have to wake up.",0.664333,0.309524,0.053362,0.994215
4,I've got orders to put her down.,I have orders to kill her.,0.726639,0.181818,0.009402,0.999348
...,...,...,...,...,...,...
577772,You didn't know that Estelle had stolen some f...,you didn't know that Estelle stole your fish f...,0.870322,0.030769,0.000121,0.949143
577773,It'il suck the life out of you!,you'd be sucked out of your life!,0.722897,0.058824,0.996124,0.215794
577774,"I can't fuckin' take that, bruv.",I really can't take this.,0.617511,0.212121,0.984538,0.000049
577775,They called me a fucking hero. The truth is I ...,"they said I was a hero, but I didn't care.",0.679613,0.358209,0.991945,0.000124


### Data analysis

The dataset does not contain dissimilar pairs: no pair has a similarity below 0.5

In [7]:
# Positions of pairs whose similarity is below 0.5 (an empty array means there are none)
low_similarity = data["similarity"] < 0.5
np.where(low_similarity.to_numpy())

(array([], dtype=int64),)

Not all reference sentences are toxic

In [10]:
# Positions of rows whose reference sentence has a toxicity score below 0.5
reference_below_half = data["ref_tox"] < 0.5
np.where(reference_below_half.to_numpy())

(array([     0,      1,      2, ..., 577766, 577772, 577776]),)

Not all translation sentences are non-toxic

In [11]:
# Positions of rows whose translation sentence has a toxicity score above 0.5
translation_above_half = data["trn_tox"] > 0.5
np.where(translation_above_half.to_numpy())

(array([     0,      1,      2, ..., 577766, 577772, 577776]),)

In [13]:
# Range of the length difference column as (minimum, maximum)
length_diff = data["length_diff"]
np.min(length_diff), np.max(length_diff)

(0.4, 0.0)

### Data preprocessing

To work with this data, I need to create a dataframe in which each row is a pair of "toxic" and "non-toxic" sentences.

In [16]:
# Final dataset: one row per sentence pair, with the toxic sentence as `input_text`
# and its non-toxic counterpart as `target_text`.
# The selection is vectorised instead of looping over rows with `iterrows`;
# it keeps the same rows in the same order.
TOXICITY_THRESHOLD = 0.5

# Which side of each pair is the toxic one. Rows matching neither mask
# (both scores on the same side of the threshold, or a missing score) are skipped.
ref_is_toxic = (data['ref_tox'] > TOXICITY_THRESHOLD) & (data['trn_tox'] < TOXICITY_THRESHOLD)
trn_is_toxic = (data['ref_tox'] < TOXICITY_THRESHOLD) & (data['trn_tox'] > TOXICITY_THRESHOLD)

# Take the reference as input where it is the toxic side, otherwise the translation
# (and the opposite sentence as the target).
input_text = data['reference'].where(ref_is_toxic, data['translation'])
target_text = data['translation'].where(ref_is_toxic, data['reference'])

# Keep only classified pairs whose toxic sentence is non-empty; a missing (NaN)
# input sentence yields False here, so it is skipped rather than raising.
keep = (ref_is_toxic | trn_is_toxic) & (input_text.str.len() > 0)

# `.to_numpy()` drops the original row labels so the result gets a fresh 0..n-1 index
df = pd.DataFrame({
    'input_text': input_text[keep].to_numpy(),
    'target_text': target_text[keep].to_numpy(),
})
df

Unnamed: 0,input_text,target_text
0,"if Alkar floods her with her mental waste, it ...","If Alkar is flooding her with psychic waste, t..."
1,you're becoming disgusting.,Now you're getting nasty.
2,"well, we can spare your life.","Well, we could spare your life, for one."
3,"monkey, you have to wake up.","Ah! Monkey, you've got to snap out of it."
4,I have orders to kill her.,I've got orders to put her down.
...,...,...
577772,you didn't know that Estelle stole your fish f...,You didn't know that Estelle had stolen some f...
577773,It'il suck the life out of you!,you'd be sucked out of your life!
577774,"I can't fuckin' take that, bruv.",I really can't take this.
577775,They called me a fucking hero. The truth is I ...,"they said I was a hero, but I didn't care."


Split the dataset into train, evaluation and test sets

In [21]:
from sklearn.model_selection import train_test_split

# First cut: 80% train, 20% held out
train_df, holdout_df = train_test_split(df, test_size=0.2, random_state=10, shuffle=True)
# Second cut: divide the held-out part equally between test and evaluation
test_df, eval_df = train_test_split(holdout_df, test_size=0.5, random_state=10, shuffle=True)
test_df.shape

(57778, 2)

In [23]:
# Give every split a fresh 0..n-1 index in place; the old index is discarded
for split_df in (train_df, eval_df, test_df):
    split_df.reset_index(drop=True, inplace=True)
train_df

Unnamed: 0,input_text,target_text
0,"I knew mushrooms, and I don't give a shit. """,I didn't know and I don't much care.”
1,Keep on lying like that. You're liable to get ...,you can keep lying like that and you'll be abl...
2,I was probably worth killing then.,I probably stood up for the assassination then.
3,It's bad enough we scared the shit out of them...,we already scared them off with the T-shirt wi...
4,"wake up, you corpses!",Heave and wake the dead!
...,...,...
462216,"Look, suck it, dude.","look, bite it, man."
462217,And if I have to look at that stupid picture o...,and if I have to take another look at the phot...
462218,I miss working with her because she's so godda...,I miss working with her because she does damn ...
462219,"Wow, this place is fuckin' incredible.","wow, this is incredible."


### Save data

In [25]:
# Write each split to "<file_name><suffix>" in the working directory
file_name = "main_model_"
for suffix, split_df in (("train.csv", train_df), ("eval.csv", eval_df), ("test.csv", test_df)):
    split_df.to_csv(file_name + suffix)

### Future preprocessing

Before sentences are passed to the embedding and the model, they should be tokenized, lowercased, and stripped of punctuation and extra whitespace.

In [None]:
def preprocess(sent):
    """
    Preprocess a sentence of the dataset.

    The sentence is tokenized with `word_tokenize`; each token then has its
    punctuation removed, is stripped of surrounding whitespace and lowercased.
    Tokens that end up empty (e.g. pure punctuation) are dropped.
    Stop words are NOT removed.

    :param sent: sentence to preprocess
    :return: list of preprocessed words from the sent; an empty list if
             tokenization fails
    """
    try:
        words = word_tokenize(sent)
    except Exception:
        # Best effort: report the offending input and return nothing for it
        print(f"\nTokenization fails for {sent}")
        return []

    # Build the punctuation-removal table once instead of once per token
    punctuation_table = str.maketrans("", "", string.punctuation)

    res = []
    for word in words:
        # Delete punctuation from the token itself (previously applied to `sent`,
        # which left the tokens untouched), then trim and lowercase it
        word = word.translate(punctuation_table).strip().lower()

        # Ignore tokens that became empty
        if len(word) > 0:
            res.append(word)

    # Return list of preprocessed words from the sent
    return res

# External data

## Most frequent words

To build the word replacer, I need a list of words. Using their embeddings, the model will predict the top-k best synonyms (ranked by embedding cosine similarity).

In [28]:
import zipfile

# Unpack every member of the word-frequency archive (same empty target path as before)
with zipfile.ZipFile("most_frequent.zip", mode='r') as archive:
    archive.extractall("")


In [29]:
# Read data
# NOTE(review): with default settings `pd.read_csv` parses strings such as "nan" and "null" as
# missing values; presumably the NaN words found in the next cells are such literal words —
# confirm, and consider `keep_default_na=False` if they should be kept.
freq_dataset = pd.read_csv("unigram_freq.csv")
freq_dataset

Unnamed: 0,word,count
0,the,23135851162
1,of,13151942776
2,and,12997637966
3,to,12136980858
4,a,9081174698
...,...,...
333328,gooek,12711
333329,gooddg,12711
333330,gooblle,12711
333331,gollgo,12711


In [37]:
# Show the rows whose word is missing (NaN)
missing_word = freq_dataset['word'].isna()
freq_dataset[missing_word]

Unnamed: 0,word,count
2577,,30739157
12819,,3398089


In [39]:
# Remove the rows that have no word
freq_dataset = freq_dataset.dropna(subset=['word'])
# Repeat the NaN check: the result should now be empty
freq_dataset[freq_dataset['word'].isna()]

Unnamed: 0,word,count


I do not need the word frequencies, so I will use only the words themselves. Additionally, I strip surrounding whitespace and lowercase the words.

In [40]:
# Normalise every word: trim surrounding whitespace, then lowercase
words = freq_dataset['word'].str.strip().str.lower()
words

array(['the', 'of', 'and', ..., 'gooblle', 'gollgo', 'golgw'],
      dtype=object)

In [41]:
# Number of distinct words; it equals the number of rows when every word is unique
words.nunique(dropna=False)

333331

Save the list

In [46]:
# Wrap the word values in a one-column DataFrame.
# Note: `words` is rebound here to a DataFrame, so later cells that index `words`
# operate on the DataFrame (e.g. `words[key]` selects a column, not a row).
words = pd.DataFrame(words, columns=["word"])
words

Unnamed: 0,word
0,the
1,of
2,and
3,to
4,a
...,...
333326,gooek
333327,gooddg
333328,gooblle
333329,gollgo


In [47]:
# Write the word list to disk; with pandas defaults the DataFrame index is stored as the first CSV column
words.to_csv("frequent_words_list.csv")

#### Create embeddings

In [50]:
model_similarity = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')  # multi-language model

def embed(text):
    """Encode a single text with the module-level `model_similarity` model.

    The text is passed as a one-element batch with `convert_to_tensor=False`,
    and the first (only) entry of the result is returned.
    """
    batch = [text]
    vectors = model_similarity.encode(batch, convert_to_tensor=False)
    return vectors[0]


In [None]:
# Width of each row of `store_embed`.
# NOTE(review): 384 is presumably the output size of `model_similarity` — confirm.
EMBEDDING_DIM = 384

# Take the words as a positional list so that row i of `store_embed` always
# belongs to word i. Indexing `words[i]` directly is unsafe: on the one-column
# DataFrame it is a column lookup that raises KeyError for every i, and on a
# Series it is label-based, which misaligns once rows have been dropped.
if isinstance(words, pd.DataFrame):
    word_list = words["word"].tolist()
else:
    word_list = list(words)

store_embed = np.zeros((len(word_list), EMBEDDING_DIM))
failed_words = []

for i, word in enumerate(tqdm.tqdm(word_list)):
    try:
        store_embed[i] = embed(word)
    except Exception as err:
        # Best effort: leave this row as zeros, but record the failure instead of hiding it
        failed_words.append((i, word, repr(err)))

if failed_words:
    print(f"Embedding failed for {len(failed_words)} word(s); first failures: {failed_words[:5]}")


100%|██████████| 333331/333331 [3:08:45<00:00, 29.43it/s]


In [None]:
# Persist the word list together with its embedding matrix in one pickle file
payload = {'word': words, 'embedding': store_embed}
with open('word_embeddings.pkl', "wb") as out_file:
    pickle.dump(payload, out_file, protocol=pickle.HIGHEST_PROTOCOL)
