<a href="https://colab.research.google.com/github/shitkov/courses/blob/master/transformers/dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Get additional data

In [None]:
%%capture
# Download the small parallel detoxification dataset and the RUSSE-2022 detox
# train/dev/test splits into the current working directory (output is captured).
!wget https://raw.githubusercontent.com/s-nlp/parallel_detoxification_dataset/main/parallel_detoxification_dataset_small.tsv
!wget https://raw.githubusercontent.com/s-nlp/russe_detox_2022/main/data/input/train.tsv
!wget https://raw.githubusercontent.com/s-nlp/russe_detox_2022/main/data/input/dev.tsv
!wget https://raw.githubusercontent.com/s-nlp/russe_detox_2022/main/data/input/test.tsv

In [None]:
import pandas as pd

In [None]:
# wget stores the file in the current working directory, so read it from there
# instead of relying on the Colab-specific "/content" location.
data = pd.read_csv("parallel_detoxification_dataset_small.tsv", sep='\t')

In [None]:
toxic_comments = list(data['toxic_comment'])

In [None]:
civil_comments = list(data['civil_comment'])

In [None]:
# googletrans is imported in the next cell; uncomment to install it on a fresh runtime.
# !pip install googletrans==3.1.0a0

In [None]:
from googletrans import Translator

In [None]:
# One googletrans client, reused for every translate() call in the loop below.
translator = Translator()

In [None]:
from tqdm import tqdm

In [None]:
# Translate every toxic/civil pair into Russian, keeping both lists aligned.
toxic_comments_ru = []
civil_comments_ru = []
# zip() has no len(), so give tqdm the total explicitly to get a real
# progress bar with an ETA instead of a bare iteration counter.
for tox, detox in tqdm(zip(toxic_comments, civil_comments), total=len(toxic_comments)):
    toxic_comments_ru.append(translator.translate(tox, dest='ru').text)
    civil_comments_ru.append(translator.translate(detox, dest='ru').text)

In [None]:
# Pair up the Russian translations in a two-column frame.
df = pd.DataFrame({
    'toxic_comment': toxic_comments_ru,
    'civil_comment': civil_comments_ru,
})

In [None]:
# index=False keeps the row index out of the file, so reloading it does not
# produce a stray "Unnamed: 0" column (same convention as the other saves here).
df.to_csv('translated.csv', index=False)

In [None]:
# translated.csv was written relative to the working directory, so read it the
# same way rather than through the Colab-only "/content" prefix.
data = pd.read_csv("translated.csv")

In [None]:
# Rebind both lists to the translated text loaded from translated.csv.
toxic_comments = data['toxic_comment'].tolist()
civil_comments = data['civil_comment'].tolist()

### Utilities

In [None]:
!pip install transformers sentencepiece --quiet

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [None]:
import gc

def cleanup():
    """Run Python garbage collection, then release PyTorch's cached CUDA memory."""
    # Collect first so unreachable objects are gone before the cache is emptied.
    gc.collect()
    torch.cuda.empty_cache()

### Toxic classifier

In [None]:
# Hugging Face checkpoint used as the toxicity classifier in this notebook.
model_checkpoint = 'cointegrated/rubert-tiny-toxicity'
# Tokenizer and sequence-classification model both come from that checkpoint.
tokenizer_cls = AutoTokenizer.from_pretrained(model_checkpoint)
model_cls = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)

In [None]:
# Put the classifier on the GPU when one is present; otherwise it stays on the CPU.
model_cls = model_cls.to('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
def text2toxicity(text, model, tokenizer, aggregate=True):
    """Score text with a multi-label classifier.

    Args:
        text: a single string or a list of strings (processed as one batch).
        model: classifier exposing ``device`` and returning an object with ``logits``.
        tokenizer: callable producing the model inputs; its result must support ``.to()``.
        aggregate: when true, collapse the per-label probabilities into one score;
            when false, return the sigmoid probabilities for every label.

    Returns:
        For a string: a scalar (aggregate) or a 1-D array of label probabilities.
        For a list: a 1-D array of scores (aggregate) or a 2-D array, one row per text.
    """
    encoded = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    encoded = encoded.to(model.device)
    with torch.no_grad():
        logits = model(**encoded).logits
        scores = torch.sigmoid(logits).cpu().numpy()

    # A lone string was scored as a batch of one; unwrap that batch dimension.
    if isinstance(text, str):
        scores = scores[0]

    if not aggregate:
        return scores

    # Combine the first and last label probabilities into one toxicity score.
    # NOTE(review): presumably label 0 is "non-toxic" and the last label is
    # "dangerous" for this checkpoint — confirm against the model card.
    first_label = scores.T[0]
    last_label = scores.T[-1]
    return 1 - first_label * (1 - last_label)

In [None]:
toxic_labels = text2toxicity(toxic_comments, model_cls, tokenizer_cls, True)

In [None]:
toxic_labels = [1 if label > 0.8 else 0 for label in toxic_labels]

In [None]:
# text2toxicity takes (text, model, tokenizer, aggregate); the classifier and its
# tokenizer must be passed explicitly, exactly as for the toxic side.
civil_labels = text2toxicity(civil_comments, model_cls, tokenizer_cls, True)

In [None]:
# Binarise the civil-side scores with the same 0.8 cut-off.
civil_labels = [int(score > 0.8) for score in civil_labels]

In [None]:
data['toxic_labell'] = toxic_labels
data['civil_label'] = civil_labels

In [None]:
df = data[(data['toxic_labell'] == 1) & (data['civil_label'] == 0)]

In [None]:
# Keep just the two text columns of the filtered rows, as an independent frame.
data = df[['toxic_comment', 'civil_comment']].copy()

In [None]:
# Quick visual check of the filtered pairs.
data.head()

In [None]:
# Persist the filtered translated pairs; index=False keeps the row index out of the file.
data.to_csv('train_translated.csv', index=False)

In [None]:
# Run garbage collection and empty the CUDA cache before the next stage.
cleanup()

### Toxify data

In [None]:
model_name = 'IlyaGusev/rut5_tox'

In [None]:
df = pd.read_csv('train.tsv', sep='\t', index_col='index')
df = df.fillna('')

In [None]:
# Flatten each row into (toxic, neutral) pairs: one pair per filled reference
# column, stopping at the first empty reference of the row.
reference_columns = ['neutral_comment1', 'neutral_comment2', 'neutral_comment3']
df_train_toxic, df_train_neutral = [], []

for _, row in df.iterrows():
    toxic = row['toxic_comment']
    for column in reference_columns:
        neutral = row[column]
        if not len(neutral):
            # References after an empty one are not looked at.
            break
        df_train_toxic.append(toxic)
        df_train_neutral.append(neutral)

In [None]:
# Original training pairs, in the same two-column layout as the augmented sets.
df0 = pd.DataFrame({
    'toxic_comment': df_train_toxic,
    'civil_comment': df_train_neutral,
})

In [None]:
from transformers import T5ForConditionalGeneration

In [None]:
model = T5ForConditionalGeneration.from_pretrained(model_name)
# Move to the GPU only when one exists, mirroring how the classifier is handled,
# so the notebook does not crash on CPU-only runtimes.
if torch.cuda.is_available():
    model.cuda()
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
def paraphrase(text, model, tokenizer, n=None, max_length='auto', temperature=0.0, beams=3):
    """Generate rewrites of ``text`` with ``model.generate`` using beam search.

    Args:
        text: a single string or a list of strings.
        model: generation model exposing ``device`` and ``generate``.
        tokenizer: callable tokenizer that also provides ``decode``.
        n: number of sequences to return per input; a falsy value means one.
        max_length: cap on generated length; ``'auto'`` derives it from the
            padded input width (1.2x the width plus 10).
        temperature: forwarded to ``generate`` (sampling itself is switched off).
        beams: number of beams.

    Returns:
        A single string when ``text`` is a string and ``n`` is falsy; otherwise
        the list of decoded outputs.
    """
    is_single = isinstance(text, str)
    batch = [text] if is_single else text
    input_ids = tokenizer(batch, return_tensors='pt', padding=True)['input_ids']
    input_ids = input_ids.to(model.device)

    if max_length == 'auto':
        max_length = int(input_ids.shape[1] * 1.2) + 10

    generation_kwargs = dict(
        num_return_sequences=n if n else 1,
        do_sample=False,
        temperature=temperature,
        repetition_penalty=3.0,
        max_length=max_length,
        bad_words_ids=[[2]],  # token id 2: unk, per the original note
        num_beams=beams,
    )
    generated = model.generate(input_ids, **generation_kwargs)
    decoded = [tokenizer.decode(seq, skip_special_tokens=True) for seq in generated]

    if is_single and not n:
        return decoded[0]
    return decoded

In [None]:
# Run the rut5_tox model on every neutral reference, one sentence per generate call.
toxic_texts = [paraphrase(text, model, tokenizer) for text in tqdm(df_train_neutral)]

In [None]:
# Number of generated texts (one per neutral reference).
len(toxic_texts)

In [None]:
# Pair each generated text with the neutral reference it was produced from.
data = pd.DataFrame({
    'toxic_comment': toxic_texts,
    'civil_comment': df_train_neutral,
})

In [None]:
# index=False keeps the row index out of the file, so reloading it does not
# produce a stray "Unnamed: 0" column (same convention as the other saves here).
data.to_csv('tox_df.csv', index=False)

In [None]:
# Reload the generated pairs from disk — presumably so the filtering below can
# be resumed without re-running generation.
data = pd.read_csv('tox_df.csv')

In [None]:
# Rebind both lists to the generated/neutral pairs loaded from tox_df.csv.
toxic_comments = data['toxic_comment'].tolist()
civil_comments = data['civil_comment'].tolist()

In [None]:
toxic_labels = [text2toxicity(comment, model_cls, tokenizer_cls, True) for comment in tqdm(toxic_comments)]

In [None]:
toxic_labels = [1 if label > 0.8 else 0 for label in toxic_labels]

In [None]:
civil_labels = [text2toxicity(comment, model_cls, tokenizer_cls, True) for comment in tqdm(civil_comments)]

In [None]:
civil_labels = [1 if label > 0.5 else 0 for label in civil_labels]

In [None]:
data['toxic_label'] = toxic_labels
data['civil_label'] = civil_labels

In [None]:
df = data[(data['toxic_label'] == 1) & (data['civil_label'] == 0)]

In [None]:
# Keep just the two text columns of the filtered rows, as an independent frame.
data = df[['toxic_comment', 'civil_comment']].copy()

In [None]:
# Save the filtered re-toxified pairs without the row index.
data.to_csv('train_retox.csv', index=False)

In [None]:
# Reload both augmented sets from disk: re-toxified pairs and translated pairs.
df1 = pd.read_csv('train_retox.csv')
df2 = pd.read_csv('train_translated.csv')

In [None]:
# Preview of the pairs built from train.tsv and its neutral references.
df0.head()

In [None]:
# Stack the original pairs (df0), the re-toxified pairs (df1) and the translated
# pairs (df2), then shuffle the rows. The fixed random_state makes the shuffle —
# and therefore the saved training file — reproducible across runs.
df = (
    pd.concat([df0, df1, df2])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [None]:
# Write the combined, shuffled training set without the row index.
df.to_csv('train_extended.csv', index=False)

In [None]:
# ruT5
# loss: CLS - LaBSE - metrics
# generate several candidates and pick the best one
# clean up the model
# GAN
# filter the output against a word list