# Russian SOTA

Reference implementation (T5 baseline from the RUSSE Detox 2022 repository): https://github.com/skoltech-nlp/russe_detox_2022/tree/main/baselines/t5

In [1]:
import os

# Default to GPU 5, but respect a CUDA_VISIBLE_DEVICES value that the user or
# a job scheduler has already exported. This cell runs before torch is
# imported so the setting is in place when CUDA is initialised.
os.environ.setdefault('CUDA_VISIBLE_DEVICES', '5')

In [2]:
import pandas as pd
from transformers import T5ForConditionalGeneration, AutoTokenizer
import torch
from tqdm.auto import tqdm, trange
import gc


In [3]:
# Load the Russian test split (tab-separated) and pull the toxic sentences
# out into a plain Python list.
test_data = pd.read_csv('../data/russian_data/test.tsv', delimiter='\t')
test_inputs = test_data['toxic_comment'].tolist()

In [5]:
# Checkpoint identifiers used by the loading cell below: the tokenizer is read
# from `base_model_name`, the model weights from `model_name`.
model_name = 'SkolkovoInstitute/ruT5-base-detox'
base_model_name = 'sberbank-ai/ruT5-base'

In [6]:
# Use the GPU when one is visible; otherwise fall back to CPU so the notebook
# still runs (slowly) on machines without CUDA instead of crashing in .cuda().
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# The tokenizer is loaded from the base checkpoint, the weights from the
# detox checkpoint.
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)

Downloading:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/850M [00:00<?, ?B/s]

In [14]:
def paraphrase(text, model, n=None, max_length='auto', temperature=1.0, beams=3):
    """Rewrite one text or a batch of texts with a seq2seq model.

    Encoding and decoding use the notebook-level ``tokenizer``.

    Args:
        text: a single string or a list of strings.
        model: model object exposing ``generate`` and ``device``.
        n: number of sequences to return per input; a falsy value means one.
        max_length: cap on the generated length in tokens, or ``'auto'`` to
            use 1.2x the padded input length plus 10.
        temperature: forwarded to ``generate``; sampling is switched off
            (``do_sample=False``), so it is not expected to change the output.
        beams: beam width passed as ``num_beams``.

    Returns:
        A single string when ``text`` is a string and ``n`` is falsy;
        otherwise a list with one decoded string per generated sequence.
    """
    texts = [text] if isinstance(text, str) else text
    encoded = tokenizer(texts, return_tensors='pt', padding=True)
    inputs = encoded['input_ids'].to(model.device)
    # Batches are padded to a common length, so hand the mask to generate()
    # explicitly rather than relying on it being inferred from pad tokens.
    attention_mask = encoded['attention_mask'].to(model.device)
    if max_length == 'auto':
        max_length = int(inputs.shape[1] * 1.2) + 10
    # Ban the tokenizer's own unknown token rather than a hard-coded id.
    # NOTE(review): the previous version hard-coded id 1 with an "unk" comment;
    # confirm it equals tokenizer.unk_token_id before comparing against the
    # scores recorded below.
    unk_id = tokenizer.unk_token_id
    result = model.generate(
        inputs,
        attention_mask=attention_mask,
        num_return_sequences=n or 1,
        do_sample=False,
        temperature=temperature,
        repetition_penalty=3.0,
        max_length=max_length,
        bad_words_ids=[[unk_id]] if unk_id is not None else None,
        num_beams=beams,
    )
    texts = [tokenizer.decode(r, skip_special_tokens=True) for r in result]
    if not n and isinstance(text, str):
        return texts[0]
    return texts

In [15]:
# Smoke test on a one-item batch with a wider beam (10 instead of the default 3).
print(paraphrase(['Дмитрий вы ебанулись, уже все выложено'], model, beams=10))

['Дмитрий вы с ума сошли, уже все выложено']


In [16]:
batch_size = 8
para_results = []

# Walk over the Russian test set in fixed-size chunks and collect the
# paraphrases in input order.
for start in trange(0, len(test_inputs), batch_size):
    chunk = test_inputs[start:start + batch_size]
    para_results += paraphrase(chunk, model)

  0%|          | 0/125 [00:00<?, ?it/s]

In [17]:
# Peek at the first five source sentences.
test_inputs[:5]

['укропидорг лавринов! общайся лучше с ией - так хоть на человека похож!',
 'терпеть не могу самок, которые за мужской хер готовы родину продать',
 'фсё для таких пидарасоф как ты',
 'а работать не хочешь да? хочешь за свои фотки деньги получать? хуй на нос тебе',
 'пидор !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! сука ,,продал 15 !!!!!!!!!! союзных республик ?????????? куда смотрело тогдашнее нквд ?????']

In [18]:
# First five model outputs, for a side-by-side look with the inputs.
para_results[:5]

['лавринов! общайся лучше с ией - так хоть на человека похож!',
 'терпеть не могу самок, которые за мужской бред готовы родину продать',
 'Всё для таких как ты',
 'а работать не хочешь да? хочешь за свои фотки деньги получать?',
 'Продал 15 союзных республик. Почему НКВД его не наказало?']

In [21]:
# Output directory for the Russian model predictions.
p = '../results/ruT5-base-detox/'
# exist_ok=True keeps this cell re-runnable once the directory exists.
os.makedirs(p, exist_ok=True)

In [None]:
# One prediction per line. Embedded newlines are flattened so that line i of
# the file stays aligned with input i. UTF-8 is requested explicitly because
# the text is Cyrillic and the platform default encoding may differ.
with open(p + 'results_ru.txt', 'w', encoding='utf-8') as f:
    for text in para_results:
        f.write(text.replace('\n', ' ') + '\n')

```
cd /home/dale/projects/multilingual_detox
python evaluate_ru.py \
    --result_filename scores \
    --input_dir results/ruT5-base-detox \
    --output_dir results
    
Style accuracy:       0.7726958990097046
Meaning preservation: 0.8499152064323425
Joint fluency:        -0.1741630882024765
Joint score:          -0.1071704551577568
Scores after calibration:
Style accuracy:       0.7954263091087341
Meaning preservation: 0.7758175134658813
Joint fluency:        0.7997124791145325
Joint score:          0.5067287087440491
```

# English SOTA

In [26]:
# English checkpoint; this rebinds `model_name` from the Russian section.
model_name = 'SkolkovoInstitute/bart-base-detox'

In [30]:
from transformers import AutoModelForSeq2SeqLM

In [31]:
# Use the GPU when one is visible; otherwise fall back to CPU so the notebook
# still runs on machines without CUDA instead of crashing in .cuda().
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Both tokenizer and weights come from the same checkpoint here. This rebinds
# the notebook-level `tokenizer` and `model` used by `paraphrase`.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

In [33]:
# Show the id this tokenizer assigns to its unknown token.
tokenizer.unk_token_id

3

In [34]:
def paraphrase(text, model, n=None, max_length='auto', temperature=1.0, beams=3):
    """Rewrite one text or a batch of texts with a seq2seq model.

    Encoding and decoding use the notebook-level ``tokenizer``; its unknown
    token (if it has one) is banned from the generated output.

    Args:
        text: a single string or a list of strings.
        model: model object exposing ``generate`` and ``device``.
        n: number of sequences to return per input; a falsy value means one.
        max_length: cap on the generated length in tokens, or ``'auto'`` to
            use 1.2x the padded input length plus 10.
        temperature: forwarded to ``generate``; sampling is switched off
            (``do_sample=False``), so it is not expected to change the output.
        beams: beam width passed as ``num_beams``.

    Returns:
        A single string when ``text`` is a string and ``n`` is falsy;
        otherwise a list with one decoded string per generated sequence.
    """
    texts = [text] if isinstance(text, str) else text
    encoded = tokenizer(texts, return_tensors='pt', padding=True)
    inputs = encoded['input_ids'].to(model.device)
    # Batches are padded to a common length, so hand the mask to generate()
    # explicitly rather than relying on it being inferred from pad tokens.
    attention_mask = encoded['attention_mask'].to(model.device)
    if max_length == 'auto':
        max_length = int(inputs.shape[1] * 1.2) + 10
    # Skip the ban entirely when the tokenizer defines no unknown token;
    # [[None]] is not a valid bad_words_ids value.
    unk_id = tokenizer.unk_token_id
    result = model.generate(
        inputs,
        attention_mask=attention_mask,
        num_return_sequences=n or 1,
        do_sample=False,
        temperature=temperature,
        repetition_penalty=3.0,
        max_length=max_length,
        bad_words_ids=[[unk_id]] if unk_id is not None else None,
        num_beams=beams,
    )
    texts = [tokenizer.decode(r, skip_special_tokens=True) for r in result]
    if not n and isinstance(text, str):
        return texts[0]
    return texts

In [35]:
# Smoke test: a plain string goes in, so a plain string comes back.
paraphrase("I don't give a fuck!", model)

"I don't care!"

In [36]:
# Read the English toxic test sentences, one per line, stripping surrounding
# whitespace. The encoding is given explicitly so the result does not depend
# on the platform default; iterating the handle avoids building the extra
# list that readlines() would create.
with open('../data/english_data/test_toxic_parallel.txt', 'r', encoding='utf-8') as f:
    en_test_inputs = [line.strip() for line in f]

In [37]:
# Peek at the first five English source sentences.
en_test_inputs[:5]

['. or the loud ass one - thousand ton beast roaring towards you howling its horn .',
 'mandated  and " right fucking now " would be good .',
 '* neither * of my coworkers gave a shit when it came time to ditch mitch . ugh .',
 '* well shit , cunt shot himself .',
 "&gt i wouldn 't care how ignorant you are in you weren 't pretending to know shit ."]

In [38]:
batch_size = 8
para_results_en = []

# Walk over the English test set in fixed-size chunks and collect the
# paraphrases in input order.
for start in trange(0, len(en_test_inputs), batch_size):
    chunk = en_test_inputs[start:start + batch_size]
    para_results_en += paraphrase(chunk, model)

  0%|          | 0/84 [00:00<?, ?it/s]

In [39]:
# First five English model outputs, for a side-by-side look with the inputs.
para_results_en[:5]

['. or the loud one - thousand ton beast roaring towards you howling its horn.',
 'mandated  and " right now " would be good.',
 '* neither * of my coworkers cared when it came time to ditch mitch. ugh.',
 'He shot himself.',
 "I wouldn't care how ignorant you are in you weren't pretending to know."]

In [40]:
# Output directory for the English model predictions.
p = '../results/bart-base-detox/'
# exist_ok=True keeps this cell re-runnable once the directory exists.
os.makedirs(p, exist_ok=True)

In [41]:
# One prediction per line. Embedded newlines are flattened so that line i of
# the file stays aligned with input i. The encoding is given explicitly so the
# file does not depend on the platform default.
with open(p + 'results_en.txt', 'w', encoding='utf-8') as f:
    for text in para_results_en:
        f.write(text.replace('\n', ' ') + '\n')

```
cd /home/dale/projects/paradetox2/evaluation_detox
python metric.py --inputs /home/dale/projects/multilingual_detox/data/english_data/test_toxic_parallel.txt \
    --preds /home/dale/projects/multilingual_detox/results/bart-base-detox/results_en.txt \
    --cola_classifier_path /home/dale/models/cola_classifier_fairseq \
    --wieting_model_path /home/dale/models/wieting_similarity/sim.pt \
    --wieting_tokenizer_path /home/dale/models/wieting_similarity/sim.sp.30k.model \
    --batch_size 32
cat results.md
```
| Model | ACC | EMB_SIM | SIM | CharPPL | TokenPPL | FL | GM | J | BLEU |
| ----- | --- | ------- | --- | ------- | -------- | -- | -- | - | ---- |
results_en.txt|0.6766|0.7574|0.7180|6.5078|86.1255|0.8942|0.0000|0.4116|0.4885|
results_en.txt|0.6662|0.7985|0.7741|6.1724|89.3471|0.9553|0.0000|0.4674|0.5799|
results_en.txt|0.9016|0.8934|0.8592|6.2307|127.6455|0.8599|11.9160|0.6555|0.7101|

# Human references

The English human references (`results/human-refs/results_en.txt`) are copied from another project; only the Russian references are written out below.

In [2]:
import pandas as pd

# Load the Russian test split (tab-separated) and pull the toxic sentences
# out into a plain Python list.
test_data = pd.read_csv('../data/russian_data/test.tsv', delimiter='\t')
test_inputs = test_data['toxic_comment'].tolist()

In [5]:
import os

# Make sure the target directory exists; nothing in this section creates it.
os.makedirs('../results/human-refs', exist_ok=True)

# Write the human-written neutral references, one per line. Newlines inside a
# reference are flattened, as in the model-output writers above, so line i
# stays aligned with input i. UTF-8 is explicit because the text is Cyrillic.
with open('../results/human-refs/results_ru.txt', 'w', encoding='utf-8') as f:
    for text in test_data.neutral_comment:
        f.write(text.replace('\n', ' ') + '\n')

```
cd /home/dale/projects/multilingual_detox
python evaluate_ru.py \
    --result_filename scores \
    --input_dir results/human-refs \
    --output_dir results
    
    
Style accuracy:       0.807894766330719
Meaning preservation: 0.7892386317253113
Joint fluency:        -0.21004270017147064
Joint score:          -0.12385008484125137
Scores after calibration:
Style accuracy:       0.8271052837371826
Meaning preservation: 0.6875579953193665
Joint fluency:        0.7584508657455444
Joint score:          0.4519689381122589
```


```
cd /home/dale/projects/paradetox2/evaluation_detox
python metric.py --inputs /home/dale/projects/multilingual_detox/data/english_data/test_toxic_parallel.txt \
    --preds /home/dale/projects/multilingual_detox/results/human-refs/results_en.txt \
    --cola_classifier_path /home/dale/models/cola_classifier_fairseq \
    --wieting_model_path /home/dale/models/wieting_similarity/sim.pt \
    --wieting_tokenizer_path /home/dale/models/wieting_similarity/sim.sp.30k.model \
    --batch_size 32
cat results.md
```

| Model | ACC | EMB_SIM | SIM | CharPPL | TokenPPL | FL | GM | J | BLEU |
| ----- | --- | ------- | --- | ------- | -------- | -- | -- | - | ---- |
results_en.txt|0.9568|0.8243|0.7736|5.8445|209.4191|0.8674|11.6538|0.6421|0.6213|