In [2]:
import os
# Restrict this process to physical GPUs 5 and 7; the models below address them as cuda:0 and cuda:1.
os.environ['CUDA_VISIBLE_DEVICES'] = '5,7'

In [3]:
import sys
sys.path.append(os.path.abspath('..'))

# Transfer English to Russian: detoxifying Russian text with the English model via backtranslation

In [4]:
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

In [5]:
# Weights come from a local checkpoint directory; the tokenizer is loaded from the
# public facebook/mbart-large-50 hub repo rather than from that directory.
model_name = '/home/dale/models/detox-parallel/mbart_5000_EN'
tokenizer = AutoTokenizer.from_pretrained('facebook/mbart-large-50')
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to('cuda:0')

In [8]:
def paraphrase(
    text, model, tokenizer, n=None, max_length="auto", beams=5,
):
    """Rewrite ``text`` with a seq2seq model using beam search.

    Args:
        text: a single string or a list of strings.
        model: seq2seq model exposing ``generate`` and ``device``.
        tokenizer: tokenizer matching ``model``.
        n: number of sequences to return per input; ``None`` means one.
        max_length: generation length limit in tokens; ``"auto"`` uses the
            padded input length plus 10.
        beams: beam-search width.

    Returns:
        A single string when ``text`` is a string and ``n`` is not set;
        otherwise the list of all decoded sequences (``n`` per input, in order).
    """
    texts = [text] if isinstance(text, str) else text
    inputs = tokenizer(texts, return_tensors="pt", padding=True)["input_ids"].to(
        model.device
    )

    if max_length == "auto":
        max_length = inputs.shape[1] + 10

    result = model.generate(
        inputs,
        num_return_sequences=n or 1,
        do_sample=False,
        temperature=1.0,
        repetition_penalty=10.0,
        max_length=max_length,
        min_length=int(0.5 * max_length),
        num_beams=beams,
    )
    decoded = [tokenizer.decode(r, skip_special_tokens=True) for r in result]

    if not n and isinstance(text, str):
        return decoded[0]
    # List input or an explicit `n`: return every generated sequence
    # (the previous code returned only the first one here as well).
    return decoded

In [9]:
paraphrase('fuck this', model, tokenizer)

"I don't like this."

## MarianMT

In [10]:
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

In [11]:
# English -> Russian translator (model1 / tokenizer1), placed on the second visible GPU.
# model_name = "facebook/wmt19-en-ru"
model_name1 = 'Helsinki-NLP/opus-mt-en-ru'
tokenizer1 = AutoTokenizer.from_pretrained(model_name1)
model1 = AutoModelForSeq2SeqLM.from_pretrained(model_name1).to('cuda:1')

In [12]:
# Russian -> English translator (model2 / tokenizer2), on the same GPU as model1.
model_name2 = 'Helsinki-NLP/opus-mt-ru-en'
tokenizer2 = AutoTokenizer.from_pretrained(model_name2)
model2 = AutoModelForSeq2SeqLM.from_pretrained(model_name2).to('cuda:1');

In [14]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm, trange

In [46]:
from nltk.tokenize import sent_tokenize


def detokenize(text):
    """Remove the single space that precedes each of the marks , . ? ' ! in ``text``."""
    cleaned = text
    for mark in (",", ".", "?", "'", "!"):
        spaced_mark = ' ' + mark
        cleaned = cleaned.replace(spaced_mark, mark)
    return cleaned


def translate(texts, model, tokenizer, num_beams=5, max_length='auto', repetition_penalty=16.0, **kwargs):
    """Translate a batch of texts sentence by sentence with a seq2seq model.

    Every text is detokenized and split into sentences with ``sent_tokenize``;
    all sentences are translated in a single ``generate`` call and the results
    are joined back (space-separated) into one output text per input text.

    Args:
        texts: list of input strings.
        model: seq2seq model exposing ``generate`` and ``device``.
        tokenizer: tokenizer matching ``model``.
        num_beams: beam-search width.
        max_length: generation length limit in tokens; ``'auto'`` derives it
            from the padded input length as ``10 + 1.5 * input_length``.
        repetition_penalty: forwarded to ``generate``.
        **kwargs: extra keyword arguments forwarded to ``generate``.

    Returns:
        List of translated strings aligned with ``texts``.
    """
    sentences = []
    sent_sizes = []
    for text in texts:
        sents = sent_tokenize(detokenize(text))
        sentences.extend(sents)
        sent_sizes.append(len(sents))
    if not sentences:
        # Empty batch (or only empty texts): skip the tokenizer/model entirely
        # and keep the output aligned with the input.
        return [''] * len(sent_sizes)
    inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")
    if max_length == 'auto':
        max_length = int(10 + inputs.input_ids.shape[1] * 1.5)
    with torch.no_grad():
        outputs = model.generate(
            **inputs.to(model.device),
            num_beams=num_beams,
            # The limit used to be computed (or passed in) but never forwarded.
            max_length=max_length,
            repetition_penalty=repetition_penalty,
            **kwargs,
        )
    out_sents = [tokenizer.decode(out, skip_special_tokens=True) for out in outputs]
    out_texts = []
    cursor = 0
    # Re-group the flat list of translated sentences by their source text.
    for size in sent_sizes:
        out_texts.append(' '.join(out_sents[cursor: cursor+size]))
        cursor += size
    return out_texts

In [15]:
# Read the tab-separated Russian test set and keep the `toxic_comment` column as a plain Python list.
test_data = pd.read_csv('../data/russian_data/test.tsv', sep='\t')
test_inputs = test_data["toxic_comment"].values.tolist()

```
cd /home/dale/models/detox-parallel
mkdir mbart_5000_EN
cd mbart_5000_EN
scp dale@nlp2:/home/moskovskiy/workspace/nlp/multilingual_tst/mbarts/mbart_5000_EN/* .
```

In [16]:
batch_size = 8  # 15 minutes to translate the whole dataset

In [17]:
import gc

def cleanup():
    """Run the Python garbage collector, then ask PyTorch to empty its CUDA cache."""
    # Collect first so that tensors freed by the GC can be released from the cache.
    gc.collect()
    torch.cuda.empty_cache()
    
cleanup()

In [16]:
test_inputs[:3]

['укропидорг лавринов! общайся лучше с ией - так хоть на человека похож!',
 'терпеть не могу самок, которые за мужской хер готовы родину продать',
 'фсё для таких пидарасоф как ты']

In [47]:
# Step 1 (MarianMT): translate the Russian inputs to English, `batch_size` texts at a time.
texts = test_inputs
test_src_en_marianmt = []
for i in trange(0, len(texts), batch_size):
    batch = texts[i:i + batch_size]
    test_src_en_marianmt += translate(batch, model2, tokenizer2)

  0%|          | 0/125 [00:00<?, ?it/s]

In [49]:
test_src_en_marianmt[:3]

["The laurin crimp! You'd better talk to the Yi, at least you look like a man!",
 "I hate females who are willing to sell their homeland for a man's dick.",
 'For fuckers like you.']

In [51]:
# Step 2 (MarianMT): paraphrase every English translation with the mBART model, one text per call.
test_trg_en_marianmt = [
    paraphrase(text, model, tokenizer)
    for text in tqdm(test_src_en_marianmt)
]

  0%|          | 0/1000 [00:00<?, ?it/s]

In [52]:
test_trg_en_marianmt[:3]

["You'd better talk to the Yi, at least you look like a man!",
 "I hate females who are willing to sell their homeland for a man's money.",
 'For unsuitable people like you.']

In [53]:
# Step 3 (MarianMT): translate the English paraphrases back to Russian in batches.
texts = test_trg_en_marianmt
test_trg_ru_marianmt = []
for i in trange(0, len(texts), batch_size):
    batch = texts[i:i + batch_size]
    test_trg_ru_marianmt += translate(batch, model1, tokenizer1)

  0%|          | 0/125 [00:00<?, ?it/s]

In [54]:
test_trg_ru_marianmt[:3]

['Тебе лучше поговорить с И, по крайней мере ты выглядишь как мужчина!',
 'Я ненавижу женщин, которые готовы продать свою родину за мужские деньги.',
 'Для неподходящих людей вроде тебя.']

In [None]:
# Output directory for the MarianMT back-translation run.
path = '../results/backtranslate-marianmt/'
# makedirs also creates a missing parent ('../results') and, with exist_ok=True,
# does nothing when the directory already exists (no check-then-create race).
os.makedirs(path, exist_ok=True)

In [63]:
# Save every pipeline stage, one text per line. The encoding is fixed to UTF-8 so
# that the Cyrillic output does not depend on the platform's default encoding.
for filename, lines in [
    ('translated_ru2en.txt', test_src_en_marianmt),
    ('translated_ru2en_paraphrased.txt', test_trg_en_marianmt),
    ('results_ru.txt', test_trg_ru_marianmt),
]:
    with open(path + filename, 'w', encoding='utf-8') as f:
        for line in lines:
            f.write(line + '\n')

## Yandex translation


In [65]:
import nltk
nltk.download('punkt')
from nltk import sent_tokenize

[nltk_data] Downloading package punkt to /home/dale/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [66]:
import re

def split_by_symbol(text, symbol=',', max_len=400):
    """Split an over-long ``text`` at every occurrence of ``symbol``, keeping the separators.

    Returns ``[text]`` unchanged when it is at most ``max_len`` characters long
    or does not contain ``symbol``. Otherwise returns the pieces interleaved
    with ``symbol``, so that ``''.join(result) == text``.
    """
    if len(text) <= max_len:
        return [text]
    # The separator is a literal string, not a pattern: escape it so that
    # characters such as '.', '?' or '|' are not interpreted as regex syntax.
    chunks = re.split(re.escape(symbol), text)
    if len(chunks) <= 1:
        return [text]
    result = [chunks[0]]
    for chunk in chunks[1:]:
        result.append(symbol)
        result.append(chunk)
    return result

def join_texts(texts, max_len=400):
    """Greedily concatenate consecutive pieces into chunks of at most ``max_len`` characters.

    Pieces are concatenated without any separator. A single piece longer than
    ``max_len`` is kept as its own (over-long) chunk. For an empty ``texts``
    the result is ``['']``.
    """
    result = []
    prev_text = ''
    for text in texts:
        if len(text) + len(prev_text) > max_len:
            # Flush the accumulated chunk, but never emit an empty one: that
            # happened when the very first piece alone exceeded max_len.
            if prev_text:
                result.append(prev_text)
            prev_text = text
        else:
            prev_text = prev_text + text
    result.append(prev_text)
    return result

def hard_split(text, max_len=300):
    """Break ``text`` into chunks of roughly at most ``max_len`` characters.

    The text is split into sentences with ``sent_tokenize``; sentences that are
    still too long are split further at commas, then hyphens, then spaces
    (via ``split_by_symbol``). The pieces are finally merged back into chunks
    with ``join_texts``.
    """
    pieces = []
    for index, sentence in enumerate(sent_tokenize(text)):
        if index:
            # sent_tokenize drops the whitespace between sentences; put one
            # space back so merged sentences are not glued together.
            pieces.append(' ')
        chunks = [sentence]
        for symbol in [',', '-', ' ']:
            chunks = [c2 for c in chunks for c2 in split_by_symbol(c, symbol, max_len=max_len)]
        pieces.extend(chunks)
    return join_texts(pieces, max_len=max_len)

How to obtain a fresh SID:
* go to translate.yandex.ru
* open the "Network" panel of the browser's developer console
* enter any text in the translation form
* find the request to "https://translate.yandex.net/api/v1/tr.json/translate" and copy the value of its first query parameter ("id") into `SID` below

In [76]:
import requests

SID = '7d9e19fd.62960895.db4a787d.74722d74657874-8-0'

def translate_yandex(search_str, direction='en-ru', full_response=False):
    """Translate ``search_str`` through the Yandex Translate web endpoint.

    The request is authenticated with the module-level ``SID``.

    Args:
        search_str: text to translate.
        direction: language pair passed as the ``lang`` query parameter, e.g. ``'en-ru'``.
        full_response: when true, return the decoded JSON response unchanged.

    Returns:
        ``(code, translated_text)``, or the raw JSON dict when ``full_response``
        is set. A text rejected as too long is split with ``hard_split``,
        translated piece by piece and returned with code 200.

    Raises:
        requests.RequestException: on network failure or timeout.
        KeyError: when the response carries no ``code`` / ``text`` field.
    """
    url = f'https://translate.yandex.net/api/v1/tr.json/translate?id={SID}&srv=tr-text&lang={direction}&reason=auto&format=text'

    post_header = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-US,en;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Host': 'translate.yandex.com',
        'Referer': 'https://translate.yandex.com/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 YaBrowser/21.8.2.383 Yowser/2.5 Safari/537.36',
    }

    data_payload = {'text': search_str, 'options': '4'}

    # A timeout keeps a stalled connection from hanging the whole loop.
    resp = requests.post(url, headers=post_header, data=data_payload, timeout=60).json()
    if full_response:
        return resp

    if resp.get('message') == 'The text size exceeds the maximum':
        # Drop empty pieces so we never send blank requests or recurse forever
        # on a text that cannot actually be split.
        parts = [part for part in hard_split(search_str) if part.strip()]
        if len(parts) > 1:
            # Keep the same language direction for every piece (this used to
            # pass a nonexistent `dir` keyword and raised TypeError).
            return 200, ' '.join(translate_yandex(part, direction=direction)[1] for part in parts)

    return resp['code'], resp['text'][0]

In [77]:
test_inputs[:3]

['укропидорг лавринов! общайся лучше с ией - так хоть на человека похож!',
 'терпеть не могу самок, которые за мужской хер готовы родину продать',
 'фсё для таких пидарасоф как ты']

In [82]:
# Step 1 (Yandex): Russian -> English, one request per text; keep only the translated string.
test_src_en_yandex = [
    translate_yandex(text, 'ru-en')[1]
    for text in tqdm(test_inputs)
]

  0%|          | 0/1000 [00:00<?, ?it/s]

In [83]:
test_src_en_yandex[:3]

['ukropidorg lavrinov! communicate better with ai - so at least you look like a person!',
 "I can't stand females who are ready to sell their homeland for a man's dick",
 'fse for faggots like you']

In [84]:
# Step 2 (Yandex): paraphrase every English translation with the mBART model.
test_trg_en_yandex = [
    paraphrase(text, model, tokenizer)
    for text in tqdm(test_src_en_yandex)
]

  0%|          | 0/1000 [00:00<?, ?it/s]

In [85]:
test_trg_en_yandex[:3]

['lavrinov! communicate better with ai - so at least you look like a person!',
 "I can't stand females who are ready to sell their homeland for a man's.",
 'fse for those who dislike it']

In [89]:
# Step 3 (Yandex): translate the English paraphrases back to Russian.
test_trg_ru_yandex = [
    translate_yandex(text, 'en-ru')[1]
    for text in tqdm(test_trg_en_yandex)
]

  0%|          | 0/1000 [00:00<?, ?it/s]

In [90]:
test_trg_ru_yandex[:3]

['лавринов! лучше общайтесь с искусственным интеллектом - так вы хотя бы будете выглядеть как человек!',
 'Я терпеть не могу женщин, которые готовы продать свою родину за мужскую.',
 'fse для тех, кому это не нравится']

In [91]:
# Output directory for the Yandex back-translation run.
path = '../results/backtranslate-yandex/'
# makedirs also creates a missing parent ('../results') and, with exist_ok=True,
# does nothing when the directory already exists (no check-then-create race).
os.makedirs(path, exist_ok=True)

In [92]:
# Save every pipeline stage, one text per line. The encoding is fixed to UTF-8 so
# that the Cyrillic output does not depend on the platform's default encoding.
for filename, lines in [
    ('translated_ru2en.txt', test_src_en_yandex),
    ('translated_ru2en_paraphrased.txt', test_trg_en_yandex),
    ('results_ru.txt', test_trg_ru_yandex),
]:
    with open(path + filename, 'w', encoding='utf-8') as f:
        for line in lines:
            f.write(line + '\n')

## Scoring the results

```
python evaluate_ru.py \
    --result_filename scores \
    --input_dir results/backtranslate-marianmt \
    --output_dir results
    
Style accuracy:       0.6040729880332947
Meaning preservation: 0.722809910774231
Joint fluency:        -0.22462719678878784
Joint score:          -0.08952497690916061
Scores after calibration:
Style accuracy:       0.6436656713485718
Meaning preservation: 0.5851079821586609
Joint fluency:        0.7416787147521973
Joint score:          0.28991788625717163
```

```
python evaluate_ru.py \
    --result_filename scores \
    --input_dir results/backtranslate-yandex \
    --output_dir results

Style accuracy:       0.6853741407394409
Meaning preservation: 0.8279372453689575
Joint fluency:        -0.1530541628599167
Joint score:          -0.085952028632164
Scores after calibration:
Style accuracy:       0.7168368101119995
Meaning preservation: 0.7420499324798584
Joint fluency:        0.8239876627922058
Joint score:          0.4313623607158661
```

# Transfer Russian to English: detoxifying English text with the Russian model via backtranslation

In [4]:
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

In [5]:
# Weights come from a local checkpoint directory; the tokenizer is loaded from the
# public facebook/mbart-large-50 hub repo rather than from that directory.
model_name = '/home/dale/models/detox-parallel/mbart_5000_RU'
tokenizer = AutoTokenizer.from_pretrained('facebook/mbart-large-50')
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to('cuda:0')

In [6]:
def paraphrase(
    text, model, tokenizer, n=None, max_length="auto", beams=5,
):
    """Rewrite ``text`` with a seq2seq model using beam search.

    Args:
        text: a single string or a list of strings.
        model: seq2seq model exposing ``generate`` and ``device``.
        tokenizer: tokenizer matching ``model``.
        n: number of sequences to return per input; ``None`` means one.
        max_length: generation length limit in tokens; ``"auto"`` uses the
            padded input length plus 10.
        beams: beam-search width.

    Returns:
        A single string when ``text`` is a string and ``n`` is not set;
        otherwise the list of all decoded sequences (``n`` per input, in order).
    """
    texts = [text] if isinstance(text, str) else text
    inputs = tokenizer(texts, return_tensors="pt", padding=True)["input_ids"].to(
        model.device
    )

    if max_length == "auto":
        max_length = inputs.shape[1] + 10

    result = model.generate(
        inputs,
        num_return_sequences=n or 1,
        do_sample=False,
        temperature=1.0,
        repetition_penalty=10.0,
        max_length=max_length,
        min_length=int(0.5 * max_length),
        num_beams=beams,
    )
    decoded = [tokenizer.decode(r, skip_special_tokens=True) for r in result]

    if not n and isinstance(text, str):
        return decoded[0]
    # List input or an explicit `n`: return every generated sequence
    # (the previous code returned only the first one here as well).
    return decoded

In [7]:
paraphrase('В пизду это всё!', model, tokenizer)

'Мне все равно на это всё!'

## MarianMT

In [8]:
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

In [9]:
# English -> Russian translator (model1 / tokenizer1), placed on the second visible GPU.
# model_name = "facebook/wmt19-en-ru"
model_name1 = 'Helsinki-NLP/opus-mt-en-ru'
tokenizer1 = AutoTokenizer.from_pretrained(model_name1)
model1 = AutoModelForSeq2SeqLM.from_pretrained(model_name1).to('cuda:1')

In [10]:
# Russian -> English translator (model2 / tokenizer2), on the same GPU as model1.
model_name2 = 'Helsinki-NLP/opus-mt-ru-en'
tokenizer2 = AutoTokenizer.from_pretrained(model_name2)
model2 = AutoModelForSeq2SeqLM.from_pretrained(model_name2).to('cuda:1');

In [11]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm, trange

In [12]:
from nltk.tokenize import sent_tokenize


def detokenize(text):
    """Remove the single space that precedes each of the marks , . ? ' ! in ``text``."""
    cleaned = text
    for mark in (",", ".", "?", "'", "!"):
        spaced_mark = ' ' + mark
        cleaned = cleaned.replace(spaced_mark, mark)
    return cleaned


def translate(texts, model, tokenizer, num_beams=5, max_length='auto', repetition_penalty=16.0, **kwargs):
    """Translate a batch of texts sentence by sentence with a seq2seq model.

    Every text is detokenized and split into sentences with ``sent_tokenize``;
    all sentences are translated in a single ``generate`` call and the results
    are joined back (space-separated) into one output text per input text.

    Args:
        texts: list of input strings.
        model: seq2seq model exposing ``generate`` and ``device``.
        tokenizer: tokenizer matching ``model``.
        num_beams: beam-search width.
        max_length: generation length limit in tokens; ``'auto'`` derives it
            from the padded input length as ``10 + 1.5 * input_length``.
        repetition_penalty: forwarded to ``generate``.
        **kwargs: extra keyword arguments forwarded to ``generate``.

    Returns:
        List of translated strings aligned with ``texts``.
    """
    sentences = []
    sent_sizes = []
    for text in texts:
        sents = sent_tokenize(detokenize(text))
        sentences.extend(sents)
        sent_sizes.append(len(sents))
    if not sentences:
        # Empty batch (or only empty texts): skip the tokenizer/model entirely
        # and keep the output aligned with the input.
        return [''] * len(sent_sizes)
    inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")
    if max_length == 'auto':
        max_length = int(10 + inputs.input_ids.shape[1] * 1.5)
    with torch.no_grad():
        outputs = model.generate(
            **inputs.to(model.device),
            num_beams=num_beams,
            # The limit used to be computed (or passed in) but never forwarded.
            max_length=max_length,
            repetition_penalty=repetition_penalty,
            **kwargs,
        )
    out_sents = [tokenizer.decode(out, skip_special_tokens=True) for out in outputs]
    out_texts = []
    cursor = 0
    # Re-group the flat list of translated sentences by their source text.
    for size in sent_sizes:
        out_texts.append(' '.join(out_sents[cursor: cursor+size]))
        cursor += size
    return out_texts

In [14]:
# Read the English toxic test sentences, one per line. The encoding is stated
# explicitly so the result does not depend on the platform default.
with open('../data/english_data/test_toxic_parallel.txt', 'r', encoding='utf-8') as f:
    test_inputs = [line.strip() for line in f]

In [15]:
test_inputs[:3]

['. or the loud ass one - thousand ton beast roaring towards you howling its horn .',
 'mandated  and " right fucking now " would be good .',
 '* neither * of my coworkers gave a shit when it came time to ditch mitch . ugh .']

In [16]:
batch_size = 8  # 15 minutes to translate the whole dataset

In [17]:
import gc

def cleanup():
    """Run the Python garbage collector, then ask PyTorch to empty its CUDA cache."""
    # Collect first so that tensors freed by the GC can be released from the cache.
    gc.collect()
    torch.cuda.empty_cache()
    
cleanup()

In [21]:
# Step 1 (MarianMT): translate the English inputs to Russian, `batch_size` texts at a time.
texts = test_inputs
test_src_ru_marianmt = []
for i in trange(0, len(texts), batch_size):
    batch = texts[i:i + batch_size]
    test_src_ru_marianmt += translate(batch, model1, tokenizer1)

  0%|          | 0/84 [00:00<?, ?it/s]

In [22]:
test_src_ru_marianmt[:3]

['. или громкая задница тысяча тонн зверя, рычащего на тебя и вопящего его рога.',
 'Это было бы неплохо.',
 'Моим коллегам тоже было насрать, когда пришло время бросить Митч. Ух.']

In [23]:
# Step 2 (MarianMT): paraphrase every Russian translation with the mBART model, one text per call.
test_trg_ru_marianmt = [
    paraphrase(text, model, tokenizer)
    for text in tqdm(test_src_ru_marianmt)
]

  0%|          | 0/671 [00:00<?, ?it/s]

In [24]:
test_trg_ru_marianmt[:3]

['или громкая задница тысяча тонн зверя, рычащего на тебя и вопящего его рога.',
 'Это было бы неплохо..',
 'Моим коллегам тоже было насрать, когда пришло время бросить Митч. Ух.']

In [25]:
# Step 3 (MarianMT): translate the Russian paraphrases back to English in batches.
texts = test_trg_ru_marianmt
test_trg_en_marianmt = []
for i in trange(0, len(texts), batch_size):
    batch = texts[i:i + batch_size]
    test_trg_en_marianmt += translate(batch, model2, tokenizer2)

  0%|          | 0/84 [00:00<?, ?it/s]

In [26]:
test_trg_en_marianmt[:3]

['Or the loud ass of a thousand tons of beast roaring at you and crying out his horn.',
 'That would be nice.',
 "My colleagues didn't give a shit when it was time to leave Mitch. Wow."]

In [27]:
# Output directory for the MarianMT back-translation run.
path = '../results/backtranslate-marianmt/'
# makedirs also creates a missing parent ('../results') and, with exist_ok=True,
# does nothing when the directory already exists (no check-then-create race).
os.makedirs(path, exist_ok=True)

In [29]:
# Save every pipeline stage, one text per line. The encoding is fixed to UTF-8 so
# that the Cyrillic output does not depend on the platform's default encoding.
for filename, lines in [
    ('translated_en2ru.txt', test_src_ru_marianmt),
    ('translated_en2ru_paraphrased.txt', test_trg_ru_marianmt),
    ('results_en.txt', test_trg_en_marianmt),
]:
    with open(path + filename, 'w', encoding='utf-8') as f:
        for line in lines:
            f.write(line + '\n')

## Yandex translation


In [30]:
import nltk
nltk.download('punkt')
from nltk import sent_tokenize

[nltk_data] Downloading package punkt to /home/dale/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [31]:
import re

def split_by_symbol(text, symbol=',', max_len=400):
    """Split an over-long ``text`` at every occurrence of ``symbol``, keeping the separators.

    Returns ``[text]`` unchanged when it is at most ``max_len`` characters long
    or does not contain ``symbol``. Otherwise returns the pieces interleaved
    with ``symbol``, so that ``''.join(result) == text``.
    """
    if len(text) <= max_len:
        return [text]
    # The separator is a literal string, not a pattern: escape it so that
    # characters such as '.', '?' or '|' are not interpreted as regex syntax.
    chunks = re.split(re.escape(symbol), text)
    if len(chunks) <= 1:
        return [text]
    result = [chunks[0]]
    for chunk in chunks[1:]:
        result.append(symbol)
        result.append(chunk)
    return result

def join_texts(texts, max_len=400):
    """Greedily concatenate consecutive pieces into chunks of at most ``max_len`` characters.

    Pieces are concatenated without any separator. A single piece longer than
    ``max_len`` is kept as its own (over-long) chunk. For an empty ``texts``
    the result is ``['']``.
    """
    result = []
    prev_text = ''
    for text in texts:
        if len(text) + len(prev_text) > max_len:
            # Flush the accumulated chunk, but never emit an empty one: that
            # happened when the very first piece alone exceeded max_len.
            if prev_text:
                result.append(prev_text)
            prev_text = text
        else:
            prev_text = prev_text + text
    result.append(prev_text)
    return result

def hard_split(text, max_len=300):
    """Break ``text`` into chunks of roughly at most ``max_len`` characters.

    The text is split into sentences with ``sent_tokenize``; sentences that are
    still too long are split further at commas, then hyphens, then spaces
    (via ``split_by_symbol``). The pieces are finally merged back into chunks
    with ``join_texts``.
    """
    pieces = []
    for index, sentence in enumerate(sent_tokenize(text)):
        if index:
            # sent_tokenize drops the whitespace between sentences; put one
            # space back so merged sentences are not glued together.
            pieces.append(' ')
        chunks = [sentence]
        for symbol in [',', '-', ' ']:
            chunks = [c2 for c in chunks for c2 in split_by_symbol(c, symbol, max_len=max_len)]
        pieces.extend(chunks)
    return join_texts(pieces, max_len=max_len)

How to obtain a fresh SID:
* go to translate.yandex.ru
* open the "Network" panel of the browser's developer console
* enter any text in the translation form
* find the request to "https://translate.yandex.net/api/v1/tr.json/translate" and copy the value of its first query parameter ("id") into `SID` below

In [32]:
import requests

SID = '7d9e19fd.62960895.db4a787d.74722d74657874-8-0'

def translate_yandex(search_str, direction='en-ru', full_response=False):
    """Translate ``search_str`` through the Yandex Translate web endpoint.

    The request is authenticated with the module-level ``SID``.

    Args:
        search_str: text to translate.
        direction: language pair passed as the ``lang`` query parameter, e.g. ``'en-ru'``.
        full_response: when true, return the decoded JSON response unchanged.

    Returns:
        ``(code, translated_text)``, or the raw JSON dict when ``full_response``
        is set. A text rejected as too long is split with ``hard_split``,
        translated piece by piece and returned with code 200.

    Raises:
        requests.RequestException: on network failure or timeout.
        KeyError: when the response carries no ``code`` / ``text`` field.
    """
    url = f'https://translate.yandex.net/api/v1/tr.json/translate?id={SID}&srv=tr-text&lang={direction}&reason=auto&format=text'

    post_header = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-US,en;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Host': 'translate.yandex.com',
        'Referer': 'https://translate.yandex.com/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 YaBrowser/21.8.2.383 Yowser/2.5 Safari/537.36',
    }

    data_payload = {'text': search_str, 'options': '4'}

    # A timeout keeps a stalled connection from hanging the whole loop.
    resp = requests.post(url, headers=post_header, data=data_payload, timeout=60).json()
    if full_response:
        return resp

    if resp.get('message') == 'The text size exceeds the maximum':
        # Drop empty pieces so we never send blank requests or recurse forever
        # on a text that cannot actually be split.
        parts = [part for part in hard_split(search_str) if part.strip()]
        if len(parts) > 1:
            # Keep the same language direction for every piece (this used to
            # pass a nonexistent `dir` keyword and raised TypeError).
            return 200, ' '.join(translate_yandex(part, direction=direction)[1] for part in parts)

    return resp['code'], resp['text'][0]

In [34]:
translate_yandex('В пизду это всё!', 'ru-en')

(200, 'Fuck it all!')

In [35]:
test_inputs[:3]

['. or the loud ass one - thousand ton beast roaring towards you howling its horn .',
 'mandated  and " right fucking now " would be good .',
 '* neither * of my coworkers gave a shit when it came time to ditch mitch . ugh .']

In [38]:
# Step 1 (Yandex): English -> Russian, one request per text; keep only the translated string.
test_src_ru_yandex = [
    translate_yandex(text, 'en-ru')[1]
    for text in tqdm(test_inputs)
]

  0%|          | 0/671 [00:00<?, ?it/s]

In [39]:
test_src_ru_yandex[:3]

['. или громкоголосый тысячетонный зверь , ревущий к вам , воя своим рогом .',
 '" и " прямо сейчас , блядь " было бы неплохо .',
 '* никому * из моих коллег не было дела, когда пришло время бросить Митча. фу.']

In [40]:
# Step 2 (Yandex): paraphrase every Russian translation with the mBART model.
test_trg_ru_yandex = [
    paraphrase(text, model, tokenizer)
    for text in tqdm(test_src_ru_yandex)
]

  0%|          | 0/671 [00:00<?, ?it/s]

In [41]:
test_trg_ru_yandex[:3]

['или громкоголосый тысячетонный зверь, ревущий к вам, воя своим рогом',
 '" и " прямо сейчас " было бы неплохо....',
 'Никому из моих подчиненных не было дела, когда пришло время бросить Митча']

In [42]:
# Step 3 (Yandex): translate the Russian paraphrases back to English.
test_trg_en_yandex = [
    translate_yandex(text, 'ru-en')[1]
    for text in tqdm(test_trg_ru_yandex)
]

  0%|          | 0/671 [00:00<?, ?it/s]

In [43]:
test_trg_en_yandex[:3]

['or a loud-voiced thousand-ton beast roaring towards you, howling with its horn',
 '" and \'right now\' would be nice....',
 'None of my subordinates cared when it was time to dump Mitch.']

In [44]:
# Output directory for the Yandex back-translation run.
path = '../results/backtranslate-yandex/'
# makedirs also creates a missing parent ('../results') and, with exist_ok=True,
# does nothing when the directory already exists (no check-then-create race).
os.makedirs(path, exist_ok=True)

In [45]:
# Save every pipeline stage, one text per line. The encoding is fixed to UTF-8 so
# that the Cyrillic output does not depend on the platform's default encoding.
for filename, lines in [
    ('translated_en2ru.txt', test_src_ru_yandex),
    ('translated_en2ru_paraphrased.txt', test_trg_ru_yandex),
    ('results_en.txt', test_trg_en_yandex),
]:
    with open(path + filename, 'w', encoding='utf-8') as f:
        for line in lines:
            f.write(line + '\n')

## Scoring the results

See https://github.com/skoltech-nlp/paradetox/tree/main/evaluation_detox

```
cd /home/dale/projects/paradetox2/evaluation_detox
python metric.py --inputs /home/dale/projects/multilingual_detox/data/english_data/test_toxic_parallel.txt \
    --preds /home/dale/projects/multilingual_detox/results/backtranslate-marianmt/results_en.txt \
    --cola_classifier_path /home/dale/models/cola_classifier_fairseq \
    --wieting_model_path /home/dale/models/wieting_similarity/sim.pt \
    --wieting_tokenizer_path /home/dale/models/wieting_similarity/sim.sp.30k.model \
    --batch_size 32
cat results.md
```

| Model | ACC | EMB_SIM | SIM | CharPPL | TokenPPL | FL | GM | J | BLEU |
| ----- | --- | ------- | --- | ------- | -------- | -- | -- | - | ---- |
results_en.txt|0.6766|0.7574|0.7180|6.5078|86.1255|0.8942|0.0000|0.4116|0.4885|

```
python metric.py --inputs /home/dale/projects/multilingual_detox/data/english_data/test_toxic_parallel.txt \
    --preds /home/dale/projects/multilingual_detox/results/backtranslate-yandex/results_en.txt \
    --cola_classifier_path /home/dale/models/cola_classifier_fairseq \
    --wieting_model_path /home/dale/models/wieting_similarity/sim.pt \
    --wieting_tokenizer_path /home/dale/models/wieting_similarity/sim.sp.30k.model \
    --batch_size 32
cat results.md
```
| Model | ACC | EMB_SIM | SIM | CharPPL | TokenPPL | FL | GM | J | BLEU |
| ----- | --- | ------- | --- | ------- | -------- | -- | -- | - | ---- |
results_en.txt|0.6662|0.7985|0.7741|6.1724|89.3471|0.9553|0.0000|0.4674|0.5799|

# M2M100

We also tried the M2M100 models (https://huggingface.co/facebook/m2m100_1.2B) for machine translation. The small one (418M) translated poorly in both directions.

The medium one (1.2B) works decently (but not better than marianmt), and the large one (12B) just does not fit our hardware.

In [1]:
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

hi_text = "जीवन एक चॉकलेट बॉक्स की तरह है।"
chinese_text = "生活就像一盒巧克力。"

# The first assignment is immediately overwritten: only the 1.2B checkpoint is loaded.
model_name = 'facebook/m2m100_418M'
model_name = 'facebook/m2m100_1.2B'


# NOTE: these names reuse `model2` / `tokenizer2`, which the MarianMT sections use for the ru->en model.
model2 = M2M100ForConditionalGeneration.from_pretrained(model_name)
tokenizer2 = M2M100Tokenizer.from_pretrained(model_name)

Downloading:   0%|          | 0.00/909 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.62G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.54M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/271 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

In [2]:
# translate Hindi to French
tokenizer2.src_lang = "hi"
encoded_hi = tokenizer2(hi_text, return_tensors="pt")
generated_tokens = model2.generate(**encoded_hi, forced_bos_token_id=tokenizer2.get_lang_id("fr"))
tokenizer2.batch_decode(generated_tokens, skip_special_tokens=True)
# => "La vie est comme une boîte de chocolat."

['La vie est comme une boîte de chocolat.']

In [7]:
model2.cuda();

In [3]:
def translate_m2m(texts, model, tokenizer, src, trg, num_beams=5, max_length='auto', repetition_penalty=16.0, **kwargs):
    """Translate ``texts`` from language ``src`` to language ``trg`` with an M2M100-style model.

    Args:
        texts: a string or a list of strings, passed to ``tokenizer`` as is.
        model: model exposing ``generate`` and ``device``.
        tokenizer: tokenizer exposing ``src_lang`` and ``get_lang_id``.
        src: source language code, assigned to ``tokenizer.src_lang``.
        trg: target language code, used to force the first generated token.
        num_beams: beam-search width.
        max_length: generation length limit in tokens; ``'auto'`` derives it
            from the padded input length as ``10 + 1.5 * input_length``.
        repetition_penalty: forwarded to ``generate``.
        **kwargs: extra keyword arguments forwarded to ``generate``.

    Returns:
        List of decoded translations.
    """
    # Configure the tokenizer that was passed in (this used to mutate the
    # global `tokenizer2` regardless of the `tokenizer` argument).
    tokenizer.src_lang = src
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt").to(model.device)
    if max_length == 'auto':
        max_length = int(10 + inputs.input_ids.shape[1] * 1.5)
    generated_tokens = model.generate(
        **inputs,
        forced_bos_token_id=tokenizer.get_lang_id(trg),
        num_beams=num_beams,
        max_length=max_length,
        repetition_penalty=repetition_penalty,
        **kwargs,
    )
    return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)

In [8]:
# English -> Russian spot check with M2M100 on three sample sentences.
en_samples = [
    '. or the loud ass one - thousand ton beast roaring towards you howling its horn .',
    'mandated  and " right fucking now " would be good .',
    '* neither * of my coworkers gave a shit when it came time to ditch mitch . ugh .',
]
for text in en_samples:
    print(translate_m2m(text, model2, tokenizer2, 'en', 'ru'))

['Или громкий осел — тысяча тонн зверей, ругающихся к тебе и шумлящих своим рогом.']
['«Правда и проклятие» было бы хорошо.']
['* ни один из моих коллег не давал дерьма, когда пришло время отбросить митч. ugh.']


```
marianmt:
['. или громкая задница тысяча тонн зверя, рычащего на тебя и вопящего его рога.',
 'Это было бы неплохо.',
 'Моим коллегам тоже было насрать, когда пришло время бросить Митч. Ух.']
yandex:
['. или громкоголосый тысячетонный зверь , ревущий к вам , воя своим рогом .',
 '" и " прямо сейчас , блядь " было бы неплохо .',
 '* никому * из моих коллег не было дела, когда пришло время бросить Митча. фу.']
```

In [9]:
# Russian -> English spot check with M2M100 on three sample sentences.
ru_samples = [
    'укропидорг лавринов! общайся лучше с ией - так хоть на человека похож!',
    'терпеть не могу самок, которые за мужской хер готовы родину продать',
    'фсё для таких пидарасоф как ты',
]
for text in ru_samples:
    print(translate_m2m(text, model2, tokenizer2, 'ru', 'en'))

['Ukropidog laurines! communicate better with IA - so at least it looks like a man!']
['I can’t tolerate females who are willing to sell their homeland for a man.']
['for peddlers like you.']


```
marianmt:
["The laurin crimp! You'd better talk to the Yi, at least you look like a man!",
 "I hate females who are willing to sell their homeland for a man's dick.",
 'For fuckers like you.']
yandex:
['ukropidorg lavrinov! communicate better with ai - so at least you look like a person!',
 "I can't stand females who are ready to sell their homeland for a man's dick",
 'fse for faggots like you']
```