https://huggingface.co/facebook/mbart-large-50-many-to-many-mmt

# Preparation

In [None]:
from google.colab import drive
drive.mount('/gd')

Drive already mounted at /gd; to attempt to forcibly remount, call drive.mount("/gd", force_remount=True).


In [None]:
!pip install transformers sentencepiece -q

[K     |████████████████████████████████| 4.4 MB 29.8 MB/s 
[K     |████████████████████████████████| 1.2 MB 18.9 MB/s 
[K     |████████████████████████████████| 596 kB 52.4 MB/s 
[K     |████████████████████████████████| 6.6 MB 53.2 MB/s 
[K     |████████████████████████████████| 101 kB 11.5 MB/s 
[?25h

In [None]:
!wget https://raw.githubusercontent.com/google/sentencepiece/master/src/sentencepiece_model.proto

--2022-06-27 07:40:52--  https://raw.githubusercontent.com/google/sentencepiece/master/src/sentencepiece_model.proto
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 13451 (13K) [text/plain]
Saving to: ‘sentencepiece_model.proto’


2022-06-27 07:40:52 (97.9 MB/s) - ‘sentencepiece_model.proto’ saved [13451/13451]



In [None]:
! protoc --python_out=. sentencepiece_model.proto

# Counting words

In [None]:
import torch
from transformers import MBartForConditionalGeneration, MBart50Tokenizer
tokenizer = MBart50Tokenizer.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")

In [None]:
print(tokenizer.vocab_size)

250054


In [None]:
import json

In [None]:
with open('/gd/MyDrive/datasets/nlp/myv_corpus/myv_mono.v1.json', 'r') as f:
    all_sentences = json.load(f)
print(len(all_sentences))

333651


In [None]:
all_sentences[:10]

['! Вадо!/Азёдо тестэ!',
 '! Вастомазонок! Неемазонок!',
 '! Инескеть/энялдан',
 '! Мезес пачкодемс!',
 '!вайх!',
 '" " " " Ашо пацяс сюлмимем.',
 '" " " " Мон гулькинекс теевинь.”',
 '" " " " Монь патинем пурнымем.',
 '" " " " Од авинем пидимем.',
 '" " " " Патинень ланкс ёртлимизь.']

In [None]:
from collections import Counter, defaultdict
from tqdm.auto import tqdm, trange
import random
import re

In [None]:
text = 'Моё уважение!!! :) :-\ :-D'

In [None]:
tokenizer.tokenize(text)

['▁Мо', 'ё', '▁уважение', '!!!', '▁:)', '▁:-', '\\', '▁:-', 'D']

In [None]:
tokenizer.prepare_for_tokenization(text)

('Моё уважение!!! :) :-\\ :-D', {})

In [None]:
tokenizer.tokens_trie.split(text)

['Моё уважение!!! :) :-\\ :-D']

In [None]:
tokenizer.sp_model.encode(text)

[7330, 7784, 164999, 1563, 1093, 15771, 41871, 15771, 396]

In [None]:
tokens = tokenizer.tokenize(text)
tokens

['▁Мо', 'ё', '▁уважение', '!!!', '▁:)', '▁:-', '\\', '▁:-', 'D']

In [None]:
char_count = Counter()
for text in tqdm(all_sentences):
    char_count.update(text)

  0%|          | 0/333651 [00:00<?, ?it/s]

In [None]:
PUNCT = '.,-—:)(»«!?–/;„"“…*́№Ёҥ[]”^%+І=і•_􏰀²|}{#‘■>⁠’á<°\§\''
SPACE = '▁'

In [None]:
for k, v in char_count.most_common(200):
    if not re.match('[а-яА-Яa-zA-Z0-9ё\']', k):
        print(k, end='')

 .,-—:)(»«!?–/;„"“…*́№Ёҥ[]”^%+І=і•_􏰀²|}{#‘■>­⁠’á<°\әéäˈ&~óاəაա©↑იüοö§ςαõ

In [None]:
toks = tokenizer.tokenize(text)
toks

['▁',
 '\U0010005a',
 '▁Я',
 '.',
 'Я',
 '.',
 '▁Кул',
 'дур',
 'ка',
 'ев',
 '▁ПО',
 'Э',
 'МАТ',
 '▁',
 'ДЫ',
 '▁',
 'Ё',
 'ВК',
 'СТ',
 '▁86',
 '▁87',
 '▁Кап',
 'ши',
 'ль',
 '▁',
 'анс',
 'як',
 '▁сон',
 '▁мор',
 'о',
 'нь',
 '▁мора',
 'мо',
 '.']

In [None]:
def get_words(text, tokenizer, verbose=False):
    toks = tokenizer.tokenize(text)
    words = []
    word = []
    prev_punct = False
    for tok in toks:
        is_punct = tok.lstrip(SPACE) and all(c in PUNCT for c in tok.lstrip(SPACE))
        if tok.startswith(SPACE) or prev_punct != is_punct:
            if word:
                words.append(word)
            word = []
        word.append(tok)
        prev_punct = is_punct
    if word:
        words.append(word)
    if verbose:
        print(words)
    res = words
    # assert tokenizer.decode([tok for t in res for tok in t]) == text
    return res

In [None]:
word_count = Counter()
word2toks = {}
for text in tqdm(all_sentences):
    for word_toks in get_words(text, tokenizer):
        word = ''.join(word_toks)
        word_count[word] += 1
        word2toks[word] = word_toks
print(len(word_count))

  0%|          | 0/333651 [00:00<?, ?it/s]

423699


In [None]:
from copy import deepcopy
word_count2 = deepcopy(word_count)
word2toks2 = deepcopy(word2toks)

# Computing splits

In [None]:
word_count = deepcopy(word_count2)
word2toks = deepcopy(word2toks2)

In [None]:
for k, v in word_count.most_common(30):
    if len(word2toks[k]) > 1:
        print(word2toks[k])

['▁март', 'о']
['▁код', 'а']
['▁вес', 'е']
['лан', 'г', 'с']
['▁а', 'воль']
['▁кор', 'яс']
['▁ла', 'мо']
['▁у', 'льне', 'сь']


In [None]:
steps = 100_000
min_count = 30  
# default:   0 new tokens, 30 lenght, 0% new tokens
# 100 mindf: 6.6k new tokens, 22 length, 47% new tokens (of sentence length)
# 30 mindf:  20k new tokens, 20 length, 58% new tokens
# 10 mindf: 50K new tokens, 18.5 length, 64% new tokens
extra_vocab = []
extra_counts = []
extra_pairs = []

In [None]:
# word2toks = {''.join(tokenizer.convert_ids_to_tokens(list(w))): w for w in tqdm(word_count)}

In [None]:
pairs_count = Counter()
pair2word = defaultdict(set)
for w, c in tqdm(word_count.items(), total=len(word_count)):
    enc = word2toks[w]
    for pair in zip(enc[:-1], enc[1:]):
        pairs_count[pair] += c
        pair2word[pair].add(w)

  0%|          | 0/423699 [00:00<?, ?it/s]

In [None]:
from heapdict import heapdict
hd = heapdict()
for w, c in pairs_count.items():
    hd[w] = -c

In [None]:
def replace_pair(old_tokens, pair, new_token):
    result = []
    prev = old_tokens[0]
    for tok in old_tokens[1:]:
        if (prev, tok) == pair:
            result.append(new_token)
            prev = None
        else:
            if prev is not None:
                result.append(prev)
            prev = tok
    if prev is not None:
        result.append(prev)
    return result

In [None]:
# rewrite with heapdict
for _ in trange(steps):
    #pair, c = pairs_count.most_common(1)[0]  # это самая времязатратная операция
    pair, c = hd.peekitem()
    c = -c
    
    if c < min_count:
        break
    new_token = ''.join(pair) # instead of BERT-like pair[0] + pair[1][2:]
    extra_vocab.append(pair)
    extra_counts.append(c)
    extra_pairs.append(pair)
    
    # update the vocabulary
    #new_id = len(id2ids)
    #tok2id[new_token] = new_id
    #id2ids.append(id2ids[tok2id[pair[0]]] + id2ids[tok2id[pair[1]]]) 
    
    # calculate the delta for the heap
    delta = Counter()
    for word in list(pair2word[pair]):
        # calculate old and new ways to tokenize the word
        old_toks = word2toks[word]
        # new_toks = " ".join(old_toks).replace(' '.join(pair), new_token).split(" ")
        new_toks = replace_pair(old_toks, pair, new_token)
        word2toks[word] = new_toks
        wc = word_count[word]
        # update the index concerning the tokens of the word
        for old_pair in zip(old_toks[:-1], old_toks[1:]):
            #pairs_count[old_pair] -= wc
            delta[old_pair] -= wc
            if word in pair2word[old_pair]:
                pair2word[old_pair].remove(word)
        for new_pair in zip(new_toks[:-1], new_toks[1:]):
            # pairs_count[new_pair] += wc
            delta[new_pair] += wc
            pair2word[new_pair].add(word)
    # update the heap
    for a_pair, a_delta in delta.items():
        if a_delta == 0:
            continue
        if a_pair not in hd:
            hd[a_pair] = 0
        hd[a_pair] -= a_delta

  0%|          | 0/100000 [00:00<?, ?it/s]

In [None]:
len(extra_pairs)

19491

In [None]:
extra_pairs[:10]

[('з', 'э'),
 ('тне', 'нь'),
 ('не', 'нь'),
 ('с', 'э'),
 ('н', 'ть'),
 ('о', 'нь'),
 ('▁код', 'а'),
 ('с', 'тэ'),
 ('лан', 'г'),
 ('н', 'энь')]

In [None]:
extra_pairs[-20:]

[('оз', 'тне'),
 ('▁нолд', 'тнемс'),
 ('▁се', 'во'),
 ('▁вожд', 'есь'),
 ('▁Ишу', 'ткинэнь'),
 ('▁Мя', 'ляфт'),
 ('▁ванькс', 'чи'),
 ('▁Теньгушев', 'ань'),
 ('Ис', 'кра'),
 ('▁кон', 'центр'),
 ('▁чув', 'тонтень'),
 ('кова', 'нь'),
 ('не', 'сэнть'),
 ('▁Перви', 'чнай'),
 ('▁азор', 'ост'),
 ('▁панжи', 'зь'),
 ('▁руководитель', 'хне'),
 ('▁с', 'вер'),
 ('нтень', 'гак'),
 ('▁Инекуж', 'ось')]

In [None]:
tokenizer.vocab_size

250054

In [None]:
tokenizer.save_pretrained('old_tokenizer')

('old_tokenizer/tokenizer_config.json',
 'old_tokenizer/special_tokens_map.json',
 'old_tokenizer/sentencepiece.bpe.model',
 'old_tokenizer/added_tokens.json')

In [None]:
import sentencepiece_model_pb2 as model
m = model.ModelProto()
m.ParseFromString(open("old_tokenizer/sentencepiece.bpe.model", "rb").read())

5069051

In [None]:
scores = [p.score for p in m.pieces]
min_score = min(scores)
epsilon = 1e-4

In [None]:
tokenizer('приввет')

{'input_ids': [250004, 37764, 18454, 2], 'attention_mask': [1, 1, 1, 1]}

In [None]:
type(m.pieces[37764-1].piece)

str

In [None]:
for i, pair in enumerate(extra_vocab):
    new_token = model.ModelProto().SentencePiece()
    new_token.piece = ''.join(pair)
    new_token.score = min_score - epsilon * (i+1)
    m.pieces.append(new_token)

In [None]:
with open("old_tokenizer/sentencepiece.bpe.model", 'wb') as f:
    f.write(m.SerializeToString())

In [None]:
new_tokenizer = MBart50Tokenizer.from_pretrained(
    "old_tokenizer/sentencepiece.bpe.model", 
    additional_special_tokens = tokenizer.additional_special_tokens
)



In [None]:
new_tokenizer.vocab_size

269545

In [None]:
from transformers.models.mbart50.tokenization_mbart50 import FAIRSEQ_LANGUAGE_CODES

In [None]:
self = new_tokenizer

In [None]:
self.lang_code_to_id = {
    code: self.sp_model_size + i + self.fairseq_offset for i, code in enumerate(FAIRSEQ_LANGUAGE_CODES + ['myv_XX'])
}
self.id_to_lang_code = {v: k for k, v in self.lang_code_to_id.items()}
self.fairseq_tokens_to_ids["<mask>"] = len(self.sp_model) + len(self.lang_code_to_id) + self.fairseq_offset

self.fairseq_tokens_to_ids.update(self.lang_code_to_id)
self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}



In [None]:
new_tokenizer.additional_special_tokens.append('myv_XX')

In [None]:
new_tokenizer.vocab_size

269546

In [None]:
len(new_tokenizer)

269546

In [None]:
text = random.choice(all_sentences)
print(text)

Валске марто паксяв туемадо икеле авам мери: — Кода крандазкеть?


In [None]:
print(tokenizer.tokenize(text))
print(new_tokenizer.tokenize(text))

['▁Вал', 'ске', '▁март', 'о', '▁пак', 'ся', 'в', '▁ту', 'ема', 'до', '▁и', 'ке', 'ле', '▁а', 'вам', '▁мери', ':', '▁—', '▁Ко', 'да', '▁кра', 'нда', 'з', 'ке', 'ть', '?']
['▁Валске', '▁март', 'о', '▁паксяв', '▁туемадо', '▁икеле', '▁а', 'вам', '▁мери', ':', '▁—', '▁Ко', 'да', '▁крандаз', 'ке', 'ть', '?']


In [None]:
random.seed(1)
sample = random.sample(all_sentences, 10000)

In [None]:
import pandas as pd

In [None]:
pd.DataFrame({
    'old': [len(tokenizer.tokenize(text)) for text in sample], 
    'new': [len(new_tokenizer.tokenize(text)) for text in sample]
}).describe()

Unnamed: 0,old,new
count,10000.0,10000.0
mean,30.5254,19.9169
std,25.247418,16.810653
min,1.0,1.0
25%,13.0,9.0
50%,24.0,15.0
75%,40.0,26.0
max,397.0,294.0


In [None]:
new_tokenizer.convert_tokens_to_ids('▁Сталинти')

3

In [None]:
new_tot_len, tot_len = 0, 0
for text in sample:
    for tok in new_tokenizer.tokenize(text):
        s = len(tok)
        tot_len += s
        if new_tokenizer.convert_tokens_to_ids(tok) > len(tokenizer.sp_model):
            new_tot_len += s
print(new_tot_len / tot_len)

0.5788132198428553


In [None]:
new_tokenizer('Это эрьсеф')

{'input_ids': [269495, 4619, 250021, 2687, 3988, 2], 'attention_mask': [1, 1, 1, 1, 1, 1]}

In [None]:
len(new_tokenizer)

269546

In [None]:
len(m.pieces)

269491

In [None]:
# model.resize_token_embeddings(len(tokenizer))
# todo: move language id embeddings by the distance. 

In [None]:
new_tokenizer.additional_special_tokens.append('myv_XX')

# Calculating tokens alignment

In [None]:
with open('/gd/MyDrive/datasets/nlp/myv_corpus/train.v1.json', 'r') as f:
    all_pairs = json.load(f)
print(len(all_pairs))

74503


In [None]:
random.choice(all_pairs)

['«Мезе эно миненек теемс?» – кевкстнесть перьканзо аштицятне.',
 'И спрашивал его народ: что же нам делать?']

In [None]:
from collections import defaultdict, Counter
token_priors = Counter()
token_to_others = defaultdict(Counter)

In [None]:
for myv, ru in tqdm(all_pairs):
    myv_toks = new_tokenizer.convert_tokens_to_ids(new_tokenizer.tokenize(myv))
    ru_toks = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(ru))
    token_priors.update(ru_toks)
    for myv_tok in myv_toks:
        token_to_others[myv_tok].update(ru_toks)

  0%|          | 0/74503 [00:00<?, ?it/s]

In [None]:
myv, ru = random.choice(all_pairs)
myv, ru

('2017 иень якшамковонь 11 чистэ Душанбе ошонть Прявтокс ашти Рустам Эмомали.',
 'С 12 января 2017 года мэром города Душанбе является Рустам Эмомали.')

In [None]:
print(new_tokenizer.convert_tokens_to_ids(new_tokenizer.tokenize(myv)))

[505, 35, 10573, 251509, 534, 250108, 139423, 5514, 251672, 268373, 250068, 37695, 21644, 109827, 419, 7310, 5]


In [None]:
myv_tok = 252698

In [None]:
import numpy as np

def get_ru_toks(myv_tok):
    ru_toks = []
    ru_weights = []
    for t, w in token_to_others[myv_tok].items():
        ru_toks.append(t)
        ru_weights.append(w**2 / token_priors[t])
    ru_weights = np.array(ru_weights)
    ru_weights = ru_weights / (sum(ru_weights) + 1e-4)
    return ru_weights, ru_toks

In [None]:
sorted(zip(*get_ru_toks(myv_tok)), reverse=True)[:5]

[(0.1676537302589413, 164668),
 (0.08382686512947066, 108305),
 (0.08382686512947066, 94416),
 (0.05588457675298044, 223807),
 (0.039707462429749256, 46905)]

In [None]:
tokenizer.convert_ids_to_tokens([62994, 17921, 117229, 188048, 25982])

['▁первую', '▁справ', '▁карты', 'ведению', '▁ат']

# Updating the model embeddings

In [None]:
len(new_tokenizer)

269546

In [None]:
from transformers import MBartForConditionalGeneration, MBart50Tokenizer
model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")

In [None]:
model.model.shared

Embedding(250054, 1024, padding_idx=1)

In [None]:
model.resize_token_embeddings(len(new_tokenizer))

Embedding(269546, 1024)

Move added tokens to the end

In [None]:
n_extra = len(extra_vocab)
print(n_extra)
old_vocab_size = len(tokenizer.sp_model) + 1

19491


In [None]:
for old_token_id in range(old_vocab_size, len(tokenizer)):
    old_token = tokenizer.convert_ids_to_tokens(old_token_id)
    new_token_id = new_tokenizer.convert_tokens_to_ids(old_token)
    
    print(old_token_id, old_token, new_token_id)
    # model.model.shared.weight.data[i + n_extra] = model.model.shared.weight.data[i]
    model.model.shared.weight.data[new_token_id] = model.model.shared.weight.data[old_token_id]

250001 ar_AR 269492
250002 cs_CZ 269493
250003 de_DE 269494
250004 en_XX 269495
250005 es_XX 269496
250006 et_EE 269497
250007 fi_FI 269498
250008 fr_XX 269499
250009 gu_IN 269500
250010 hi_IN 269501
250011 it_IT 269502
250012 ja_XX 269503
250013 kk_KZ 269504
250014 ko_KR 269505
250015 lt_LT 269506
250016 lv_LV 269507
250017 my_MM 269508
250018 ne_NP 269509
250019 nl_XX 269510
250020 ro_RO 269511
250021 ru_RU 269512
250022 si_LK 269513
250023 tr_TR 269514
250024 vi_VN 269515
250025 zh_CN 269516
250026 af_ZA 269517
250027 az_AZ 269518
250028 bn_IN 269519
250029 fa_IR 269520
250030 he_IL 269521
250031 hr_HR 269522
250032 id_ID 269523
250033 ka_GE 269524
250034 km_KH 269525
250035 mk_MK 269526
250036 ml_IN 269527
250037 mn_MN 269528
250038 mr_IN 269529
250039 pl_PL 269530
250040 ps_AF 269531
250041 pt_XX 269532
250042 sv_SE 269533
250043 sw_KE 269534
250044 ta_IN 269535
250045 te_IN 269536
250046 th_TH 269537
250047 tl_XX 269538
250048 uk_UA 269539
250049 ur_PK 269540
250050 xh_ZA 269541


Because we have added one more language, its id must be computed separately, e.g. as an average of related languages.

In [None]:
model.model.shared.weight.data[new_tokenizer.convert_tokens_to_ids('myv_XX')] = (
    model.model.shared.weight.data[tokenizer.convert_tokens_to_ids('fi_FI')] * 0.3
    + model.model.shared.weight.data[tokenizer.convert_tokens_to_ids('et_EE')] * 0.3
    + model.model.shared.weight.data[tokenizer.convert_tokens_to_ids('ru_RU')] * 0.3
)

Compute embeddings for newly added tokens

In [None]:
for i in trange(n_extra):
    myv_tok = i + old_vocab_size
    ru_weights, ru_toks = get_ru_toks(myv_tok)
    if len(ru_toks) > 0:
        new_embedding = (model.model.shared.weight.data[ru_toks].T * ru_weights).sum(1)
        model.model.shared.weight.data[myv_tok] = new_embedding

  0%|          | 0/19491 [00:00<?, ?it/s]

In [None]:
# !mkdir /gd/MyDrive/models/myv

In [None]:
new_model_path = '/gd/MyDrive/models/myv/mbart-large-51-myv-raw'

In [None]:
model.save_pretrained(new_model_path)
new_tokenizer.save_pretrained(new_model_path)

('/gd/MyDrive/models/myv/mbart-large-51-myv-raw/tokenizer_config.json',
 '/gd/MyDrive/models/myv/mbart-large-51-myv-raw/special_tokens_map.json',
 '/gd/MyDrive/models/myv/mbart-large-51-myv-raw/sentencepiece.bpe.model',
 '/gd/MyDrive/models/myv/mbart-large-51-myv-raw/added_tokens.json')

In [None]:
!ls -alsh $new_model_path

total 2.4G
1.5K -rw------- 1 root root 1.4K Jun 27 08:45 config.json
2.4G -rw------- 1 root root 2.4G Jun 27 08:45 pytorch_model.bin
5.3M -rw------- 1 root root 5.3M Jun 27 08:45 sentencepiece.bpe.model
1.0K -rw------- 1 root root  970 Jun 27 08:45 special_tokens_map.json
1.5K -rw------- 1 root root 1.2K Jun 27 08:45 tokenizer_config.json


Check that the model is still able to translate texts.  

In [None]:
article_hi = "संयुक्त राष्ट्र के प्रमुख का कहना है कि सीरिया में कोई सैन्य समाधान नहीं है"

# translate Hindi to French
new_tokenizer.src_lang = "hi_IN"
encoded_hi = new_tokenizer(article_hi, return_tensors="pt")
generated_tokens = model.generate(
    **encoded_hi,
    forced_bos_token_id=new_tokenizer.lang_code_to_id["fr_XX"], 
    max_length=30, 
    num_beams=1
)
new_tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
# => "Le chef de l 'ONU affirme qu 'il n 'y a pas de solution militaire dans la Syrie."

["Le chef de la mission de l 'ONU a déclaré qu 'il n' y a pas de solution militaire en Syria."]

Translation to Russian has broken, because some new myv tokens are very Russian-like, and they interere. 

In [None]:
generated_tokens = model.generate(
    **encoded_hi,
    forced_bos_token_id=new_tokenizer.lang_code_to_id["ru_RU"], 
    max_length=30, 
    num_beams=5,
    repetition_penalty=30.0,
)
new_tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
# => "Le chef de l 'ONU affirme qu 'il n 'y a pas de solution militaire dans la Syrie."

["пендик Джак театра United Nations says there's no military solution in Syria. < s > васендакигейкс пропагандист районы  Таняэрзянские US Secretary"]

For some reason, even with new tokens prohibited, translation to Russian is poor. 

In [None]:
generated_tokens = model.generate(
    **encoded_hi,
    forced_bos_token_id=new_tokenizer.lang_code_to_id["ru_RU"], 
    max_length=30, 
    num_beams=5,
    repetition_penalty=30.0,
    bad_words_ids=[[t] for t in range(n_extra, n_extra+old_vocab_size)]
)
new_tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
# => "Le chef de l 'ONU affirme qu 'il n 'y a pas de solution militaire dans la Syrie."

["«They don 't want to know what' s going on here, but they think that there is a lot of room for"]

translate from Erzya! The model cannot do this, but it tries hard. 

In [None]:
myv, ru = random.choice(all_pairs)
myv, ru

('Шнынк Азоронть, весе кеметь ды виевть Сонзэ ангелэнзэ, конат топавтыть Сонзэ валонзо ды кунсолыть Сонзэ кармавтоманзо!',
 'Благословите Господа, все Ангелы Его, крепкие силою, исполняющие слово Его, повинуясь гласу слова Его;')

In [None]:
new_tokenizer.src_lang = "myv_XX"
encoded_hi = new_tokenizer(myv, return_tensors="pt")
generated_tokens = model.generate(
    **encoded_hi,
    forced_bos_token_id=new_tokenizer.lang_code_to_id["ru_RU"], 
    max_length=30, 
    num_beams=5,
    repetition_penalty=30.0
)
new_tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)

['Ванс оямменнай God,  Кати His angels and conatnam Him from the bottom of your feet! ( руководительхнеа) елав']