In [71]:
import torch

# Prefer the GPU, but fall back to the CPU so the notebook still runs
# (slowly) on machines without CUDA instead of failing at the first `.to(device)`.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [72]:
# Directory that both the tokenizer and the model are loaded from.
model_path = "./models/m2m100_ru_kbd/"

In [73]:
from transformers import M2M100Tokenizer

# Tokenizer stored alongside the checkpoint in `model_path`.
# NOTE(review): `extra_ids` looks like a T5-tokenizer option; confirm that
# M2M100Tokenizer actually does anything with it before relying on it.
tokenizer = M2M100Tokenizer.from_pretrained(
    model_path,
    extra_ids=0,
)

In [74]:
from transformers import M2M100ForConditionalGeneration

# Load the seq2seq weights from the same directory as the tokenizer.
model = M2M100ForConditionalGeneration.from_pretrained(
    model_path,
)

# Move the weights to the selected device. Left as the cell's last expression,
# so the notebook also echoes the module architecture.
model.to(device)

M2M100ForConditionalGeneration(
  (model): M2M100Model(
    (shared): Embedding(128112, 1024, padding_idx=1)
    (encoder): M2M100Encoder(
      (embed_tokens): Embedding(128112, 1024, padding_idx=1)
      (embed_positions): M2M100SinusoidalPositionalEmbedding()
      (layers): ModuleList(
        (0): M2M100EncoderLayer(
          (self_attn): M2M100Attention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): ReLU()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_norm): LayerNorm((1024,)

In [75]:
def cyrillic_to_latin(text, table_path='../data/kbd cyrillic-latin alphabet table.txt'):
    """Rewrite ``text`` using a ``source:replacement`` table read from disk.

    Every non-blank line of the table has the form ``<source>:<replacement>``.
    The rules are applied to the whole text one after another, in file order,
    so a later rule also sees the output of the rules before it.

    Args:
        text: String to convert.
        table_path: Path of the UTF-8 encoded table. Defaults to the file the
            notebook has always used (judging by its name, a Kabardian
            Cyrillic-to-Latin alphabet table).

    Returns:
        ``text`` with every rule of the table applied.

    Raises:
        OSError: If the table file cannot be opened.
        ValueError: If a non-blank line has no ``:`` separator or an empty
            source part (an empty source would insert the replacement
            between every character of ``text``).
    """
    with open(table_path, 'r', encoding='utf-8') as alphabet_table:
        for line_no, raw_line in enumerate(alphabet_table, start=1):
            line = raw_line.rstrip('\n')

            # Tolerate blank lines (typically a trailing newline at the end
            # of the file) instead of failing on tuple unpacking.
            if not line.strip():
                continue

            # Split on the first colon only, so the replacement itself may
            # contain a colon.
            key, separator, value = line.partition(':')
            if not separator or not key:
                raise ValueError(
                    'Malformed alphabet table line %d in %r: %r'
                    % (line_no, table_path, line)
                )

            text = text.replace(key, value)
    return text

In [76]:
def translate(input_text, src_lang = "ru", n_beams = 2, mdl = model, tgt_lang = "zu"):
    """Translate ``input_text`` with beam search and display the ranked candidates.

    Prints the source tokenization, then renders a left-aligned table with one
    row per returned beam (index starting at 1) and two columns: the decoded
    sentence and its share of the softmax taken over the returned beams'
    sequence scores. These percentages are therefore relative to the returned
    candidates only; they are not absolute translation probabilities.

    Args:
        input_text: Source text to translate.
        src_lang: Language code assigned to ``tokenizer.src_lang``.
        n_beams: Beam width; the same number of candidates is returned.
        mdl: Model whose ``generate`` method is called. Defaults to the
            notebook-level ``model`` (bound when this cell is executed).
        tgt_lang: Language code whose id is forced as the first generated
            token. The default ``"zu"`` is the value this notebook always
            used; presumably the checkpoint reuses that slot for its actual
            target language -- confirm against the training setup.

    Returns:
        None. The result is shown with ``IPython.display.display``.

    NOTE(review): ``sequences_scores`` is read unconditionally; presumably only
    beam-search outputs carry it, so ``n_beams=1`` may fail -- confirm.
    """
    import pandas as pd
    from IPython.display import display

    torch.cuda.empty_cache()

    # Make the tokenizer prefix the input with the source-language token
    # (e.g. __ru__) instead of its default language.
    tokenizer.src_lang = src_lang
    model_inputs = tokenizer(input_text, return_tensors="pt")
    # Follow the device of the model actually used, so passing a custom `mdl`
    # works; fall back to the notebook-level device if it does not expose one.
    model_inputs = model_inputs.to(getattr(mdl, "device", device))

    # Show how the input was split into sub-word tokens.
    print(tokenizer.convert_ids_to_tokens(tokenizer.encode(input_text)))

    gen_tokens = mdl.generate(
        **model_inputs,
        forced_bos_token_id=tokenizer.get_lang_id(tgt_lang),
        num_beams=n_beams,
        num_return_sequences=n_beams,
        max_length=512,
        return_dict_in_generate=True,
        output_scores=True,
    )

    # Normalise the scores across the returned candidates.
    probs = gen_tokens.sequences_scores.softmax(-1).tolist()

    # Build all rows first and create the frame once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    records = [
        {
            'sentences': tokenizer.decode(sequence, skip_special_tokens=True),
            'probabilities': str(round(prob * 100, 2)) + " %",
        }
        for sequence, prob in zip(gen_tokens.sequences, probs)
    ]
    df = pd.DataFrame(
        records,
        columns=['sentences', 'probabilities'],
        index=range(1, len(records) + 1),  # rank candidates from 1, not 0
    )

    display(
        df.style
        .set_properties(**{'text-align': 'left'})
        .set_table_styles([dict(selector='th', props=[('text-align', 'left')])])
    )

In [77]:
# Demo: request the 10 best beams for a single Russian word.
text, src_lang, n_beams = "раствор", "ru", 10

translate(text, src_lang=src_lang, n_beams=n_beams)

['__ru__', '▁ра', 'ство', 'р', '</s>']


Unnamed: 0,sentences,probabilities
1,псынщIэ,13.1 %
2,псынщIэпс,11.03 %
3,Iэщхъуэ,10.81 %
4,раствор,10.22 %
5,пыIэ,9.97 %
6,сэху,9.89 %
7,псыIэщIэ,9.35 %
8,псынщIэплъ,8.61 %
9,зэрыкIуэ,8.57 %
10,псыIэщI,8.45 %


In [78]:
import ipywidgets as widgets
# Turn the tokenizer's additional special tokens into bare codes by dropping
# every underscore, and offer them in a dropdown.
langs = list(
    map(
        lambda special_token: special_token.replace('_', ''),
        tokenizer.additional_special_tokens,
    )
)
lang_picker = widgets.Dropdown(value='en', options=langs)

# Bare last expression so the notebook renders the widget.
lang_picker

Dropdown(index=18, options=('af', 'am', 'ar', 'ast', 'az', 'ba', 'be', 'bg', 'bn', 'br', 'bs', 'ca', 'ceb', 'c…