# Dev

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
base_path = "/code/huggingface/transformers-fair-wmt"
sys.path.insert(0, base_path + "/src")

In [3]:
from pprint import pprint

In [4]:
from transformers.tokenization_fsmt import FSMTTokenizer

# fairseq tokenizer

hub_utils.py:
        
    def encode(self, sentence: str) -> torch.LongTensor:
        sentence = self.tokenize(sentence)
        sentence = self.apply_bpe(sentence)
        return self.binarize(sentence)

    def decode(self, tokens: torch.LongTensor) -> str:
        sentence = self.string(tokens)
        sentence = self.remove_bpe(sentence)
        return self.detokenize(sentence)

    def tokenize(self, sentence: str) -> str:
        if self.tokenizer is not None:
            sentence = self.tokenizer.encode(sentence)
        return sentence

    def detokenize(self, sentence: str) -> str:
        if self.tokenizer is not None:
            sentence = self.tokenizer.decode(sentence)
        return sentence

    def apply_bpe(self, sentence: str) -> str:
        if self.bpe is not None:
            sentence = self.bpe.encode(sentence)
        return sentence

    def remove_bpe(self, sentence: str) -> str:
        if self.bpe is not None:
            sentence = self.bpe.decode(sentence)
        return sentence

    def binarize(self, sentence: str) -> torch.LongTensor:
        return self.src_dict.encode_line(sentence, add_if_not_exist=False).long()

    def string(self, tokens: torch.LongTensor) -> str:
        return self.tgt_dict.string(tokens)       
        
1. tokenize using tokenizer.perl from mosesdecoder https://github.com/moses-smt/mosesdecoder



2. apply_bpe.py script using the wmt14.en-fr.fconv-cuda/bpecodes file
https://github.com/rsennrich/subword-nmt/blob/master/subword_nmt/apply_bpe.py

fairseq/data/encoders/fastbpe.py uses fastBPE
loads "model/bpecodes" file


3. binarize

somehow need to re-use bpe codes:https://github.com/facebookresearch/XLM#download--preprocess-data

> If you want to use our pretrained models, you need to have an exactly identical vocabulary. Since small differences can happen during preprocessing, we recommend that you use our BPE codes and vocabulary (although you should get something almost identical if you learn the codes and compute the vocabulary yourself).


# fastBPE

In [5]:
line = "Machine Learning"

In [6]:
import fastBPE
import fairseq
bpe_codes = base_path + '/data/wmt19.ru-en.ensemble/bpecodes'
codes = fairseq.file_utils.cached_path(bpe_codes)
codes
bpe = fastBPE.fastBPE(codes)
bpe
bpe_symbol = "@@ "
def encode(x: str) -> str: return bpe.apply([x])[0]
def decode(x: str) -> str: return (x + ' ').replace(bpe_symbol, '').rstrip()
encoded = encode("Machine Learning")
encoded
decoded = decode(encoded)
decoded

'/code/huggingface/transformers-fair-wmt/data/wmt19.ru-en.ensemble/bpecodes'

<fastBPE.fastBPE object at 0x7f0f85318450>

'Mach@@ ine L@@ ear@@ ning'

'Machine Learning'

In [7]:
from transformers import XLMTokenizer, CTRLTokenizer
tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-100-1280")
tokenizer.tokenize(line)
tokenizer = CTRLTokenizer.from_pretrained("xlm-mlm-100-1280")
tokenizer.tokenize(line)


['Machine</w>', 'Le', 'arning</w>']

['M@@', 'a@@', 'c@@', 'h@@', 'i@@', 'n@@', 'e', 'L@@', 'e@@', 'a@@', 'r@@', 'n@@', 'i@@', 'n@@', 'g']

# Encode Process

## Load model via fairseq

In [8]:
from fairseq import hub_utils
checkpoint_file = 'model1.pt:model2.pt:model3.pt:model4.pt'
model_name_or_path = 'transformer.wmt19.ru-en'
data_name_or_path = '.'
cls = fairseq.model_parallel.models.transformer.ModelParallelTransformerModel
models = cls.hub_models()
kwargs = {'bpe': 'fastbpe', 'tokenizer': 'moses'}
x = hub_utils.from_pretrained(
            model_name_or_path,
            checkpoint_file,
            data_name_or_path,
            archive_map=models,
            **kwargs
        )

## Tokenize

In [9]:
args = x["args"] # loaded from the checkpoint earlier
from fairseq.data.encoders.moses_tokenizer import MosesTokenizer
#from argparse import Namespace
#args = argparse.Namespace(moses_source_lang='ru_RU', moses_target_lang='en_XX')
# fairseq.data.encoders.moses_tokenizer.MosesTokenizer
sentence = "Машинное обучение - это здорово!"
tokenizer = MosesTokenizer(args)
sentence = tokenizer.encode(sentence)
sentence

# well it mainly just adds whitespace where needed

'Машинное обучение - это здорово !'

## Apply BPE

In [10]:
from fairseq.data.encoders.fastbpe import fastBPE
bpe = fastBPE(args)
sentence = bpe.encode(sentence)
sentence


'Ма@@ шин@@ ное обучение - это здорово !'

## Binarize (bytecodes into numbers)

In [11]:
from fairseq.data.dictionary import Dictionary

filename = '/home/stas/.cache/torch/pytorch_fairseq/1f635f61a93197b2be015bcdc60f47829db1172b5b81547d72b7b28235b28fa9.5e1a2ea19b51d6733a96ed885e05dc2d3d1e3cb3573b9e62ee7acfb0454ee976/dict.ru.txt'
#cls = fairseq.tasks.translation.TranslationTask

#src_dict = Dictionary(args)
src_dict = Dictionary.load(filename)
#dir(src_dict)

# self.binarize(sentence) equivalent
src_dict.encode_line(sentence, add_if_not_exist=False).long()
# setup_task loads both dicts

# this looks up the tokenized bpe strings in the dict and replaces them with the dict indices
src_dict.index("Ма@@")
src_dict.index("обучение@@") # unk (this token doesn't exist in the dict)

# the vocab is this dict:
# src_dict.indices
# can be fed directly to json.dumps => vocab.json

tensor([  648, 13440,    97,  3618,    25,    74, 22548,   384,     2])

648

3

In [12]:
src_dict.indices.keys()

dict_keys(['<s>', '<pad>', '</s>', '<unk>', ',', '.', 'и', 'в', 'на', ')', ':', 'по', 'с', '/', '@-@', '(', '&quot;', 'для', 'что', 'о', 'не', 'к', 'В', 'года', ';', '-', 'или', 'а', 'также', 'от', 'за', 'из', '*', 'как', '1', 'Наций', 'Объединенных', 'Организации', '&gt;', 'том', 'их', '&lt;', '2', 'его', 'которые', 'не@@', '3', 'чтобы', 'области', 'того', '10', 'ные', 'ных', 'при', '4', 'развития', 'человека', 'то', 'будет', 'ной', 'до', 'но', 'более', '5', 'я', 'году', 'ного', 'отношении', 'этой', 'связи', 'ным', '#', 'деятельности', 'вопросам', 'это', 'ее', '12', 'бы', 'прав', 'все', 'всех', 'является', '6', 'было', '00', 'во', 'соответствии', 'ли', 'были', 'об', 'н', '11', 'женщин', 'время', '7', 'между', 'ный', 'ное', 'может', 'рамках', '?', '15', 'a', 'быть', 'тем', 'т', 'ными', 'стран', 'со', 'мы', '8', '2@@', 'он', 'у@@', 'целях', '9', 'резолюции', 'работы', 'этого', 'у', 'Комитет', 'A', 'ном', 'сессии', '20', 'С@@', 'был', '14', 'США', '3@@', 'системы', 'они', 'ная', 'та', 'д

### Remap BPE encoding

In [13]:
import re

In [14]:
x = "\u8de8@@"
re.sub(r'@@', '</w>', x)

'跨</w>'

In [15]:
d = {'le@@': 5, 'tt@@': 6, 'er': 7}
d

{'le@@': 5, 'tt@@': 6, 'er': 7}

In [16]:
d2 = dict((re.sub(r'@@', '', k), v) if k.endswith('@@') else (re.sub(r'$', '</w>', k), v) for k, v in d.items())
d2

{'le': 5, 'tt': 6, 'er</w>': 7}