In [11]:
import json
import logging
import math
import os
from dataclasses import dataclass, field
from pathlib import Path
from typing import Optional

import torch
from tokenizers import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
from torch.utils.data import Dataset
from transformers import (
    CONFIG_MAPPING,
    MODEL_WITH_LM_HEAD_MAPPING,
    AutoConfig,
    AutoModelWithLMHead,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    HfArgumentParser,
    LineByLineTextDataset,
    pipeline,
    PreTrainedTokenizer,
    TextDataset,
    Trainer,
    TrainingArguments,
    set_seed,
)

## Train a tokenizer

In [2]:
# Recursively gather every .txt file below ./eo_data/ as a plain string path.
paths = list(map(str, Path("./eo_data/").glob("**/*.txt")))

We choose a byte-level BPE tokenizer so that every word can be decomposed into known tokens (no `<unk>` tokens are ever needed).

In [3]:
# Start from an untrained byte-level BPE tokenizer.
tokenizer = ByteLevelBPETokenizer()

# Customize training. The special tokens are listed first so they receive the
# lowest ids, in this order (<s>=0, <pad>=1, </s>=2, <unk>=3, <mask>=4).
tokenizer.train(
    files=paths,
    vocab_size=52_000,
    min_frequency=2,
    special_tokens=[
        "<s>",
        "<pad>",
        "</s>",
        "<unk>",
        "<mask>",
    ],
)

# Save files to disk as ./esperberto-vocab.json and ./esperberto-merges.txt.
# Newer `tokenizers` releases expose the (directory, prefix) form as
# `save_model` and reserve `save` for a single-file JSON dump, so use
# `save_model` when it exists and fall back to the legacy `save` otherwise.
# Kept as the last expression so the list of written files is still displayed.
getattr(tokenizer, "save_model", tokenizer.save)(".", "esperberto")

['./esperberto-vocab.json', './esperberto-merges.txt']

In [4]:
# Inspect the learned vocabulary (a token -> id mapping).
# The vocab holds non-ASCII byte-level symbols, so read it explicitly as UTF-8
# instead of relying on the platform's default encoding.
with open('./esperberto-vocab.json', encoding='utf-8') as f:
    data = json.load(f)
print(data)

{'<s>': 0, '<pad>': 1, '</s>': 2, '<unk>': 3, '<mask>': 4, '!': 5, '"': 6, '#': 7, '$': 8, '%': 9, '&': 10, "'": 11, '(': 12, ')': 13, '*': 14, '+': 15, ',': 16, '-': 17, '.': 18, '/': 19, '0': 20, '1': 21, '2': 22, '3': 23, '4': 24, '5': 25, '6': 26, '7': 27, '8': 28, '9': 29, ':': 30, ';': 31, '<': 32, '=': 33, '>': 34, '?': 35, '@': 36, 'A': 37, 'B': 38, 'C': 39, 'D': 40, 'E': 41, 'F': 42, 'G': 43, 'H': 44, 'I': 45, 'J': 46, 'K': 47, 'L': 48, 'M': 49, 'N': 50, 'O': 51, 'P': 52, 'Q': 53, 'R': 54, 'S': 55, 'T': 56, 'U': 57, 'V': 58, 'W': 59, 'X': 60, 'Y': 61, 'Z': 62, '[': 63, '\\': 64, ']': 65, '^': 66, '_': 67, '`': 68, 'a': 69, 'b': 70, 'c': 71, 'd': 72, 'e': 73, 'f': 74, 'g': 75, 'h': 76, 'i': 77, 'j': 78, 'k': 79, 'l': 80, 'm': 81, 'n': 82, 'o': 83, 'p': 84, 'q': 85, 'r': 86, 's': 87, 't': 88, 'u': 89, 'v': 90, 'w': 91, 'x': 92, 'y': 93, 'z': 94, '{': 95, '|': 96, '}': 97, '~': 98, '¡': 99, '¢': 100, '£': 101, '¤': 102, '¥': 103, '¦': 104, '§': 105, '¨': 106, '©': 107, 'ª': 108, 

How to apply the tokenizer:

In [5]:
# Rebuild the tokenizer from the vocab/merges files saved on disk.
tokenizer = ByteLevelBPETokenizer("./esperberto-vocab.json", "./esperberto-merges.txt")

# Attach a BertProcessing post-processor built from the ids of "</s>" and "<s>",
# passed in that order as (token, id) pairs.
end_pair = ("</s>", tokenizer.token_to_id("</s>"))
start_pair = ("<s>", tokenizer.token_to_id("<s>"))
tokenizer._tokenizer.post_processor = BertProcessing(end_pair, start_pair)

# Cap every encoded sequence at 512 tokens.
tokenizer.enable_truncation(max_length=512)

# Encode a sample Esperanto sentence and show the resulting Encoding object.
print(tokenizer.encode("Mi estas Julien."))

Encoding(num_tokens=7, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])


## Train from Scratch

In [14]:
class EsperantoDataset(Dataset):
    """Line-by-line dataset of token ids for language-model training.

    Every non-blank line of each matching text file becomes one example: a
    list of token ids produced by a byte-level BPE tokenizer with a
    BertProcessing post-processor attached. Examples are returned unpadded;
    padding is expected to happen at the batch level.

    Args:
        evaluate: If True, read files matching ``*-eval.txt``; otherwise read
            files matching ``*-train.txt``.
        vocab_file: Path to the tokenizer vocabulary JSON file.
        merges_file: Path to the tokenizer merges file.
        data_dir: Directory searched (non-recursively) for the text files.
        max_length: Maximum number of tokens kept per example (truncation).
    """

    def __init__(
        self,
        evaluate: bool = False,
        vocab_file: str = "vocab.json",
        merges_file: str = "merges.txt",
        data_dir: str = "./data/",
        max_length: int = 512,
    ):
        tokenizer = ByteLevelBPETokenizer(
            vocab_file,
            merges_file,
        )
        tokenizer._tokenizer.post_processor = BertProcessing(
            ("</s>", tokenizer.token_to_id("</s>")),
            ("<s>", tokenizer.token_to_id("<s>")),
        )
        tokenizer.enable_truncation(max_length=max_length)
        # or use the RobertaTokenizer from `transformers` directly.

        self.examples = []

        pattern = "*-eval.txt" if evaluate else "*-train.txt"
        # glob() order depends on the filesystem; sort so that example order
        # is reproducible across runs and machines.
        for src_file in sorted(Path(data_dir).glob(pattern)):
            print("file:", src_file)
            # Drop empty / whitespace-only lines: they carry no text and would
            # otherwise become examples made only of special tokens.
            lines = [
                line
                for line in src_file.read_text(encoding="utf-8").splitlines()
                if line and not line.isspace()
            ]
            self.examples += [x.ids for x in tokenizer.encode_batch(lines)]

    def __len__(self):
        """Return the number of encoded examples."""
        return len(self.examples)

    def __getitem__(self, i):
        """Return example ``i`` as a 1-D tensor of token ids (unpadded)."""
        # We'll pad at the batch level.
        # Explicit dtype keeps ids as int64 regardless of the list's contents.
        return torch.tensor(self.examples[i], dtype=torch.long)

## Check that the LM actually trained

In [None]:
# Directory holding the trained model and tokenizer files. Set the
# ESPERBERTO_MODEL_DIR environment variable to point somewhere else instead of
# editing this cell; the fallback is the path originally hard-coded here.
model_dir = os.environ.get(
    "ESPERBERTO_MODEL_DIR",
    "/Users/jingoy/Documents/2020 Winter/emr",
)

fill_mask = pipeline(
    "fill-mask",
    model=model_dir,
    tokenizer=model_dir,
)

# "La suno <mask>." means "The sun <mask>."
result = fill_mask("La suno <mask>.")

# Display the predictions so the check is actually visible in the notebook.
result
