In [23]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.normalizers import BertNormalizer

In [24]:
# Special tokens handed to the trainer; "[UNK]" is also the BPE model's
# fallback for input it cannot represent.
special_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]

bpe_model = BPE(unk_token="[UNK]")
tokenizer = Tokenizer(bpe_model)
trainer = BpeTrainer(special_tokens=special_tokens)

In [25]:
# Use the library's `Whitespace` pre-tokenizer as the splitting step that runs
# before the BPE model (in the recorded encode output further down, punctuation
# marks come out as separate tokens).
tokenizer.pre_tokenizer = Whitespace()

In [26]:
# BERT-style text normalisation applied to every input before pre-tokenization.
tokenizer.normalizer = BertNormalizer(
    clean_text=True,
    # NOTE(review): accent stripping is applied to the German file as well, so
    # umlauts are presumably folded (e.g. "ü" -> "u") — confirm this is intended.
    strip_accents=True,
    lowercase=True,
)

In [27]:
# Learn one shared vocabulary from the `.en` and `.de` MultiUN files.
# NOTE(review): these are absolute, machine-specific paths.
corpus_files = [
    "/Users/sidbaskaran/code/ml-sandbox/transformer/de-en.txt/MultiUN.de-en.en",
    "/Users/sidbaskaran/code/ml-sandbox/transformer/de-en.txt/MultiUN.de-en.de",
]
tokenizer.train(corpus_files, trainer)






In [28]:
# Post-processing: wrap every encoding in [CLS] ... [SEP]; for sentence pairs the
# second segment and its closing [SEP] are given type id 1.
from tokenizers.processors import TemplateProcessing

# Look the ids up from the trained vocabulary instead of hard-coding them.
cls_id = tokenizer.token_to_id("[CLS]")
sep_id = tokenizer.token_to_id("[SEP]")

bert_template = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[("[CLS]", cls_id), ("[SEP]", sep_id)],
)
tokenizer.post_processor = bert_template

In [29]:
# Write the trained tokenizer (model plus the pipeline configured above) to JSON.
tokenizer_json_path = "/Users/sidbaskaran/code/ml-sandbox/transformer/de-en.txt/tokenizer-multiun-ende.json"
tokenizer.save(tokenizer_json_path)

In [30]:
# Smoke test: encode a sentence with mixed case, punctuation and an emoji.
sample_text = "Hello, y'all! How are you 😁 ?"
output = tokenizer.encode(sample_text)

In [31]:
# Show the token strings of the sample encoding. In the recorded output the text
# is lower-cased, wrapped in [CLS]/[SEP], and the emoji comes out as [UNK].
output.tokens

['[CLS]',
 'h',
 'ello',
 ',',
 'y',
 "'",
 'all',
 '!',
 'how',
 'are',
 'you',
 '[UNK]',
 '?',
 '[SEP]']

In [56]:
from datasets import load_dataset, DatasetDict
from transformers import PreTrainedTokenizerFast
import pathlib

In [57]:
# Wrap the trained `tokenizers.Tokenizer` so it can be called through the
# Transformers tokenizer API (used below to produce input_ids / attention_mask).
# The special-token roles are declared explicitly: without them the wrapper has no
# pad/unk/cls/sep/mask token set, and e.g. any call with `padding=True` fails
# because no padding token is defined. The strings match the tokens given to the
# trainer, so no new vocabulary entries are created.
wrapped_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    unk_token="[UNK]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    pad_token="[PAD]",
    mask_token="[MASK]",
)

In [32]:
# Load the raw corpus with the generic "text" builder; rows are read back below
# through their "text" column.
# NOTE(review): no `data_files=` (or `data_dir=`) argument is passed here, yet the
# recorded output shows one data file being prepared — the argument appears to
# have been removed after the cell was run. TODO: restore the path so this cell
# works on a fresh kernel.
raw_dataset = load_dataset("text", )

Downloading and preparing dataset text/default to /Users/sidbaskaran/.cache/huggingface/datasets/text/default-8760a2d7ed688483/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2...


Downloading data files: 100%|██████████| 1/1 [00:00<00:00, 3187.16it/s]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 316.96it/s]
                                                                     

Dataset text downloaded and prepared to /Users/sidbaskaran/.cache/huggingface/datasets/text/default-8760a2d7ed688483/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2. Subsequent calls will reuse this data.


100%|██████████| 1/1 [00:00<00:00, 247.47it/s]


In [58]:
# Sanity check: run the wrapped tokenizer on the first row of the training split.
first_line = raw_dataset["train"][0]["text"]
wrapped_tokenizer(first_line)

{'input_ids': [1, 207, 983, 262, 105, 193, 444, 2], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

In [59]:
def _tokenize_batch(batch):
    """Tokenize the "text" column of one batch handed over by `Dataset.map`.

    With `batched=True` the argument holds a list of strings under "text", not a
    single example; the wrapped tokenizer encodes the whole list in one call.
    """
    return wrapped_tokenizer(batch["text"])


# Adds the tokenizer's output columns alongside the original "text" column.
tokenized_dataset = raw_dataset["train"].map(_tokenize_batch, batched=True)

                                                                      

In [61]:
# Fixed seed so the shuffled splits contain the same rows on every run; without
# it each execution produces a different train/test/valid partition.
SPLIT_SEED = 42

# Hold out 10% of the rows, then cut that hold-out in half: 90% train, 5% test,
# 5% validation.
train_testvalid = tokenized_dataset.train_test_split(test_size=0.1, seed=SPLIT_SEED)
test_valid = train_testvalid["test"].train_test_split(test_size=0.5, seed=SPLIT_SEED)

split_dict = DatasetDict({
    "train": train_testvalid["train"],
    "test": test_valid["test"],
    # The "train" half of the second split serves as the validation set.
    "valid": test_valid["train"],
})

In [62]:
# Inspect the held-out test split (its columns and row count).
split_dict["test"]

Dataset({
    features: ['text', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 16299
})