In [1]:
from tokenizers import Tokenizer
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from tokenizers.models import BPE
from tokenizers.normalizers import Lowercase, NFKC, Sequence
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.trainers import BpeTrainer
from tokenizers.processors import TemplateProcessing

from datasets import load_dataset

import config

# Train Tokenizer
Train a simple byte-level Byte Pair Encoding (BPE) tokenizer on the files listed in `config.PATH_DATA_FILES`.

In [2]:
# Setup BPE Tokenizer
tokenizer = Tokenizer(BPE())
# Text is NFKC-normalized and lowercased before it is pre-tokenized
tokenizer.normalizer = Sequence([
    NFKC(),
    Lowercase()
])
tokenizer.pre_tokenizer = ByteLevel()
tokenizer.decoder = ByteLevelDecoder()
# Wrap every encoded sequence as "[CLS] ... [EOS]".
# NOTE: the ids 1 and 2 are fixed here before training; they rely on "[CLS]"
# and "[EOS]" being the 2nd and 3rd entries of the trainer's `special_tokens`
# list below. Keep the two lists in sync when editing either one.
tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [EOS]",
    special_tokens=[("[CLS]", 1), ("[EOS]", 2)]
)

# Setup trainer
trainer = BpeTrainer(
    vocab_size=config.VOCAB_SIZE,
    initial_alphabet=ByteLevel.alphabet(),
    special_tokens=["[PAD]", "[CLS]", "[EOS]"],
)

# Train Tokenizer
# Pass `files` and `trainer` by keyword: their positional order was swapped
# between tokenizers releases (`train(trainer, files)` became
# `train(files, trainer=None)`), so a positional call only works on one side
# of that change.
tokenizer.train(files=config.PATH_DATA_FILES, trainer=trainer)

# Pad (on the right) and truncate every encoding to exactly config.SEQ_LEN ids
tokenizer.enable_padding(
    direction="right",
    length=config.SEQ_LEN,
    pad_id=tokenizer.token_to_id("[PAD]")
)
tokenizer.enable_truncation(config.SEQ_LEN)

# Write the trained model's files to config.PATH_DATA; the returned paths are
# kept in `files` for later use
files = tokenizer.model.save(config.PATH_DATA)

In [10]:
# Reload the BPE model from the files returned by `tokenizer.model.save`
tokenizer.model = BPE.from_file(*files)

# Round-trip a sample sentence: text -> encoding -> text
string = "Cat is black, dog is blue"
encoding = tokenizer.encode(string)
decoded = tokenizer.decode(encoding.ids)

# Print every stage of the round trip next to its label
for template, value in (
    ("Original string: {}", string),
    ("Encoded string:\t{}", encoding.tokens),
    ("Vocab index:\t{}", encoding.ids),
    ("Decoded string:\t{}", decoded),
):
    print(template.format(value))

Original string: Cat is black, dog is blue
Encoded string:	['[CLS]', 'Ġcat', 'Ġis', 'Ġblack', ',', 'Ġdog', 'Ġis', 'Ġblue', '[EOS]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PA

In [4]:
# Save tokenizer
# Writes the `tokenizer` object to the path given by config.PATH_TOKENIZER
tokenizer.save(config.PATH_TOKENIZER)

# Tokenize the Dataset

In [5]:
def tokenize(batch):
    """Encode a batch of texts and record each sequence's un-padded length.

    Args:
        batch: Mapping with a "text" entry holding the strings to encode
            (the batch handed over by `Dataset.map(..., batched=True)`).

    Returns:
        dict with two parallel lists, one entry per input text:
            "ids": the token ids of the encoded text.
            "n": the position of the first "[PAD]" id, i.e. the number of
                ids that come before the padding. When the sequence holds
                no padding id, this is the full length of the sequence.
    """
    # Look the padding id up on the tokenizer instead of assuming it is 0.
    # If "[PAD]" is not in the vocabulary this is None, which is never found
    # in the id list, so the no-padding branch below is taken.
    pad_id = tokenizer.token_to_id("[PAD]")

    ids = []
    n = []
    for encoding in tokenizer.encode_batch(batch["text"]):
        token_ids = encoding.ids
        try:
            length = token_ids.index(pad_id)
        except ValueError:
            # No padding present: every position holds a non-padding id.
            length = len(token_ids)
        ids.append(token_ids)
        n.append(length)

    return {
        "ids": ids,
        "n": n,
    }

In [6]:
# Load the train / validation / test splits of wikitext-103 (raw variant)
wikitext_splits = load_dataset(
    'wikitext',
    'wikitext-103-raw-v1',
    split=["train", "validation", "test"],
)
train_ds, val_ds, test_ds = wikitext_splits

Reusing dataset wikitext (C:\Users\sap98\.cache\huggingface\datasets\wikitext\wikitext-103-raw-v1\1.0.0\47c57a6745aa5ce8e16a5355aaa4039e3aa90d1adad87cef1ad4e0f29e74ac91)


In [7]:
# Every split is tokenized with the same batched `map` settings
map_options = dict(batched=True, batch_size=config.BATCH_SIZE)

ttrain_ds = train_ds.map(tokenize, **map_options)
tval_ds = val_ds.map(tokenize, **map_options)
ttest_ds = test_ds.map(tokenize, **map_options)

100%|██████████| 56293/56293 [06:23<00:00, 146.73ba/s]
100%|██████████| 118/118 [00:00<00:00, 139.81ba/s]
100%|██████████| 137/137 [00:00<00:00, 157.83ba/s]


In [8]:
# Write each tokenized split to its configured location, in train/val/test order
for tokenized_split, target_path in (
    (ttrain_ds, config.PATH_TRAIN_TOK),
    (tval_ds, config.PATH_VAL_TOK),
    (ttest_ds, config.PATH_TEST_TOK),
):
    tokenized_split.save_to_disk(target_path)