# Imports

In [1]:
import torch
import transformers
from tokenizers import Tokenizer
from tokenizers import normalizers, pre_tokenizers, models, processors, decoders, trainers
from torch.utils.data import DataLoader
from datasets import load_dataset, total_allocated_bytes

import config

# Train Tokenizer

Train a WordPiece tokenizer with the following characteristics:
- Normalizes text with NFKD and lowercasing
- Splits on whitespace
- Splits digits from words
- Metaspace pre-tokenization (which would store information about spaces) is currently disabled

In [3]:
# Configure the WordPiece trainer.
# Keep the special-token order stable: the post-processing template hard-codes
# [CLS]=1, [SEP]=2 and [EOS]=3, which are the positions of those tokens in this
# list (presumably ids are assigned in list order -- confirm after training).
trainer = trainers.WordPieceTrainer(
    special_tokens=[
        "[PAD]",
        "[CLS]",
        "[SEP]",
        "[EOS]",
        "[UNK]",
        "[MASK]",
    ],
    vocab_size=config.VOCAB_SIZE,
)

In [4]:
# Build an untrained WordPiece tokenizer and attach its pipeline stages.
tokenizer = Tokenizer(models.WordPiece())

# Decoder: WordPiece decoder, used when turning tokens back into text.
tokenizer.decoder = decoders.WordPiece()

# Normalization: NFKD Unicode normalization followed by lowercasing.
tokenizer.normalizer = normalizers.Sequence(
    [normalizers.NFKD(), normalizers.Lowercase()]
)

# Pre-tokenization: whitespace splitting, then digits are separated from the
# rest (individual_digits=False). Metaspace is not part of this pipeline.
tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
    [
        pre_tokenizers.Whitespace(),
        pre_tokenizers.Digits(individual_digits=False),
    ]
)

# Post-processing: wrap a single sequence as "[CLS] ... [EOS]" and a pair as
# "[CLS] A [SEP] B [EOS]" (second segment marked with type id 1).
# The ids below are hard-coded and must match the special tokens' vocabulary ids.
tokenizer.post_processor = processors.TemplateProcessing(
    special_tokens=[("[CLS]", 1), ("[SEP]", 2), ("[EOS]", 3)],
    single="[CLS] $A [EOS]",
    pair="[CLS] $A [SEP] $B:1 [EOS]:1",
)

In [5]:
# Train the tokenizer on the files listed in config.PATH_DATA_FILES.
# Arguments are passed by keyword because the positional order of
# `Tokenizer.train` is not the same in every `tokenizers` release
# (newer releases define `train(files, trainer=None)`).
tokenizer.train(files=config.PATH_DATA_FILES, trainer=trainer)

In [6]:
# Give every encoding exactly config.SEQ_LEN tokens: longer inputs are
# truncated, shorter ones are padded on the right with [PAD].
# (This is a fixed length, not "longest sequence in the batch".)
tokenizer.enable_truncation(max_length=config.SEQ_LEN)
tokenizer.enable_padding(
    pad_id=tokenizer.token_to_id("[PAD]"),
    length=config.SEQ_LEN,
    direction="right",
)

In [7]:
# Write the trained model's files under config.PATH_DATA, then rebuild the
# model from those files with an explicit unknown token.
# NOTE(review): presumably needed because the freshly trained model does not
# carry unk_token="[UNK]" on its own -- confirm against the installed
# `tokenizers` version.
files = tokenizer.model.save(config.PATH_DATA)
tokenizer.model = models.WordPiece.from_file(*files, unk_token="[UNK]")
# Save the tokenizer itself to config.PATH_TOKENIZER.
tokenizer.save(config.PATH_TOKENIZER)

# Tokenize Dataset

In [2]:
# Fetch the raw WikiText-103 corpus. One dataset is unpacked per requested
# split, in the order the splits are listed.
train_ds, val_ds, test_ds = load_dataset(
    "wikitext",
    "wikitext-103-raw-v1",
    split=["train", "validation", "test"],
)

Reusing dataset wikitext (C:\Users\sap98\.cache\huggingface\datasets\wikitext\wikitext-103-raw-v1\1.0.0\47c57a6745aa5ce8e16a5355aaa4039e3aa90d1adad87cef1ad4e0f29e74ac91)


In [8]:
def tokenize(batch):
    """Encode a batch of raw text and record each sequence's unpadded length.

    Written to be used with ``Dataset.map(..., batched=True)``.

    Args:
        batch: Mapping with a ``"text"`` entry holding the strings to encode.

    Returns:
        dict with two lists, one entry per input string:
        - ``"ids"``: token ids produced by the module-level ``tokenizer``
          (including any padding it applies).
        - ``"n"``: number of non-padding tokens in the encoding.
    """
    encodings = tokenizer.encode_batch(batch["text"])

    # The attention mask is 1 for real tokens and 0 for padding, so its sum is
    # the unpadded length. This avoids searching for a hard-coded pad id of 0,
    # which miscounts if a real token has id 0 or if [PAD] is not id 0.
    return {
        "ids": [enc.ids for enc in encodings],
        "n": [sum(enc.attention_mask) for enc in encodings],
    }

In [9]:
# Run `tokenize` over each split in batches of config.BATCH_SIZE
# (train first, then validation, then test).
ttrain_ds, tval_ds, ttest_ds = [
    split.map(tokenize, batched=True, batch_size=config.BATCH_SIZE)
    for split in (train_ds, val_ds, test_ds)
]

100%|██████████| 56293/56293 [05:37<00:00, 166.97ba/s]
100%|██████████| 118/118 [00:00<00:00, 169.30ba/s]
100%|██████████| 137/137 [00:00<00:00, 152.05ba/s]


In [10]:
# Persist each tokenized split to its configured location on disk.
for tokenized_split, out_path in (
    (ttrain_ds, config.PATH_TRAIN_TOK),
    (tval_ds, config.PATH_VAL_TOK),
    (ttest_ds, config.PATH_TEST_TOK),
):
    tokenized_split.save_to_disk(out_path)

# Show which columns the tokenized validation split now has.
tval_ds.column_names

['ids', 'n', 'text']

# Test Tokenizer