# Imports

In [1]:
import torch
import transformers
from tokenizers import Tokenizer
from tokenizers import normalizers, pre_tokenizers, models, processors, decoders, trainers
from torch.utils.data import DataLoader
from datasets import load_dataset, total_allocated_bytes

files = [f"wiki.{split}.raw" for split in ["test", "train", "valid"]]

# Hyper parameters

In [2]:
VOCAB_SIZE = 65536
PADDING_SIZE = 512

In [74]:
# Load wikitext-103 dataset
train_ds, val_ds, test_ds = load_dataset('wikitext', 'wikitext-103-raw-v1', split=["train", "validation", "test"])


Reusing dataset wikitext (C:\Users\Marc Bøg\.cache\huggingface\datasets\wikitext\wikitext-103-raw-v1\1.0.0\47c57a6745aa5ce8e16a5355aaa4039e3aa90d1adad87cef1ad4e0f29e74ac91)


In [75]:
# Tokenizer Configuration - Uses Byte Pair Encoding
# NFKD Unicode Normalization, all lowercase
# Split on whitespace, store info about space (metaspace)
# Split digits from words

tokenizer = Tokenizer(models.BPE())
tokenizer.normalizer = normalizers.Sequence([
    normalizers.NFKD(),
    normalizers.Lowercase(),
    #normalizers.Strip("both"),
])
tokenizer.pre_tokenizer = pre_tokenizers.Sequence([
    pre_tokenizers.Whitespace(),
    pre_tokenizers.Metaspace(),
    pre_tokenizers.Digits(individual_digits=False)
])
tokenizer.post_processor = processors.TemplateProcessing(
    single="[CLS] $A [EOS]",
    pair="[CLS] $A [SEP] $B:1 [EOS]:1",
    special_tokens=[("[CLS]", 1), ("[SEP]", 2), ("[EOS]", 3)]
)
tokenizer.decoder = decoders.Metaspace()
tokenizer.enable_padding(
    direction="right",
    length=PADDING_SIZE
)
tokenizer.enable_truncation(
    max_length=PADDING_SIZE
)


In [76]:
trainer = trainers.BpeTrainer(
    vocab_size=VOCAB_SIZE,
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[EOS]", "[PAD]", "[MASK]"],
    show_progress=True
)
tokenizer.train(trainer, files)

In [77]:
tokenizer.model.save('.')

['.\\vocab.json', '.\\merges.txt']

In [78]:
tokenizer.model = models.BPE.from_file("vocab.json", "merges.txt")
encoding = tokenizer.encode_batch(["This is a test string", "and another one"])
print(encoding)

[Encoding(num_tokens=512, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]), Encoding(num_tokens=512, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])]


In [80]:
def tokenize(batch):
    encoded = tokenizer.encode_batch(batch["text"])
    ids = []
    type_ids = []
    attention_mask = []
    special_tokens_mask = []

    for x in encoded:
        ids.append(x.ids)
        type_ids.append(x.type_ids)
        attention_mask.append(x.attention_mask)
        special_tokens_mask.append(x.special_tokens_mask)
    
    return {
        "ids": ids,
        "type_ids": type_ids,
        "attention_mask": attention_mask,
        "special_tokens_mask": special_tokens_mask
    }


ttrain_ds = train_ds.map(tokenize, batched=True)
tval_ds = val_ds.map(tokenize, batched=True)
ttest_ds = test_ds.map(tokenize, batched=True)

100%|██████████| 1802/1802 [05:00<00:00,  6.00ba/s]
100%|██████████| 4/4 [00:00<00:00,  6.08ba/s]
100%|██████████| 5/5 [00:00<00:00,  6.53ba/s]


In [81]:
ttrain_ds.save_to_disk("tokenized_train")
tval_ds.save_to_disk("tokenized_val")
ttest_ds.save_to_disk("tokenized_test")

tokenized_dataset.column_names

['attention_mask', 'ids', 'special_tokens_mask', 'text', 'type_ids']

In [13]:
tokenized_dataset.set_format(type="pt", columns=["ids", "type_ids", "attention_mask", "special_tokens_mask"])

In [14]:
tokenized_dataset.column_names

['attention_mask', 'ids', 'special_tokens_mask', 'text', 'type_ids']

In [15]:
dataloader = DataLoader(tokenized_dataset, batch_size=32)

In [27]:
d = iter(dataloader)
next(d)

{'attention_mask': tensor([[1, 1, 0,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 0,  ..., 0, 0, 0],
         ...,
         [1, 1, 0,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'ids': tensor([[    1,     3,     0,  ...,     0,     0,     0],
         [    1,  4209, 27384,  ...,     0,     0,     0],
         [    1,     3,     0,  ...,     0,     0,     0],
         ...,
         [    1,     3,     0,  ...,     0,     0,     0],
         [    1,  4184,  5054,  ...,     0,     0,     0],
         [    1,  8313,  4404,  ...,     0,     0,     0]]),
 'special_tokens_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
         [1, 0, 0,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         ...,
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 0, 0,  ..., 1, 1, 1],
         [1, 0, 0,  ..., 1, 1, 1]]),
 'type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,

In [71]:
next(d)

{'attention_mask': tensor([[1, 1, 0,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'ids': tensor([[    1,     3,     0,  ...,     0,     0,     0],
         [    1, 13134, 10998,  ...,     0,     0,     0],
         [    1,  4165,  5196,  ...,     0,     0,     0],
         ...,
         [    1,  4165, 29381,  ...,     0,     0,     0],
         [    1,  4165, 15152,  ...,     0,     0,     0],
         [    1,  4162,  9539,  ...,     0,     0,     0]]),
 'special_tokens_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
         [1, 0, 0,  ..., 1, 1, 1],
         [1, 0, 0,  ..., 1, 1, 1],
         ...,
         [1, 0, 0,  ..., 1, 1, 1],
         [1, 0, 0,  ..., 1, 1, 1],
         [1, 0, 0,  ..., 1, 1, 1]]),
 'type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,