# Imports

In [1]:
import torch
import transformers
from tokenizers import Tokenizer
from tokenizers import normalizers, pre_tokenizers, models, processors, decoders, trainers
from torch.utils.data import DataLoader
from datasets import load_dataset, total_allocated_bytes

import config

# Hyper parameters

In [2]:
# Fetch the raw WikiText-103 corpus; passing a list to `split` returns the
# splits in the same order, so they can be unpacked directly.
train_ds, val_ds, test_ds = load_dataset(
    "wikitext",
    "wikitext-103-raw-v1",
    split=["train", "validation", "test"],
)

Old caching folder C:\Users\Marc Bøg\.cache\huggingface\datasets\wikitext\wikitext-103-raw-v1\1.0.0\47c57a6745aa5ce8e16a5355aaa4039e3aa90d1adad87cef1ad4e0f29e74ac91 for dataset wikitext exists but not data were found. Removing it. 
Downloading and preparing dataset wikitext/wikitext-103-raw-v1 (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to C:\Users\Marc Bøg\.cache\huggingface\datasets\wikitext\wikitext-103-raw-v1\1.0.0\47c57a6745aa5ce8e16a5355aaa4039e3aa90d1adad87cef1ad4e0f29e74ac91...
Dataset wikitext downloaded and prepared to C:\Users\Marc Bøg\.cache\huggingface\datasets\wikitext\wikitext-103-raw-v1\1.0.0\47c57a6745aa5ce8e16a5355aaa4039e3aa90d1adad87cef1ad4e0f29e74ac91. Subsequent calls will reuse this data.


In [3]:
# Tokenizer configuration - byte pair encoding (BPE)
#   * normalization: NFKD Unicode normalization, then lowercase
#   * pre-tokenization: split on whitespace, mark word starts with the
#     metaspace symbol, then split runs of digits from words
#
# The post-processor template and the padding setup below both rely on each
# special token's id being its position in this list ([PAD]=0, [CLS]=1,
# [SEP]=2, [EOS]=3), so do not reorder it.
SPECIAL_TOKENS = ["[PAD]", "[CLS]", "[SEP]", "[EOS]", "[UNK]", "[MASK]"]

trainer = trainers.BpeTrainer(
    vocab_size=config.VOCAB_SIZE,
    special_tokens=SPECIAL_TOKENS,
    show_progress=True
)

tokenizer = Tokenizer(models.BPE())
tokenizer.normalizer = normalizers.Sequence([
    normalizers.NFKD(),
    normalizers.Lowercase(),
])
tokenizer.pre_tokenizer = pre_tokenizers.Sequence([
    pre_tokenizers.Whitespace(),
    pre_tokenizers.Metaspace(),
    pre_tokenizers.Digits(individual_digits=False)
])
# Wrap every sequence as "[CLS] ... [EOS]"; pairs are joined with [SEP] and the
# second segment gets type id 1.
tokenizer.post_processor = processors.TemplateProcessing(
    single="[CLS] $A [EOS]",
    pair="[CLS] $A [SEP] $B:1 [EOS]:1",
    special_tokens=[
        (token, SPECIAL_TOKENS.index(token))
        for token in ("[CLS]", "[SEP]", "[EOS]")
    ]
)
# The pre-tokenizer marks word boundaries with the metaspace symbol, so the
# matching decoder is Metaspace. BPEDecoder undoes an end-of-word *suffix*,
# which this pipeline never emits.
tokenizer.decoder = decoders.Metaspace()

# Pad to the longest string in each batch.
# The vocabulary is still empty at this point (training happens in a later
# cell), so tokenizer.token_to_id("[PAD]") cannot resolve the token yet; take
# the id from the special-token order instead.
tokenizer.enable_padding(
    direction="right",
    pad_id=SPECIAL_TOKENS.index("[PAD]"),
    pad_token="[PAD]"
)
# Fixed-length padding / truncation to config.PADDING_SIZE is currently disabled:
# tokenizer.enable_padding(..., length=config.PADDING_SIZE)
# tokenizer.enable_truncation(max_length=config.PADDING_SIZE)

In [4]:
# Train the BPE model on the files listed in config.PATH_DATA_FILES.
# Arguments are passed by keyword because the positional order of
# Tokenizer.train differs between tokenizers releases
# (older: trainer, files; newer: files, trainer).
tokenizer.train(files=config.PATH_DATA_FILES, trainer=trainer)

In [5]:
# Write the trained BPE model to config.PATH_DATA, then rebuild the model from
# the written files so that an unknown-token can be attached to it.
files = tokenizer.model.save(config.PATH_DATA)
vocab_file, merges_file = files
tokenizer.model = models.BPE.from_file(
    vocab_file,
    merges_file,
    unk_token="[UNK]",
)

# Persist the complete tokenizer for later reuse.
tokenizer.save(config.PATH_TOKENIZER)

In [6]:
def tokenize(batch):
    """Encode a batch of raw text rows with the notebook-level `tokenizer`.

    Args:
        batch: Mapping with a "text" entry holding a list of strings.

    Returns:
        dict with two parallel lists, one entry per input string:
        "ids" (token ids) and "attention_mask".
    """
    encoded = tokenizer.encode_batch(batch["text"])

    # Only ids and attention masks are kept; type ids and the special-tokens
    # mask of each encoding are not stored.
    return {
        "ids": [enc.ids for enc in encoded],
        "attention_mask": [enc.attention_mask for enc in encoded],
    }


# Tokenize every split in batches of config.BATCH_SIZE; the results are
# unpacked in the same train / validation / test order as the inputs.
ttrain_ds, tval_ds, ttest_ds = (
    split.map(tokenize, batched=True, batch_size=config.BATCH_SIZE)
    for split in (train_ds, val_ds, test_ds)
)

100%|██████████| 14074/14074 [03:06<00:00, 75.49ba/s]
100%|██████████| 30/30 [00:00<00:00, 80.62ba/s]
100%|██████████| 35/35 [00:00<00:00, 78.20ba/s]


In [7]:
# Write each tokenized split to its configured location.
for tokenized_split, out_path in (
    (ttrain_ds, config.PATH_TRAIN_TOK),
    (tval_ds, config.PATH_VAL_TOK),
    (ttest_ds, config.PATH_TEST_TOK),
):
    tokenized_split.save_to_disk(out_path)

# Show which columns the tokenized validation split carries.
tval_ds.column_names

['attention_mask', 'ids', 'text']

In [27]:
# Expose only the model inputs, as NumPy arrays, when rows are read from the
# training split. This reconfigures ttrain_ds in place.
model_input_columns = ["ids", "attention_mask"]
ttrain_ds.set_format(type="np", columns=model_input_columns)

In [28]:
# Inspect the columns after set_format: 'text' is still listed, so the format
# call restricts what is returned on access rather than dropping columns.
ttrain_ds.column_names

['attention_mask', 'ids', 'text']

In [29]:
# Batch the formatted training split for iteration.
# NOTE(review): rows were padded per map() batch of config.BATCH_SIZE, so
# collation presumably relies on the loader using the same batch size and no
# shuffling - confirm before changing either.
dataloader = DataLoader(
    dataset=ttrain_ds,
    batch_size=config.BATCH_SIZE,
)

In [32]:
# Create an iterator over the loader and display the first batch as a sanity
# check. `d` is kept so later cells can pull further batches from it.
d = iter(dataloader)
next(d)

{'attention_mask': tensor([[1, 1, 0,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 0,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 0,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'ids': tensor([[   1,    3,    0,  ...,    0,    0,    0],
         [   1, 4209, 5310,  ...,    0,    0,    0],
         [   1,    3,    0,  ...,    0,    0,    0],
         ...,
         [   1, 4209, 4209,  ...,    0,    0,    0],
         [   1,    3,    0,  ...,    0,    0,    0],
         [   1, 4165, 5628,  ...,    0,    0,    0]])}

In [75]:
# Pull another batch from the existing iterator `d` and show only its token ids.
# NOTE(review): this advances shared iterator state, so the batch displayed
# depends on how many times next(d) has already been run in this kernel.
next(d)["ids"]

tensor([[   1, 4209, 4209,  ...,    0,    0,    0],
        [   1,    3,    0,  ...,    0,    0,    0],
        [   1, 4165, 4669,  ...,    0,    0,    0],
        ...,
        [   1,    3,    0,  ...,    0,    0,    0],
        [   1, 4209, 4209,  ...,    0,    0,    0],
        [   1,    3,    0,  ...,    0,    0,    0]])