Reference: [Hugging Face notebook on training a tokenizer from scratch (Colab)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/tokenizer_training.ipynb#scrollTo=_Iq-_dtMSM9L)

In [3]:
from datasets import load_dataset
# Load only the "train" split of the raw WikiText-2 configuration.
dataset = load_dataset("wikitext", name="wikitext-2-raw-v1", split="train")
# Bare last expression so the notebook displays the Dataset summary (features, num_rows).
dataset

Downloading readme:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/733k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/6.36M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

Dataset({
    features: ['text'],
    num_rows: 36718
})

In [4]:
# Peek at the first two rows; slicing returns a dict mapping the "text" column to a list of strings.
dataset[:2]

{'text': ['', ' = Valkyria Chronicles III = \n']}

The Datasets library keeps the elements on disk and only loads them into memory when they are requested. To avoid pulling everything into memory at once, we define a Python generator that yields the texts one batch at a time. This is particularly useful if you have a huge dataset:

In [5]:

# Number of rows pulled from `dataset` per batch; also read by batch_iterator().
batch_size = 1000
# Eager variant: builds the complete list of text batches up front.
# NOTE(review): this materializes every batch in memory, which is what the lazy
# batch_iterator() below is meant to avoid, and no later visible cell reads
# `all_texts` -- confirm it is still needed before keeping it.
all_texts = [dataset[i : i + batch_size]["text"] for i in range(0, len(dataset), batch_size)]

def batch_iterator():
    """Lazily yield the "text" column of `dataset` in consecutive slices.

    Each yielded item is whatever `dataset[start:stop]["text"]` returns for a
    window of at most `batch_size` rows; the last window may be shorter.
    Both `dataset` and `batch_size` are read from the notebook's globals.
    """
    window_starts = range(0, len(dataset), batch_size)
    for start in window_starts:
        stop = start + batch_size
        yield dataset[start:stop]["text"]

# Now let's build a tokenizer from scratch!  
The cells below initialize and configure a tokenizer using the `tokenizers` library.

The tokenizer is set up with the following components:
- Model: WordPiece with an unknown token "[UNK]".
- Normalizer: A sequence of normalizers including:
    - NFD (Normalization Form D)
    - Lowercase conversion
    - Strip accents
- Pre-tokenizer: BertPreTokenizer

Modules used:
- `decoders`: For decoding tokenized sequences.
- `models`: For defining the tokenization model.
- `normalizers`: For normalizing text before tokenization.
- `pre_tokenizers`: For pre-tokenizing text.
- `processors`: For post-processing tokenized sequences.
- `trainers`: For training the tokenizer model.
- `Tokenizer`: The main class for tokenization.

In [26]:
from tokenizers import decoders, models, normalizers, pre_tokenizers, processors, trainers, Tokenizer

# Start from an untrained WordPiece model that uses "[UNK]" as its unknown token.
tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))

# Normalization pipeline applied to raw text before pre-tokenization:
# Unicode NFD decomposition, then lowercasing, then accent stripping.
# Only one normalizer can be active at a time, so the Sequence is assigned
# directly; a BertNormalizer assigned just before it would be overwritten
# and never take effect.
tokenizer.normalizer = normalizers.Sequence(
    [normalizers.NFD(), normalizers.Lowercase(), normalizers.StripAccents()]
)

# Use the BERT pre-tokenizer to cut normalized text into word/punctuation pieces.
tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()

In [27]:
# Try the pre-tokenizer on its own: it returns (piece, (start, end)) pairs.
# Casing is preserved here, as the output shows ('This').
tokenizer.pre_tokenizer.pre_tokenize_str("This is an example!")

[('This', (0, 4)),
 ('is', (5, 7)),
 ('an', (8, 10)),
 ('example', (11, 18)),
 ('!', (18, 19))]

In [28]:
# Reserved tokens, in the order they are handed to the trainer.
special_tokens = [
    "[UNK]",
    "[PAD]",
    "[CLS]",
    "[SEP]",
    "[MASK]",
]
# WordPiece trainer targeting a vocabulary of 25,000 entries.
trainer = trainers.WordPieceTrainer(special_tokens=special_tokens, vocab_size=25000)

In [32]:
# Train the tokenizer, feeding it text batches from the generator instead of one big list.
tokenizer.train_from_iterator(batch_iterator(), trainer=trainer)






In [33]:
# Look up the vocabulary ids the trained tokenizer assigned to "[CLS]" and "[SEP]".
cls_token_id, sep_token_id = (
    tokenizer.token_to_id(token) for token in ("[CLS]", "[SEP]")
)
print(cls_token_id, sep_token_id)

2 3


In [35]:
# Wrap every encoding in BERT-style special tokens:
#   single sentence -> [CLS] A [SEP]           (everything gets type id 0)
#   sentence pair   -> [CLS] A [SEP] B [SEP]   (B and its closing [SEP] get type id 1)
# The templates are plain string literals: "$A"/"$B" are TemplateProcessing
# placeholders, not Python interpolation, so an f-string prefix is unnecessary.
tokenizer.post_processor = processors.TemplateProcessing(
    single="[CLS]:0 $A:0 [SEP]:0",
    pair="[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1",
    # Each special token named in the templates is paired with its vocabulary id.
    special_tokens=[
        ("[CLS]", cls_token_id),
        ("[SEP]", sep_token_id),
    ],
)

In [36]:
# Encode a sentence pair; the output shows [CLS]/[SEP] inserted around the two lowercased sentences.
encoding = tokenizer.encode("This is one sentence.", "With this one we have a pair.")
encoding.tokens

['[CLS]',
 'this',
 'is',
 'one',
 'sentence',
 '.',
 '[SEP]',
 'with',
 'this',
 'one',
 'we',
 'have',
 'a',
 'pair',
 '.',
 '[SEP]']

In [37]:
# Segment ids per token: 0 through the first [SEP], 1 for the second sentence and its [SEP].
encoding.type_ids

[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]

In [38]:
# Attach a WordPiece decoder configured with "##" as the sub-word prefix.
tokenizer.decoder = decoders.WordPiece(prefix="##")

In [None]:
from transformers import BertTokenizerFast

# Wrap the trained `tokenizers` object in a transformers BertTokenizerFast.
new_tokenizer = BertTokenizerFast(tokenizer_object=tokenizer)
# Optional: uncomment to persist the tokenizer under the name "my-new-tokenizer".
# new_tokenizer.save_pretrained("my-new-tokenizer")