<a href="https://colab.research.google.com/github/pranavsaranaway/NLP-Implementations/blob/main/Creating_A_Tokenizer_From_Scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers datasets tokenizers
!pip install -U datasets huggingface_hub fsspec

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting huggingface_hub
  Downloading huggingface_hub-0.33.2-py3-none-any.whl.metadata (14 kB)
Collecting fsspec
  Downloading fsspec-2025.5.1-py3-none-any.whl.metadata (11 kB)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading huggingface_hub-0.33.2-py3-none-any.whl (515 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m515.4/515.4 kB[0m [31m35.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, huggingface_hub, datasets
  Attempting uninstall: fsspec
    Found existing installat

In [2]:
from datasets import load_dataset

dataset = load_dataset("wikitext", name="wikitext-2-raw-v1", split="train")


def get_training_corpus():
    for i in range(0, len(dataset), 1000):
        yield dataset[i : i + 1000]["text"]

README.md: 0.00B [00:00, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/733k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/6.36M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

In [3]:
# or this training corpous can be generated locally
with open("wikitext-2.txt", "w", encoding="utf-8") as f:
    for i in range(len(dataset)):
        f.write(dataset[i]["text"] + "\n")

# Word Piece Encoding

In [4]:
# we will use word piece tokenizer base
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)

tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))

# normalization
tokenizer.normalizer = normalizers.Sequence(
    [normalizers.NFD(), normalizers.Lowercase(), normalizers.StripAccents()]
)

# pre-tokenization
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

# model
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.WordPieceTrainer(vocab_size=25000, special_tokens=special_tokens)


In [5]:
# run the training
tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)

In [6]:
cls_token_id = tokenizer.token_to_id("[CLS]")
sep_token_id = tokenizer.token_to_id("[SEP]")

# post-processing
tokenizer.post_processor = processors.TemplateProcessing(
    single=f"[CLS]:0 $A:0 [SEP]:0",
    pair=f"[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1",
    special_tokens=[("[CLS]", cls_token_id), ("[SEP]", sep_token_id)],
)

In [7]:
# test
encoding = tokenizer.encode("Let's test this tokenizer.")
print(encoding.tokens)

encoding = tokenizer.encode("Let's test this tokenizer...", "on a pair of sentences.")
print(encoding.tokens)
print(encoding.type_ids)

['[CLS]', 'let', "'", 's', 'test', 'this', 'tok', '##eni', '##zer', '.', '[SEP]']
['[CLS]', 'let', "'", 's', 'test', 'this', 'tok', '##eni', '##zer', '...', '[SEP]', 'on', 'a', 'pair', 'of', 'sentences', '.', '[SEP]']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]


In [8]:
# include the decoder
tokenizer.decoder = decoders.WordPiece(prefix="##")

# test
tokenizer.decode(encoding.ids)


"let ' s test this tokenizer... on a pair of sentences."

In [9]:
# save the tokenizer in json
tokenizer.save("tokenizer.json")

In [10]:
from transformers import PreTrainedTokenizerFast

wrapped_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    # tokenizer_file="tokenizer.json", # You can load from the tokenizer file, alternatively
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)

In [11]:
!pip install huggingface_hub



In [12]:
from huggingface_hub import whoami
from huggingface_hub import login

login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [13]:
whoami()

{'type': 'user',
 'id': '683767745ba8dfda65d841e5',
 'name': 'pranavsaranaway',
 'fullname': 'Pranav Saran',
 'email': 'pxs878@case.edu',
 'emailVerified': True,
 'canPay': False,
 'periodEnd': 1754006399,
 'isPro': False,
 'avatarUrl': 'https://cdn-avatars.huggingface.co/v1/production/uploads/noauth/6NDvIo3gt4-1UzkHZ13D3.jpeg',
 'orgs': [],
 'auth': {'type': 'access_token',
  'accessToken': {'displayName': 'COLAB',
   'role': 'write',
   'createdAt': '2025-07-03T23:19:38.562Z'}}}

In [14]:
wrapped_tokenizer.save_pretrained("wpe_tokenizer")
wrapped_tokenizer.push_to_hub("wpe_tokenizer")

CommitInfo(commit_url='https://huggingface.co/pranavsaranaway/wpe_tokenizer/commit/5edbdf6d9eb3635a0926c01f7ad610889bbf4355', commit_message='Upload tokenizer', commit_description='', oid='5edbdf6d9eb3635a0926c01f7ad610889bbf4355', pr_url=None, repo_url=RepoUrl('https://huggingface.co/pranavsaranaway/wpe_tokenizer', endpoint='https://huggingface.co', repo_type='model', repo_id='pranavsaranaway/wpe_tokenizer'), pr_revision=None, pr_num=None)

# BPE (Byte Pair Encoding)

In [16]:
tokenizer = Tokenizer(models.BPE())

# we dont need normalization
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)

In [18]:
trainer = trainers.BpeTrainer(vocab_size=25000, special_tokens=["<|endoftext|>"])
tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)

In [20]:
# apply post processing
tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)

In [22]:
sentence = "Let's test this tokenizer."
encoding = tokenizer.encode(sentence)
start, end = encoding.offsets[4]
sentence[start:end]

' test'

In [23]:
tokenizer.decoder = decoders.ByteLevel()
tokenizer.decode(encoding.ids)

"Let's test this tokenizer."

In [24]:
from transformers import PreTrainedTokenizerFast

wrapped_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    bos_token="<|endoftext|>",
    eos_token="<|endoftext|>",
)

In [25]:
wrapped_tokenizer.save_pretrained("bpe_tokenizer")
wrapped_tokenizer.push_to_hub("bpe_tokenizer")

CommitInfo(commit_url='https://huggingface.co/pranavsaranaway/bpe_tokenizer/commit/96d7939a2204b8f05a3baf6a990a8e2ef5ab13a7', commit_message='Upload tokenizer', commit_description='', oid='96d7939a2204b8f05a3baf6a990a8e2ef5ab13a7', pr_url=None, repo_url=RepoUrl('https://huggingface.co/pranavsaranaway/bpe_tokenizer', endpoint='https://huggingface.co', repo_type='model', repo_id='pranavsaranaway/bpe_tokenizer'), pr_revision=None, pr_num=None)