# Train Custom Tokenizer

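This notebook trains a custom byte-level Byte-Pair Encoding (BPE) tokenizer on the local dataset `input-1.txt`, saves it to `./custom_tokenizer`, wraps it in a Transformers-compatible tokenizer, and runs a quick encode/decode check.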

In [1]:
!pip install tokenizers transformers




[notice] A new release of pip is available: 25.0.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [5]:
from tokenizers import ByteLevelBPETokenizer
from transformers import PreTrainedTokenizerFast
import os

# Train a byte-level BPE tokenizer on the local corpus and write its
# vocabulary and merge files to `save_path`.

# Training configuration
files = ["input-1.txt"]
vocab_size = 5000  # Adjust based on your dataset size. 1.1MB is small, so 5000 is reasonable.
min_frequency = 2  # pairs seen fewer times than this are not merged
# Kept in the same order as before so the special tokens are registered identically.
special_tokens = ["<s>", "<pad>", "</s>", "<unk>", "<mask>"]
save_path = "./custom_tokenizer"

# Fail early with a message that names the missing file(s) instead of
# relying on a less specific error from the training call.
missing_files = [path for path in files if not os.path.isfile(path)]
if missing_files:
    raise FileNotFoundError(f"Training file(s) not found: {missing_files}")

# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

print(f"Training tokenizer on {files} with vocab_size={vocab_size}...")

# Train the tokenizer
tokenizer.train(
    files=files,
    vocab_size=vocab_size,
    min_frequency=min_frequency,
    special_tokens=special_tokens,
)

# Save the tokenizer; exist_ok avoids the separate exists-then-create check.
os.makedirs(save_path, exist_ok=True)

tokenizer.save_model(save_path)
print(f"Tokenizer saved to {save_path}")

Training tokenizer on ['input-1.txt'] with vocab_size=5000...
Tokenizer saved to ./custom_tokenizer


In [3]:
# Wrap it in a Transformers tokenizer for easy usage
from transformers import GPT2TokenizerFast

# Load the trained tokenizer files and declare the special tokens at load time.
# Assigning them afterwards lets the GPT-2 defaults ("<|endoftext|>") be applied
# first, and that token is not among the special tokens this tokenizer was
# trained with, so it may end up registered as an extra token.
tokenizer_obj = GPT2TokenizerFast.from_pretrained(
    save_path,
    bos_token="<s>",
    eos_token="</s>",
    unk_token="<unk>",
    pad_token="<pad>",
    mask_token="<mask>",
)

# NOTE(review): if `save_path` already contains files written by an earlier
# save_pretrained() call (e.g. tokenizer.json), from_pretrained may load those
# instead of freshly trained vocab/merges files. Confirm this, and clear the
# directory before retraining if so.

# Save the complete tokenizer config
tokenizer_obj.save_pretrained(save_path)
print("Transformers-compatible tokenizer saved.")

Transformers-compatible tokenizer saved.


In [4]:
# Test the tokenizer: encode a sample sentence, decode it again, and verify
# that the round trip reproduces the input exactly.
sample_text = "Hello world! This is a test."
encoded = tokenizer_obj.encode(sample_text)
decoded = tokenizer_obj.decode(encoded)
print(f"Encoded: {encoded}")
print(f"Decoded: {decoded}")

# Printing alone would let a broken tokenizer pass unnoticed on a re-run.
assert decoded == sample_text, f"Round trip mismatch: {decoded!r} != {sample_text!r}"

Encoded: [44, 3910, 871, 5, 2507, 329, 263, 4274, 18]
Decoded: Hello world! This is a test.
