# Train the Custom Tokenizer

In [14]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer

# Vocabulary size handed to the BPE trainer.
VOCAB_SIZE = 30_000

# Reserved tokens registered with the trainer, in this order.
SPECIAL_TOKENS = [
    # Padding: fills shorter sequences so a batch shares one length.
    "[PAD]",
    # Unknown: stands in for out-of-vocabulary input.
    "[UNK]",
    # Classification: commonly placed first for classification tasks.
    "[CLS]",
    # Separator: marks the boundary between segments (e.g. sentence pairs).
    "[SEP]",
    # Mask: target placeholder for masked language modeling (e.g. BERT pretraining).
    "[MASK]",
]


In [16]:
# Read the cleaned training text; readlines() returns a list with one string
# per line of the file (each entry keeps its trailing newline).
with open("wikitext2_train_cleaned.txt", "r", encoding="utf-8") as f:
    train_data = f.readlines()

# Tell the BPE model which token is its unknown-token fallback. Without
# `unk_token`, the "[UNK]" entry in SPECIAL_TOKENS is added to the vocabulary
# but never used by the model for symbols it has not seen.
# NOTE(review): no pre_tokenizer is configured, so each line is handed to BPE
# as a single unit and merges may span spaces -- confirm this is intended.
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))

# Trainer that learns merges up to VOCAB_SIZE entries and reserves the
# special tokens in the vocabulary.
trainer = BpeTrainer(
    vocab_size=VOCAB_SIZE,
    special_tokens=SPECIAL_TOKENS,
)
def batch_iterator(data, batch_size=1000):
    """Yield consecutive slices of `data`, each at most `batch_size` long.

    Used to hand the training lines to the tokenizer trainer in chunks
    instead of as one large list.

    Args:
        data: A sized, sliceable sequence (here, a list of strings).
        batch_size: Maximum number of items per yielded slice.

    Yields:
        Slices `data[start:start + batch_size]` in order; the final slice
        may be shorter. Nothing is yielded for empty `data`.
    """
    yield from (
        data[start:start + batch_size]
        for start in range(0, len(data), batch_size)
    )

# Stream the training lines to the trainer in batches of 1000.
training_batches = batch_iterator(train_data, batch_size=1000)
tokenizer.train_from_iterator(training_batches, trainer=trainer)

# Report the size of the vocabulary that was learned.
print("Tokenizer vocabulary size: ", tokenizer.get_vocab_size())




Tokenizer vocabulary size:  30000


In [11]:
# Write the trained tokenizer to disk.
# NOTE(review): "my_tokenier" looks like a typo for "my_tokenizer" and has no
# file extension; left unchanged in case other cells load this exact path.
tokenizer_path = "my_tokenier"
tokenizer.save(tokenizer_path)