In [1]:
from tokenizers import Tokenizer
from pathlib import Path

In [2]:
def get_tokenizer(tokenizer_path=None) -> "Tokenizer":
    """
    Load the custom BPE tokenizer from a serialized tokenizer JSON file.

    Args:
        tokenizer_path: Path (str or Path) to the tokenizer JSON file.
                       Defaults to bpe_tokenizer.json in the parent of the
                       current working directory.

    Returns:
        The Tokenizer instance loaded from the file.

    Raises:
        FileNotFoundError: If tokenizer_path does not point to an existing
            regular file.
    """
    if tokenizer_path is None:
        # The default is resolved against the current working directory, so
        # it depends on where the notebook kernel was started.
        tokenizer_path = Path.cwd().parent / "bpe_tokenizer.json"

    tokenizer_path = Path(tokenizer_path)

    # is_file() rather than exists(): a directory at this path gets the same
    # clear error instead of being handed on to the loader.
    if not tokenizer_path.is_file():
        raise FileNotFoundError(
            f"Tokenizer not found at {tokenizer_path}. "
            "Run notebooks/build_tokenizer.ipynb first."
        )

    tokenizer = Tokenizer.from_file(str(tokenizer_path))
    # Report what was loaded so the notebook output records the source file
    # and vocabulary size.
    print(f"Loaded tokenizer from {tokenizer_path}")
    print(f"Vocab size: {tokenizer.get_vocab_size()}")

    return tokenizer


In [3]:
# Load the base tokenizer from the default location (bpe_tokenizer.json in
# the parent of the current working directory).
tokenizer = get_tokenizer(tokenizer_path=None)

Loaded tokenizer from /home/smedar/code_files/llama4-from-scratch/vision_language_alignment/bpe_tokenizer.json
Vocab size: 32000


In [4]:
# Register the "<image>" placeholder as a special token on the loaded
# tokenizer. The call's return value is shown as the cell output.
tokenizer.add_special_tokens(
    ["<image>"],
)

1

In [5]:
# List every vocabulary entry whose text is wrapped in angle brackets,
# mapped to its token ID, to see which special-looking tokens exist.
vocab = tokenizer.get_vocab()
special = {
    token: token_id
    for token, token_id in vocab.items()
    if token[:1] == "<" and token[-1:] == ">"
}
print(special)


{'<image>': 32000, '<unk>': 0, '<s>': 1, '</s>': 2, '<pad>': 3}


In [6]:
# Vocabulary size including added tokens (the library default, spelled out
# here because the point of this cell is to see the count after adding <image>).
tokenizer.get_vocab_size(with_added_tokens=True)

32001

In [7]:
# Get the <image> token ID - you'll need this later
image_token_id = tokenizer.token_to_id("<image>")
# token_to_id returns None for a token that is not in the vocabulary, so fail
# loudly here instead of carrying a None ID into later cells.
assert image_token_id is not None, "<image> is not in the tokenizer vocabulary"
print(f"<image> token ID: {image_token_id}")

# Test encoding with <image>: the tag must come out as its single special
# token ID at the start of the sequence, not be split into sub-word pieces.
test = tokenizer.encode("<image> a brown dog running")
assert test.ids[0] == image_token_id, "<image> was not encoded as a single special token"
print(test.ids)

<image> token ID: 32000
[32000, 214, 7532, 4485, 4274]


In [8]:
# Write the tokenizer, now including <image>, to a separate file in the
# parent directory so the original bpe_tokenizer.json is not overwritten.
tokenizer.save(
    "../bpe_tokenizer_with_image_tag.json",
)