In [1]:
from transformers import AutoTokenizer

marin_tokenizer = AutoTokenizer.from_pretrained("stanford-crfm/marin-tokenizer")
qwen3_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B-Base")


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Swap existing EOS token from "<|endoftext|>" to "<|end_of_text|>" (keeping same ID 151643)
import json
import tempfile
import os

# Save tokenizer to temp directory
with tempfile.TemporaryDirectory() as tmpdir:
    qwen3_tokenizer.save_pretrained(tmpdir)
    
    # Debug: list saved files
    print("Saved files:", os.listdir(tmpdir))
    
    # 1. Modify tokenizer.json
    tokenizer_json_path = os.path.join(tmpdir, "tokenizer.json")
    with open(tokenizer_json_path, "r") as f:
        tokenizer_data = json.load(f)
    
    # Replace in vocab
    vocab = tokenizer_data["model"]["vocab"]
    if "<|endoftext|>" in vocab:
        token_id = vocab.pop("<|endoftext|>")
        vocab["<|end_of_text|>"] = token_id
        print(f"[vocab] Swapped '<|endoftext|>' to '<|end_of_text|>' with ID {token_id}")
    else:
        print("[vocab] '<|endoftext|>' not found in vocab")
    
    # Replace in added_tokens
    if "added_tokens" in tokenizer_data:
        for token_entry in tokenizer_data["added_tokens"]:
            if token_entry.get("content") == "<|endoftext|>":
                token_entry["content"] = "<|end_of_text|>"
                print(f"[added_tokens] Swapped '<|endoftext|>' to '<|end_of_text|>'")
    
    with open(tokenizer_json_path, "w") as f:
        json.dump(tokenizer_data, f, ensure_ascii=False)
    
    # 2. Modify special_tokens_map.json (eos_token and pad_token)
    special_tokens_path = os.path.join(tmpdir, "special_tokens_map.json")
    with open(special_tokens_path, "r") as f:
        special_tokens = json.load(f)
    print(f"[special_tokens_map] Before: {special_tokens}")
    # Swap eos_token
    if special_tokens.get("eos_token") == "<|endoftext|>":
        special_tokens["eos_token"] = "<|end_of_text|>"
    elif isinstance(special_tokens.get("eos_token"), dict):
        if special_tokens["eos_token"].get("content") == "<|endoftext|>":
            special_tokens["eos_token"]["content"] = "<|end_of_text|>"
    # Swap pad_token
    if special_tokens.get("pad_token") == "<|endoftext|>":
        special_tokens["pad_token"] = "<|end_of_text|>"
    elif isinstance(special_tokens.get("pad_token"), dict):
        if special_tokens["pad_token"].get("content") == "<|endoftext|>":
            special_tokens["pad_token"]["content"] = "<|end_of_text|>"
            print("[special_tokens_map] Swapped pad_token to '<|end_of_text|>'")
    with open(special_tokens_path, "w") as f:
        json.dump(special_tokens, f, ensure_ascii=False)
    
    # 3. Modify tokenizer_config.json (eos_token and pad_token)
    tokenizer_config_path = os.path.join(tmpdir, "tokenizer_config.json")
    with open(tokenizer_config_path, "r") as f:
        tokenizer_config = json.load(f)
    print(f"[tokenizer_config] eos_token before: {tokenizer_config.get('eos_token')}")
    print(f"[tokenizer_config] pad_token before: {tokenizer_config.get('pad_token')}")
    # Swap eos_token
    if tokenizer_config.get("eos_token") == "<|endoftext|>":
        tokenizer_config["eos_token"] = "<|end_of_text|>"
    elif isinstance(tokenizer_config.get("eos_token"), dict):
        if tokenizer_config["eos_token"].get("content") == "<|endoftext|>":
            tokenizer_config["eos_token"]["content"] = "<|end_of_text|>"
    # Swap pad_token
    if tokenizer_config.get("pad_token") == "<|endoftext|>":
        tokenizer_config["pad_token"] = "<|end_of_text|>"
        print("[tokenizer_config] Swapped pad_token to '<|end_of_text|>'")
    elif isinstance(tokenizer_config.get("pad_token"), dict):
        if tokenizer_config["pad_token"].get("content") == "<|endoftext|>":
            tokenizer_config["pad_token"]["content"] = "<|end_of_text|>"
            print("[tokenizer_config] Swapped pad_token to '<|end_of_text|>'")
    with open(tokenizer_config_path, "w") as f:
        json.dump(tokenizer_config, f, ensure_ascii=False)
    
    # Reload tokenizer
    qwen3_tokenizer = AutoTokenizer.from_pretrained(tmpdir)

print("EOS token:", qwen3_tokenizer.eos_token)
print("EOS token ID:", qwen3_tokenizer.eos_token_id)

Saved files: ['special_tokens_map.json', 'vocab.json', 'merges.txt', 'chat_template.jinja', 'tokenizer.json', 'tokenizer_config.json', 'added_tokens.json']
[vocab] '<|endoftext|>' not found in vocab
[added_tokens] Swapped '<|endoftext|>' to '<|end_of_text|>'
[special_tokens_map] Before: {'additional_special_tokens': ['<|im_start|>', '<|im_end|>', '<|object_ref_start|>', '<|object_ref_end|>', '<|box_start|>', '<|box_end|>', '<|quad_start|>', '<|quad_end|>', '<|vision_start|>', '<|vision_end|>', '<|vision_pad|>', '<|image_pad|>', '<|video_pad|>'], 'eos_token': {'content': '<|endoftext|>', 'lstrip': False, 'normalized': False, 'rstrip': False, 'single_word': False}, 'pad_token': {'content': '<|endoftext|>', 'lstrip': False, 'normalized': False, 'rstrip': False, 'single_word': False}}
[special_tokens_map] Swapped pad_token to '<|end_of_text|>'
[tokenizer_config] eos_token before: <|endoftext|>
[tokenizer_config] pad_token before: <|endoftext|>
[tokenizer_config] Swapped pad_token to '<|end

In [3]:
# Add bos_token to qwen3 tokenizer
qwen3_tokenizer.add_special_tokens({"bos_token": "<|begin_of_text|>"})
print("BOS token:", qwen3_tokenizer.bos_token)
print("BOS token ID:", qwen3_tokenizer.bos_token_id)

BOS token: <|begin_of_text|>
BOS token ID: 151670


In [4]:
print("BOS token:", qwen3_tokenizer.bos_token)
print("EOS token:", qwen3_tokenizer.eos_token)
print("--------------------------------")
print("BOS token:", marin_tokenizer.bos_token)
print("EOS token:", marin_tokenizer.eos_token)

BOS token: <|begin_of_text|>
EOS token: <|end_of_text|>
--------------------------------
BOS token: <|begin_of_text|>
EOS token: <|end_of_text|>


In [5]:
text = "<|begin_of_text|>Hello how are you<|end_of_text|>"

In [6]:
for i, token in enumerate(qwen3_tokenizer(text)['input_ids']):
    print(f"{i}: {qwen3_tokenizer.decode([token])}, token_id: {token}")


0: <|begin_of_text|>, token_id: 151670
1: Hello, token_id: 9707
2:  how, token_id: 1246
3:  are, token_id: 525
4:  you, token_id: 498
5: <|end_of_text|>, token_id: 151643


In [7]:
for i, token in enumerate(marin_tokenizer(text)['input_ids']):
    print(f"{i}: {marin_tokenizer.decode([token])}")

0: <|begin_of_text|>
1: <|begin_of_text|>
2: Hello
3:  how
4:  are
5:  you
6: <|end_of_text|>


In [8]:
# Upload modified qwen3 tokenizer to Hugging Face
repo_name = "potsawee/qwen3x-tokenizer"  # Change this to your desired repo name
qwen3_tokenizer.push_to_hub(repo_name)

Processing Files (1 / 1): 100%|██████████| 11.4MB / 11.4MB, 1.33MB/s  
New Data Upload: |          |  0.00B /  0.00B,  0.00B/s  


CommitInfo(commit_url='https://huggingface.co/potsawee/qwen3x-tokenizer/commit/0edb09a3f9cbdde54ecee5132b8daa76c1ed1d9a', commit_message='Upload tokenizer', commit_description='', oid='0edb09a3f9cbdde54ecee5132b8daa76c1ed1d9a', pr_url=None, repo_url=RepoUrl('https://huggingface.co/potsawee/qwen3x-tokenizer', endpoint='https://huggingface.co', repo_type='model', repo_id='potsawee/qwen3x-tokenizer'), pr_revision=None, pr_num=None)