In [1]:
from transformers import AutoTokenizer

marin_tokenizer = AutoTokenizer.from_pretrained("stanford-crfm/marin-tokenizer")
qwen3_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B-Base")


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Swap existing EOS token from "<|endoftext|>" to "<|end_of_text|>" (keeping same ID 151643)
import json
import tempfile
import os

# Save tokenizer to temp directory
with tempfile.TemporaryDirectory() as tmpdir:
    qwen3_tokenizer.save_pretrained(tmpdir)
    
    # Debug: list saved files
    print("Saved files:", os.listdir(tmpdir))
    
    # 1. Modify tokenizer.json
    tokenizer_json_path = os.path.join(tmpdir, "tokenizer.json")
    with open(tokenizer_json_path, "r") as f:
        tokenizer_data = json.load(f)
    
    # Replace in vocab
    vocab = tokenizer_data["model"]["vocab"]
    if "<|endoftext|>" in vocab:
        token_id = vocab.pop("<|endoftext|>")
        vocab["<|end_of_text|>"] = token_id
        print(f"[vocab] Swapped '<|endoftext|>' to '<|end_of_text|>' with ID {token_id}")
    else:
        print("[vocab] '<|endoftext|>' not found in vocab")
    
    # Replace in added_tokens
    if "added_tokens" in tokenizer_data:
        for token_entry in tokenizer_data["added_tokens"]:
            if token_entry.get("content") == "<|endoftext|>":
                token_entry["content"] = "<|end_of_text|>"
                print(f"[added_tokens] Swapped '<|endoftext|>' to '<|end_of_text|>'")
    
    with open(tokenizer_json_path, "w") as f:
        json.dump(tokenizer_data, f, ensure_ascii=False)
    
    # 2. Modify special_tokens_map.json (eos_token and pad_token)
    special_tokens_path = os.path.join(tmpdir, "special_tokens_map.json")
    with open(special_tokens_path, "r") as f:
        special_tokens = json.load(f)
    print(f"[special_tokens_map] Before: {special_tokens}")
    # Swap eos_token
    if special_tokens.get("eos_token") == "<|endoftext|>":
        special_tokens["eos_token"] = "<|end_of_text|>"
    elif isinstance(special_tokens.get("eos_token"), dict):
        if special_tokens["eos_token"].get("content") == "<|endoftext|>":
            special_tokens["eos_token"]["content"] = "<|end_of_text|>"
    # Swap pad_token
    if special_tokens.get("pad_token") == "<|endoftext|>":
        special_tokens["pad_token"] = "<|end_of_text|>"
    elif isinstance(special_tokens.get("pad_token"), dict):
        if special_tokens["pad_token"].get("content") == "<|endoftext|>":
            special_tokens["pad_token"]["content"] = "<|end_of_text|>"
            print("[special_tokens_map] Swapped pad_token to '<|end_of_text|>'")
    with open(special_tokens_path, "w") as f:
        json.dump(special_tokens, f, ensure_ascii=False)
    
    # 3. Modify tokenizer_config.json (eos_token and pad_token)
    tokenizer_config_path = os.path.join(tmpdir, "tokenizer_config.json")
    with open(tokenizer_config_path, "r") as f:
        tokenizer_config = json.load(f)
    print(f"[tokenizer_config] eos_token before: {tokenizer_config.get('eos_token')}")
    print(f"[tokenizer_config] pad_token before: {tokenizer_config.get('pad_token')}")
    # Swap eos_token
    if tokenizer_config.get("eos_token") == "<|endoftext|>":
        tokenizer_config["eos_token"] = "<|end_of_text|>"
    elif isinstance(tokenizer_config.get("eos_token"), dict):
        if tokenizer_config["eos_token"].get("content") == "<|endoftext|>":
            tokenizer_config["eos_token"]["content"] = "<|end_of_text|>"
    # Swap pad_token
    if tokenizer_config.get("pad_token") == "<|endoftext|>":
        tokenizer_config["pad_token"] = "<|end_of_text|>"
        print("[tokenizer_config] Swapped pad_token to '<|end_of_text|>'")
    elif isinstance(tokenizer_config.get("pad_token"), dict):
        if tokenizer_config["pad_token"].get("content") == "<|endoftext|>":
            tokenizer_config["pad_token"]["content"] = "<|end_of_text|>"
            print("[tokenizer_config] Swapped pad_token to '<|end_of_text|>'")
    with open(tokenizer_config_path, "w") as f:
        json.dump(tokenizer_config, f, ensure_ascii=False)
    
    # Reload tokenizer
    qwen3_tokenizer = AutoTokenizer.from_pretrained(tmpdir)

print("EOS token:", qwen3_tokenizer.eos_token)
print("EOS token ID:", qwen3_tokenizer.eos_token_id)

Saved files: ['tokenizer_config.json', 'added_tokens.json', 'tokenizer.json', 'chat_template.jinja', 'vocab.json', 'special_tokens_map.json', 'merges.txt']
[vocab] '<|endoftext|>' not found in vocab
[added_tokens] Swapped '<|endoftext|>' to '<|end_of_text|>'
[special_tokens_map] Before: {'additional_special_tokens': ['<|im_start|>', '<|im_end|>', '<|object_ref_start|>', '<|object_ref_end|>', '<|box_start|>', '<|box_end|>', '<|quad_start|>', '<|quad_end|>', '<|vision_start|>', '<|vision_end|>', '<|vision_pad|>', '<|image_pad|>', '<|video_pad|>'], 'eos_token': {'content': '<|endoftext|>', 'lstrip': False, 'normalized': False, 'rstrip': False, 'single_word': False}, 'pad_token': {'content': '<|endoftext|>', 'lstrip': False, 'normalized': False, 'rstrip': False, 'single_word': False}}
[special_tokens_map] Swapped pad_token to '<|end_of_text|>'
[tokenizer_config] eos_token before: <|endoftext|>
[tokenizer_config] pad_token before: <|endoftext|>
[tokenizer_config] Swapped pad_token to '<|end

In [3]:
# Add bos_token to qwen3 tokenizer
qwen3_tokenizer.add_special_tokens({"bos_token": "<|begin_of_text|>"})
print("BOS token:", qwen3_tokenizer.bos_token)
print("BOS token ID:", qwen3_tokenizer.bos_token_id)

BOS token: <|begin_of_text|>
BOS token ID: 151670


In [4]:
print("BOS token:", qwen3_tokenizer.bos_token)
print("EOS token:", qwen3_tokenizer.eos_token)
print("--------------------------------")
print("BOS token:", marin_tokenizer.bos_token)
print("EOS token:", marin_tokenizer.eos_token)

BOS token: <|begin_of_text|>
EOS token: <|end_of_text|>
--------------------------------
BOS token: <|begin_of_text|>
EOS token: <|end_of_text|>


In [22]:
text = "<|begin_of_text|>[2]Hello how are you<|end_of_text|>"

In [23]:
for i, token in enumerate(qwen3_tokenizer(text)['input_ids']):
    print(f"{i}: {qwen3_tokenizer.decode([token])}, token_id: {token}")


0: <|begin_of_text|>, token_id: 151670
1: [, token_id: 58
2: 2, token_id: 17
3: ], token_id: 60
4: Hello, token_id: 9707
5:  how, token_id: 1246
6:  are, token_id: 525
7:  you, token_id: 498
8: <|end_of_text|>, token_id: 151643


In [11]:
for i, token in enumerate(marin_tokenizer(text)['input_ids']):
    print(f"{i}: {marin_tokenizer.decode([token])}")

0: <|begin_of_text|>
1: <|begin_of_text|>
2: [
3: 3
4: ]
5: Hello
6:  how
7:  are
8:  you
9: <|end_of_text|>


In [12]:
# Upload modified qwen3 tokenizer to Hugging Face
# repo_name = "potsawee/qwen3x-tokenizer"  # Change this to your desired repo name
# qwen3_tokenizer.push_to_hub(repo_name)

## Build qwen3x-mimi model
- Qwen3x = Qwen3 + vocab expanded by Mimi 8 codebooks 

In [1]:
# https://github.com/QwenLM/Qwen3/issues/29
# Tokenizer size and embedding size mismatch

In [2]:
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [40]:
model_size = "1.7B" # 0.6B, 1.7B

In [41]:
model_name = f"Qwen/Qwen3-{model_size}-Base"
qwen3_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
qwen3_tokenizer = AutoTokenizer.from_pretrained(model_name)
print("vocab size 1  :", qwen3_tokenizer.vocab_size)
print("vocab size 2  :", len(qwen3_tokenizer))
print("embedding size:", qwen3_model.model.embed_tokens.weight.shape[0])
# The first one refers to the vocab size without special tokens, while the second includes them. The embedding size is larger with padding tokens due to distributed training in our pretraining stage.

vocab size 1  : 151643
vocab size 2  : 151669
embedding size: 151936


In [42]:
qwen3x_tokenizer = AutoTokenizer.from_pretrained("potsawee/qwen3x-mimi-bpe-8cb-16k-tokenizer")
print("vocab size 1 :", qwen3x_tokenizer.vocab_size)
print("vocab size 2 :", len(qwen3x_tokenizer))

vocab size 1 : 151643
vocab size 2 : 168059


In [43]:
# 151669 -- original vocab size
# 2048*8 -- vocab size from 8 Mimi codebooks
# 4 -- <|audio_start|> <|audio_end|> <|text_start|> <|text_end|>
# 2 -- <|begin_of_text|> <|end_of_text|>
print(151669 + 2048*8 + 4 + 2)

168059


In [44]:
# so I need the embedding size of qwen3x model to be at least 168059
# Marin will handle padding automatically -- confirmed with the first "ws" experiment

In [45]:
# In the last cell, I want you the extend the current embedding of qwen3_model.model.embed_tokens.weight from 151936 to 168059

# And for the expanded embedding, the positions [0,1,...,151668] should have the values copied over from the original embedding, while the positions [151669, 151670, 151671, ..., 168058] should be freshly initialized (i.e., from scratch)

In [46]:
import torch
import torch.nn as nn

# Get original embedding
original_embed = qwen3_model.model.embed_tokens.weight
print(f"Original embedding shape: {original_embed.shape}")
print(f"Original embedding dtype: {original_embed.dtype}")

# Original dimensions
original_vocab_size = original_embed.shape[0]  # 151936
hidden_dim = original_embed.shape[1]
dtype = original_embed.dtype  # bfloat16

# New dimensions
new_vocab_size = 168059
num_tokens_to_copy = 151669  # Copy positions [0, 1, ..., 151668]
num_new_tokens = new_vocab_size - num_tokens_to_copy

# Create new embedding layer in bfloat16
new_embed = nn.Embedding(new_vocab_size, hidden_dim, dtype=dtype)
print(f"New embedding shape: {new_embed.weight.shape}")
print(f"New embedding dtype: {new_embed.weight.dtype}")

# Compute mean embedding from positions [0, ..., 151668]
mean_embedding = original_embed[:num_tokens_to_copy].mean(dim=0)
print(f"Mean embedding shape: {mean_embedding.shape}")

# Small Gaussian noise std (you can adjust this)
noise_std = 0.02

with torch.no_grad():
    # Copy over the first 151669 embeddings from original
    new_embed.weight[:num_tokens_to_copy] = original_embed[:num_tokens_to_copy].clone()
    
    # Initialize new positions with mean + small Gaussian noise (in bfloat16)
    noise = (torch.randn(num_new_tokens, hidden_dim) * noise_std).to(dtype)
    new_embed.weight[num_tokens_to_copy:] = mean_embedding.unsqueeze(0) + noise

# Replace the embedding layer in the model
qwen3_model.model.embed_tokens = new_embed

# Also update lm_head to match the new vocabulary size
original_lm_head = qwen3_model.lm_head.weight
mean_lm_head = original_lm_head[:num_tokens_to_copy].mean(dim=0)

new_lm_head = nn.Linear(hidden_dim, new_vocab_size, bias=False, dtype=dtype)
with torch.no_grad():
    new_lm_head.weight[:num_tokens_to_copy] = original_lm_head[:num_tokens_to_copy].clone()
    noise = (torch.randn(num_new_tokens, hidden_dim) * noise_std).to(dtype)
    new_lm_head.weight[num_tokens_to_copy:] = mean_lm_head.unsqueeze(0) + noise
qwen3_model.lm_head = new_lm_head

# Update model config
qwen3_model.config.vocab_size = new_vocab_size

print(f"\n--- Summary ---")
print(f"New embedding size: {qwen3_model.model.embed_tokens.weight.shape[0]}")
print(f"New embedding dtype: {qwen3_model.model.embed_tokens.weight.dtype}")
print(f"New lm_head size: {qwen3_model.lm_head.weight.shape[0]}")
print(f"New lm_head dtype: {qwen3_model.lm_head.weight.dtype}")
print(f"Copied {num_tokens_to_copy} embeddings from original (positions 0 to {num_tokens_to_copy - 1})")
print(f"Initialized {num_new_tokens} new embeddings (positions {num_tokens_to_copy} to {new_vocab_size - 1}) with mean + Gaussian noise (std={noise_std})")


Original embedding shape: torch.Size([151936, 2048])
Original embedding dtype: torch.bfloat16
New embedding shape: torch.Size([168059, 2048])
New embedding dtype: torch.bfloat16
Mean embedding shape: torch.Size([2048])

--- Summary ---
New embedding size: 168059
New embedding dtype: torch.bfloat16
New lm_head size: 168059
New lm_head dtype: torch.bfloat16
Copied 151669 embeddings from original (positions 0 to 151668)
Initialized 16390 new embeddings (positions 151669 to 168058) with mean + Gaussian noise (std=0.02)


In [47]:
qwen3_model.push_to_hub(f"potsawee/qwen3x-{model_size}-base")


Processing Files (1 / 1): 100%|██████████| 4.20GB / 4.20GB, 95.0MB/s  
New Data Upload: 100%|██████████|  135MB /  135MB, 70.6kB/s  


CommitInfo(commit_url='https://huggingface.co/potsawee/qwen3x-1.7B-base/commit/14cd3a126af7dfa8d4b518be4d9c4be439ac3934', commit_message='Upload Qwen3ForCausalLM', commit_description='', oid='14cd3a126af7dfa8d4b518be4d9c4be439ac3934', pr_url=None, repo_url=RepoUrl('https://huggingface.co/potsawee/qwen3x-1.7B-base', endpoint='https://huggingface.co', repo_type='model', repo_id='potsawee/qwen3x-1.7B-base'), pr_revision=None, pr_num=None)

In [49]:
# count the number of parameters in the model
# 0.6B --> 784652288
# 1.7B --> 2097779712
qwen3_model.num_parameters()

2097779712