In [17]:
from transformers import AutoTokenizer

In [19]:
# Smallest instruct checkpoints of the Llama and Qwen families; the vocabulary
# assertions further down compare the larger sizes against these two.
tokenizer_llama_1B = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")
tokenizer_qwen_1B = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct")
# Microsoft Phi
# Phi-3 has a different tokenizer; other families are used for the small models.
tokenizer_phi4_14B = AutoTokenizer.from_pretrained("microsoft/phi-4")

In [25]:
# Qwen3-4B and QwQ-32B tokenizers; the Debug section compares their
# vocabularies with order_and_eq.
tokenizer_qwen3_4B = AutoTokenizer.from_pretrained("Qwen/Qwen3-4B")
tokenizer_qwq_32B = AutoTokenizer.from_pretrained("Qwen/QwQ-32B")

In [67]:
# Remaining tokenizers, grouped by model family.
# tokenizer_qwq_32B is not reloaded here: an earlier cell already binds it
# from "Qwen/QwQ-32B", so a second from_pretrained call was redundant.

# Meta Llama
tokenizer_llama_3B = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")
tokenizer_llama_8B = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
tokenizer_llama_70B = AutoTokenizer.from_pretrained("meta-llama/Llama-3.3-70B-Instruct")

# Alibaba Qwen 2.5
tokenizer_qwen_3B = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-3B-Instruct")
tokenizer_qwen_7B = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B-Instruct")
tokenizer_qwen_14B = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-14B-Instruct")
tokenizer_qwen_32B = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-32B-Instruct")
tokenizer_qwen_72B = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-72B-Instruct")

# Google Gemma 3
tokenizer_gemma_1B = AutoTokenizer.from_pretrained("google/gemma-3-1b-it")
tokenizer_gemma_4B = AutoTokenizer.from_pretrained("google/gemma-3-4b-it")
tokenizer_gemma_12B = AutoTokenizer.from_pretrained("google/gemma-3-12b-it")
tokenizer_gemma_27B = AutoTokenizer.from_pretrained("google/gemma-3-27b-it")

# Mistral
tokenizer_mistral_24B = AutoTokenizer.from_pretrained("mistralai/Mistral-Small-24B-Instruct-2501")

# DeepSeek R1 distillations (Qwen-based and Llama-based)
tokenizer_deepseek_qwen_1B = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B")
tokenizer_deepseek_qwen_7B = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Qwen-7B")
tokenizer_deepseek_qwen_14B = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Qwen-14B")
tokenizer_deepseek_llama_8B = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Llama-8B")

In [3]:
# Every Gemma 3 size must share the vocabulary of the 1B reference tokenizer.
# (The former first assert compared the 1B vocabulary with itself and could
# never fail, so it has been dropped.)
assert tokenizer_gemma_4B.vocab == tokenizer_gemma_1B.vocab, "gemma 4B vocab differs from gemma 1B"
assert tokenizer_gemma_12B.vocab == tokenizer_gemma_1B.vocab, "gemma 12B vocab differs from gemma 1B"
assert tokenizer_gemma_27B.vocab == tokenizer_gemma_1B.vocab, "gemma 27B vocab differs from gemma 1B"

In [4]:
# Every Qwen2.5 size must share the vocabulary of the 1.5B reference tokenizer.
# The former first assert compared the reference with itself (always true) while
# the loaded 3B tokenizer was never checked; it now covers the 3B tokenizer.
assert tokenizer_qwen_3B.vocab == tokenizer_qwen_1B.vocab, "qwen 3B vocab differs from qwen 1.5B"
assert tokenizer_qwen_7B.vocab == tokenizer_qwen_1B.vocab, "qwen 7B vocab differs from qwen 1.5B"
assert tokenizer_qwen_14B.vocab == tokenizer_qwen_1B.vocab, "qwen 14B vocab differs from qwen 1.5B"
assert tokenizer_qwen_32B.vocab == tokenizer_qwen_1B.vocab, "qwen 32B vocab differs from qwen 1.5B"
assert tokenizer_qwen_72B.vocab == tokenizer_qwen_1B.vocab, "qwen 72B vocab differs from qwen 1.5B"

In [5]:
# Every Llama 3.x size must share the vocabulary of the 1B reference tokenizer.
# (The former first assert compared the 1B vocabulary with itself and could
# never fail, so it has been dropped.)
assert tokenizer_llama_3B.vocab == tokenizer_llama_1B.vocab, "llama 3B vocab differs from llama 1B"
assert tokenizer_llama_8B.vocab == tokenizer_llama_1B.vocab, "llama 8B vocab differs from llama 1B"
assert tokenizer_llama_70B.vocab == tokenizer_llama_1B.vocab, "llama 70B vocab differs from llama 1B"

In [6]:
# The Qwen-based DeepSeek distillations must share the 1.5B distillation's vocabulary.
assert tokenizer_deepseek_qwen_1B.vocab == tokenizer_deepseek_qwen_7B.vocab
assert tokenizer_deepseek_qwen_1B.vocab == tokenizer_deepseek_qwen_14B.vocab

In [7]:
# Vocabularies are expected to differ between the pairs below.
# Llama vs. Qwen
assert tokenizer_llama_1B.vocab != tokenizer_qwen_1B.vocab
# Llama vs. Phi
assert tokenizer_llama_1B.vocab != tokenizer_phi4_14B.vocab
# Llama vs. Gemma
assert tokenizer_llama_1B.vocab != tokenizer_gemma_1B.vocab
# Llama vs. Mistral
assert tokenizer_llama_1B.vocab != tokenizer_mistral_24B.vocab
# DeepSeek (Qwen distillation) vs. Qwen
assert tokenizer_deepseek_qwen_1B.vocab != tokenizer_qwen_1B.vocab
# DeepSeek (Llama distillation) vs. Llama
assert tokenizer_deepseek_llama_8B.vocab != tokenizer_llama_1B.vocab
# DeepSeek (Llama distillation) vs. DeepSeek (Qwen distillation)
assert tokenizer_deepseek_llama_8B.vocab != tokenizer_deepseek_qwen_1B.vocab
# QwQ vs. Qwen
assert tokenizer_qwq_32B.vocab != tokenizer_qwen_1B.vocab

In [13]:
# Encode a sample sentence with the Qwen2.5-3B tokenizer (default special-token handling).
tokenizer_qwen_3B.encode('this is a test sentence')

[574, 374, 264, 1273, 11652]

In [16]:
# Same sentence with the Llama tokenizer; the recorded output starts with id
# 128000 and is one id longer than the Qwen result.
tokenizer_llama_1B.encode('this is a test sentence')

[128000, 576, 374, 264, 1296, 11914]

In [17]:
# Inspect how angle-bracket placeholders and a trailing ' .' are split by the Qwen tokenizer.
tokenizer_qwen_1B.encode('<a> <b> <c> .')

[9312, 29, 366, 65, 29, 366, 66, 29, 659]

In [26]:
# Probe id 200000: the recorded output (True) shows convert_ids_to_tokens
# returned None for it instead of raising.
tokenizer_qwen_1B.convert_ids_to_tokens(200000) is None

True

In [23]:
# Number of entries in the phi-4 vocabulary.
len(tokenizer_phi4_14B.vocab)

100352

# Find tokenizer config

In [12]:
# Tokenizer inspected by the following cells; rebind this name to examine another one.
tokenizer = tokenizer_llama_1B

In [13]:
# Vocabulary size and the largest token id it contains.
len(tokenizer.vocab), max(tokenizer.vocab.values())

(128256, 128255)

In [14]:
# Look up the id of the line-break token and its vocabulary string.
# The last encoded id is taken so that any leading special token is skipped.
newline_token_id = tokenizer.encode('\n')[-1]
newline_token = tokenizer.convert_ids_to_tokens(newline_token_id)
print(newline_token_id)
print(newline_token)

198
Ċ


In [15]:
# Surface forms of the marker: capitalised / lower-case, with and without a leading space.
switch_patterns_variations = ['Fact:', 'fact:', ' Fact:', ' fact:']

In [16]:
# Build a nested-dict trie over the token ids of every pattern variation:
# each key is a token id and its value is the sub-trie of the ids that may follow.
switch_pattern = {}
for switch_pattern_str in switch_patterns_variations:
    # Encode without special tokens so only the pattern's own ids enter the trie.
    no_space = tokenizer.encode(switch_pattern_str, add_special_tokens=False)
    print(no_space, list(map(tokenizer.convert_ids_to_tokens, no_space)))

    lv = switch_pattern
    for tok in no_space:
        # setdefault keeps an already-present branch, so variations that share a
        # token prefix extend the same subtree instead of overwriting it.
        lv = lv.setdefault(tok, {})
print(switch_pattern)

[17873, 25] ['Fact', ':']
[34210, 25] ['fact', ':']
[37812, 25] ['ĠFact', ':']
[2144, 25] ['Ġfact', ':']
{17873: {25: {}}, 34210: {25: {}}, 37812: {25: {}}, 2144: {25: {}}}


In [31]:
# Token ids of "Answer:" as it is tokenized right after a line break.
# [1:] drops the first encoded id.
# NOTE(review): which token that is depends on the tokenizer — presumably the
# BOS id when one is prepended (as in the Llama encode output recorded above),
# otherwise the newline token itself — TODO confirm the intended behaviour.
# NOTE(review): this rebinds switch_pattern, which an earlier cell uses for a dict.
switch_pattern = tokenizer.encode('''
Answer:''')[1:]
print(switch_pattern)

[16533, 25]


In [32]:
# Id of the last token produced for 'triple .', kept as the end-of-triple marker.
end_of_triple = tokenizer.encode('''triple .''')[-1]
print(end_of_triple)

662


In [13]:
# End-of-sequence token id of the inspected tokenizer.
# NOTE(review): the outputs recorded for this and the following cells
# ('<|im_end|>', Qwen2TokenizerFast) come from a Qwen tokenizer, although the
# visible assignment binds tokenizer to tokenizer_llama_1B — re-run to refresh.
tokenizer.eos_token_id

151645

In [14]:
# Special tokens declared by the inspected tokenizer (eos, pad, additional ones).
tokenizer.special_tokens_map

{'eos_token': '<|im_end|>',
 'pad_token': '<|endoftext|>',
 'additional_special_tokens': ['<|im_start|>',
  '<|im_end|>',
  '<|object_ref_start|>',
  '<|object_ref_end|>',
  '<|box_start|>',
  '<|box_end|>',
  '<|quad_start|>',
  '<|quad_end|>',
  '<|vision_start|>',
  '<|vision_end|>',
  '<|vision_pad|>',
  '<|image_pad|>',
  '<|video_pad|>']}

In [15]:
# BOS token string of the inspected tokenizer; no output was recorded for this
# cell — presumably bos_token is None for that tokenizer, TODO confirm.
tokenizer.bos_token

In [16]:
# Not every tokenizer class defines add_bos_token (a previous run raised
# AttributeError on Qwen2TokenizerFast), so read it defensively: the cell shows
# the flag when it exists and nothing (None) otherwise, and Run All keeps going.
getattr(tokenizer, "add_bos_token", None)

In [17]:
# Round-trip check: batch-encode one sentence without special tokens, then
# decode the resulting ids back to text.
tokenizer.decode(tokenizer(['ciao come stai?'], add_special_tokens=False)['input_ids'][0])

'ciao come stai?'

# Debug

In [21]:
def order_and_eq(vocab1, vocab2):
    """Compare two vocabularies after ordering their entries by value.

    Args:
        vocab1: Mapping whose items are sorted by value (token -> id in this notebook).
        vocab2: Second mapping of the same kind.

    Returns:
        A 7-tuple ``(eq, diff_len, eq_up_to, diff12, diff21, ordered1, ordered2)``:
        ``eq`` is True when both ordered item lists are identical;
        ``diff_len`` is ``len(vocab1) - len(vocab2)``;
        ``eq_up_to`` is the length of the common prefix of the ordered lists;
        ``diff12`` / ``diff21`` are the keys found only in ``vocab1`` / ``vocab2``;
        ``ordered1`` / ``ordered2`` are the ``(key, value)`` lists sorted by value.
    """
    first_sorted = sorted(vocab1.items(), key=lambda pair: pair[1])
    second_sorted = sorted(vocab2.items(), key=lambda pair: pair[1])

    # Count leading positions at which both ordered lists hold the same entry;
    # zip stops at the shorter list, so the count never exceeds either length.
    shared_prefix = 0
    for left, right in zip(first_sorted, second_sorted):
        if left != right:
            break
        shared_prefix += 1

    only_in_first = set(vocab1.keys()) - set(vocab2.keys())
    only_in_second = set(vocab2.keys()) - set(vocab1.keys())

    return (
        first_sorted == second_sorted,
        len(vocab1) - len(vocab2),
        shared_prefix,
        only_in_first,
        only_in_second,
        first_sorted,
        second_sorted,
    )

In [26]:
# Compare the Qwen3-4B and QwQ-32B vocabularies; tk1 / tk2 are their
# (token, id) lists ordered by id.
eq, diff_len, eq_up_to, diff12, diff21, tk1, tk2 = order_and_eq(tokenizer_qwen3_4B.vocab, tokenizer_qwq_32B.vocab)

In [27]:
# Summary of the comparison: equality flag, size difference, common-prefix
# length, and how many tokens exist on only one side.
eq, diff_len, eq_up_to, len(diff12), len(diff21)

(True, 0, 151669, 0, 0)

In [28]:
# Does the common prefix have exactly the size of the Qwen2.5 vocabulary?
# (The recorded answer is False.)
eq_up_to == len(tokenizer_qwen_1B.vocab)

False

In [64]:
# First entry of the id-ordered Qwen-side vocabulary returned by order_and_eq.
# (The previous name, tqwen, is not defined anywhere in the notebook and raised
# NameError on a fresh kernel; tk1 is the list bound by the comparison cell.)
tk1[0]

('!', 0)

In [43]:
# Tokens present only in the first vocabulary passed to order_and_eq.
# NOTE(review): the recorded output lists four tokens, while the summary cell
# above reports len(diff12) == 0 — this output looks stale, from an earlier
# comparison; re-run to refresh.
diff12

{'</think>', '</tool_response>', '<think>', '<tool_response>'}

In [44]:
# Tokens present only in the second vocabulary passed to order_and_eq.
diff21

set()

In [36]:
# Last 15 entries of the id-ordered QwQ vocabulary returned by order_and_eq.
# (The previous name, tqwq, is not defined anywhere in the notebook and raised
# NameError on a fresh kernel; tk2 is the QwQ-side list from the comparison cell.)
tk2[-15:]

[('<|vision_pad|>', 151654),
 ('<|image_pad|>', 151655),
 ('<|video_pad|>', 151656),
 ('<tool_call>', 151657),
 ('</tool_call>', 151658),
 ('<|fim_prefix|>', 151659),
 ('<|fim_middle|>', 151660),
 ('<|fim_suffix|>', 151661),
 ('<|fim_pad|>', 151662),
 ('<|repo_name|>', 151663),
 ('<|file_sep|>', 151664),
 ('<tool_response>', 151665),
 ('</tool_response>', 151666),
 ('<think>', 151667),
 ('</think>', 151668)]