In [1]:
from transformers import AutoTokenizer

In [2]:
# Load one tokenizer per checkpoint, grouped by model family.
# Within each group the targets and the repo ids are listed in the same order.

# Meta Llama instruct checkpoints.
(
    tokenizer_llama_1B,
    tokenizer_llama_3B,
    tokenizer_llama_8B,
    tokenizer_llama_70B,
) = (
    AutoTokenizer.from_pretrained(repo_id)
    for repo_id in (
        "meta-llama/Llama-3.2-1B-Instruct",
        "meta-llama/Llama-3.2-3B-Instruct",
        "meta-llama/Llama-3.1-8B-Instruct",
        "meta-llama/Llama-3.3-70B-Instruct",
    )
)

# Alibaba Qwen2.5 instruct checkpoints (the "1B" name refers to the 1.5B model).
(
    tokenizer_qwen_1B,
    tokenizer_qwen_3B,
    tokenizer_qwen_7B,
    tokenizer_qwen_14B,
    tokenizer_qwen_32B,
    tokenizer_qwen_72B,
) = (
    AutoTokenizer.from_pretrained(repo_id)
    for repo_id in (
        "Qwen/Qwen2.5-1.5B-Instruct",
        "Qwen/Qwen2.5-3B-Instruct",
        "Qwen/Qwen2.5-7B-Instruct",
        "Qwen/Qwen2.5-14B-Instruct",
        "Qwen/Qwen2.5-32B-Instruct",
        "Qwen/Qwen2.5-72B-Instruct",
    )
)

# Microsoft Phi: only phi-4 is loaded. phi-3 has a different tokenizer, so the
# small-model sizes are covered by the other families instead.
tokenizer_phi4_14B = AutoTokenizer.from_pretrained("microsoft/phi-4")

# Google Gemma 3 instruction-tuned checkpoints.
(
    tokenizer_gemma_1B,
    tokenizer_gemma_4B,
    tokenizer_gemma_12B,
    tokenizer_gemma_27B,
) = (
    AutoTokenizer.from_pretrained(repo_id)
    for repo_id in (
        "google/gemma-3-1b-it",
        "google/gemma-3-4b-it",
        "google/gemma-3-12b-it",
        "google/gemma-3-27b-it",
    )
)

# Mistral.
tokenizer_mistral_24B = AutoTokenizer.from_pretrained("mistralai/Mistral-Small-24B-Instruct-2501")

# DeepSeek-R1 distills (three Qwen-based, one Llama-based).
(
    tokenizer_deepseek_qwen_1B,
    tokenizer_deepseek_qwen_7B,
    tokenizer_deepseek_qwen_14B,
    tokenizer_deepseek_llama_8B,
) = (
    AutoTokenizer.from_pretrained(repo_id)
    for repo_id in (
        "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
        "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
        "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
        "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
    )
)

In [3]:
# Every Gemma 3 size is expected to expose the same vocabulary as the 1B model,
# which serves as the reference. The reference is not compared against itself:
# that check can never fail and therefore verifies nothing.
assert tokenizer_gemma_4B.vocab == tokenizer_gemma_1B.vocab, "gemma 4B vocab differs from gemma 1B"
assert tokenizer_gemma_12B.vocab == tokenizer_gemma_1B.vocab, "gemma 12B vocab differs from gemma 1B"
assert tokenizer_gemma_27B.vocab == tokenizer_gemma_1B.vocab, "gemma 27B vocab differs from gemma 1B"

In [4]:
# Every Qwen2.5 size is expected to expose the same vocabulary as the 1.5B model
# (tokenizer_qwen_1B), which serves as the reference. The 3B tokenizer is
# included here because it is loaded and used by a later cell; the reference is
# not compared against itself, since that check can never fail.
assert tokenizer_qwen_3B.vocab == tokenizer_qwen_1B.vocab, "qwen 3B vocab differs from qwen 1.5B"
assert tokenizer_qwen_7B.vocab == tokenizer_qwen_1B.vocab, "qwen 7B vocab differs from qwen 1.5B"
assert tokenizer_qwen_14B.vocab == tokenizer_qwen_1B.vocab, "qwen 14B vocab differs from qwen 1.5B"
assert tokenizer_qwen_32B.vocab == tokenizer_qwen_1B.vocab, "qwen 32B vocab differs from qwen 1.5B"
assert tokenizer_qwen_72B.vocab == tokenizer_qwen_1B.vocab, "qwen 72B vocab differs from qwen 1.5B"

In [5]:
# Every Llama size is expected to expose the same vocabulary as the 1B model,
# which serves as the reference. The reference is not compared against itself:
# that check can never fail and therefore verifies nothing.
assert tokenizer_llama_3B.vocab == tokenizer_llama_1B.vocab, "llama 3B vocab differs from llama 1B"
assert tokenizer_llama_8B.vocab == tokenizer_llama_1B.vocab, "llama 8B vocab differs from llama 1B"
assert tokenizer_llama_70B.vocab == tokenizer_llama_1B.vocab, "llama 70B vocab differs from llama 1B"

In [6]:
# The larger DeepSeek-R1 Qwen distills must expose the same vocabulary as the
# 1.5B distill (tokenizer_deepseek_qwen_1B).
for distill_tokenizer in (tokenizer_deepseek_qwen_7B, tokenizer_deepseek_qwen_14B):
    assert distill_tokenizer.vocab == tokenizer_deepseek_qwen_1B.vocab

In [7]:
# Pairs of tokenizers whose vocabularies must NOT be equal.
distinct_vocab_pairs = (
    (tokenizer_qwen_1B, tokenizer_llama_1B),                    # qwen != llama
    (tokenizer_phi4_14B, tokenizer_llama_1B),                   # phi != llama
    (tokenizer_gemma_1B, tokenizer_llama_1B),                   # gemma != llama
    (tokenizer_mistral_24B, tokenizer_llama_1B),                # mistral != llama
    (tokenizer_qwen_1B, tokenizer_deepseek_qwen_1B),            # qwen != deepseek qwen
    (tokenizer_llama_1B, tokenizer_deepseek_llama_8B),          # llama != deepseek llama
    (tokenizer_deepseek_qwen_1B, tokenizer_deepseek_llama_8B),  # deepseek qwen != deepseek llama
)
for left_tokenizer, right_tokenizer in distinct_vocab_pairs:
    assert left_tokenizer.vocab != right_tokenizer.vocab

In [13]:
# Probe: token ids the Qwen2.5-3B tokenizer produces for a short sentence.
tokenizer_qwen_3B.encode('this is a test sentence')

[574, 374, 264, 1273, 11652]

In [16]:
# Probe: the same sentence through the Llama 1B tokenizer. The recorded output
# has one more id than the Qwen encoding and starts with 128000
# (presumably the BOS token; confirm with tokenizer_llama_1B.bos_token_id).
tokenizer_llama_1B.encode('this is a test sentence')

[128000, 576, 374, 264, 1296, 11914]

In [17]:
# Probe: how the Qwen tokenizer splits angle-bracket placeholders followed by " .".
tokenizer_qwen_1B.encode('<a> <b> <c> .')

[9312, 29, 366, 65, 29, 366, 66, 29, 659]

In [26]:
# Probe: look up id 200000 (presumably outside the vocabulary). The recorded
# result shows convert_ids_to_tokens returns None here instead of raising.
tokenizer_qwen_1B.convert_ids_to_tokens(200000) is None

True

In [23]:
# Number of entries in the phi-4 tokenizer's `.vocab`.
len(tokenizer_phi4_14B.vocab)

100352

# Find tokenizer config

In [3]:
# Tokenizer inspected by the cells below; rebind this name to inspect another one.
tokenizer = tokenizer_gemma_1B

In [4]:
# Id of the last token produced when encoding a single newline character,
# and the token string that id maps back to.
newline_token_id = tokenizer.encode("\n")[-1]
newline_token = tokenizer.convert_ids_to_tokens(newline_token_id)
print(newline_token_id)
print(newline_token)

107




In [5]:
# Token ids of the "\nFact:" marker, without any special tokens.
# add_special_tokens=False is used instead of slicing off the first id: slicing
# assumes exactly one BOS token is prepended and would silently drop the
# newline token on a tokenizer that adds none.
switch_pattern = tokenizer.encode("\nFact:", add_special_tokens=False)
print(switch_pattern)

[107, 27711, 236787]


In [6]:
# Token ids of the "\nAnswer:" marker, without any special tokens.
# add_special_tokens=False is used instead of slicing off the first id: slicing
# assumes exactly one BOS token is prepended and would silently drop the
# newline token on a tokenizer that adds none.
# NOTE: this rebinds `switch_pattern`, replacing any value assigned earlier.
switch_pattern = tokenizer.encode("\nAnswer:", add_special_tokens=False)
print(switch_pattern)

[107, 7925, 236787]


In [7]:
# Id of the final token when encoding "triple ." with the selected tokenizer.
triple_token_ids = tokenizer.encode("triple .")
end_of_triple = triple_token_ids[-1]
print(end_of_triple)

783


In [8]:
# End-of-sequence token id of the selected tokenizer.
tokenizer.eos_token_id

1

In [9]:
# Mapping from special-token role (bos_token, eos_token, ...) to its token string.
tokenizer.special_tokens_map

{'bos_token': '<bos>',
 'eos_token': '<eos>',
 'unk_token': '<unk>',
 'pad_token': '<pad>',
 'boi_token': '<start_of_image>',
 'eoi_token': '<end_of_image>',
 'image_token': '<image_soft_token>'}

In [10]:
# Beginning-of-sequence token string of the selected tokenizer.
tokenizer.bos_token

'<bos>'

In [11]:
# The tokenizer's add_bos_token setting (presumably whether encoding prepends
# the BOS token; confirm against the tokenizer documentation).
tokenizer.add_bos_token

True

In [17]:
# Round-trip check: encode a one-sentence batch without special tokens, then
# decode the first (only) sequence back to text.
round_trip_ids = tokenizer(['ciao come stai?'], add_special_tokens=False)['input_ids'][0]
tokenizer.decode(round_trip_ids)

'ciao come stai?'