In [None]:
from huggingface_hub import login
from transformers import AutoTokenizer
import os

In [None]:
# Read the Hugging Face access token from the environment so it never appears in the notebook.
hf_api_token = os.environ.get("HF_API_TOKEN")
# Authenticate this session; add_to_git_credential=True also asks huggingface_hub
# to hand the token to the git credential store.
login(token=hf_api_token, add_to_git_credential=True)

In [24]:
# Load the tokenizer of the base Llama 3.1 8B checkpoint from the Hugging Face Hub.
# trust_remote_code is deliberately left at its default (False): this tokenizer is
# handled by transformers' built-in classes, and enabling the flag would permit
# Python code shipped inside the model repository to execute on this machine.
tokenizer = AutoTokenizer.from_pretrained('meta-llama/Llama-3.1-8B')

In [29]:
# Sample sentence used to inspect how the tokenizer splits plain text.
text = "I am learning how tokenizer handles both encoding and decoding works"
# encode() turns the string into a list of integer token ids; as the output shows,
# the list starts with 128000, the id of the <|begin_of_text|> marker.
tokens = tokenizer.encode(text)

# Bare last expression so the notebook displays the id list.
tokens

[128000, 40, 1097, 6975, 1268, 47058, 13777, 2225, 11418, 323, 48216, 4375]

In [30]:
# Measure the same sentence three ways: characters, whitespace-separated words, tokens.
character_count = len(text)
# split() with no argument splits on any run of whitespace and drops empty pieces,
# so repeated spaces, tabs or newlines are not miscounted as extra "words"
# (split(' ') would return an empty string for every doubled space).
word_count = len(text.split())
# Counts every id returned by encode(), including the special marker it prepends,
# which is why this can exceed the word count.
token_count = len(tokens)
print(f"There are {character_count} characters, {word_count} words and {token_count} tokens")

There are 68 characters, 11 words and 12 tokens


In [31]:
# Turn the whole id list back into one string; special markers such as
# <|begin_of_text|> are kept in the decoded text (see the output).
tokenizer.decode(tokens)

'<|begin_of_text|>I am learning how tokenizer handles both encoding and decoding works'

In [32]:
# Decode every id on its own to reveal the text piece behind each token
# (note the leading spaces that belong to the tokens themselves).
[tokenizer.decode(token_id) for token_id in tokens]

['<|begin_of_text|>',
 'I',
 ' am',
 ' learning',
 ' how',
 ' tokenizer',
 ' handles',
 ' both',
 ' encoding',
 ' and',
 ' decoding',
 ' works']

In [34]:
# Display the tokenizer's added-vocabulary mapping (token string -> id).
# The output is long: it lists the special markers such as <|begin_of_text|>
# and the reserved special tokens.
tokenizer.get_added_vocab()

{'<|begin_of_text|>': 128000,
 '<|end_of_text|>': 128001,
 '<|reserved_special_token_0|>': 128002,
 '<|reserved_special_token_1|>': 128003,
 '<|finetune_right_pad_id|>': 128004,
 '<|reserved_special_token_2|>': 128005,
 '<|start_header_id|>': 128006,
 '<|end_header_id|>': 128007,
 '<|eom_id|>': 128008,
 '<|eot_id|>': 128009,
 '<|python_tag|>': 128010,
 '<|reserved_special_token_3|>': 128011,
 '<|reserved_special_token_4|>': 128012,
 '<|reserved_special_token_5|>': 128013,
 '<|reserved_special_token_6|>': 128014,
 '<|reserved_special_token_7|>': 128015,
 '<|reserved_special_token_8|>': 128016,
 '<|reserved_special_token_9|>': 128017,
 '<|reserved_special_token_10|>': 128018,
 '<|reserved_special_token_11|>': 128019,
 '<|reserved_special_token_12|>': 128020,
 '<|reserved_special_token_13|>': 128021,
 '<|reserved_special_token_14|>': 128022,
 '<|reserved_special_token_15|>': 128023,
 '<|reserved_special_token_16|>': 128024,
 '<|reserved_special_token_17|>': 128025,
 '<|reserved_special_to

In [40]:
# Switch to the instruction-tuned Llama 3.1 8B checkpoint, whose tokenizer is used
# with apply_chat_template in the following cells.
# NOTE: this rebinds `tokenizer`, so every later cell uses the Instruct tokenizer.
# trust_remote_code is deliberately left at its default (False): this tokenizer is
# handled by transformers' built-in classes, and enabling the flag would permit
# Python code shipped inside the model repository to execute on this machine.
tokenizer = AutoTokenizer.from_pretrained('meta-llama/Llama-3.1-8B-Instruct')

In [41]:
# Conversation written as (role, content) pairs and expanded into the
# list-of-dicts chat format expected by apply_chat_template.
messages = [
    {"role": role, "content": content}
    for role, content in (
        ("system", "You are a helpful assistant"),
        ("user", "explain about tokenizer to a non-AI person"),
    )
]

# Render the conversation as plain text (tokenize=False) and append the empty
# assistant header (add_generation_prompt=True) so a model would answer next.
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)
print(prompt)

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

You are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>

explain about tokenizer to a non-AI person<|eot_id|><|start_header_id|>assistant<|end_header_id|>




### Trying new models


In [49]:

# Hugging Face Hub repository ids of the additional models whose tokenizers are
# loaded with AutoTokenizer.from_pretrained and compared against Llama.
PHI4 = "microsoft/Phi-4-mini-instruct"
DEEPSEEK = "deepseek-ai/DeepSeek-V3.1"
QWEN_CODER = "Qwen/Qwen2.5-Coder-7B-Instruct"

In [43]:
phi4_tokenizer = AutoTokenizer.from_pretrained(PHI4)

text = "Testing and learning about how. hugging face tokenizer works with different opensource models"

# Run the same sentence through both tokenizers: for each one print a heading,
# the token ids, and then the text piece behind every id.
for heading, active_tokenizer in (("llama:", tokenizer), ("\nPhi4", phi4_tokenizer)):
    print(heading)
    tokens = active_tokenizer.encode(text)
    print(tokens)
    print(active_tokenizer.batch_decode(tokens))


llama:
[128000, 16856, 323, 6975, 922, 1268, 13, 305, 36368, 3663, 47058, 4375, 449, 2204, 16264, 930, 4211]
['<|begin_of_text|>', 'Testing', ' and', ' learning', ' about', ' how', '.', ' h', 'ugging', ' face', ' tokenizer', ' works', ' with', ' different', ' opens', 'ource', ' models']

Phi4
[30079, 326, 7524, 1078, 1495, 13, 196946, 4950, 99665, 5882, 483, 2647, 24061, 1310, 7015]
['Testing', ' and', ' learning', ' about', ' how', '.', ' hugging', ' face', ' tokenizer', ' works', ' with', ' different', ' opens', 'ource', ' models']


In [44]:
# Render the same conversation with each tokenizer's own chat template to
# compare the prompt formats side by side.
for heading, active_tokenizer in (("Llama:", tokenizer), ("\nPhi 4:", phi4_tokenizer)):
    print(heading)
    print(active_tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True))

Llama:
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

You are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>

explain about tokenizer to a non-AI person<|eot_id|><|start_header_id|>assistant<|end_header_id|>



Phi 4:
<|system|>You are a helpful assistant<|end|><|user|>explain about tokenizer to a non-AI person<|end|><|assistant|>


In [46]:
deepseek_tokenizer = AutoTokenizer.from_pretrained(DEEPSEEK)

text = "Testing and learning about how. hugging face tokenizer works with different opensource models"

# Print the token ids produced by each tokenizer for the same sentence,
# with one blank line between consecutive results.
for position, active_tokenizer in enumerate((tokenizer, phi4_tokenizer, deepseek_tokenizer)):
    if position > 0:
        print()
    print(active_tokenizer.encode(text))

[128000, 16856, 323, 6975, 922, 1268, 13, 305, 36368, 3663, 47058, 4375, 449, 2204, 16264, 930, 4211]

[30079, 326, 7524, 1078, 1495, 13, 196946, 4950, 99665, 5882, 483, 2647, 24061, 1310, 7015]

[0, 54886, 305, 3607, 943, 1192, 16, 127816, 4219, 17840, 9160, 2984, 418, 1688, 22103, 2319, 5363]


In [47]:
# Render the same conversation with each tokenizer's own chat template:
# a heading per model followed by the formatted prompt text.
for heading, active_tokenizer in (
    ("Llama:", tokenizer),
    ("\nPhi:", phi4_tokenizer),
    ("\nDeepSeek:", deepseek_tokenizer),
):
    print(heading)
    print(active_tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True))

Llama:
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

You are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>

explain about tokenizer to a non-AI person<|eot_id|><|start_header_id|>assistant<|end_header_id|>



Phi:
<|system|>You are a helpful assistant<|end|><|user|>explain about tokenizer to a non-AI person<|end|><|assistant|>

DeepSeek:
<｜begin▁of▁sentence｜>You are a helpful assistant<｜User｜>explain about tokenizer to a non-AI person<｜Assistant｜></think>


In [50]:
qwen_tokenizer = AutoTokenizer.from_pretrained(QWEN_CODER)

# Small Python snippet (leading newline, two-space body indent) used to see
# how a code-oriented tokenizer splits source text.
code = (
    "\n"
    "def hello_world(person):\n"
    '  print("Hello", person)\n'
)

tokens = qwen_tokenizer.encode(code)

# One line per token in the form "<id>=<decoded text>"; whitespace and newline
# tokens therefore show up as seemingly empty or broken lines in the output.
for token in tokens:
    print(f"{token}={qwen_tokenizer.decode(token)}")

198=

750=def
23811= hello
31792=_world
29766=(person
982=):

220= 
1173= print
445=("
9707=Hello
497=",
1697= person
340=)

