In [1]:
from google.colab import userdata
from huggingface_hub import login
from transformers import AutoTokenizer

In [3]:
hf_token = userdata.get('HF_TOKEN')
if hf_token and hf_token.startswith("hf_"):
  print("HF key looks good so far")
else:
  print("HF key is not set - please click the key in the left sidebar")
login(hf_token, add_to_git_credential=True)

gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)
  if gpu_info.find('Tesla T4') >= 0:
    print("Success - Connected to a T4")
  else:
    print("NOT CONNECTED TO A T4")

HF key looks good so far
Wed Dec 17 07:53:38 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   48C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                       

In [2]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")

tokenizer_config.json:   0%|          | 0.00/996 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [3]:
text = "I am excited to show Tokenizers in action to my LLM engineers"
tokens = tokenizer.encode(text)
tokens

[1,
 315,
 837,
 9534,
 298,
 1347,
 16625,
 17916,
 297,
 2992,
 298,
 586,
 16704,
 28755,
 22488]

In [4]:
character_count = len(text)
word_count = len(text.split(' '))
token_count = len(tokens)
print(f"There are {character_count} characters, {word_count} words and {token_count} tokens")

There are 61 characters, 12 words and 15 tokens


In [5]:
tokenizer.decode(tokens)

'<s> I am excited to show Tokenizers in action to my LLM engineers'

In [6]:
tokenizer.batch_decode(tokens)

['<s>',
 'I',
 'am',
 'excited',
 'to',
 'show',
 'Token',
 'izers',
 'in',
 'action',
 'to',
 'my',
 'LL',
 'M',
 'engineers']

In [7]:
tokenizer.get_added_vocab()

{'<unk>': 0, '<s>': 1, '</s>': 2}

In [8]:
len(tokenizer.vocab)

32000

In [17]:
MISTRAL = "mistralai/Mistral-7B-Instruct-v0.2"
PHI4 = "microsoft/Phi-4-mini-instruct"
DEEPSEEK = "deepseek-ai/DeepSeek-V3.1"
QWEN_CODER = "Qwen/Qwen2.5-Coder-7B-Instruct"

In [18]:
tokenizer = AutoTokenizer.from_pretrained(MISTRAL)
phi4_tokenizer = AutoTokenizer.from_pretrained(PHI4)
deepseek_tokenizer = AutoTokenizer.from_pretrained(DEEPSEEK)
qwen_tokenizer = AutoTokenizer.from_pretrained(QWEN_CODER)

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

In [19]:
MISTRAL_CHAT_TEMPLATE = """{% for message in messages %}
{% if message['role'] == 'system' %}
{% set system_message = message['content'] %}
{% elif message['role'] == 'user' %}
[INST] {{ system_message + "\\n" if system_message is defined else "" }}{{ message['content'] }} [/INST]
{% elif message['role'] == 'assistant' %}
{{ message['content'] }}</s>
{% endif %}
{% endfor %}
"""

In [20]:
tokenizer.chat_template = MISTRAL_CHAT_TEMPLATE

In [21]:
messages = [
    {"role": "system", "content": "You are a helpful assistant"},
    {"role": "user", "content": "Tell a light-hearted joke for a room of Data Scientists"}
]

In [22]:
print("Mistral:")
print(tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True))

print("\nPhi-4:")
print(phi4_tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True))

print("\nDeepSeek:")
print(deepseek_tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True))


Mistral:
[INST] Tell a light-hearted joke for a room of Data Scientists [/INST]


Phi-4:
<|system|>You are a helpful assistant<|end|><|user|>Tell a light-hearted joke for a room of Data Scientists<|end|><|assistant|>

DeepSeek:
<｜begin▁of▁sentence｜>You are a helpful assistant<｜User｜>Tell a light-hearted joke for a room of Data Scientists<｜Assistant｜></think>


In [23]:
text = "I am curiously excited to show Hugging Face Tokenizers in action to my LLM engineers"

print("\nMistral tokens:")
tokens = tokenizer.encode(text)
print(tokens)
print(tokenizer.batch_decode(tokens))

print("\nPhi-4 tokens:")
tokens = phi4_tokenizer.encode(text)
print(tokens)
print(phi4_tokenizer.batch_decode(tokens))

print("\nDeepSeek tokens:")
print(deepseek_tokenizer.encode(text))


Mistral tokens:
[1, 315, 837, 1191, 7990, 9534, 298, 1347, 13287, 3080, 7197, 16625, 17916, 297, 2992, 298, 586, 16704, 28755, 22488]
['<s>', 'I', 'am', 'cur', 'iously', 'excited', 'to', 'show', 'Hug', 'ging', 'Face', 'Token', 'izers', 'in', 'action', 'to', 'my', 'LL', 'M', 'engineers']

Phi-4 tokens:
[40, 939, 4396, 23138, 15209, 316, 2356, 59116, 4512, 29049, 17951, 24223, 306, 3736, 316, 922, 451, 19641, 32437]
['I', ' am', ' cur', 'iously', ' excited', ' to', ' show', ' Hug', 'ging', ' Face', ' Token', 'izers', ' in', ' action', ' to', ' my', ' L', 'LM', ' engineers']

DeepSeek tokens:
[0, 43, 1030, 108771, 15046, 304, 1801, 24133, 5426, 11906, 47948, 24524, 295, 4271, 304, 1026, 33792, 47, 26170]


In [24]:
code = """
def hello_world(person):
  print("Hello", person)
"""

In [25]:
tokens = qwen_tokenizer.encode(code)
for token in tokens:
    print(f"{token} = {qwen_tokenizer.decode(token)}")

198 = 

750 = def
23811 =  hello
31792 = _world
29766 = (person
982 = ):

220 =  
1173 =  print
445 = ("
9707 = Hello
497 = ",
1697 =  person
340 = )

