<a href="https://colab.research.google.com/github/sngo/llms-practice/blob/main/HF_Tokenizers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import userdata
from huggingface_hub import login
from transformers import AutoTokenizer

In [2]:
# Read the Hugging Face access token stored under 'HF_TOKEN' in the Colab
# user data, then authenticate and store it as a git credential too.
hf_token = userdata.get('HF_TOKEN')
login(token=hf_token, add_to_git_credential=True)

In [None]:
# Load the tokenizer of the base Llama 3.1 8B checkpoint.
# trust_remote_code is intentionally not set: this checkpoint uses a tokenizer
# class that ships with transformers, so there is no reason to allow code from
# the model repository to be executed locally.
tokenizer = AutoTokenizer.from_pretrained('meta-llama/Meta-Llama-3.1-8B')

In [4]:
# Sample sentence to tokenize.
text = "I am excited to show Tokenizers in action to my LLM engineers"
# Calling the tokenizer directly returns an encoding; its 'input_ids' entry is
# what the following cells inspect.
tokens = tokenizer(text)

In [None]:
# Display the encoding returned by the tokenizer call.
tokens

In [None]:
# Number of token ids produced for the sentence.
# Rule of thumb for English text: roughly 4 characters per token,
# i.e. about 0.75 words per token (~1.3 tokens per word).
len(tokens['input_ids'])

In [None]:
# Convert the token ids back into a single string.
tokenizer.decode(tokens['input_ids'])

In [None]:
# tokenizer.vocab  # full mapping from token text to token id (left commented out)
# Show only the tokens added on top of the base vocabulary, i.e. the
# reserved/special tokens.
tokenizer.get_added_vocab()

In [None]:
# Model variants: a name ending in "Instruct" marks the instruction-tuned
# (chat) version of the model. Its tokenizer replaces the base one here and is
# the one used for the chat-template examples.
# trust_remote_code is intentionally not set: this checkpoint uses a tokenizer
# class that ships with transformers, so no repository code needs to run.
tokenizer = AutoTokenizer.from_pretrained('meta-llama/Meta-Llama-3.1-8B-Instruct')

In [None]:
# Conversation in the role/content message format consumed by
# apply_chat_template.
messages = [
    {"role": "system", "content": "You are a helpful assistant"},
    {"role": "user", "content": "Tell a light-hearted joke for a room of Data Scientists"},
]

# tokenize=False returns the rendered prompt as text instead of token ids;
# add_generation_prompt=True appends the marker that cues the assistant's turn.
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)

In [13]:
# Hub ids of the other tokenizers compared below:
# Phi-3 from Microsoft, Qwen2 from Alibaba, StarCoder2 from BigCode.
PHI3_MODEL_NAME = "microsoft/Phi-3-mini-4k-instruct"
QWEN2_MODEL_NAME = "Qwen/Qwen2-7B-Instruct"
STARCODER2_MODEL_NAME = "bigcode/starcoder2-3b"

In [None]:
# Load the Phi-3 and Qwen2 tokenizers. trust_remote_code is intentionally not
# set: these checkpoints use tokenizer classes that ship with transformers, so
# nothing from the model repositories needs to be executed.
phi3_tokenizer = AutoTokenizer.from_pretrained(PHI3_MODEL_NAME)
qwen2_tokenizer = AutoTokenizer.from_pretrained(QWEN2_MODEL_NAME)

# Encode the same sentence with each tokenizer to compare the resulting ids.
text = "I am excited to show Tokenizers in action to my LLM engineers"
print(tokenizer.encode(text))
print("----------PHI3----------")
phi3_tokens = phi3_tokenizer.encode(text)
print(phi3_tokens)
print()
# Decode one id at a time to show the text fragment behind each token.
# batch_decode is documented for a list of sequences, so the per-token split
# is spelled out explicitly instead of passing it a flat list of ids.
print([phi3_tokenizer.decode([token_id]) for token_id in phi3_tokens])
print("----------QWEN2----------")
qwen2_tokens = qwen2_tokenizer.encode(text)
print(qwen2_tokens)
print()
print([qwen2_tokenizer.decode([token_id]) for token_id in qwen2_tokens])

In [None]:
# Render the same conversation with each model's chat template so the
# differences in prompt format can be compared side by side.
chat_kwargs = dict(tokenize=False, add_generation_prompt=True)

print(tokenizer.apply_chat_template(messages, **chat_kwargs))
for banner, other_tokenizer in (
    ('------- PHI3-----------', phi3_tokenizer),
    ('---------QWEN2------------', qwen2_tokenizer),
):
    print(banner)
    print(other_tokenizer.apply_chat_template(messages, **chat_kwargs))

In [None]:
# Tokenize a small source snippet with the StarCoder2 tokenizer.
# trust_remote_code is intentionally not set: the checkpoint uses a tokenizer
# class that ships with transformers.
starcoder2_tokenizer = AutoTokenizer.from_pretrained(STARCODER2_MODEL_NAME)
code = """
def hello_world(person):
  print("Hello", person)
"""
# Use a dedicated name so the `tokens` encoding created earlier in the
# notebook is not overwritten; earlier cells index it with ['input_ids'] and
# would fail on a plain list if they were re-run after this cell.
code_tokens = starcoder2_tokenizer.encode(code)
# Print each token id next to the text fragment it decodes to.
for token_id in code_tokens:
    print(f"{token_id}={starcoder2_tokenizer.decode(token_id)}")