In [3]:
from huggingface_hub import login
from transformers import AutoTokenizer

In [4]:
import os
from dotenv import load_dotenv

In [6]:
# Read variables from a local .env file into the process environment first.
# load_dotenv was imported but never invoked, so on a fresh kernel HF_TOKEN
# was only visible if it had already been exported in the shell.
load_dotenv()

hf_token = os.getenv('HF_TOKEN')
# NOTE(review): if HF_TOKEN is still unset, None is passed through here —
# presumably huggingface_hub then falls back to an interactive prompt; confirm.
login(hf_token, add_to_git_credential=True)

Token is valid (permission: read).
Your token has been saved in your configured git credential helpers (manager).
Your token has been saved to C:\Users\ma2\.cache\huggingface\token
Login successful


### Llama 3.1

In [28]:
# Load the tokenizer for the base Llama 3.1 8B checkpoint.
# trust_remote_code is left at its default: this cell only needs a tokenizer,
# so there is no reason to opt in to executing code from the Hub repo locally.
tokenizer = AutoTokenizer.from_pretrained('meta-llama/Meta-Llama-3.1-8B')

In [29]:
# Sample sentence; the Phi-3 and Qwen2 cells further down encode this same `text`.
text = "The quick brown fox jump over the lazy dog"
# Convert the sentence to a list of integer token ids.
tokens = tokenizer.encode(text)
# Bare last expression so the notebook displays the id list.
tokens

[128000, 791, 4062, 14198, 39935, 7940, 927, 279, 16053, 5679]

In [30]:
# Number of ids produced for the sentence (the count includes the leading
# <|begin_of_text|> id that encode() prepended).
len(tokens)

10

In [31]:
# Turn the full id list back into a single string.
tokenizer.decode(tokens)

'<|begin_of_text|>The quick brown fox jump over the lazy dog'

In [32]:
# Given a flat list of ids, batch_decode decodes each id on its own, which
# yields one string per token and shows where the sentence was split.
tokenizer.batch_decode(tokens)

['<|begin_of_text|>',
 'The',
 ' quick',
 ' brown',
 ' fox',
 ' jump',
 ' over',
 ' the',
 ' lazy',
 ' dog']

In [33]:
# Mapping of the tokenizer's added (special) tokens to their ids.
tokenizer.get_added_vocab()

{'<|begin_of_text|>': 128000,
 '<|end_of_text|>': 128001,
 '<|reserved_special_token_0|>': 128002,
 '<|reserved_special_token_1|>': 128003,
 '<|finetune_right_pad_id|>': 128004,
 '<|reserved_special_token_2|>': 128005,
 '<|start_header_id|>': 128006,
 '<|end_header_id|>': 128007,
 '<|eom_id|>': 128008,
 '<|eot_id|>': 128009,
 '<|python_tag|>': 128010,
 '<|reserved_special_token_3|>': 128011,
 '<|reserved_special_token_4|>': 128012,
 '<|reserved_special_token_5|>': 128013,
 '<|reserved_special_token_6|>': 128014,
 '<|reserved_special_token_7|>': 128015,
 '<|reserved_special_token_8|>': 128016,
 '<|reserved_special_token_9|>': 128017,
 '<|reserved_special_token_10|>': 128018,
 '<|reserved_special_token_11|>': 128019,
 '<|reserved_special_token_12|>': 128020,
 '<|reserved_special_token_13|>': 128021,
 '<|reserved_special_token_14|>': 128022,
 '<|reserved_special_token_15|>': 128023,
 '<|reserved_special_token_16|>': 128024,
 '<|reserved_special_token_17|>': 128025,
 '<|reserved_special_to

### Converting messages to a chat prompt

In [34]:
# Rebind `tokenizer` to the Instruct variant of Llama 3.1 8B; the chat-template
# cells that follow use this name.
# trust_remote_code is left at its default: this cell only needs a tokenizer,
# so there is no reason to opt in to executing code from the Hub repo locally.
tokenizer = AutoTokenizer.from_pretrained('meta-llama/Meta-Llama-3.1-8B-Instruct')

In [41]:
# Conversation as a list of role/content dicts: one system turn, one user turn.
messages = [
    {
        "role": "system",
        "content": "You are a helpful assistant",
    },
    {
        "role": "user",
        "content": "Tell a light-hearted joke about the monkey",
    },
]

# Render the conversation as plain text (tokenize=False) and append the
# generation prompt (add_generation_prompt=True).
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)
print(prompt)

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

You are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>

Tell a light-hearted joke about the monkey<|eot_id|><|start_header_id|>assistant<|end_header_id|>




### Other models

In [42]:
# Hugging Face Hub repo ids of the other tokenizers loaded in this notebook.
PHI3_MODEL_NAME = "microsoft/Phi-3-mini-4k-instruct"
QWEN2_MODEL_NAME = "Qwen/Qwen2-7B-Instruct"
STARCODER2_MODEL_NAME = "bigcode/starcoder2-3b"

In [43]:
# Load the Phi-3 tokenizer and encode the same sample sentence for comparison.
# trust_remote_code is left at its default: only a tokenizer is loaded here,
# so there is no reason to opt in to executing code from the Hub repo locally.
PHI3_tokenizer = AutoTokenizer.from_pretrained(PHI3_MODEL_NAME)
# NOTE: this rebinds `tokens`, replacing the ids produced by the previous tokenizer.
tokens = PHI3_tokenizer.encode(text)
tokens

[450, 4996, 17354, 1701, 29916, 12500, 975, 278, 17366, 11203]

In [44]:
# Decode each id separately to see how Phi-3 splits the sentence into pieces.
PHI3_tokenizer.batch_decode(tokens)

['The', 'quick', 'brown', 'fo', 'x', 'jump', 'over', 'the', 'lazy', 'dog']

In [45]:
# Mapping of the Phi-3 tokenizer's added (special) tokens to their ids.
PHI3_tokenizer.get_added_vocab()

{'<unk>': 0,
 '<s>': 1,
 '</s>': 2,
 '<|endoftext|>': 32000,
 '<|assistant|>': 32001,
 '<|placeholder1|>': 32002,
 '<|placeholder2|>': 32003,
 '<|placeholder3|>': 32004,
 '<|placeholder4|>': 32005,
 '<|system|>': 32006,
 '<|end|>': 32007,
 '<|placeholder5|>': 32008,
 '<|placeholder6|>': 32009,
 '<|user|>': 32010}

In [46]:
# Side-by-side view of the same conversation rendered with two chat templates:
# Phi-3 first, a blank separator line, then whatever `tokenizer` is currently
# bound to (expected to be the Llama 3.1 Instruct tokenizer).
print(
    PHI3_tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
)
print()
print(
    tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
)

<|system|>
You are a helpful assistant<|end|>
<|user|>
Tell a light-hearted joke about the monkey<|end|>
<|assistant|>


<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

You are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>

Tell a light-hearted joke about the monkey<|eot_id|><|start_header_id|>assistant<|end_header_id|>




In [47]:
# Load the Qwen2 tokenizer and encode the same sample sentence for comparison.
# trust_remote_code is left at its default: only a tokenizer is loaded here,
# so there is no reason to opt in to executing code from the Hub repo locally.
QWEN2_tokenizer = AutoTokenizer.from_pretrained(QWEN2_MODEL_NAME)
# NOTE: this rebinds `tokens`, replacing the ids produced by the previous tokenizer.
tokens = QWEN2_tokenizer.encode(text)
tokens

tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

[785, 3974, 13876, 38835, 7784, 916, 279, 15678, 5562]

In [48]:
# Turn the Qwen2 id list back into a single string.
QWEN2_tokenizer.decode(tokens)

'The quick brown fox jump over the lazy dog'

In [49]:
# Mapping of the Qwen2 tokenizer's added (special) tokens to their ids.
QWEN2_tokenizer.get_added_vocab()

{'<|endoftext|>': 151643, '<|im_start|>': 151644, '<|im_end|>': 151645}

In [50]:
# Side-by-side view of the same conversation rendered with two chat templates:
# Qwen2 first, a blank separator line, then whatever `tokenizer` is currently
# bound to (expected to be the Llama 3.1 Instruct tokenizer).
print(
    QWEN2_tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
)
print()
print(
    tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
)

<|im_start|>system
You are a helpful assistant<|im_end|>
<|im_start|>user
Tell a light-hearted joke about the monkey<|im_end|>
<|im_start|>assistant


<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

You are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>

Tell a light-hearted joke about the monkey<|eot_id|><|start_header_id|>assistant<|end_header_id|>




In [54]:
# Load the StarCoder2 tokenizer and run it on a small Python snippet.
# trust_remote_code is left at its default: only a tokenizer is loaded here,
# so there is no reason to opt in to executing code from the Hub repo locally.
STARCODER2_tokenizer = AutoTokenizer.from_pretrained(STARCODER2_MODEL_NAME)

# The snippet deliberately keeps its leading/trailing newlines and two-space
# indent so the whitespace shows up in the token ids.
code = """
def hello_world(person):
  print("Hello", person)
"""

# NOTE: this rebinds `tokens`, replacing the ids produced by the previous tokenizer.
tokens = STARCODER2_tokenizer.encode(code)

# Three views separated by blank lines: raw ids, the round-tripped text,
# and the tokenizer's added (special) tokens.
print(tokens)
print()
print(STARCODER2_tokenizer.decode(tokens))
print()
print(STARCODER2_tokenizer.get_added_vocab())

[222, 610, 17966, 100, 5879, 45, 6427, 731, 353, 1489, 459, 8302, 411, 4944, 46, 222]


def hello_world(person):
  print("Hello", person)


{'<|endoftext|>': 0, '<fim_prefix>': 1, '<fim_middle>': 2, '<fim_suffix>': 3, '<fim_pad>': 4, '<repo_name>': 5, '<file_sep>': 6, '<issue_start>': 7, '<issue_comment>': 8, '<issue_closed>': 9, '<jupyter_start>': 10, '<jupyter_text>': 11, '<jupyter_code>': 12, '<jupyter_output>': 13, '<jupyter_script>': 14, '<empty_output>': 15, '<code_to_intermediate>': 16, '<intermediate_to_code>': 17, '<pr>': 18, '<pr_status>': 19, '<pr_is_merged>': 20, '<pr_base>': 21, '<pr_file>': 22, '<pr_base_code>': 23, '<pr_diff>': 24, '<pr_diff_hunk>': 25, '<pr_comment>': 26, '<pr_event_id>': 27, '<pr_review>': 28, '<pr_review_state>': 29, '<pr_review_comment>': 30, '<pr_in_reply_to_review_id>': 31, '<pr_in_reply_to_comment_id>': 32, '<pr_diff_hunk_comment_line>': 33, '<NAME>': 34, '<EMAIL>': 35, '<KEY>': 36, '<PASSWORD>': 37}
