In [1]:
import os
import sys
import numpy as np
import pandas as pd
import torch
import pytorch_lightning as pl
from transformers import AutoTokenizer
from typing import NamedTuple, Dict, List, Callable
from tqdm import tqdm

In [2]:
tokenizers = {
    "deberta": AutoTokenizer.from_pretrained("huggingface/microsoft/deberta-v3-base"),
    "xlmr": AutoTokenizer.from_pretrained("huggingface/FacebookAI/xlm-roberta-base"),
    "llama3": AutoTokenizer.from_pretrained("huggingface/meta-llama/Meta-Llama-3-8B"),
}

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"
percentiles=[.01, .05, .1, .2, .3, .4, .5, .6, .7, .8, .9, .95, .99]
pd.set_option("max_info_columns", 9999)
pd.set_option("display.max_columns", 9999)
pd.set_option("display.max_rows", 9999)
pd.set_option('max_colwidth', 9999)
tqdm.pandas()

In [4]:
for name, tokenizer in tokenizers.items():
    if name.startswith("llama"):
        tokenizer.pad_token = tokenizer.eos_token
    ct = tokenizer.chat_template
    if ct is None or len(ct)==0: 
        # Default chat template described in https://huggingface.co/docs/transformers/main/en/chat_templating
        # The one-liner also includes handy support for generation prompts, but note that it doesn’t add BOS or EOS tokens! 
        # If your model expects those, they won’t be added automatically by apply_chat_template 
        # in other words, the text will be tokenized with add_special_tokens=False. 
        # This is to avoid potential conflicts between the template and the add_special_tokens logic. 
        # If your model expects special tokens, make sure to add them to the template!
        tokenizer.chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
    print(f"=====  {name}  =====\n{tokenizer}\n{tokenizer.default_chat_template}")

=====  deberta  =====
DebertaV2TokenizerFast(name_or_path='huggingface/microsoft/deberta-v3-base', vocab_size=128000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '[CLS]', 'eos_token': '[SEP]', 'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	128000: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
{% for mess

In [5]:
chat = [
  {"role": "user", "content": "Hello, how are you?"},
  {"role": "assistant", "content": "I'm doing great. How can I help you today?"},
  {"role": "user", "content": "I'd like to show off how chat templating works!"},
]
for name, tokenizer in tokenizers.items():
    enc = tokenizer.apply_chat_template(
        chat, 
        tokenize=True,
        add_generation_prompt=False,
        truncation=True,
        max_length=128,
        padding=True,
        return_dict=True,
    )
    print(name)
    chat_str = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=False)
    print(chat_str)
    for k,v in enc.items():
        print(f"{name}.{k}")
        if k=="input_ids":
            print(tokenizer.convert_ids_to_tokens(v))
            continue
        print(v)
    enc = tokenizer(
        chat_str, 
        truncation=True,
        max_length=128,
        padding="max_length",
        add_special_tokens=True,
    )
    for k,v in enc.items():
        print(f"{name}.{k}")
        if k=="input_ids":
            print(tokenizer.convert_ids_to_tokens(v))
            continue
        print(v)

deberta
<|im_start|>user
Hello, how are you?<|im_end|>
<|im_start|>assistant
I'm doing great. How can I help you today?<|im_end|>
<|im_start|>user
I'd like to show off how chat templating works!<|im_end|>

deberta.input_ids
['▁<', '|', 'im', '_', 'start', '|', '>', 'user', '▁Hello', ',', '▁how', '▁are', '▁you', '?', '<', '|', 'im', '_', 'end', '|', '>', '▁<', '|', 'im', '_', 'start', '|', '>', 'assistant', '▁I', "'", 'm', '▁doing', '▁great', '.', '▁How', '▁can', '▁I', '▁help', '▁you', '▁today', '?', '<', '|', 'im', '_', 'end', '|', '>', '▁<', '|', 'im', '_', 'start', '|', '>', 'user', '▁I', "'", 'd', '▁like', '▁to', '▁show', '▁off', '▁how', '▁chat', '▁templating', '▁works', '!', '<', '|', 'im', '_', 'end', '|', '>']
deberta.token_type_ids
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
deberta.attention_mask