In [1]:
import os
import sys
import numpy as np
import pandas as pd
import torch
import pytorch_lightning as pl
from transformers import AutoTokenizer
from typing import NamedTuple, Dict, List, Callable
from tqdm import tqdm

In [2]:
# Checkpoint directory for each tokenizer under comparison, keyed by the short
# name used as a label in the cells below.
tokenizer_paths = {
    "deberta": "huggingface/microsoft/deberta-v3-base",
    "xlmr": "huggingface/FacebookAI/xlm-roberta-base",
    "llama3": "huggingface/meta-llama/Meta-Llama-3-8B",
}

# Load every tokenizer once, in the order listed above.
tokenizers = {
    short_name: AutoTokenizer.from_pretrained(checkpoint)
    for short_name, checkpoint in tokenizer_paths.items()
}

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
# Disable parallelism in the Hugging Face `tokenizers` backend.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Quantile grid. Not used by the cells visible here; presumably consumed by
# later `describe(percentiles=...)`-style summaries -- TODO confirm.
percentiles = [.01, .05, .1, .2, .3, .4, .5, .6, .7, .8, .9, .95, .99]

# Effectively remove pandas' display truncation limits.
# Options are addressed by their fully qualified names: abbreviated names
# (e.g. "max_colwidth") are resolved by pattern matching and can stop
# resolving uniquely if a future pandas release adds a similarly named option.
pd.set_option("display.max_info_columns", 9999)
pd.set_option("display.max_columns", 9999)
pd.set_option("display.max_rows", 9999)
pd.set_option("display.max_colwidth", 9999)

# Register tqdm's pandas integration (`progress_apply` and friends).
tqdm.pandas()

In [4]:
# Make every tokenizer usable with `padding=...`, then print its configuration.
for name, tokenizer in tokenizers.items():
    # Disabled experiment: registering extra special tokens.
    #tokenizer.add_special_tokens(
    #    special_tokens_dict={"additional_special_tokens": ["[BOT]", "[EOT]"]},
    #)

    # A tokenizer without a pad token fails on padding with:
    #   "Asking to pad but the tokenizer does not have a padding token.
    #    Please select a token to use as `pad_token`
    #    `(tokenizer.pad_token = tokenizer.eos_token e.g.)`"
    # Test for the missing pad token itself instead of matching on the dict key,
    # so any pad-less tokenizer added to `tokenizers` is covered and a pad token
    # that is already defined is never overwritten.
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token = tokenizer.eos_token
    print(f"=====  {name}  =====\n{tokenizer}")

=====  deberta  =====
DebertaV2TokenizerFast(name_or_path='huggingface/microsoft/deberta-v3-base', vocab_size=128000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '[CLS]', 'eos_token': '[SEP]', 'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	128000: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
=====  xlmr

In [10]:
# Experiment: how does each tokenizer treat whitespace control characters and
# special tokens that are typed literally into the text?
#
# Notes from the original run:
# - deberta and xlmr do not handle newline characters.
# - Special tokens written in the text are not reflected in the
#   `special_tokens_mask` returned by the tokenizer call, even when
#   `add_special_tokens=False`.
# - Fix: recompute the mask with
#   `tokenizer.get_special_tokens_mask(input_ids, already_has_special_tokens=True)`
#   see https://github.com/huggingface/transformers/issues/7199
#
# Legend for the printed rows:
#   S = special_tokens_mask returned by the tokenizer call
#   T = mask recomputed from the ids via get_special_tokens_mask
#   A = attention_mask
for name, tokenizer in tokenizers.items():
    samples = [
        "one two",
        "\n\r\tone\n\r\ttwo\n\r\t",
        f"{tokenizer.bos_token} one two {tokenizer.eos_token}",
    ]
    for add_special_tokens in (True, False):
        print(f"{name}, add_special_tokens={add_special_tokens}")
        enc = tokenizer(
            samples,
            truncation=True,
            max_length=13,
            padding="max_length",
            add_special_tokens=add_special_tokens,
            return_special_tokens_mask=True,
        )
        print(enc.keys())
        # Walk the three parallel per-sample lists together.
        per_sample = zip(
            enc["input_ids"],
            enc["special_tokens_mask"],
            enc["attention_mask"],
        )
        for input_ids, returned_mask, attention in per_sample:
            print(tokenizer.convert_ids_to_tokens(input_ids))
            print(f"S={returned_mask}")
            recomputed_mask = tokenizer.get_special_tokens_mask(
                input_ids, already_has_special_tokens=True
            )
            print(f"T={recomputed_mask}")
            print(f"A={attention}")

deberta, add_special_tokens=True
dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask'])
['[CLS]', '▁one', '▁two', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']
S=[1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
T=[1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
A=[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]
['[CLS]', '▁one', '▁two', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']
S=[1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
T=[1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
A=[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]
['[CLS]', '[CLS]', '▁one', '▁two', '[SEP]', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']
S=[1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
T=[1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]
A=[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]
deberta, add_special_tokens=False
dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask'])
['▁one', '▁two', '[PAD]', '[PAD]', '[PA

In [6]:
# Experiment: sentence-pair encoding. Each tokenizer receives two parallel
# lists of segments; the second pair has BOS/EOS typed literally into the text
# to show how those tokens appear in the masks.
#
# Legend for the printed rows:
#   S = special_tokens_mask returned by the tokenizer call
#   A = attention_mask
for name, tokenizer in tokenizers.items():
    print(name)
    bos, eos = tokenizer.bos_token, tokenizer.eos_token
    first_segments = ["one two", f"{bos} one two {eos}"]
    second_segments = ["three four", f"{bos} three four {eos}"]
    enc = tokenizer(
        first_segments,
        second_segments,
        truncation=True,
        max_length=16,
        padding="max_length",
        return_token_type_ids=True,
        return_special_tokens_mask=True,
    )
    print(enc.keys())
    # Walk the parallel per-pair lists together.
    per_pair = zip(
        enc["input_ids"],
        enc["special_tokens_mask"],
        enc["attention_mask"],
    )
    for pair_ids, pair_special_mask, pair_attention in per_pair:
        print(tokenizer.convert_ids_to_tokens(pair_ids))
        print(f"S={pair_special_mask}")
        print(f"A={pair_attention}")

deberta
dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask'])
['[CLS]', '▁one', '▁two', '[SEP]', '▁three', '▁four', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']
S=[1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
A=[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]
['[CLS]', '[CLS]', '▁one', '▁two', '[SEP]', '[SEP]', '[CLS]', '▁three', '▁four', '[SEP]', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']
S=[1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
A=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
xlmr
dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask'])
['<s>', '▁one', '▁two', '</s>', '</s>', '▁three', '▁four', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
S=[1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]
A=[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]
['<s>', '<s>', '▁one', '▁two', '</s>', '</s>', '</s>', '<s>', '▁three', '▁four', '</s>