In [2]:
# Load the meta-llama/Llama-2-7b-hf tokenizer
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")

# Print the tokenizer's vocab size
print(f"Tokenizer vocab size: {len(tokenizer.get_vocab())}")

# Print the tokenizer's special tokens
print(f"Tokenizer special tokens: {tokenizer.all_special_tokens}")

# Print the tokenizer's pad token
print(f"Tokenizer pad token: {tokenizer.pad_token}")

Tokenizer vocab size: 32000
Tokenizer special tokens: ['<s>', '</s>', '<unk>']
Tokenizer pad token: None


In [7]:
# Pass in a word "horse", then tokenize and print the decoded tokens
# Pass in the same word with interventions, and print the decoded tokens
# Interventions include:
# wrap_quotes, wrap_unicode_quotes, wrap_parentheses, prefix_hair_space, prefix_zw_space, prefix_zwj_space, prefix_zwj_space_zwj, prefix_zwj_space_zwj_zwj

entity = "horse"

# Define intervention functions
def wrap_quotes(text):
    return f'"{text}"'

def wrap_unicode_quotes(text):
    return f'"{text}"'  # Unicode left/right double quotation marks

def wrap_parentheses(text):
    return f'({text})'

def prefix_hair_space(text):
    return '\u200A' + text  # Hair space (U+200A)

def prefix_zw_space(text):
    return '\u200B' + text  # Zero-width space (U+200B)

def prefix_zwj_space(text):
    return '\u200D' + text  # Zero-width joiner (U+200D)


interventions = [
    ("wrap_quotes", wrap_quotes),
    ("wrap_unicode_quotes", wrap_unicode_quotes),
    ("wrap_parentheses", wrap_parentheses),
    ("prefix_hair_space", prefix_hair_space),
    ("prefix_zw_space", prefix_zw_space),
    ("prefix_zwj_space", prefix_zwj_space),
]

# Tokenize the original entity
tokens = tokenizer.encode(entity)
print(f"Original '{entity}':")
print(f"Tokens: {tokens}")
# Decode the  tokens by decoding them separately and printing them iteratively
print(f"Decoded: {[tokenizer.decode(token) for token in tokens]}")
print("\n")

# Apply interventions and tokenize
for intervention_name, intervention_func in interventions:
    modified_entity = intervention_func(entity)
    tokens = tokenizer.encode(modified_entity)
    decoded = tokenizer.decode(tokens)
    print(f"{intervention_name}: '{modified_entity}'")
    print(f"Tokens: {tokens}")
    print(f"Decoded: {[tokenizer.decode(token) for token in tokens]}")
    print()

Original 'horse':
Tokens: [1, 10435]
Decoded: ['<s>', 'horse']


wrap_quotes: '"horse"'
Tokens: [1, 376, 2015, 344, 29908]
Decoded: ['<s>', '"', 'hor', 'se', '"']

wrap_unicode_quotes: '"horse"'
Tokens: [1, 376, 2015, 344, 29908]
Decoded: ['<s>', '"', 'hor', 'se', '"']

wrap_parentheses: '(horse)'
Tokens: [1, 313, 2015, 344, 29897]
Decoded: ['<s>', '(', 'hor', 'se', ')']

prefix_hair_space: ' horse'
Tokens: [1, 29871, 30118, 2015, 344]
Decoded: ['<s>', '', '\u200a', 'hor', 'se']

prefix_zw_space: '​horse'
Tokens: [1, 29871, 30166, 2015, 344]
Decoded: ['<s>', '', '\u200b', 'hor', 'se']

prefix_zwj_space: '‍horse'
Tokens: [1, 29871, 30722, 2015, 344]
Decoded: ['<s>', '', '\u200d', 'hor', 'se']



In [5]:
print(f"eos_token_id: {tokenizer.eos_token_id}")
print(f"unk_token_id: {tokenizer.unk_token_id}")
print(f"pad_token_id: {tokenizer.pad_token_id}")
print(f"bos_token_id: {tokenizer.bos_token_id}")
print(f"eos_token: {tokenizer.eos_token}")
print(f"unk_token: {tokenizer.unk_token}")
print(f"pad_token: {tokenizer.pad_token}")
print(f"bos_token: {tokenizer.bos_token}")
print(f"all_special_tokens: {tokenizer.all_special_tokens}")
print(f"all_special_tokens_ids: {tokenizer.all_special_tokens_ids}")

Using pad_token, but it is not set yet.


eos_token_id: 2
unk_token_id: 0
pad_token_id: None
bos_token_id: 1
eos_token: </s>
unk_token: <unk>
pad_token: None
bos_token: <s>
all_special_tokens: ['<s>', '</s>', '<unk>']


AttributeError: 'LlamaTokenizerFast' object has no attribute 'all_special_tokens_ids'

In [16]:
tokenizer.convert_ids_to_tokens(0)

'<unk>'

In [None]:
# Load the T5 base tokenizer
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("t5-base")

# Print the tokenizer's vocab size
print(f"Tokenizer vocab size: {len(tokenizer.get_vocab())}")

# Print the tokenizer's special tokens
print(f"Tokenizer special tokens: {tokenizer.all_special_tokens}")

# Print the tokenizer's pad token
print(f"Tokenizer pad token: {tokenizer.pad_token}")
# Load the other T5 tokenizers 



Tokenizer vocab size: 32100
Tokenizer special tokens: ['</s>', '<unk>', '<pad>', '<extra_id_0>', '<extra_id_1>', '<extra_id_2>', '<extra_id_3>', '<extra_id_4>', '<extra_id_5>', '<extra_id_6>', '<extra_id_7>', '<extra_id_8>', '<extra_id_9>', '<extra_id_10>', '<extra_id_11>', '<extra_id_12>', '<extra_id_13>', '<extra_id_14>', '<extra_id_15>', '<extra_id_16>', '<extra_id_17>', '<extra_id_18>', '<extra_id_19>', '<extra_id_20>', '<extra_id_21>', '<extra_id_22>', '<extra_id_23>', '<extra_id_24>', '<extra_id_25>', '<extra_id_26>', '<extra_id_27>', '<extra_id_28>', '<extra_id_29>', '<extra_id_30>', '<extra_id_31>', '<extra_id_32>', '<extra_id_33>', '<extra_id_34>', '<extra_id_35>', '<extra_id_36>', '<extra_id_37>', '<extra_id_38>', '<extra_id_39>', '<extra_id_40>', '<extra_id_41>', '<extra_id_42>', '<extra_id_43>', '<extra_id_44>', '<extra_id_45>', '<extra_id_46>', '<extra_id_47>', '<extra_id_48>', '<extra_id_49>', '<extra_id_50>', '<extra_id_51>', '<extra_id_52>', '<extra_id_53>', '<extra_id_

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
