# Tokenizador

In [1]:
!hostname

k003-002.hpcfund


In [2]:
import os

WORK_DIR = os.getenv('WORK')

DATA_FOLDER = os.path.join(WORK_DIR, "data")

CACHED_DATA_FOLDER = os.path.join(WORK_DIR, "cached_data")

# Salvamos o path do Cache par ao HuggingFace
os.environ['HF_HOME'] = CACHED_DATA_FOLDER

CACHED_DATA_FOLDER

'/work1/lgarcia/renneruan/cached_data'

In [3]:
os.chdir(WORK_DIR)
print(os.getcwd())

/work1/lgarcia/renneruan


In [34]:
from datasets import load_from_disk

from transformers import PreTrainedTokenizerFast
from tokenizers.normalizers import NFC, Lowercase, Replace
from tokenizers import (
    normalizers,
    pre_tokenizers,
    Tokenizer
)

from tokenizers.pre_tokenizers import Punctuation, Metaspace, WhitespaceSplit, Digits
from tokenizers.processors import TemplateProcessing
from tokenizers.decoders import Metaspace
from tokenizers.models import Unigram
from tokenizers.trainers import UnigramTrainer

from tqdm import tqdm

from multiprocessing import cpu_count





In [5]:
vocabulary_size = 32_768
context_size = 512
tokenizer_name = f"tokenizers/custom/{vocabulary_size:_}"

In [7]:
split_save_path = os.path.join(DATA_FOLDER, 'split_datasets')

In [8]:
split_dataset = load_from_disk(split_save_path)

In [9]:
# ordered by the index
custom_special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]

In [None]:
# replace latex usage of "aspas"
# NFC - Canonico (evita equivalencia) e composto, menos characteres
# lowercase - unifica maiuscula e mi nusculo, que geralmente muda pouco (o positional embedding pode cuidar de comecar com maiuscula)

# NFC, NFK
# Ã A~

custom_normalizer = normalizers.Sequence([
    Replace("``", '"'), 
    Replace("''", '"'),
    NFC(), 
    Lowercase(),
])

print("Å".encode("utf-8"), "Å".encode("utf-8"))

normalized_sample = custom_normalizer.normalize_str("Oi, Å Å tudo BEM? O ``quê'' \"essa\" ''função'' faz?")

print(normalized_sample.encode("utf-8"))

normalized_sample

b'\xc3\x85' b'\xe2\x84\xab'
b'oi, \xc3\xa5 \xc3\xa5 tudo bem? o "qu\xc3\xaa" "essa" "fun\xc3\xa7\xc3\xa3o" faz?'


'oi, å å tudo bem? o "quê" "essa" "função" faz?'

In [13]:

# to keep the original words (separed by whitespaces), we use metaspaces, represented by:
# “▁” U+2581 Lower One Eighth Block Unicode Character

# Digits(individual_digits=True), Whitespace()

custom_pre_tokenizer = pre_tokenizers.Sequence([WhitespaceSplit(), Punctuation(), Digits(individual_digits=False), Metaspace(replacement="▁", prepend_scheme="always")])

custom_pre_tokenizer.pre_tokenize_str("Quer saber, vamos ver o que esse pre-tokenizer faz com numeros: 12345 123 192 193!?!")

[('▁Quer', (0, 4)),
 ('▁saber', (5, 10)),
 ('▁,', (10, 11)),
 ('▁vamos', (12, 17)),
 ('▁ver', (18, 21)),
 ('▁o', (22, 23)),
 ('▁que', (24, 27)),
 ('▁esse', (28, 32)),
 ('▁pre', (33, 36)),
 ('▁-', (36, 37)),
 ('▁tokenizer', (37, 46)),
 ('▁faz', (47, 50)),
 ('▁com', (51, 54)),
 ('▁numeros', (55, 62)),
 ('▁:', (62, 63)),
 ('▁12345', (64, 69)),
 ('▁123', (70, 73)),
 ('▁192', (74, 77)),
 ('▁193', (78, 81)),
 ('▁!', (81, 82)),
 ('▁?', (82, 83)),
 ('▁!', (83, 84))]

In [14]:
custom_pre_tokenizer.pre_tokenize_str("o quê ele faz \t com         espaços adicionais \n\n e quebras de linhas?   ")

[('▁o', (0, 1)),
 ('▁quê', (2, 5)),
 ('▁ele', (6, 9)),
 ('▁faz', (10, 13)),
 ('▁com', (16, 19)),
 ('▁espaços', (28, 35)),
 ('▁adicionais', (36, 46)),
 ('▁e', (50, 51)),
 ('▁quebras', (52, 59)),
 ('▁de', (60, 62)),
 ('▁linhas', (63, 69)),
 ('▁?', (69, 70))]

In [16]:
custom_post_processor = TemplateProcessing(
    single=f"[CLS]:0 $A:0 [SEP]:0",
    pair=f"[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1",
    special_tokens=[("[CLS]", custom_special_tokens.index("[CLS]")), ("[SEP]", custom_special_tokens.index("[SEP]"))],
)

In [18]:
custom_decoder = Metaspace()

In [None]:
custom_tokenizer = Tokenizer(Unigram())

custom_tokenizer.normalizer = custom_normalizer
custom_tokenizer.pre_tokenizer = custom_pre_tokenizer
custom_tokenizer.post_processor = custom_post_processor
custom_tokenizer.decoder = custom_decoder

In [24]:
custom_trainer = UnigramTrainer(
        vocab_size=vocabulary_size,
        initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
        special_tokens=custom_special_tokens,
        unk_token="[UNK]",
)

# create a python generator to dynamically load the data, one batch at a time
def batch_iterator(batch_size=128): # 128 (cores)
    for i in tqdm(range(0, len(split_dataset["train"]), batch_size)):
        yield split_dataset["train"][i : i + batch_size]["text"]

custom_tokenizer.train_from_iterator(iterator=batch_iterator(), trainer=custom_trainer)

100%|██████████| 443178/443178 [04:40<00:00, 1577.28it/s]






In [27]:
fast_custom_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=custom_tokenizer,
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    bos_token="[CLS]",
    sep_token="[SEP]",
    eos_token="[SEP]",
    mask_token="[MASK]",
    padding_side="right",
)

In [28]:
# save locally
fast_custom_tokenizer.save_pretrained(tokenizer_name)

('tokenizers/custom/32_768/tokenizer_config.json',
 'tokenizers/custom/32_768/special_tokens_map.json',
 'tokenizers/custom/32_768/tokenizer.json')

In [29]:
tokenizer = fast_custom_tokenizer

## Teste do Tokenizador

In [30]:
# print( tokenizer_albert.convert_ids_to_tokens( tokenizer_albert.encode("Olá pessoal, Como vocês estão 😁 ?") ) )

print( tokenizer.convert_ids_to_tokens( tokenizer.encode("Olá pessoal, Como vocês estão 😁 ?") ) )

['[CLS]', '▁olá', '▁pessoal', '▁', ',', '▁como', '▁vocês', '▁estão', '▁', '😁', '▁', '?', '[SEP]']


In [31]:
def test_tokenizer(sample):
    encoding = tokenizer.encode(sample)
    print(encoding)
    print(tokenizer.convert_ids_to_tokens(encoding))
    print()

test_tokenizer('''
Atiraram o pau no gato, mas o gato não morreu
''')

test_tokenizer('''
Não sei, só sei que foi assim...
''')

test_tokenizer('''
Testando o modo continuo, e tambem o modo subjuntivo ( soubesse )
''')

test_tokenizer('''
justo, justa, justiça, injusto, injustamente, justamente, junto
''')

test_tokenizer('''
    testando acentos, será que manter os acentos melhora a acurácia do meu modelo? 
''')

test_tokenizer('''
    amigo: amiguinho, amiga, amiguinha, amigão, amigaço, amigalhaço
''')

[2, 5, 7, 16208, 5, 8, 2535, 24, 5553, 5, 6, 47, 5, 8, 5553, 32, 2151, 3]
['[CLS]', '▁', 'a', 'tiraram', '▁', 'o', '▁pau', '▁no', '▁gato', '▁', ',', '▁mas', '▁', 'o', '▁gato', '▁não', '▁morreu', '[SEP]']

[2, 32, 774, 5, 6, 104, 774, 13, 43, 113, 5, 12, 5, 12, 5, 12, 3]
['[CLS]', '▁não', '▁sei', '▁', ',', '▁só', '▁sei', '▁que', '▁foi', '▁assim', '▁', '.', '▁', '.', '▁', '.', '[SEP]']

[2, 5, 16436, 5, 8, 427, 12153, 5, 6, 5, 11, 6102, 5, 8, 427, 587, 10929, 1272, 5, 31, 13588, 5, 30, 3]
['[CLS]', '▁', 'testando', '▁', 'o', '▁modo', '▁continuo', '▁', ',', '▁', 'e', '▁tambem', '▁', 'o', '▁modo', '▁sub', 'jun', 'tivo', '▁', '(', '▁soubesse', '▁', ')', '[SEP]']

[2, 3563, 5, 6, 4058, 5, 6, 378, 5, 6, 13011, 5, 6, 5, 19156, 5, 6, 2229, 5, 6, 480, 3]
['[CLS]', '▁justo', '▁', ',', '▁justa', '▁', ',', '▁justiça', '▁', ',', '▁injusto', '▁', ',', '▁', 'injustamente', '▁', ',', '▁justamente', '▁', ',', '▁junto', '[SEP]']

[2, 5, 16436, 5, 7, 2211, 8, 10, 5, 6, 178, 13, 644, 5, 8, 10, 5, 7, 2211, 

#### Objetively Evaluate Tokenizer on Compression Rate (1/Fertility)

In [32]:
def normalize_and_pre_tokenize(text):
    normalized = tokenizer.backend_tokenizer.normalizer.normalize_str(text)
    processed = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(normalized)
    return processed

normalize_and_pre_tokenize( split_dataset["train"][0]["text"] )

[('▁ao', (0, 2)),
 ('▁final', (3, 8)),
 ('▁de', (9, 11)),
 ('▁maio', (12, 16)),
 ('▁deste', (17, 22)),
 ('▁ano', (23, 26)),
 ('▁,', (26, 27)),
 ('▁quase', (28, 33)),
 ('▁100', (34, 37)),
 ('▁pessoas', (38, 45)),
 ('▁foram', (46, 51)),
 ('▁detidas', (52, 59)),
 ('▁numa', (60, 64)),
 ('▁investida', (65, 74)),
 ('▁global', (75, 81)),
 ('▁contra', (82, 88)),
 ('▁os', (89, 91)),
 ('▁criadores', (92, 101)),
 ('▁,', (101, 102)),
 ('▁vendedores', (103, 113)),
 ('▁e', (114, 115)),
 ('▁usuários', (116, 124)),
 ('▁do', (125, 127)),
 ('▁blackshades', (128, 139)),
 ('▁rat', (140, 143)),
 ('▁.', (143, 144))]

In [35]:
def count_tokens(batch):
    
    original_tokens = 0
    generated_tokens = 0

    for doc in batch["text"]:

        original_tokens += len( normalize_and_pre_tokenize(doc) )
        generated_tokens += len( tokenizer.encode(doc) )
        
    # Add the token counts as a new column to the batch
    return {
        "generated": [generated_tokens],
        "original": [original_tokens]
    }

evaluate_fertility = split_dataset["train"].map(count_tokens, 
                                      batched=True,
                                      remove_columns=["text"], 
                                      num_proc=cpu_count()
                                      )

Map (num_proc=128):   0%|          | 0/56726693 [00:00<?, ? examples/s]

In [36]:
total_generated = sum(evaluate_fertility["generated"])
total_original = sum(evaluate_fertility["original"])

print(total_generated, total_original)

print("fertility:", total_generated/total_original)

3668402771 2388707891
fertility: 1.5357268190143891


## Tokenize the dataset

In [38]:
tokenizer.model_max_length = context_size

tokenizer

PreTrainedTokenizerFast(name_or_path='', vocab_size=32768, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '[CLS]', 'eos_token': '[SEP]', 'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [39]:
print(f"The tokenizer will keep only: {context_size} tokens" )

def group_texts(examples):
    tokenized_inputs = tokenizer(
        examples["text"], 
        max_length=context_size,
        truncation=True,
        padding="max_length",
        return_special_tokens_mask=True,
    )
    return tokenized_inputs

# preprocess dataset

tokenized_datasets = split_dataset.map(group_texts, 
                                      batched=True,
                                      remove_columns=["text"], 
                                      num_proc=cpu_count()
                                      )

tokenized_datasets

The tokenizer will keep only: 512 tokens


Map (num_proc=128):   0%|          | 0/56726693 [00:00<?, ? examples/s]

Map (num_proc=128):   0%|          | 0/6302966 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask'],
        num_rows: 56726693
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask'],
        num_rows: 6302966
    })
})

In [41]:
print("first quartile:")

for elem in tokenized_datasets["train"][:10]['input_ids']:
    print(elem[:40])
    print(tokenizer.decode(elem[:40]))
    print()

first quartile:
[2, 5, 7, 8, 209, 9, 631, 281, 102, 5, 6, 325, 618, 108, 106, 9, 4870, 10, 335, 13172, 1532, 138, 5, 8, 10, 8553, 5, 6, 11693, 5, 11, 1436, 14, 4076, 10, 65, 7, 155, 10, 5]
[CLS] ao final de maio deste ano , quase 100 pessoas foram detidas numa investida global contra os criadores , vendedores e usuários do blackshades 

[2, 17, 7876, 127, 8681, 5, 6, 2385, 5, 20, 26, 29, 6289, 5, 11, 18, 912, 4149, 28, 7247, 3862, 9, 53, 1034, 19, 5, 7, 605, 9, 215, 5, 7, 10, 451, 5, 6, 2055, 5, 6, 6948]
[CLS] para celebrar este acontecimento , realizou - se uma celebração eucarística na catedral metropolitana de pelotas com a presença de todas as crianças , familiares , educadores

[2, 5, 20, 5, 31, 4558, 5, 30, 5, 8, 991, 1344, 5, 12, 113, 5, 10, 7, 60, 145, 9332, 5, 150, 11, 34, 854, 1406, 5, 8, 1974, 15, 107, 14, 49, 13, 7261, 3627, 5, 8, 899]
[CLS] - ( off ) o rei continua . assim sagínero jeiper representa o começo da vida do ser que queira conquistar o auto

[2, 354, 9, 5, 7, 17

In [51]:
tokenized_datasets_name = os.path.join(DATA_FOLDER, f"tokenized-for-training/custom/vocab_size:{vocabulary_size:_}/context_size:{context_size}")

tokenized_datasets_name

'/work1/lgarcia/renneruan/data/tokenized-for-training/custom/vocab_size:32_768/context_size:512'

In [52]:
tokenized_datasets.save_to_disk(tokenized_datasets_name)

Saving the dataset (0/409 shards):   0%|          | 0/56726693 [00:00<?, ? examples/s]

Saving the dataset (0/46 shards):   0%|          | 0/6302966 [00:00<?, ? examples/s]

In [None]:
# def filtering(example):
#     flags = []

#     for id_list in example["input_ids"]:

#         if id_list[-1] != 0: # last token is not a padding (the doc was probably truncated)
#             flags.append(False)

#         elif id_list[10] == 0: # the token in the first 10 is a padding [PAD] (the doc is <= 10 tokens, including [sep])
#             flags.append(False)

#         else:
#             flags.append(True)

#     return flags


# filtered_datasets = tokenized_datasets.filter(filtering,
#                                          batched = True,
#                                          num_proc = cpu_count(),
#                                         )

# filtered_datasets

Filter (num_proc=128):   0%|          | 0/56726693 [00:00<?, ? examples/s]

Filter (num_proc=128):   0%|          | 0/6302966 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask'],
        num_rows: 56686246
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask'],
        num_rows: 6298512
    })
})

In [None]:
# filtered_datasets_name = os.path.join(DATA_FOLDER, f"filtered/custom/vocab_size:{vocabulary_size:_}/context_size:{context_size}")
# filtered_datasets_name

'/work1/lgarcia/renneruan/data/filtered/custom/vocab_size:32_768/context_size:512'

In [50]:
# # save tokenized dataset locally:
# filtered_datasets.save_to_disk(filtered_datasets_name)