# Tokenization

### BERTweet

- Tokenizer: fastBPE
- Vocabulary: 64K subword units

### TWilBert
- Tokenizer: SentencePiece (fastBPE)
- Vocabulary: 30k subword units

In [3]:
%load_ext autoreload
%autoreload 2
from glob import glob

num_files = 100
tweet_files = glob("../../data/filtered_tweets/*.txt")

train_files = tweet_files[:2]


tweets = list([x.strip("\n") for x in open(tweet_files[0])])[:100_000]

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
# Sanity check: how many tweets ended up in `tweets`.
len(tweets)

100000

In [16]:
# Tokenizer implementations from the `tokenizers` library; only the
# SentencePiece-style BPE one is instantiated in this cell.
from tokenizers import (
    SentencePieceUnigramTokenizer,
    SentencePieceBPETokenizer,
    BertWordPieceTokenizer,
    ByteLevelBPETokenizer,
)

# Built with the default constructor arguments.
tokenizer = SentencePieceBPETokenizer()

In [17]:
from finetune_vs_scratch.preprocessing import special_tokens
from finetune_vs_scratch.tokenizer import tokenizer_special_tokens

# Training options gathered in one place so they are easy to read and tweak.
training_options = dict(
    vocab_size=30_000,
    min_frequency=5,
    show_progress=True,
    limit_alphabet=500,
    special_tokens=tokenizer_special_tokens,
)

# Train the tokenizer on the in-memory tweet sample.
tokenizer.train_from_iterator(tweets, **training_options)






In [19]:

# token -> id mapping of the trained tokenizer, plus its inverse (id -> token).
vocab = tokenizer.get_vocab()
inv_vocab = dict(zip(vocab.values(), vocab.keys()))

# Quick look at how an elongated word and a user mention get segmented.
sample_encoding = tokenizer.encode("Qué hacesssss @usuario")
sample_encoding.tokens


['▁Qué', '▁haces', 'ss', 'ss', '▁@usuario']

In [20]:
import os

tokenizer_path = "./sentence-piece-tokenizer"
# `!mkdir` complains when the directory already exists (i.e. on every re-run)
# and depends on the shell; makedirs(exist_ok=True) keeps the cell idempotent.
os.makedirs(tokenizer_path, exist_ok=True)
# The two returned paths are unpacked as the vocabulary file and the merges
# file; later cells pass them to the other tokenizer classes.
vocab_file, merges_file = tokenizer.save_model(tokenizer_path)

In [None]:
# NOTE(review): this cell was never executed and ended with an unfinished
# expression (the bare, undefined name `sentence`), which would raise NameError
# on "Restart & Run All". Only the import is kept.
import sentencepiece

In [21]:

from finetune_vs_scratch.tokenizer import MyTokenizer

# Build the project's MyTokenizer from the saved vocabulary and merges files.
t_tokenizer = MyTokenizer(vocab_file, merges_file)

# Display the repr to inspect its configuration.
t_tokenizer

PreTrainedTokenizer(name_or_path='', vocab_size=30000, model_max_len=1000000000000000019884624838656, is_fast=False, padding_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'})

In [22]:

# Encode a sample tweet with the custom tokenizer.
# NOTE(review): in the recorded output `input_ids` and `attention_mask` have 10
# entries while `token_type_ids` has only 8 -- presumably the two special
# tokens are not counted there; verify MyTokenizer's token-type-id logic.
t_tokenizer("@usuario tugo bierno skere comunista")

{'input_ids': [0, 534, 764, 624, 569, 1206, 13253, 926, 13757, 1], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [24]:
from transformers import RobertaTokenizerFast, AutoTokenizer

# Fast RoBERTa tokenizer built from the saved vocabulary and merges files.
# NOTE(review): the per-token dump of this tokenizer splits "@usuario" into
# "@" + "usuario"; if that mention is meant to be covered by `special_tokens`,
# `never_split` is presumably not honored here -- confirm against the
# RobertaTokenizerFast constructor signature.
roberta_tokenizer = RobertaTokenizerFast(
    vocab_file,
    merges_file,
    never_split=special_tokens,
)

# Pretrained BERTweet tokenizer, loaded by model name.
bertweet_tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base")

# Ids produced by the RoBERTa tokenizer for the sample tweet.
roberta_encoding = roberta_tokenizer("@usuario tugo bierno skere comunista")
roberta_encoding["input_ids"]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


{'vocab_file': '/home/jmperez/.cache/huggingface/transformers/9a877d0d57efbfeae96fec396a35595dc8c4685fe2b7b2049c6c094e24a0e8bf.f8a4dfe5c3c45a26f9df849d732decb191dc0c05ab270799695430332d143982', 'merges_file': '/home/jmperez/.cache/huggingface/transformers/1c2d05a06ac61a063ad62a7590731a28cc62f58e2802c76b5f993165f25894a9.75877d86011e5d5d46614d3a21757b705e9d20ed45a019805d25159b4837b0a4', 'added_tokens_file': None, 'special_tokens_map_file': None, 'tokenizer_config_file': None, 'tokenizer_file': None}
() {'model_max_length': 128, 'vocab_file': '/home/jmperez/.cache/huggingface/transformers/9a877d0d57efbfeae96fec396a35595dc8c4685fe2b7b2049c6c094e24a0e8bf.f8a4dfe5c3c45a26f9df849d732decb191dc0c05ab270799695430332d143982', 'merges_file': '/home/jmperez/.cache/huggingface/transformers/1c2d05a06ac61a063ad62a7590731a28cc62f58e2802c76b5f993165f25894a9.75877d86011e5d5d46614d3a21757b705e9d20ed45a019805d25159b4837b0a4', 'special_tokens_map_file': None, 'tokenizer_file': None, 'name_or_path': 'vinai/b

[0, 34, 529, 653, 624, 68, 1206, 85, 77, 926, 7519, 919, 1]

In [25]:
# id -> token lookup for the RoBERTa tokenizer.
inv_vocab = {
    token_id: token
    for token, token_id in roberta_tokenizer.vocab.items()
}
tok_ids = roberta_tokenizer("@usuario tugo bierno skere comunista")["input_ids"]

# Print each id next to the token it stands for.
for tok in tok_ids:
    print(f"{tok}  --->  {inv_vocab[tok]}")

0  --->  <s>
34  --->  @
529  --->  usuario
653  --->  tu
624  --->  go
68  --->  b
1206  --->  ierno
85  --->  s
77  --->  k
926  --->  ere
7519  --->  comun
919  --->  ista
1  --->  </s>


In [26]:
# id -> token lookup for the custom tokenizer, built from its `encoder` mapping.
inv_vocab = {
    token_id: token
    for token, token_id in t_tokenizer.encoder.items()
}
tok_ids = t_tokenizer("@usuario tugo bierno skere comunista")["input_ids"]

# Print each id next to the token it stands for.
for tok in tok_ids:
    print(f"{tok}  --->  {inv_vocab[tok]}")

0  --->  <s>
534  --->  ▁@usuario
764  --->  ▁tu
624  --->  go
569  --->  ▁b
1206  --->  ierno
13253  --->  ▁sk
926  --->  ere
13757  --->  ▁comunista
1  --->  </s>


In [27]:
%%timeit

# Benchmark: the project's MyTokenizer (`t_tokenizer`) on the first 1000 tweets.
t_tokenizer(tweets[:1000]);

None

501 ms ± 7.19 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [28]:
%%timeit

# Benchmark: RobertaTokenizerFast (`roberta_tokenizer`) on the first 1000 tweets.
roberta_tokenizer(tweets[:1000]);

None

24.9 ms ± 1.1 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [29]:
%%timeit

# Benchmark: the `tokenizers` SentencePieceBPETokenizer (`tokenizer`) in batch
# mode on the first 1000 tweets.
tokenizer.encode_batch(tweets[:1000])

None

18.1 ms ± 613 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [30]:
%%timeit

# Benchmark: the pretrained BERTweet tokenizer on the first 1000 tweets.
bertweet_tokenizer(tweets[:1000])

None

Token indices sequence length is longer than the specified maximum sequence length for this model (129 > 128). Running this sequence through the model will result in indexing errors


103 ms ± 10.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


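Nuestra implementación (`MyTokenizer`) es muy, muy lenta: tarda ~501 ms en tokenizar 1000 tweets, frente a ~25 ms de `RobertaTokenizerFast`, ~18 ms de `tokenizers` (`encode_batch`) y ~103 ms del tokenizador de BERTweet.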