# Tokenization

### BERTweet

- fastBPE
- 64K subword vocabulary

### TWilBert
- SentencePiece (fastBPE)
- 30k subword vocabulary

In [2]:
%load_ext autoreload
%autoreload 2
from glob import glob

# NOTE(review): `num_files` and `train_files` are not used in this cell.
num_files = 100

# glob() returns paths in arbitrary, filesystem-dependent order; sort them so
# that the positional picks below select the same files on every machine.
tweet_files = sorted(glob("../../data/filtered_tweets/*.txt"))

train_files = tweet_files[:2]

# Keep the first 100,000 lines of the first file, each with its trailing
# newline removed. The context manager closes the file handle, and the
# encoding is explicit so the result does not depend on the locale.
with open(tweet_files[0], encoding="utf-8") as tweet_file:
    tweets = [line.strip("\n") for line in tweet_file][:100_000]

In [19]:
from tokenizers import SentencePieceBPETokenizer, BertWordPieceTokenizer, ByteLevelBPETokenizer
from tokenizers import normalizers, Regex
from tokenizers.processors import RobertaProcessing
from finetune_vs_scratch.preprocessing import special_tokens
from finetune_vs_scratch.tokenizer import tokenizer_special_tokens

# Untrained SentencePiece-style BPE tokenizer; training happens in a later cell.
tokenizer = SentencePieceBPETokenizer()

tokenizer.add_special_tokens(tokenizer_special_tokens)

strip_accents = True
lowercase = True

# Normalization steps, applied in list order.
# The regex patterns are raw strings: "\W" inside a plain string literal is an
# invalid escape sequence (DeprecationWarning, SyntaxWarning on Python >= 3.12).
# The pattern text handed to Regex is unchanged.
tokenizer_normalizers = [
    normalizers.NFKC(),
    normalizers.BertNormalizer(
        clean_text=True,
        handle_chinese_chars=True,
        strip_accents=strip_accents,
        lowercase=lowercase,
    ),
    # NOTE(review): this pattern and the "url" one below require a non-word
    # character right after the placeholder, and the matched neighbouring
    # characters are replaced together with it — confirm this is intended
    # (e.g. a placeholder at the very end of the text is not matched).
    normalizers.Replace(Regex(r"(\W)?@usuario(\W)"), " @usuario "),
    normalizers.Replace("hashtag", " hashtag "),
    # Preprocessing error
    normalizers.Replace(Regex(r"(\W)url(\W)"), " url "),
    normalizers.Replace("http://url", " url "),
]

tokenizer.normalizer = normalizers.Sequence(tokenizer_normalizers)

# NOTE(review): `vocab` is not read anywhere else in this cell.
vocab = tokenizer.get_vocab()

# Roberta-style post-processing; the "<s>" / "</s>" ids are looked up in the
# tokenizer rather than hard-coded.
tokenizer.post_processor = RobertaProcessing(
    cls=("<s>", tokenizer.token_to_id("<s>")),
    sep=("</s>", tokenizer.token_to_id("</s>")),
)

In [20]:
from finetune_vs_scratch.preprocessing import special_tokens
from finetune_vs_scratch.tokenizer import tokenizer_special_tokens

#tokenizer.add_tokens(special_tokens)

# Train the tokenizer on the in-memory tweet sample. Both special-token lists
# are concatenated and handed to the trainer together.
training_special_tokens = tokenizer_special_tokens + special_tokens

tokenizer.train_from_iterator(
    tweets,
    special_tokens=training_special_tokens,
    vocab_size=30_000,
    limit_alphabet=300,
    min_frequency=5,
    show_progress=True,
)






In [21]:
# Normalizer probe: the mention placeholder glued directly to following text.
tokenizer.normalizer.normalize_str("@usuariotugo")

'@usuariotugo'

In [22]:
# Normalizer probe: the "url" placeholder surrounded by punctuation.
tokenizer.normalizer.normalize_str("..url..")

'. url .'

In [23]:

# List every vocabulary entry that contains some element of `special_tokens`
# as a substring, one entry per line.
placeholder_hits = [
    entry
    for entry in tokenizer.get_vocab()
    if any(marker in entry for marker in special_tokens)
]
for entry in placeholder_hits:
    print(entry)

‚ñÅ@usuario
@usuario
url
‚ñÅburlarse
emoji
‚ñÅurl
‚ñÅburlar
‚ñÅburlandose
@url
‚ñÅburla
‚ñÅurl.
‚ñÅemoji
‚ñÅburlan
‚ñÅhashtag
hashtag
‚ñÅemojis


## Alphabet

In [53]:
# Token -> id mapping of the current tokenizer.
vocab = tokenizer.get_vocab()

# Invert it to id -> token, then list the tokens in id order.
# NOTE(review): ids 335, 2388 and 3075 are skipped without explanation —
# confirm why they are excluded.
token_by_id = dict(zip(vocab.values(), vocab.keys()))
inv_vocab = [
    token_by_id[token_id]
    for token_id in range(len(token_by_id))
    if token_id not in {335, 2388, 3075}
]

print(f"First tokens: {inv_vocab[:50]}")

# Every distinct character that occurs in any vocabulary entry, sorted.
alphabet = sorted({char for entry in vocab for char in entry})
print("Alphabet = ", " ".join(alphabet))


First tokens: ['<s>', '<pad>', '</s>', '<unk>', '<mask>', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '=', '?', '@', '[', '\\', ']', '^', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i']
Alphabet =  ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; < = > ? @ [ \ ] ^ _ ` a b c d e f g h i j k l m n o p q r s t u v w x y z | ~ ¬° ¬¨ ¬Æ ¬ø …™ …¥  Ä  ü Œ± Œµ Œ∑ Œπ ŒΩ Œø œÄ œÅ œÉ œÖ –≤ –¥ –∫ –Ω –æ —Ç ◊ê ◊î ◊ï ◊ô ◊ú ◊® ◊© ◊™ ÿß ÿ® ÿ™ ÿ≠ ÿØ ÿ± ÿ≥ ÿ¥ ÿπ ŸÅ ŸÇ ŸÉ ŸÑ ŸÖ ŸÜ Ÿá Ÿà Ÿä ·ÑÄ ·ÑÅ ·ÑÇ ·ÑÉ ·ÑÑ ·ÑÖ ·ÑÜ ·Ñá ·Ñâ ·Ñä ·Ñã ·Ñå ·Ñé ·Ñè ·Ñê ·Ñë ·Ñí ·Ö° ·Ö¢ ·Ö£ ·Ö• ·Ö¶ ·Öß ·Ö© ·Ö™ ·Ö¨ ·Ö≠ ·ÖÆ ·ÖØ ·Ö± ·Ö≤ ·Ö≥ ·Ö¥ ·Öµ ·Ü® ·Ü´ ·Ü≠ ·ÜØ ·Ü∑ ·Ü∏ ·Üπ ·Ü∫ ·Üª ·Üº ·áÇ ·•± ·¥Ä ·¥á ·¥ç ·¥è ·¥õ ·¥ú ·µé ‚É£ ‚Üí ‚è± ‚ñÅ ‚ñ∫ ‚òÖ ‚òÜ ‚ô™ ‚ôª ‚úî ‚û° ‚ûΩ ‚†Ä ‚¨á „ÄÅ „ÄÇ „Äå „Äç „Äé „Äè „ÅÇ „ÅÑ „ÅÜ „Åà „Åä „Åã „Åç „Åè „Åë „Åì „Åï „Åó „Åô „Åõ „Åù „Åü „Å° „Å£ „Å§ „Å¶ „Å® „Å™ „Å´ „Å≠ „ÅÆ „ÅØ „Å≤ „Åµ „Åª „Åæ „Åø „ÇÇ „Ç

In [37]:
# Encode a sentence pair (two text arguments) and display the resulting Encoding.
tokenizer.encode("@usuario son UNA MIERDA", "Viva Per√≥n")

Encoding(num_tokens=10, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [38]:
# Round trip: encode the sentence pair, then decode its ids back to text.
pair_encoding = tokenizer.encode("@usuario son UNA MIERDA", "Viva Per√≥n")
tokenizer.decode(pair_encoding.ids)

'@usuario son una mierda viva peron'

In [40]:
from transformers import PreTrainedTokenizerFast

# Wrap the tokenizer object in the transformers fast-tokenizer interface,
# declaring which token string fills each special-token role.
special_token_roles = {
    "bos_token": "<s>",
    "eos_token": "</s>",
    "sep_token": "</s>",
    "cls_token": "<s>",
    "unk_token": "<unk>",
    "pad_token": "<pad>",
    "mask_token": "<mask>",
}

transformer_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    **special_token_roles,
)

In [41]:

# Save the wrapped tokenizer under the relative directory "small".
transformer_tokenizer.save_pretrained("small")

('small/tokenizer_config.json',
 'small/special_tokens_map.json',
 'small/tokenizer.json')

In [54]:
from transformers import AutoTokenizer
# Load a tokenizer from the "small" directory; this rebinds `transformer_tokenizer`.
transformer_tokenizer = AutoTokenizer.from_pretrained("small")

() {'bos_token': '<s>', 'eos_token': '</s>', 'sep_token': '</s>', 'cls_token': '<s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'mask_token': '<mask>', 'special_tokens_map_file': 'small/special_tokens_map.json', 'tokenizer_file': 'small/tokenizer.json', 'name_or_path': 'small'}


In [65]:
# Encode a sentence pair with the underlying `tokenizers` object and list the
# produced tokens. `backend_tokenizer` is the public accessor for the object
# held in the private `_tokenizer` attribute.
transformer_tokenizer.backend_tokenizer.encode("Este es un forro @usuario impresion√°nte", "Corte gil corte basura").tokens


['<s>',
 '‚ñÅeste',
 '‚ñÅes',
 '‚ñÅun',
 '‚ñÅforro',
 '‚ñÅ',
 '@usuario',
 '‚ñÅimpresionante',
 '</s>',
 '</s>',
 '‚ñÅcorte',
 '‚ñÅgil',
 '‚ñÅcorte',
 '‚ñÅbasura',
 '</s>']

In [58]:
from transformers import AutoTokenizer

# Load the "roberta-base" tokenizer for comparison.
# NOTE(review): this rebinds `tokenizer`, which earlier cells use for the
# tokenizer built in this notebook; cells that use `tokenizer` therefore depend
# on execution order.
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

() {'model_max_length': 512, 'vocab_file': '/home/jmperez/.cache/huggingface/transformers/d3ccdbfeb9aaa747ef20432d4976c32ee3fa69663b379deb253ccfce2bb1fdc5.d67d6b367eb24ab43b08ad55e014cf254076934f71d832bbab9ad35644a375ab', 'merges_file': '/home/jmperez/.cache/huggingface/transformers/cafdecc90fcab17011e12ac813dd574b4b3fea39da6dd817813efa010262ff3f.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b', 'tokenizer_file': '/home/jmperez/.cache/huggingface/transformers/d53fc0fa09b8342651efd4073d75e19617b3e51287c2a535becda5808a8db287.fc9576039592f026ad76a1c231b89aee8668488c671dfbe6616bab2ed298d730', 'special_tokens_map_file': None, 'name_or_path': 'roberta-base'}


In [64]:

# Encode a sentence pair with the underlying `tokenizers` object and list the
# produced tokens. `backend_tokenizer` is the public accessor for the object
# held in the private `_tokenizer` attribute.
tokenizer.backend_tokenizer.encode("Oh man this is terrible", "Bullshit").tokens

['<s>',
 'Oh',
 'ƒ†man',
 'ƒ†this',
 'ƒ†is',
 'ƒ†terrible',
 '</s>',
 '</s>',
 'Bull',
 'shit',
 '</s>']

## Test pretrained tokenizers


In [1]:
from transformers import AutoTokenizer

# Local checkpoint directory for each tokenizer variant, keyed by variant name.
tokenizer_paths = {
    "deacc": "../../models/twerto-base-deacc-uncased",
    "uncased": "../../models/twerto-base-uncased",
    "cased": "../../models/twerto-base-cased",
}

# Variant name -> loaded tokenizer.
tokenizers = {
    variant: AutoTokenizer.from_pretrained(checkpoint)
    for variant, checkpoint in tokenizer_paths.items()
}

# The same sentence pair is pushed through every variant.
text = ["esta es una PRUEBA EN MAY√öSCULAS Y CON TILDES @usuario @usuario", "ATR cumbia gato hashtag"]
banner = "=" * 80

for model_name, tokenizer in tokenizers.items():
    print(banner)
    print(model_name, "\n"*3)
    print("Sanity check")
    # Ids produced for the bare mention placeholder.
    print(f"@usuario => {tokenizer.encode('@usuario')}")
    # Encode the pair and decode it again.
    print(f"{text}\n{tokenizer.decode(tokenizer.encode(*text))}")

deacc 



Sanity check
@usuario => [0, 433, 2]
['esta es una PRUEBA EN MAY√öSCULAS Y CON TILDES @usuario @usuario', 'ATR cumbia gato hashtag']
<s> esta es una prueba en mayusculas y con tildes @usuario @usuario</s></s> atr cumbia gato  hashtag </s>
uncased 



Sanity check
@usuario => [0, 431, 2]
['esta es una PRUEBA EN MAY√öSCULAS Y CON TILDES @usuario @usuario', 'ATR cumbia gato hashtag']
<s> esta es una prueba en may√∫sculas y con tildes @usuario @usuario</s></s> atr cumbia gato  hashtag </s>
cased 



Sanity check
@usuario => [0, 430, 2]
['esta es una PRUEBA EN MAY√öSCULAS Y CON TILDES @usuario @usuario', 'ATR cumbia gato hashtag']
<s> esta es una PRUEBA EN MAY√öSCULAS Y CON TILDES @usuario @usuario</s></s> ATR cumbia gato  hashtag </s>


In [5]:
tokenizer = tokenizers["deacc"]

# id -> token mapping of the selected tokenizer.
inv_vocab = {v: k for k, v in tokenizer.vocab.items()}

# Dump one "<id> --- <token>" line per vocabulary entry, in ascending id order.
# - Iterating over the ids that actually exist avoids a KeyError when the ids
#   are not a contiguous 0..N-1 range.
# - The encoding is explicit because tokens may contain non-ASCII characters and
#   the platform default may not be able to encode them.
# - Plain "w" is enough: the file is only written, never read back here.
with open("deacc_vocab.txt", "w", encoding="utf-8") as f:
    for token_id in sorted(inv_vocab):
        f.write(f"{token_id:<6} --- {inv_vocab[token_id]}\n")