# Tokenization

### BERTweet

- fastBPE
- 64K subword vocabulary

### TWilBert
- SentencePiece (fastBPE)
- 30k subword vocabulary

In [1]:
from glob import glob
from itertools import islice

# NOTE(review): `num_files` is not used by any cell visible in this notebook.
num_files = 100
# `glob` returns paths in arbitrary order; sort so the shard selection below
# is the same on every run and every machine.
tweet_files = sorted(glob("../data/filtered_tweets/*.txt"))

train_files = tweet_files[:2]

# Load at most the first 1,000,000 tweets of the first shard.
# `islice` stops reading once the cap is reached instead of materialising the
# whole file first, and the `with` block closes the file handle (the previous
# bare `open(...)` never closed it).
# NOTE(review): utf-8 is assumed for the tweet dumps — confirm.
with open("../data/filtered_tweets/spanish-tweets-000.txt", encoding="utf-8") as tweet_file:
    tweets = [line.strip("\n") for line in islice(tweet_file, 1_000_000)]

In [25]:
# Sanity check: number of tweets currently held in `tweets`.
# NOTE(review): the recorded output (5224220) exceeds the 1_000_000 cap applied
# when `tweets` is built, so it comes from an earlier run — re-execute this cell.
len(tweets)

5224220

In [None]:
# NOTE(review): unfinished scratch cell (never executed). It only binds the
# builtin `filter` function to `filtered_batch`; no filtering is performed.
filtered_batch = filter

In [9]:
from tokenizers import BertWordPieceTokenizer, ByteLevelBPETokenizer

# Cased, accent-preserving WordPiece tokenizer: text is cleaned and Chinese
# characters are handled, but accents are kept and case is not folded.
wordpiece_options = {
    "clean_text": True,
    "handle_chinese_chars": True,
    "strip_accents": False,
    "lowercase": False,
}
tokenizer = BertWordPieceTokenizer(**wordpiece_options)



In [10]:
from finetune_vs_scratch.preprocessing import special_tokens

# Register the project-defined special tokens on the untrained tokenizer and
# display the value returned by `add_special_tokens`.
num_special_added = tokenizer.add_special_tokens(special_tokens)
num_special_added

4

In [11]:
# BERT control tokens come first, followed by the project-specific special
# tokens, so both groups are reserved in the trained vocabulary.
bert_control_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
training_special_tokens = bert_control_tokens + special_tokens

# Train the WordPiece vocabulary directly from the in-memory list of tweets.
tokenizer.train_from_iterator(
    tweets,
    special_tokens=training_special_tokens,
    vocab_size=40_000,
    min_frequency=2,
    limit_alphabet=600,
    wordpieces_prefix="##",
    show_progress=True,
)






In [12]:
import os

# Create the output directory idempotently: a bare `mkdir` fails with
# "File exists" whenever this cell is re-run.
os.makedirs("test_tokenizer", exist_ok=True)

# Write the trained vocabulary to the directory; the cell displays the list of
# files that `save_model` returns.
tokenizer.save_model("./test_tokenizer/")

mkdir: cannot create directory ‘test_tokenizer’: File exists


['./test_tokenizer/vocab.txt']

## Reload

In [15]:
from transformers import BertTokenizer, BertTokenizerFast

# Reload the vocabulary saved in ./test_tokenizer/ as a fast tokenizer.
# `never_split` is an option of the slow `BertTokenizer`; with the fast
# tokenizer the special tokens have to be passed as `additional_special_tokens`,
# otherwise a token such as "@usuario" is split into several pieces (this
# notebook's recorded outputs show it encoded as two ids without it).
bert_tokenizer = BertTokenizerFast.from_pretrained(
    "./test_tokenizer/",
    additional_special_tokens=special_tokens,
)


## Loading trained tokenizers

In [18]:

# List installed packages whose `pip freeze` line matches "transf", i.e. show
# which transformers build is installed in this environment.
!pip freeze | grep transf

transformers @ file:///home/jmperez/.cache/pypoetry/artifacts/d7/d1/6f/c08a86d09ebb99ef7233126f12dce131f0bcf38f23b1c8aa8d2065c528/transformers-4.8.2-py3-none-any.whl


In [20]:
from transformers import BertTokenizerFast, AutoTokenizer, BertTokenizer
from finetune_vs_scratch.model import load_tokenizer

# Slow tokenizer variant: "@usuario" is protected through `never_split`.
betito_tokenizer_dir = "../models/tokenizers/betito_cased_accents/"
protected_tokens = ["@usuario"]
bert_tokenizer = BertTokenizer.from_pretrained(
    betito_tokenizer_dir,
    never_split=protected_tokens,
)

# Encode the user placeholder alone to check that it maps to a single id
# between the two framing ids.
bert_tokenizer("@usuario")

{'input_ids': [2, 5, 3], 'token_type_ids': [0, 0, 0], 'attention_mask': [1, 1, 1]}

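If the tokenizer is the Fast variant (`BertTokenizerFast`), the special tokens must be passed through `additional_special_tokens`; the `never_split` argument used with the slow tokenizer is not enough.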

In [24]:
from transformers import BertTokenizerFast, AutoTokenizer, BertTokenizer
from finetune_vs_scratch.model import load_tokenizer

# Fast tokenizer variant: "@usuario" is declared as an additional special token.
betito_tokenizer_dir = "../models/tokenizers/betito_cased_accents/"
extra_special_tokens = ["@usuario"]
bert_tokenizer = BertTokenizerFast.from_pretrained(
    betito_tokenizer_dir,
    additional_special_tokens=extra_special_tokens,
)

# Encode the user placeholder alone to check that it maps to a single id
# between the two framing ids.
bert_tokenizer("@usuario")

{'input_ids': [2, 5, 3], 'token_type_ids': [0, 0, 0], 'attention_mask': [1, 1, 1]}

In [15]:

# Encode the user placeholder with whatever `bert_tokenizer` is currently bound.
# NOTE(review): this cell's execution count (15) is lower than the cells above
# it, so it ran against an earlier `bert_tokenizer` — presumably the one
# reloaded from ./test_tokenizer/ — and its output splits "@usuario" in two ids.
# Re-run in order to confirm.
bert_tokenizer("@usuario")

{'input_ids': [2, 38, 1164, 3], 'token_type_ids': [0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1]}

In [3]:
import re

# Collapse every "emoji <description> emoji" span (non-greedy match) into a
# bare "emoji" marker before tokenizing.
emoji_span = re.compile("emoji.*?emoji")

tweet = "@usuario @usuario jajaja gil emoji riendo emoji emoji riendo emoji"
tweet = emoji_span.sub("emoji", tweet)
print(tweet)

# Round-trip check: encode the cleaned tweet to ids, then decode the ids back.
tokens = bert_tokenizer(tweet)["input_ids"]
print(tokens)
decoded_tweet = bert_tokenizer.decode(tokens)
print(decoded_tweet)

@usuario @usuario jajaja gil emoji emoji
[2, 38, 1164, 38, 1164, 5733, 621, 5411, 8, 8, 3]
[CLS] @ usuario @ usuario jajaja gil emoji emoji [SEP]


In [25]:

# Load the pretrained `bert-base-uncased` vocabulary with `BertTokenizer`, then
# extend it with two extra tokens; the cell displays the value returned by
# `add_tokens`.
# NOTE(review): this rebinds `tokenizer`, which earlier cells use for the
# WordPiece trainer object.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    use_fast=False,
    do_lower_case=True,
    do_basic_tokenize=True,
)
new_tokens = ['graft', 'grafts']
tokenizer.add_tokens(new_tokens)

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

2

In [28]:
# Encode "graft" to ids, then map the ids back to token strings to see how the
# word is segmented.
graft_ids = tokenizer.encode("graft")
tokenizer.convert_ids_to_tokens(graft_ids)

['[CLS]', 'graft', '[SEP]']