In [14]:
from transformers import BertTokenizer, T5Tokenizer, LongformerTokenizer

In [2]:
# Load the pretrained tokenizers that are compared on the sample phrase below:
# BERT ('bert-base-uncased' checkpoint) and T5 ('t5-small' checkpoint).
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
t5_tokenizer = T5Tokenizer.from_pretrained('t5-small')

Downloading (…)ve/main/spiece.model: 100%|██████████| 792k/792k [00:01<00:00, 727kB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 1.21k/1.21k [00:00<00:00, 1.04MB/s]
For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [8]:
# Sample sentence reused by every tokenizer cell; it mixes upper-case letters,
# digits and a '/' so differences in normalisation and splitting are visible.
phrase = "I am a sentence for which I would like to get its embedding 26/05"

In [12]:
# Round-trip the phrase through the BERT tokenizer: tokenize once, then join
# the tokens back into a string to see what the tokenizer altered
# (the displayed result is lower-cased and has '26/05' split into '26 / 05').
# The token list is kept in a variable so the phrase is not tokenized twice.
bert_tokens = bert_tokenizer.tokenize(phrase)
bert_tokenizer.convert_tokens_to_string(bert_tokens)

'i am a sentence for which i would like to get its embedding 26 / 05'

In [11]:
# Same round-trip with the T5 tokenizer: tokenize, then rebuild the string.
# The displayed result matches the input phrase exactly (case and '26/05' kept).
tokens = t5_tokenizer.tokenize(phrase)
t5_tokenizer.convert_tokens_to_string(tokens)

'I am a sentence for which I would like to get its embedding 26/05'

In [15]:
# Load the pretrained Longformer tokenizer ('allenai/longformer-base-4096');
# later cells reuse its special tokens and vocabulary size.
longformer_tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')

In [16]:
# Show the raw Longformer tokens for the phrase. In the output, tokens that
# follow a space in the input carry a leading 'Ġ'; pieces without it
# ('ding', '/', '05') continue the preceding token.
longformer_tokenizer.tokenize(phrase)

['I',
 'Ġam',
 'Ġa',
 'Ġsentence',
 'Ġfor',
 'Ġwhich',
 'ĠI',
 'Ġwould',
 'Ġlike',
 'Ġto',
 'Ġget',
 'Ġits',
 'Ġembed',
 'ding',
 'Ġ26',
 '/',
 '05']

In [17]:
# Inspect which literal string Longformer uses for each special-token role.
# Note that several roles share a string (bos/cls -> '<s>', eos/sep -> '</s>').
longformer_tokenizer.special_tokens_map

{'bos_token': '<s>',
 'eos_token': '</s>',
 'unk_token': '<unk>',
 'sep_token': '</s>',
 'pad_token': '<pad>',
 'cls_token': '<s>',
 'mask_token': '<mask>'}

In [21]:
import string
import random
def generate_random_words(n_words=10):
    """Return a single random string of lowercase ASCII letters.

    Despite the name, the result is one space-free string rather than a
    sequence of words: ``n_words`` is the number of characters drawn.

    Args:
        n_words: Number of letters to draw (default 10). A value of 0 or
            less yields an empty string.

    Returns:
        A ``str`` of ``n_words`` letters, each picked independently with
        ``random.choice`` from ``string.ascii_lowercase``.
    """
    letters = []
    for _ in range(n_words):
        # One draw per character, in order, from the module-level RNG.
        letters.append(random.choice(string.ascii_lowercase))
    return ''.join(letters)

In [23]:
# Build the synthetic training corpus: 1000 random lowercase strings.
# The RNG is seeded first so that the corpus -- and therefore the trained
# vocabulary and the token ids shown further down -- is reproducible on a
# fresh "Restart & Run All".
random.seed(42)
texts = [generate_random_words() for _ in range(1000)]

# Training

In [33]:
from tokenizers import trainers, Tokenizer
from tokenizers.models import Unigram

In [36]:
# Wrap a default-constructed Unigram model in a Tokenizer; it is trained on
# the synthetic corpus in the cells below.
tokenizer = Tokenizer(Unigram())

In [37]:
# Collect the special-token strings to reserve in the new vocabulary.
# Longformer maps several roles to the same string ('<s>' for bos/cls,
# '</s>' for eos/sep), so the values are de-duplicated while keeping their
# first-seen order (dict.fromkeys preserves insertion order).
# '<unk>' is left out here because it is handed to the trainer separately
# through its `unk_token` argument; filtering (instead of list.remove) also
# avoids a ValueError if the map ever lacks '<unk>'.
special_tokens = [
    token
    for token in dict.fromkeys(longformer_tokenizer.special_tokens_map.values())
    if token != '<unk>'
]

In [38]:
# Configure the Unigram trainer. The requested vocabulary size is borrowed
# from the Longformer tokenizer; the size actually learned can be smaller
# (see the get_vocab_size() cell below). '<unk>' is supplied via `unk_token`
# rather than in `special_tokens`.
trainer = trainers.UnigramTrainer(
    vocab_size=longformer_tokenizer.vocab_size, 
    special_tokens=special_tokens,
    unk_token='<unk>'
)

In [55]:
# Train the Unigram model in place. `[texts]` wraps the corpus in an outer
# list, so the iterator yields a single item holding all strings --
# presumably relying on train_from_iterator accepting batches (lists of
# strings) as items; confirm against the tokenizers documentation.
tokenizer.train_from_iterator([texts], trainer)





In [56]:
# Check how many entries the trained vocabulary actually contains.
tokenizer.get_vocab_size()

1673

In [57]:
# Encode the sample phrase with the freshly trained tokenizer; the returned
# Encoding's repr reports the token count and the available attributes.
tokenizer.encode(phrase)

Encoding(num_tokens=41, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [58]:
# Persist the trained tokenizer to the relative path 'tokenizer.json' so it
# can be reloaded through transformers in the next cell.
tokenizer.save('tokenizer.json')

In [61]:
from transformers import PreTrainedTokenizerFast

# Wrap the saved tokenizer file in a transformers fast tokenizer.
# A lone tokenizer.json is loaded through the constructor's `tokenizer_file`
# argument (from_pretrained expects a model directory or hub id, and it
# leaves the special-token attributes unset for a bare file).
# The Longformer special-token strings are registered explicitly so that
# unk/pad/cls/sep/mask/bos/eos are defined on the wrapper; these are the same
# strings the tokenizer was trained with.
unigram_tokenizer = PreTrainedTokenizerFast(
    tokenizer_file='tokenizer.json',
    **longformer_tokenizer.special_tokens_map,
)



In [64]:
# Encode the phrase through the transformers wrapper; the result holds
# 'input_ids', 'token_type_ids' and 'attention_mask' lists of equal length.
unigram_tokenizer(phrase)

{'input_ids': [0, 396, 0, 17, 0, 7, 129, 52, 160, 31, 0, 51, 32, 0, 90, 15, 134, 0, 28, 465, 27, 18, 0, 45, 882, 0, 125, 0, 533, 19, 0, 15, 457, 0, 690, 14, 302, 18, 62, 20, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}