In [2]:
import warnings

# Suppress all warnings so the recorded cell outputs stay uncluttered.
warnings.filterwarnings(action='ignore')

# One-sentence corpus; every trainer in this notebook is fitted on it.
training_data = ["walker walked a long walk"]

In [4]:
## Byte Pair Encoding - BPE

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer

# The trainer is limited to a 14-entry vocabulary.
bpe_trainer = BpeTrainer(vocab_size=14)

# The BPE model is created with default arguments (no unk_token is passed),
# and the Whitespace pre-tokenizer is attached to the tokenizer.
bpe_model = BPE()
bpe_tokenizer = Tokenizer(bpe_model)
bpe_tokenizer.pre_tokenizer = Whitespace()

# Fit on the in-memory corpus defined in the first cell.
bpe_tokenizer.train_from_iterator(training_data, trainer=bpe_trainer)

In [5]:
# Token -> id map learned by BPE. The recorded output holds the ten single
# characters of the corpus plus the merges 'al', 'wal', 'walk' and 'walke'.
bpe_vocab = bpe_tokenizer.get_vocab()
bpe_vocab

{'o': 7,
 'r': 8,
 'w': 9,
 'walke': 13,
 'e': 2,
 'n': 6,
 'wal': 11,
 'al': 10,
 'walk': 12,
 'd': 1,
 'g': 3,
 'k': 4,
 'l': 5,
 'a': 0}

In [6]:
# Tokenize the training sentence itself and show the token strings.
# In the recorded output 'long' falls apart into single characters because
# no merge covering it fit into the 14-entry vocabulary.
bpe_sentence_encoding = bpe_tokenizer.encode("walker walked a long walk")
bpe_sentence_encoding.tokens

['walke', 'r', 'walke', 'd', 'a', 'l', 'o', 'n', 'g', 'walk']

In [7]:
# 'wlk' is not a training word, but each of its characters is in the
# vocabulary; the recorded ids are the per-character ids (w=9, l=5, k=4).
bpe_wlk_encoding = bpe_tokenizer.encode("wlk")
bpe_wlk_encoding.ids

[9, 5, 4]

In [8]:
# Same input as the previous cell, shown as token strings instead of ids.
bpe_wlk_encoding = bpe_tokenizer.encode("wlk")
bpe_wlk_encoding.tokens

['w', 'l', 'k']

In [9]:
# 's' and 'h' never occur in the training sentence. The recorded output
# shows them silently missing from the result ('she' -> 'e'): nothing marks
# the position of the characters the vocabulary cannot represent.
bpe_she_walked_encoding = bpe_tokenizer.encode("she walked")
bpe_she_walked_encoding.tokens

['e', 'walke', 'd']

In [25]:
## WordPiece

from tokenizers.models import WordPiece
from real_wordpiece.trainer import RealWordPieceTrainer

# NOTE: Tokenizer and Whitespace are not imported in this cell; they must
# already be in the kernel namespace from the BPE cell.

# This trainer comes from the separate real_wordpiece package and is capped
# at 27 vocabulary entries.
real_wordpiece_trainer = RealWordPieceTrainer(vocab_size=27)

# WordPiece() is built with default arguments; no unk_token is passed here.
real_wordpiece_model = WordPiece()
real_wordpiece_tokenizer = Tokenizer(real_wordpiece_model)
real_wordpiece_tokenizer.pre_tokenizer = Whitespace()

In [11]:
# Train on the shared corpus. The trainer's return value is not used; the
# tokenizer that was passed in is queried for its vocabulary afterwards.
real_wordpiece_trainer.train_tokenizer(training_data, real_wordpiece_tokenizer)

# Token -> id map; '##'-prefixed entries are the non-initial pieces seen in
# the recorded output (e.g. '##er', '##ed', '##lk').
real_wordpiece_vocab = real_wordpiece_tokenizer.get_vocab()
real_wordpiece_vocab

{'n': 17,
 'long': 21,
 '##n': 10,
 'd': 15,
 'g': 18,
 '##d': 8,
 'a': 5,
 'o': 16,
 '##g': 11,
 '##ed': 23,
 'e': 13,
 '##l': 2,
 '##er': 22,
 '##k': 3,
 '##lk': 25,
 '##e': 4,
 'wa': 24,
 'w': 0,
 '##o': 9,
 '##r': 7,
 'lo': 19,
 '##ng': 20,
 'l': 6,
 'k': 12,
 'walk': 26,
 '##a': 1,
 'r': 14}

In [12]:
# Tokenize the training sentence. The recorded output splits the inflected
# forms into a shared stem plus suffix ('walk' + '##er', 'walk' + '##ed')
# and keeps 'long' as a single token.
real_wordpiece_sentence_encoding = real_wordpiece_tokenizer.encode(
    "walker walked a long walk"
)
real_wordpiece_sentence_encoding.tokens

['walk', '##er', 'walk', '##ed', 'a', 'long', 'walk']

In [13]:
# Unseen word built only from known characters: the recorded output is the
# initial piece 'w' followed by the continuation piece '##lk'.
real_wordpiece_wlk_encoding = real_wordpiece_tokenizer.encode("wlk")
real_wordpiece_wlk_encoding.tokens

['w', '##lk']

In [14]:
# 'she' contains characters ('s', 'h') that never occur in the training
# sentence, and the vocabulary trained above has no '[UNK]' entry, so
# encode() fails with "Missing [UNK] token from the vocabulary".
# The failure is the point of this cell, so it is caught and printed instead
# of being left uncaught: an uncaught exception stops "Run All" here and the
# sections that follow (which add the [UNK] token) would never execute.
try:
    real_wordpiece_she_walked_encoding = real_wordpiece_tokenizer.encode(
        "she walked"
    )
except Exception as err:  # the library reports this as a plain Exception
    # Same "<type>: <message>" shape as the traceback summary line.
    print(f"{type(err).__name__}: {err}")
else:
    # Only reached if the vocabulary can represent the input after all.
    print(real_wordpiece_she_walked_encoding.tokens)

Exception: WordPiece error: Missing [UNK] token from the vocabulary

In [27]:
## HuggingFace WordPiece and special tokens

from tokenizers.trainers import WordPieceTrainer

# The same string is given to the model (as its unknown token) and to the
# trainer (as a special token), so it ends up in the trained vocabulary.
unk_token = "[UNK]"

# 28 entries: one more than the 27 used for the previous WordPiece trainer.
wordpiece_trainer = WordPieceTrainer(vocab_size=28, special_tokens=[unk_token])

# Tokenizer, WordPiece and Whitespace come from imports in earlier cells.
wordpiece_model = WordPiece(unk_token=unk_token)
wordpiece_tokenizer = Tokenizer(wordpiece_model)
wordpiece_tokenizer.pre_tokenizer = Whitespace()

In [28]:
# Fit the HuggingFace WordPiece tokenizer on the shared corpus.
wordpiece_tokenizer.train_from_iterator(training_data, trainer=wordpiece_trainer)

# Token -> id map; in the recorded output '[UNK]' holds id 0 and the whole
# words 'walker' and 'walked' received their own entries.
wordpiece_vocab = wordpiece_tokenizer.get_vocab()
wordpiece_vocab

{'r': 9,
 '##d': 19,
 '##r': 18,
 '##e': 17,
 '##k': 16,
 'walk': 22,
 'walker': 26,
 '##o': 11,
 'walke': 23,
 'lo': 24,
 'k': 5,
 'o': 8,
 '[UNK]': 0,
 '##n': 12,
 'g': 4,
 'w': 10,
 '##a': 14,
 'walked': 27,
 'wa': 20,
 'l': 6,
 '##l': 15,
 '##g': 13,
 'a': 1,
 'd': 2,
 'n': 7,
 '##lk': 21,
 '##ng': 25,
 'e': 3}

In [29]:
# Tokenize the training sentence. Per the recorded output, 'walker' and
# 'walked' are single tokens here while 'long' is split into 'lo' + '##ng'.
wordpiece_sentence_encoding = wordpiece_tokenizer.encode(
    "walker walked a long walk"
)
wordpiece_sentence_encoding.tokens

['walker', 'walked', 'a', 'lo', '##ng', 'walk']

In [30]:
# Unseen word made of known characters: recorded as 'w' + '##lk'.
wordpiece_wlk_encoding = wordpiece_tokenizer.encode("wlk")
wordpiece_wlk_encoding.tokens

['w', '##lk']

In [31]:
# With '[UNK]' in the vocabulary this input no longer raises: the recorded
# output replaces the whole word 'she' with a single '[UNK]' token.
wordpiece_she_walked_encoding = wordpiece_tokenizer.encode("she walked")
wordpiece_she_walked_encoding.tokens

['[UNK]', 'walked']

In [32]:
## Unigram

from tokenizers.models import Unigram
from tokenizers.trainers import UnigramTrainer

# `unk_token` is the "[UNK]" string defined in the HuggingFace WordPiece
# cell; it is registered both as a special token and as the unknown token.
unigram_trainer = UnigramTrainer(
    vocab_size=14,
    unk_token=unk_token,
    special_tokens=[unk_token],
)

# Tokenizer and Whitespace come from imports in the BPE cell.
unigram_model = Unigram()
unigram_tokenizer = Tokenizer(unigram_model)
unigram_tokenizer.pre_tokenizer = Whitespace()

unigram_tokenizer.train_from_iterator(training_data, trainer=unigram_trainer)

# Token -> id map. The recorded output has 13 entries (ids 0-12), one fewer
# than the requested vocab_size of 14, with '[UNK]' at id 0.
unigram_vocab = unigram_tokenizer.get_vocab()
unigram_vocab

{'a': 5,
 'walk': 4,
 'k': 3,
 'n': 9,
 'walke': 1,
 '[UNK]': 0,
 'e': 8,
 'd': 7,
 'g': 10,
 'o': 11,
 'r': 12,
 'l': 6,
 'w': 2}

In [33]:
# Tokenize the training sentence; the recorded tokens match the BPE result
# for the same input ('walke' + 'r', 'walke' + 'd', 'long' per character).
unigram_sentence_encoding = unigram_tokenizer.encode("walker walked a long walk")
unigram_sentence_encoding.tokens

['walke', 'r', 'walke', 'd', 'a', 'l', 'o', 'n', 'g', 'walk']

In [34]:
# Unseen word made of known characters: recorded as three single characters.
unigram_wlk_encoding = unigram_tokenizer.encode("wlk")
unigram_wlk_encoding.tokens

['w', 'l', 'k']

In [35]:
# 's' and 'h' are not in the vocabulary. The recorded token list still shows
# the original text 'sh' for that span; the ids in the next cell reveal that
# it is actually encoded as the unknown token.
unigram_she_walked_encoding = unigram_tokenizer.encode("she walked")
unigram_she_walked_encoding.tokens

['sh', 'e', 'walke', 'd']

In [36]:
# Same input as the previous cell, shown as ids: the leading 0 is the id of
# '[UNK]' in the recorded vocabulary, i.e. the 'sh' span is unknown.
unigram_she_walked_encoding = unigram_tokenizer.encode("she walked")
unigram_she_walked_encoding.ids

[0, 8, 1, 7]