Run first the [setup notebook](./00-setup.ipynb)

# Tokenization for machine learning / language models

In [1]:
# Project Gutenberg, 244: A Study in Scarlet (en), Arthur Conan Doyle
text_en = """
This was a lofty chamber, lined and littered with countless bottles.
Broad, low tables were scattered about, which bristled with retorts,
test-tubes, and little Bunsen lamps, with their blue flickering flames.
There was only one student in the room, who was bending over a distant
table absorbed in his work. At the sound of our steps he glanced round
and sprang to his feet with a cry of pleasure. “I’ve found it! I’ve
found it,” he shouted to my companion, running towards us with a
test-tube in his hand. “I have found a re-agent which is precipitated
by hæmoglobin, and by nothing else.” Had he discovered a gold mine,
greater delight could not have shone upon his features.
""".strip()

## Byte Pair Encoding

### Step-by-step BPE Implementation (not efficient)

In [2]:
from collections import defaultdict
import nltk

text = "This course is about this topic"
word_list = [token.lower() for token in nltk.word_tokenize(text) if token.isalpha()]
word_list[:10]

['this', 'course', 'is', 'about', 'this', 'topic']

In [3]:
from collections import Counter

words = { word: (freq, [c for c in word]) for word, freq in Counter(word_list).items()}
list(words.items())[0:10]

[('this', (2, ['t', 'h', 'i', 's'])),
 ('course', (1, ['c', 'o', 'u', 'r', 's', 'e'])),
 ('is', (1, ['i', 's'])),
 ('about', (1, ['a', 'b', 'o', 'u', 't'])),
 ('topic', (1, ['t', 'o', 'p', 'i', 'c']))]

In [4]:
vocab = set()
for (freq, parts) in words.values():
    vocab = vocab | set(parts)
vocabulary = sorted(list(vocab))
print(vocabulary)

['a', 'b', 'c', 'e', 'h', 'i', 'o', 'p', 'r', 's', 't', 'u']


In [5]:
def new_pair_freqs(words):
    pair_freqs = Counter()
    for token, (freq, parts) in words.items():
        for pair in zip(parts[:-1], parts[1:]):
            pair_freqs[pair] += freq
    return pair_freqs

pair_freqs = new_pair_freqs(words)
print(pair_freqs.most_common(10))
best_pair = pair_freqs.most_common()[0][0]
best_pair

[(('i', 's'), 3), (('t', 'h'), 2), (('h', 'i'), 2), (('o', 'u'), 2), (('c', 'o'), 1), (('u', 'r'), 1), (('r', 's'), 1), (('s', 'e'), 1), (('a', 'b'), 1), (('b', 'o'), 1)]


('i', 's')

In [6]:
def merge_pair(pair, words):
    merged = [pair[0] + pair[1]]
    for token, (freq, parts) in words.items():
        i = 0
        while i < len(parts) - 1:
            if parts[i] == pair[0] and parts[i+1] == pair[1]:
                parts[i:i+2] = merged
            else:
                i += 1

vocabulary.append(''.join(best_pair))
merge_pair(best_pair, words)
words['this']

(2, ['t', 'h', 'is'])

In [7]:
vocabulary_size = 20

while len(vocabulary) < vocabulary_size:
    pair_freqs = new_pair_freqs(words)
    best_pair = pair_freqs.most_common()[0][0]
    vocabulary.append(''.join(best_pair))
    merge_pair(best_pair, words)

vocabulary.sort()
for i in range(0, len(vocabulary), 30):
    print(' '.join(vocabulary[i:i+30]))

a b c cou cour cours course e h i is o ou p r s t th this u


In [8]:
words

{'this': (2, ['this']),
 'course': (1, ['course']),
 'is': (1, ['is']),
 'about': (1, ['a', 'b', 'ou', 't']),
 'topic': (1, ['t', 'o', 'p', 'i', 'c'])}

### BPE trainer from transformers library

In [9]:
from IPython.display import display, Markdown, clear_output
from tabulate import tabulate

def print_vocabulary(tokenizer):
    headers = []
    columns = []
    for start in range(0,400,40):
        headers.append(f'{start} -> {start+40}')
        columns.append([f'{i} -> {tokenizer.decode([i])}' for i in range(start,start+40)])
    display(Markdown(tabulate(zip(*columns), headers, tablefmt="github")))

In [10]:
from tokenizers import decoders, models, normalizers, pre_tokenizers, processors, trainers, Tokenizer
from helpers import get_book

def batch_loader():
    yield get_book(244).page_content

# 1) define special tokens and create tokenizer object
unknown_token = "[UNK]"
special_tokens = [unknown_token, "[SEP]", "[MASK]", "[CLS]"]
tokenizer = Tokenizer(models.BPE(unk_token = unknown_token))

# 2) setup the trainer for BPE tokenization
trainer = trainers.BpeTrainer(
    vocab_size=5000,  
    min_frequency=3, 
    special_tokens = special_tokens, 
    continuing_subword_prefix='#', 
    end_of_word_suffix='>'
)

# 3) define how to split the text and normalize words
tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()
tokenizer.normalizer = normalizers.Sequence(
    [normalizers.NFD(), normalizers.Lowercase(), normalizers.StripAccents()]
)

# 4) train the tokens from an iterator (we just load one book)
tokenizer.train_from_iterator(batch_loader(), trainer=trainer)

# 5) Build a post-processor templates for classification (example)
tokenizer.post_processor = processors.TemplateProcessing(
    single=f"[CLS]:0 $A:0 [SEP]:0",
    pair=f"[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1",
    special_tokens=[
        ("[CLS]", tokenizer.token_to_id("[CLS]")),
        ("[SEP]", tokenizer.token_to_id("[SEP]")),
    ])   
tokenizer.decoder = decoders.ByteLevel()

# 6) encode a test text
display(Markdown(" ".join(tokenizer.encode(text_en).tokens)))
print(tokenizer.encode(text_en).tokens)

# 7) print vocabulary
print_vocabulary(tokenizer)


[CLS] this> was> a> lo #f #ty> ch #am #b #er,> l #ined> and> lit #tered> with> coun #tless> bo #t #tl #es.> bro #ad,> low> t #abl #es> were> scattered> about,> which> br #ist #led> with> ret #ort #s,> test #-t #ub #es,> and> little> bun #s #en> l #amp #s,> with> their> blue> fl #ick #ering> fl #am #es.> there> was> only> one> stud #ent> in> the> room,> who> was> b #ending> over> a> distant> table> absor #bed> in> his> work.> at> the> sound> of> our> steps> he> glanced> round> and> sprang> to> his> feet> with> a> cry> of> pleas #ure.> “i’ve> found> it #!> i’ve> found> it,”> he> shouted> to> my> companion,> running> towards> us> with> a> test #-t #u #be> in> his> hand.> “i> have> found> a> re- #ag #ent> which> is> precip #itated> by> h #æ #m #og #l #ob #in,> and> by> nothing> els #e.”> had> he> discovered> a> gold> min #e,> grea #ter> del #ight> could> not> have> sh #one> upon> his> features.> [SEP]

['[CLS]', 'this>', 'was>', 'a>', 'lo', '#f', '#ty>', 'ch', '#am', '#b', '#er,>', 'l', '#ined>', 'and>', 'lit', '#tered>', 'with>', 'coun', '#tless>', 'bo', '#t', '#tl', '#es.>', 'bro', '#ad,>', 'low>', 't', '#abl', '#es>', 'were>', 'scattered>', 'about,>', 'which>', 'br', '#ist', '#led>', 'with>', 'ret', '#ort', '#s,>', 'test', '#-t', '#ub', '#es,>', 'and>', 'little>', 'bun', '#s', '#en>', 'l', '#amp', '#s,>', 'with>', 'their>', 'blue>', 'fl', '#ick', '#ering>', 'fl', '#am', '#es.>', 'there>', 'was>', 'only>', 'one>', 'stud', '#ent>', 'in>', 'the>', 'room,>', 'who>', 'was>', 'b', '#ending>', 'over>', 'a>', 'distant>', 'table>', 'absor', '#bed>', 'in>', 'his>', 'work.>', 'at>', 'the>', 'sound>', 'of>', 'our>', 'steps>', 'he>', 'glanced>', 'round>', 'and>', 'sprang>', 'to>', 'his>', 'feet>', 'with>', 'a>', 'cry>', 'of>', 'pleas', '#ure.>', '“i’ve>', 'found>', 'it', '#!>', 'i’ve>', 'found>', 'it,”>', 'he>', 'shouted>', 'to>', 'my>', 'companion,>', 'running>', 'towards>', 'us>', 'with>', '

| 0 -> 40   | 40 -> 80   | 80 -> 120   | 120 -> 160   | 160 -> 200   | 200 -> 240    | 240 -> 280    | 280 -> 320   | 320 -> 360    | 360 -> 400    |
|-----------|------------|-------------|--------------|--------------|---------------|---------------|--------------|---------------|---------------|
| 0 ->      | 40 -> o    | 80 -> #r>   | 120 -> #q    | 160 -> and>  | 200 -> #ri    | 240 -> #ing   | 280 -> #ter> | 320 -> man    | 360 -> up>    |
| 1 ->      | 41 -> p    | 81 -> #r    | 121 -> #o>   | 161 -> of>   | 201 -> the    | 241 -> have>  | 281 -> fro   | 321 -> #un    | 361 -> #es,>  |
| 2 ->      | 42 -> q    | 82 -> #d>   | 122 -> #w>   | 162 -> ha    | 202 -> #ai    | 242 -> is>    | 282 -> #ce>  | 322 -> been>  | 362 -> do     |
| 3 ->      | 43 -> r    | 83 -> #k>   | 123 -> #6    | 163 -> #on   | 203 -> #e,>   | 243 -> #us    | 283 -> #t,>  | 323 -> #ab    | 363 -> #led>  |
| 4 -> !    | 44 -> s    | 84 -> #y>   | 124 -> #b>   | 164 -> #er>  | 204 -> #ol    | 244 -> #os    | 284 -> se    | 324 -> #ill>  | 364 -> #sel   |
| 5 -> (    | 45 -> t    | 85 -> #d    | 125 -> #x>   | 165 -> to>   | 205 -> #gh    | 245 -> #et    | 285 -> #ion  | 325 -> kn     | 365 -> #own>  |
| 6 -> )    | 46 -> u    | 86 -> #,>   | 126 -> #f>   | 166 -> #ea   | 206 -> sh     | 246 -> #ould> | 286 -> from> | 326 -> #ch    | 366 -> #itt   |
| 7 -> ,    | 47 -> v    | 87 -> #,    | 127 -> #“    | 167 -> #es   | 207 -> it>    | 247 -> an     | 287 -> #ain  | 327 -> un     | 367 -> who>   |
| 8 -> -    | 48 -> w    | 88 -> #’>   | 128 -> #7    | 168 -> hi    | 208 -> #ou>   | 248 -> #ear   | 288 -> #ul   | 328 -> #ag    | 368 -> or>    |
| 9 -> .    | 49 -> x    | 89 -> #m    | 129 -> #3    | 169 -> #ing> | 209 -> be     | 249 -> on     | 289 -> ex    | 329 -> #im    | 369 -> #ist   |
| 10 -> 0   | 50 -> y    | 90 -> #”>   | 130 -> #_    | 170 -> wa    | 210 -> #re    | 250 -> #ent   | 290 -> #t.>  | 330 -> ab     | 370 -> di     |
| 11 -> 1   | 51 -> z    | 91 -> #e>   | 131 -> #:    | 171 -> #en   | 211 -> wit    | 251 -> #ra    | 291 -> but>  | 331 -> #ked>  | 371 -> said>  |
| 12 -> 2   | 52 -> �    | 92 -> #x    | 132 -> #—>   | 172 -> #it   | 212 -> #or>   | 252 -> be>    | 292 -> they> | 332 -> ne     | 372 -> app    |
| 13 -> 3   | 53 -> œ    | 93 -> #p    | 133 -> #q>   | 173 -> wh    | 213 -> #ac    | 253 -> up     | 293 -> #ay>  | 333 -> #ers   | 373 -> #ay    |
| 14 -> 4   | 54 -> —    | 94 -> #;>   | 134 -> #�    | 174 -> #or   | 214 -> #es>   | 254 -> #is>   | 294 -> not>  | 334 -> en     | 374 -> #ir>   |
| 15 -> 5   | 55 -> ‘    | 95 -> #?    | 135 -> #)    | 175 -> #an   | 215 -> in     | 255 -> #ver   | 295 -> were> | 335 -> so>    | 375 -> #st    |
| 16 -> 6   | 56 -> ’    | 96 -> #n>   | 136 -> i>    | 176 -> #at>  | 216 -> #al    | 256 -> com    | 296 -> #ore> | 336 -> #ame>  | 376 -> pro    |
| 17 -> 7   | 57 -> “    | 97 -> #v    | 137 -> #]>   | 177 -> he>   | 217 -> #ti    | 257 -> #s,>   | 297 -> him>  | 337 -> #ak    | 377 -> #es.>  |
| 18 -> 8   | 58 -> ”    | 98 -> #.    | 138 -> #3>   | 178 -> #ar   | 218 -> you>   | 258 -> you    | 298 -> li    | 338 -> when>  | 378 -> some>  |
| 19 -> 9   | 59 -> #b   | 99 -> #t>   | 139 -> 2>    | 179 -> in>   | 219 -> #oo    | 259 -> #am    | 299 -> we>   | 339 -> one>   | 379 -> pl     |
| 20 -> :   | 60 -> #j   | 100 -> #y   | 140 -> #0    | 180 -> #ic   | 220 -> #ly>   | 260 -> #id    | 300 -> #ff   | 340 -> #gh>   | 380 -> rea    |
| 21 -> ;   | 61 -> #e   | 101 -> #—   | 141 -> #[    | 181 -> #at   | 221 -> with>  | 261 -> #ter   | 301 -> #oc   | 341 -> #op    | 381 -> tw     |
| 22 -> ?   | 62 -> #c   | 102 -> #i>  | 142 -> #)>   | 182 -> his>  | 222 -> as>    | 262 -> #nd    | 302 -> him   | 342 -> #res   | 382 -> “i>    |
| 23 -> [   | 63 -> #t   | 103 -> #k   | 143 -> #0>   | 183 -> #el   | 223 -> #e.>   | 263 -> ar     | 303 -> all>  | 343 -> are>   | 383 -> #our   |
| 24 -> ]   | 64 -> #.>  | 104 -> #h>  | 144 -> #5>   | 184 -> #on>  | 224 -> #as    | 264 -> #pp    | 304 -> #ow>  | 344 -> #ight> | 384 -> #ess>  |
| 25 -> _   | 65 -> #w   | 105 -> #m>  | 145 -> a>    | 185 -> was>  | 225 -> #ut>   | 265 -> #igh   | 305 -> #ion> | 345 -> an>    | 385 -> “w     |
| 26 -> a   | 66 -> #a   | 106 -> #’   | 146 -> #4    | 186 -> #en>  | 226 -> whic   | 266 -> on>    | 306 -> #ent> | 346 -> int    | 386 -> #ts>   |
| 27 -> b   | 67 -> #s>  | 107 -> #z   | 147 -> #8    | 187 -> that> | 227 -> al     | 267 -> sai    | 307 -> som   | 347 -> #ther> | 387 -> #er,>  |
| 28 -> c   | 68 -> #l   | 108 -> #!>  | 148 -> #œ    | 188 -> #om   | 228 -> at>    | 268 -> ch     | 308 -> #et>  | 348 -> cl     | 388 -> #ong>  |
| 29 -> d   | 69 -> #i   | 109 -> #a>  | 149 -> #9    | 189 -> #ro   | 229 -> #ld>   | 269 -> #ir    | 309 -> by>   | 349 -> “th    | 389 -> #ap    |
| 30 -> e   | 70 -> #n   | 110 -> #2   | 150 -> #7>   | 190 -> #is   | 230 -> which> | 270 -> #uc    | 310 -> her>  | 350 -> hol    | 390 -> #out>  |
| 31 -> f   | 71 -> #g   | 111 -> #1   | 151 -> #;    | 191 -> st    | 231 -> #ec    | 271 -> there> | 311 -> no>   | 351 -> #ang   | 391 -> into>  |
| 32 -> g   | 72 -> #g>  | 112 -> #:>  | 152 -> #8>   | 192 -> #ed   | 232 -> #an>   | 272 -> upon>  | 312 -> #ed.> | 352 -> #ep    | 392 -> ac     |
| 33 -> h   | 73 -> #u   | 113 -> #_>  | 153 -> th    | 193 -> #th   | 233 -> no     | 273 -> #s.>   | 313 -> #rea  | 353 -> #ad    | 393 -> could> |
| 34 -> i   | 74 -> #s   | 114 -> #!   | 154 -> the>  | 194 -> #ve>  | 234 -> #,”>   | 274 -> #y,>   | 314 -> #?”>  | 354 -> #rou   | 394 -> #ef    |
| 35 -> j   | 75 -> #l>  | 115 -> #‘   | 155 -> #er   | 195 -> #il   | 235 -> for>   | 275 -> con    | 315 -> #y.>  | 355 -> sp     | 395 -> holm   |
| 36 -> k   | 76 -> #f   | 116 -> #p>  | 156 -> #in   | 196 -> #ur   | 236 -> #le>   | 276 -> #ver>  | 316 -> me>   | 356 -> #al>   | 396 -> fac    |
| 37 -> l   | 77 -> #-   | 117 -> #c>  | 157 -> #ed>  | 197 -> had>  | 237 -> my>    | 277 -> #’s>   | 317 -> #d,>  | 357 -> ou     | 397 -> wor    |
| 38 -> m   | 78 -> #h   | 118 -> #?>  | 158 -> #nd>  | 198 -> #ow   | 238 -> #.”>   | 278 -> this>  | 318 -> as    | 358 -> #um    | 398 -> what>  |
| 39 -> n   | 79 -> #o   | 119 -> #u>  | 159 -> #ou   | 199 -> #ere> | 239 -> #em    | 279 -> #ed,>  | 319 -> #ted> | 359 -> would> | 399 -> #se>   |

### Using a pre-trained BPE tokenizer (GPT-2)

In [11]:
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

text_example="A mouse called Petar sits on the legendary throne in the ivory tower."
print(tokenizer(text_example)["input_ids"])
print([tokenizer.decode(token) for token in tokenizer(text_example)["input_ids"]])
print(tokenizer.tokenize(text_example))
print(tokenizer.decode(tokenizer(text_example)["input_ids"]))
print()

text_example="Auf dem legendären Thron im Elfenbeinturm sitzt eine Maus namens Petar."
print(tokenizer(text_example)["input_ids"])
print([tokenizer.decode(token) for token in tokenizer(text_example)["input_ids"]])
print(tokenizer.tokenize(text_example))
print(tokenizer.decode(tokenizer(text_example)["input_ids"]))

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

[32, 10211, 1444, 4767, 283, 10718, 319, 262, 13273, 19262, 287, 262, 32630, 10580, 13]
['A', ' mouse', ' called', ' Pet', 'ar', ' sits', ' on', ' the', ' legendary', ' throne', ' in', ' the', ' ivory', ' tower', '.']
['A', 'Ġmouse', 'Ġcalled', 'ĠPet', 'ar', 'Ġsits', 'Ġon', 'Ġthe', 'Ġlegendary', 'Ġthrone', 'Ġin', 'Ġthe', 'Ġivory', 'Ġtower', '.']
A mouse called Petar sits on the legendary throne in the ivory tower.

[32, 3046, 1357, 8177, 11033, 918, 536, 1313, 545, 19067, 268, 1350, 600, 333, 76, 1650, 89, 83, 304, 500, 6669, 385, 299, 321, 641, 4767, 283, 13]
['A', 'uf', ' dem', ' legend', 'ä', 'ren', ' Th', 'ron', ' im', ' Elf', 'en', 'be', 'int', 'ur', 'm', ' sit', 'z', 't', ' e', 'ine', ' Ma', 'us', ' n', 'am', 'ens', ' Pet', 'ar', '.']
['A', 'uf', 'Ġdem', 'Ġlegend', 'Ã¤', 'ren', 'ĠTh', 'ron', 'Ġim', 'ĠElf', 'en', 'be', 'int', 'ur', 'm', 'Ġsit', 'z', 't', 'Ġe', 'ine', 'ĠMa', 'us', 'Ġn', 'am', 'ens', 'ĠPet', 'ar', '.']
Auf dem legendären Thron im Elfenbeinturm sitzt eine Maus namens

## WordPiece Encoding

### Step-by-step WordPiece Implementation (not efficient)

In [12]:
from collections import defaultdict
import nltk

text = "This course is about this topic"
word_list = [token.lower() for token in nltk.word_tokenize(text) if token.isalpha()]
word_list[:10]

['this', 'course', 'is', 'about', 'this', 'topic']

In [13]:
from collections import Counter

words = { word: (freq, [word[0]] + ['##'+c for c in word[1:]]) for word, freq in Counter(word_list).items()}
list(words.items())[0:10]

[('this', (2, ['t', '##h', '##i', '##s'])),
 ('course', (1, ['c', '##o', '##u', '##r', '##s', '##e'])),
 ('is', (1, ['i', '##s'])),
 ('about', (1, ['a', '##b', '##o', '##u', '##t'])),
 ('topic', (1, ['t', '##o', '##p', '##i', '##c']))]

In [14]:
vocab = set()
for (freq, parts) in words.values():
    vocab = vocab | set(parts)
vocabulary = sorted(list(vocab))
print(vocabulary)

['##b', '##c', '##e', '##h', '##i', '##o', '##p', '##r', '##s', '##t', '##u', 'a', 'c', 'i', 't']


In [15]:
def new_pair_freqs(words):
    part_freqs = defaultdict(int)
    pair_freqs = defaultdict(int)
    for token, (freq, parts) in words.items():
        for p in parts:
            part_freqs[p] += freq
        for pair in zip(parts[:-1], parts[1:]):
            pair_freqs[pair] += freq
    return part_freqs, pair_freqs

def best_pair_wordpiece(part_freqs, pair_freqs):
    for pair, freq in pair_freqs.items():
        pair_freqs[pair] = freq / part_freqs[pair[0]] / part_freqs[pair[1]]
    return max(pair_freqs, key=pair_freqs.get)

part_freqs, pair_freqs = new_pair_freqs(words)
print(sorted(pair_freqs.items(), key=lambda x: x[1], reverse=True))
best_pair = best_pair_wordpiece(part_freqs, pair_freqs)
print(sorted(pair_freqs.items(), key=lambda x: x[1], reverse=True))
best_pair

[(('t', '##h'), 2), (('##h', '##i'), 2), (('##i', '##s'), 2), (('##o', '##u'), 2), (('c', '##o'), 1), (('##u', '##r'), 1), (('##r', '##s'), 1), (('##s', '##e'), 1), (('i', '##s'), 1), (('a', '##b'), 1), (('##b', '##o'), 1), (('##u', '##t'), 1), (('t', '##o'), 1), (('##o', '##p'), 1), (('##p', '##i'), 1), (('##i', '##c'), 1)]
[(('a', '##b'), 1.0), (('##u', '##r'), 0.5), (('##u', '##t'), 0.5), (('t', '##h'), 0.3333333333333333), (('##h', '##i'), 0.3333333333333333), (('c', '##o'), 0.3333333333333333), (('##o', '##u'), 0.3333333333333333), (('##b', '##o'), 0.3333333333333333), (('##o', '##p'), 0.3333333333333333), (('##p', '##i'), 0.3333333333333333), (('##i', '##c'), 0.3333333333333333), (('##r', '##s'), 0.25), (('##s', '##e'), 0.25), (('i', '##s'), 0.25), (('##i', '##s'), 0.16666666666666666), (('t', '##o'), 0.1111111111111111)]


('a', '##b')

In [16]:
def merge_pair(pair, words):
    merged = [pair[0] + pair[1][2:]]
    for token, (freq, parts) in words.items():
        i = 0
        while i < len(parts) - 1:
            if parts[i] == pair[0] and parts[i+1] == pair[1]:
                parts[i:i+2] = merged
            else:
                i += 1

vocabulary.append(best_pair[0] + best_pair[1][2:])
merge_pair(best_pair, words)
words['about']

(1, ['ab', '##o', '##u', '##t'])

In [17]:
vocabulary_size = 25

while len(vocabulary) < vocabulary_size:
    part_freqs, pair_freqs = new_pair_freqs(words)
    best_pair = best_pair_wordpiece(part_freqs, pair_freqs)
    vocabulary.append(best_pair[0] + best_pair[1][2:])
    merge_pair(best_pair, words)

vocabulary.sort()
for i in range(0, len(vocabulary), 30):
    print(' '.join(vocabulary[i:i+30]))

##b ##c ##e ##h ##i ##o ##p ##pi ##pic ##r ##s ##t ##u ##ur ##ut a ab abo c co cour i t th thi


In [18]:
words

{'this': (2, ['thi', '##s']),
 'course': (1, ['cour', '##s', '##e']),
 'is': (1, ['i', '##s']),
 'about': (1, ['abo', '##ut']),
 'topic': (1, ['t', '##o', '##pic'])}

### WordPiece Trainer from transformers library

In [19]:
from tokenizers import decoders, models, normalizers, pre_tokenizers, processors, trainers, Tokenizer
from helpers import get_book

def batch_loader():
    yield get_book(244).page_content

# 1) define special tokens and create tokenizer object
unknown_token = "[UNK]"
special_tokens = [unknown_token, "[SEP]", "[MASK]", "[CLS]"]
tokenizer = Tokenizer(models.WordPiece(unk_token = unknown_token))

# 2) setup the trainer for WordPiece tokenization
trainer = trainers.WordPieceTrainer(
    vocab_size=5000,  
    min_frequency=3, 
    special_tokens = special_tokens, 
    continuing_subword_prefix='#', 
    end_of_word_suffix='>'
)

# 3) define how to split the text and normalize words
tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
tokenizer.normalizer = normalizers.Sequence(
    [normalizers.NFD(), normalizers.Lowercase(), normalizers.StripAccents()]
)

# 4) train the tokens from an iterator (we just load one book)
tokenizer.train_from_iterator(batch_loader(), trainer=trainer)

# 5) Build a post-processor templates for classification (example)
tokenizer.post_processor = processors.TemplateProcessing(
    single=f"[CLS]:0 $A:0 [SEP]:0",
    pair=f"[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1",
    special_tokens=[
        ("[CLS]", tokenizer.token_to_id("[CLS]")),
        ("[SEP]", tokenizer.token_to_id("[SEP]")),
    ])   
tokenizer.decoder = decoders.ByteLevel()

# 6) encode a test text
display(Markdown(" ".join(tokenizer.encode(text_en).tokens)))
print(tokenizer.encode(text_en).tokens)

# 7) print vocabulary
print_vocabulary(tokenizer)


[CLS] th #is wa #s a lo #f #t #y cha #mb #er , lin #ed an #d litt #er #ed with count #l #ess bott #l #es . broad , low ta #bl #es we #re sc #att #er #ed ab #ou #t , whi #ch bri #st #l #ed with ret #ort #s , te #st - tu #b #es , an #d litt #l #e bun #s #en la #mp #s , with the #ir bl #u #e fl #ick #er #ing fla #m #es . ther #e wa #s on #l #y on #e stud #ent in the roo #m , who wa #s ben #din #g over a dist #ant ta #bl #e absor #b #ed in hi #s wor #k . at the sou #n #d o #f our ste #p #s he gla #n #ce #d rou #n #d an #d spr #ang to hi #s fee #t with a cr #y o #f pleas #ur #e . “ i ’ v #e fo #un #d it ! i ’ v #e fo #un #d it , ” he sho #ut #ed to my compan #ion , ru #n #ni #n #g tow #ar #ds us with a te #st - tu #b #e in hi #s hand . “ i ha #v #e fo #un #d a re - ag #ent whi #ch is precipit #at #ed by h #æ #mo #g #lo #b #in , an #d by not #h #ing el #s #e . ” ha #d he discov #er #ed a gol #d min #e , grea #ter del #ight cou #l #d not ha #v #e sho #n #e up #on hi #s fea #t #ur #es . [SEP]

['[CLS]', 'th', '#is', 'wa', '#s', 'a', 'lo', '#f', '#t', '#y', 'cha', '#mb', '#er', ',', 'lin', '#ed', 'an', '#d', 'litt', '#er', '#ed', 'with', 'count', '#l', '#ess', 'bott', '#l', '#es', '.', 'broad', ',', 'low', 'ta', '#bl', '#es', 'we', '#re', 'sc', '#att', '#er', '#ed', 'ab', '#ou', '#t', ',', 'whi', '#ch', 'bri', '#st', '#l', '#ed', 'with', 'ret', '#ort', '#s', ',', 'te', '#st', '-', 'tu', '#b', '#es', ',', 'an', '#d', 'litt', '#l', '#e', 'bun', '#s', '#en', 'la', '#mp', '#s', ',', 'with', 'the', '#ir', 'bl', '#u', '#e', 'fl', '#ick', '#er', '#ing', 'fla', '#m', '#es', '.', 'ther', '#e', 'wa', '#s', 'on', '#l', '#y', 'on', '#e', 'stud', '#ent', 'in', 'the', 'roo', '#m', ',', 'who', 'wa', '#s', 'ben', '#din', '#g', 'over', 'a', 'dist', '#ant', 'ta', '#bl', '#e', 'absor', '#b', '#ed', 'in', 'hi', '#s', 'wor', '#k', '.', 'at', 'the', 'sou', '#n', '#d', 'o', '#f', 'our', 'ste', '#p', '#s', 'he', 'gla', '#n', '#ce', '#d', 'rou', '#n', '#d', 'an', '#d', 'spr', '#ang', 'to', 'hi', '#s'

| 0 -> 40   | 40 -> 80   | 80 -> 120   | 120 -> 160   | 160 -> 200   | 200 -> 240   | 240 -> 280    | 280 -> 320    | 320 -> 360    | 360 -> 400     |
|-----------|------------|-------------|--------------|--------------|--------------|---------------|---------------|---------------|----------------|
| 0 ->      | 40 -> o    | 80 -> #p>   | 120 -> !>    | 160 -> ,>    | 200 -> #es>  | 240 -> #ich>  | 280 -> up     | 320 -> #ery>  | 360 -> ev      |
| 1 ->      | 41 -> p    | 81 -> #n>   | 121 -> #5>   | 161 -> .>    | 201 -> #el   | 241 -> which> | 281 -> #ow>   | 321 -> int    | 361 -> would>  |
| 2 ->      | 42 -> q    | 82 -> #f    | 122 -> ‘>    | 162 -> #4>   | 202 -> #at   | 242 -> for>   | 282 -> #id    | 322 -> som    | 362 -> #as     |
| 3 ->      | 43 -> r    | 83 -> #q    | 123 -> 1>    | 163 -> #7>   | 203 -> it>   | 243 -> #ol    | 283 -> li     | 323 -> #ff    | 363 -> #mes>   |
| 4 -> !    | 44 -> s    | 84 -> #k    | 124 -> #u>   | 164 -> ]>    | 204 -> #om   | 244 -> my>    | 284 -> #al    | 324 -> #ble>  | 364 -> cl      |
| 5 -> (    | 45 -> t    | 85 -> #r>   | 125 -> #7    | 165 -> [>    | 205 -> #ve>  | 245 -> #ra    | 285 -> upon>  | 325 -> #rea   | 365 -> #ong>   |
| 6 -> )    | 46 -> u    | 86 -> #c>   | 126 -> #1    | 166 -> —>    | 206 -> #ere> | 246 -> #st>   | 286 -> con    | 326 -> #ds>   | 366 -> ab      |
| 7 -> ,    | 47 -> v    | 87 -> #y    | 127 -> #3>   | 167 -> th    | 207 -> #ro   | 247 -> #ec    | 287 -> #ore>  | 327 -> #un    | 367 -> out>    |
| 8 -> -    | 48 -> w    | 88 -> #m    | 128 -> #q>   | 168 -> the>  | 208 -> st    | 248 -> is>    | 288 -> we>    | 328 -> been>  | 368 -> #us     |
| 9 -> .    | 49 -> x    | 89 -> #b    | 129 -> #�    | 169 -> #ed>  | 209 -> #ri   | 249 -> have>  | 289 -> #uc    | 329 -> en     | 369 -> who>    |
| 10 -> 0   | 50 -> y    | 90 -> #w    | 130 -> 5>    | 170 -> #in   | 210 -> #ly>  | 250 -> #em    | 290 -> but>   | 330 -> #ke>   | 370 -> sp      |
| 11 -> 1   | 51 -> z    | 91 -> #t>   | 131 -> s>    | 171 -> #er   | 211 -> #ai   | 251 -> #ould> | 291 -> #ted>  | 331 -> #al>   | 371 -> #ch     |
| 12 -> 2   | 52 -> �    | 92 -> #v    | 132 -> ’>    | 172 -> #nd>  | 212 -> had>  | 252 -> him>   | 292 -> all>   | 332 -> #ge>   | 372 -> to      |
| 13 -> 3   | 53 -> œ    | 93 -> #l>   | 133 -> (>    | 173 -> #er>  | 213 -> #ll>  | 253 -> #ver>  | 293 -> fro    | 333 -> them>  | 373 -> #ess    |
| 14 -> 4   | 54 -> —    | 94 -> #h>   | 134 -> #8    | 174 -> #ou   | 214 -> yo    | 254 -> there> | 294 -> un     | 334 -> kn     | 374 -> di      |
| 15 -> 5   | 55 -> ‘    | 95 -> #k>   | 135 -> #6>   | 175 -> and>  | 215 -> #le>  | 255 -> #ght>  | 295 -> #ked>  | 335 -> when>  | 375 -> sa      |
| 16 -> 6   | 56 -> ’    | 96 -> #j    | 136 -> v>    | 176 -> of>   | 216 -> you>  | 256 -> be>    | 296 -> from>  | 336 -> #ill>  | 376 -> #sel    |
| 17 -> 7   | 57 -> “    | 97 -> #m>   | 137 -> #2    | 177 -> ha    | 217 -> #ur   | 257 -> #me>   | 297 -> #ir    | 337 -> #our>  | 377 -> #one>   |
| 18 -> 8   | 58 -> ”    | 98 -> #a>   | 138 -> 8>    | 178 -> #ing> | 218 -> #or>  | 258 -> an     | 298 -> they>  | 338 -> ne     | 378 -> #ain>   |
| 19 -> 9   | 59 -> #o   | 99 -> #z    | 139 -> 7>    | 179 -> to>   | 219 -> the   | 259 -> ca     | 299 -> ex     | 339 -> #ough> | 379 -> or>     |
| 20 -> :   | 60 -> #l   | 100 -> ;>   | 140 -> d>    | 180 -> hi    | 220 -> #is   | 260 -> #ter>  | 300 -> not>   | 340 -> are>   | 380 -> #ell>   |
| 21 -> ;   | 61 -> #d>  | 101 -> #w>  | 141 -> e>    | 181 -> #ea   | 221 -> sh    | 261 -> #ear   | 301 -> no>    | 341 -> up>    | 381 -> #ep     |
| 22 -> ?   | 62 -> #a   | 102 -> #8>  | 142 -> 2>    | 182 -> wh    | 222 -> #an>  | 262 -> #ti    | 302 -> #la    | 342 -> #ir>   | 382 -> #ing    |
| 23 -> [   | 63 -> #u   | 103 -> #i>  | 143 -> 4>    | 183 -> #on>  | 223 -> #ld>  | 263 -> on     | 303 -> her>   | 343 -> al     | 383 -> #itt    |
| 24 -> ]   | 64 -> #g   | 104 -> #f>  | 144 -> #9>   | 184 -> wa    | 224 -> be    | 264 -> #is>   | 304 -> se     | 344 -> la     | 384 -> #king>  |
| 25 -> _   | 65 -> #h   | 105 -> #o>  | 145 -> ”>    | 185 -> #it   | 225 -> #re   | 265 -> #ent>  | 305 -> #il    | 345 -> #ul    | 385 -> #um     |
| 26 -> a   | 66 -> #e   | 106 -> #6   | 146 -> u>    | 186 -> #at>  | 226 -> wit   | 266 -> on>    | 306 -> #ered> | 346 -> #de>   | 386 -> #ood>   |
| 27 -> b   | 67 -> #x   | 107 -> #4   | 147 -> m>    | 187 -> #or   | 227 -> #ion> | 267 -> #st    | 307 -> were>  | 347 -> #ent   | 387 -> hol     |
| 28 -> c   | 68 -> #p   | 108 -> #2>  | 148 -> 6>    | 188 -> he>   | 228 -> #ce>  | 268 -> com    | 308 -> #son>  | 348 -> as     | 388 -> #out>   |
| 29 -> d   | 69 -> #s   | 109 -> 3>   | 149 -> i>    | 189 -> #en   | 229 -> #gh   | 269 -> #ay>   | 309 -> #ts>   | 349 -> an>    | 389 -> #tion>  |
| 30 -> e   | 70 -> #i   | 110 -> “>   | 150 -> c>    | 190 -> #on   | 230 -> #ow   | 270 -> #et>   | 310 -> #oun   | 350 -> fa     | 390 -> ma      |
| 31 -> f   | 71 -> #n   | 111 -> #b>  | 151 -> o>    | 191 -> #es   | 231 -> in    | 271 -> ar     | 311 -> #ther> | 351 -> #led>  | 391 -> #ed     |
| 32 -> g   | 72 -> #g>  | 112 -> ->   | 152 -> j>    | 192 -> in>   | 232 -> #oo   | 272 -> #aid>  | 312 -> #ess>  | 352 -> #ound> | 392 -> app     |
| 33 -> h   | 73 -> #r   | 113 -> #x>  | 153 -> )>    | 193 -> #en>  | 233 -> #ut>  | 273 -> #ac    | 313 -> one>   | 353 -> what>  | 393 -> pro     |
| 34 -> i   | 74 -> #e>  | 114 -> #0>  | 154 -> #v>   | 194 -> #ar   | 234 -> no    | 274 -> this>  | 314 -> by>    | 354 -> #os    | 394 -> rea     |
| 35 -> j   | 75 -> #t   | 115 -> #1>  | 155 -> a>    | 195 -> #ic   | 235 -> #th   | 275 -> #et    | 315 -> so>    | 355 -> #ain   | 395 -> tw      |
| 36 -> k   | 76 -> #s>  | 116 -> #œ   | 156 -> _>    | 196 -> that> | 236 -> with> | 276 -> #pp    | 316 -> #oc    | 356 -> #th>   | 396 -> if>     |
| 37 -> l   | 77 -> #c   | 117 -> ?>   | 157 -> h>    | 197 -> #an   | 237 -> as>   | 277 -> me>    | 317 -> #gh>   | 357 -> #ers>  | 397 -> #ation> |
| 38 -> m   | 78 -> #y>  | 118 -> l>   | 158 -> t>    | 198 -> his>  | 238 -> #se>  | 278 -> ch     | 318 -> man>   | 358 -> #own>  | 398 -> some>   |
| 39 -> n   | 79 -> #d   | 119 -> :>   | 159 -> 9>    | 199 -> was>  | 239 -> at>   | 279 -> said>  | 319 -> #ight> | 359 -> #op    | 399 -> #thing> |

### Pre-defined WordPiece Tokenizer

In [20]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
text_example="A mouse called Petar sits on the legendary throne in the ivory tower."
print(tokenizer(text_example)["input_ids"])
print([tokenizer.decode(token) for token in tokenizer(text_example)["input_ids"]])
print(tokenizer.tokenize(text_example))
print(tokenizer.decode(tokenizer(text_example)["input_ids"]))
print()

text_example="Auf dem legendären Thron im Elfenbeinturm sitzt eine Maus namens Petar."
print(tokenizer(text_example)["input_ids"])
print([tokenizer.decode(token) for token in tokenizer(text_example)["input_ids"]])
print(tokenizer.tokenize(text_example))
print(tokenizer.decode(tokenizer(text_example)["input_ids"]))

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

[101, 1037, 8000, 2170, 9004, 2906, 7719, 2006, 1996, 8987, 6106, 1999, 1996, 11554, 3578, 1012, 102]
['[CLS]', 'a', 'mouse', 'called', 'pet', '##ar', 'sits', 'on', 'the', 'legendary', 'throne', 'in', 'the', 'ivory', 'tower', '.', '[SEP]']
['a', 'mouse', 'called', 'pet', '##ar', 'sits', 'on', 'the', 'legendary', 'throne', 'in', 'the', 'ivory', 'tower', '.']
[CLS] a mouse called petar sits on the legendary throne in the ivory tower. [SEP]

[101, 21200, 17183, 5722, 12069, 2078, 16215, 4948, 10047, 17163, 2368, 19205, 3372, 3126, 2213, 4133, 2480, 2102, 27665, 5003, 2271, 2171, 3619, 9004, 2906, 1012, 102]
['[CLS]', 'auf', 'dem', 'legend', '##are', '##n', 'th', '##ron', 'im', 'elf', '##en', '##bei', '##nt', '##ur', '##m', 'sit', '##z', '##t', 'eine', 'ma', '##us', 'name', '##ns', 'pet', '##ar', '.', '[SEP]']
['auf', 'dem', 'legend', '##are', '##n', 'th', '##ron', 'im', 'elf', '##en', '##bei', '##nt', '##ur', '##m', 'sit', '##z', '##t', 'eine', 'ma', '##us', 'name', '##ns', 'pet', '##ar',