### GPT Tokenizer live

[Tiktokenizer](https://tiktokenizer.vercel.app/) (an interactive web app for exploring how tiktoken encodings split text)

In [104]:
import tiktoken
import collections, re
import pprint as pp

In [4]:
# Load the tiktoken encoding registered under the name "cl100k_base".
enc = tiktoken.get_encoding("cl100k_base")

In [16]:
# Fetch the encoder's token byte values (presumably one bytes object per
# token; verify against the tiktoken docs).
# NOTE(review): the name `vocab` is rebound to a toy BPE dictionary in the
# BPE section further down, so this value does not survive a full run.
vocab = enc.token_byte_values()

In [63]:
# for v in vocab:
#     print(v.decode('utf-8', errors='ignore'))

### BPE by [Sennrich et al., 2016](https://arxiv.org/abs/1508.07909)

In [105]:
def get_stats(vocab):
    """Tally how often each adjacent symbol pair occurs across the vocabulary.

    Args:
        vocab: dict mapping a word, written as whitespace-separated symbols
            (e.g. ``'l o w </w>'``), to its frequency.

    Returns:
        A ``collections.defaultdict(int)`` keyed by ``(left, right)`` symbol
        tuples; each value is the sum of the frequencies of every word
        occurrence of that pair.
    """
    pair_freqs = collections.defaultdict(int)
    for entry, count in vocab.items():
        pieces = entry.split()
        # Walk neighbouring symbols; a word with fewer than two symbols
        # contributes nothing.
        for left, right in zip(pieces, pieces[1:]):
            pair_freqs[left, right] += count
    return pair_freqs


def merge_vocab(pair, v_in):
    """Fuse every occurrence of ``pair`` in the vocabulary into a single symbol.

    Args:
        pair: two-item tuple ``(left, right)`` of symbols to merge.
        v_in: dict mapping a word, written as space-separated symbols, to its
            frequency. It is not modified.

    Returns:
        A new dict in which each adjacent ``left right`` occurrence has been
        replaced by the concatenated symbol. If two input words become
        identical after the merge, their frequencies are summed.
    """
    merged_symbol = ''.join(pair)
    bigram = re.escape(' '.join(pair))
    # The lookarounds pin the match to whole symbols: the pair must be
    # preceded and followed by whitespace or a string boundary.
    pattern = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
    v_out = {}
    for word, freq in v_in.items():
        # Use a callable replacement so the merged symbol is inserted
        # literally. A plain string would be treated as a replacement
        # template, so a backslash in a symbol would be rewritten
        # (e.g. '\\' + 'n' -> newline) or raise re.error.
        w_out = pattern.sub(lambda _match: merged_symbol, word)
        # Accumulate rather than overwrite so colliding words keep their
        # combined frequency.
        v_out[w_out] = v_out.get(w_out, 0) + freq
    return v_out


In [106]:
# Toy corpus for the BPE demo: each key is a word spelled as space-separated
# symbols ending in the end-of-word marker '</w>'; each value is its frequency.
vocab = {
    'l o w </w>': 5,
    'l o w e r </w>': 2,
    'n e w e s t </w>': 6,
    'w i d e s t </w>': 3,
}

# Number of merge steps the training loop performs.
num_merges = 10

In [107]:
# Greedy BPE training: on each step, count adjacent symbol pairs, pick the
# most frequent one (ties go to the pair encountered first), and merge it
# everywhere in the vocabulary.
for i in range(num_merges):
    pairs = get_stats(vocab)
    if not pairs:
        # Every word is already a single symbol; max() on an empty mapping
        # would raise ValueError, so stop cleanly instead.
        print(f'no pairs left to merge after {i} merges; stopping early')
        break
    print(f'pairs before megre {i+1}: {pp.pformat(pairs)}', '\n')
    best = max(pairs, key=pairs.get)
    vocab = merge_vocab(best, vocab)
    print(f'vocab after megre {i+1}: {pp.pformat(vocab)}', '\n')


pairs before megre 1: defaultdict(<class 'int'>,
            {('d', 'e'): 3,
             ('e', 'r'): 2,
             ('e', 's'): 9,
             ('e', 'w'): 6,
             ('i', 'd'): 3,
             ('l', 'o'): 7,
             ('n', 'e'): 6,
             ('o', 'w'): 7,
             ('r', '</w>'): 2,
             ('s', 't'): 9,
             ('t', '</w>'): 9,
             ('w', '</w>'): 5,
             ('w', 'e'): 8,
             ('w', 'i'): 3}) 

vocab after megre 1: {'l o w </w>': 5,
 'l o w e r </w>': 2,
 'n e w es t </w>': 6,
 'w i d es t </w>': 3} 

pairs before megre 2: defaultdict(<class 'int'>,
            {('d', 'es'): 3,
             ('e', 'r'): 2,
             ('e', 'w'): 6,
             ('es', 't'): 9,
             ('i', 'd'): 3,
             ('l', 'o'): 7,
             ('n', 'e'): 6,
             ('o', 'w'): 7,
             ('r', '</w>'): 2,
             ('t', '</w>'): 9,
             ('w', '</w>'): 5,
             ('w', 'e'): 2,
             ('w', 'es'): 6,
             

### BPE: getting down in the Unicode trenches

In [183]:
# Sample paragraph for the byte-level experiments; it deliberately mixes plain
# ASCII with non-ASCII characters (fullwidth letters, emoji, curly quotes).
text = "Ｕｎｉｃｏｄｅ! 🅤🅝🅘🅒🅞🅓🅔‽ 🇺‌🇳‌🇮‌🇨‌🇴‌🇩‌🇪! 😄 The very name strikes fear and awe into the hearts of programmers worldwide. We all know we ought to “support Unicode” in our software (whatever that means—like using wchar_t for all the strings, right?). But Unicode can be abstruse, and diving into the thousand-page Unicode Standard plus its dozens of supplementary annexes, reports, and notes can be more than a little intimidating. I don’t blame programmers for still finding the whole thing mysterious, even 30 years after Unicode’s inception."

In [184]:
# Assuming `text` is still the str assigned above, this is its length in
# Unicode code points, not in bytes.
print(len(text))

533


In [185]:
# Convert the text to its raw UTF-8 bytes. The isinstance guard keeps this
# cell idempotent: re-running it on the already-encoded bytes used to fail
# with "AttributeError: 'bytes' object has no attribute 'encode'".
if isinstance(text, str):
    text = text.encode('utf-8')

# Integer byte values (0-255). These are the starting tokens for byte-level
# BPE and the sequence that the later `get_stats(tokens)` cell consumes.
tokens = list(text)

for t in tokens:
    print(t)

239
188
181
239
189
142
239
189
137
239
189
131
239
189
143
239
189
132
239
189
133
33
32
240
159
133
164
240
159
133
157
240
159
133
152
240
159
133
146
240
159
133
158
240
159
133
147
240
159
133
148
226
128
189
32
240
159
135
186
226
128
140
240
159
135
179
226
128
140
240
159
135
174
226
128
140
240
159
135
168
226
128
140
240
159
135
180
226
128
140
240
159
135
169
226
128
140
240
159
135
170
33
32
240
159
152
132
32
84
104
101
32
118
101
114
121
32
110
97
109
101
32
115
116
114
105
107
101
115
32
102
101
97
114
32
97
110
100
32
97
119
101
32
105
110
116
111
32
116
104
101
32
104
101
97
114
116
115
32
111
102
32
112
114
111
103
114
97
109
109
101
114
115
32
119
111
114
108
100
119
105
100
101
46
32
87
101
32
97
108
108
32
107
110
111
119
32
119
101
32
111
117
103
104
116
32
116
111
32
226
128
156
115
117
112
112
111
114
116
32
85
110
105
99
111
100
101
226
128
157
32
105
110
32
111
117
114
32
115
111
102
116
119
97
114
101
32
40
119
104
97
116
101
118
101
114
32
116
104
97
116
32


In [171]:
def get_stats(ids):
    """Count occurrences of each consecutive pair in a sequence of ids.

    NOTE: this redefines (and therefore shadows) the vocabulary-based
    ``get_stats`` from the Sennrich BPE section of this notebook.

    Args:
        ids: sliceable sequence of token ids (e.g. a list of ints).

    Returns:
        A plain dict mapping each ``(first, second)`` neighbour pair to the
        number of times it appears, in order of first appearance.
    """
    tally = {}
    for neighbours in zip(ids, ids[1:]):
        if neighbours in tally:
            tally[neighbours] += 1
        else:
            tally[neighbours] = 1
    return tally

In [172]:
# Count adjacent token pairs and list them as (count, pair), most frequent
# first.
# `pprint` is imported as the module alias `pp` in this notebook, so the
# pretty-printer is `pp.pprint`; a bare `pprint(...)` raises NameError on a
# fresh kernel.
stats = get_stats(tokens)
pp.pprint(sorted(((v, k) for k, v in stats.items()), reverse=True))

[(20, (101, 32)),
 (15, (240, 159)),
 (12, (226, 128)),
 (12, (105, 110)),
 (10, (115, 32)),
 (10, (97, 110)),
 (10, (32, 97)),
 (9, (32, 116)),
 (8, (116, 104)),
 (7, (159, 135)),
 (7, (159, 133)),
 (7, (97, 114)),
 (6, (239, 189)),
 (6, (140, 240)),
 (6, (128, 140)),
 (6, (116, 32)),
 (6, (114, 32)),
 (6, (111, 114)),
 (6, (110, 103)),
 (6, (110, 100)),
 (6, (109, 101)),
 (6, (104, 101)),
 (6, (101, 114)),
 (6, (32, 105)),
 (5, (117, 115)),
 (5, (115, 116)),
 (5, (110, 32)),
 (5, (100, 101)),
 (5, (44, 32)),
 (5, (32, 115)),
 (4, (116, 105)),
 (4, (116, 101)),
 (4, (115, 44)),
 (4, (114, 105)),
 (4, (111, 117)),
 (4, (111, 100)),
 (4, (110, 116)),
 (4, (110, 105)),
 (4, (105, 99)),
 (4, (104, 97)),
 (4, (103, 32)),
 (4, (101, 97)),
 (4, (100, 32)),
 (4, (99, 111)),
 (4, (97, 109)),
 (4, (85, 110)),
 (4, (32, 119)),
 (4, (32, 111)),
 (4, (32, 102)),
 (4, (32, 85)),
 (3, (118, 101)),
 (3, (116, 115)),
 (3, (116, 114)),
 (3, (116, 111)),
 (3, (114, 116)),
 (3, (114, 115)),
 (3, (114, 10