### GPT Tokenizer live

[Tiktokenizer](https://tiktokenizer.vercel.app/)

In [1]:
import tiktoken
import collections, re
import pprint as pp

In [2]:
# Load tiktoken's "cl100k_base" encoding object.
enc = tiktoken.get_encoding("cl100k_base")

In [3]:
# presumably a collection of each token's raw bytes (the commented-out loop in
# the next cell decodes entries as UTF-8) — verify against the tiktoken docs
vocab = enc.token_byte_values()

In [4]:
# for v in vocab:
#     print(v.decode('utf-8', errors='ignore'))

### BPE by [Sennrich et al., 2016](https://arxiv.org/abs/1508.07909)

In [5]:
def get_stats(vocab):
    """Count adjacent symbol pairs across a word-frequency vocabulary.

    Args:
        vocab: mapping from a word written as whitespace-separated symbols
            (e.g. ``'l o w </w>'``) to that word's frequency.

    Returns:
        A ``collections.defaultdict(int)`` mapping ``(left, right)`` symbol
        pairs to the summed frequency of the words they occur in.
    """
    pair_freqs = collections.defaultdict(int)
    for entry, count in vocab.items():
        parts = entry.split()
        # Pair every symbol with its right-hand neighbour.
        for left, right in zip(parts, parts[1:]):
            pair_freqs[left, right] += count
    return pair_freqs


def merge_vocab(pair, v_in):
    """Fuse every occurrence of a symbol pair in the vocabulary's words.

    Args:
        pair: the two symbols to merge, e.g. ``('e', 's')``.
        v_in: mapping from space-separated-symbol words to frequencies.

    Returns:
        A new dict with the same frequencies, where each standalone
        ``'<left> <right>'`` occurrence in a key is replaced by the
        concatenated symbol.
    """
    spaced = ' '.join(pair)
    fused = ''.join(pair)
    # The lookarounds require whitespace (or the string edge) on both sides,
    # so the pair only matches whole symbols, never the tail/head of others.
    matcher = re.compile(r'(?<!\S)' + re.escape(spaced) + r'(?!\S)')
    return {matcher.sub(fused, word): freq for word, freq in v_in.items()}


In [6]:
# Toy corpus: each key is a word spelled as space-separated symbols ending in
# the '</w>' marker; each value is the word's frequency.
vocab = {
    'l o w </w>': 5,
    'l o w e r </w>': 2,
    'n e w e s t </w>': 6,
    'w i d e s t </w>': 3,
}
# Number of merge iterations run by the loop in the next cell.
num_merges = 10

In [7]:
# Run the merge iterations: count pairs, pick the most frequent, fuse it.
# NOTE(review): "megre" in the messages is a typo for "merge"; kept unchanged
# so the printed text matches the recorded output.
for i in range(num_merges):
    # Tally adjacent symbol pairs over the current vocabulary.
    pairs = get_stats(vocab)
    print(f'pairs before megre {i+1}: {pp.pformat(pairs)}', '\n')
    # Most frequent pair wins; on ties max() keeps the first one encountered.
    best = max(pairs, key=lambda candidate: pairs[candidate])
    vocab = merge_vocab(best, vocab)
    # print(f'pairs after megre {i+1}: {pp.pformat(get_stats(vocab))}', '\n')
    print(f'vocab after megre {i+1}: {pp.pformat(vocab)}', '\n')


pairs before megre 1: defaultdict(<class 'int'>,
            {('d', 'e'): 3,
             ('e', 'r'): 2,
             ('e', 's'): 9,
             ('e', 'w'): 6,
             ('i', 'd'): 3,
             ('l', 'o'): 7,
             ('n', 'e'): 6,
             ('o', 'w'): 7,
             ('r', '</w>'): 2,
             ('s', 't'): 9,
             ('t', '</w>'): 9,
             ('w', '</w>'): 5,
             ('w', 'e'): 8,
             ('w', 'i'): 3}) 

vocab after megre 1: {'l o w </w>': 5,
 'l o w e r </w>': 2,
 'n e w es t </w>': 6,
 'w i d es t </w>': 3} 

pairs before megre 2: defaultdict(<class 'int'>,
            {('d', 'es'): 3,
             ('e', 'r'): 2,
             ('e', 'w'): 6,
             ('es', 't'): 9,
             ('i', 'd'): 3,
             ('l', 'o'): 7,
             ('n', 'e'): 6,
             ('o', 'w'): 7,
             ('r', '</w>'): 2,
             ('t', '</w>'): 9,
             ('w', '</w>'): 5,
             ('w', 'e'): 2,
             ('w', 'es'): 6,
             

### BPE: getting down into the Unicode trenches

In [8]:
text = "Ｕｎｉｃｏｄｅ! 🅤🅝🅘🅒🅞🅓🅔‽ 🇺‌🇳‌🇮‌🇨‌🇴‌🇩‌🇪! 😄 The very name strikes fear and awe into the hearts of programmers worldwide. We all know we ought to “support Unicode” in our software (whatever that means—like using wchar_t for all the strings, right?). But Unicode can be abstruse, and diving into the thousand-page Unicode Standard plus its dozens of supplementary annexes, reports, and notes can be more than a little intimidating. I don’t blame programmers for still finding the whole thing mysterious, even 30 years after Unicode’s inception."

In [9]:
print(len(text))

533


In [10]:
# Encode into a separate name instead of rebinding `text`: `text` stays a str,
# so this cell can be re-run (a bytes object has no .encode()).
text_bytes = text.encode('utf-8')
# Iterating over a bytes object yields ints in 0..255; these are the raw tokens.
tokens = list(text_bytes)
print(text_bytes)
print(tokens)
print(len(tokens))

b'\xef\xbc\xb5\xef\xbd\x8e\xef\xbd\x89\xef\xbd\x83\xef\xbd\x8f\xef\xbd\x84\xef\xbd\x85! \xf0\x9f\x85\xa4\xf0\x9f\x85\x9d\xf0\x9f\x85\x98\xf0\x9f\x85\x92\xf0\x9f\x85\x9e\xf0\x9f\x85\x93\xf0\x9f\x85\x94\xe2\x80\xbd \xf0\x9f\x87\xba\xe2\x80\x8c\xf0\x9f\x87\xb3\xe2\x80\x8c\xf0\x9f\x87\xae\xe2\x80\x8c\xf0\x9f\x87\xa8\xe2\x80\x8c\xf0\x9f\x87\xb4\xe2\x80\x8c\xf0\x9f\x87\xa9\xe2\x80\x8c\xf0\x9f\x87\xaa! \xf0\x9f\x98\x84 The very name strikes fear and awe into the hearts of programmers worldwide. We all know we ought to \xe2\x80\x9csupport Unicode\xe2\x80\x9d in our software (whatever that means\xe2\x80\x94like using wchar_t for all the strings, right?). But Unicode can be abstruse, and diving into the thousand-page Unicode Standard plus its dozens of supplementary annexes, reports, and notes can be more than a little intimidating. I don\xe2\x80\x99t blame programmers for still finding the whole thing mysterious, even 30 years after Unicode\xe2\x80\x99s inception.'
[239, 188, 181, 239, 189, 142

### byte level encoding

In [11]:
def get_stats(ids):
    """Count how often each pair of consecutive ids occurs.

    Args:
        ids: a sliceable sequence of token ids.

    Returns:
        A dict mapping ``(left, right)`` tuples to their occurrence count,
        keyed in order of first appearance. Empty when ``ids`` has fewer
        than two elements.
    """
    tally = {}
    for left, right in zip(ids, ids[1:]):
        key = (left, right)
        tally[key] = 1 if key not in tally else tally[key] + 1
    return tally


def merge(ids, pair, idx):
    """Replace each non-overlapping occurrence of ``pair`` in ``ids`` with ``idx``.

    The scan runs left to right; after a match both elements are consumed,
    so ``[1, 1, 1]`` with pair ``(1, 1)`` becomes ``[idx, 1]``.

    Args:
        ids: sequence of token ids.
        pair: the two consecutive ids to look for.
        idx: the id written in place of each match.

    Returns:
        A new list; ``ids`` itself is not modified.
    """
    out = []
    pos, last = 0, len(ids) - 1
    while pos <= last:
        # A match needs a right-hand neighbour, hence the `pos < last` guard.
        hit = pos < last and ids[pos] == pair[0] and ids[pos + 1] == pair[1]
        out.append(idx if hit else ids[pos])
        pos += 2 if hit else 1
    return out

vocab_size = 276  # target size: the 256 raw byte values plus the learned merges
num_merges = vocab_size - 256
ids = list(tokens)  # work on a copy so `tokens` keeps the unmerged sequence


merges = {} # (int, int) -> int
for i in range(num_merges):
    counts = get_stats(ids)
    if not counts:
        # Fewer than two tokens remain, so there is no pair left to merge;
        # max() on an empty dict would raise ValueError.
        break
    # Most frequent pair; on ties max() keeps the first one encountered.
    max_pair = max(counts, key=counts.get)
    idx = 256 + i  # new token ids start right after the byte range 0..255
    print(f"merging pair {max_pair} into new token with idx: {idx}")
    ids = merge(ids, max_pair, idx)
    merges[max_pair] = idx
    

merging pair (101, 32) into new token with idx: 256
merging pair (240, 159) into new token with idx: 257
merging pair (226, 128) into new token with idx: 258
merging pair (105, 110) into new token with idx: 259
merging pair (115, 32) into new token with idx: 260
merging pair (97, 110) into new token with idx: 261
merging pair (116, 104) into new token with idx: 262
merging pair (257, 133) into new token with idx: 263
merging pair (257, 135) into new token with idx: 264
merging pair (97, 114) into new token with idx: 265
merging pair (239, 189) into new token with idx: 266
merging pair (258, 140) into new token with idx: 267
merging pair (267, 264) into new token with idx: 268
merging pair (101, 114) into new token with idx: 269
merging pair (111, 114) into new token with idx: 270
merging pair (116, 32) into new token with idx: 271
merging pair (259, 103) into new token with idx: 272
merging pair (115, 116) into new token with idx: 273
merging pair (261, 100) into new token with idx: 27

In [12]:
# Compare sequence lengths: the raw byte tokens vs. the ids left after merging.
print("length of the tokens before merges:", len(tokens))
print("length of the tokens after merges:", len(ids))

length of the tokens before merges: 616
length of the tokens after merges: 451


### decoding (tokens to text)

In [13]:
# Seed the id -> bytes table with the 256 single-byte tokens.
vocab = dict((i, bytes([i])) for i in range(256))
# Add each merged token as the concatenation of its two parts. This relies on
# `merges` iterating in insertion order, so both parts already have entries.
for (t1, t2), idx in merges.items():
    vocab[idx] = b"".join((vocab[t1], vocab[t2]))

print(vocab)

{0: b'\x00', 1: b'\x01', 2: b'\x02', 3: b'\x03', 4: b'\x04', 5: b'\x05', 6: b'\x06', 7: b'\x07', 8: b'\x08', 9: b'\t', 10: b'\n', 11: b'\x0b', 12: b'\x0c', 13: b'\r', 14: b'\x0e', 15: b'\x0f', 16: b'\x10', 17: b'\x11', 18: b'\x12', 19: b'\x13', 20: b'\x14', 21: b'\x15', 22: b'\x16', 23: b'\x17', 24: b'\x18', 25: b'\x19', 26: b'\x1a', 27: b'\x1b', 28: b'\x1c', 29: b'\x1d', 30: b'\x1e', 31: b'\x1f', 32: b' ', 33: b'!', 34: b'"', 35: b'#', 36: b'$', 37: b'%', 38: b'&', 39: b"'", 40: b'(', 41: b')', 42: b'*', 43: b'+', 44: b',', 45: b'-', 46: b'.', 47: b'/', 48: b'0', 49: b'1', 50: b'2', 51: b'3', 52: b'4', 53: b'5', 54: b'6', 55: b'7', 56: b'8', 57: b'9', 58: b':', 59: b';', 60: b'<', 61: b'=', 62: b'>', 63: b'?', 64: b'@', 65: b'A', 66: b'B', 67: b'C', 68: b'D', 69: b'E', 70: b'F', 71: b'G', 72: b'H', 73: b'I', 74: b'J', 75: b'K', 76: b'L', 77: b'M', 78: b'N', 79: b'O', 80: b'P', 81: b'Q', 82: b'R', 83: b'S', 84: b'T', 85: b'U', 86: b'V', 87: b'W', 88: b'X', 89: b'Y', 90: b'Z', 91: b'[',

In [14]:
def decode(ids):
    """Turn a sequence of token ids back into a string.

    Each id is looked up in the module-level ``vocab`` table (id -> bytes),
    the byte strings are concatenated, and the result is decoded as UTF-8.
    Byte sequences that are not valid UTF-8 are substituted rather than
    raising, because of ``errors='replace'``.

    Args:
        ids: iterable of token ids that are keys of ``vocab``.

    Returns:
        The decoded text.
    """
    chunks = [vocab[token_id] for token_id in ids]
    raw = b"".join(chunks)
    return raw.decode("utf-8", errors='replace')

print(decode(ids))
    

Ｕｎｉｃｏｄｅ! 🅤🅝🅘🅒🅞🅓🅔‽ 🇺‌🇳‌🇮‌🇨‌🇴‌🇩‌🇪! 😄 The very name strikes fear and awe into the hearts of programmers worldwide. We all know we ought to “support Unicode” in our software (whatever that means—like using wchar_t for all the strings, right?). But Unicode can be abstruse, and diving into the thousand-page Unicode Standard plus its dozens of supplementary annexes, reports, and notes can be more than a little intimidating. I don’t blame programmers for still finding the whole thing mysterious, even 30 years after Unicode’s inception.


### encoding more concrete (text to tokens)

Example: converting text to a UTF-8 byte stream.

In [18]:
text = "Ｕｎｉｃｏｄｅ! 🅤🅝🅘🅒🅞🅓🅔‽ 🇺‌🇳‌🇮‌🇨‌🇴‌🇩‌🇪! 😄 The very name strikes fear and awe into the hearts of programmers worldwide. We all know we ought to “support Unicode” in our software (whatever that means—like using wchar_t for all the strings, right?). But Unicode can be abstruse, and diving into the thousand-page Unicode Standard plus its dozens of supplementary annexes, reports, and notes can be more than a little intimidating. I don’t blame programmers for still finding the whole thing mysterious, even 30 years after Unicode’s inception."
# UTF-8 encode the string; list() over the bytes object exposes the individual
# byte values as ints.
encoded = text.encode("utf-8")
print(encoded, list(encoded))

b'\xef\xbc\xb5\xef\xbd\x8e\xef\xbd\x89\xef\xbd\x83\xef\xbd\x8f\xef\xbd\x84\xef\xbd\x85! \xf0\x9f\x85\xa4\xf0\x9f\x85\x9d\xf0\x9f\x85\x98\xf0\x9f\x85\x92\xf0\x9f\x85\x9e\xf0\x9f\x85\x93\xf0\x9f\x85\x94\xe2\x80\xbd \xf0\x9f\x87\xba\xe2\x80\x8c\xf0\x9f\x87\xb3\xe2\x80\x8c\xf0\x9f\x87\xae\xe2\x80\x8c\xf0\x9f\x87\xa8\xe2\x80\x8c\xf0\x9f\x87\xb4\xe2\x80\x8c\xf0\x9f\x87\xa9\xe2\x80\x8c\xf0\x9f\x87\xaa! \xf0\x9f\x98\x84 The very name strikes fear and awe into the hearts of programmers worldwide. We all know we ought to \xe2\x80\x9csupport Unicode\xe2\x80\x9d in our software (whatever that means\xe2\x80\x94like using wchar_t for all the strings, right?). But Unicode can be abstruse, and diving into the thousand-page Unicode Standard plus its dozens of supplementary annexes, reports, and notes can be more than a little intimidating. I don\xe2\x80\x99t blame programmers for still finding the whole thing mysterious, even 30 years after Unicode\xe2\x80\x99s inception.' [239, 188, 181, 239, 189, 142

In [19]:
def encode(text):
    """Encode a string into token ids using the learned ``merges`` table.

    The text is first turned into its UTF-8 byte values, then merges are
    applied repeatedly, always choosing the pair present in the sequence
    whose merged id is lowest, until no mergeable pair remains.

    Args:
        text: the string to encode.

    Returns:
        A list of int token ids. Empty input yields ``[]``; a single-byte
        input yields that one byte value.
    """
    tokens = list(text.encode("utf-8"))
    # A pair needs at least two tokens. Without this bound, min() below is
    # called on an empty dict and raises ValueError for empty or one-byte
    # input, or once the whole sequence has collapsed into one token.
    while len(tokens) >= 2:
        stats = get_stats(tokens)
        # Pairs that are not in `merges` rank as +inf, so they are only
        # selected when nothing mergeable is left.
        pair = min(stats, key=lambda p: merges.get(p, float("inf")))
        if pair not in merges:
            break
        tokens = merge(tokens, pair, merges[pair])
    return tokens


### Testing

In [20]:
# Round-trip check: decoding the encoded text should give back the same string.
text2 = decode(encode(text))
text2 == text

True

### Onto the complex tokenizers

#### GPT 2 Pattern

In [21]:
# NOTE(review): this rebinds the name `re` — imported from the standard library
# in the first cell — to the third-party `regex` package for every cell run
# after this one (including re-runs of merge_vocab).
import regex as re
# Alternatives, in order: the contraction suffixes 's 't 're 've 'm 'll 'd;
# an optional space + a run of letters (\p{L}); an optional space + a run of
# numbers (\p{N}); an optional space + a run of other non-whitespace symbols;
# whitespace not followed by a non-space character; any remaining whitespace.
gpt2pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
print(re.findall(gpt2pat, "hello world !!!!?"))

['hello', ' world', ' !!!!?']


#### GPT 2 Vocab and encoder files

In [23]:
import os, json

# encoder.json: presumably the GPT-2 token-string -> id table — verify against
# the asset. Read it explicitly as UTF-8 (as the vocab.bpe cell does) instead
# of relying on the platform's default text encoding.
with open('assets/encoder.json', 'r', encoding='utf-8') as f:
    encoder = json.load(f)

print(len(encoder))

50257


In [24]:
# Read the whole merges file into memory as text.
with open('assets/vocab.bpe', mode='r', encoding='utf-8') as f:
    bpe_data = f.read()

# Drop the first and the last '\n'-separated entries (presumably a header line
# and the empty string after the trailing newline — verify against the asset),
# then split each remaining line on whitespace into a tuple of symbols.
bpe_merges = [
    tuple(line.split())
    for line in bpe_data.split('\n')[1:-1]
]

In [25]:
bpe_merges

[('Ġ', 't'),
 ('Ġ', 'a'),
 ('h', 'e'),
 ('i', 'n'),
 ('r', 'e'),
 ('o', 'n'),
 ('Ġt', 'he'),
 ('e', 'r'),
 ('Ġ', 's'),
 ('a', 't'),
 ('Ġ', 'w'),
 ('Ġ', 'o'),
 ('e', 'n'),
 ('Ġ', 'c'),
 ('i', 't'),
 ('i', 's'),
 ('a', 'n'),
 ('o', 'r'),
 ('e', 's'),
 ('Ġ', 'b'),
 ('e', 'd'),
 ('Ġ', 'f'),
 ('in', 'g'),
 ('Ġ', 'p'),
 ('o', 'u'),
 ('Ġa', 'n'),
 ('a', 'l'),
 ('a', 'r'),
 ('Ġt', 'o'),
 ('Ġ', 'm'),
 ('Ġo', 'f'),
 ('Ġ', 'in'),
 ('Ġ', 'd'),
 ('Ġ', 'h'),
 ('Ġan', 'd'),
 ('i', 'c'),
 ('a', 's'),
 ('l', 'e'),
 ('Ġt', 'h'),
 ('i', 'on'),
 ('o', 'm'),
 ('l', 'l'),
 ('en', 't'),
 ('Ġ', 'n'),
 ('Ġ', 'l'),
 ('s', 't'),
 ('Ġ', 're'),
 ('v', 'e'),
 ('Ġ', 'e'),
 ('r', 'o'),
 ('l', 'y'),
 ('Ġb', 'e'),
 ('Ġ', 'g'),
 ('Ġ', 'T'),
 ('c', 't'),
 ('Ġ', 'S'),
 ('i', 'd'),
 ('o', 't'),
 ('Ġ', 'I'),
 ('u', 't'),
 ('e', 't'),
 ('Ġ', 'A'),
 ('Ġ', 'is'),
 ('Ġ', 'on'),
 ('i', 'm'),
 ('a', 'm'),
 ('o', 'w'),
 ('a', 'y'),
 ('a', 'd'),
 ('s', 'e'),
 ('Ġth', 'at'),
 ('Ġ', 'C'),
 ('i', 'g'),
 ('Ġf', 'or'),
 ('a', 'c'),
 ('Ġ