### GPT Tokenizer live

[Tiktokenizer](https://tiktokenizer.vercel.app/)

In [83]:
import tiktoken
import collections, re
import pprint as pp

In [4]:
# Load the "cl100k_base" encoding from tiktoken.
enc = tiktoken.get_encoding("cl100k_base")

In [16]:
# Byte values of the encoding's tokens (presumably one `bytes` entry per
# token -- confirm against the tiktoken docs).
# NOTE(review): the name `vocab` is rebound to a toy word-frequency dict in the
# BPE section further down, so this value is only available until then.
vocab = enc.token_byte_values()

In [63]:
# for v in vocab:
#     print(v.decode('utf-8', errors='ignore'))

### BPE by [Sennrich et al., 2016](https://arxiv.org/abs/1508.07909)

In [87]:
def get_stats(vocab):
    """Count adjacent symbol pairs across the vocabulary, weighted by frequency.

    Args:
        vocab: Mapping from a word written as whitespace-separated symbols
            (e.g. ``'l o w </w>'``) to that word's frequency.

    Returns:
        A ``collections.defaultdict(int)`` keyed by ``(left, right)`` symbol
        tuples. Every occurrence of a pair inside a word adds that word's
        frequency to the pair's count.
    """
    pair_counts = collections.defaultdict(int)
    for entry, count in vocab.items():
        tokens = entry.split()
        # Pair each symbol with its successor: (t0, t1), (t1, t2), ...
        for left, right in zip(tokens, tokens[1:]):
            pair_counts[left, right] += count
    return pair_counts


def merge_vocab(pair, v_in):
    """Fuse every whole-symbol occurrence of ``pair`` into a single symbol.

    Args:
        pair: Tuple ``(left, right)`` of symbols to merge.
        v_in: Mapping from words written as space-separated symbols to their
            frequencies. It is not modified.

    Returns:
        A new dict whose keys are the words of ``v_in`` with each
        ``'left right'`` occurrence replaced by ``'leftright'``. If several
        input words collapse to the same key, their frequencies are summed.
    """
    v_out = {}
    bigram = re.escape(' '.join(pair))
    # The lookarounds require whitespace (or string edge) on both sides, so the
    # pair only matches complete symbols, never a suffix/prefix of a longer one.
    p = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
    merged = ''.join(pair)
    for word, freq in v_in.items():
        # A callable replacement is inserted literally; passing `merged` as a
        # string would make re.sub interpret backslashes in it as escapes or
        # group references (e.g. a symbol like '\\w' or '\\1').
        w_out = p.sub(lambda _match: merged, word)
        # Accumulate instead of overwrite so colliding keys keep their counts.
        v_out[w_out] = v_out.get(w_out, 0) + freq
    return v_out


In [88]:
# Toy word-frequency table: each key is a word written as space-separated
# symbols ending in `</w>`; each value is that word's count.
vocab = {
    'l o w </w>': 5,
    'l o w e r </w>': 2,
    'n e w e s t </w>': 6,
    'w i d e s t </w>': 3,
}
# Number of iterations for the merge loop that follows.
num_merges = 10

In [89]:
# Repeatedly merge the most frequent adjacent symbol pair, printing the pair
# statistics before and after each merge together with the updated vocabulary.
for i in range(num_merges):
    pairs = get_stats(vocab)
    if not pairs:
        # No adjacent pairs remain (every word is a single symbol), so there is
        # nothing left to merge; max() on an empty mapping would raise ValueError.
        break
    print(f'pairs before megre {i+1}: {pp.pformat(pairs)}', '\n')
    # Ties are resolved by max(): the first pair encountered with the top count wins.
    best = max(pairs, key=pairs.get)
    vocab = merge_vocab(best, vocab)
    print(f'pairs after megre {i+1}: {pp.pformat(get_stats(vocab))}', '\n')
    print(f'vocab after megre {i+1}: {pp.pformat(vocab)}', '\n')


pairs before megre 1: defaultdict(<class 'int'>,
            {('d', 'e'): 3,
             ('e', 'r'): 2,
             ('e', 's'): 9,
             ('e', 'w'): 6,
             ('i', 'd'): 3,
             ('l', 'o'): 7,
             ('n', 'e'): 6,
             ('o', 'w'): 7,
             ('r', '</w>'): 2,
             ('s', 't'): 9,
             ('t', '</w>'): 9,
             ('w', '</w>'): 5,
             ('w', 'e'): 8,
             ('w', 'i'): 3}) 

pairs after megre 1: defaultdict(<class 'int'>,
            {('d', 'es'): 3,
             ('e', 'r'): 2,
             ('e', 'w'): 6,
             ('es', 't'): 9,
             ('i', 'd'): 3,
             ('l', 'o'): 7,
             ('n', 'e'): 6,
             ('o', 'w'): 7,
             ('r', '</w>'): 2,
             ('t', '</w>'): 9,
             ('w', '</w>'): 5,
             ('w', 'e'): 2,
             ('w', 'es'): 6,
             ('w', 'i'): 3}) 

vocab after megre 1: {'l o w </w>': 5,
 'l o w e r </w>': 2,
 'n e w es t </w>': 6,
 'w i d 