In [2]:
text = "This is some text"
byte_ary = bytearray(text,"utf-8")
print(bytearray)

<class 'bytearray'>


In [3]:
ids = list(byte_ary)
print(ids)

[84, 104, 105, 115, 32, 105, 115, 32, 115, 111, 109, 101, 32, 116, 101, 120, 116]


In [4]:
print("Number of characters:", len(text))
print("Number of token IDs:", len(ids))

Number of characters: 17
Number of token IDs: 17


#####  BPE tokenizers have a vocabulary where we have a token ID for whole words or subwords instead of each character

### BPE Algorithm outline
### 1. Identify frequent pairs

In each iteration, scan the text to find the most commonly occurring pair of bytes (or characters)

### 2. Replace and record

Replace that pair with a new placeholder ID (one not already in use, e.g., if we start with 0...255, the first placeholder would be 256)
Record this mapping in a lookup table
The size of the lookup table is a hyperparameter, also called "vocabulary size" (for GPT-2, that's 50,257)

### 3. Repeat until no gains

Keep repeating steps 1 and 2, continually merging the most frequent pairs
Stop when no further compression is possible (e.g., no pair occurs more than once)
Decompression (decoding)

To restore the original text, reverse the process by substituting each ID with its corresponding pair, using the lookup table

In [None]:
from collections import Counter, deque
from functools import lru_cache
import json

class BPETokenizerSimple:
    def __init__(self):
        self.vocab = {}
        self.inverse_vocab = {}
        self.bpe_merges = {}
        self.bpe_ranks = {}

    def train(self, text, vocab_size, allowed_special={"<|endoftext|>"}):
        processed_text = []
        for i, char in enumerate(text):
            if char == " " and i!=0:
                processed_text.append("G")
            if char != " ":
                processed_text.append(char)
        
        unique_chars = [chr(i) for i in range(256)]
        unique_chars.extend_char for char in sorted(set(processed_text)) if char not in unique_chars]
        if "G" not in unique_chars:
            unique_chars.append("G")
        
        self.vocab = {i:char for i,char in enumerate(unique_chars)}
        self.inverse_vocab = {char:i for i,char in self.vocab.items()}

        if allowed_special:
            for token in allowed_special:
                if token not in self.inverse_vocab:
                    idx = len(self.vocab)
                    self.vocab[idx] = token
                    self.inverse_vocab[token] = idx
            
        token_ids = [self.inverse_vocab[char] for char in processed_text]

        for new_id in range(len(self.vocab), vocab_size):
            pair_id = self.find_freq_pair(token_ids, mode = "most")
            if pair_id is None:
                break
            token_ids = self.replace_pair(token_ids, pair_id, new_id)
            self.bpe_merges[pair_id] = new_id
        
        for (p0,p1), new_id in self.bpe_merges.items():
            merged_token = self.vocab[p0] + self.vocab[p1]
            self.vocab[new_id] = merged_token
            self.inverse_vocab[merged_token] = new_id