## Basic Byte-Pair-Encoding(BPE)

In [48]:
class BasicBPE:

    def __init__(self, vocab_size: int):
        self.vocab_size = vocab_size
        self.vocab = {idx: bytes([idx]) for idx in range(256)} # BPE definition
        self.merges = dict()

    def train(self, text: str):
        """
        1. Encode text into UTF-8 format.
        2. Search for pairs and merges.
        3. Apply merges and repeat 2. until end condition.
        """

        tokens = text.encode('utf-8')
        tokens = list(map(int, tokens))

        while len(self.merges) + 256 <= self.vocab_size:
            # compute tokens stats
            tokens_stats = compute_pair_of_tokens_stats(tokens)
            # get most common pair of tokens
            most_common_pair = max(tokens_stats, key=tokens_stats.get)
            new_token_id = len(self.merges) + 1 + 256 # UTF-8 has 256 ints
            # save change
            self.merges[most_common_pair] = new_token_id
            # apply change
            tokens = merge_pair(tokens, pair=most_common_pair, idx=new_token_id)

        # update tokens map
        for (p0, p1), idx in self.merges.items():
            self.vocab[idx] = self.vocab[p0] + self.vocab[p1]
    
    def encode(self, text: str):
        """
        1. Compute UTF-8 encoding of text.
        2. Apply merges to convert UTF-8 encoding to BPE encoding.
        """
        tokens = list(text.encode('utf-8'))
        while len(tokens) >= 2:
            stats = compute_pair_of_tokens_stats(tokens)
            pair = min(stats, key=lambda p: self.merges.get(p, float("inf")))
            if pair not in self.merges:
                break # nothing else can be merged
            idx = self.merges[pair]
            tokens = merge_pair(tokens, pair, idx)
        return tokens

    def decode(self, tokens):
        """
        1. Convert tokens from BPE enconding to UTF-8 encoding.
        2. Decode UTF-8 to text.
        """
        text = b"".join(self.vocab[x] for x in tokens)
        text = text.decode("utf-8", errors="replace")
        return text


def compute_pair_of_tokens_stats(tokens):

    info = {}
    for pair in zip(tokens, tokens[1:]):
        if pair not in info.keys():
            info[pair] = 1
        else:
            info[pair] += 1
    return info

def merge_pair(tokens, pair, idx):
    new_list = []
    i = 0
    total_tokens = len(tokens)
    while i < total_tokens:
        if i < total_tokens - 1 and tokens[i] == pair[0] and tokens[i+1] == pair[1]:
            new_list.append(idx)
            i += 2
        else:
            new_list.append(tokens[i])
            i += 1
    return new_list

In [47]:
# We always start with a dataset to train on. Let's download the tiny shakespeare dataset
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2026-01-14 16:31:24--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolviendo raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.108.133, ...
Conectando con raw.githubusercontent.com (raw.githubusercontent.com)[185.199.110.133]:443... conectado.
Petición HTTP enviada, esperando respuesta... 200 OK
Longitud: 1115394 (1,1M) [text/plain]
Guardando como: ‘input.txt’


2026-01-14 16:31:24 (12,3 MB/s) - ‘input.txt’ guardado [1115394/1115394]



In [50]:
text = open("input.txt", "r").read()

bpe = BasicBPE(vocab_size=300)
bpe.train(text)

decoding_example = bpe.decode([270,260])
print("Decoded example: ", decoding_example)

encoding_example = bpe.encode(text[:5])
print("Encoded example: ", encoding_example)

Decoded example:  o s 
Encoded example:  [70, 300, 297]


In [51]:
print(bpe.decode(bpe.encode("hello world")))

hello world


In [52]:
tokens_level_utf8 = text.encode("utf-8")
tokens_level_bpe = bpe.encode(text)
print(f"Number of tokens using UFT-8: {len(tokens_level_utf8)}")
print(f"Number of tokens using BPE: {len(tokens_level_bpe)}")
print(f"Compression ratio: {len(tokens_level_utf8)/len(tokens_level_bpe)*100}")

Number of tokens using UFT-8: 1115394
Number of tokens using BPE: 785969
Compression ratio: 141.91323067449224
