In [None]:
def bpe(mergeable_ranks, token, max_rank=None):
    """Greedily merge the bytes of ``token`` using the ranks in ``mergeable_ranks``.

    ``token`` is split into single-byte pieces. On each pass the adjacent pair
    whose concatenation has the lowest rank is fused into one piece; on a tie
    the leftmost such pair wins. Merging stops when no adjacent pair has a
    rank, or, if ``max_rank`` is given, when the best available rank is
    greater than or equal to ``max_rank``.

    Args:
        mergeable_ranks: Mapping from byte sequences to integer ranks; only
            its ``.get`` method is used.
        token: Iterable of ints in ``range(256)`` (typically a ``bytes``).
        max_rank: Optional exclusive upper bound on the ranks that may be
            applied.

    Returns:
        The list of ``bytes`` pieces remaining after merging.
    """
    pieces = [bytes([byte]) for byte in token]

    while True:
        best_pos = None
        best_rank = None

        # Scan every adjacent pair and remember the lowest-ranked one.
        for pos in range(len(pieces) - 1):
            candidate = mergeable_ranks.get(pieces[pos] + pieces[pos + 1])
            if candidate is None:
                continue
            # Strict "<" keeps the leftmost pair when ranks are equal.
            if best_rank is None or candidate < best_rank:
                best_pos, best_rank = pos, candidate

        if best_rank is None:
            # Nothing left that can be merged.
            break
        if max_rank is not None and best_rank >= max_rank:
            # The next merge would be at or beyond the requested cut-off.
            break

        assert best_pos is not None
        # Replace the two neighbouring pieces with their concatenation.
        pieces[best_pos:best_pos + 2] = [pieces[best_pos] + pieces[best_pos + 1]]

    return pieces

def recover_merges(mergeable_ranks):
    """Rebuild the pairwise merge table implied by ``mergeable_ranks``.

    For every token longer than one byte, ``bpe`` is run on that token with
    ``max_rank`` set to the token's own rank, so merging stops just before the
    token itself would be formed. The two pieces that remain are the pair that
    was merged to create the token.

    Args:
        mergeable_ranks: Mapping from byte sequences to integer ranks.

    Returns:
        A dict mapping ``(rank_of_left_piece, rank_of_right_piece)`` to the
        rank of the token those two pieces form.

    Raises:
        AssertionError: If a token does not reduce to exactly two pieces.
        KeyError: If one of the two pieces is not a key of ``mergeable_ranks``.
    """
    recovered = {}
    for token, rank in mergeable_ranks.items():
        # Single-byte tokens are the base alphabet; nothing was merged.
        if len(token) != 1:
            pair = tuple(bpe(mergeable_ranks, token, max_rank=rank))
            assert len(pair) == 2, f"Failed on {token!r} -> {pair}"
            left, right = pair
            left_id = mergeable_ranks[left]
            right_id = mergeable_ranks[right]
            recovered[left_id, right_id] = rank
    return recovered

# Toy vocabulary shaped like cl100k data: byte sequence -> rank.
# Single bytes are the base tokens; the longer entries are merged tokens.
mergeable_ranks = {
    b"a": 97,
    b"b": 98,
    b"c": 99,
    b"ab": 256,
    b"abc": 257,
    b"bc": 258,
}

# Recover which two token ids were combined to form each merged token.
merges = recover_merges(mergeable_ranks)

# Print one "(left, right) → merged" line per recovered merge.
for pair_ids, new in merges.items():
    a, b = pair_ids
    print(f"({a}, {b}) → {new}")

[b'abc']
[b'ab', b'c']
[b'abcd']
[b'ab', b'cd']
[b'ab', b'cd']


recover_merges(), with help from bpe(), reconstructs how byte sequences were merged together when the GPT-4 tokenizer vocabulary (as shipped in tiktoken) was built.
It is a form of reverse-engineering: recovering the original merge steps from the final result.

The GPT-4 tokenizer gives you only the final merged tokens and their ranks (rank = merge order).
But it does not give the original merge pairs.

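For each token, recover_merges() calls bpe() on that token's bytes with max_rank set to the token's own rank. This replays every merge that happened earlier and stops just before the token itself would be formed, so exactly two smaller tokens remain: the pair that was combined last. From those pairs it recreates the missing merge rules.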

######
recover_merges() answers this question:
"Which two old tokens were glued together to make this new token?"
Tiktoken gives you the final tokens (like "ing" → token 1234), but hides which two pieces made it.
recover_merges() looks at each final token and says:

For "ing", it figures out: it was "in" + "g" that got merged.
Then it finds the IDs: "in" was 567, "g" was 103 → so merge (567, 103) → 1234

It does this for every token longer than one byte, which rebuilds the full merge table that OpenAI's tokenizer uses.

# The bpe and recover_merges functions work out how the GPT-4 cl100k_base vocabulary, obtained from
# tiktoken.get_encoding("cl100k_base"), was built. cl100k_base only gives us a ready-made vocabulary
# of the form {"abc": 256}; it carries no information about how the pieces were merged together.
# With bpe and recover_merges we can find out how they were merged.