# MinBPE exercise

Implement the tokenizer we describe in the video and exercise

In [55]:
from IPython.display import display, Markdown
import regex as re
from multiprocessing import Pool
import os

In [10]:
# some basic tests
test_strings = [
    "", # empty string
    "?", # single character
    "hello world!!!? (안녕하세요!) lol123 😉", # fun small string
    """The llama (/ˈlɑːmə/; Spanish pronunciation: [ˈʎama] or [ˈʝama]) (Lama glama) is a domesticated South American camelid, widely used as a meat and pack animal by Andean cultures since the pre-Columbian era.

Llamas are social animals and live with others as a herd. Their wool is soft and contains only a small amount of lanolin.[2] Llamas can learn simple tasks after a few repetitions. When using a pack, they can carry about 25 to 30% of their body weight for 8 to 13 km (5–8 miles).[3] The name llama (also historically spelled "lama" or "glama") was adopted by European settlers from native Peruvians.[4]""",
"FILE:taylorswift.txt"
]

def unpack(text):
    # we do this because `pytest -v .` prints the arguments to console, and we don't
    # want to print the entire contents of the file, it creates a mess. So here we go.
    if text.startswith("FILE:"):
        taylorswift_file = text[5:]
        contents = open(taylorswift_file, "r", encoding="utf-8").read()
        return contents
    else:
        return text

In [31]:
# read content
# source: https://www.kaggle.com/datasets/ukveteran/big-text
with open("big.txt", "r") as file:
    text = file.read()

# lets train on 1st million words in this
text = text[:1000000]

len(text)

1000000

In [14]:
display(Markdown(text[:1000]))

The Project Gutenberg EBook of The Adventures of Sherlock Holmes
by Sir Arthur Conan Doyle
(#15 in our series by Sir Arthur Conan Doyle)

Copyright laws are changing all over the world. Be sure to check the
copyright laws for your country before downloading or redistributing
this or any other Project Gutenberg eBook.

This header should be the first thing seen when viewing this Project
Gutenberg file.  Please do not remove it.  Do not change or edit the
header without written permission.

Please read the "legal small print," and other information about the
eBook and Project Gutenberg at the bottom of this file.  Included is
important information about your specific rights and restrictions in
how the file may be used.  You can also find out about how to make a
donation to Project Gutenberg, and how to get involved.


**Welcome To The World of Free Plain Vanilla Electronic Texts**

**eBooks Readable By Both Humans and By Computers, Since 1971**

*****These eBooks Were Prepared By Thousan

In [24]:
def get_stats(tokens):
    pair_freq_stats = {}
    for c1, c2 in zip(tokens, tokens[1:]):
        pair_freq_stats[(c1, c2)] = pair_freq_stats.get((c1, c2), 0) + 1
    return pair_freq_stats

def replace_pair(tokens, pair, pair_idx):
    new_tokens, i = [], 0

    while i < len(tokens):
        if (i < len(tokens) - 1) and ((tokens[i], tokens[i + 1]) == pair):
            new_tokens.append(pair_idx)
            i += 2
        else:
            new_tokens.append(tokens[i])
            i += 1

    return new_tokens

## BasicTokenizer

In [13]:
class BasicTokenizer:
    def __init__(self):
        self.merges = {}
        self.vocab_size = 256
        self.vocab = {idx: bytes([idx]) for idx in range(self.vocab_size)}

    def train(self, text, vocab_size, verbose=False, verbose_iters=None):
        ## Train the BPE tokenizer.
        assert vocab_size > self.vocab_size # to ensure we have a larger vocab size
        num_merges = vocab_size - self.vocab_size

        tokens = list(text.encode("utf-8"))
        old_token_length = len(tokens)

        if verbose:
            start = "Start"
            print(f"{start:.20s} | No. of tokens: {len(tokens):6d} | Vocab size: {self.vocab_size:5d}")

        for i in range(num_merges):
            # first get the stats of the bigrams
            pair_stats = get_stats(tokens)
            # get the max freq bigram
            max_freq_pair = max(pair_stats, key=lambda k: pair_stats.get(k, -float("inf")))
            # now lets create new tokens by replacing this pair with this
            tokens = replace_pair(tokens, max_freq_pair, self.vocab_size)
            # update the running variables
            self.merges[max_freq_pair] = self.vocab_size
            self.vocab[self.vocab_size] = self.vocab[max_freq_pair[0]] + self.vocab[max_freq_pair[1]]
            self.vocab_size += 1

            if verbose:
                if ((i + 1) % verbose_iters == 0) or (i == num_merges - 1):
                    print((f"Iteration {(i + 1):4d} | No. of tokens: {len(tokens):6d} | Merged pair: {str(max_freq_pair):10s} --> {self.merges[max_freq_pair]:5d}"))

        if verbose:
            compression = old_token_length / len(tokens)
            print(f"Compression : {compression:.2f} X")


    def encode(self, text):
        tokens = list(text.encode("utf-8"))

        while len(tokens)>=2:
            # first get the stats of the bigrams
            pair_stats = get_stats(text)
            # now check if there is a pair which is merged as per our tokenizer
            merge_pair = min(pair_stats, key=lambda k: self.merges.get(k, float("inf"))) # it will check if we get a merge pair candidate, else returns the first element
            # check if there is actually a match
            if self.merges.get(merge_pair) is None:
                break

            # now replace with the merges token
            tokens = replace_pair(tokens, merge_pair, self.merges[merge_pair])

        return tokens


    def decode(self, ids):
        enc_text = b"".join(self.vocab[id] for id in ids)
        text = enc_text.decode("utf-8", errors="replace")
        return text

In [15]:
# check encoded str length
len(list(text.encode("utf-8")))

1000000

In [16]:
# now lets train for some small iterations
basic_tokenizer = BasicTokenizer()
basic_tokenizer.train(text=text, vocab_size=300, verbose=True, verbose_iters=1)

Start | No. of tokens: 1000000 | Vocab size:   256
Iteration    1 | No. of tokens: 970416 | Merged pair: (101, 32)  -->   256
Iteration    2 | No. of tokens: 948950 | Merged pair: (116, 104) -->   257
Iteration    3 | No. of tokens: 932251 | Merged pair: (100, 32)  -->   258
Iteration    4 | No. of tokens: 916862 | Merged pair: (115, 32)  -->   259
Iteration    5 | No. of tokens: 902588 | Merged pair: (116, 32)  -->   260
Iteration    6 | No. of tokens: 888972 | Merged pair: (105, 110) -->   261
Iteration    7 | No. of tokens: 875719 | Merged pair: (101, 114) -->   262
Iteration    8 | No. of tokens: 863648 | Merged pair: (97, 110)  -->   263
Iteration    9 | No. of tokens: 852577 | Merged pair: (44, 32)   -->   264
Iteration   10 | No. of tokens: 842084 | Merged pair: (257, 256) -->   265
Iteration   11 | No. of tokens: 832073 | Merged pair: (111, 110) -->   266
Iteration   12 | No. of tokens: 823657 | Merged pair: (121, 32)  -->   267
Iteration   13 | No. of tokens: 815311 | Merged p

In [17]:
basic_tokenizer.merges

{(101, 32): 256,
 (116, 104): 257,
 (100, 32): 258,
 (115, 32): 259,
 (116, 32): 260,
 (105, 110): 261,
 (101, 114): 262,
 (97, 110): 263,
 (44, 32): 264,
 (257, 256): 265,
 (111, 110): 266,
 (121, 32): 267,
 (101, 110): 268,
 (111, 117): 269,
 (111, 32): 270,
 (102, 32): 271,
 (111, 114): 272,
 (46, 32): 273,
 (101, 258): 274,
 (111, 271): 275,
 (97, 114): 276,
 (32, 32): 277,
 (114, 101): 278,
 (263, 258): 279,
 (116, 105): 280,
 (116, 270): 281,
 (261, 103): 282,
 (97, 108): 283,
 (104, 105): 284,
 (115, 116): 285,
 (97, 32): 286,
 (104, 97): 287,
 (10, 10): 288,
 (32, 265): 289,
 (97, 259): 290,
 (97, 260): 291,
 (262, 32): 292,
 (101, 115): 293,
 (111, 109): 294,
 (282, 32): 295,
 (73, 32): 296,
 (99, 104): 297,
 (111, 108): 298,
 (261, 32): 299}

In [18]:
# now lets get 1k compressions just like GPT-2
basic_tokenizer = BasicTokenizer()
basic_tokenizer.train(text=text, vocab_size=1256, verbose=True, verbose_iters=100)

Start | No. of tokens: 1000000 | Vocab size:   256
Iteration  100 | No. of tokens: 576783 | Merged pair: (117, 112) -->   355
Iteration  200 | No. of tokens: 495579 | Merged pair: (266, 103) -->   455
Iteration  300 | No. of tokens: 452955 | Merged pair: (109, 105) -->   555
Iteration  400 | No. of tokens: 424971 | Merged pair: (119, 423) -->   655
Iteration  500 | No. of tokens: 403227 | Merged pair: (99, 326)  -->   755
Iteration  600 | No. of tokens: 386082 | Merged pair: (282, 264) -->   855
Iteration  700 | No. of tokens: 372312 | Merged pair: (668, 405) -->   955
Iteration  800 | No. of tokens: 360795 | Merged pair: (573, 265) -->  1055
Iteration  900 | No. of tokens: 350883 | Merged pair: (84, 73)   -->  1155
Iteration 1000 | No. of tokens: 342119 | Merged pair: (103, 346) -->  1255
Compression : 2.92 X


In [19]:
# lets visualise the merges
with open("basic_merges.txt", "w") as file:
    sorted_merges = sorted(list(basic_tokenizer.merges.items()), key=lambda x: x[1])
    for k, v in sorted_merges:
        file.write(f"[{(basic_tokenizer.vocab[k[0]]).decode('utf-8', errors='replace')}][{(basic_tokenizer.vocab[k[1]]).decode('utf-8', errors='replace')}]  ---->   {v}\n")

In [32]:
for test_string in test_strings:
    test_text = unpack(test_string)
    assert test_text == basic_tokenizer.decode(basic_tokenizer.encode(test_text))

## RegexTokenizer

In [22]:
GPT4_SPLIT_PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"""
GPT4_PATTERN_REGEX = re.compile(GPT4_SPLIT_PATTERN)

In [57]:
class RegexTokenizer:
    def __init__(self):
        self.merges = {}
        self.vocab_size = 256
        self.vocab = {idx: bytes([idx]) for idx in range(self.vocab_size)}

    def train(self, text, vocab_size, verbose=False, verbose_iters=None):
        ## Train the BPE tokenizer.
        assert vocab_size > self.vocab_size # to ensure we have a larger vocab size
        num_merges = vocab_size - self.vocab_size

        split_text = GPT4_PATTERN_REGEX.findall(text)

        tokens_list = [list(t.encode("utf-8")) for t in split_text]
        old_token_length = sum(len(tokens) for tokens in tokens_list)

        if verbose:
            start = "Start"
            print(f"{start:.20s} | No. of tokens: {old_token_length:6d} | Vocab size: {self.vocab_size:5d}")

        for i in range(num_merges):
            # itrate over each token list
            pair_stats_full = {}
            for tokens in tokens_list:
                # first get the stats of the bigrams
                pair_stats = get_stats(tokens)
                # combine
                for k, v in pair_stats.items():
                    pair_stats_full[k] = pair_stats_full.get(k, 0) + v

            # get the max freq bigram
            max_freq_pair = max(pair_stats_full, key=lambda k: pair_stats_full.get(k, -float("inf")))
            # now lets create new tokens by replacing this pair with this
            new_tokens_list = []
            for tokens in tokens_list:
                new_tokens_list.append(replace_pair(tokens, max_freq_pair, self.vocab_size))
            # update the running variables
            self.merges[max_freq_pair] = self.vocab_size
            self.vocab[self.vocab_size] = self.vocab[max_freq_pair[0]] + self.vocab[max_freq_pair[1]]
            self.vocab_size += 1
            tokens_list = list(new_tokens_list)

            if verbose:
                if ((i + 1) % verbose_iters == 0) or (i == num_merges - 1):
                    new_token_length = sum(len(tokens) for tokens in tokens_list)
                    print((f"Iteration {(i + 1):4d} | No. of tokens: {new_token_length:6d} | Merged pair: {str(max_freq_pair):10s} --> {self.merges[max_freq_pair]:5d}"))

        if verbose:
            new_token_length = sum(len(tokens) for tokens in tokens_list)
            compression = old_token_length / new_token_length
            print(f"Compression : {compression:.2f} X")

    def _encode_chunk(self, tokens):
        while len(tokens) >= 2:
            # first get the stats of the bigrams
            pair_stats = get_stats(text)
            # now check if there is a pair which is merged as per our tokenizer
            merge_pair = min(pair_stats, key=lambda k: self.merges.get(k, float("inf"))) # it will check if we get a merge pair candidate, else returns the first element
            # check if there is actually a match
            if self.merges.get(merge_pair) is None:
                break

            # now replace with the merges token
            tokens = replace_pair(tokens, merge_pair, self.merges[merge_pair])

        return tokens

    def _encode_parallel(self, tokens_list, num_processes=None):
        with Pool(processes=num_processes) as pool:
            results = pool.map(self._encode_chunk, tokens_list)
        return results


    def encode(self, text):
        split_text = GPT4_PATTERN_REGEX.findall(text)

        tokens_list = [list(t.encode("utf-8")) for t in split_text]

        encoded_tokens_list = self._encode_parallel(tokens_list)

        final_tokens = [item for sublist in encoded_tokens_list for item in sublist]

        return final_tokens


    def decode(self, ids):
        enc_text = b"".join(self.vocab[id] for id in ids)
        text = enc_text.decode("utf-8", errors="replace")
        return text

In [58]:
# now lets train for some small iterations
regex_tokenizer = RegexTokenizer()
regex_tokenizer.train(text=text, vocab_size=266, verbose=True, verbose_iters=1)

Start | No. of tokens: 1000000 | Vocab size:   256
Iteration    1 | No. of tokens: 975654 | Merged pair: (32, 116)  -->   256
Iteration    2 | No. of tokens: 954925 | Merged pair: (104, 101) -->   257
Iteration    3 | No. of tokens: 938057 | Merged pair: (32, 97)   -->   258
Iteration    4 | No. of tokens: 924441 | Merged pair: (105, 110) -->   259
Iteration    5 | No. of tokens: 911626 | Merged pair: (256, 257) -->   260
Iteration    6 | No. of tokens: 900809 | Merged pair: (32, 111)  -->   261
Iteration    7 | No. of tokens: 890173 | Merged pair: (114, 101) -->   262
Iteration    8 | No. of tokens: 879576 | Merged pair: (32, 119)  -->   263
Iteration    9 | No. of tokens: 869899 | Merged pair: (32, 115)  -->   264
Iteration   10 | No. of tokens: 861074 | Merged pair: (101, 114) -->   265
Compression : 1.16 X


In [60]:
# now lets train for 1k iterations
regex_tokenizer = RegexTokenizer()
regex_tokenizer.train(text=text, vocab_size=1256, verbose=True, verbose_iters=100)

Start | No. of tokens: 1000000 | Vocab size:   256
Iteration  100 | No. of tokens: 580314 | Merged pair: (97, 109)  -->   355
Iteration  200 | No. of tokens: 501016 | Merged pair: (32, 118)  -->   455
Iteration  300 | No. of tokens: 458709 | Merged pair: (453, 110) -->   555
Iteration  400 | No. of tokens: 430368 | Merged pair: (459, 110) -->   655
Iteration  500 | No. of tokens: 410025 | Merged pair: (300, 408) -->   755
Iteration  600 | No. of tokens: 394498 | Merged pair: (103, 103) -->   855
Iteration  700 | No. of tokens: 382035 | Merged pair: (290, 107) -->   955
Iteration  800 | No. of tokens: 371558 | Merged pair: (341, 285) -->  1055
Iteration  900 | No. of tokens: 362520 | Merged pair: (735, 1080) -->  1155
Iteration 1000 | No. of tokens: 354597 | Merged pair: (109, 98)  -->  1255
Compression : 2.82 X


In [61]:
# lets visualise the merges
with open("regex_merges.txt", "w") as file:
    sorted_merges = sorted(list(regex_tokenizer.merges.items()), key=lambda x: x[1])
    for k, v in sorted_merges:
        file.write(f"[{(regex_tokenizer.vocab[k[0]]).decode('utf-8', errors='replace')}][{(regex_tokenizer.vocab[k[1]]).decode('utf-8', errors='replace')}]  ---->   {v}\n")

In [59]:
for i, test_string in enumerate(test_strings):
    test_text = unpack(test_string)
    try:
        assert test_text == regex_tokenizer.decode(regex_tokenizer.encode(test_text))
        print(f"Test string: {i} Passed! :)")
    except AssertionError:
        print(f"Test string: {i} Failed! :(")
    assert test_text == regex_tokenizer.decode(regex_tokenizer.encode(test_text))

Test string: 0 Passed! :)
Test string: 1 Passed! :)
Test string: 2 Passed! :)
Test string: 3 Passed! :)


Process ForkPoolWorker-19:


KeyboardInterrupt: 

In [53]:


def your_function(x):
    # Your processing logic here
    return x * x  # example function

def process_list_parallel(input_list, num_processes=None):
    with Pool(processes=num_processes) as pool:
        results = pool.map(your_function, input_list)
    return results

# Usage
if __name__ == '__main__':
    my_list = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    results = process_list_parallel(my_list)
    print(results)

[1, 4, 9, 16, 25, 36, 49, 64, 81, 100]


In [56]:
os.cpu_count()

2