# exercise

Build your own GPT-4 Tokenizer!

In [1]:
!pip install tiktoken
import regex as re
import tiktoken


Collecting tiktoken
  Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.7.0


In [2]:
# -nc (no-clobber) keeps an already-downloaded copy, so re-running this cell
# does not pile up taylorswift.txt.1, taylorswift.txt.2, ... duplicates.
! wget -nc "https://raw.githubusercontent.com/karpathy/minbpe/master/tests/taylorswift.txt"

--2024-06-08 20:01:03--  https://raw.githubusercontent.com/karpathy/minbpe/master/tests/taylorswift.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 185768 (181K) [text/plain]
Saving to: 'taylorswift.txt'


2024-06-08 20:01:03 (1.72 MB/s) - 'taylorswift.txt' saved [185768/185768]



In [3]:
# Read the corpus explicitly as UTF-8 (instead of the platform default) and
# let the context manager close the file handle as soon as it has been read.
with open("taylorswift.txt", encoding="utf-8") as corpus_file:
    text = corpus_file.read()
# Flatten the corpus onto one line: every newline becomes a single space.
text = text.replace('\n', ' ')


### Step 1

Write the `BasicTokenizer` class, with the following three core functions:

- `def train(self, text, vocab_size, verbose=False)`
- `def encode(self, text)`
- `def decode(self, ids)`

Train your tokenizer on whatever text you like and visualize the merged tokens. Do they look reasonable? One default test you may wish to use is the text file `tests/taylorswift.txt`.


In [4]:
class Tokenizer:
    """Shared byte-pair-encoding helpers used by the concrete tokenizers."""

    def count_pairs(self, tk, pairs=None):
        """Tally every adjacent pair of items in ``tk``.

        Args:
            tk: sequence of token ids.
            pairs: optional dict to accumulate into; it is updated in place,
                which lets a caller sum the counts of several sequences.

        Returns:
            The dict mapping ``(left, right)`` to its number of occurrences
            (the same object as ``pairs`` when one was passed in).
        """
        if pairs is None:
            pairs = {}
        for neighbours in zip(tk, tk[1:]):
            pairs[neighbours] = pairs.get(neighbours, 0) + 1
        return pairs

    def merge(self, tk, target, idx):
        """Return a new list in which each occurrence of ``target`` is replaced by ``idx``.

        The scan goes left to right and a replaced pair is consumed whole, so
        overlapping candidates are resolved greedily: ``[1, 1, 1]`` merged on
        ``(1, 1)`` becomes ``[idx, 1]``.

        Args:
            tk: sequence of token ids.
            target: the ``(left, right)`` pair to replace.
            idx: the token id written in place of the pair.
        """
        out = []
        pos = 0
        end = len(tk)
        while pos < end:
            has_next = pos + 1 < end
            if has_next and (tk[pos], tk[pos + 1]) == target:
                out.append(idx)
                pos += 2
                continue
            out.append(tk[pos])
            pos += 1
        return out

In [5]:
class BasicTokenizer(Tokenizer):
    """Byte-level BPE tokenizer trained directly on the UTF-8 bytes of a text.

    Attributes:
        merges: maps a pair of token ids ``(left, right)`` to the id that
            replaces it; a lower id means the merge was learned earlier.
        encoding: maps each token id to the bytes it stands for; ids 0-255
            are the single bytes themselves.
    """

    def __init__(self):
        self.merges = {}
        self.encoding = {i: bytes([i]) for i in range(256)}

    def train(self, text, vocab_size, verbose=False):
        """Learn up to ``vocab_size - 256`` merges from ``text``.

        Each round replaces the most frequent adjacent pair of ids with a new
        id (256, 257, ...). Training stops early when no adjacent pair is
        left, so a short text no longer makes ``max()`` fail on an empty dict.
        Any previously learned merges are discarded first, so calling
        ``train`` again cannot leave stale pairs pointing at reused ids.

        Args:
            text: training text (str).
            vocab_size: target vocabulary size; values of 256 or less learn
                no merges.
            verbose: when true, print one line per learned merge.
        """
        merges = {}
        encoding = {i: bytes([i]) for i in range(256)}
        ids = list(text.encode("utf-8"))
        for new_id in range(256, vocab_size):
            pairs = self.count_pairs(ids)
            if not pairs:
                # Fewer than two ids remain: nothing left to merge.
                break
            target = max(pairs, key=pairs.get)
            ids = self.merge(ids, target, new_id)
            merges[target] = new_id
            encoding[new_id] = encoding[target[0]] + encoding[target[1]]
            if verbose:
                print(f'merge {new_id - 255}: {target} -> {new_id} '
                      f'({encoding[new_id]}) had {pairs[target]} occurrences')
        self.merges = merges
        self.encoding = encoding

        print(f'merges done: {len(self.merges)}')

    def encode(self, text):
        """Convert ``text`` to a list of token ids.

        Starts from the UTF-8 bytes and repeatedly applies, among the adjacent
        pairs currently present, the merge with the lowest id (the one learned
        first) until no present pair has a learned merge.
        """
        ids = list(text.encode("utf-8"))
        while len(ids) >= 2:
            stats = self.count_pairs(ids)
            # Pairs without a learned merge rank as +inf and are never preferred.
            pair = min(stats, key=lambda p: self.merges.get(p, float("inf")))
            if pair not in self.merges:
                break
            ids = self.merge(ids, pair, self.merges[pair])
        return ids

    def decode(self, ids):
        """Convert token ids back to a string.

        Byte sequences that are not valid UTF-8 (possible when decoding a
        single token on its own) are replaced rather than raising.

        Raises:
            KeyError: if an id is not present in ``self.encoding``.
        """
        text_bytes = b"".join(self.encoding[idx] for idx in ids)
        return text_bytes.decode('utf-8', errors='replace')

In [6]:
# Untrained tokenizer: until train() runs it has no merges, only the 256 single-byte tokens.
t = BasicTokenizer()

In [7]:
%%time
# vocab_size=5000 asks for 5000 - 256 = 4744 merges on top of the 256 byte tokens.
t.train(text, 5000)

merges done: 4744
CPU times: user 5min 53s, sys: 91 ms, total: 5min 53s
Wall time: 5min 53s


In [8]:
string = """In The New Yorker in 2011, Swift said she identifies as a songwriter first: "I write songs, and my voice is just a way to get those lyrics across". Her personal experiences were a common inspiration for her early songs, which helped her navigate life."""
# Decode each id on its own so the boundaries of the learned tokens are visible.
out = [t.decode([token_id]) for token_id in t.encode(string)]
# Token count on the first line, the token pieces on the second.
print(len(out), out, sep="\n")

52
['In ', 'The New Yorker ', 'in 2011', ', Swift ', 'said she ', 'identifies ', 'as a songwrit', 'er ', 'fir', 'st', ': "', 'I ', 'write ', 'song', 's, and ', 'my ', 'voice ', 'is ', 'just ', 'a ', 'way ', 'to ', 'get ', 'th', 'o', 'se ', 'lyrics ', 'ac', 'ross', '". ', 'Her ', 'personal ', 'experi', 'ences ', 'were ', 'a ', 'common ', 'inspir', 'ation ', 'for her ', 'early ', 'song', 's, which ', 'help', 'ed her ', 'n', 'av', 'ig', 'ate ', 'li', 'fe', '.']



### Step 2

Convert your `BasicTokenizer` into a `RegexTokenizer`, which takes a regex pattern and splits the text exactly as GPT-4 would. Process the parts separately as before, then concatenate the results. Retrain your tokenizer and compare the results before and after. You should now see no tokens that cross categories (numbers, letters, punctuation, more than one whitespace). Use the GPT-4 pattern:

```
GPT4_SPLIT_PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"""
```

In [9]:
class RegexTokenizer(Tokenizer):
    """BPE tokenizer that first splits text into chunks with a regex.

    Merges are learned and applied inside each chunk only, so no token spans
    a chunk boundary. The default pattern is the GPT-4 split pattern; it uses
    Unicode property classes and possessive quantifiers, so the module bound
    to ``re`` in this notebook must be the third-party ``regex`` package.

    Attributes:
        merges: maps a pair of token ids ``(left, right)`` to the id that
            replaces it; a lower id means the merge was learned earlier.
        encoding: maps each token id to the bytes it stands for; ids 0-255
            are the single bytes themselves.
        regx: the split pattern handed to ``re.findall``.
    """

    def __init__(self, pattern=None):
        """Create an untrained tokenizer.

        Args:
            pattern: optional split pattern; defaults to the GPT-4 pattern.
                NOTE(review): it is passed to ``re.findall``, so it should
                not contain capturing groups (findall would then return the
                groups instead of the whole match).
        """
        self.merges = {}
        self.encoding = {i: bytes([i]) for i in range(256)}
        if pattern is None:
            pattern = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"""
        self.regx = pattern

    def train(self, text, vocab_size, verbose=False):
        """Learn up to ``vocab_size - 256`` merges from the chunks of ``text``.

        Pair counts are summed over all chunks; each round replaces the most
        frequent pair with a new id (256, 257, ...). Training stops early when
        no chunk has two ids left, so ``max()`` is never called on an empty
        dict. Previously learned merges are discarded first, so calling
        ``train`` again cannot leave stale pairs pointing at reused ids.

        Args:
            text: training text (str).
            vocab_size: target vocabulary size; values of 256 or less learn
                no merges.
            verbose: when true, print one line per learned merge.
        """
        merges = {}
        encoding = {i: bytes([i]) for i in range(256)}
        chunks = re.findall(self.regx, text)
        ids = [list(chunk.encode("utf-8")) for chunk in chunks]
        for new_idx in range(256, vocab_size):
            pairs = {}
            for tks in ids:
                # count_pairs accumulates into the shared dict in place.
                self.count_pairs(tks, pairs)
            if not pairs:
                # Every chunk is down to a single id: nothing left to merge.
                break
            target = max(pairs, key=pairs.get)
            ids = [self.merge(tks, target, new_idx) for tks in ids]
            merges[target] = new_idx
            encoding[new_idx] = encoding[target[0]] + encoding[target[1]]
            if verbose:
                print(f'merge {new_idx - 255}: {target} -> {new_idx} '
                      f'({encoding[new_idx]}) had {pairs[target]} occurrences')
        self.merges = merges
        self.encoding = encoding

    def encode_(self, text):
        """Encode one chunk of text without applying the split pattern.

        Starts from the UTF-8 bytes and repeatedly applies, among the adjacent
        pairs currently present, the merge with the lowest id (the one learned
        first) until no present pair has a learned merge.
        """
        ids = list(text.encode("utf-8"))
        while len(ids) >= 2:
            stats = self.count_pairs(ids)
            # Pairs without a learned merge rank as +inf and are never preferred.
            pair = min(stats, key=lambda p: self.merges.get(p, float("inf")))
            if pair not in self.merges:
                break
            ids = self.merge(ids, pair, self.merges[pair])
        return ids

    def encode(self, text):
        """Split ``text`` with the pattern, encode each chunk, and concatenate the ids."""
        ids = []
        for chunk in re.findall(self.regx, text):
            ids.extend(self.encode_(chunk))
        return ids

    def decode(self, ids):
        """Convert token ids back to a string.

        Byte sequences that are not valid UTF-8 (possible when decoding a
        single token on its own) are replaced rather than raising.

        Raises:
            KeyError: if an id is not present in ``self.encoding``.
        """
        text_bytes = b"".join(self.encoding[idx] for idx in ids)
        return text_bytes.decode('utf-8', errors='replace')

In [10]:
# Untrained tokenizer: until train() runs it has no merges, only the 256 single-byte tokens.
t1 = RegexTokenizer()

In [11]:
%%time
# Same corpus and vocab_size as the BasicTokenizer run above, so the two tokenizations are comparable.
t1.train(text, 5000)

CPU times: user 11min 4s, sys: 156 ms, total: 11min 4s
Wall time: 11min 4s


In [12]:
string = """In The New Yorker in 2011, Swift said she identifies as a songwriter first: "I write songs, and my voice is just a way to get those lyrics across". Her personal experiences were a common inspiration for her early songs, which helped her navigate life."""
# Decode each id on its own so the boundaries of the learned tokens are visible.
out = [t1.decode([token_id]) for token_id in t1.encode(string)]
# Token count on the first line, the token pieces on the second.
print(len(out), out, sep="\n")

60
['In', ' The', ' New', ' Yorker', ' in', ' ', '201', '1', ',', ' Swift', ' said', ' she', ' identifies', ' as', ' a', ' songwriter', ' first', ':', ' "', 'I', ' write', ' songs', ',', ' and', ' my', ' voice', ' is', ' just', ' a', ' way', ' to', ' get', ' those', ' lyrics', ' ac', 'ross', '".', ' Her', ' personal', ' experi', 'ences', ' were', ' a', ' common', ' insp', 'iration', ' for', ' her', ' early', ' songs', ',', ' which', ' helped', ' her', ' n', 'av', 'ig', 'ate', ' life', '.']


In [13]:
# list(t1.encoding.items())[250:]
# t1.merges