In [None]:
import regex as re
from .base import Tokenizer, get_stats, merge


# the main GPT text split patterns, see
# https://github.com/openai/tiktoken/blob/main/tiktoken_ext/openai_public.py
# Both patterns rely on \p{L} (letters) and \p{N} (numbers) property classes,
# so they are compiled with the 'regex' package imported above as `re`.
# GPT-2 alternatives, in order: contractions ('s 'd 'm 't 'll 've 're),
# optional space + letters, optional space + numbers, optional space + other
# non-whitespace symbols, then runs of whitespace.
GPT2_SPLIT_PATTERN = r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
# GPT-4 differences visible in the pattern: contractions are matched
# case-insensitively ((?i:...)), numbers are taken at most three characters at
# a time (\p{N}{1,3}), line breaks (\r, \n) get their own handling, and
# possessive quantifiers (?+ and ++) are used.
GPT4_SPLIT_PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"""


class RegexTokenizer(Tokenizer):
    """Tokenizer that first splits text into chunks with a regex pattern."""

    def __init__(self, pattern=None):
        """
        Store the split pattern, compile it once, and start with empty
        special-token tables.

        - pattern: optional regex string that overrides the default
          (GPT4_SPLIT_PATTERN is used when pattern is None)

        Attributes set here:
        - self.pattern: the pattern string actually in use
        - self.compiled_pattern: result of re.compile(self.pattern)
        - self.special_tokens: str -> int dictionary of special tokens,
          initially empty; example content: {'<|endoftext|>': 100257}
        - self.inverse_special_tokens: companion dictionary, initially empty
        """
        super().__init__()
        # Fall back to the GPT-4 pattern only when the caller gave nothing.
        if pattern is None:
            self.pattern = GPT4_SPLIT_PATTERN
        else:
            self.pattern = pattern
        # Compile up front so later calls reuse the same pattern object.
        self.compiled_pattern = re.compile(self.pattern)
        self.special_tokens = {}
        self.inverse_special_tokens = {}

I’ll explain **only** this part in very simple English, step by step, with a short explanation of each advanced word.

---

## `re.compile(self.pattern)` — simple meaning

This line **turns the regex (regular expression) string into a “compiled pattern object.”**

Beginner meaning:

It **prepares the regex** so Python can run it fast later.

When you use a regex many times (like in a tokenizer), Python normally must:

* read the regex string
* parse it
* convert it to an internal format
* then run it

That is slow.

`re.compile()` makes Python **do the parsing only once**.

Then later, you can use:

* `self.compiled_pattern.findall(text)`
* `self.compiled_pattern.split(text)`
* `self.compiled_pattern.search(text)`

These run much faster.

---

## Think of it like a “pre-built machine”

Imagine the regex pattern is a **blueprint**, and `re.compile` builds the **machine** from the blueprint.

After the machine is built, using it is fast.

---

## What exactly happens inside Python?

Here is a simple, clear breakdown.

1. You give Python a regex string.
   Example: `r"\p{L}+"`

2. Python reads the string character by character.

3. Python converts it into an internal structure (like bytecode for regex).
   Beginner explanation: It makes a small “program” that can match text.

4. Python returns a **compiled regex object**.
   Example: `<regex.Pattern object>`

5. This object is stored in `self.compiled_pattern`.

From now on, the tokenizer will use the **compiled** version, not the raw string.

---

## Why the tokenizer needs it

The tokenizer will call:

`re.findall(self.compiled_pattern, text)`


many, many times.

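If the pattern is not compiled ahead of time, every call has to start from the raw string: Python must look the pattern up in its internal cache **every time** (and compile it again if it is no longer cached), which adds extra work to each call and makes tokenization slower.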

So:

* `re.compile` = compile once, use many times
* Faster tokenization
* Less CPU work
* More stable behavior

---

In [3]:
def get_stats(ids, counts=None):
    """
    Count how often each pair of neighbouring integers occurs in ids.
    Example: [1, 2, 3, 1, 2] -> {(1, 2): 2, (2, 3): 1, (3, 1): 1}

    If counts is given, that dictionary is updated in place and returned;
    otherwise a fresh dictionary is created.
    """
    if counts is None:
        counts = {}
    # Pair every element with its right-hand neighbour.
    for left, right in zip(ids, ids[1:]):
        key = (left, right)
        counts[key] = counts.get(key, 0) + 1
    return counts

In [4]:
def merge(ids, pair, idx):
    """
    Return a new list built from ids in which every consecutive occurrence
    of pair is collapsed into the single integer token idx. The input list
    is scanned left to right and is not modified.
    Example: ids=[1, 2, 3, 1, 2], pair=(1, 2), idx=4 -> [4, 3, 4]
    """
    merged = []
    total = len(ids)
    pos = 0
    while pos < total:
        # A hit needs a following element, so the last position never matches.
        hit = ids[pos] == pair[0] and pos < total - 1 and ids[pos + 1] == pair[1]
        if not hit:
            merged.append(ids[pos])
            pos += 1
            continue
        # Both elements of the pair are consumed and replaced by idx.
        merged.append(idx)
        pos += 2
    return merged

# If vocab_size is less than 256, the assert raises an AssertionError.

# For example: vocab_size = 200 -----> 200 >= 256 is False

# num_merges is the number of times the merge loop has to run.

# If vocab_size = 300, there are 44 new tokens, and each one needs a new id.



In [22]:
# DO NOT use: import re
import regex as re  # <--- This is the key change

# GPT-4 split pattern used to cut the text into chunks.
GPT4_SPLIT_PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"""

# Compiles successfully because `re` is the 'regex' library here, which supports \p
compiled_pattern = re.compile(GPT4_SPLIT_PATTERN)

text = "Hello, world!"

# Split the text using the pattern compiled with the 'regex' library.
text_chunks = compiled_pattern.findall(text)

# One list of UTF-8 byte values per chunk.
ids = [list(chunk.encode("utf-8")) for chunk in text_chunks]

print(f"text_chunks: {text_chunks}")
print(f"ids: {ids}")

text_chunks: ['Hello', ',', ' world', '!']
ids: [[72, 101, 108, 108, 111], [44], [32, 119, 111, 114, 108, 100], [33]]


In [None]:
import re

def train_bpe(text, vocab_size, verbose=False):
    """
    Standalone version of RegexTokenizer.train
    Returns: (merges, vocab)
    """
    # 1. Configuration
    assert vocab_size >= 256
    
    num_merges = vocab_size - 256
    GPT4_SPLIT_PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"""
    compiled_pattern = re.compile(GPT4_SPLIT_PATTERN)
    
    print(f"[Info] Starting training. Target merges: {num_merges}")
    
    text_chunks = re.findall(compiled_pattern, text)  # example: ['Hello', ',', ' world', '!']
    
    if verbose:
        print(f"[Info] Text split into {len(text_chunks)} chunks: {text_chunks}")
        
    merges = {} # (int, int) -> int
    vocab = {idx: bytes([idx]) for idx in range(256)} # idx -> bytes
    
    