In [124]:
import regex as re
# Load the training corpus; the context manager closes the file handle as soon
# as the read finishes instead of leaving it open for the life of the kernel.
with open("train.txt", "r", encoding="utf-8") as train_file:
    text = train_file.read()

In [107]:
import regex as re

class GPT4Tokenizer:
    """Byte-level BPE tokenizer that pre-splits text with a GPT-4 style regex.

    Text is first cut into chunks by ``self.pattern``; byte-pair merges are
    learned and applied inside each chunk only, so a merge never spans a
    chunk boundary.

    The default pattern uses ``\\p{...}`` classes and possessive quantifiers,
    so ``re`` must be the third-party ``regex`` module (this notebook imports
    it as ``import regex as re``).

    Attributes:
        pattern: Regex used to split text into chunks.
        vocab_size: Target vocabulary size (256 byte tokens + learned merges).
        merges: Maps a token-id pair ``(left, right)`` to the merged token id.
        vocab: Maps each token id to the bytes it stands for.
    """

    def __init__(self, vocab_size=1024, pattern=None):
        """Create an untrained tokenizer.

        Args:
            vocab_size: Target vocabulary size; must be at least 256 by the
                time ``train`` is called. Defaults to 1024.
            pattern: Optional replacement for the chunk-splitting regex.
                ``None`` selects the built-in GPT-4 style pattern.
        """
        if pattern is None:
            pattern = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"""
        self.pattern = pattern
        self.vocab_size = vocab_size
        self.merges = {}
        # Seed the vocab with the 256 single-byte tokens so that an untrained
        # tokenizer can still decode what its own encode() produces.
        self.vocab = {idx: bytes([idx]) for idx in range(256)}

    def get_stats(self, tokens, stats=None):
        """Count adjacent token pairs in ``tokens``.

        Args:
            tokens: Sequence of token ids.
            stats: Optional dict to accumulate into (updated in place); a new
                dict is created when omitted.

        Returns:
            The dict mapping ``(left, right)`` pairs to occurrence counts.
        """
        if stats is None:
            stats = {}
        for pair in zip(tokens, tokens[1:]):
            stats[pair] = stats.get(pair, 0) + 1
        return stats

    def merge(self, tokens, pair, new_index):
        """Return a copy of ``tokens`` with each occurrence of ``pair`` merged.

        Occurrences are matched left to right without overlap, so
        ``[a, a, a]`` with pair ``(a, a)`` becomes ``[new_index, a]``.

        Args:
            tokens: Sequence of token ids.
            pair: The ``(left, right)`` pair to replace.
            new_index: Token id that replaces each matched pair.

        Returns:
            A new list of token ids.
        """
        merged = []
        i = 0
        last = len(tokens) - 1
        while i < len(tokens):
            if i < last and tokens[i] == pair[0] and tokens[i + 1] == pair[1]:
                merged.append(new_index)
                i += 2
            else:
                merged.append(tokens[i])
                i += 1
        return merged

    def train(self, text, verbose=False):
        """Learn up to ``vocab_size - 256`` merges from ``text``.

        Any previously learned merges and vocabulary are discarded first.
        Training stops early when no adjacent pair is left to merge, in which
        case the resulting vocabulary is smaller than ``vocab_size``.

        Args:
            text: Training corpus as a string.
            verbose: When true, print one line per learned merge.
        """
        assert self.vocab_size >= 256
        num_merges = self.vocab_size - 256

        text_chunks = re.findall(self.pattern, text)
        tokens = [list(chunk.encode('utf-8')) for chunk in text_chunks]

        # Reset both tables together so a retrain never leaves stale merges
        # that point at token ids missing from the rebuilt vocab.
        self.merges = {}
        self.vocab = {idx: bytes([idx]) for idx in range(256)}

        for i in range(num_merges):
            stats = {}
            for chunk_token in tokens:
                self.get_stats(chunk_token, stats)
            if not stats:
                # Every chunk has collapsed to a single token: nothing left
                # to merge, and max() on an empty dict would raise.
                break
            top_pair = max(stats, key=stats.get)
            index = 256 + i
            if verbose:
                print(f"merged : {top_pair} -> {index}")

            tokens = [self.merge(chunk_token, top_pair, index) for chunk_token in tokens]

            self.vocab[index] = self.vocab[top_pair[0]] + self.vocab[top_pair[1]]
            self.merges[top_pair] = index

    def encode_chunks(self, chunk_bytes):
        """Encode the bytes of one chunk into token ids.

        Repeatedly applies the earliest-learned merge (lowest merged id) that
        is present in the chunk until no learned merge applies.

        Args:
            chunk_bytes: UTF-8 bytes of a single regex chunk.

        Returns:
            List of token ids for the chunk.
        """
        chunk_token = list(chunk_bytes)
        while len(chunk_token) >= 2:
            stats = self.get_stats(chunk_token)
            # Pairs that were never learned rank as +inf, so they are only
            # picked when no learned merge is present in the chunk.
            pair = min(stats, key=lambda p: self.merges.get(p, float("inf")))
            if pair not in self.merges:
                break
            chunk_token = self.merge(chunk_token, pair, self.merges[pair])
        return chunk_token

    def encode(self, text):
        """Encode ``text`` into a flat list of token ids.

        Args:
            text: String to tokenize.

        Returns:
            Token ids of all chunks, concatenated in order.
        """
        tokens = []
        for chunk in re.findall(self.pattern, text):
            tokens.extend(self.encode_chunks(chunk.encode("utf-8")))
        return tokens

    def decode(self, tokens):
        """Decode token ids back into a string.

        Args:
            tokens: Iterable of token ids.

        Returns:
            The decoded text. Byte sequences that are not valid UTF-8 are
            replaced with U+FFFD rather than raising.

        Raises:
            ValueError: If a token id is not in the vocabulary.
        """
        chunk_bytes = []
        for token in tokens:
            if token in self.vocab:
                chunk_bytes.append(self.vocab[token])
            else:
                raise ValueError(f"Invalid token id: {token}")

        b_tokens = b"".join(chunk_bytes)
        return b_tokens.decode('utf-8', errors="replace")
    

In [108]:
# Build a tokenizer with the class defaults (target vocab_size of 1024).
tokenizer = GPT4Tokenizer()

In [109]:
# Learn the BPE merges from the training corpus; verbose=True prints one
# "merged : pair -> id" line per learned merge.
tokenizer.train(text, verbose=True)

merged : (101, 114) -> 256
merged : (50, 48) -> 257
merged : (111, 114) -> 258
merged : (105, 110) -> 259
merged : (101, 100) -> 260
merged : (32, 116) -> 261
merged : (111, 110) -> 262
merged : (104, 101) -> 263
merged : (32, 83) -> 264
merged : (97, 114) -> 265
merged : (97, 110) -> 266
merged : (32, 65) -> 267
merged : (261, 263) -> 268
merged : (97, 108) -> 269
merged : (114, 105) -> 270
merged : (118, 260) -> 271
merged : (115, 116) -> 272
merged : (119, 105) -> 273
merged : (32, 82) -> 274
merged : (257, 49) -> 275
merged : (32, 102) -> 276
merged : (257, 50) -> 277
merged : (32, 84) -> 278
merged : (102, 116) -> 279
merged : (97, 121) -> 280
merged : (32, 34) -> 281
merged : (273, 279) -> 282
merged : (101, 116) -> 283
merged : (264, 282) -> 284
merged : (99, 104) -> 285
merged : (98, 256) -> 286
merged : (97, 116) -> 287
merged : (111, 109) -> 288
merged : (101, 115) -> 289
merged : (101, 110) -> 290
merged : (101, 109) -> 291
merged : (34, 46) -> 292
merged : (32, 40) -> 293
m

In [117]:
# Sample input mixing letters, digits, and punctuation.
Input_text = "Hi, can you34 help me?"

In [118]:
# Encode the sample string into token ids; the bare expression displays them.
tokens = tokenizer.encode(Input_text)
tokens

[72, 105, 44, 349, 266, 598, 309, 51, 52, 950, 344, 101, 63]

In [120]:
# Decode the ids held in `tokens` rather than a pasted copy of an earlier
# output, so the round trip stays valid whenever the tokenizer is retrained.
tokenizer.decode(tokens)

'Hi, can you34 help me?'

In [121]:
# Inspect the vocabulary (token id -> bytes); this displays every entry.
tokenizer.vocab

{0: b'\x00',
 1: b'\x01',
 2: b'\x02',
 3: b'\x03',
 4: b'\x04',
 5: b'\x05',
 6: b'\x06',
 7: b'\x07',
 8: b'\x08',
 9: b'\t',
 10: b'\n',
 11: b'\x0b',
 12: b'\x0c',
 13: b'\r',
 14: b'\x0e',
 15: b'\x0f',
 16: b'\x10',
 17: b'\x11',
 18: b'\x12',
 19: b'\x13',
 20: b'\x14',
 21: b'\x15',
 22: b'\x16',
 23: b'\x17',
 24: b'\x18',
 25: b'\x19',
 26: b'\x1a',
 27: b'\x1b',
 28: b'\x1c',
 29: b'\x1d',
 30: b'\x1e',
 31: b'\x1f',
 32: b' ',
 33: b'!',
 34: b'"',
 35: b'#',
 36: b'$',
 37: b'%',
 38: b'&',
 39: b"'",
 40: b'(',
 41: b')',
 42: b'*',
 43: b'+',
 44: b',',
 45: b'-',
 46: b'.',
 47: b'/',
 48: b'0',
 49: b'1',
 50: b'2',
 51: b'3',
 52: b'4',
 53: b'5',
 54: b'6',
 55: b'7',
 56: b'8',
 57: b'9',
 58: b':',
 59: b';',
 60: b'<',
 61: b'=',
 62: b'>',
 63: b'?',
 64: b'@',
 65: b'A',
 66: b'B',
 67: b'C',
 68: b'D',
 69: b'E',
 70: b'F',
 71: b'G',
 72: b'H',
 73: b'I',
 74: b'J',
 75: b'K',
 76: b'L',
 77: b'M',
 78: b'N',
 79: b'O',
 80: b'P',
 81: b'Q',
 82: b'R',
 83: b'

In [123]:
# Round-trip check: encoding then decoding a longer passage should give back
# exactly the original string.
raw_text = "The academic discipline of artificial intelligence was established at a research workshop held at Dartmouth College in 1956 and has experienced several waves of advancement and optimism in the decades since.[20] Since its inception, researchers in the field have raised philosophical and ethical arguments about the nature of the human mind and the consequences of creating artificial beings with human-like intelligence; these issues have previously been explored by myth, fiction and philosophy since antiquity.[21] The concept of automated art dates back at least to the automata of ancient Greek civilization, where inventors such as Daedalus and Hero of Alexandria were described as having designed machines capable of writing text, generating sounds, and playing music.[22][23] The tradition of creative automatons has flourished throughout history, exemplified by Maillardet's automaton created in the early 1800s.[24]"
d_text = tokenizer.decode(tokenizer.encode(raw_text))
# Prints True when the decoded text matches the input.
print(d_text == raw_text)

True
