### Basic Tokenizer

### Text

In [16]:
# Load the training corpus; the context manager closes the file handle
# as soon as the contents have been read.
with open("train.txt", "r", encoding="utf-8") as train_file:
    text = train_file.read()

##### Find consecutive pairs

In [17]:
def get_stats(token_ids):
    """Count how often each pair of adjacent tokens occurs.

    Args:
        token_ids: Sequence of token ids (must support slicing).

    Returns:
        dict mapping ``(left, right)`` tuples to their occurrence count,
        keyed in order of first appearance.
    """
    pair_counts = {}
    for left, right in zip(token_ids, token_ids[1:]):
        key = (left, right)
        if key in pair_counts:
            pair_counts[key] += 1
        else:
            pair_counts[key] = 1
    return pair_counts

##### Merge token_ids

In [18]:
def merge(token_ids, pair, new_index):
    """Replace every non-overlapping occurrence of ``pair`` with ``new_index``.

    The sequence is scanned left to right, so in a run such as ``[a, a, a]``
    with ``pair == (a, a)`` only the first two elements are merged.

    Args:
        token_ids: Sequence of token ids to rewrite (not modified).
        pair: Two-element sequence ``(left, right)`` to search for.
        new_index: Token id that replaces each matched pair.

    Returns:
        A new list of token ids with the replacements applied.
    """
    merged_ids = []
    last_position = len(token_ids) - 1
    position = 0
    while position <= last_position:
        is_match = (
            position < last_position
            and token_ids[position] == pair[0]
            and token_ids[position + 1] == pair[1]
        )
        if not is_match:
            # No pair starts here: copy the token through unchanged.
            merged_ids.append(token_ids[position])
            position += 1
            continue
        # Pair found: emit the merged id and skip both members.
        merged_ids.append(new_index)
        position += 2
    return merged_ids

#### Training

In [19]:
def train(text, vocab_size, verbose=False):
    """Learn byte-pair-encoding merges from ``text``.

    Starting from the 256 raw byte values, the most frequent adjacent pair
    is repeatedly replaced by a new token id (256, 257, ...) until
    ``vocab_size`` ids exist or there is nothing left to merge.

    Args:
        text: Training corpus as a ``str``; it is encoded to UTF-8 bytes.
        vocab_size: Target vocabulary size, must be at least 256.
        verbose: When true, print each merge as it is learned.

    Returns:
        dict mapping ``(left, right)`` token-id pairs to the new token id,
        in the order the merges were learned. It may hold fewer than
        ``vocab_size - 256`` entries if the corpus runs out of pairs.
    """
    assert vocab_size >= 256
    num_merges = vocab_size - 256
    token_ids = list(text.encode('utf-8'))

    merges = {}
    for i in range(num_merges):
        stats = get_stats(token_ids)
        if not stats:
            # Fewer than two tokens remain, so there is nothing to merge;
            # calling max() on the empty dict would raise ValueError.
            break
        top_pair = max(stats, key=stats.get)
        index = 256 + i
        if verbose:
            print(f"merged : {top_pair} -> {index}")
        token_ids = merge(token_ids, top_pair, index)
        merges[top_pair] = index
    return merges
        

In [20]:
# Learn merges for a 1024-token vocabulary (1024 - 256 = 768 merges),
# printing each merge as it is learned.
merges = train(text, 1024, verbose=True)

merged : (101, 32) -> 256
merged : (44, 32) -> 257
merged : (100, 32) -> 258
merged : (46, 32) -> 259
merged : (114, 32) -> 260
merged : (50, 48) -> 261
merged : (115, 32) -> 262
merged : (105, 110) -> 263
merged : (111, 110) -> 264
merged : (114, 105) -> 265
merged : (116, 32) -> 266
merged : (116, 104) -> 267
merged : (101, 258) -> 268
merged : (257, 261) -> 269
merged : (97, 110) -> 270
merged : (97, 114) -> 271
merged : (101, 260) -> 272
merged : (121, 32) -> 273
merged : (97, 108) -> 274
merged : (267, 256) -> 275
merged : (118, 268) -> 276
merged : (119, 105) -> 277
merged : (101, 114) -> 278
merged : (264, 32) -> 279
merged : (277, 102) -> 280
merged : (82, 101) -> 281
merged : (83, 280) -> 282
merged : (111, 260) -> 283
merged : (99, 104) -> 284
merged : (269, 49) -> 285
merged : (111, 109) -> 286
merged : (98, 272) -> 287
merged : (32, 275) -> 288
merged : (97, 121) -> 289
merged : (101, 110) -> 290
merged : (111, 114) -> 291
merged : (274, 32) -> 292
merged : (101, 109) -> 29

In [21]:
# Sanity check: number of merges that were learned.
print(len(merges))

768


#### Encoding

In [22]:
def encode(text, merges):
    """Convert ``text`` into token ids by applying learned merges.

    The text is split into UTF-8 bytes. The pair with the lowest merge id
    among those present is then merged repeatedly, until no adjacent pair
    in the sequence appears in ``merges``.

    Args:
        text: String to tokenize.
        merges: dict mapping ``(left, right)`` pairs to merged token ids.

    Returns:
        List of token ids.
    """
    ids = list(text.encode('utf-8'))
    while len(ids) >= 2:
        present_pairs = get_stats(ids)

        def merge_rank(candidate):
            # Pairs that were never learned sort last.
            return merges.get(candidate, float("inf"))

        best_pair = min(present_pairs, key=merge_rank)
        if best_pair not in merges:
            # Nothing mergeable is left in the sequence.
            break
        ids = merge(ids, best_pair, merges[best_pair])
    return ids

#### Decoding

In [23]:
# Build the id -> bytes table used for decoding.
# Ids 0..255 are the raw single bytes.
vocab = {idx: bytes([idx]) for idx in range(256)}
# Each merged id is the concatenation of its two parts. This relies on
# `merges` iterating in insertion order, so both parts are already present.
for (p0, p1), idx in merges.items():
    vocab[idx] = vocab[p0] + vocab[p1]

In [24]:
# Display the full id -> bytes vocabulary table.
vocab

{0: b'\x00',
 1: b'\x01',
 2: b'\x02',
 3: b'\x03',
 4: b'\x04',
 5: b'\x05',
 6: b'\x06',
 7: b'\x07',
 8: b'\x08',
 9: b'\t',
 10: b'\n',
 11: b'\x0b',
 12: b'\x0c',
 13: b'\r',
 14: b'\x0e',
 15: b'\x0f',
 16: b'\x10',
 17: b'\x11',
 18: b'\x12',
 19: b'\x13',
 20: b'\x14',
 21: b'\x15',
 22: b'\x16',
 23: b'\x17',
 24: b'\x18',
 25: b'\x19',
 26: b'\x1a',
 27: b'\x1b',
 28: b'\x1c',
 29: b'\x1d',
 30: b'\x1e',
 31: b'\x1f',
 32: b' ',
 33: b'!',
 34: b'"',
 35: b'#',
 36: b'$',
 37: b'%',
 38: b'&',
 39: b"'",
 40: b'(',
 41: b')',
 42: b'*',
 43: b'+',
 44: b',',
 45: b'-',
 46: b'.',
 47: b'/',
 48: b'0',
 49: b'1',
 50: b'2',
 51: b'3',
 52: b'4',
 53: b'5',
 54: b'6',
 55: b'7',
 56: b'8',
 57: b'9',
 58: b':',
 59: b';',
 60: b'<',
 61: b'=',
 62: b'>',
 63: b'?',
 64: b'@',
 65: b'A',
 66: b'B',
 67: b'C',
 68: b'D',
 69: b'E',
 70: b'F',
 71: b'G',
 72: b'H',
 73: b'I',
 74: b'J',
 75: b'K',
 76: b'L',
 77: b'M',
 78: b'N',
 79: b'O',
 80: b'P',
 81: b'Q',
 82: b'R',
 83: b'

In [25]:
def decode(token_ids, vocab):
    """Turn a sequence of token ids back into a string.

    Args:
        token_ids: Iterable of token ids; each must be a key of ``vocab``.
        vocab: dict mapping token id to its ``bytes`` value.

    Returns:
        The decoded ``str``. Byte sequences that are not valid UTF-8 are
        replaced with U+FFFD rather than raising.
    """
    byte_chunks = [vocab[token_id] for token_id in token_ids]
    raw_bytes = b"".join(byte_chunks)
    return raw_bytes.decode('utf-8', errors="replace")

### Test

In [26]:
# Short sample sentence used to try out the encoder.
Input_text = "I am fine"

In [27]:
# Tokenize the sample sentence with the learned merges and display the ids.
token_ids = encode(Input_text, merges)
token_ids

[866, 362, 32, 102, 263, 101]

In [28]:
# Decode a hand-written list of token ids using the `vocab` table.
decoded_text = decode([72, 105, 257, 99, 357, 121, 321, 32, 104, 358, 588, 420, 63], vocab)

In [29]:
# Display the decoded string.
decoded_text

'Hi, can you help me?'

In [30]:
# Round-trip check on text that includes multi-byte characters (enclosed
# letters, interrobang): encoding and then decoding should reproduce the
# input exactly.
raw_text = "The 🅤🅝🅘🅒🅞🅓🅔‽ academic discipline of artificial intelligence was established at a research workshop held at Dartmouth College in 1956 and has experienced several waves of advancement and optimism in the decades since.[20] Since its inception, researchers in the field have raised philosophical and ethical arguments about the nature of the human mind and the consequences of creating artificial beings with human-like intelligence; these issues have previously been explored by myth, fiction and philosophy since antiquity.[21] The concept of automated art dates back at least to the automata of ancient Greek civilization, where inventors such as Daedalus and Hero of Alexandria were described as having designed machines capable of writing text, generating sounds, and playing music.[22][23] The tradition of creative automatons has flourished throughout history, exemplified by Maillardet's automaton created in the early 1800s.[24]"
d_text = decode(encode(raw_text, merges), vocab)
print(d_text == raw_text)

True


In [31]:
class BasicTokenizer:
    """Minimal byte-level byte-pair-encoding (BPE) tokenizer.

    Token ids 0..255 are the raw byte values; ids from 256 upward are
    assigned to merged pairs in the order they are learned by ``train``.

    Attributes:
        vocab_size: Target number of token ids (must be >= 256 to train).
        merges: dict mapping ``(left, right)`` id pairs to the merged id.
    """

    def __init__(self, vocab_size=1024):
        """Create an untrained tokenizer.

        Args:
            vocab_size: Target vocabulary size; defaults to 1024.
        """
        self.vocab_size = vocab_size
        self.merges = {}

    def get_stats(self, token_ids):
        """Count occurrences of each pair of adjacent tokens.

        Args:
            token_ids: Sequence of token ids (must support slicing).

        Returns:
            dict mapping ``(left, right)`` tuples to their count, keyed in
            order of first appearance.
        """
        counts = {}
        for pair in zip(token_ids, token_ids[1:]):
            counts[pair] = counts.get(pair, 0) + 1
        return counts

    def merge(self, token_ids, pair, new_index):
        """Replace each non-overlapping occurrence of ``pair`` with ``new_index``.

        Args:
            token_ids: Sequence of token ids to rewrite (not modified).
            pair: Two-element sequence ``(left, right)`` to search for.
            new_index: Token id that replaces each matched pair.

        Returns:
            A new list of token ids.
        """
        _tokens = []
        i = 0
        while i < len(token_ids):
            if (i < len(token_ids) - 1) and (token_ids[i] == pair[0]) and (token_ids[i + 1] == pair[1]):
                _tokens.append(new_index)
                i += 2
            else:
                _tokens.append(token_ids[i])
                i += 1
        return _tokens

    def train(self, text, verbose=False):
        """Learn merges from ``text`` and store them on the instance.

        Any previously learned merges are discarded first, so calling
        ``train`` again retrains from scratch instead of mixing tables.

        Args:
            text: Training corpus as a ``str``; it is encoded to UTF-8 bytes.
            verbose: When true, print each merge as it is learned.

        Returns:
            ``self.merges``. It may hold fewer than ``vocab_size - 256``
            entries if the corpus runs out of pairs to merge.
        """
        assert self.vocab_size >= 256
        num_merges = self.vocab_size - 256
        token_ids = list(text.encode('utf-8'))

        # Start from an empty table: stale pairs from an earlier run would
        # share ids with the new merges and corrupt get_vocab()/decode().
        self.merges = {}
        for i in range(num_merges):
            stats = self.get_stats(token_ids)
            if not stats:
                # Fewer than two tokens remain; max() on an empty dict
                # would raise ValueError.
                break
            top_pair = max(stats, key=stats.get)
            index = 256 + i
            if verbose:
                print(f"merged : {top_pair} -> {index}")
            token_ids = self.merge(token_ids, top_pair, index)
            self.merges[top_pair] = index
        return self.merges

    def get_vocab(self):
        """Build the id -> bytes table implied by the learned merges.

        Returns:
            dict mapping every token id to its ``bytes`` value.
        """
        vocab = {idx: bytes([idx]) for idx in range(256)}
        # Relies on dict insertion order: a merged id's parts were always
        # learned (and so inserted) before the id that combines them.
        for (p0, p1), idx in self.merges.items():
            vocab[idx] = vocab[p0] + vocab[p1]
        return vocab

    def encode(self, text):
        """Convert ``text`` into a list of token ids.

        Args:
            text: String to tokenize.

        Returns:
            List of token ids after applying every applicable merge.
        """
        token_ids = list(text.encode('utf-8'))
        while len(token_ids) >= 2:
            stats = self.get_stats(token_ids)
            # Lower merge ids were learned earlier, so apply them first.
            # Pairs that were never learned rank as infinity.
            pair = min(stats, key=lambda x: self.merges.get(x, float("inf")))
            if pair not in self.merges:
                break
            index = self.merges[pair]
            token_ids = self.merge(token_ids, pair, index)
        return token_ids

    def decode(self, token_ids):
        """Convert token ids back into a string.

        Args:
            token_ids: Iterable of token ids known to this tokenizer.

        Returns:
            The decoded ``str``; invalid UTF-8 byte sequences are replaced
            with U+FFFD rather than raising.
        """
        vocab = self.get_vocab()
        b_tokens = b"".join(vocab[idx] for idx in token_ids)
        text = b_tokens.decode('utf-8', errors="replace")
        return text
    

In [32]:
# Instantiate the class-based tokenizer.
tokenizer = BasicTokenizer()

In [35]:
# Reload the training corpus; the context manager closes the file handle
# as soon as the contents have been read.
with open("train.txt", "r", encoding="utf-8") as train_file:
    text = train_file.read()
text

'Copy paste of the Wikipedia article on Taylor Swift, as of Feb 16, 2024.\n---\n\nMain menu\n\nWikipediaThe Free Encyclopedia\n\nSearch\nCreate account\nLog in\n\nPersonal tools\nContents  hide\n(Top)\nLife and career\nToggle Life and career subsection\nArtistry\nToggle Artistry subsection\nAccolades and achievements\nCultural status\nToggle Cultural status subsection\nWealth\nToggle Wealth subsection\nDiscography\nFilmography\nTours\nSee also\nFootnotes\nReferences\nToggle References subsection\nExternal links\nTaylor Swift\n\n136 languages\nArticle\nTalk\nRead\nView source\nView history\n\nTools\n Featured article\nPage semi-protected\nFrom Wikipedia, the free encyclopedia\nFor the album, see Taylor Swift (album).\nTaylor Swift\nPortrait of Taylor Swift in a cocktail dress\nSwift at the 2023 MTV Video Music Awards\nBorn\tTaylor Alison Swift\nDecember 13, 1989 (age 34)\nWest Reading, Pennsylvania, US\nOccupations\nSinger-songwriter producer director businesswoman actress\nYears active

In [36]:
# Train the class-based tokenizer on the corpus, printing each merge.
# NOTE: this rebinds the module-level name `merges`.
merges = tokenizer.train(text, verbose=True)

merged : (101, 32) -> 256
merged : (44, 32) -> 257
merged : (100, 32) -> 258
merged : (46, 32) -> 259
merged : (114, 32) -> 260
merged : (50, 48) -> 261
merged : (115, 32) -> 262
merged : (105, 110) -> 263
merged : (111, 110) -> 264
merged : (114, 105) -> 265
merged : (116, 32) -> 266
merged : (116, 104) -> 267
merged : (101, 258) -> 268
merged : (257, 261) -> 269
merged : (97, 110) -> 270
merged : (97, 114) -> 271
merged : (101, 260) -> 272
merged : (121, 32) -> 273
merged : (97, 108) -> 274
merged : (267, 256) -> 275
merged : (118, 268) -> 276
merged : (119, 105) -> 277
merged : (101, 114) -> 278
merged : (264, 32) -> 279
merged : (277, 102) -> 280
merged : (82, 101) -> 281
merged : (83, 280) -> 282
merged : (111, 260) -> 283
merged : (99, 104) -> 284
merged : (269, 49) -> 285
merged : (111, 109) -> 286
merged : (98, 272) -> 287
merged : (32, 275) -> 288
merged : (97, 121) -> 289
merged : (101, 110) -> 290
merged : (111, 114) -> 291
merged : (274, 32) -> 292
merged : (101, 109) -> 29

In [37]:
# Number of merges learned by the class-based tokenizer.
len(merges)

768

In [38]:
# Round-trip check for the class-based tokenizer on text that includes
# multi-byte characters: decode(encode(x)) should equal x.
raw_text = "The 🅤🅝🅘🅒🅞🅓🅔‽ academic discipline of artificial intelligence was established at a research workshop held at Dartmouth College in 1956 and has experienced several waves of advancement and optimism in the decades since.[20] Since its inception, researchers in the field have raised philosophical and ethical arguments about the nature of the human mind and the consequences of creating artificial beings with human-like intelligence; these issues have previously been explored by myth, fiction and philosophy since antiquity.[21] The concept of automated art dates back at least to the automata of ancient Greek civilization, where inventors such as Daedalus and Hero of Alexandria were described as having designed machines capable of writing text, generating sounds, and playing music.[22][23] The tradition of creative automatons has flourished throughout history, exemplified by Maillardet's automaton created in the early 1800s.[24]"
d_text = tokenizer.decode(tokenizer.encode(raw_text))
print(d_text == raw_text)

True


In [39]:
# Display the id -> bytes vocabulary built from the tokenizer's merges.
tokenizer.get_vocab()

{0: b'\x00',
 1: b'\x01',
 2: b'\x02',
 3: b'\x03',
 4: b'\x04',
 5: b'\x05',
 6: b'\x06',
 7: b'\x07',
 8: b'\x08',
 9: b'\t',
 10: b'\n',
 11: b'\x0b',
 12: b'\x0c',
 13: b'\r',
 14: b'\x0e',
 15: b'\x0f',
 16: b'\x10',
 17: b'\x11',
 18: b'\x12',
 19: b'\x13',
 20: b'\x14',
 21: b'\x15',
 22: b'\x16',
 23: b'\x17',
 24: b'\x18',
 25: b'\x19',
 26: b'\x1a',
 27: b'\x1b',
 28: b'\x1c',
 29: b'\x1d',
 30: b'\x1e',
 31: b'\x1f',
 32: b' ',
 33: b'!',
 34: b'"',
 35: b'#',
 36: b'$',
 37: b'%',
 38: b'&',
 39: b"'",
 40: b'(',
 41: b')',
 42: b'*',
 43: b'+',
 44: b',',
 45: b'-',
 46: b'.',
 47: b'/',
 48: b'0',
 49: b'1',
 50: b'2',
 51: b'3',
 52: b'4',
 53: b'5',
 54: b'6',
 55: b'7',
 56: b'8',
 57: b'9',
 58: b':',
 59: b';',
 60: b'<',
 61: b'=',
 62: b'>',
 63: b'?',
 64: b'@',
 65: b'A',
 66: b'B',
 67: b'C',
 68: b'D',
 69: b'E',
 70: b'F',
 71: b'G',
 72: b'H',
 73: b'I',
 74: b'J',
 75: b'K',
 76: b'L',
 77: b'M',
 78: b'N',
 79: b'O',
 80: b'P',
 81: b'Q',
 82: b'R',
 83: b'