### Simple Basic Tokenizer.

In [1]:
# Read the whole text file into memory; later cells tokenize `raw_text`.
with open("the-verdict.txt", mode='r', encoding="utf-8") as text_file:
    raw_text = text_file.read()

# Sanity check: report the size and preview the first 99 characters.
total_chars = len(raw_text)
print('Total number of characters:', total_chars)
print(raw_text[:99])

Total number of characters: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [2]:
import re

# Split on commas, periods and whitespace. The capture group makes
# re.split keep the delimiters in the result instead of discarding them.
text = 'Hello, world, This is a test'
delimiter_pattern = re.compile(r'([,.]|\s)')
result = delimiter_pattern.split(text)
print(result)

['Hello', ',', '', ' ', 'world', ',', '', ' ', 'This', ' ', 'is', ' ', 'a', ' ', 'test']


In [3]:
# Discard empty strings and whitespace-only pieces: str.strip returns ''
# (falsy) for them, so filter() drops them.
result = list(filter(str.strip, result))
print(result)

['Hello', ',', 'world', ',', 'This', 'is', 'a', 'test']


In [4]:
# Same idea with a wider delimiter set: more punctuation marks plus the
# double dash are split off as tokens of their own.
text = 'Hello, world, This is-- a test?'
pieces = re.split(r'([,.:;?_!"()\']|--|\s)', text)
result = [piece for piece in pieces if piece.strip()]
print(result)

['Hello', ',', 'world', ',', 'This', 'is', '--', 'a', 'test', '?']


In [5]:
# Tokenize the full text with the extended delimiter pattern and drop
# empty / whitespace-only pieces.
raw_pieces = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
preprocessed = [piece for piece in raw_pieces if piece.strip()]

# Append the two special tokens to the token list itself, so the printed
# length below counts them as well.
preprocessed += ['<|unk|>', '<|endoftext|>']
print(len(preprocessed))

4692


In [6]:
# Preview the first 30 tokens to check the split by eye.
first_tokens = preprocessed[:30]
print(first_tokens)

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


In [7]:
# Deduplicate the tokens, then sort them so the ordering is deterministic.
all_words = list(set(preprocessed))
all_words.sort()
vocab_size = len(all_words)

print(vocab_size)

1132


In [8]:
# Map each token to its position in the sorted token list.
vocab = dict(zip(all_words, range(len(all_words))))

In [9]:
# Show the first 51 (token, id) pairs of the vocabulary.
for entry in list(vocab.items())[:51]:
    print(entry)

('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('<|endoftext|>', 10)
('<|unk|>', 11)
('?', 12)
('A', 13)
('Ah', 14)
('Among', 15)
('And', 16)
('Are', 17)
('Arrt', 18)
('As', 19)
('At', 20)
('Be', 21)
('Begin', 22)
('Burlington', 23)
('But', 24)
('By', 25)
('Carlo', 26)
('Chicago', 27)
('Claude', 28)
('Come', 29)
('Croft', 30)
('Destroyed', 31)
('Devonshire', 32)
('Don', 33)
('Dubarry', 34)
('Emperors', 35)
('Florence', 36)
('For', 37)
('Gallery', 38)
('Gideon', 39)
('Gisburn', 40)
('Gisburns', 41)
('Grafton', 42)
('Greek', 43)
('Grindle', 44)
('Grindles', 45)
('HAD', 46)
('Had', 47)
('Hang', 48)
('Has', 49)
('He', 50)


In [10]:
class SimpleTokenizerV1:
    """Word-level tokenizer backed by a fixed vocabulary.

    Text is split on whitespace, ``--`` and a set of punctuation marks; each
    resulting token is looked up in the vocabulary. Tokens missing from the
    vocabulary are not handled: ``encoder`` raises ``KeyError`` for them.
    """

    def __init__(self,vocab):
        """Store the vocabulary and build the reverse lookup.

        Args:
            vocab: Mapping from token string to integer id.
        """
        self.str_to_int = vocab
        self.int_to_str = {i:s for s,i in vocab.items()}

    def encoder(self,text):
        """Convert ``text`` into a list of token ids.

        Args:
            text: The string to tokenize.

        Returns:
            List of integer ids, one per token, in order of appearance.

        Raises:
            KeyError: If any token is not present in the vocabulary.
        """
        # The capture group keeps the delimiters (punctuation, '--') as tokens.
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        # Drop empty strings and whitespace-only pieces produced by the split.
        preprocessed = [item for item in preprocessed if item.strip()]

        ids = [self.str_to_int[s] for s in preprocessed]
        return ids

    def decoder(self, ids):
        """Convert a sequence of token ids back into a string.

        Args:
            ids: Iterable of integer ids produced by ``encoder``.

        Returns:
            The tokens joined by single spaces, with the space in front of
            punctuation removed.

        Raises:
            KeyError: If an id is not present in the vocabulary.
        """
        text = " ".join(self.int_to_str[i] for i in ids)
        # Remove the whitespace that the join inserted before punctuation.
        # ':' and ';' are included because encoder() splits them off as
        # separate tokens as well; without them "a; b" decodes to "a ; b".
        text = re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)

        return text

In [11]:
# Encode a sample sentence made only of tokens that exist in the vocabulary.
text = 'this is a , how it!'
tokenizer = SimpleTokenizerV1(vocab)

tokenizer.encoder(text)

[1001, 586, 117, 5, 562, 587, 0]

In [12]:
# Round trip: encode the text to ids, then decode the ids back to a string.
token_ids = tokenizer.encoder(text)
tokenizer.decoder(token_ids)

'this is a, how it!'

In [13]:
class SimpleTokenizerV2:
    """Word-level tokenizer that maps unknown tokens to ``<|unk|>``.

    Text is split on whitespace, ``--`` and a set of punctuation marks; each
    resulting token is looked up in the vocabulary, and tokens that are not
    in the vocabulary are encoded with the id of ``'<|unk|>'``.
    """

    def __init__(self,vocab):
        """Store the vocabulary and build the reverse lookup.

        Args:
            vocab: Mapping from token string to integer id. It needs a
                ``'<|unk|>'`` entry for unknown tokens to be encodable.
        """
        self.str_to_int = vocab
        self.int_to_str = {i:s for s,i in vocab.items()}

    def encoder(self,text):
        """Convert ``text`` into a list of token ids.

        Args:
            text: The string to tokenize.

        Returns:
            List of integer ids, one per token, in order of appearance.

        Raises:
            KeyError: If an unknown token is met and the vocabulary has no
                ``'<|unk|>'`` entry.
        """
        # The capture group keeps the delimiters (punctuation, '--') as tokens.
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        # Drop empty strings and whitespace-only pieces produced by the split.
        preprocessed = [item for item in preprocessed if item.strip()]

        # Substitute the unknown marker first, then do a single lookup pass.
        # '<|unk|>' is only looked up when an unknown token actually occurs.
        preprocessed = [
            item if item in self.str_to_int else '<|unk|>'
            for item in preprocessed
        ]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids

    def decoder(self, ids):
        """Convert a sequence of token ids back into a string.

        Args:
            ids: Iterable of integer ids produced by ``encoder``.

        Returns:
            The tokens joined by single spaces, with the space in front of
            punctuation removed.

        Raises:
            KeyError: If an id is not present in the vocabulary.
        """
        text = " ".join(self.int_to_str[i] for i in ids)
        # Remove the whitespace that the join inserted before punctuation.
        # ':' and ';' are included because encoder() splits them off as
        # separate tokens as well; without them "a; b" decodes to "a ; b".
        text = re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)

        return text

In [14]:
# Same sentence as before plus 'helloe', a word the vocabulary does not
# contain, to exercise the unknown-token fallback.
text = 'this is a , how it! helloe'
tokenizer = SimpleTokenizerV2(vocab)

tokenizer.encoder(text)

[1001, 586, 117, 5, 562, 587, 0, 11]

In [15]:
# Round trip: the unknown word comes back as the '<|unk|>' placeholder.
token_ids = tokenizer.encoder(text)
tokenizer.decoder(token_ids)

'this is a, how it! <|unk|>'

In [16]:
# Two separate snippets, concatenated with the end-of-text marker between them.
text1, text2 = 'This is a test', 'adding to above'

text = " <|endoftext|> ".join([text1, text2])
tokenizer.encoder(text)

[99, 586, 117, 11, 10, 11, 1018, 121]

In [17]:
# Show the joined string that was just encoded.
print(text)

This is a test <|endoftext|> adding to above


In [18]:
# Round trip of the joined text: encode to ids, then decode back to a string.
token_ids = tokenizer.encoder(text)
tokenizer.decoder(token_ids)

'This is a <|unk|> <|endoftext|> <|unk|> to above'

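Models like GPT use **Byte Pair Encoding (BPE)** to handle unknown words more effectively. Instead of mapping every out-of-vocabulary word to a single `<|unk|>` token, as the simple tokenizer above does, BPE divides the text into smaller subword units (tokens). The subword vocabulary is built by the BPE algorithm, which repeatedly merges the most frequent pairs of adjacent symbols. This allows the model to represent rare or unseen words by combining known subword tokens.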


### Byte Pair Encoding.