In [2]:
from importlib.metadata import version

# report the installed versions of torch and tiktoken
for label, package in (("torch version:", "torch"), ("tiktoken version:", "tiktoken")):
    print(label, version(package))

torch version: 2.5.1
tiktoken version: 0.9.0


In [17]:
# Read the whole Project Gutenberg text file into one string (decoded as UTF-8).
with open("the_post_office.txt", mode="r", encoding="utf-8") as file:
    text = file.read()

# Preview the first 500 characters to confirm the file loaded as expected.
print(text[0:500])

The Project Gutenberg eBook of The Post Office
    
This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever. You may copy it, give it away or re-use it under the terms
of the Project Gutenberg License included with this ebook or online
at www.gutenberg.org. If you are not located in the United States,
you will have to check the laws of the country where you are located
before using this eBook.

Titl


In [18]:
# size of the loaded text, measured in characters
num_characters = len(text)
print(num_characters)

58675


In [19]:
# get regular expressions
import re

# Split on punctuation characters, double dashes ("--") and single whitespace
# characters. "|" separates the alternatives; the capturing group makes
# re.split keep every delimiter in the result.
# NOTE: whitespace delimiters and the empty strings produced between adjacent
# delimiters are NOT filtered out here, so they stay in `preprocessed_text`.
split_pattern = r'([,.:;?_!"()\']|--|\s)'
preprocessed_text = re.split(split_pattern, text)
print(preprocessed_text[:500])

['The', ' ', 'Project', ' ', 'Gutenberg', ' ', 'eBook', ' ', 'of', ' ', 'The', ' ', 'Post', ' ', 'Office', '\n', '', ' ', '', ' ', '', ' ', '', ' ', '', '\n', 'This', ' ', 'ebook', ' ', 'is', ' ', 'for', ' ', 'the', ' ', 'use', ' ', 'of', ' ', 'anyone', ' ', 'anywhere', ' ', 'in', ' ', 'the', ' ', 'United', ' ', 'States', ' ', 'and', '\n', 'most', ' ', 'other', ' ', 'parts', ' ', 'of', ' ', 'the', ' ', 'world', ' ', 'at', ' ', 'no', ' ', 'cost', ' ', 'and', ' ', 'with', ' ', 'almost', ' ', 'no', ' ', 'restrictions', '\n', 'whatsoever', '.', '', ' ', 'You', ' ', 'may', ' ', 'copy', ' ', 'it', ',', '', ' ', 'give', ' ', 'it', ' ', 'away', ' ', 'or', ' ', 're-use', ' ', 'it', ' ', 'under', ' ', 'the', ' ', 'terms', '\n', 'of', ' ', 'the', ' ', 'Project', ' ', 'Gutenberg', ' ', 'License', ' ', 'included', ' ', 'with', ' ', 'this', ' ', 'ebook', ' ', 'or', ' ', 'online', '\n', 'at', ' ', 'www', '.', 'gutenberg', '.', 'org', '.', '', ' ', 'If', ' ', 'you', ' ', 'are', ' ', 'not', ' ', 'locat

In [22]:
# total tokens (every entry produced by the split, duplicates included)
total_tokens = len(preprocessed_text)
print("Total tokens:", total_tokens)

# unique tokens (duplicates collapsed via a set)
unique_tokens = len(set(preprocessed_text))
print("Unique tokens:", unique_tokens)


Total tokens: 28673
Unique tokens: 2056


In [23]:
# show every distinct token, in sorted order
distinct_tokens = set(preprocessed_text)
print(sorted(distinct_tokens))

['', '\n', ' ', '!', '"', '#6523]', '$1', '$5', "'", '(', ')', '***', ',', '-', '--', '.', '000', '1', '1500', '1914', '2', '20%', '2001', '2004', '2024', '21', '3', '30', '4', '5', '50', '501', '596-1887', '6', '60', '64-6221541', '7', '8', '801', '809', '84116', '9', '90', ':', ';', '?', 'A', 'ACT', 'ACTUAL', 'AGREE', 'AGREEMENT', 'AMAL', 'AMAL]', 'AND', 'ANY', 'ANYTHING', 'ASCII”', 'About', 'Additional', 'Ah', 'Alas', 'All', 'Am', 'Amal', 'And', 'Any', 'Archive', 'Are', 'Aren', 'As', 'At', 'Auntie', 'Author', 'B', 'BE', 'BEFORE', 'BOY', 'BOYS', 'BREACH', 'BUT', 'Bad', 'Badal', 'Be', 'Bed]', 'Been', 'Before', 'Beg', 'Benay', 'Bengali', 'Better', 'Between', 'Bile', 'Blow', 'But', 'By', 'C', 'CONSEQUENTIAL', 'CONTRACT', 'CURTAIN', 'Can', 'Chakradhan', 'Champa', 'Chetan', 'Chyabana', 'City', 'Come', 'Company', 'Company]', 'Compliance', 'Contact', 'Contributions', 'Copyright', 'Creating', 'Credits', 'Cross', 'Curd', 'Curds', 'Curdseller', 'Cutting', 'D', 'DAIRYMAN', 'DAMAGE', 'DAMAGES', 

In [24]:
# Build the vocabulary: every distinct token, in sorted order.
vocab = sorted(set(preprocessed_text))

# Each token's id is its position in the sorted vocabulary.
token_to_id = dict(zip(vocab, range(len(vocab))))

In [28]:
# look at the first 22 (token, id) pairs of the mapping
last_preview_index = 21
for i, item in enumerate(token_to_id.items()):
    print(item)
    if i == last_preview_index:
        break

('', 0)
('\n', 1)
(' ', 2)
('!', 3)
('"', 4)
('#6523]', 5)
('$1', 6)
('$5', 7)
("'", 8)
('(', 9)
(')', 10)
('***', 11)
(',', 12)
('-', 13)
('--', 14)
('.', 15)
('000', 16)
('1', 17)
('1500', 18)
('1914', 19)
('2', 20)
('20%', 21)


In [None]:
# create tokenizer class
class nakliTokenizer:
    """Minimal word-level tokenizer backed by a fixed token -> id vocabulary.

    Text is split on single whitespace characters, ``--`` and the punctuation
    characters ``, . : ; ? _ ! " ( ) '``. Empty and whitespace-only pieces are
    dropped; every remaining piece is looked up in the vocabulary.
    """

    # The capturing group makes re.split keep each delimiter as its own piece.
    _SPLIT_PATTERN = r'([,.:;?_!"()\']|--|\s)'
    # Whitespace sitting directly in front of one of these punctuation marks is
    # removed when decoding. ':' and ';' are included so that they round-trip
    # the same way as ',' and '.' do.
    _SPACE_BEFORE_PUNCT_PATTERN = r'\s+([,.:;?!"()\'])'

    def __init__(self, vocab, unk_token=None):
        """Store the vocabulary and build the reverse (id -> token) mapping.

        Args:
            vocab: Mapping from token string to integer id.
            unk_token: Optional token, which must be a key of ``vocab``, whose
                id is substituted for tokens missing from the vocabulary.
                With the default ``None``, ``encode`` raises ``KeyError`` on
                unknown tokens.

        Raises:
            ValueError: If ``unk_token`` is given but is not in ``vocab``.
        """
        if unk_token is not None and unk_token not in vocab:
            raise ValueError(f"unk_token {unk_token!r} is not in the vocabulary")
        self.str_to_int = vocab
        self.int_to_str = {v: k for k, v in vocab.items()}
        self.unk_token = unk_token

    def encode(self, text: str) -> list[int]:
        """Convert ``text`` into a list of token ids.

        Args:
            text: The string to tokenize.

        Returns:
            One id per non-empty, whitespace-stripped piece of ``text``.

        Raises:
            KeyError: If a token is not in the vocabulary and no ``unk_token``
                was configured.
        """
        pieces = re.split(self._SPLIT_PATTERN, text)
        # Strip once per piece and discard pieces that end up empty.
        tokens = [tok for tok in map(str.strip, pieces) if tok]

        unk_id = None if self.unk_token is None else self.str_to_int[self.unk_token]
        ids = []
        for tok in tokens:
            tok_id = self.str_to_int.get(tok, unk_id)
            if tok_id is None:
                raise KeyError(f"token {tok!r} is not in the vocabulary")
            ids.append(tok_id)
        return ids

    def decode(self, ids: list[int]) -> str:
        """Convert token ids back into a string.

        Tokens are joined with single spaces, then whitespace directly before
        ``, . : ; ? ! " ( ) '`` is removed. The original spacing of the encoded
        text is not recoverable, so this is only an approximate inverse of
        ``encode``.

        Args:
            ids: Token ids to convert.

        Returns:
            The reconstructed text.

        Raises:
            KeyError: If an id has no entry in the reverse mapping.
        """
        text = " ".join(self.int_to_str[i] for i in ids)
        # Remove the space the join inserted in front of punctuation marks.
        return re.sub(self._SPACE_BEFORE_PUNCT_PATTERN, r'\1', text)