# Mapping tokens to token IDs
### Creating the first version of the simple tokeniser.

In [1]:
# importing necessary packages
from pathlib import Path

In [2]:
# Read the whole short story into memory as one string.
verdict = Path("../resources/verdict.txt").read_text(encoding="utf-8")

# len() counts every character of the string, so spaces and newlines are included.
print(f"length of the text {len(verdict)}")
# Preview the first 99 characters as a quick sanity check of what was loaded.
print("\n", verdict[:99])

length of the text 20559

 The Verdict: Edith Wharton: 1908
Exported from Wikisource on October 21, 2024

I HAD always thought


In [3]:
import re
# Delimiter pattern used to split raw text into tokens. It matches a single
# punctuation character (, . : ; ? _ ! " ( ) '), a double dash, or one whitespace
# character. The whole pattern is one capturing group, so re.split() returns the
# matched delimiters alongside the text between them instead of discarding them.
regex_logic = r"([,.:;?_!\"()']|--|\s)"

In [4]:
# Split on the delimiter pattern, strip each piece, and keep only the pieces that
# are non-empty after stripping (this drops whitespace-only and empty fragments).
preprocessed_text = [
    piece
    for piece in map(str.strip, re.split(regex_logic, verdict))
    if piece
]
print(len(preprocessed_text))

4705


The cells above are simply copied from the earlier steps so that we can build on them in the processing steps that follow.

In [5]:
# Unique tokens in sorted order, so every token gets a stable position.
all_words = list(set(preprocessed_text))
all_words.sort()

# Vocabulary: token -> integer id, where the id is the token's position in
# the sorted list of unique tokens.
vocab = dict(zip(all_words, range(len(all_words))))

In [6]:
# Show the first ten (token, id) pairs of the vocabulary, then stop iterating.
for position, entry in enumerate(vocab.items()):
    if position >= 10:
        break
    print(entry)

('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
('1908', 8)
('2024', 9)


A one-way conversion is not enough, though.
We need to be able to convert tokens to token IDs and also to recover the tokens from the token IDs.
That way, when we apply the model, we can turn the token IDs it generates back into readable text.

In [7]:
class SimpleTokenizerV1:
    """Minimal word-level tokenizer backed by a fixed, pre-built vocabulary.

    Text is split on punctuation, double dashes and whitespace; every resulting
    token is looked up in the vocabulary to obtain its integer id. Decoding maps
    ids back to tokens and joins them with single spaces.

    Parameters
    ----------
    vocab : dict
        Mapping from token string to integer id. It must already be built; the
        inverse mapping (id -> token) is derived from it once, at construction.
    """

    # Same pattern as the notebook-level ``regex_logic``. It is kept on the class
    # so that encoding does not depend on a global defined in another cell.
    _SPLIT_PATTERN = re.compile(r"([,.:;?_!\"()']|--|\s)")
    # Whitespace that directly precedes one of these punctuation characters.
    _SPACE_BEFORE_PUNCT = re.compile(r"\s+([,?.!\"()'])")

    def __init__(self,vocab):
        self.str_to_int = vocab
        self.int_to_str = {idx: token for token, idx in vocab.items()}

    def encode_text(self, text):
        """Convert ``text`` into a list of token ids.

        Returns
        -------
        list
            One vocabulary id per token, in order of appearance. An empty or
            whitespace-only string yields an empty list.

        Raises
        ------
        KeyError
            If a token of ``text`` is not present in the vocabulary.
        """
        pieces = (piece.strip() for piece in self._SPLIT_PATTERN.split(text))
        # Splitting leaves empty and whitespace-only fragments behind; drop them.
        tokens = [piece for piece in pieces if piece]
        return [self.str_to_int[token] for token in tokens]

    def decode_text(self, index):
        """Convert an iterable of token ids back into a single string.

        Tokens are joined with single spaces, then the whitespace in front of
        , ? . ! " ( ) and ' is removed (the capture group keeps the punctuation
        character itself). The result is therefore not guaranteed to equal the
        text that was originally encoded.

        Raises
        ------
        KeyError
            If an id is not present in the vocabulary.
        """
        joined_text = " ".join(self.int_to_str[idx] for idx in index)
        return self._SPACE_BEFORE_PUNCT.sub(r"\1", joined_text)
        
        
        

In [8]:
# Build a tokenizer instance on top of the vocabulary created above.
tokeniser = SimpleTokenizerV1(vocab)

In [9]:
# Encode a short sample sentence; note the deliberately misplaced space before the comma.
token_ids = tokeniser.encode_text("This is a ,lovely day")
print(token_ids)

[103, 593, 124, 5, 657, 324]


In [10]:
# Notice how the space just before the comma is removed, so the decoded text
# reads "a, lovely" rather than the original "a ,lovely".
tokeniser.decode_text(token_ids)

'This is a, lovely day'