In [1]:
# importing necessary packages
from pathlib import Path

In [2]:
# Read the whole story into a single string. Path.read_text opens, decodes
# and closes the file in one call, so no explicit `with open(...)` is needed.
verdict = Path("../resources/verdict.txt").read_text(encoding="utf-8")

print(f"length of the text {len(verdict)}")
print("\n", verdict[:99])

# This confirms the text loaded: we print its total length and its first 99
# characters. The length counts every character, including spaces.

length of the text 20559

 The Verdict: Edith Wharton: 1908
Exported from Wikisource on October 21, 2024

I HAD always thought


In [3]:
import re
# Delimiter pattern for re.split: the capturing group keeps each matched
# punctuation mark, double dash, or whitespace character in the result list.
regex_logic = r"([,.:;?_!\"()']|--|\s)"

In [4]:
# Split on the delimiter pattern, trim every piece, and keep only the pieces
# that are non-empty after trimming (re.split also yields empty strings and
# bare whitespace, which are not tokens).
preprocessed_text = [
    piece
    for piece in map(str.strip, re.split(regex_logic, verdict))
    if piece
]
print(len(preprocessed_text))

4705


In [5]:
# Vocabulary entries: every distinct token in sorted order, followed by the
# two special tokens appended at the end.
all_words = sorted(set(preprocessed_text)) + ["<|unk|>", "<|endoftext|>"]

# Give each token its position in all_words as its integer id. This is the
# compact equivalent of looping over enumerate(all_words) and filling an
# empty dict one entry at a time.
vocab = dict(zip(all_words, range(len(all_words))))

In [6]:
# print(vocab) # we now have the extended dictionary with 2 special tokens added.
# 'would': 1129, 'wouldn': 1130, 'year': 1131, 'years': 1132, 'yellow': 1133, 'yet': 1134, 
# 'you': 1135, 'younger': 1136, 'your': 1137, 'yourself': 1138, '<|unk|>': 1139, '<|endoftext|>': 1140}

In [7]:
# Show the last five (token, id) pairs; the two special tokens were appended
# last, so they should appear at the end of this listing.
vocab_tail = list(vocab.items())[-5:]
for items in vocab_tail:
    print(items)

('younger', 1136)
('your', 1137)
('yourself', 1138)
('<|unk|>', 1139)
('<|endoftext|>', 1140)


In [11]:
class SimpleTokenizerV2:
    """Word-level tokenizer that maps unknown tokens to ``<|unk|>``.

    Attributes:
        str_to_int: The vocabulary passed in, mapping token string -> id.
        int_to_str: The inverse mapping, id -> token string.
    """

    def __init__(self, vocab):
        """Keep the token -> id mapping and derive the id -> token mapping.

        Args:
            vocab: Mapping from token string to integer id.
        """
        self.str_to_int = vocab
        self.int_to_str = dict((idx, token) for token, idx in vocab.items())

    def encode(self, text):
        """Convert ``text`` into a list of token ids.

        The text is split on punctuation, double dashes and whitespace (the
        delimiters themselves are kept as tokens), each piece is trimmed, and
        pieces that are empty after trimming are dropped. A token that is not
        a key of the vocabulary is replaced by ``"<|unk|>"`` before lookup.

        Args:
            text: The string to tokenize.

        Returns:
            A list with one id per token, in order of appearance.

        Raises:
            KeyError: If an unknown token is met and ``"<|unk|>"`` is itself
                missing from the vocabulary.
        """
        ids = []
        for piece in re.split(r'([,.:;?_!"()\']|--|\s)', text):
            token = piece.strip()
            if not token:
                # Empty or whitespace-only piece: not a token.
                continue
            if token not in self.str_to_int:
                token = "<|unk|>"
            ids.append(self.str_to_int[token])
        return ids

    def decode(self, ids):
        """Convert a sequence of token ids back into a string.

        Tokens are joined with single spaces, then any whitespace directly in
        front of one of the listed punctuation characters is removed.

        Args:
            ids: Iterable of ids present in ``int_to_str``.

        Returns:
            The reconstructed text.

        Raises:
            KeyError: If an id has no entry in ``int_to_str``.
        """
        spaced = " ".join(self.int_to_str[i] for i in ids)
        return re.sub(r'\s+([,.:;?!"()\'])', r'\1', spaced)

In [12]:
# Two separate sample sentences, joined with the <|endoftext|> special token
# so that they can be passed to the tokenizer as one string.
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."
text = " <|endoftext|> ".join([text1, text2])

# Tokenizer backed by the vocabulary that includes the two special tokens.
tokeniser = SimpleTokenizerV2(vocab)
print(text)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.


In [13]:
# Encode the combined sample text into token ids; any token that is not a key
# of vocab is encoded with the id of "<|unk|>".
print(tokeniser.encode(text))

[1139, 5, 364, 1135, 637, 984, 13, 1140, 60, 997, 965, 993, 731, 997, 1139, 7]


In [14]:
# Round trip: encode, then decode. Tokens that were replaced by "<|unk|>"
# during encoding cannot be recovered, so they come back as "<|unk|>".
print(tokeniser.decode(tokeniser.encode(text)))

<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.
