In [2]:
from pathlib import Path
import re

In [3]:
def read_data(filepath: Path) -> str:
    """Read a UTF-8 text file and return its entire contents as one string.

    Args:
        filepath: Location of the text file to read.

    Returns:
        The full decoded text of the file.
    """
    with open(filepath, mode="r", encoding="utf-8") as source:
        return source.read()


In [4]:
# Load the raw corpus. The location is wrapped in Path so the argument
# matches the `filepath: Path` parameter declared by read_data.
data = read_data(Path("../resources/verdict.txt"))

In [5]:
# Total number of characters in the loaded text.
print(len(data))

20559


In [6]:
# Split on punctuation, double dashes and whitespace. The capture group makes
# re.split keep the delimiters in the result, so the list also contains
# whitespace pieces and empty strings. ':' is not a delimiter here, so a piece
# such as 'Verdict:' stays whole.
preprocessed_data = re.split(r'([,.?_!"()\']|--|\s)', data) 

In [7]:
# Peek at the first 30 pieces of the split (delimiters and empty strings included).
print(preprocessed_data[:30])

['The', ' ', 'Verdict:', ' ', 'Edith', ' ', 'Wharton:', ' ', '1908', '\n', 'Exported', ' ', 'from', ' ', 'Wikisource', ' ', 'on', ' ', 'October', ' ', '21', ',', '', ' ', '2024', '\n', '', '\n', 'I', ' ']


In [8]:
# Deduplicate the split pieces and sort them so every piece gets a stable position.
unique_tokens = sorted(set(preprocessed_data))
vocab_size = len(unique_tokens)

In [9]:
# The `=` specifier prints the variable name together with its value.
print(f"{vocab_size=}")

vocab_size=1171


In [10]:
# Map each unique token to its position in the sorted token list.
vocab = dict(zip(unique_tokens, range(len(unique_tokens))))

In [11]:
# First five (token, id) pairs of the vocabulary.
list(vocab.items())[:5]

[('', 0), ('\n', 1), (' ', 2), ('!', 3), ('"', 4)]

In [12]:
class SimpleTokeniser:
    """Word-level tokeniser backed by a fixed vocabulary.

    Text is split with a regular expression; each non-blank piece is looked
    up in the vocabulary. A piece that is missing from the vocabulary makes
    ``encode`` raise ``KeyError``.
    """

    def __init__(self, vocab):
        """Store the token -> id mapping and derive the id -> token mapping.

        Args:
            vocab: Ready-made mapping from token string to integer id.
        """
        self.str_to_int = vocab
        self.int_to_str = {token_id: token for token, token_id in vocab.items()}

    def encode(self, text, pattern=r'([,.?_!"()\']|--|\s)'):
        """Convert ``text`` into a list of vocabulary ids.

        Args:
            text: The string to tokenise.
            pattern: Regular expression handed to ``re.split``.

        Returns:
            One id per non-blank piece, in order of appearance.

        Raises:
            KeyError: If a piece is not present in the vocabulary.
        """
        ids = []
        for piece in re.split(pattern, text):
            token = piece.strip()
            # Pieces that are empty or pure whitespace (e.g. " " or "\n") are dropped.
            if token:
                ids.append(self.str_to_int[token])
        return ids

    def decode(self, ids):
        """Convert a sequence of ids back into a single string.

        Tokens are joined with single spaces, then any whitespace directly
        in front of a punctuation mark is removed.

        Raises:
            KeyError: If an id is not present in the vocabulary.
        """
        spaced = " ".join(self.int_to_str[idx] for idx in ids)
        return re.sub(r'\s+([,.?!"()\'])', r'\1', spaced)

In [13]:
# Instantiate the tokeniser with the current `vocab` mapping.
tokeniser = SimpleTokeniser(vocab)

In [16]:
# Encode a two-line passage. `idx` holds the resulting list of token ids.
text = """"It's the last he painted, you know," 
           Mrs. Gisburn said with pardonable pride."""
idx = tokeniser.encode(text)
print(idx)

[4, 66, 5, 884, 1025, 627, 553, 775, 8, 1167, 620, 8, 4, 77, 10, 47, 885, 1148, 785, 824, 10]


In [17]:
# Decode the ids back into text; note this rebinds `text` to the decoded string.
# The result is not identical to the input: original spacing and line breaks are lost.
text = tokeniser.decode(idx)
print(text)

" It' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.


Adding `<|unk|>` and `<|endoftext|>` to the vocabulary: `<|unk|>` represents unknown words, and `<|endoftext|>` separates two unrelated pieces of content.

In [21]:
# Rebuild the vocabulary with the two special tokens placed after every
# regular token, so they receive the two highest ids.
all_tokens = [*sorted(set(preprocessed_data)), "<|endoftext|>", "<|unk|>"]
vocab = dict(zip(all_tokens, range(len(all_tokens))))

In [22]:
# Note: the vocabulary previously had 1171 entries; after adding the two
# special tokens it has 1173.
print(len(vocab))

1173


In [26]:
# Last five entries: the two special tokens sit at the end with the highest ids.
print(list(vocab.items())[-5:])

[('younger', 1168), ('your', 1169), ('yourself', 1170), ('<|endoftext|>', 1171), ('<|unk|>', 1172)]


In [38]:
class SimpleTokeniserV2:
    """Word-level tokeniser that falls back to ``"<|unk|>"`` for unknown pieces.

    Text is split with a regular expression; each non-blank piece is looked
    up in the vocabulary, and any piece that is missing is encoded as the
    ``"<|unk|>"`` entry instead. The vocabulary must contain ``"<|unk|>"``
    for that fallback to succeed.
    """

    def __init__(self, vocab):
        """Store the token -> id mapping and derive the id -> token mapping.

        Args:
            vocab: Ready-made mapping from token string to integer id.
        """
        self.str_to_int = vocab
        self.int_to_str = {token_id: token for token, token_id in vocab.items()}

    def encode(self, text, pattern=r'([,.?_!"()\']|--|\s)'):
        """Convert ``text`` into a list of vocabulary ids.

        Args:
            text: The string to tokenise.
            pattern: Regular expression handed to ``re.split``.

        Returns:
            One id per non-blank piece, in order of appearance; pieces not
            in the vocabulary yield the id of ``"<|unk|>"``.

        Raises:
            KeyError: If an unknown piece is met and ``"<|unk|>"`` itself is
                not in the vocabulary.
        """
        ids = []
        for piece in re.split(pattern, text):
            token = piece.strip()
            # Pieces that are empty or pure whitespace (e.g. " " or "\n") are dropped.
            if not token:
                continue
            if token not in self.str_to_int:
                token = "<|unk|>"
            ids.append(self.str_to_int[token])
        return ids

    def decode(self, ids):
        """Convert a sequence of ids back into a single string.

        Tokens are joined with single spaces, then any whitespace directly
        in front of a punctuation mark is removed.

        Raises:
            KeyError: If an id is not present in the vocabulary.
        """
        spaced = " ".join(self.int_to_str[idx] for idx in ids)
        return re.sub(r'\s+([,.?!"()\'])', r'\1', spaced)

In [39]:
# Rebind `tokeniser` to the V2 implementation, using the current `vocab`.
tokeniser = SimpleTokeniserV2(vocab)

In [40]:
# Join two unrelated snippets with the <|endoftext|> marker between them.
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace"
text = " <|endoftext|> ".join((text1,text2))
print(text)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace


In [42]:
# 1171 is the id of <|endoftext|> and 1172 the id of <|unk|>; words that are
# not in the vocab (e.g. "Hello") are encoded as 1172.
ids = tokeniser.encode(text)
print(ids)

[1172, 8, 374, 1167, 654, 1012, 16, 1171, 65, 1025, 993, 1021, 750, 1025, 1172]


In [43]:
# Decode the ids; out-of-vocabulary words come back as <|unk|>.
tokeniser.decode(ids)

'<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>'