# Reading data

In [2]:
# Load the entire text file into memory as one string.
source_path = "data/the-verdict.txt"
with open(source_path, mode="r", encoding="utf-8") as text_file:
    raw_text = text_file.read()

In [5]:
# Number of characters in the loaded text.
print(len(raw_text))

20479


# Cleaning Data

In [33]:
import re

# Split the raw text into tokens. The capturing group makes re.split keep the
# delimiters (punctuation, "--", whitespace) as items of their own; the raw
# string keeps "\s" a regex escape instead of an invalid Python string escape.
split_data = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
# Drop the empty strings and whitespace-only items produced by the split.
split_data = [data for data in split_data if data.strip()]

# Convert tokens to token ids:
#   1. Build the vocabulary from the unique tokens, in sorted order.
#   2. Append the special tokens so they receive the two highest ids.
#   3. Assign each entry its position in the list as its unique id.
all_words = sorted(set(split_data))
all_words.extend(["<|endoftext|>", "<|unk|>"])
vocab_size = len(all_words)
# token -> id mapping; can be seen as a very simple encoder.
vocab = {token: token_id for token_id, token in enumerate(all_words)}
    

In [34]:
# Vocabulary size: the unique tokens plus the two special tokens appended above.
len(vocab)

1132

# Tokenizer Class

In [26]:
import re
class CustomTokernizerV1:
    """Minimal word-level tokenizer backed by a fixed vocabulary.

    Text is split on whitespace, on "--" and on the punctuation characters
    , . : ; ? _ ! " ( ) ' and every remaining token is looked up in the
    vocabulary. Tokens that are not in the vocabulary are not handled:
    ``encode`` raises ``KeyError`` for them.
    """

    # Raw strings keep "\s" a regex escape rather than an invalid Python string
    # escape (a SyntaxWarning on recent Python versions).
    _SPLIT_PATTERN = re.compile(r'([,.:;?_!"()\']|--|\s)')
    # Whitespace that decode() inserted in front of punctuation. ":" and ";"
    # are listed because encode() emits them as separate tokens as well.
    _SPACE_BEFORE_PUNCT = re.compile(r'\s+([,.:;?!"()\'])')

    def __init__(self, vocab):
        """Store the token -> id mapping and build the reverse id -> token mapping.

        Args:
            vocab: dict mapping each token string to its integer id.
        """
        self.str_to_int = vocab
        self.int_to_str = {token_id: token for token, token_id in vocab.items()}

    def read_data(self, path):
        """Return the full contents of the UTF-8 text file at ``path``."""
        with open(path, "r", encoding="utf-8") as f:
            return f.read()

    def encode(self, raw_text):
        """Convert text into a list of token ids.

        Args:
            raw_text: the string to tokenize.

        Returns:
            List of ids, one per non-whitespace token, in order of appearance.

        Raises:
            KeyError: if a token is not present in the vocabulary.
        """
        pieces = self._SPLIT_PATTERN.split(raw_text)
        # The split yields empty strings and whitespace items; keep real tokens only.
        tokens = [piece for piece in pieces if piece.strip()]
        return [self.str_to_int[token] for token in tokens]

    def decode(self, ids):
        """Convert a sequence of token ids back into text.

        Tokens are joined with single spaces, then the space in front of
        punctuation is removed.

        Raises:
            KeyError: if an id is not present in the vocabulary.
        """
        text = " ".join(self.int_to_str[token_id] for token_id in ids)
        return self._SPACE_BEFORE_PUNCT.sub(r'\1', text)

In [27]:
# Round-trip a short sentence: encode it to token ids, then decode those same
# ids (rather than hard-coding ids that are only valid for one vocabulary).
obj = CustomTokernizerV1(vocab=vocab)
sample_ids = obj.encode("The strain is small")
print(sample_ids)
print(obj.decode(sample_ids))

[93, 933, 584, 904]
The strain is small


# Adding special context tokens

In [46]:
import re
class CustomTokernizerV2:
    """Word-level tokenizer that maps out-of-vocabulary tokens to "<|unk|>".

    Text is split on whitespace, on "--" and on the punctuation characters
    , . : ; ? _ ! " ( ) ' and every remaining token is looked up in the
    vocabulary. A token that is missing from the vocabulary is encoded with
    the id of the "<|unk|>" entry.
    """

    # Raw strings keep "\s" a regex escape rather than an invalid Python string
    # escape (a SyntaxWarning on recent Python versions).
    _SPLIT_PATTERN = re.compile(r'([,.:;?_!"()\']|--|\s)')
    # Whitespace that decode() inserted in front of punctuation. ":" and ";"
    # are listed because encode() emits them as separate tokens as well.
    _SPACE_BEFORE_PUNCT = re.compile(r'\s+([,.:;?!"()\'])')

    def __init__(self, vocab):
        """Store the token -> id mapping and build the reverse id -> token mapping.

        Args:
            vocab: dict mapping each token string to its integer id.
        """
        self.str_to_int = vocab
        self.int_to_str = {token_id: token for token, token_id in vocab.items()}

    def read_data(self, path):
        """Return the full contents of the UTF-8 text file at ``path``."""
        with open(path, "r", encoding="utf-8") as f:
            return f.read()

    def encode(self, raw_text):
        """Convert text into a list of token ids.

        Args:
            raw_text: the string to tokenize.

        Returns:
            List of ids, one per non-whitespace token, in order of appearance;
            tokens missing from the vocabulary get the id of "<|unk|>".

        Raises:
            KeyError: if an unknown token is met and the vocabulary has no
                "<|unk|>" entry.
        """
        pieces = self._SPLIT_PATTERN.split(raw_text)
        # The split yields empty strings and whitespace items; keep real tokens only.
        tokens = [piece for piece in pieces if piece.strip()]
        lookup = self.str_to_int
        # "<|unk|>" is looked up only when needed, so a vocabulary without it
        # still works for text made entirely of known tokens.
        return [
            lookup[token] if token in lookup else lookup["<|unk|>"]
            for token in tokens
        ]

    def decode(self, ids):
        """Convert a sequence of token ids back into text.

        Tokens are joined with single spaces, then the space in front of
        punctuation is removed.

        Raises:
            KeyError: if an id is not present in the vocabulary.
        """
        text = " ".join(self.int_to_str[token_id] for token_id in ids)
        return self._SPACE_BEFORE_PUNCT.sub(r'\1', text)

In [47]:
# Two independent snippets, joined by the end-of-text marker. "hello" and
# "palace" show up as <|unk|> in the decoded output below.
text1 = "hello, The strain is small"
text2 = "In the sunlit terraces of the palace."
text = " <|endoftext|> ".join((text1, text2))

obj = CustomTokernizerV2(vocab=vocab)
ids = obj.encode(text)
decoded = obj.decode(ids)
print(ids)
print(decoded)

[1131, 5, 93, 933, 584, 904, 1130, 55, 988, 956, 984, 722, 988, 1131, 7]
<|unk|>, The strain is small <|endoftext|> In the sunlit terraces of the <|unk|>.


## Other special tokens:
1. BOS: Beginning of sequence
2. EOS: End of sequence
3. PAD: Padding, added to shorter texts so that texts of different lengths all end up the same size