## Tokenization

In [None]:
from urllib.request import urlopen
# read file from url
files = [
    "https://sherlock-holm.es/stories/plain-text/cnus.txt",
    "https://raw.githubusercontent.com/lucko515/rnn-sherlock-holmes-book/refs/heads/master/datasets/holmes.txt"
]
special_characters = '<|endoftext|>'

data = ""
for file in files:
    file_url = file
    print(f"reading file from: {file}")
    url_data:str = urlopen(file_url).read().decode('utf-8')
    data = data + url_data + special_characters
    
print(data)

reading file from: https://sherlock-holm.es/stories/plain-text/cnus.txt
reading file from: https://raw.githubusercontent.com/lucko515/rnn-sherlock-holmes-book/refs/heads/master/datasets/holmes.txt




                          THE COMPLETE SHERLOCK HOLMES

                               Arthur Conan Doyle



                                Table of contents

               A Study In Scarlet

               The Sign of the Four

                  The Adventures of Sherlock Holmes
               A Scandal in Bohemia
               The Red-Headed League
               A Case of Identity
               The Boscombe Valley Mystery
               The Five Orange Pips
               The Man with the Twisted Lip
               The Adventure of the Blue Carbuncle
               The Adventure of the Speckled Band
               The Adventure of the Engineer's Thumb
               The Adventure of the Noble Bachelor
               The Adventure of the Beryl Coronet
               The Adventure o

In [45]:
import re
# splitting data into words
preprocessed = re.split(r'(--|-{5}|-{10}|\.{3}|[.,:;_"\'()&!?$#@+%^*\-=/`\[\]]|\s|//|"<\|endoftext\|>")', data)
preprocessed = [item.strip() for item in preprocessed if item.strip()]

In [46]:
# creating vocabilities
all_words = sorted(set(preprocessed))
all_words.append('<|unk|>')
vocabs:dict = {word:token for token,word in enumerate(all_words)}
vocabs.items()



In [49]:
# creating encoders and decoders
# encoder: text -> token -> token ids
# decoder: token ids -> token -> text

class WordTokenizer:
    def __init__(self, vocabs):
        self.token_token_ids = vocabs
        self.token_ids_token = {token_ids:token for token,token_ids in vocabs.items()}
    
    def encode(self, text:str):
        preprocessed_text = re.split(r'(--|-{5}|-{10}|\.{3}|[.,:;_"\'()&!?$%^*\-=`]|\s|"<\|endoftext\|>")', text)
        
        preprocessed_text = [item.strip() for item in preprocessed_text if item.strip()]
        
        preprocessed_text = [
            item if item in self.token_token_ids else "<|unk|>" for item in preprocessed_text
        ]
        
        token_ids = [self.token_token_ids[token] for token in preprocessed_text]
        return token_ids
    
    def decode(self, ids:list):
        text = " ".join([self.token_ids_token[token_id] for token_id in ids])
        text = re.sub(r'\s+([.,:;_"\'()&!?$%^*\-=`])', r'\1', text)
        return text

In [51]:
tokenizer = WordTokenizer(vocabs)
text = """No, I do not think so. I think that there was probably some more
tangible cause. And now, Miss Stoner, we must leave you for if
Dr. Roylott returned and saw us our journey would be in vain.
Good-bye, and be brave, for if you will do what I have told you,
you may rest assured that we shall soon drive away the dangers
that threaten you."""
encoded_text = tokenizer.encode(text)
decoded_text = tokenizer.decode(encoded_text)
print(encoded_text)
print(decoded_text)

[2638, 10, 1928, 8345, 13421, 18261, 17043, 13, 1928, 18261, 18216, 18237, 19552, 14798, 17107, 13066, 18055, 6166, 13, 365, 13451, 10, 2471, 3505, 10, 19598, 13182, 12165, 20028, 9799, 11128, 1244, 13, 3168, 15800, 4558, 16186, 19194, 13719, 11860, 19935, 5172, 11273, 19235, 13, 1665, 11, 5952, 10, 4558, 5172, 5683, 10, 9799, 11128, 20028, 19788, 8345, 19675, 1928, 10716, 18443, 20028, 10, 20028, 12735, 15747, 4874, 18216, 19598, 16512, 17123, 8513, 5013, 18219, 7582, 18216, 18305, 20028, 13]
No, I do not think so. I think that there was probably some more tangible cause. And now, Miss Stoner, we must leave you for if Dr. Roylott returned and saw us our journey would be in vain. Good- bye, and be brave, for if you will do what I have told you, you may rest assured that we shall soon drive away the dangers that threaten you.
