# **Tokenizing Text**

In [34]:
import re
from typing import Dict, List

In [15]:
# reading the text data
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

In [None]:
all_words = sorted(set([m.group(0) for m in re.finditer(r'\w+|([^\w\s])\1*', raw_text)]))

all_tokens.extend(["<|endoftext|>", "<|unk|>"]) 

#
vocab = {token:integer for integer, token in enumerate(all_words)}

In [35]:
class SimpleTokenizerV1:
    def __init__(self, vocab: Dict[str, int]):
        self.str_to_int: Dict[str, int] = vocab  # {"!": 0, ...}
        self.int_to_str: Dict[int, str] = {i: s for s, i in vocab.items()}  # {0: '!',...}

    def encode(self, text: str) -> List[int]:
        # Create Tokens
        preprocessed: List[str] = [m.group(0) for m in re.finditer(r'\w+|([^\w\s])\1*', text)]  # ["txt1","txt2",...]

        # Look for tokens in the str_to_int and get the id
        # ["Hello", ...] -> {..., "Hello":5} -> [5, ...]
        token_ids: List[int] = [self.str_to_int[s] for s in preprocessed]
        return token_ids

    def decode(self, ids: List[int]) -> str:
        # Look for ID in the int_to_str and get the token
        # [5, ...] -> {..., 5:"Hello"} -> "Hello..." 
        text: str = " ".join([self.int_to_str[i] for i in ids])
        text = re.sub(r"\s+([^\w\s])", r"\1", text)
        return text

In [36]:
tokenizer = SimpleTokenizerV1(vocab) 
text = """"It's the last he painted, you know,"  
Mrs. Gisburn said with pardonable pride.""" 
ids = tokenizer.encode(text) 
print(ids)

[1, 57, 2, 868, 1007, 616, 547, 761, 5, 1145, 610, 5, 1, 68, 8, 39, 869, 1127, 770, 811, 8]


In [37]:
tokenizer.decode(ids)

'" It\' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.'

In [38]:
tokenizer.encode("Hello")

KeyError: 'Hello'