In [1]:
import numpy as np
from typing import List, Dict

class SimpleTokenizer:
    """Whitespace tokenizer backed by a vocabulary learned from example texts.

    Text is split with ``str.split()`` (any run of whitespace separates
    tokens; no lower-casing or punctuation handling is performed). Four
    special tokens are always registered first, in this order, so on a
    fresh instance they receive ids 0-3: ``<PAD>``, ``<UNK>``, ``<BOS>``,
    ``<EOS>``.

    Attributes:
        word_to_id: Mapping from token string to integer id.
        id_to_word: Reverse mapping from integer id to token string.
        vocab_size: Number of registered tokens (also the next free id).
        pad_token, unk_token, bos_token, eos_token: Special token strings.
    """

    def __init__(self):
        """Create a tokenizer with an empty vocabulary."""
        self.word_to_id: Dict[str, int] = {}
        self.id_to_word: Dict[int, str] = {}
        # Doubles as the id that the next new token will receive.
        self.vocab_size = 0

        self.pad_token = "<PAD>"
        self.unk_token = "<UNK>"
        self.bos_token = "<BOS>"
        self.eos_token = "<EOS>"

    def _add_token(self, token: str) -> None:
        """Register ``token`` under the next free id unless it is already known."""
        if token in self.word_to_id:
            return
        self.word_to_id[token] = self.vocab_size
        self.id_to_word[self.vocab_size] = token
        self.vocab_size += 1

    def build_vocab(self, texts: List[str]) -> None:
        """Add the special tokens and every unseen word in ``texts`` to the vocabulary.

        Ids are assigned in order of first appearance. The method may be
        called more than once to grow the vocabulary incrementally: tokens
        that are already registered (including the special tokens) keep
        their existing ids, so ``word_to_id`` and ``id_to_word`` stay in
        sync and ``vocab_size`` always equals the number of entries.

        Args:
            texts: Training sentences; each is split on whitespace.
        """
        special_tokens = (
            self.pad_token,
            self.unk_token,
            self.bos_token,
            self.eos_token,
        )

        # Special tokens go first so they occupy the lowest ids on a fresh
        # vocabulary; on later calls they are already present and skipped.
        for token in special_tokens:
            self._add_token(token)

        for text in texts:
            for w in text.split():
                self._add_token(w)

    def encode(self, text: str) -> List[int]:
        """Convert ``text`` to token ids wrapped in ``<BOS>`` ... ``<EOS>``.

        Words that are not in the vocabulary map to the ``<UNK>`` id.

        Args:
            text: Sentence to encode; split on whitespace.

        Returns:
            ``[bos_id, *word_ids, eos_id]``.

        Raises:
            KeyError: If the special tokens have not been registered yet,
                i.e. ``build_vocab`` has not been called.
        """
        unk_id = self.word_to_id[self.unk_token]

        ids = [self.word_to_id[self.bos_token]]
        ids.extend(self.word_to_id.get(w, unk_id) for w in text.split())
        ids.append(self.word_to_id[self.eos_token])
        return ids

    def decode(self, ids: List[int]) -> str:
        """Convert token ids back to a space-joined string.

        ``<PAD>``, ``<BOS>`` and ``<EOS>`` are dropped from the result.
        ``<UNK>`` is kept, and ids that are not in the vocabulary are
        rendered as ``<UNK>`` as well.

        Args:
            ids: Token ids to decode.

        Returns:
            The decoded words joined by single spaces.
        """
        skipped = {self.pad_token, self.bos_token, self.eos_token}

        words = []
        for i in ids:
            word = self.id_to_word.get(i, self.unk_token)
            if word not in skipped:
                words.append(word)

        return " ".join(words)


In [2]:
# Example training texts
texts = [
    "I love NLP",
    "NLP is fun",
    "I love machine learning",
]

# Start from a fresh tokenizer so re-running this cell begins with an empty vocabulary.
tokenizer = SimpleTokenizer()

# Register the special tokens first, then every new word in order of first appearance.
tokenizer.build_vocab(texts)

# Show the vocabulary; the token column is padded to 15 characters so the ids line up.
print("Word → ID mapping:")
for word, idx in tokenizer.word_to_id.items():
    print(f"{word:15} -> {idx}")

print("\nVocabulary size:", tokenizer.vocab_size)


Word → ID mapping:
<PAD>           -> 0
<UNK>           -> 1
<BOS>           -> 2
<EOS>           -> 3
I               -> 4
love            -> 5
NLP             -> 6
is              -> 7
fun             -> 8
machine         -> 9
learning        -> 10

Vocabulary size: 11
