In [2]:
class TokenizerBase:
    """Base class that owns a token -> id vocabulary.

    Subclasses are expected to override ``tokenize`` and ``detokenize``;
    the versions here only raise ``NotImplementedError``.
    """

    def __init__(self):
        # Maps each token to its integer id.
        self.vocab = {}
        # Special tokens in first-registration order, without duplicates.
        self.special_tokens = []

    def tokenize(self, text):
        """Convert ``text`` into token ids. Must be overridden."""
        raise NotImplementedError

    def detokenize(self, tokens):
        """Convert token ids back into text. Must be overridden."""
        raise NotImplementedError

    def _register(self, tokens):
        """Give every token in ``tokens`` that is not yet in the vocab an id.

        The id is ``len(self.vocab) + 1``, bumped upward if that value is
        already used by another token, so two tokens never share an id even
        when the existing ids are not a contiguous 1..n range.
        """
        used_ids = set(self.vocab.values())
        for token in tokens:
            if token in self.vocab:
                continue
            candidate = len(self.vocab) + 1
            while candidate in used_ids:
                candidate += 1
            self.vocab[token] = candidate
            used_ids.add(candidate)

    def add_special_tokens(self, tokens):
        """Record ``tokens`` as special tokens and add them to the vocab.

        Args:
            tokens: Iterable of tokens; one-shot iterators are accepted.
        """
        # Materialize once: the tokens are walked twice below, which would
        # silently skip the vocab update for a generator argument.
        tokens = list(tokens)
        for token in tokens:
            if token not in self.special_tokens:
                self.special_tokens.append(token)
        self._register(tokens)

    def add_tokens(self, tokens):
        """Add every token in ``tokens`` that is not already in the vocab.

        Args:
            tokens: Iterable of tokens.
        """
        self._register(tokens)

    def vocab_size(self):
        """Return the number of entries in the vocabulary."""
        return len(self.vocab)

In [3]:
class SimpleTokenizer(TokenizerBase):
    """Whitespace tokenizer backed by a small fixed starting vocabulary.

    The vocabulary is case-sensitive and can be extended through the
    inherited ``add_tokens`` / ``add_special_tokens`` methods.
    """

    def __init__(self):
        super().__init__()
        # Replace the empty base vocabulary with the starting word -> id table.
        self.vocab = {"The": 1, "quick": 2, "brown": 3, "fox": 4, "jumps": 5,
                      "over": 6, "lazy": 7, "dog": 8}

    def tokenize(self, text):
        """Split ``text`` on whitespace and map each piece to its id.

        Punctuation is not separated from words, so "dog." is looked up
        as-is.

        Returns:
            List of integer ids, one per whitespace-separated piece.

        Raises:
            KeyError: If a piece is not in the vocabulary.
        """
        pieces = text.split()
        return [self.vocab[piece] for piece in pieces]

    def detokenize(self, tokens):
        """Map ids back to their tokens and join them with single spaces.

        Args:
            tokens: Iterable of integer ids as produced by ``tokenize``.

        Returns:
            The space-joined token strings.

        Raises:
            KeyError: If an id has no token in the vocabulary.
        """
        # self.vocab maps token -> id, so decoding needs the reverse mapping.
        # It is rebuilt on every call because the vocab can grow at any time.
        id_to_token = {token_id: token for token, token_id in self.vocab.items()}
        return " ".join(id_to_token[token_id] for token_id in tokens)

In [5]:
# Create the tokenizer and register lowercase "the": the built-in vocabulary
# is case-sensitive and only contains the capitalized "The".
tokenizer = SimpleTokenizer()
tokenizer.add_tokens(["the"])

In [7]:
# Sample sentence. The tokenizer splits on whitespace only, so the last piece
# is "dog." with the period still attached.
text = "The quick brown fox jumps over the lazy dog."

# Encode the sentence into vocabulary ids; tokenize() raises KeyError for any
# whitespace-separated piece that is not in the vocabulary.
tokens = tokenizer.tokenize(text)
print(tokens)

# Intended round trip: turn the ids back into a string and show it.
detokenized_text = tokenizer.detokenize(tokens)
print(detokenized_text)


KeyError: 'the'