# char-tokenizer.ipynb

> Implementation of a character-level tokenizer.

In [None]:
#| default_exp tokenizers.char_tokenizer

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| hide
from fastcore.test import *

In [None]:
#| export 
from typing import Callable, Dict, Iterable, Tuple

In [None]:
# | export
# Type aliases for the pieces returned by `create_character_tokenizer`.
SToI = Dict[str, int]  # character -> integer id
IToS = Dict[int, str]  # integer id -> character
EncodeFn = Callable[[str], Iterable[int]]  # text -> ids
DecodeFn = Callable[[Iterable[int]], str]  # ids -> text


def create_character_tokenizer(
    text: str,
) -> Tuple[Iterable[str], int, SToI, IToS, EncodeFn, DecodeFn]:
    """Create a character tokenizer from text.

    The vocabulary is the set of distinct characters in `text`. It is sorted
    so that the id assigned to each character depends only on which
    characters occur, not on the order in which they occur.

    Returns a tuple `(chars, vocab_size, stoi, itos, encode, decode)`:

    - `chars`: sorted list of the distinct characters in `text`
    - `vocab_size`: number of distinct characters
    - `stoi`: mapping from character to its index in `chars`
    - `itos`: mapping from index back to character
    - `encode`: turns a string into a list of ids; raises `KeyError` for a
      character that is not in the vocabulary
    - `decode`: turns an iterable of ids into a string; raises `KeyError`
      for an id that is not in the vocabulary
    """
    chars = sorted(set(text))
    vocab_size = len(chars)
    stoi = {ch: i for i, ch in enumerate(chars)}
    itos = dict(enumerate(chars))

    # Named closures instead of lambdas: they get a real `__name__` in
    # tracebacks/reprs and can carry their own docstrings.
    def encode(s: str) -> Iterable[int]:
        """Map each character of `s` to its integer id."""
        return [stoi[c] for c in s]

    def decode(ids: Iterable[int]) -> str:
        """Join the characters corresponding to `ids` into a string."""
        return ''.join(itos[i] for i in ids)

    return chars, vocab_size, stoi, itos, encode, decode

In [None]:
# Tests for create_character_tokenizer
chars, vocab_size, stoi, itos, encode, decode = create_character_tokenizer('abcabc')
test_eq(chars, ['a', 'b', 'c'])
test_eq(vocab_size, 3)
test_eq(stoi, {'a': 0, 'b': 1, 'c': 2})
test_eq(itos, {0: 'a', 1: 'b', 2: 'c'})
test_eq(encode('cab'), [2, 0, 1])
test_eq(decode([2, 1, 0]), 'cba')
# Encoding followed by decoding must give back the original text.
test_eq(decode(encode('abcabc')), 'abcabc')
# Ids come from the sorted vocabulary, not from first-occurrence order.
test_eq(create_character_tokenizer('cba')[2], {'a': 0, 'b': 1, 'c': 2})
# Empty text yields an empty vocabulary rather than an error.
empty_tokenizer = create_character_tokenizer('')
test_eq(empty_tokenizer[0], [])
test_eq(empty_tokenizer[1], 0)


In [None]:
#| hide
# Export the cells marked with `#| export` from this notebook to the library.
import nbdev
nbdev.nbdev_export()