# Text Tokenizers

In [None]:
#| default_exp text.tokenizers

In [None]:
#| hide
%load_ext autoreload
%autoreload 2
from nbdev.showdoc import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Phonemizer

In [None]:
#| export
from phonemizer.backend import EspeakBackend
from phonemizer.backend.espeak.language_switch import LanguageSwitch
from phonemizer.backend.espeak.words_mismatch import WordMismatch
from phonemizer.punctuation import Punctuation
from phonemizer.separator import Separator
from phonemizer import phonemize
from torch.utils.data import DataLoader
from plum import dispatch
from typing import List, Tuple

Assumes the espeak backend is installed, e.g. via `apt-get install espeak`.

In [None]:
#| export
class Phonemizer():
    """Callable that stores phonemization options and forwards them to `phonemize`.

    Calling the instance with a string returns the value produced by `phonemize`
    for that string; calling it with a list of strings returns a list with one
    such value per input string.
    """
    def __init__(self,
        separator=Separator(word=" ", syllable="|", phone=None), # separator
        language='en-us', # language
        backend='espeak', # phonemization backend (espeak)
        strip=True, # strip
        preserve_punctuation=True # preserve punctuation
        ):
        self.separator, self.language, self.backend = separator, language, backend
        self.strip, self.preserve_punctuation = strip, preserve_punctuation

    def _run(self, text, n_jobs):
        # Single place where one string and every stored option reach `phonemize`.
        options = dict(
            language=self.language,
            backend=self.backend,
            separator=self.separator,
            strip=self.strip,
            preserve_punctuation=self.preserve_punctuation,
            njobs=n_jobs,
        )
        return phonemize(text, **options)

    @dispatch
    def __call__(self, text:str, n_jobs=1)->str:
        """Phonemize a single string; `n_jobs` is passed on as `njobs`."""
        return self._run(text, n_jobs)

    @dispatch
    def __call__(self, texts:List[str], n_jobs=1)->List[str]:
        """Phonemize every string of `texts` with its own `phonemize` call."""
        return [self._run(text, n_jobs) for text in texts]

## Usage

In [None]:
p = Phonemizer()

# str -> str
text = "oh shoot I missed my train"
print(p(text))

# List[str] -> List[str]
texts = ["Oh Dear, you'll be fine!", "this is it"]
print(p(texts))

oʊ ʃuːt aɪ mɪst maɪ tɹeɪn
['oʊ dɪɹ, juːl biː faɪn!', 'ðɪs ɪz ɪt']


## Tokenizer

Requires the spaCy model for the chosen language to be downloaded first, e.g. `python -m spacy download en_core_web_sm` (the model that `language='en'` is mapped to below).

In [None]:
#| export
import torch
from collections import Counter
import torchtext
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from collections import Counter
from torchtext.datasets import AG_NEWS
from typing import Iterable, List, Tuple
from torch.nn.utils.rnn import pad_sequence

In [None]:
#| export 

class Tokenizer:
    """Tokenizer built on `get_tokenizer`, with overloads for several input kinds.

    `__call__` accepts a string, a list of strings, or an iterable of
    `(index, text)` pairs; `inverse` joins tokens back with single spaces.
    """
    def __init__(self, backend='spacy', language='en'):
        # the short code 'en' is replaced by the model name 'en_core_web_sm'
        model = 'en_core_web_sm' if language == 'en' else language
        self.tokenizer = get_tokenizer(backend, language=model)

    @dispatch
    def __call__(self, text:str)->List[str]:
        """Tokenize one string."""
        tokenize = self.tokenizer
        return tokenize(text)

    @dispatch
    def __call__(self, texts:List[str])->List[List[str]]:
        """Tokenize each string of `texts`, keeping their order."""
        return list(map(self.tokenizer, texts))

    @dispatch # to replace Iterable
    # works with agnews type of dataset [(index, text)]
    def __call__(self, data_iter:Iterable)->Iterable:
        """Lazily yield the tokens of the text in each `(index, text)` pair."""
        for pair in data_iter:
            _, text = pair
            yield self.tokenizer(text)

    @dispatch
    def inverse(self, tokens:List[str])->str:
        """Join `tokens` into one string, separated by single spaces."""
        # TODO: take care of white spaces
        separator = ' '
        return separator.join(tokens)

    @dispatch
    def inverse(self, list_of_tokens:List[List[str]])->List[str]:
        """Join every token list of `list_of_tokens` with single spaces."""
        return [' '.join(tokens) for tokens in list_of_tokens]

## Usage

In [None]:
# default tokenizer: 'spacy' backend with language 'en'
tok = Tokenizer()

In [None]:
# str -> List[str], and back to a space-joined string
s = "Oh, yeah I don't know dude..."
print(s)
tokenized = tok(s)
print(tokenized)
print(tok.inverse(tokenized))

# List[str] -> List[List[str]], and back to one string per row
s = ["Oh, yeah I don't know dude...", "this is a test"]
tokenized = tok(s)
print(tokenized)
print(tok.inverse(tokenized))

# Iterable -> Iterable
ds = AG_NEWS(split='test')  # data pipe of (label, text) samples
sample = next(iter(ds))
it = tok(ds)  # lazy: nothing is tokenized until `it` is consumed
tokens = list(it)
print(tokens[:2])

Oh, yeah I don't know dude...
['Oh', ',', 'yeah', 'I', 'do', "n't", 'know', 'dude', '...']
Oh , yeah I do n't know dude ...
[['Oh', ',', 'yeah', 'I', 'do', "n't", 'know', 'dude', '...'], ['this', 'is', 'a', 'test']]
["Oh , yeah I do n't know dude ...", 'this is a test']
[['Fears', 'for', 'T', 'N', 'pension', 'after', 'talks', 'Unions', 'representing', 'workers', 'at', 'Turner', '  ', 'Newall', 'say', 'they', 'are', "'", 'disappointed', "'", 'after', 'talks', 'with', 'stricken', 'parent', 'firm', 'Federal', 'Mogul', '.'], ['The', 'Race', 'is', 'On', ':', 'Second', 'Private', 'Team', 'Sets', 'Launch', 'Date', 'for', 'Human', 'Spaceflight', '(', 'SPACE.com', ')', 'SPACE.com', '-', 'TORONTO', ',', 'Canada', '--', 'A', 'second\\team', 'of', 'rocketeers', 'competing', 'for', 'the', ' ', '#', '36;10', 'million', 'Ansari', 'X', 'Prize', ',', 'a', 'contest', 'for\\privately', 'funded', 'suborbital', 'space', 'flight', ',', 'has', 'officially', 'announced', 'the', 'first\\launch', 'date', 'for',

## Numericalizer

In [None]:
#| export
# TODO: add more special characters
class Numericalizer():
    """Map tokens to integer ids, and ids back to tokens, through a torchtext vocabulary.

    The vocabulary is built at construction time from `tokens_iter`, which is
    handed to `torchtext.vocab.build_vocab_from_iterator` together with `specials`.
    """
    def __init__(self, tokens_iter:Iterable, specials=["<pad>", "<unk>", "<bos>", "<eos>"]):
        self._vocab = self.build_map_from_iter(tokens_iter, specials)

    def build_map_from_iter(self,data_iter:Iterable, specials=None):
        """Build, store and return the vocabulary for `data_iter`.

        When `specials` contains "<unk>", its index becomes the default index of
        the vocabulary. `specials=None` builds a vocabulary without that default.
        """
        self._vocab = torchtext.vocab.build_vocab_from_iterator(data_iter, specials=specials)
        # `specials` defaults to None, and `"<unk>" in None` raises TypeError
        if specials is not None and "<unk>" in specials:
            self._vocab.set_default_index(self._vocab["<unk>"])
        return self._vocab

    @dispatch
    def __call__(self, texts:List[str])->List[int]:
        """Return the id of every token in `texts`."""
        # TODO: check self._vocab has been built
        return [self._vocab[text] for text in texts]

    @dispatch
    def __call__(self, texts:List[List[str]])->List[List[int]]:
        """Return one list of ids per row of `texts`."""
        return [[self._vocab[text] for text in row] for row in texts]

    @dispatch
    def __call__(self, text:str)->int:
        """Return the id of a single token."""
        return self._vocab[text]

    @property
    def vocab(self):
        """The underlying vocabulary object."""
        return(self._vocab)

    @dispatch
    def inverse(self, idx:int)->str:
        """Return the token stored at position `idx` of the index-to-token list."""
        return self._vocab.get_itos()[idx]

    @dispatch
    def inverse(self, indices:List[int])->List[str]:
        """Return the token of every id in `indices`."""
        # fetch the index-to-token list once instead of once per index
        itos = self._vocab.get_itos()
        return [itos[i] for i in indices]
    

## Usage

In [None]:
tok = Tokenizer()

def token_iterator(data_iter:Iterable)->Iterable:
    """Lazily tokenize the text of each `(index, text)` pair, the layout of the agnews dataset."""
    yield from (tok(text) for _, text in data_iter)

# the numericalizer builds its vocabulary by consuming the token iterator
tok_it = token_iterator(ds)
num = Numericalizer(tok_it)

In [None]:
# ids assigned to the '<pad>' and '<unk>' special tokens
print(num('<pad>'), num('<unk>'))

0 1


In [None]:
# a direct vocabulary lookup and the str overload print the same id
print(num.vocab['the'])
print(num('the'))
# List[str] -> one id per token; the made-up token 'lkjsdf' prints the same id as '<unk>'
print(num(['<bos>', '<pad>', '<unk>', 'a', 'this', 'the', 'lkjsdf']))
# id -> token, for a single id and for a list of ids
print(num.inverse(0))
print(num.inverse([6,55]))
# List[List[str]] -> one list of ids per row
print(num([['<bos>', '<pad>'], ['<unk>', 'a', 'this', 'the', 'lkjsdf']]))

4
4
[2, 0, 1, 9, 58, 4, 1]
<pad>
['.', 'Monday']
[[2, 0], [1, 9, 58, 4, 1]]


In [None]:
# numericalizing row by row and numericalizing the nested list at once print the same ids
tokens = tok(["here we go. asdflkj", "it was time..."])
print(tokens)
print([num(row) for row in tokens])
print(num(tokens))



[['here', 'we', 'go', '.', 'asdflkj'], ['it', 'was', 'time', '...']]
[[534, 1040, 310, 6, 1], [34, 40, 101, 67]]
[[534, 1040, 310, 6, 1], [34, 40, 101, 67]]


In [None]:
#| export
class TextCollater:
    """Collate raw texts into a padded batch of token ids plus the unpadded lengths.

    `tokenizer` is called with a list of strings and `numericalizer` with the
    tokenizer's result; the numericalizer must return one list of integer ids
    per text.
    """
    def __init__(self,
                 tokenizer,
                 numericalizer,
                 padding_value:int=0
                ):
        self._numericalizer = numericalizer
        self._tokenizer = tokenizer
        self.padding_value = padding_value

    def collate_list(self, texts:List[str])->Tuple[torch.Tensor, torch.Tensor]:
        """Tokenize, numericalize and pad `texts`.

        Returns `(text_pad, text_lens)`: `text_pad` is a long tensor with one row
        per text, filled with `padding_value` up to the longest text, and
        `text_lens` is a long tensor holding the number of ids of each text.
        An empty `texts` gives tensors of shape `(0, 0)` and `(0,)`.
        """
        if len(texts) == 0:
            # `pad_sequence` rejects an empty list, so the empty batch is built by hand
            return torch.empty((0, 0), dtype=torch.long), torch.empty((0,), dtype=torch.long)
        ids_per_text = self._numericalizer(self._tokenizer(texts))
        token_list = [torch.tensor(ids, dtype=torch.long) for ids in ids_per_text]
        text_lens = torch.tensor([ids.shape[0] for ids in token_list], dtype=torch.long)
        text_pad = pad_sequence(token_list, batch_first=True, padding_value=self.padding_value)
        return text_pad, text_lens

    def collate_agnews(self, batch)->Tuple[torch.Tensor, torch.Tensor]:
        """Collate a batch whose items carry their text at position 1, e.g. `(label, text)`."""
        return self.collate_list([item[1] for item in batch])

In [None]:
texts = ["this is it...", "this is the second sentence."]

# step by step: tokens first, then ids
texts_tok = tok(texts)
print(texts_tok)
texts_ids = num(texts_tok)
print(texts_ids)

# the collater runs both steps, pads to the longest text and also returns the lengths
collater = TextCollater(tok, num)
print(collater.collate_list(texts))

# batches drawn from `ds` are collated with the agnews variant
dl = DataLoader(dataset=ds, batch_size=2, shuffle=True, collate_fn=collater.collate_agnews)

[['this', 'is', 'it', '...'], ['this', 'is', 'the', 'second', 'sentence', '.']]
[[58, 27, 34, 67], [58, 27, 4, 95, 3714, 6]]
(tensor([[  58,   27,   34,   67,    0,    0],
        [  58,   27,    4,   95, 3714,    6]]), tensor([4, 6]))


In [None]:
b = next(iter(dl))
print('batch: ', b)
tokens, lens = b[0], b[1]
# the loop variable is `length`, not `len`: at notebook top level a variable
# named `len` replaces the builtin for every cell that runs afterwards
for token, length in zip(tokens, lens):
    ids = token[:length].tolist()  # keep only the first `length` ids of the row
    print(ids)
    print(num.inverse(ids))

batch:  (tensor([[ 3894,  6448,    13,   532,   179,  9855,  1683,    14,   932,    17,
          3586,    18,   453,    11,  3894,   371,  2260,     5,     4,   100,
            16,    19,   171,   284,     8,   312,  1820,     5,  1406,    42,
            22,   454,   401,  2514,    28,     4,    63,    27,  2297,  3327,
            21,    41,  2455,  2275,   797,    69],
        [ 3592,  6422,  3288,     7,  1359,   928,    15,  2846,    17,    36,
            18,     8,   212,   119,  2290,    71,   157,    15,  3202,  1163,
          8066,    60,    13,  7023,   971,    20,    15,  1673,    10,  2735,
          6044,    39,  2951, 10519,   194,     4,    15,   873,  8398,  1748,
             6,     0,     0,     0,     0,     0]]), tensor([46, 41]))
[3894, 6448, 13, 532, 179, 9855, 1683, 14, 932, 17, 3586, 18, 453, 11, 3894, 371, 2260, 5, 4, 100, 16, 19, 171, 284, 8, 312, 1820, 5, 1406, 42, 22, 454, 401, 2514, 28, 4, 63, 27, 2297, 3327, 21, 41, 2455, 2275, 797, 69]
['Vodafone', 'D

In [None]:
#| hide
# nbdev boilerplate: run nbdev's export step (presumably writes the `#| export` cells to the `default_exp` module - confirm against the nbdev docs)
import nbdev; nbdev.nbdev_export()