# Text Tokenizers

In [None]:
#| default_exp text.tokenizers

In [None]:
#| hide
%load_ext autoreload
%autoreload 2
from nbdev.showdoc import *

## Phonemizer

In [None]:
#| export
from phonemizer.backend import EspeakBackend
from phonemizer.backend.espeak.language_switch import LanguageSwitch
from phonemizer.backend.espeak.words_mismatch import WordMismatch
from phonemizer.punctuation import Punctuation
from phonemizer.separator import Separator
from phonemizer import phonemize
from torch.utils.data import DataLoader
from multipledispatch import dispatch
from typing import List, Tuple

Assumes the espeak backend is installed, e.g. via `apt-get install espeak`.

In [None]:
#| export
class Phonemizer():
    """Callable wrapper around `phonemizer.phonemize`.

    The phonemization settings are stored once at construction time and
    reused for every call. Calling the instance with a `str` returns the
    result of a single `phonemize` call; calling it with a `list` runs
    `phonemize` once per element and returns the results as a list.
    """
    def __init__(self,
        separator=Separator(word=" ", syllable="|", phone=None), # forwarded to `phonemize` as `separator`
        language='en-us', # forwarded to `phonemize` as `language`
        backend='espeak', # forwarded to `phonemize` as `backend`
        strip=True, # forwarded to `phonemize` as `strip`
        preserve_punctuation=True # forwarded to `phonemize` as `preserve_punctuation`
        ):
        self.separator = separator
        self.language = language
        self.backend = backend
        self.strip = strip
        self.preserve_punctuation = preserve_punctuation

    def _phonemize(self, text, n_jobs):
        """Run `phonemize` on `text` with the stored settings and `njobs=n_jobs`."""
        settings = dict(
            language=self.language,
            backend=self.backend,
            separator=self.separator,
            strip=self.strip,
            preserve_punctuation=self.preserve_punctuation,
            njobs=n_jobs,
        )
        return phonemize(text, **settings)

    # NOTE(review): the overloads below are registered on one positional
    # argument only, so `n_jobs` presumably has to be passed by keyword — confirm.
    @dispatch(str)
    def __call__(self, text:str, n_jobs=1)->str:
        """Phonemize a single string."""
        return self._phonemize(text, n_jobs)

    @dispatch(list)
    def __call__(self, texts:List[str], n_jobs=1)->List[str]:
        """Phonemize each string of `texts` separately, preserving order."""
        phonemized = []
        for item in texts:
            phonemized.append(self._phonemize(item, n_jobs))
        return phonemized

## Usage

In [None]:
# Phonemize a batch of two sentences with the default settings ('en-us', 'espeak').
p = Phonemizer()
text = ["Oh Dear! This suck...\n We'll be fine!", "this is it"]
# A list input goes through the list overload: one `phonemize` call per element.
print(p(text))

['oʊ dɪɹ! ðɪs sʌk...\nwiːl biː faɪn!', 'ðɪs ɪz ɪt']


In [None]:
#| hide
# Scratch cell: build a word -> phones lexicon with the raw espeak backend,
# then phonemize the whole text with the top-level `phonemize` helper.
text = "Oh Dear! This suck...\n We'll be fine!"
text = Punctuation(';:,.!"?()-').remove(text)
print("text:", text)
# `text` is a single string, so it must be split into lines before splitting
# into words; iterating the string directly yields characters and produces a
# per-letter lexicon instead of a per-word one.
words = {w.lower() for line in text.splitlines() for w in line.strip().split(' ') if w}
print("words:", words)
# initialize the espeak backend for English
backend = EspeakBackend('en-us')

# separate phones by a space and ignore word boundaries
separator = Separator(phone=' ', word=None)
# build the lexicon by phonemizing each word one by one. The backend.phonemize
# function expects a list as input and outputs a list.
lexicon = {
    word: backend.phonemize([word], separator=separator, strip=True)[0]
    for word in words}
print("lexicon: ", lexicon)
# switch to word-level separation for the full-text phonemization below
separator=Separator(word=" ", syllable="|", phone=None)

phn = phonemize(
    text,
    language='en-us',
    backend='espeak',
    separator=separator,
    strip=True,
    preserve_punctuation=True,
    njobs=4)
print(phn)

text: Oh Dear This suck We'll be fine
words: {'o', 't', 'e', 'a', 'r', 'f', 'i', 'd', 'k', 'w', "'", 'l', 'c', 'h', 'n', 's', 'b', 'u'}
lexicon:  {'o': 'oʊ', 't': 't iː', 'e': 'iː', 'a': 'eɪ', 'r': 'ɑːɹ', 'f': 'ɛ f', 'i': 'aɪ', 'd': 'd iː', 'k': 'k eɪ', 'w': 'd ʌ b əl j uː', "'": '', 'l': 'ɛ l', 'c': 's iː', 'h': 'eɪ tʃ', 'n': 'ɛ n', 's': 'ɛ s', 'b': 'b iː', 'u': 'j uː'}
oʊ dɪɹ ðɪs sʌk wiːl biː faɪn


## Tokenizer

Requires downloading the spaCy model for the target language, e.g. `python -m spacy download en_core_web_sm` for English (the model name used when `language='en'`).

In [None]:
#| export
import torch
from collections import Counter
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from collections import Counter
from torchtext.datasets import AG_NEWS
from typing import Iterable, List, Tuple
from torch.nn.utils.rnn import pad_sequence

In [None]:
#| export 

class Tokenizer:
    """Callable wrapper around the tokenizer returned by torchtext's `get_tokenizer`.

    The instance dispatches on the type of its argument:
    a `str` is tokenized directly, a `list` is tokenized element by element,
    and any other object is iterated as `(_, text)` pairs and tokenized lazily.
    """
    def __init__(self, backend='spacy', language='en'):
        # the short code 'en' is replaced by the model name 'en_core_web_sm'
        if language == 'en':
            language = 'en_core_web_sm'
        self.tokenizer = get_tokenizer(backend, language=language)

    @dispatch(str)
    def __call__(self, text:str)->List[str]:
        """Tokenize a single string."""
        return self.tokenizer(text)
    
    @dispatch(object) # to replace Iterable
    # works with agnews type of dataset [(index, text)]
    def __call__(self, data_iter:Iterable)->Iterable[List[str]]:
        """Lazily tokenize the second field of each 2-item row of `data_iter`."""
        # generator: the first field of each row is ignored
        for _, text in data_iter:
            yield self.tokenizer(text)
    
    @dispatch(list)
    def __call__(self, texts:List[str])->List[List[str]]:
        """Tokenize every string of `texts`, returning one token list per string."""
        return [self.tokenizer(text) for text in texts]

    def inverse(self, tokens:List[str])->str:
        """Join `tokens` back into one string, separated by single spaces."""
        # TODO: take care of white spaces
        return ' '.join(tokens)

## Usage

In [None]:
tok = Tokenizer()

# 1) single string: tokenize, then join the tokens back with `inverse`
s = "Oh, yeah\n I don't know dude..."
tokenized = tok(s)
print(s)
print(tokenized)
print(tok.inverse(tokenized))

# 2) list of strings: one token list per string
s = ["Oh, yeah I don't know dude...", "this is a test"]
tokenized = tok(s)
print(tokenized)

# 3) iterable of (label, text) rows: tokenized lazily, materialised here
ds = AG_NEWS(split='test') # data pipe
sample = next(iter(ds)) # (label, text)
print(sample)
it = tok(ds)
tokens = list(it)
print(tokens[:2])

Oh, yeah
 I don't know dude...
['Oh', ',', 'yeah', '\n ', 'I', 'do', "n't", 'know', 'dude', '...']
Oh , yeah 
  I do n't know dude ...
[['Oh', ',', 'yeah', 'I', 'do', "n't", 'know', 'dude', '...'], ['this', 'is', 'a', 'test']]
(3, "Fears for T N pension after talks Unions representing workers at Turner   Newall say they are 'disappointed' after talks with stricken parent firm Federal Mogul.")
[['Fears', 'for', 'T', 'N', 'pension', 'after', 'talks', 'Unions', 'representing', 'workers', 'at', 'Turner', '  ', 'Newall', 'say', 'they', 'are', "'", 'disappointed', "'", 'after', 'talks', 'with', 'stricken', 'parent', 'firm', 'Federal', 'Mogul', '.'], ['The', 'Race', 'is', 'On', ':', 'Second', 'Private', 'Team', 'Sets', 'Launch', 'Date', 'for', 'Human', 'Spaceflight', '(', 'SPACE.com', ')', 'SPACE.com', '-', 'TORONTO', ',', 'Canada', '--', 'A', 'second\\team', 'of', 'rocketeers', 'competing', 'for', 'the', ' ', '#', '36;10', 'million', 'Ansari', 'X', 'Prize', ',', 'a', 'contest', 'for\\private

## Numericalizer

In [None]:
#| export
# TODO: add more special characters
class Numericalizer():
    """Convert text to vocabulary indices and back.

    The vocabulary is built at construction time by tokenizing every row of
    `data_iter` with `tokenizer` and feeding the token lists to torchtext's
    `build_vocab_from_iterator`.

    Calling the instance with a `str` returns the list of indices for that
    string; calling it with a `list` of strings returns one `type(...)`
    object (a `torch.LongTensor` by default) per string.
    """
    def __init__(self, tokenizer:Tokenizer, data_iter:Iterable, specials=None):
        # tokenizer: used both to build the vocabulary and to tokenize inputs
        # data_iter: rows of (label, text), as consumed by the tokenizer's iterable overload
        # specials: special tokens added to the vocabulary; None means ["<unk>"]
        self._tokenizer = tokenizer
        self._vocab = self.build_map_from_iter(data_iter, specials=specials)

    def build_map_from_iter(self, data_iter:Iterable, specials=None):
        """Build, store and return the vocabulary for the rows of `data_iter`.

        When "<unk>" is among `specials` it becomes the default index, so
        out-of-vocabulary tokens map to it instead of raising.
        """
        # None sentinel instead of a shared mutable list default
        if specials is None:
            specials = ["<unk>"]
        # Passing an iterator (never a list) makes the tokenizer pick its
        # row-wise (label, text) overload and tokenize lazily.
        token_stream = self._tokenizer(iter(data_iter))
        self._vocab = build_vocab_from_iterator(token_stream, specials=specials)
        if "<unk>" in specials:
            self._vocab.set_default_index(self._vocab["<unk>"])
        return self._vocab

    # Dispatch is on the positional argument only; `type` is an ordinary
    # keyword parameter of the list overload.
    @dispatch(list)
    def __call__(self, texts:List[str], type=torch.LongTensor)->List[torch.Tensor]:
        """Numericalize each string of `texts` and wrap each index list with `type`."""
        return [type(self._vocab(self._tokenizer(text))) for text in texts]

    @dispatch(str)
    def __call__(self, text:str)->List[int]:
        """Numericalize a single string into a list of vocabulary indices."""
        return self._vocab(self._tokenizer(text))

    @property
    def vocab(self):
        """The underlying vocabulary object."""
        return self._vocab

    def inverse(self, indices:List[int]):
        """Map `indices` back to tokens and join them with the tokenizer's `inverse`."""
        # fetch the index -> token table once rather than once per index
        itos = self._vocab.get_itos()
        return self._tokenizer.inverse([itos[i] for i in indices])
    

## Usage

In [None]:
# Build the vocabulary from the AG_NEWS test split (`ds`) created in the tokenizer usage cell.
tok = Tokenizer()
num = Numericalizer(tok, ds)

In [None]:
# list input -> one tensor of indices per sentence
print(num(["here we go. asdflkj", "it was time..."]))
# str input -> plain list of indices
print(num("this is it"))
vocab = num.vocab
# token -> index and index -> token lookups
print(vocab.get_stoi()['the'])
print(vocab.get_itos()[1])
# indices -> tokens -> space-joined string
print(num.inverse([55, 24, 31]))

[tensor([ 531, 1037,  307,    3,    0]), tensor([31, 37, 98, 64])]
[55, 24, 31]
1
the
this is it


In [None]:
#| export
class TextCollater:
    """Collate raw texts into a padded batch tensor plus per-item lengths.

    `numericalizer` is called with a list of strings and must return one 1-D
    tensor per string; the tensors are right-padded with `padding_value`.
    """
    def __init__(self,
                 numericalizer,
                 padding_value:int= -1
                 ):
        self._numericalizer = numericalizer
        self.padding_value = padding_value

    def _pad_batch(self, texts):
        """Numericalize `texts`, then return (padded batch, lengths before padding)."""
        sequences = self._numericalizer(texts)
        lengths = torch.LongTensor([seq.shape[0] for seq in sequences])
        padded = pad_sequence(
            sequences,
            batch_first=True,
            padding_value=self.padding_value,
        )
        return padded, lengths

    def collate_list(self, texts:List[str])->Tuple[torch.Tensor, torch.Tensor]:
        """Collate a plain list of strings."""
        return self._pad_batch(texts)

    def collate_agnews(self, batch)->Tuple[torch.Tensor, torch.Tensor]:
        """Collate rows whose second field is the text; other fields are ignored."""
        return self._pad_batch([sample[1] for sample in batch])

In [None]:
collater = TextCollater(num)
texts = ["this is it...", "this is the second sentence."]
# returns (padded index tensor, lengths); shorter rows are filled with the default padding value -1
print(collater.collate_list(texts))
# collate_agnews keeps only the second field (the text) of each dataset row
dl = DataLoader(dataset=ds, batch_size=2, shuffle=True, collate_fn=collater.collate_agnews)

(tensor([[  55,   24,   31,   64,   -1,   -1],
        [  55,   24,    1,   92, 3711,    3]]), tensor([4, 6]))


In [None]:
# Pull one padded batch from the DataLoader and decode each row back to text.
b = next(iter(dl))
print(b)
tokens, lens = b[0], b[1]
# The length is bound to `n`, not `len`: rebinding `len` at notebook top level
# would shadow the builtin for every cell executed afterwards.
for token, n in zip(tokens, lens):
    # drop the padding before printing / decoding
    print(token[:n])
    print(num.inverse(token[:n]))

(tensor([[  161, 12780,   514,  1898,  6079,  3465,   325,    22,   161, 12780,
           514,    24,  3666,     1,    47,  6079,   279,   650, 10158,    10,
            54,     3,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
            -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
            -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
            -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
            -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
            -1],
        [  467,     2,  4583,     2,  5184,     4,   346,  5020,     5,  4389,
          7554,  8169,     8,   312,    90,   198,    64,   197,    27,   962,
            14, 16492,    15,     5,  3503,    17,    46,  6261,    65,     7,
             1,   100,     5,  1143,  8544,   808,  6039,     2,     1,  5020,
             5,  4389,  7554,     2,   246,     4,   487,  8169,    17,   154,
            10,  1669,  3447,     

In [None]:
#| hide
# Run nbdev's export step (writes out the cells marked `#| export`).
import nbdev
nbdev.nbdev_export()