# Text Tokenizers

In [None]:
#| default_exp text.tokenizers

In [None]:
#| hide
%load_ext autoreload
%autoreload 2
from nbdev.showdoc import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Phonemizer

In [None]:
#| export
from phonemizer.backend import EspeakBackend
from phonemizer.backend.espeak.language_switch import LanguageSwitch
from phonemizer.backend.espeak.words_mismatch import WordMismatch
from phonemizer.punctuation import Punctuation
from phonemizer.separator import Separator
from phonemizer import phonemize
from torch.utils.data import DataLoader
from multipledispatch import dispatch

Assumes the espeak backend is already installed, e.g. via `apt-get install espeak`.

In [None]:
#| export
class Phonemizer():
    """Callable that stores a set of `phonemize` options and applies them to text."""

    def __init__(self,
        separator=Separator(word=" ", syllable="|", phone=None), # separator
        language='en-us', # language
        backend='espeak', # phonemization backend (espeak)
        strip=True, # strip
        preserve_punctuation=True # preserve punctuation
        ):
        # Keep every option on the instance so each call reuses the same configuration.
        self.separator, self.language, self.backend = separator, language, backend
        self.strip, self.preserve_punctuation = strip, preserve_punctuation

    def __call__(self, text, n_jobs=1):
        """Phonemize `text` with the stored options; `n_jobs` is forwarded as `njobs`."""
        options = dict(
            language=self.language,
            backend=self.backend,
            separator=self.separator,
            strip=self.strip,
            preserve_punctuation=self.preserve_punctuation,
            njobs=n_jobs,
        )
        return phonemize(text, **options)

## Usage

In [None]:
# Phonemize a short two-line sample with the default settings and print the result.
p = Phonemizer()
text = "Oh Dear! This suck...\n We'll be fine!"
print(p(text))

oʊ dɪɹ! ðɪs sʌk...
wiːl biː faɪn!


In [None]:
#| hide
# Scratch cell: build a per-word lexicon with the espeak backend, then
# phonemize the whole text in a single `phonemize` call.
text = "Oh Dear! This suck...\n We'll be fine!"
text = Punctuation(';:,.!"?()-').remove(text)
print("text:", text)
# `text` is a single string at this point, so split it into lines before
# splitting into words; iterating the string directly yields one character at
# a time and the lexicon would be built from letters instead of words.
words = {w.lower() for line in text.splitlines() for w in line.strip().split(' ') if w}
print("words:", words)
# initialize the espeak backend for English
backend = EspeakBackend('en-us')

# separate phones by a space and ignore word boundaries
separator = Separator(phone=' ', word=None)
# build the lexicon by phonemizing each word one by one. The backend.phonemize
# function expects a list as input and outputs a list.
lexicon = {
    word: backend.phonemize([word], separator=separator, strip=True)[0]
    for word in words}
print("lexicon: ", lexicon)
# switch to word-level separation for phonemizing the full text
separator=Separator(word=" ", syllable="|", phone=None)

phn = phonemize(
    text,
    language='en-us',
    backend='espeak',
    separator=separator,
    strip=True,
    preserve_punctuation=True,
    njobs=4)
print(phn)

text: Oh Dear This suck We'll be fine
words: {'b', 'd', 't', 'o', 'w', 'h', 'e', 'u', 'l', 'n', "'", 'c', 'r', 'i', 'a', 'k', 'f', 's'}
lexicon:  {'b': 'b iː', 'd': 'd iː', 't': 't iː', 'o': 'oʊ', 'w': 'd ʌ b əl j uː', 'h': 'eɪ tʃ', 'e': 'iː', 'u': 'j uː', 'l': 'ɛ l', 'n': 'ɛ n', "'": '', 'c': 's iː', 'r': 'ɑːɹ', 'i': 'aɪ', 'a': 'eɪ', 'k': 'k eɪ', 'f': 'ɛ f', 's': 'ɛ s'}
oʊ dɪɹ ðɪs sʌk wiːl biː faɪn


## Tokenizer

Requires downloading the spaCy model for the target language, e.g. `python -m spacy download en_core_web_sm` (the model that `Tokenizer` loads for `language='en'`).

In [None]:
#| export
import torch
from collections import Counter
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from collections import Counter
from torchtext.datasets import AG_NEWS
from typing import Iterable, List
from torch.nn.utils.rnn import pad_sequence
from typing import List

In [None]:
#| export 

class Tokenizer:
    """Thin wrapper around the callable returned by torchtext's `get_tokenizer`."""

    def __init__(self, backend='spacy', language='en'):
        # 'en' is accepted as a shorthand for the 'en_core_web_sm' spaCy model.
        # Only remap it for the spacy backend: other backends receive the
        # language code unchanged (e.g. 'basic_english' expects plain 'en').
        if backend == 'spacy' and language == 'en':
            language = 'en_core_web_sm'
        self.tokenizer = get_tokenizer(backend, language=language)

    def __call__(self, text:str):
        """Tokenize a single string with the wrapped tokenizer."""
        return self.tokenizer(text)

    def tokenize_iter(self, data_iter:Iterable):
        """Lazily yield the tokens of each text in `data_iter`.

        Every item of `data_iter` must unpack into two values; the first is
        ignored and the second is tokenized.
        """
        for _, text in data_iter:
            yield self.tokenizer(text)

    def inverse(self, tokens:List[str]):
        """Join string tokens back into one string, separated by single spaces."""
        # TODO: take care of white spaces
        return ' '.join(tokens)

## Usage

In [None]:
# Tokenize a sample string, then join the tokens back with spaces.
tok = Tokenizer()
s = "Oh, yeah\n I don't know dude..."
tokenized = tok(s)
print(s)
print(tokenized)
print(tok.inverse(tokenized))
# Show the first AG_NEWS test item as-is, then after tokenization.
ds = AG_NEWS(split='test') # data pipe
sample = next(iter(ds)) # (label, text)
print(sample)
# tokenize_iter is a generator, so only the item pulled by next() is tokenized here.
tokenized_ds = tok.tokenize_iter(ds)
sample = next(iter(tokenized_ds))
print(sample)


Oh, yeah
 I don't know dude...
['Oh', ',', 'yeah', '\n ', 'I', 'do', "n't", 'know', 'dude', '...']
Oh , yeah 
  I do n't know dude ...
(3, "Fears for T N pension after talks Unions representing workers at Turner   Newall say they are 'disappointed' after talks with stricken parent firm Federal Mogul.")
['Fears', 'for', 'T', 'N', 'pension', 'after', 'talks', 'Unions', 'representing', 'workers', 'at', 'Turner', '  ', 'Newall', 'say', 'they', 'are', "'", 'disappointed', "'", 'after', 'talks', 'with', 'stricken', 'parent', 'firm', 'Federal', 'Mogul', '.']


## Numericalizer

In [None]:
#| export
# TODO: add more special characters
class Numericalizer():
    """Convert text to vocabulary indices and back.

    The vocabulary is built once, at construction time, from the tokens that
    `tokenizer.tokenize_iter(data_iter)` yields.
    """

    def __init__(self, tokenizer:Tokenizer, data_iter:Iterable, specials=["<unk>"]):
        self._tokenizer = tokenizer
        self._vocab = self.build_map_from_iter(data_iter, specials=specials)

    def build_map_from_iter(self,data_iter:Iterable, specials = ["<unk>"]):
        """Build, store and return the vocab from `data_iter`.

        When "<unk>" is one of `specials`, its index becomes the default index
        of the vocab.
        """
        self._vocab = build_vocab_from_iterator(self._tokenizer.tokenize_iter(data_iter), specials=specials)
        if "<unk>" in specials:
            self._vocab.set_default_index(self._vocab["<unk>"])
        return self._vocab

    # NOTE(review): dispatch selects the overload from the positional argument
    # type; the `type=` keyword passed to `dispatch` does not appear to take
    # part in that selection - confirm against multipledispatch.
    @dispatch(list, type=torch.LongTensor)
    def __call__(self, texts:List[str], type=torch.LongTensor)->List[torch.Tensor]:
        """Numericalize a list of strings, wrapping each index list with `type`."""
        return [type(self._vocab(self._tokenizer(text))) for text in texts]

    @dispatch(str)
    def __call__(self, text:str)->List[int]:
        """Numericalize a single string into a list of vocabulary indices."""
        return self._vocab(self._tokenizer(text))

    @property
    def vocab(self):
        """The underlying vocab object."""
        return(self._vocab)

    def inverse(self, indices:List[int]):
        """Map `indices` back to tokens and join them via the tokenizer's `inverse`."""
        # Fetch the index-to-token table once rather than once per index.
        itos = self._vocab.get_itos()
        return self._tokenizer.inverse([itos[i] for i in indices])
    
    

## Usage

In [None]:
# Build the vocabulary from the AG_NEWS test split loaded above (`ds`).
tok = Tokenizer()
num = Numericalizer(tok, ds)

In [None]:
# A list of strings gives one LongTensor per string; a single string gives a plain list of indices.
print(num(["here we go. asdflkj", "it was time..."]))
print(num("this is it"))
# Look tokens up in both directions through the underlying vocab.
vocab = num.vocab
print(vocab.get_stoi()['the'])
print(vocab.get_itos()[1])
# Map indices back to text.
print(num.inverse([55, 24, 31]))

[tensor([ 531, 1037,  307,    3,    0]), tensor([31, 37, 98, 64])]
[55, 24, 31]
1
the
this is it


In [None]:
#| export
class TextCollater:
    """Turn a batch of rows into a padded index tensor plus the unpadded lengths."""

    def __init__(self,
                 numericalizer,
                 padding_value:int= -1
                 ):
        self.padding_value = padding_value
        self._numericalizer = numericalizer

    def collate_fn(self, batch):
        """Numericalize the second item of every row and pad to a common length.

        Returns a `(padded, lengths)` pair: `padded` is batch-first and filled
        with `padding_value`; `lengths` holds each sequence's first-dimension
        size before padding.
        """
        encoded = self._numericalizer([row[1] for row in batch])
        padded = pad_sequence(encoded, batch_first=True, padding_value=self.padding_value)
        lengths = torch.LongTensor([seq.shape[0] for seq in encoded])
        return padded, lengths

In [None]:
# Batch the dataset two rows at a time, using the collater to numericalize and pad each batch.
collater = TextCollater(num)
dl = DataLoader(dataset=ds, batch_size=2, shuffle=True, collate_fn=collater.collate_fn)

In [None]:
# Pull one batch and show each sequence without its padding, as indices and as text.
b = next(iter(dl))
print(b)
tokens, lens = b[0], b[1]
# Name the loop variable `length`, not `len`: at notebook top level `len`
# would overwrite the builtin for every cell that runs afterwards.
for token, length in zip(tokens, lens):
    print(token[:length])
    print(num.inverse(token[:length]))

(tensor([[ 2635,  3278,  9765, 12976,  1101,  1066,    14,    32,    15,    32,
             5,    19,  2142,     9,  1183,  2487,    28,  1028,     6, 11123,
             5, 23940,  3181,  5429,     4, 15088,   254, 10522,    17,    20,
           332,    20, 15957,   109,    94,   785,   516,    17,   129,    42,
          1728,     4,  1101,     2,     1,  3181,    22,  2930,    25,    52,
             3],
        [ 2534, 21995,   207,     5, 12021,  6549,   774,  5396,  9394,  6549,
            37,   250,  2201,     5,  2722,     8,   543,    20,    48,  4773,
          4857, 10362,    17,     6,   586,   210,   169,  2607,   268,    34,
          2022, 10415,   475,     3,    -1,    -1,    -1,    -1,    -1,    -1,
            -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
            -1]]), tensor([51, 34]))
tensor([ 2635,  3278,  9765, 12976,  1101,  1066,    14,    32,    15,    32,
            5,    19,  2142,     9,  1183,  2487,    28,  1028,     6, 11123,

In [None]:
#| hide
# Run nbdev's export step for this notebook.
import nbdev
nbdev.nbdev_export()