# Text Tokenizers

In [None]:
#| default_exp text.tokenizers

In [None]:
#| hide
%load_ext autoreload
%autoreload 2
from nbdev.showdoc import *

## Phonemizer

In [None]:
#| export
from phonemizer.backend import EspeakBackend
from phonemizer.backend.espeak.language_switch import LanguageSwitch
from phonemizer.backend.espeak.words_mismatch import WordMismatch
from phonemizer.punctuation import Punctuation
from phonemizer.separator import Separator
from phonemizer import phonemize

ModuleNotFoundError: No module named 'joblib'

Assumes the espeak backend is installed, e.g. via `apt-get install espeak`.

In [None]:
#| export
class Phonemizer():
    "Callable that stores a set of `phonemize` options and applies them to text"
    def __init__(self,
        separator=Separator(word=" ", syllable="|", phone=None), # separator forwarded to `phonemize`
        language='en-us', # language code forwarded to `phonemize`
        backend='espeak', # phonemization backend name (espeak)
        strip=True, # forwarded to `phonemize` as `strip`
        preserve_punctuation=True # forwarded to `phonemize` as `preserve_punctuation`
        ):
        # Keep every option on the instance so `__call__` can forward it later.
        self.preserve_punctuation = preserve_punctuation
        self.strip = strip
        self.backend = backend
        self.language = language
        self.separator = separator

    def __call__(self, text, n_jobs=1):
        "Phonemize `text` with the stored options; `n_jobs` is passed on as `njobs`"
        options = dict(
            language=self.language,
            backend=self.backend,
            separator=self.separator,
            strip=self.strip,
            preserve_punctuation=self.preserve_punctuation,
            njobs=n_jobs,
        )
        return phonemize(text, **options)

## Usage

In [None]:
# Phonemize a short two-line sample with the default Phonemizer settings.
p = Phonemizer()
text = "Oh Dear! This suck...\n We'll be fine!"
print(p(text))

oʊ dɪɹ! ðɪs sʌk...
wiːl biː faɪn!


In [None]:
#| hide
text = "Oh Dear! This suck...\n We'll be fine!"
# Drop the listed punctuation marks before building the word list.
text = Punctuation(';:,.!"?()-').remove(text)
print("text:", text)
# `text` may be a single string or a list of lines. Wrap a bare string in a
# list first: iterating over a string directly yields characters, which would
# build a per-letter lexicon instead of a per-word one.
lines = [text] if isinstance(text, str) else text
words = {w.lower() for line in lines for w in line.split()}
print("words:", words)
# initialize the espeak backend for English
backend = EspeakBackend('en-us')

# separate phones by a space and ignore word boundaries
separator = Separator(phone=' ', word=None)
# build the lexicon by phonemizing each word one by one. The backend.phonemize
# function expects a list as input and outputs a list.
lexicon = {
    word: backend.phonemize([word], separator=separator, strip=True)[0]
    for word in words}
print("lexicon: ", lexicon)
# switch to word/syllable separators for phonemizing the whole text
separator=Separator(word=" ", syllable="|", phone=None)

phn = phonemize(
    text,
    language='en-us',
    backend='espeak',
    separator=separator,
    strip=True,
    preserve_punctuation=True,
    njobs=4)
print(phn)

text: Oh Dear This suck We'll be fine
words: {'i', 'o', 'f', 'd', 'h', "'", 'w', 'n', 'c', 'b', 't', 'r', 'e', 's', 'l', 'u', 'k', 'a'}
lexicon:  {'i': 'aɪ', 'o': 'oʊ', 'f': 'ɛ f', 'd': 'd iː', 'h': 'eɪ tʃ', "'": '', 'w': 'd ʌ b əl j uː', 'n': 'ɛ n', 'c': 's iː', 'b': 'b iː', 't': 't iː', 'r': 'ɑːɹ', 'e': 'iː', 's': 'ɛ s', 'l': 'ɛ l', 'u': 'j uː', 'k': 'k eɪ', 'a': 'eɪ'}
oʊ dɪɹ ðɪs sʌk wiːl biː faɪn


## Tokenizer

Requires downloading the spaCy language model first: `python -m spacy download en`

In [None]:
#| export
import torchtext
import torch
from collections import Counter
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from collections import Counter
from torchtext.vocab import Vocab
from torchtext.utils import download_from_url, extract_archive
import spacy
from torchtext.datasets import AG_NEWS
from typing import Iterable
from torch.nn.utils.rnn import pad_sequence
# import io

In [None]:
#| export 

class Tokenizer:
    "Wraps the callable returned by `get_tokenizer` for single strings and labelled iterables"
    def __init__(self, backend='spacy', language='en'):
        self._vocab = None
        self.counter = Counter()
        self.tokenizer = get_tokenizer(backend, language=language)

    def __call__(self, text:str):
        "Tokenize one string with the wrapped tokenizer"
        tokens = self.tokenizer(text)
        return tokens

    def tokenize_iter(self, data_iter:Iterable):
        "Lazily yield the tokens of the second element of every pair in `data_iter`"
        for pair in data_iter:
            # each item must unpack into exactly two values; the first is ignored
            _, text = pair
            yield self.tokenizer(text)


## Numericalizer

In [None]:
#| export
class Numericalizer():
    "Builds and stores a vocab from an iterable whose items are tokenized by `tokenizer`"
    def __init__(self, tokenizer:Tokenizer):
        self.tokenizer = tokenizer
        self._vocab = None # set by `build_map_from_iter`

    def build_map_from_iter(self,
        data_iter:Iterable, # iterable handed to `tokenizer.tokenize_iter`
        specials=None # special tokens passed to `build_vocab_from_iterator`; None means ["<unk>"]
        ):
        "Build the vocab from `data_iter`, keep it on the instance and return it"
        # Resolve the default per call rather than sharing one mutable list
        # across calls; an explicit empty list is respected as "no specials".
        specials = ["<unk>"] if specials is None else list(specials)
        self._vocab = build_vocab_from_iterator(self.tokenizer.tokenize_iter(data_iter), specials=specials)
        if "<unk>" in specials:
            # make "<unk>" the default index of the vocab
            self._vocab.set_default_index(self._vocab["<unk>"])
        return self._vocab

In [None]:
# TODO: collate text

## Usage

In [None]:
# Tokenize a single string, then lazily tokenize the AG_NEWS test split.
tok = Tokenizer()
tokenized = tok("Oh, yeah\n I don't know dude...")
ds = AG_NEWS(split='test') # data pipe
# first raw sample from the dataset
sample = next(iter(ds))
print(sample)
# `tokenize_iter` is a generator: nothing is tokenized until it is iterated
tokenized_ds = tok.tokenize_iter(ds)
# first tokenized sample
sample = next(iter(tokenized_ds))
print(sample)



(3, "Fears for T N pension after talks Unions representing workers at Turner   Newall say they are 'disappointed' after talks with stricken parent firm Federal Mogul.")
['Fears', 'for', 'T', 'N', 'pension', 'after', 'talks', 'Unions', 'representing', 'workers', 'at', 'Turner', '  ', 'Newall', 'say', 'they', 'are', "'", 'disappointed', "'", 'after', 'talks', 'with', 'stricken', 'parent', 'firm', 'Federal', 'Mogul', '.']


In [None]:
# Build the vocab from the dataset and map tokens to integer ids.
num = Numericalizer(tok)
mapper = num.build_map_from_iter(ds)
# index assigned to the "<unk>" special token
print(mapper["<unk>"])
# ids for a freshly tokenized string
print(mapper(tok("here we go. asdflkj")))
# text_pipeline = lambda x: voc(tokenizer(x))
# ids for the string tokenized in the earlier cell
print(mapper(tokenized))

0
[531, 1037, 307, 3, 0]
[7808, 2, 0, 0, 296, 378, 255, 1324, 0, 64]


In [None]:
# Numericalize two strings of different length and pad them into one batch.
a = mapper(tok("here we go. asdflkj"))
# print(a.shape)
b = mapper(tok("Oh, yeah\n I don't know dude..."))
mini_batch = [a, b]
# pad_sequence needs tensors, so convert each list of ids first
x = [torch.LongTensor(x_i) for x_i in mini_batch]
# NOTE(review): padding_value=0 matches the "<unk>" index printed earlier (0),
# so padding and unknown tokens share an id here — confirm this is intended.
x_padded = pad_sequence(x, batch_first=True, padding_value=0)
print(x_padded)

tensor([[ 531, 1037,  307,    3,    0,    0,    0,    0,    0,    0],
        [7808,    2,    0,    0,  296,  378,  255, 1324,    0,   64]])


In [None]:
def text_collate(batch):
    "Pad the sequences in `batch` with 0 (batch first) and return them with their original lengths"
    lengths = list(map(len, batch))
    padded = pad_sequence(batch, batch_first=True, padding_value=0)
    return padded, lengths


In [None]:
# Collate the list of tensors `x` built in the previous cell.
text_collate(x)

(tensor([[ 531, 1037,  307,    3,    0,    0,    0,    0,    0,    0],
         [7808,    2,    0,    0,  296,  378,  255, 1324,    0,   64]]),
 [5, 10])

In [None]:
#| hide
# Run nbdev's export step (`nbdev_export`).
import nbdev; nbdev.nbdev_export()