# Text Tokenizers

In [None]:
#| default_exp text.tokenizers

In [None]:
#| hide
%load_ext autoreload
%autoreload 2
from nbdev.showdoc import *

## Phonemizer

In [None]:
#| export
from phonemizer.backend import EspeakBackend
from phonemizer.backend.espeak.language_switch import LanguageSwitch
from phonemizer.backend.espeak.words_mismatch import WordMismatch
from phonemizer.punctuation import Punctuation
from phonemizer.separator import Separator
from phonemizer import phonemize
from torch.utils.data import DataLoader

Assumes the espeak backend is installed, e.g. via `apt-get install espeak`.

In [None]:
#| export
class Phonemizer:
    """Callable that stores phonemization settings and forwards them to `phonemize`."""

    def __init__(
        self,
        separator=Separator(word=" ", syllable="|", phone=None),  # separators for words / syllables / phones
        language='en-us',  # language code forwarded to `phonemize`
        backend='espeak',  # phonemization backend (espeak)
        strip=True,  # forwarded to `phonemize` as `strip`
        preserve_punctuation=True,  # forwarded to `phonemize` as `preserve_punctuation`
    ):
        # Keep every setting on the instance; they are only read back in __call__.
        self.separator, self.language, self.backend = separator, language, backend
        self.strip, self.preserve_punctuation = strip, preserve_punctuation

    def __call__(self, text, n_jobs=1):
        """Phonemize `text` with the stored settings; `n_jobs` is forwarded as `njobs`."""
        options = {
            "language": self.language,
            "backend": self.backend,
            "separator": self.separator,
            "strip": self.strip,
            "preserve_punctuation": self.preserve_punctuation,
            "njobs": n_jobs,
        }
        return phonemize(text, **options)

## Usage

In [None]:
# Build a Phonemizer with its default settings and phonemize a two-line sample.
p = Phonemizer()
text = "Oh Dear! This suck...\n We'll be fine!"
print(p(text))

oʊ dɪɹ! ðɪs sʌk...
wiːl biː faɪn!


In [None]:
#| hide
text = "Oh Dear! This suck...\n We'll be fine!"
# Remove the listed punctuation marks before building the lexicon.
text = Punctuation(';:,.!"?()-').remove(text)
print("text:", text)
# `text` is a single string, so split it on whitespace to obtain words.
# Iterating over the string directly would yield individual characters and
# produce a per-letter lexicon instead of a per-word one.
words = {w.lower() for w in text.split()}
print("words:", words)
# initialize the espeak backend for English
backend = EspeakBackend('en-us')

# separate phones by a space and ignore word boundaries
separator = Separator(phone=' ', word=None)
# build the lexicon by phonemizing each word one by one. The backend.phonemize
# function expects a list as input and outputs a list.
lexicon = {
    word: backend.phonemize([word], separator=separator, strip=True)[0]
    for word in words}
print("lexicon: ", lexicon)
# switch to the separator used for whole-text phonemization below
separator=Separator(word=" ", syllable="|", phone=None)

# phonemize the punctuation-free text in one call through the functional API
phn = phonemize(
    text,
    language='en-us',
    backend='espeak',
    separator=separator,
    strip=True,
    preserve_punctuation=True,
    njobs=4)
print(phn)

text: Oh Dear This suck We'll be fine
words: {'i', 'o', 'f', 'd', 'h', "'", 'w', 'n', 'c', 'b', 't', 'r', 'e', 's', 'l', 'u', 'k', 'a'}
lexicon:  {'i': 'aɪ', 'o': 'oʊ', 'f': 'ɛ f', 'd': 'd iː', 'h': 'eɪ tʃ', "'": '', 'w': 'd ʌ b əl j uː', 'n': 'ɛ n', 'c': 's iː', 'b': 'b iː', 't': 't iː', 'r': 'ɑːɹ', 'e': 'iː', 's': 'ɛ s', 'l': 'ɛ l', 'u': 'j uː', 'k': 'k eɪ', 'a': 'eɪ'}
oʊ dɪɹ ðɪs sʌk wiːl biː faɪn


## Tokenizer

Requires downloading the spaCy model for the target language, e.g. `python -m spacy download en`

In [None]:
#| export
import torch
from collections import Counter
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from collections import Counter
from torchtext.datasets import AG_NEWS
from typing import Iterable
from torch.nn.utils.rnn import pad_sequence

In [None]:
#| export 

class Tokenizer:
    """Callable wrapper around the tokenizer function returned by `get_tokenizer`."""

    def __init__(self, backend='spacy', language='en'):
        # `counter` and `_vocab` are initialised here but not used by this class's methods.
        self._vocab = None
        self.counter = Counter()
        self.tokenizer = get_tokenizer(backend, language=language)

    def __call__(self, text: str):
        """Return the tokens produced by the wrapped tokenizer for `text`."""
        tokens = self.tokenizer(text)
        return tokens

    def tokenize_iter(self, data_iter: Iterable):
        """Lazily yield the tokens of the second item of each 2-item row in `data_iter`."""
        yield from (self.tokenizer(text) for _, text in data_iter)


## Numericalizer

In [None]:
#| export
class Numericalizer():
    """Builds a token vocabulary from an iterable of rows using a `Tokenizer`."""

    def __init__(self, tokenizer:Tokenizer):
        self.tokenizer = tokenizer
        # Set by `build_map_from_iter`; None until a vocabulary has been built.
        self._vocab = None

    def build_map_from_iter(self,
        data_iter:Iterable, # rows consumed by `tokenizer.tokenize_iter`
        specials=None # special tokens; defaults to ["<unk>"] when None
        ):
        """Build, store and return the vocabulary for the tokens found in `data_iter`.

        When "<unk>" is among `specials`, its index is registered as the
        vocabulary's default index via `set_default_index`.
        """
        # Use a None sentinel rather than a list literal in the signature so the
        # default is never a single list object shared between calls.
        if specials is None:
            specials = ["<unk>"]
        self._vocab = build_vocab_from_iterator(self.tokenizer.tokenize_iter(data_iter), specials=specials)
        if "<unk>" in specials:
            self._vocab.set_default_index(self._vocab["<unk>"])
        return self._vocab

In [None]:
# TODO: collate text, add special characters, and make sure index 0 is not <unk> (0 != unk)

## Usage

In [None]:
# Tokenize one string, then tokenize the rows of the AG_NEWS test split.
tok = Tokenizer()
tokenized = tok("Oh, yeah\n I don't know dude...")
ds = AG_NEWS(split='test') # data pipe
sample = next(iter(ds)) # (label, text)
print(sample)
# tokenize_iter is a generator, so only the first row is tokenized here
tokenized_ds = tok.tokenize_iter(ds)
sample = next(iter(tokenized_ds))
print(sample)



(3, "Fears for T N pension after talks Unions representing workers at Turner   Newall say they are 'disappointed' after talks with stricken parent firm Federal Mogul.")
['Fears', 'for', 'T', 'N', 'pension', 'after', 'talks', 'Unions', 'representing', 'workers', 'at', 'Turner', '  ', 'Newall', 'say', 'they', 'are', "'", 'disappointed', "'", 'after', 'talks', 'with', 'stricken', 'parent', 'firm', 'Federal', 'Mogul', '.']


In [None]:
# Build the vocabulary from the dataset and map token lists to integer ids.
num = Numericalizer(tok)
mapper = num.build_map_from_iter(ds)
# index assigned to the "<unk>" special token
print(mapper["<unk>"])
# ids for a freshly tokenized string
print(mapper(tok("here we go. asdflkj")))
# text_pipeline = lambda x: voc(tokenizer(x))
# ids for the token list computed in the earlier usage cell
print(mapper(tokenized))

0
[531, 1037, 307, 3, 0]
[7808, 2, 0, 0, 296, 378, 255, 1324, 0, 64]


In [None]:
# Numericalize two strings of different lengths and pad them into one batch tensor.
a = mapper(tok("here we go. asdflkj"))
print(len(a))
b = mapper(tok("Oh, yeah\n I don't know dude..."))
print(len(b))
mini_batch = [a, b]
# one LongTensor per sequence, as pad_sequence is given a list of tensors
x = [torch.LongTensor(x_i) for x_i in mini_batch]
print(x)
# NOTE: padding_value=0 is the same value this notebook's recorded output shows
# for mapper["<unk>"], so padded positions and unknown tokens look identical here.
x_padded = pad_sequence(x, batch_first=True, padding_value=0)
print(x_padded)

5
10
[tensor([ 531, 1037,  307,    3,    0]), tensor([7808,    2,    0,    0,  296,  378,  255, 1324,    0,   64])]
tensor([[ 531, 1037,  307,    3,    0,    0,    0,    0,    0,    0],
        [7808,    2,    0,    0,  296,  378,  255, 1324,    0,   64]])


In [None]:
def text_collate(batch):
    """Collate rows into a padded id tensor plus the unpadded length of each text.

    Each row's item at index 1 is tokenized with the notebook-level `tok`,
    mapped to ids with the notebook-level `mapper`, and the resulting
    sequences are padded with -1 (batch first).
    """
    token_ids = []
    text_lens = []
    for row in batch:
        ids = torch.LongTensor(mapper(tok(row[1])))
        token_ids.append(ids)
        text_lens.append(len(ids))
    text_pad = pad_sequence(token_ids, batch_first=True, padding_value=-1)
    return text_pad, text_lens

In [None]:
# Batch the dataset five rows at a time, shuffled, collating each batch with text_collate.
dl = DataLoader(dataset=ds, batch_size=5, shuffle=True, collate_fn=text_collate)

In [None]:
# Fetch one batch from the loader; text_collate returns (padded ids, lengths).
b = next(iter(dl))
print(b)

(tensor([[11991, 29434, 24121, 10187,  2896,   147,     6,   238,    17,    28,
            81,  4305,    20,    12,    13, 16249,    13,    76,    27,    26,
             1,  2065,  3871,     7, 17562,     2, 11991,  1048,    24,     4,
           885,     1, 11025,     7, 10187,    20,     6,  5517,     3,    -1,
            -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
            -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
            -1],
        [23259,    11,     6, 17457,  1195, 12240,     6,  9439, 13543,     1,
          2814,  2657,     7,  6523,  4490,    22, 15031,    82,  1383,    21,
         21972,    22,     2, 14020, 23182,    11,     1,  4490,   907,     2,
            56,    72,  1076,     1,  2196,     5,  2173, 10377,     3,    -1,
            -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
            -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
            -1],
        [ 3891,  

In [None]:
# Count the entries of this hard-coded row that differ from the -1 padding value.
torch.sum(torch.Tensor([17564, 24659, 29258, 26399,    13, 16230,  1350,  1321,     4,     6,
          1649,    20,  1471,     7,   386, 10675,     6,   784,   648,     8,
          1734,    58,     1,   351,  4033,     7,   315,   217,  4224,  2494,
            13,    16,  2880,     5,   363,   105,     3,    -1,    -1,    -1,
            -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
            -1,    -1,    -1,    -1,    -1,    -1])!= -1)

tensor(37)

In [None]:
#| hide
# Run nbdev's export step from inside the notebook.
import nbdev; nbdev.nbdev_export()