# Text Tokenizers

In [None]:
#| default_exp text.tokenizers

In [None]:
#| hide
%load_ext autoreload
%autoreload 2
from nbdev.showdoc import *

## Phonemizer

In [None]:
#| export
from phonemizer.backend import EspeakBackend
from phonemizer.backend.espeak.language_switch import LanguageSwitch
from phonemizer.backend.espeak.words_mismatch import WordMismatch
from phonemizer.punctuation import Punctuation
from phonemizer.separator import Separator
from phonemizer import phonemize

Assumes the espeak backend is already installed on the system, e.g. via `apt-get install espeak`.

In [None]:
#| export
class Phonemizer:
    """Callable that stores a set of `phonemize` options and applies them to text.

    The constructor only records its arguments; the actual work happens in
    `__call__`, which forwards them unchanged to `phonemizer.phonemize`.
    """

    def __init__(self,
        separator=Separator(word=" ", syllable="|", phone=None), # separator passed to `phonemize` (word / syllable / phone marks)
        language='en-us', # language passed to `phonemize`
        backend='espeak', # phonemization backend (espeak)
        strip=True, # `strip` flag passed to `phonemize`
        preserve_punctuation=True # `preserve_punctuation` flag passed to `phonemize`
        ):
        # Plain attribute storage, no validation is performed here.
        self.separator, self.language, self.backend = separator, language, backend
        self.strip, self.preserve_punctuation = strip, preserve_punctuation

    def __call__(self, text, n_jobs=1):
        """Run `phonemize` on `text` with the stored options.

        `n_jobs` is handed to `phonemize` as its `njobs` argument. The return
        value is whatever `phonemize` returns for the given input.
        """
        options = dict(
            language=self.language,
            backend=self.backend,
            separator=self.separator,
            strip=self.strip,
            preserve_punctuation=self.preserve_punctuation,
            njobs=n_jobs,
        )
        return phonemize(text, **options)

## Usage

In [None]:
# Build a phonemizer with the constructor defaults ('en-us' language, 'espeak' backend).
p = Phonemizer()
# Sample input containing punctuation and an embedded newline.
text = "Oh Dear! This suck...\n We'll be fine!"
# Calling the instance forwards the text and the stored options to `phonemize`.
print(p(text))

oʊ dɪɹ! ðɪs sʌk...
wiːl biː faɪn!


In [None]:
#| hide
# Scratch cell: strip punctuation, build a per-word phoneme lexicon with the
# espeak backend, then phonemize the whole text in one call.
text = "Oh Dear! This suck...\n We'll be fine!"
# Remove the listed punctuation marks (the apostrophe is not in the list, so
# contractions such as "We'll" stay intact).
text = Punctuation(';:,.!"?()-').remove(text)
print("text:", text)
# `text` is a single string at this point, so iterating it directly would walk
# it character by character and produce a "lexicon" of individual letters.
# Split it into lines first so the comprehension collects whole words.
words = {w.lower() for line in text.splitlines() for w in line.strip().split(' ') if w}
print("words:", words)
# initialize the espeak backend for English
backend = EspeakBackend('en-us')

# separate phones by a space and ignore word boundaries
separator = Separator(phone=' ', word=None)
# Build the lexicon by phonemizing each word one by one. The backend.phonemize
# function expects a list as input and outputs a list, hence the [word] / [0].
lexicon = {
    word: backend.phonemize([word], separator=separator, strip=True)[0]
    for word in words}
print("lexicon: ", lexicon)
# Switch to the word/syllable separator used for whole-text phonemization.
separator = Separator(word=" ", syllable="|", phone=None)

# Phonemize the (already punctuation-free) text in a single call.
phn = phonemize(
    text,
    language='en-us',
    backend='espeak',
    separator=separator,
    strip=True,
    preserve_punctuation=True,
    njobs=4)
print(phn)

text: Oh Dear This suck We'll be fine
words: {'i', 'o', 'f', 'd', 'h', "'", 'w', 'n', 'c', 'b', 't', 'r', 'e', 's', 'l', 'u', 'k', 'a'}
lexicon:  {'i': 'aɪ', 'o': 'oʊ', 'f': 'ɛ f', 'd': 'd iː', 'h': 'eɪ tʃ', "'": '', 'w': 'd ʌ b əl j uː', 'n': 'ɛ n', 'c': 's iː', 'b': 'b iː', 't': 't iː', 'r': 'ɑːɹ', 'e': 'iː', 's': 'ɛ s', 'l': 'ɛ l', 'u': 'j uː', 'k': 'k eɪ', 'a': 'eɪ'}
oʊ dɪɹ ðɪs sʌk wiːl biː faɪn


In [None]:
#| hide
# Run nbdev's export step for this project.
import nbdev
nbdev.nbdev_export()