In [None]:
%load_ext autoreload
%autoreload 2

# Generate phonemes 
Just for testing, to make sure we can generate phonemes for names.

We didn't actually use phonemes in the end

In [None]:
from collections import Counter
import json
import random

import pandas as pd
from tqdm.auto import tqdm

from src.data.filesystem import fopen

In [None]:
# Name type to process; it is interpolated into every path below
# (presumably either "given" or "surname" -- confirm against the other notebooks).
given_surname = "given"
# Number of distinct names to draw at random for the phonemizer experiments.
sample_size = 500000
# How many of the most common phonemes / phoneme bigrams are kept when the
# vocabularies are built.
num_unigrams = 63
num_bigrams = 500

# Input: CSV whose tree_name / record_name columns supply the names.
train_path = f"s3://familysearch-names/processed/tree-hr-{given_surname}-train-v2.csv.gz"
# Outputs: JSON files mapping espeak phonemes (and phoneme bigrams) to integer ids.
espeak_vocab_path = f"s3://nama-data/data/models/fs-{given_surname}-espeak_phoneme_vocab.json"
espeak_bigrams_vocab_path = f"s3://nama-data/data/models/fs-{given_surname}-espeak_phoneme_vocab_bigrams.json"

## Load data

In [None]:
# na_filter=False turns off pandas' missing-value detection, so every field is
# kept as read instead of being converted to NaN -- presumably so that names
# that look like NA markers (e.g. "na", "null") survive; confirm.
df = pd.read_csv(train_path, na_filter=False)
print(df.shape)
df.head(3)

In [None]:
# Collect every distinct name that appears on either side of a training pair.
tree_names = set(df['tree_name'])
record_names = set(df['record_name'])
all_names = tree_names | record_names
print(len(all_names))
next(iter(all_names))

In [None]:
# Draw a reproducible random sample of names.
# - Sorting first gives a deterministic population: the iteration order of a
#   set of strings changes between interpreter runs (hash randomization).
# - A dedicated, seeded Random instance makes the sample repeatable without
#   touching the global random state.
# - Clamping the size avoids the ValueError that random.sample raises when the
#   requested sample is larger than the population.
sampled_names = random.Random(42).sample(
    sorted(all_names),
    min(sample_size, len(all_names)),
)

### Try eng_to_ipa
This test failed because eng_to_ipa relies on a dictionary lookup, so names that are not in its dictionary are not converted to phonemes.

In [None]:
import eng_to_ipa as ipa

In [None]:
ipa.convert('john', stress_marks=False)

In [None]:
# Convert every sampled name with eng_to_ipa, leaving out stress marks.
name_phonemes = {
    sampled_name: ipa.convert(sampled_name, stress_marks=False)
    for sampled_name in tqdm(sampled_names)
}
len(name_phonemes)

In [None]:
# Print the first 21 (name, phonemes) pairs for a quick visual check.
for sample_name, sample_phonemes in list(name_phonemes.items())[:21]:
    print(sample_name, sample_phonemes)

### Try phonemizer

In [None]:
# sudo apt-get install espeak-ng, festival, mbrola, mbrola-us1
from phonemizer import phonemize
from phonemizer.separator import Separator
from phonemizer.backend import EspeakBackend, EspeakMbrolaBackend, SegmentsBackend, FestivalBackend

In [None]:
EspeakBackend.supported_languages()

In [None]:
EspeakMbrolaBackend.supported_languages()

In [None]:
SegmentsBackend.supported_languages()

In [None]:
FestivalBackend.supported_languages()

In [None]:
# One backend instance per engine so they can be compared on the same names.
espeak = EspeakBackend('en-us')
mbrola = EspeakMbrolaBackend('mb-us1')
festival = FestivalBackend('en-us')  # too slow

# only festival supports separating syllables, so don't do it
# Phones are separated by a space and words by '|'; no syllable separator is used.
separator = Separator(phone=' ', syllable=None, word='|')

In [None]:
# Phonemize one well-known name with each of the fast backends as a smoke test.
for backend in (espeak, mbrola):
    print(backend.phonemize(['john'], separator=separator, strip=True)[0])
# festival is too slow
# print(festival.phonemize(
#     ['gornale'], 
#     separator=separator,
#     strip=True
# )[0])

In [None]:
def _phoneme_bigrams(phonemes):
    """Return the bigram tokens for a space-separated phoneme string.

    The phoneme sequence is padded with 'START' and 'END', and every adjacent
    pair is rendered as "previous,current".
    """
    padded = ['START', *phonemes.split(' '), 'END']
    return [f"{prev},{cur}" for prev, cur in zip(padded, padded[1:])]


# Phonemize every sampled name with espeak and derive its bigram string.
espeak_phonemes = {}
espeak_phoneme_bigrams = {}
for name in tqdm(sampled_names):
    phonemes = espeak.phonemize([name], separator=separator, strip=True)[0]
    espeak_phonemes[name] = phonemes
    espeak_phoneme_bigrams[name] = ' '.join(_phoneme_bigrams(phonemes))
print(len(espeak_phonemes))
print(len(espeak_phoneme_bigrams))
# 'john' is only present when the random sample happened to include it, so
# fall back to an arbitrary sampled name instead of raising KeyError.
example_name = 'john' if 'john' in espeak_phonemes else next(iter(espeak_phonemes), None)
if example_name is not None:
    print(example_name, espeak_phonemes[example_name])
    print(example_name, espeak_phoneme_bigrams[example_name])

In [None]:
# Phonemize every sampled name with the MBROLA backend.
mbrola_phonemes = {
    sampled_name: mbrola.phonemize([sampled_name], separator=separator, strip=True)[0]
    for sampled_name in tqdm(sampled_names)
}
print(len(mbrola_phonemes))

In [None]:
# festival is incredibly slow
# festival_phonemes = {}
# for name in tqdm(sampled_names):
#     festival_phonemes[name] = festival.phonemize([name], separator=separator, strip=True)[0]
# print(len(festival_phonemes))

In [None]:
# festival is incredibly slow, so only the first 1000 sampled names are phonemized
festival_phonemes = {
    sampled_name: festival.phonemize([sampled_name], separator=separator, strip=True)[0]
    for sampled_name in tqdm(sampled_names[:1000])
}
print(len(festival_phonemes))

In [None]:
# Side-by-side espeak / mbrola output for the first 21 names.
for sample_name in list(espeak_phonemes)[:21]:
    print(sample_name, 'espeak', espeak_phonemes[sample_name], 'mbrola', mbrola_phonemes[sample_name])

In [None]:
# Frequency tables over the espeak output: single phonemes, START/END-padded
# bigrams, and sequence lengths.
espeak_counter = Counter()
espeak_lengths = Counter()
espeak_bigrams_counter = Counter()
for phoneme_str in espeak_phonemes.values():
    tokens = phoneme_str.split(' ')
    espeak_counter.update(tokens)
    padded = ['START', *tokens, 'END']
    espeak_bigrams_counter.update(
        f"{prev},{cur}" for prev, cur in zip(padded, padded[1:])
    )
    # The recorded length counts the trailing 'END' marker as well.
    espeak_lengths[len(tokens) + 1] += 1
print(len(espeak_counter))
print(len(espeak_bigrams_counter))

In [None]:
# Frequency table of the phonemes produced by the MBROLA backend.
mbrola_counter = Counter()
for phoneme_str in mbrola_phonemes.values():
    mbrola_counter.update(phoneme_str.split(' '))
print(len(mbrola_counter))

In [None]:
espeak_counter.most_common(num_unigrams)

In [None]:
espeak_bigrams_counter.most_common(num_bigrams)

In [None]:
mbrola_counter.most_common()

In [None]:
espeak_lengths

### Try gruut
gruut is MIT-licensed, but it is too slow for our purposes.

In [None]:
from gruut import sentences

In [None]:
for sentence in sentences("john", lang="en-us"):
    for word in sentence:
        print(word.phonemes)

In [None]:
# Phonemize the first 1000 sampled names with gruut.
# Only the first word of the first sentence is stored for a name; any extra
# sentence or word is reported and skipped.
gruut_phonemes = {}
for name in tqdm(sampled_names[:1000]):
    for sentence_ix, sentence in enumerate(sentences(name, lang='en-us')):
        if sentence_ix > 0:
            print("name has multiple sentences", name)
            break
        # A separate index name keeps the word index from clobbering the
        # sentence index of the enclosing loop.
        for word_ix, word in enumerate(sentence):
            if word_ix > 0:
                print("name has multiple words", name)
                break
            gruut_phonemes[name] = word.phonemes
len(gruut_phonemes)

In [None]:
# Frequency table of the phonemes produced by gruut.
gruut_counter = Counter()
for phoneme_seq in gruut_phonemes.values():
    gruut_counter.update(phoneme_seq)
print(len(gruut_counter))

In [None]:
gruut_counter.most_common()

## Save Vocab

In [None]:
# Unigram vocabulary: the most common phonemes, numbered from 0 by rank.
espeak_vocab = {
    phoneme: index
    for index, (phoneme, _) in enumerate(espeak_counter.most_common(num_unigrams))
}
# Bigram vocabulary: the same unigram ids first, then the most common bigrams
# numbered consecutively after them.
espeak_bigrams_vocab = dict(espeak_vocab)
top_bigrams = espeak_bigrams_counter.most_common(num_bigrams)
for index, (phoneme_bigram, _) in enumerate(top_bigrams, start=len(espeak_vocab)):
    espeak_bigrams_vocab[phoneme_bigram] = index

In [None]:
print(len(espeak_vocab))
espeak_vocab

In [None]:
print(len(espeak_bigrams_vocab))
espeak_bigrams_vocab

In [None]:
# Serialize the unigram vocabulary to JSON and write it to its configured path.
with fopen(espeak_vocab_path, 'w') as vocab_file:
    vocab_file.write(json.dumps(espeak_vocab))

In [None]:
# Serialize the unigram+bigram vocabulary to JSON and write it to its configured path.
with fopen(espeak_bigrams_vocab_path, 'w') as vocab_file:
    vocab_file.write(json.dumps(espeak_bigrams_vocab))