# Final phoneme sets for each language


In [1]:
import pandas as pd

from tqdm import tqdm

# Load the PHOIBLE table once; the cells below look phonemes up in it.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk.
# NOTE(review): the recorded output of this cell echoes the read_csv line the
# way pandas warnings do -- presumably a DtypeWarning about mixed-type columns,
# which this option avoids. Confirm against a fresh run.
phoible = pd.read_csv('../../../data/phoible.csv', low_memory=False)
# Distinct phoneme strings across the whole table, used for membership tests.
phonemes = phoible.Phoneme.unique()

  phoible = pd.read_csv('../../../data/phoible.csv')


In [2]:
# Tone-mark characters; the phonemizer attaches them to the preceding vowel.
TONES = '˧˥˩̰˨˩˦'

def get_phoneme_set(csv_file, display=False):
    """Count the phoneme tokens in a phonemized CSV and classify them via PHOIBLE.

    Args:
        csv_file: Path to a CSV file, or to a folder that contains
            ``processed.csv``. The file must have a ``phonemized_utterance``
            column holding space-separated tokens.
        display: When True, print the sorted vowels, consonants and other
            tokens, followed by the size of each group and the total.

    Returns:
        dict mapping each token (``WORD_BOUNDARY`` excluded) to the number of
        times it occurs in the file.

    Relies on the module-level ``phoible`` DataFrame and ``phonemes`` array.
    Tokens that cannot be matched to PHOIBLE are reported with a printed
    message and grouped under "other".
    """
    if not csv_file.endswith('.csv'):
        dataset = pd.read_csv(csv_file+'/processed.csv')
    else:
        dataset = pd.read_csv(csv_file)

    token_counts = {}
    # Missing utterances are read as NaN, which has no .strip(); skip them.
    for line in dataset['phonemized_utterance'].dropna():
        for token in line.strip().split():
            if token == 'WORD_BOUNDARY':
                continue
            token_counts[token] = token_counts.get(token, 0) + 1

    vowels = []
    consonants = []
    other = []
    for phoneme in token_counts:
        cmp_phoneme = phoneme
        if phoneme not in phonemes:
            # Tone markers are combined with the preceding vowel, so a token
            # such as vowel+tone is not itself a PHOIBLE phoneme. Strip the
            # tone characters and retry before calling the token unknown.
            has_tones = False
            for tone in TONES:
                if tone in phoneme:
                    has_tones = True
                    cmp_phoneme = cmp_phoneme.replace(tone, '')
            if not has_tones or cmp_phoneme not in phonemes:
                print(f'{phoneme} not in phoible')
                other.append(phoneme)
                continue
        # Look the class up once; the first matching PHOIBLE row decides it.
        segment_class = phoible[phoible.Phoneme == cmp_phoneme].SegmentClass.iloc[0]
        if segment_class == 'vowel':
            vowels.append(phoneme)
        elif segment_class == 'consonant':
            consonants.append(phoneme)
        else:
            other.append(phoneme)

    if display:
        print('Phonemes:')
        print(', '.join(sorted(vowels)))
        print(', '.join(sorted(consonants)))
        print(', '.join(sorted(other)))
        print(f'Vowels: {len(vowels)}\nConsonants: {len(consonants)}\nOther: {len(other)}\nTOTAL: {len(vowels)+len(consonants)+len(other)}')

    return token_counts


In [None]:
# Print the phoneme inventory of every language folder in ../CHILDES-dataset
import os
for folder in sorted(os.listdir('../CHILDES-dataset')):
    # Entries whose name contains a dot are skipped rather than processed.
    if '.' not in folder:
        print(folder)
        get_phoneme_set(os.path.join('../CHILDES-dataset', folder), display=True)
        print('\n\n')

Basque
Phonemes:
a, ai̯, au̯, e, ei̯, eu̯, i, o, oi̯, u, y
b, c, d̪, f, j, k, l, m, n, p, r, s̪̻, s̺, t̠ʃ, t̪, t̪̻s̪̻, t̺s̺, x, ð, ɟ, ɡ, ɣ, ɲ, ɾ, ʃ, ʎ, β, θ

Vowels: 11
Consonants: 28
Other: 0
TOTAL: 39



Cantonese
Phonemes:
aːĭ˥, aːĭ˧, aːĭ˧˥, aːĭ˧˩̰, aːĭ˨, aːĭ˩˧, aːŭ˥, aːŭ˧, aːŭ˧˥, aːŭ˧˩̰, aːŭ˨, aːŭ˩˧, a̞˥, a̞˧, a̞˧˥, a̞˧˩̰, a̞˨, a̞˩˧, ei˥, ei˧, ei˧˥, ei˧˩̰, ei˨, ei˩˧, iːŭ˥, iːŭ˧, iːŭ˧˥, iːŭ˧˩̰, iːŭ˨, iːŭ˩˧, i˥, i˧, i˧˥, i˧˩̰, i˨, i˩˧, ou˥, ou˧, ou˧˥, ou˧˩̰, ou˨, ou˩˧, uːĭ˥, uːĭ˧, uːĭ˧˥, uːĭ˧˩̰, uːĭ˨, uːĭ˩˧, u˥, u˧, u˧˥, u˧˩̰, u˨, u˩˧, y˥, y˧, y˧˥, y˧˩̰, y˨, y˩˧, œ̞˥, œ̞˧, œ̞˧˥, œ̞˧˩̰, œ̞˨, œ̞˩˧, ɐi˥, ɐi˧, ɐi˧˥, ɐi˧˩̰, ɐi˨, ɐi˩˧, ɐu˥, ɐu˧, ɐu˧˥, ɐu˧˩̰, ɐu˨, ɐu˩˧, ɐ˥, ɐ˧, ɐ˧˥, ɐ˧˩̰, ɐ˨, ɐ˩˧, ɔːĭ˥, ɔːĭ˧, ɔːĭ˧˥, ɔːĭ˧˩̰, ɔːĭ˨, ɔːĭ˩˧, ɔ̽˥, ɔ̽˧, ɔ̽˧˥, ɔ̽˧˩̰, ɔ̽˨, ɔ̽˩˧, ɛ, ɛ˥, ɛ˧, ɛ˧˥, ɛ˧˩̰, ɛ˨, ɛ˩˧, ɪ̞˥, ɪ̞˧, ɪ̞˧˥, ɪ̞˧˩̰, ɪ̞˨, ɵy˥, ɵy˧, ɵy˧˥, ɵy˧˩̰, ɵy˨, ɵy˩˧, ɵ˥, ɵ˧, ɵ˧˥, ɵ˧˩̰, ɵ˨, ɵ˩˧, ʊ̟˥, ʊ̟˧, ʊ̟˧˥, ʊ̟˧˩̰, ʊ̟˨, ʊ̟˩˧
f, h, j, k, kʰ, l, m, m̩˥, m̩˧, m̩˧˥, m̩

## Compare Phoneme Inventories

In [3]:
import sys, os
from pathlib import Path
import pandas as pd

sys.path.append('../../../')
sys.path.append('./')
from corpus_phonemizer import phonemize_utterances

# Path of the espeak-ng shared library, published through the
# PHONEMIZER_ESPEAK_LIBRARY environment variable. setdefault keeps a value that
# is already exported, so the machine-specific path below is only a fallback.
os.environ.setdefault('PHONEMIZER_ESPEAK_LIBRARY', "/opt/local/lib/libespeak-ng.dylib")

# Run configuration. NOTE(review): none of these constants is referenced in the
# visible cells; presumably they are used by download/processing code elsewhere
# -- confirm or remove.
SKIP_DOWNLOAD = True
KEEP_CHILD_UTTERANCES = True
DOWNLOAD_OUT_PATH = Path('../downloaded')
MAX_AGE = None

def get_vocab_from_phonemizing(language, backend, lang):
    """Phonemize one language's processed corpus and return its token vocabulary.

    Args:
        language: Folder name under ``../CHILDES-dataset`` whose
            ``processed.csv`` is read.
        backend: Backend name, passed through to ``phonemize_utterances``.
        lang: Language code, passed through to ``phonemize_utterances``. It
            also selects which column of the CSV is phonemized.

    Returns:
        The distinct tokens of the phonemized corpus (``WORD_BOUNDARY``
        excluded), sorted and joined with single spaces.
    """
    df = pd.read_csv(f'../CHILDES-dataset/{language}/processed.csv')
    # Mandarin and Cantonese codes are phonemized from the 'stem' column; every
    # other language uses 'processed_gloss'.
    uses_stem = lang in ['mandarin', 'cantonese', 'yue-Latn', 'cmn-Latn']
    lines = df['stem'] if uses_stem else df['processed_gloss']
    phonemized = phonemize_utterances(lines, backend, lang, keep_word_boundaries=True, verbose=True, use_folding=False)

    vocab = {token for line in phonemized for token in line.strip().split()}
    # discard instead of remove: no KeyError when the output holds no boundary
    # marker at all (for example an empty corpus).
    vocab.discard('WORD_BOUNDARY')
    return ' '.join(sorted(vocab))



In [4]:
# Phonemize the French corpus with both backends to compare their vocabularies.
epitran_vocab = get_vocab_from_phonemizing(
    language='French', backend='epitran', lang='fra-Latn',
)
phonemizer_vocab = get_vocab_from_phonemizing(
    language='French', backend='phonemizer', lang='fr-fr',
)

DEBUG:src.wrappers.wrapper.EpitranWrapper:Initializing EpitranWrapper with language "fra-Latn" and wrapper_kwargs "{}"
DEBUG:src.wrappers.wrapper.EpitranWrapper:Using epitram backend with language code "fra-Latn"...
DEBUG:src.wrappers.wrapper.EpitranWrapper:Skipping folding dictionary post-processing, using uncorrected output from epitran.
DEBUG:src.wrappers.wrapper.PhonemizerWrapper:Initializing PhonemizerWrapper with language "fr-fr" and wrapper_kwargs "{}"
DEBUG:src.wrappers.wrapper.PhonemizerWrapper:Using espeak backend with language code "fr-fr"...
DEBUG:src.wrappers.wrapper.PhonemizerWrapper:Skipping folding dictionary post-processing, using uncorrected output from phonemizer.


In [5]:
# Phonemes of PHOIBLE inventory 2269, as one space-separated string.
# NOTE(review): 2269 is presumably the French inventory -- confirm the ID.
phoible_vocab = ' '.join(
    sorted(phoible.loc[phoible.InventoryID == 2269, 'Phoneme'])
)
# Tokens actually found in the processed French dataset, same format.
dataset_vocab = ' '.join(
    sorted(get_phoneme_set('../CHILDES-dataset/French', display=False))
)

In [6]:
# Show the four French vocabularies (each a space-separated string) one per
# line so they can be compared by eye.
print('Epitran vocab:', epitran_vocab)
print('Phonemizer vocab:', phonemizer_vocab)
print('Phoible vocab:', phoible_vocab)
print('Dataset vocab:', dataset_vocab)

Epitran vocab: a ä b d e f i j k l m n o p s t u ü v w y ÿ z ø ŋ œ œ̃ ƭ ǝ ɑ ɑ̃ ɔ ɔ̃ ɘ ə ɛ ɛ̃ ɡ ɥ ɲ ʀ ʃ ʒ ̀ ́ ̂ ̈
Phonemizer vocab: a aː b d dʒ e f i iː j k l m n o oː p s t tʃ u v w y yː z ø øː ŋ œ œ̃ ɑ̃ ɔ ɔ̃ ə ɛ ɛ̃ ɡ ɲ ʁ ʃ ʒ
Phoible vocab: a b d e f i j k l m n o p s t u v w y z ø œ ɑ̃ ɔ ɔ̃ ə ɛ ɛ̃ ɡ ɥ ɲ ʁ ʃ ʒ
Dataset vocab: a b d d̠ʒ e f i j k l m n o p s t t̠ʃ u v w y z ø ŋ œ ɑ̃ ɔ ɔ̃ ə ɛ ɛ̃ ɡ ɲ ʁ ʃ ʒ


In [21]:
def _as_phoneme_set(vocab):
    """Return the phonemes in `vocab` as a set.

    A string is treated as a space-separated vocabulary and split into tokens.
    Calling set() on such a string directly would give a set of characters,
    breaking multi-character phonemes such as 'ɑ̃' into pieces. Any other
    iterable is converted as-is.
    """
    if isinstance(vocab, str):
        return set(vocab.split())
    return set(vocab)


def _show(label, phoneme_set):
    """Print one region of the comparison: the label, then the sorted phonemes."""
    print(label, ' '.join(sorted(phoneme_set)))


dataset_set = _as_phoneme_set(dataset_vocab)
epitran_set = _as_phoneme_set(epitran_vocab)
phonemizer_set = _as_phoneme_set(phonemizer_vocab)
phoible_set = _as_phoneme_set(phoible_vocab)

# Phonemes present in all four sources. Named `in_all` rather than `all` so
# the builtin all() is not shadowed for the rest of the notebook.
in_all = dataset_set & epitran_set & phonemizer_set & phoible_set

# Phonemes shared by exactly three sources (the fourth is subtracted).
# Parentheses spell out the grouping instead of relying on `-` binding
# tighter than `&`.
d_e_p = (dataset_set & epitran_set & phoible_set) - phonemizer_set
d_e_ph = (dataset_set & epitran_set & phonemizer_set) - phoible_set
d_p_ph = (dataset_set & phonemizer_set & phoible_set) - epitran_set
e_p_ph = (epitran_set & phonemizer_set & phoible_set) - dataset_set

# Phonemes shared by exactly two sources.
d_e = (dataset_set & epitran_set) - phonemizer_set - phoible_set
d_p = (dataset_set & phonemizer_set) - epitran_set - phoible_set
d_ph = (dataset_set & phoible_set) - epitran_set - phonemizer_set
e_p = (epitran_set & phonemizer_set) - dataset_set - phoible_set
e_ph = (epitran_set & phoible_set) - dataset_set - phonemizer_set
p_ph = (phonemizer_set & phoible_set) - dataset_set - epitran_set

# Phonemes found in one source only.
d = dataset_set - epitran_set - phonemizer_set - phoible_set
e = epitran_set - dataset_set - phonemizer_set - phoible_set
p = phonemizer_set - dataset_set - epitran_set - phoible_set
ph = phoible_set - dataset_set - epitran_set - phonemizer_set

_show('All:', in_all)
_show('Dataset, Epitran, Phoible:', d_e_p)
_show('Dataset, Epitran, Phonemizer:', d_e_ph)
_show('Dataset, Phonemizer, Phoible:', d_p_ph)
_show('Epitran, Phonemizer, Phoible:', e_p_ph)
_show('Dataset, Epitran:', d_e)
_show('Dataset, Phonemizer:', d_p)
_show('Dataset, Phoible:', d_ph)
_show('Epitran, Phonemizer:', e_p)
_show('Epitran, Phoible:', e_ph)
_show('Phonemizer, Phoible:', p_ph)
_show('Dataset:', d)
_show('Epitran:', e)
_show('Phonemizer:', p)
_show('Phoible:', ph)


All: a b d e f i j k l m n o p s t u v w y z ø œ ɑ̃ ɔ ɔ̃ ə ɛ ɛ̃ ɡ ɲ ʃ ʒ
Dataset, Epitran, Phoible: 
Dataset, Epitran, Phonemizer: ŋ
Dataset, Phonemizer, Phoible: ʁ
Epitran, Phonemizer, Phoible: 
Dataset, Epitran: 
Dataset, Phonemizer: 
Dataset, Phoible: 
Epitran, Phonemizer: œ̃
Epitran, Phoible: ɥ
Phonemizer, Phoible: 
Dataset: d̠ʒ t̠ʃ
Epitran: ä ü ÿ ƭ ǝ ɑ ɘ ʀ ̀ ́ ̂ ̈
Phonemizer: aː dʒ iː oː tʃ yː øː
Phoible: 
