In [1]:
import os
import sys
import rpy2.robjects.lib.ggplot2 as ggplot2
import pandas as pd
import numpy as np
%load_ext rpy2.ipython

In [2]:
# Put the directory two levels up on the import path so the `src` package resolves.
sys.path.append('../../')
from src.utils import configuration
# Project configuration object; later cells read `prov_csv_dir`, `cmu_path` and
# `project_root` from it.
config = configuration.Config()

In [3]:
# Load the token table from its pickle so the associated CMU pronunciations come along.
# NOTE(review): unpickling can execute arbitrary code -- only load pickles produced by
# this project.
phono_pickle_path = os.path.join(
    config.prov_csv_dir,
    'pvd_utt_glosses_phono_cleaned_inflated_to_next_notebook.pkl',
)
all_tokens_phono = pd.read_pickle(phono_pickle_path)

In [4]:
# Drop every token whose actual or model phonology is one of these placeholder strings
# (empty, '*', or dotted parentheses -- presumably pause markers; confirm).
excludes = ['', '*','(.)','(..)', '(...)','(....)','(.....)']
has_actual = ~all_tokens_phono.actual_phonology.isin(excludes)
has_model = ~all_tokens_phono.model_phonology.isin(excludes)
# .copy() detaches the subset from `all_tokens_phono`: later cells add columns to
# `phono`, which would otherwise trigger SettingWithCopyWarning.
phono = all_tokens_phono.loc[has_actual & has_model].copy()
phono.shape

(222881, 32)

In [5]:
# Show the first remaining token to see which columns are available.
phono.iloc[0]

token                             mommy
utterance_id                   16759315
gloss                             Mommy
transcript_id                     42204
utterance_order                       6
target_child_name                  Alex
speaker_code                        CHI
type                        declarative
punct                                 .
speaker_code_simple               [CHI]
gloss_with_punct           [CHI] mommy.
token_id                          20565
seq_utt_id                            3
actual_phonology                    ɑmɪ
model_phonology                   mɑmiː
target_child_age                    514
bert_token_id                        23
model_phonology_clean              mɑmi
actual_phonology_clean              ɑmə
model_phonology_no_dia             mɑmi
actual_phonology_no_dia             ɑmə
cv_raw_actual                       vcv
cv_collapsed_actual                 vcv
num_vowels_actual                     2
cv_raw_model                       cvcv


In [6]:
# Model vs. actual transcriptions for every token glossed 'watch'.
watch_rows = phono.gloss == 'watch'
phono.loc[watch_rows, ['model_phonology_no_dia', 'actual_phonology_no_dia']]

Unnamed: 0,model_phonology_no_dia,actual_phonology_no_dia
152856,wɑʧ,wɑtʃ
177031,wɑʧ,wɑth
214016,wɑʧ,wətʃ
240496,wɑʧ,wɑtʃ
249089,wɑʧ,wɑtʃ
272053,wɑʧ,wɑtʃ
278047,wɑʧ,wɑtʃ
285615,wɑʧ,wɑtʃ
294085,wɑʧ,wɑtʃ
299420,wɑʧ,wɑtʃ


In [7]:
# Load the CMU pronunciation table from the path given in the config and preview it.
cmu = pd.read_pickle(config.cmu_path)
cmu.head(5)

Unnamed: 0,index,word,pronunciation,phones,ipa,structure,num_vowels,ipa_short
70,71,a,AH0,[AH],[ə],[v],1,ə
71,72,a,EY1,[EY],[eɪ],[v],1,eə
77,78,aa,EY2 EY1,"[EY, EY]","[eɪ, eɪ]","[v, v]",2,eəeə
92,93,aaron,EH1 R AH0 N,"[EH, R, AH, N]","[ɛ, ɹ, ə, n]","[v, c, v, c]",2,ɛɹən
102,103,ab,AE1 B,"[AE, B]","[æ, b]","[v, c]",1,æb


In [65]:
# model_ipa: the `model_phonology_no_dia` form, unchanged.
# actual_ipa: the `actual_phonology_no_dia` form with a space between every character.
phono['model_ipa'] = phono['model_phonology_no_dia']
phono['actual_ipa'] = [' '.join(form) for form in phono['actual_phonology_no_dia']]

In [66]:
# Sanity check: does the plain character 'a' occur anywhere in the spaced actual forms?
any('a' in form for form in phono['actual_ipa'])

True

In [104]:
# Word -> `ipa_short` lookup built from the CMU table. A word that appears on several
# rows of `cmu` (e.g. 'a' in the preview above) keeps only its last row's value,
# because later dict keys overwrite earlier ones.
cmu_map = dict(zip(cmu.word, cmu.ipa_short))
def cmu_mapper(x, cmu_map):
    """Return the mapped pronunciation for a token, falling back to the model form.

    Args:
        x: A ``(token, model)`` pair; only ``x[0]`` and ``x[1]`` are read.
        cmu_map: Mapping from token to pronunciation string.

    Returns:
        ``cmu_map[token]`` when the token is a key of ``cmu_map``, otherwise
        ``model`` unchanged.
    """
    token, model = x[0], x[1]
    return cmu_map.get(token, model)
    

# Look up each token's CMU form (falling back to the model form), then expand the
# affricate ligatures into two symbols and put a space between every character.
cmu_forms = [cmu_mapper(pair, cmu_map) for pair in zip(phono.token, phono.model_ipa)]
phono['cmu'] = cmu_forms
phono['cmu'] = [' '.join(form.replace('ʧ','tʃ').replace('ʤ','dʒ')) for form in phono['cmu']]
print(phono.shape)
# Discard rows whose form still contains a literal '*'.
has_star = phono.cmu.str.contains('\\*')
phono = phono.loc[~has_star]
print(phono.shape)

(222881, 36)
(222880, 36)


In [105]:
# Preview the two columns that the following cell writes to the TSV files.
phono[['cmu','actual_ipa']]

Unnamed: 0,cmu,actual_ipa
2986381,ə n,ə n
2321570,b ɛ ɹ,b ə
40583,b ɛ ɹ,b ɛ ə
957249,t i v i,t ʃ i v i
1766402,ɑ,ɑ
...,...,...
1608358,h æ p i,h æ ʔ p i
2989720,a ə,a ə
1159219,j æ,j æ
157982,t u,t u


In [106]:
# Sizes of the three splits. Rows beyond num_dev + num_train + num_test (the frame had
# 222,880 rows when this was last run) are not written to any split.
num_dev = 20000
num_train = 180000
num_test = 20000

# Fixed seed so the shuffle -- and therefore every split file -- is reproducible.
split_seed = 0

fst_dir = os.path.join(config.project_root, 'output/fst')
for output_dir in ('dev','train','test'):
    # exist_ok avoids the separate exists() check.
    os.makedirs(os.path.join(fst_dir, output_dir), exist_ok=True)

# Shuffle once; later cells slice `phono` with the same boundaries.
phono = phono.sample(frac=1, random_state=split_seed)

split_bounds = {
    'dev': (0, num_dev),
    'train': (num_dev, num_dev + num_train),
    'test': (num_dev + num_train, num_dev + num_train + num_test),
}
for split_name, (start, stop) in split_bounds.items():
    # Two tab-separated columns per line (cmu, actual_ipa), no header or index.
    phono[['cmu','actual_ipa']].iloc[start:stop].to_csv(
        os.path.join(fst_dir, split_name, 'chi_' + split_name + '.tsv'),
        sep='\t', header=False, index=False)

In [107]:
# `phono['cmu']` already has one space between symbols, so joining its characters
# again would leave three spaces between symbols. Normalise to single spaces instead.
phono['model_ipa_split'] = [' '.join(form.split()) for form in phono['cmu']]
# Write the training slice of the inputs (same row range as chi_train.tsv).
phono[['model_ipa_split']].iloc[num_dev:num_dev+num_train].to_csv(os.path.join(config.project_root,
    'output/fst/train/chi_inputs.tsv'),
    sep='\t', header=False, index=False)

# Split the Phonology Training Data by Child

In [None]:
# what we want now is dev / train / test for each of the children

In [108]:
# Slice the shuffled frame into dev / train / test using the split sizes.
train_start = num_dev
test_start = train_start + num_train
test_stop = test_start + num_test

dev_tokens = phono.iloc[:train_start]
training_tokens = phono.iloc[train_start:test_start]
test_tokens = phono.iloc[test_start:test_stop]

In [109]:
# Group each split by child so per-child subsets can be fetched with get_group().
child_column = 'target_child_name'
training_tokens_grouped = training_tokens.groupby(child_column)
dev_tokens_grouped = dev_tokens.groupby(child_column)
test_tokens_grouped = test_tokens.groupby(child_column)

In [110]:
# Write one dev / train / test TSV per child. Children are taken from the training
# split; a child with no tokens in some split is reported and skipped for that split
# instead of aborting the whole loop with a KeyError from get_group().
grouped_by_split = {
    'dev': dev_tokens_grouped,
    'train': training_tokens_grouped,
    'test': test_tokens_grouped,
}
for child in training_tokens_grouped.groups.keys():
    for split_name, grouped in grouped_by_split.items():
        if child not in grouped.groups:
            print('no ' + split_name + ' tokens for ' + child + '; file not written')
            continue
        child_df = grouped.get_group(child)
        # Same two-column, headerless TSV layout as the all-children files.
        child_df[['cmu','actual_ipa']].to_csv(
            os.path.join(config.project_root, 'output/fst', split_name,
                         child + '_' + split_name + '.tsv'),
            sep='\t', header=False, index=False)

# Make a UTF-8 symbol set that covers all of the characters across all children, train + test sets

In [111]:
# Split every actual form on spaces and collect the sorted, distinct symbols.
all_sounds = [form.split(' ') for form in phono['actual_ipa']]
flat_sounds = []
for symbols in all_sounds:
    flat_sounds.extend(symbols)
phones = np.unique(flat_sounds)
phones

array(['a', 'b', 'd', 'e', 'f', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
       'p', 's', 't', 'u', 'v', 'w', 'x', 'z', 'æ', 'ð', 'ŋ', 'ɑ', 'ə',
       'ɛ', 'ɜ', 'ɡ', 'ɣ', 'ɱ', 'ɲ', 'ɹ', 'ɾ', 'ʃ', 'ʊ', 'ʒ', 'ʔ', 'ʧ',
       'θ'], dtype='<U1')

In [112]:
# cmu: the same symbol inventory, computed from the `cmu` column.
all_cmu_sounds = [form.split(' ') for form in phono['cmu']]
flat_cmu_sounds = []
for symbols in all_cmu_sounds:
    flat_cmu_sounds.extend(symbols)
cmu_phones = np.unique(flat_cmu_sounds)
cmu_phones

array(['a', 'b', 'd', 'e', 'f', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
       'p', 's', 't', 'u', 'v', 'w', 'z', 'æ', 'ð', 'ŋ', 'ɑ', 'ə', 'ɛ',
       'ɡ', 'ɹ', 'ʃ', 'ʊ', 'ʒ', 'θ'], dtype='<U1')

In [113]:
# Symbols that occur in the cmu forms but never in the actual forms.
set(cmu_phones).difference(phones)

set()

In [None]:
# TODO: check what was loaded by likelihoods and see whether it falls outside this set. actual_ipa may come from the CMU renditions, which are different.

In [114]:
# Spot check that 'ɲ' is part of the symbol inventory taken from the actual forms.
'ɲ' in phones

True

In [115]:
# Symbol table: '<epsilon>' gets id 0; every phone gets its Unicode code point as id.
epsilon_row = pd.DataFrame({'utf8':['<epsilon>'], 'sym': [0]})
phone_rows = pd.DataFrame({'utf8':phones, 'sym': list(map(ord, phones))})
utf8_sym = pd.concat([epsilon_row, phone_rows])
utf8_sym

Unnamed: 0,utf8,sym
0,<epsilon>,0
0,a,97
1,b,98
2,d,100
3,e,101
4,f,102
5,h,104
6,i,105
7,j,106
8,k,107


In [116]:
# Write the table as tab-separated `utf8<TAB>sym` lines with no header or index.
# NOTE(review): this path is relative to the working directory, unlike the
# config.project_root-based paths used for the TSV outputs -- confirm that is intended.
utf8_sym.to_csv('../../src/external/all_child_phones.sym', header=False, index=False, sep='\t')