In [None]:
# NOTE: run this notebook with the "openfst" kernel so that the correct Python
# bindings for OpenFst (pywrapfst) can be imported.

In [2]:
import pickle
import Levenshtein
def load_pickle(filename):
    """Open `filename` in binary mode and return the object unpickled from it."""
    with open(filename, mode='rb') as handle:
        contents = pickle.load(handle)
    return contents
import pandas as pd
import numpy as np
import sys
sys.path.append('/usr/local/lib/python3.6/site-packages')
from utils import wfst

In [3]:
# Load the pickled inputs stored under the likelihood_inputs/ directory.
all_tokens_phono = load_pickle('likelihood_inputs/all_tokens_phono.obj')
cmu_in_initial_vocab = load_pickle('likelihood_inputs/cmu_in_initial_vocab_with_duplicates.obj')
# Note: cmu_in_initial_vocab is read from the "_with_duplicates" file, which was
# updated to reflect additional records.
initial_vocab = load_pickle('likelihood_inputs/initial_vocab.obj')
priors_for_age_interval = load_pickle('likelihood_inputs/priors_for_age_interval.obj')

# Sort the vocabulary in place; later cells look words up by position
# (e.g. initial_vocab[index]), so they see this sorted order.
initial_vocab.sort()

In [4]:
cmu_in_initial_vocab

Unnamed: 0,word,pronunciation,phones,ipa,ipa_short,structure,num_vowels
70,a,AH0,[AH],[ə],ə,[v],1
71,a,EY1,[EY],[eə],eə,[v],1
77,aa,EY2 EY1,"[EY, EY]","[eə, eə]",eəeə,"[v, v]",2
92,aaron,EH1 R AH0 N,"[EH, R, AH, N]","[ɛ, ɹ, ə, n]",ɛɹən,"[v, c, v, c]",2
102,ab,AE1 B,"[AE, B]","[æ, b]",æb,"[v, c]",1
...,...,...,...,...,...,...,...
133666,zombie,Z AA1 M B IY0,"[Z, AA, M, B, IY]","[z, ɑ, m, b, i]",zɑmbi,"[c, v, c, c, v]",2
133671,zone,Z OW1 N,"[Z, OW, N]","[z, oʊʊ, n]",zoʊʊn,"[c, v, c]",1
133676,zoo,Z UW1,"[Z, UW]","[z, u]",zu,"[c, v]",1
133683,zoom,Z UW1 M,"[Z, UW, M]","[z, u, m]",zum,"[c, v, c]",1


In [5]:
# Show every row of the pronunciation table whose word is 'read'.
cmu_in_initial_vocab[cmu_in_initial_vocab['word'] == 'read']

Unnamed: 0,word,pronunciation,phones,ipa,ipa_short,structure,num_vowels
97825,read,R EH1 D,"[R, EH, D]","[ɹ, ɛ, d]",ɹɛd,"[c, v, c]",1
97826,read,R IY1 D,"[R, IY, D]","[ɹ, i, d]",ɹid,"[c, v, c]",1


In [6]:
# Load the table stored in cmu_in_initial_vocab.obj (the file without the
# "_with_duplicates" suffix) and display it for comparison.
# NOTE(review): the output below shows a single row for 'a', so this version
# presumably keeps one pronunciation per word — confirm.
cmu_in_initial_vocab_original = load_pickle('likelihood_inputs/cmu_in_initial_vocab.obj')
cmu_in_initial_vocab_original

Unnamed: 0,index,word,pronunciation,phones,ipa,ipa_short,structure,num_vowels
0,71,a,AH0,[AH],[ə],ə,[v],1
1,78,aa,EY2 EY1,"[EY, EY]","[eɪ, eɪ]",eəeə,"[v, v]",2
5,93,aaron,EH1 R AH0 N,"[EH, R, AH, N]","[ɛ, ɹ, ə, n]",ɛɹən,"[v, c, v, c]",2
7,103,ab,AE1 B,"[AE, B]","[æ, b]",æb,"[v, c]",1
9,150,abbey,AE1 B IY0,"[AE, B, IY]","[æ, b, i]",æbi,"[v, c, v]",2
...,...,...,...,...,...,...,...,...
12546,133667,zombie,Z AA1 M B IY0,"[Z, AA, M, B, IY]","[z, ɑ, m, b, i]",zɑmbi,"[c, v, c, c, v]",2
12547,133672,zone,Z OW1 N,"[Z, OW, N]","[z, oʊ, n]",zoʊʊn,"[c, v, c]",1
12548,133677,zoo,Z UW1,"[Z, UW]","[z, u]",zu,"[c, v]",1
12549,133684,zoom,Z UW1 M,"[Z, UW, M]","[z, u, m]",zum,"[c, v, c]",1


# Levenshtein Distance

In [7]:
def get_edit_distance_matrix(all_tokens_phono, prior_data, initial_vocab, cmu_2syl_inchildes):
    '''
    Get an edit distance matrix for matrix-based computation of the posterior

    all_tokens_phono: corpus in tokenized form, with phonological transcriptions;
        must have 'bert_token_id' and 'actual_phonology_no_dia' columns
    prior_data: priors of the form output by `compare_successes_failures_*`;
        only prior_data['scores']['bert_token_id'] is read here
    initial_vocab: word types corresponding to the softmax mask
    cmu_2syl_inchildes: cmu pronunciations, must have 'word' and 'ipa_short' columns

    Returns a tuple (levdists, ipa):
        levdists: 2-D array of Levenshtein distances with one row per row of `ipa`
            and one column per row of the vocabulary/pronunciation merge (a word
            with several pronunciations contributes several columns)
        ipa: DataFrame with 'bert_token_id' and 'actual_phonology_no_dia' for the
            prior tokens that were found in `all_tokens_phono`

    If none of the prior tokens is found in `all_tokens_phono`, an empty matrix
    with zero rows is returned instead of failing inside np.vstack.
    '''

    bert_token_ids = prior_data['scores']['bert_token_id']
    # Inner merge: keeps only the prior tokens that also occur in all_tokens_phono,
    # in the order of the prior tokens.
    ipa = pd.DataFrame({'bert_token_id': bert_token_ids}).merge(
        all_tokens_phono[['bert_token_id', 'actual_phonology_no_dia']],
        on='bert_token_id')

    # Left merge: keeps the order of initial_vocab and repeats a word once per
    # pronunciation listed for it.
    iv = pd.DataFrame({'word': initial_vocab}).merge(
        cmu_2syl_inchildes[['word', 'ipa_short']], how='left', on='word')
    # Materialize the candidate pronunciations once instead of iterating the
    # Series again for every target.
    candidates = iv['ipa_short'].tolist()

    if ipa.empty:
        # np.vstack raises ValueError on an empty list; return a well-shaped
        # empty matrix so callers can still inspect `.shape`.
        return np.empty((0, len(candidates)), dtype=int), ipa

    levdists = np.vstack([
        np.array([Levenshtein.distance(target, candidate) for candidate in candidates])
        for target in ipa['actual_phonology_no_dia']
    ])
    return levdists, ipa

In [8]:
# Levenshtein distances between each prior token's 'actual_phonology_no_dia' form (rows)
# and each 'ipa_short' pronunciation of the vocabulary (columns); lev_ipa holds the row tokens.
levdists, lev_ipa = get_edit_distance_matrix(all_tokens_phono, priors_for_age_interval, initial_vocab,  cmu_in_initial_vocab)

In [9]:
levdists.shape

(11718, 8869)

In [11]:
levdists.shape

(11718, 8869)

In [15]:
# Presumably collapses the columns of words with several pronunciations into one
# column per word (wfst.reduce_duplicates is not shown in this notebook — confirm).
# NOTE(review): in the saved outputs the last row starts 0, 1, 3 in levdists but
# 1, 3, 3 in levdists_reduced. If the first two columns are the two pronunciations
# of 'a', the larger value is being kept; for a distance the smaller one may be
# intended — confirm against wfst.reduce_duplicates.
levdists_reduced = wfst.reduce_duplicates(levdists, cmu_in_initial_vocab)

In [17]:
levdists

array([[3, 3, 4, ..., 3, 3, 4],
       [2, 2, 4, ..., 2, 3, 4],
       [2, 2, 4, ..., 2, 3, 4],
       ...,
       [2, 2, 3, ..., 3, 3, 4],
       [2, 2, 4, ..., 2, 3, 4],
       [0, 1, 3, ..., 2, 3, 4]])

In [16]:
levdists_reduced

array([[3, 4, 4, ..., 3, 3, 4],
       [2, 4, 4, ..., 2, 3, 4],
       [2, 4, 4, ..., 2, 3, 4],
       ...,
       [2, 3, 3, ..., 3, 3, 4],
       [2, 4, 4, ..., 2, 3, 4],
       [1, 3, 3, ..., 2, 3, 4]])

# WFST Distance

In [None]:
%pdb

In [39]:
# Re-import utils.wfst so that edits to the module are picked up without
# restarting the kernel. importlib.reload replaces imp.reload: the imp module
# is deprecated and was removed in Python 3.12.
import importlib
importlib.reload(wfst)

<module 'utils.wfst' from '/home/stephan/notebooks/nicole/child-directed-listening/utils/wfst.py'>

In [None]:
# NOTE(review): only the first three rows of all_tokens_phono are passed here
# (.iloc[0:3]). In the debugger session recorded below, those rows have
# bert_token_id 0, 1 and 2 while the smallest bert_token_id in the priors is
# 42474, so the merged `ipa` frame is empty. This looks like a leftover from
# debugging — confirm whether the full all_tokens_phono should be passed.
# NOTE(review): the recorded run stops at a pdb.set_trace() inside utils/wfst.py.
wfst_dists, wfst_ipa = wfst.get_wfst_distance_matrix(all_tokens_phono.iloc[0:3], priors_for_age_interval, initial_vocab,  cmu_in_initial_vocab,
    'fst/chi-1.txt', 'fst/chi_phones.sym', num_cores=48)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ats_end[0,2] = ''
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ats_end[0,3] = ''
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the d

> [0;32m/home/stephan/notebooks/nicole/child-directed-listening/utils/wfst.py[0m(288)[0;36mget_wfst_distance_matrix[0;34m()[0m
[0;32m    286 [0;31m    [0;32mimport[0m [0mpdb[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    287 [0;31m    [0mpdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 288 [0;31m    [0mdistances[0m [0;34m=[0m [0;34m[[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    289 [0;31m    [0md_fsa_inputs[0m [0;34m=[0m [0;34m[[0m[0mserial_inputs[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    290 [0;31m    [0mdistances[0m [0;34m=[0m [0;34m[[0m[0mcompute_all_likelihoods_for_w_over_paths_one[0m[0;34m([0m[0md_fsa_input[0m[0;34m)[0m [0;32mfor[0m [0md_fsa_input[0m [0;32min[0m [0md_fsa_inputs[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  serial_inputs


[]


ipdb>  ipa.actual_phonology_no_dia


Series([], Name: actual_phonology_no_dia, dtype: object)


ipdb>  ipa


Empty DataFrame
Columns: [bert_token_id, actual_phonology_no_dia]
Index: []


ipdb>  bert_token_ids = prior_data['scores']['bert_token_id']
ipdb>  prior_data['scores']['bert_token_id']


2749419    2749419.0
558301      558301.0
558302      558302.0
558303      558303.0
558304      558304.0
             ...    
955657      955657.0
248413      248413.0
1725984    1725984.0
2545317    2545317.0
1533942    1533942.0
Name: bert_token_id, Length: 11718, dtype: float64


ipdb>  ipa = pd.DataFrame({'bert_token_id':bert_token_ids}).merge(all_tokens_phono[['bert_token_id',         'actual_phonology_no_dia']])
ipdb>  Ipa.shape


*** NameError: name 'Ipa' is not defined


ipdb>  ipa.shape


(0, 2)


ipdb>  all_tokens_phono[['bert_token_id']]


   bert_token_id
0              0
1              1
2              2


ipdb>  all_tokens_phono.shape


(3, 72)


ipdb>  np.min(prior_data['scores']['bert_token_id’])


*** SyntaxError: EOL while scanning string literal


ipdb>  np.min(prior_data['scores']['bert_token_id'])


42474.0


ipdb>  all_tokens_phono.loc[all_tokens_phono.bert_token_id == 42474]


Empty DataFrame
Columns: [token, utterance_id, gloss, transcript_id, utterance_order, target_child_name, speaker_code, type, punct, speaker_code_simple, gloss_with_punct, token_id, seq_utt_id, actual_phonology, model_phonology, target_child_age, bert_token_id, model_phonology_clean, actual_phonology_clean, model_phonology_no_dia, actual_phonology_no_dia, cv_raw_actual, cv_collapsed_actual, num_vowels_actual, cv_raw_model, cv_collapsed_model, num_vowels_model, num_vowels, in_vocab, success_token, yyy_token, partition, phase_sample, year, phase_child_sample, phase_child_finetune, phase_child_sample_n=2_type=success_name=Alex, phase_child_sample_n=2_type=success_name=Ethan, phase_child_sample_n=2_type=success_name=Lily, phase_child_sample_n=2_type=success_name=Naima, phase_child_sample_n=2_type=success_name=Violet, phase_child_sample_n=2_type=success_name=William, phase_child_sample_n=2_type=yyy_name=Alex, phase_child_sample_n=2_type=yyy_name=Ethan, phase_child_sample_n=2_type=yyy_name=Li

In [24]:
wfst_dists

array([[2.91034475e-07, 9.70735108e-10, 3.67834322e-09, ...,
        7.37326126e-07, 8.21103669e-08, 4.29370811e-09],
       [1.52810475e-05, 1.07906226e-02, 3.85958957e-04, ...,
        3.55403403e-07, 7.24668517e-03, 5.87386341e-08],
       [1.54677852e-05, 1.29174011e-07, 1.94688503e-06, ...,
        5.18680997e-05, 2.73407839e-06, 6.45766567e-06],
       ...,
       [3.71623671e-06, 3.66785446e-07, 1.41435053e-05, ...,
        1.57430170e-08, 4.41272637e-07, 3.82429106e-06],
       [1.99342754e-05, 7.02976751e-07, 6.30810697e-07, ...,
        2.25177957e-06, 3.33650960e-07, 8.37396635e-08],
       [8.37781899e-01, 2.10227361e-03, 7.96600896e-03, ...,
        8.34289234e-04, 3.93670646e-03, 9.49116777e-05]])

In [29]:
# Guard: a wfst_dists with 7904 columns is rejected here. In the saved run this
# assertion fails, and wfst.reduce_duplicates on the same matrix then raises
# "IndexError: index 7904 is out of bounds for axis 1 with size 7904".
assert wfst_dists.shape[1] != 7904

AssertionError: 

In [22]:
# NOTE(review): the saved output of this cell is an IndexError (index 7904 is
# out of bounds for axis 1 with size 7904), so this run does not define
# wfst_dists_reduced; later cells that use it depend on earlier kernel state.
wfst_dists_reduced = wfst.reduce_duplicates(wfst_dists, cmu_in_initial_vocab)
wfst_dists_reduced

IndexError: index 7904 is out of bounds for axis 1 with size 7904

In [None]:
wfst_dists_reduced.shape

## Sanity Check

In [None]:
def sanity_check_wfst(wfst_dists_reduced, wfst_ipa, row_index, vocab=None):
    """Print one token row and the vocabulary word with the largest value in that row.

    wfst_dists_reduced: 2-D array; row `row_index` is searched with np.argmax
    wfst_ipa: DataFrame; its row at position `row_index` is printed as the token
    row_index: positional row index used for both inputs
    vocab: sequence of words indexed by column position; when omitted, the
        notebook-level `initial_vocab` is used (the original behaviour)

    Prints the token row, the winning column index and the corresponding word.
    Returns None.
    """
    if vocab is None:
        # Backward-compatible default: fall back to the notebook global.
        vocab = initial_vocab
    d_word = wfst_ipa.iloc[row_index]
    w_word_index = np.argmax(wfst_dists_reduced[row_index, :])
    w_word = vocab[w_word_index]
    print(d_word)
    print(w_word_index)
    print(w_word)

# Inspect rows 7530 through 7535 (inclusive), one call per row.
for row_index in range(7530, 7536):
    sanity_check_wfst(wfst_dists_reduced, wfst_ipa, row_index)

In [None]:
# Word for every row of the pronunciation table, in table order (a word appears
# once per row it occupies).
initial_vocab_with_duplicates = cmu_in_initial_vocab["word"].tolist()

def sanity_check_lev(levdists, lev_ipa, row_index, vocab=None):
    """Print one token row and the vocabulary word with the smallest value in that row.

    levdists: 2-D array; row `row_index` is searched with np.argmin
    lev_ipa: DataFrame; its row at position `row_index` is printed as the token
    row_index: positional row index used for both inputs
    vocab: sequence of words indexed by column position; when omitted, the
        notebook-level `initial_vocab_with_duplicates` is used (the original
        behaviour)

    Prints the token row and the corresponding word. Returns None.
    """
    if vocab is None:
        # Backward-compatible default: fall back to the notebook global.
        vocab = initial_vocab_with_duplicates
    d_word = lev_ipa.iloc[row_index]
    w_word_index = np.argmin(levdists[row_index, :])
    w_word = vocab[w_word_index]
    print(d_word)
    print(w_word)

# Inspect rows 7530 through 7535 (inclusive), one call per row.
for row_index in range(7530, 7536):
    sanity_check_lev(levdists, lev_ipa, row_index)

In [None]:
levdists