In [1]:
# Note: run this notebook with the "openfst" kernel so that the correct Python bindings
# for OpenFst (pywrapfst) can be imported.

In [2]:
import pickle
import Levenshtein
def load_pickle(filename):
    """Read the pickle file at `filename` and return the object stored in it.

    The file is opened in binary mode and closed again before returning.
    Only use this on trusted files: unpickling can execute arbitrary code.
    """
    with open(filename, mode='rb') as handle:
        loaded = pickle.load(handle)
    return loaded
import pandas as pd
import numpy as np
import sys
sys.path.append('/usr/local/lib/python3.6/site-packages')
from utils import wfst

In [3]:
# Load the four pickled inputs from likelihood_inputs/ that the distance computations below consume.
all_tokens_phono = load_pickle('likelihood_inputs/all_tokens_phono.obj')
cmu_in_initial_vocab = load_pickle('likelihood_inputs/cmu_in_initial_vocab_with_duplicates.obj')
initial_vocab = load_pickle('likelihood_inputs/initial_vocab.obj')
priors_for_age_interval = load_pickle('likelihood_inputs/priors_for_age_interval.obj')

# Sort the vocabulary in place (the result is not reassigned). The column order of the
# Levenshtein matrix built below follows this order, so it must happen before that call.
initial_vocab.sort()

# Levenshtein Distance

In [4]:
def get_edit_distance_matrix(all_tokens_phono, prior_data, initial_vocab,  cmu_2syl_inchildes):
    '''
    Get an edit distance matrix for matrix-based computation of the posterior.

    all_tokens_phono: corpus in tokenized form, with phonological transcriptions;
        the 'bert_token_id' and 'actual_phonology_no_dia' columns are used
    prior_data: priors of the form output by `compare_successes_failures_*`;
        only prior_data['scores']['bert_token_id'] is read
    initial_vocab: word types corresponding to the softmax mask
    cmu_2syl_inchildes: CMU pronunciations, must have 'word' and 'ipa_short' columns

    Returns (levdists, ipa):
        levdists: 2-D array of Levenshtein distances with one row per row of `ipa`
            and one column per row of `initial_vocab` left-merged with
            `cmu_2syl_inchildes` on 'word' (a word with several pronunciations
            therefore contributes several columns). Its shape is (0, n_columns)
            when no token in `prior_data` matches `all_tokens_phono`.
        ipa: DataFrame with 'bert_token_id' and 'actual_phonology_no_dia' columns,
            one row per matched token
    '''

    bert_token_ids = prior_data['scores']['bert_token_id']
    # Inner join: only prior tokens that have a transcription are kept.
    ipa = pd.DataFrame({'bert_token_id': bert_token_ids}).merge(
        all_tokens_phono[['bert_token_id', 'actual_phonology_no_dia']],
        on='bert_token_id')

    # Left join keeps every vocabulary word, in vocabulary order.
    # NOTE(review): a word without a CMU entry gets a missing 'ipa_short' here, which is
    # then handed to Levenshtein.distance -- confirm that cannot happen for real inputs.
    iv = pd.DataFrame({'word': initial_vocab}).merge(
        cmu_2syl_inchildes, how='left', on='word')

    # Materialise both columns once so the nested loop iterates plain lists
    # instead of re-iterating a pandas Series for every target.
    targets = list(ipa['actual_phonology_no_dia'])
    candidates = list(iv['ipa_short'])

    if not targets:
        # np.vstack raises on an empty sequence; return an empty matrix that
        # still carries the right number of columns.
        return (np.empty((0, len(candidates)), dtype=int), ipa)

    levdists = np.vstack([
        np.array([Levenshtein.distance(target, candidate) for candidate in candidates])
        for target in targets])
    return (levdists, ipa)

In [5]:
# Levenshtein distances: one row per transcribed token, one column per vocabulary pronunciation.
levdists, lev_ipa = get_edit_distance_matrix(all_tokens_phono, priors_for_age_interval, initial_vocab,  cmu_in_initial_vocab)

In [6]:
# (number of matched tokens, number of vocabulary pronunciations)
levdists.shape

(11718, 8869)

# WFST Distance

In [7]:
# Toggles automatic post-mortem debugging: an uncaught exception drops into pdb.
# NOTE(review): debugging aid -- it blocks a non-interactive "Run All"; consider removing once stable.
%pdb

Automatic pdb calling has been turned ON


In [8]:
# Re-import utils.wfst so that edits made to the module on disk are picked up
# without restarting the kernel. `importlib.reload` replaces `imp.reload`:
# the `imp` module is deprecated and was removed in Python 3.12.
import importlib
importlib.reload(wfst)

<module 'utils.wfst' from '/home/szhi/child-directed-listening/utils/wfst.py'>

In [None]:
# WFST-based counterpart of the Levenshtein matrix, computed from the same corpus, priors and vocabulary.
# NOTE(review): the two path arguments are presumably the FST definition and its phone symbol
# table -- confirm against utils.wfst.get_wfst_distance_matrix.
wfst_dists, wfst_ipa = wfst.get_wfst_distance_matrix(all_tokens_phono, priors_for_age_interval, initial_vocab,  cmu_in_initial_vocab,
    'fst/chi-1.txt', 'fst/chi_phones.sym')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ats_end[0,2] = ''
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ats_end[0,3] = ''
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the d

In [None]:
# NOTE(review): presumably collapses the columns of words that have several CMU pronunciations
# into one column per word -- confirm against utils.wfst.reduce_duplicates.
wfst_dists_reduced = wfst.reduce_duplicates(wfst_dists, cmu_in_initial_vocab)
wfst_dists_reduced

In [None]:
# Shape of the reduced WFST matrix, for comparison with levdists.shape.
wfst_dists_reduced.shape

## Sanity Check

In [None]:
def sanity_check_wfst(wfst_dists_reduced, wfst_ipa, row_index):
    """Print one token and the vocabulary word with the largest WFST value for it.

    wfst_dists_reduced: 2-D array indexed as [token row, vocabulary column]
    wfst_ipa: DataFrame whose row `row_index` describes the token
    row_index: positional index of the token to inspect

    Prints the token row, the winning column index, and the entry of the
    notebook-level `initial_vocab` at that index.

    NOTE(review): this picks the arg-MAX whereas sanity_check_lev picks the
    arg-min; presumably larger WFST values mean a closer match -- confirm.
    """
    token_row = wfst_ipa.iloc[row_index]
    best_column = np.argmax(wfst_dists_reduced[row_index, :])
    best_word = initial_vocab[best_column]
    for item in (token_row, best_column, best_word):
        print(item)

# Spot-check six consecutive token rows.
for row_index in range(7530, 7536):
    sanity_check_wfst(wfst_dists_reduced, wfst_ipa, row_index)

In [None]:
initial_vocab_with_duplicates = list(cmu_in_initial_vocab["word"])

# Column labels for `levdists`. get_edit_distance_matrix builds its columns by
# left-merging `initial_vocab` with the CMU table, so the labels are rebuilt the
# same way here instead of assuming the CMU table's own row order matches.
lev_column_words = list(
    pd.DataFrame({'word': initial_vocab})
    .merge(cmu_in_initial_vocab, how='left', on='word')['word'])
assert len(lev_column_words) == levdists.shape[1], \
    'levdists columns do not match the merged vocabulary; rebuild levdists'

def sanity_check_lev(levdists, lev_ipa, row_index):
    """Print one token and the vocabulary word closest to it by Levenshtein distance.

    levdists: 2-D array indexed as [token row, vocabulary-pronunciation column]
    lev_ipa: DataFrame whose row `row_index` describes the token
    row_index: positional index of the token to inspect

    Prints the token row and the word labelling the column with the smallest
    distance (labels come from the notebook-level `lev_column_words`).
    """
    d_word = lev_ipa.iloc[row_index]
    w_word_index = np.argmin(levdists[row_index, :])
    w_word = lev_column_words[w_word_index]
    print(d_word)
    print(w_word)

# Spot-check the same six token rows as the WFST sanity check.
for row_index in range(7530, 7536):
    sanity_check_lev(levdists, lev_ipa, row_index)