In [1]:
#note that this should be run with the "openfst" kernel to allow us to open the correct python bindings for openfst,
# pywrapfst

In [2]:
import pickle
import Levenshtein
def load_pickle(filename):
    with open(filename, 'rb') as file:
        return(pickle.load(file))
import pandas as pd
import numpy as np

In [58]:
all_tokens_phono = load_pickle('likelihood_inputs/all_tokens_phono.obj')
cmu_in_initial_vocab = load_pickle('likelihood_inputs/cmu_in_initial_vocab.obj')
initial_vocab = load_pickle('likelihood_inputs/initial_vocab.obj')
priors_for_age_interval = load_pickle('likelihood_inputs/priors_for_age_interval.obj')

# Levenshtein Distance

In [31]:
def get_edit_distance_matrix(all_tokens_phono, prior_data, initial_vocab,  cmu_2syl_inchildes):    
    '''
    Get an edit distance matrix for matrix-based computation of the posterior

    all_tokens_phono: corpus in tokenized from, with phonological transcriptions
    prior_data: priors of the form output by `compare_successes_failures_*`
    initial_vocab: word types corresponding to the softmask mask
    cmu_2syl_inchildes: cmu pronunctiations, must have 'word' and 'ipa_short' columns 
    '''

    bert_token_ids = prior_data['scores']['bert_token_id']
    ipa = pd.DataFrame({'bert_token_id':bert_token_ids}).merge(all_tokens_phono[['bert_token_id',
        'actual_phonology_no_dia']])

    iv = pd.DataFrame({'word':initial_vocab})
    iv = iv.merge(cmu_2syl_inchildes, how='left')

    levdists = np.vstack([np.array([Levenshtein.distance(target,x) for x in iv.ipa_short
    ]) for target in ipa.actual_phonology_no_dia])    
    return(levdists)    

In [32]:
levdists = get_edit_distance_matrix(all_tokens_phono, priors_for_age_interval, initial_vocab,  cmu_in_initial_vocab)

In [33]:
levdists.shape

(11718, 7904)

# WFST Distance

In [8]:
import sys
sys.path.append('/usr/local/lib/python3.6/site-packages')
import pywrapfst
from joblib import Parallel, delayed
import os
import copy

In [112]:
def vectorized_compute_all_likelihoods_for_w_over_paths(d_fsa, w_fsas, ws):    
    '''return a vector with entries corresponding to the total path weights from this d_fsa to each word in ws'''
    return([get_likelihood_for_fsas_over_paths(d_fsa, w_fsas, w) for w in ws])

def compute_all_likelihoods_for_w_over_paths_one(list_of_tuples):
    '''wrapper to compute likelihoods for a list of n (d_fsa, w_fsa, w) tuples'''
    test = [vectorized_compute_all_likelihoods_for_w_over_paths(x[0], x[1], x[2]) for x in list_of_tuples]
    return(np.vstack(test))

def get_weight_for_path(arc, shortest_paths): 
    '''get the weight of a single arc by iterating in Python'''
    #print(arc)
    path_weight = float(arc.weight)
    finished = False
    while not finished:
        #print(arc.nextstate)
        outgoing_arcs = [x for x in shortest_paths.arcs(arc.nextstate)]
        if len(outgoing_arcs) == 1: 
            arc = outgoing_arcs[0]
            path_weight += float(arc.weight)
        else:
            finished = True

    return(path_weight)

def get_weights_for_paths(shortest_paths):
    '''get the weights for a selection of paths'''
    initial_arcs = [x for x in shortest_paths.arcs(0)]
    return([get_weight_for_path(arc, shortest_paths) for arc in initial_arcs])

def get_likelihood_for_fsas_over_paths(d_fsa, w_fsas, w, num_paths=10, return_type = "probability"):
    '''get the weight of a single arc by iterating in Python'''
    if num_paths <= 0:
        raise ValueError('num_paths must be a positive integer')
        
    w_fsa = w_fsas[w]    
    dw_composed = pywrapfst.compose(w_fsa, d_fsa)
    dw_composed.arcsort(sort_type="ilabel")
       
    if num_paths > 1:
        shortest_paths = pywrapfst.epsnormalize(pywrapfst.shortestpath(dw_composed, nshortest=num_paths))
        if return_type == "shortest_paths": 
            return(shortest_paths)
        if shortest_paths.num_states() > 0:

            # take the reverse distance because with multiple shortest paths, 0 is the start state, 1 is the final state
            shortest_distance = pywrapfst.shortestdistance(shortest_paths, reverse=True)

            # iterate over all outgoing arcs from the start state  
            path_weights = get_weights_for_paths(shortest_paths)                                
            if return_type == "path_weights":
                return(path_weights)
            shortest_paths_sum = np.sum(np.exp(-1. * np.array(path_weights)))                    
            if return_type == "probability":
                return(shortest_paths_sum)
        else:
            # this is the case where there is no way to compose the d_fsa and the w_fsa
            return(0)

    else:
        shortest_path = pywrapfst.shortestpath(dw_composed)
        if shortest_path.num_states() > 0:
            shortest_distance = pywrapfst.shortestdistance(shortest_path)
            return(np.exp(-1 *float(shortest_distance[0])))
        else:
            return(0)
        
def string_to_fsa(input_string, sym):
    '''build an FSA for a given input string using the symbol table, sym'''
    
    # first make sure all chars can be converted
    input_list = list(input_string)
    for i in input_list:
        if sym.find(i) == -1:
            raise ValueError('Input character not found')
    
    # build the FSA
    
    f = pywrapfst.VectorFst()
    one = pywrapfst.Weight.one(f.weight_type())
    f.set_input_symbols(sym)
    f.set_output_symbols(sym)
    s = f.add_state()
    f.set_start(s)
    for i in input_list:    
        n = f.add_state()
        f.add_arc(s, pywrapfst.Arc(sym.find(i),
            sym.find(i),  one, n))
        s = n 
    f.set_final(n, 1)
        
    # verify
    if not f.verify():
        raise ValueError('FSA failed to verify')
    return(f)

def write_out_edited_fst(edited_fst, output_path):
    '''writes out a pandas data frame to an FST formatted text file that can them be compiled with OpenFST'''

    # needs to write each state terminal separately
    
    # get the indices of the terminals
    state_weight = np.hstack([np.array([-1]), np.where(edited_fst[[3]] == '')[0]])
    
    first = True
    for i in range(len(state_weight) -1):
        section_start = state_weight[i] + 1
        section_end = state_weight[i+1]         
        #print('Main section: '+str(section_start)+ ' - ', str(section_end))
        
        terminal_start =  state_weight[i+1]
        terminal_end = state_weight[i+1] + 1
        #print('Terminal section: '+str(terminal_start)+ ' - ', str(terminal_end))
        
        ats_section = edited_fst[section_start:section_end]        
        for j in range(3):
            ats_section[[j]] = ats_section[[j]].astype('int')
        #print(ats_section)
            
        if first: 
            ats_section.to_csv(output_path, index=False, header=None, sep='\t')
            first = False
        else:
            ats_section.to_csv(output_path, mode='a', index=False, header=None, sep='\t')

        ats_end = edited_fst.iloc[terminal_start : terminal_end]

        ats_end[0,2] = ''
        ats_end[0,3] = ''
        ats_end[0,4] = ''
        
        ats_end.to_csv(output_path, mode='a',index=False, header=None, sep='\t')

    # catch any remaining arcs
    ats_section = edited_fst[terminal_end:edited_fst.shape[0]]
    for j in range(3):
        ats_section[[j]] = ats_section[[j]].astype('int')
    
    ats_section.to_csv(output_path, mode='a',index=False, header=None, sep='\t')

    

In [100]:
def reconcile_symbols(fit_model, path_to_chi_phones_sym):
    '''generate a transducer and symbol set in the same symbol set which includes all inputs and outputs'''
    ints = [int(x) for x in np.unique(fit_model[[2]]) if not np.isnan(x)]        
    input_symbol_table = pd.DataFrame({'symbol':[chr(x) for x in ints], 'int':ints})
    input_symbol_table.at[input_symbol_table.int ==0, 'symbol'] = '<epsilon>'
    #input_symbol_table.to_csv('test_input_phones.sym', sep='\t', header=None, index=False)
    input_cypher = dict(zip(input_symbol_table.int, input_symbol_table.symbol))
    
    
    output_symbol_table = pd.read_csv(path_to_chi_phones_sym, sep='\t', header=None)
    output_symbol_table.columns = ['symbol','int']
    output_cypher = dict(zip(output_symbol_table.int, output_symbol_table.symbol))
    output_cypher
    
    symbols_not_in_output = set(input_symbol_table.symbol).difference(set(output_symbol_table.symbol))
    
    superset_cypher = copy.copy(output_cypher)
    i = len(output_cypher.keys())
    for symbol in symbols_not_in_output:
        superset_cypher[i] = symbol
        i += 1    
    reverse_superset_cypher = dict(zip(superset_cypher.values(),superset_cypher.keys()))

    fit_model_superset = copy.copy(fit_model)

    # recode the input symbols
    fit_model_superset[[2]] = [reverse_superset_cypher[input_cypher[int(x)]] if not np.isnan(x)
            else '' for x in fit_model[[2]].values[:,0]]

    # recode the output symbols
    fit_model_superset[[3]] = [reverse_superset_cypher[output_cypher[int(x)]] if not np.isnan(x)
            else '' for x in fit_model[[3]].values[:,0]]
    
    fit_model_labeled = copy.copy(fit_model)

    write_out_edited_fst(fit_model_superset, 'fst/chi_edited_fst.csv')

    superset_chi = pd.DataFrame({'sym': reverse_superset_cypher.keys(),
        'utf8':reverse_superset_cypher.values()})
    superset_chi.to_csv('fst/superset_chi.sym', header = None, index=False, sep='\t')
    return(fit_model_superset, superset_chi)

def normalize_log_probs(vec):
    vec = vec.values.flatten()
    ps = np.exp(-1 * vec)
    total = np.sum(ps)
    return(ps / total)

def normalize_partition(x): 
    '''for a given selection of FST arcs, for example all where input is a particular symbol, normalize the log probs'''
    df = x[1]
    df[[4]] = -1 * np.log(normalize_log_probs(df[[4]]))
    return(df)

def split(a, n):
    '''split a list into n approximately equal length sublists, appropriate for parallelization'''
    k, m = divmod(len(a), n)
    return (a[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n))

In [129]:
def get_wfst_distance_matrix(all_tokens_phono, prior_data, initial_vocab,  cmu_2syl_inchildes, 
    path_to_baum_welch_transducer, path_to_chi_phones_sym, num_cores=24):    
    '''
    Get wfst distance matrix for matrix-based computation of the posterior

    all_tokens_phono: corpus in tokenized from, with phonological transcriptions
    prior_data: priors of the form output by `compare_successes_failures_*`
    initial_vocab: word types corresponding to the softmask mask
    cmu_2syl_inchildes: cmu pronunctiations, must have 'word' and 'ipa_short' columns 
    path_to_baum_welch_transducer: path to the OpenFST transducer yielded by the BaumWelch package
    '''
    
    bert_token_ids = prior_data['scores']['bert_token_id']
    ipa = pd.DataFrame({'bert_token_id':bert_token_ids}).merge(all_tokens_phono[['bert_token_id',
        'actual_phonology_no_dia']])

    iv = pd.DataFrame({'word':initial_vocab})
    iv = iv.merge(cmu_2syl_inchildes, how='left')
    
    
    # [X] Load the transducer, create a covering symbol set, and change the transducer to the data symbol set
    fit_model = pd.read_csv(path_to_baum_welch_transducer, sep='\t', header=None)
    
    fit_model_superset, superset_chi = reconcile_symbols(fit_model, path_to_chi_phones_sym)
    superset_chi_sym = pywrapfst.SymbolTable.read_text('fst/superset_chi.sym')

    # [X] Change from a joint model to a conditional model.
    # as of 11/10/21, only works for the unigram case
    grouped = list(fit_model_superset.iloc[0:fit_model_superset.shape[0] - 1].groupby(2))
    conditioned = pd.concat([normalize_partition(x) for x in grouped ])
    tail = fit_model_superset.tail(1)
    tail[[1]] = -1 * np.log(1)
    conditioned = pd.concat([conditioned, tail])
    write_out_edited_fst(conditioned, 'fst/chi_conditioned_fst.csv')
    os.system('fstcompile --arc_type=standard fst/chi_conditioned_fst.csv fst/chi_conditioned.fst')    
    transducer = pywrapfst.Fst.read("fst/chi_conditioned.fst")
            
    #[X] translate all words in the vocab into FSAs (w_fsas)and compose with the n-gram transducer
    
    w_fsas = {}
    ws = []
    for w in iv.to_dict('records'):    
        w_fsa = string_to_fsa(w['ipa_short'], superset_chi_sym)    
        w_in = pywrapfst.compose(w_fsa.arcsort(sort_type="ilabel"), transducer.arcsort(sort_type="ilabel"))
        w_fsas[w['ipa_short']] = w_in.arcsort(sort_type="ilabel")
        ws.append(w['ipa_short'])

    #[ ]  check if there are repeats in d_fsas -- the pairwise computation is expensive
    
    #[X] translate all observed words (data) into FSAs (d_fsas)
    d_fsas = [string_to_fsa(d, superset_chi_sym).arcsort(sort_type="olabel") for d in ipa.actual_phonology_no_dia]
    
    
    import pdb
    pdb.set_trace()
    
    data_test = string_to_fsa('hævəŋ', superset_chi_sym)
    get_likelihood_for_fsas_over_paths(data_test, w_fsas, 'nɑəts', num_paths=10, return_type = "probability")
    
    # make the splits on the dfsas
    serial_inputs = [(x, w_fsas, ws) for x in d_fsas[0:1000]]
    d_fsa_inputs = split(serial_inputs, num_cores)    
    distances = Parallel(n_jobs=num_cores)(delayed(compute_all_likelihoods_for_w_over_paths_one)(d_fsa_input) for d_fsa_input in d_fsa_inputs)
    #distances = [vectorized_compute_all_likelihoods_for_w_over_paths(d_fsa, w_fsas, ws) for d_fsa in d_fsas]
    
    # yield the matrix of distances
    
    # make sure that the ordering of the results is not permuted 
    
    return(np.vstack(distances))    

In [121]:
%pdb

Automatic pdb calling has been turned OFF


In [None]:
wfst_dists = get_wfst_distance_matrix(all_tokens_phono, priors_for_age_interval, initial_vocab,  cmu_in_initial_vocab,
    'fst/chi-1.txt', 'fst/chi_phones.sym')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/

> [0;32m<ipython-input-129-35fd31293ea2>[0m(57)[0;36mget_wfst_distance_matrix[0;34m()[0m
[0;32m     55 [0;31m    [0mpdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     56 [0;31m[0;34m[0m[0m
[0m[0;32m---> 57 [0;31m    [0mdata_test[0m [0;34m=[0m [0mstring_to_fsa[0m[0;34m([0m[0;34m'hævəŋ'[0m[0;34m,[0m [0msuperset_chi_sym[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     58 [0;31m    [0mget_likelihood_for_fsas_over_paths[0m[0;34m([0m[0mdata_test[0m[0;34m,[0m [0mw_fsas[0m[0;34m,[0m [0;34m'nɑəts'[0m[0;34m,[0m [0mnum_paths[0m[0;34m=[0m[0;36m10[0m[0;34m,[0m [0mreturn_type[0m [0;34m=[0m [0;34m"probability"[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     59 [0;31m[0;34m[0m[0m
[0m
ipdb> data_test = string_to_fsa('zibwə', superset_chi_sym)
ipdb> get_likelihood_for_fsas_over_paths(data_test, w_fsas, 'pɑz', num_paths=10, return_type = "probability")
1.0260383567671263e-

In [128]:
wfst_dists


array([[1.52810475e-05, 1.07906226e-02, 3.85958957e-04, ...,
        3.55403403e-07, 7.24668517e-03, 5.87386341e-08],
       [1.54677852e-05, 1.29174011e-07, 1.94688503e-06, ...,
        5.18680997e-05, 2.73407839e-06, 6.45766567e-06],
       [4.01254220e-05, 1.01620441e-05, 1.66337201e-04, ...,
        9.13624215e-07, 8.65060033e-06, 2.61470627e-06],
       ...,
       [5.46743849e-05, 7.70809733e-05, 7.70398642e-01, ...,
        8.90087130e-07, 7.22927414e-06, 4.32638519e-05],
       [5.82977784e-04, 1.27004903e-05, 9.61850316e-04, ...,
        2.62190922e-06, 1.10474269e-05, 2.42942632e-05],
       [2.26606102e-04, 4.01385454e-05, 1.40714475e-04, ...,
        1.42982341e-06, 5.33623727e-05, 3.82358853e-06]])