In [1]:
# Pairwise Levenshtein Distances for the Providence corpus / Likelihood

In [2]:
%load_ext rpy2.ipython
import rpy2.robjects.lib.ggplot2 as ggplot2
import childespy
import numpy as np
import os
import imp
import pandas as pd
import Levenshtein
import itertools

In [3]:
# Look up the database id of the Providence corpus.
# db_version is pinned to the same release as the token query further down, so the
# corpus id and the tokens are guaranteed to come from the same database version
# even after the "current" default version moves on.
pvd_idx = childespy.get_sql_query('select * from corpus where name = "Providence"',
    db_version = "2020.1").iloc[0]['id']

R[write to console]: Using current database version: '2020.1'.



In [5]:
# Toggle between querying the database (True) and reading the cached CSV (False).
regenerate = True

if regenerate:
    # Fetch the selected token columns for speaker_code "CHI" in the corpus whose id
    # is pvd_idx, then cache the result so later runs can skip the query.
    pvd_chi_tokens = childespy.get_sql_query('select gloss, target_child_name, target_child_age, \
    speaker_code, actual_phonology, model_phonology, transcript_id, utterance_id, \
    token_order from token where speaker_code = "CHI" and corpus_id = '+str(pvd_idx),
        db_version = "2020.1")
    pvd_chi_tokens.to_csv('csv/pvd_tokens.csv', index=False)
else: 
    # Reuse the cached copy written by an earlier run with regenerate = True.
    pvd_chi_tokens = pd.read_csv('csv/pvd_tokens.csv')

R[write to console]: Using supported database version: '2020.1'.



In [16]:
# Drop every token whose model or actual phonology is one of the values listed
# in `excludes`.
excludes = ['*','(.)','(..)', '(...)','(....)','(.....)']
model_excluded = pvd_chi_tokens.model_phonology.isin(excludes)
actual_excluded = pvd_chi_tokens.actual_phonology.isin(excludes)
# Keep a row only when neither transcription is excluded.
pvd_chi_tokens = pvd_chi_tokens.loc[~model_excluded & ~actual_excluded]

In [17]:
# Peek at the first remaining token to see which fields are available.
pvd_chi_tokens.iloc[0]

gloss                   Mommy
target_child_name        Alex
target_child_age        514.0
speaker_code              CHI
actual_phonology          ɑmɪ
model_phonology         mɑmiː
transcript_id           42204
utterance_id         16759315
token_order                 1
Name: 1, dtype: object

In [151]:
# Get the most common model transcription for each (lower-cased) gloss.
# .assign builds a new frame rather than writing into the filtered slice produced
# earlier, which avoids pandas' SettingWithCopyWarning.
pvd_chi_tokens = pvd_chi_tokens.assign(gloss=pvd_chi_tokens.gloss.str.lower())
# value_counts() is sorted by descending count, so its first index label is the most
# frequent transcription. Reading it from the index (instead of from a column named
# 'index' after reset_index()) works on pandas 2.x as well, where that column is
# named after the series rather than 'index'.
ipa_for_glosses = (
    pvd_chi_tokens.groupby(['gloss']).model_phonology
    .agg(lambda x: x.value_counts().index[0])
    .reset_index()
)

In [152]:
# Check for missing values: print the shape before dropping rows that contain NaN,
# then display the shape afterwards for comparison.
print(ipa_for_glosses.shape)
test = ipa_for_glosses.dropna()
test.shape

(7929, 2)


(7929, 2)

In [153]:
# Spot-check the selected transcription for a handful of words.
# test_words is reused further down to inspect the CMU entries for the same words.
test_words = ['dog','cat','box','fishes', 'mommy', 'potato','a']
ipa_for_glosses.loc[ipa_for_glosses.gloss.isin(test_words)]

Unnamed: 0,gloss,model_phonology
4,a,ə
768,box,bɑks
1072,cat,kæt
1892,dog,dɑɡ
2420,fishes,fɪʃəz
4340,mommy,mɑmiː
5271,potato,pəteɪtoʊ


In [223]:
# Drop glosses whose selected transcription is the empty string. The explicit .copy()
# makes the filtered frame independent of the frame it was taken from, so the column
# assignment below is an unambiguous write (no SettingWithCopyWarning).
ipa_for_glosses = ipa_for_glosses.loc[ipa_for_glosses.model_phonology != ""].copy()
# Normalize the transcriptions: remove the length mark (ː) and collapse
# ʌ -> ə, ɪ -> ə and ɔ -> ɑ.
ipa_for_glosses['model_phonology'] = [x.replace("ː","").replace('ʌ','ə')
.replace('ɪ','ə').replace('ɔ','ɑ') for x in ipa_for_glosses['model_phonology']]
ipa_for_glosses.shape


(6753, 2)

In [224]:
# Show the first three rows of the gloss -> transcription table.
ipa_for_glosses.iloc[0:3]

Unnamed: 0,gloss,model_phonology
4,a,ə
14,abe,eəb
15,abide,əbaəd


In [225]:
# Using the model phonology alone is a problem: many words won't be attested. Need
# to compare these against the CMU dictionary and extend them with it.

In [226]:
# Load the CMU pronunciation table and derive a digit-free phone list per entry.
# NOTE(review): absolute machine-specific path — consider a configurable data directory.
from string import digits

cmu = pd.read_csv('/shared_hd0/corpora/CMU_pronunciation/cmu_dict_df.csv')
cmu.columns = ['index','word','pronunciation']

# Translation table that deletes every digit character from a phone symbol.
remove_digits = str.maketrans('', '', digits)
cmu['phones'] = [
    [phone.translate(remove_digits) for phone in entry.split(' ')]
    for entry in cmu['pronunciation']
]

# str() coerces any non-string entry to text before lower-casing.
# NOTE(review): if read_csv parsed some words as NaN, they all become 'nan' here — confirm.
cmu.word = [str(entry).lower() for entry in cmu.word]

In [227]:
# def flatten(list_of_lists):
#     return([item for subl in list_of_lists for item in subl])

# phone_inventory = pd.DataFrame({'arpa':np.unique(flatten(cmu.phones))})
# phone_inventory.to_csv('phon/phon_map.csv', index=False)

In [260]:
# Read the phone table and build two lookups keyed by the `arpa` column:
# one to the `ipa` column and one to the `c_or_v` column.
phone_map_df = pd.read_csv('phon/phon_map_populated.csv')
phone_map = {arpa: ipa for arpa, ipa in zip(phone_map_df.arpa, phone_map_df.ipa)}
cv_map = {arpa: cv for arpa, cv in zip(phone_map_df.arpa, phone_map_df.c_or_v)}

# Translate each entry's phone list through both lookups.
cmu['ipa'] = [[phone_map[phone] for phone in phones] for phones in cmu.phones]
cmu['structure'] = [[cv_map[phone] for phone in phones] for phones in cmu.phones]

# Count the 'v' entries in each structure list.
cmu['num_vowels'] = [np.sum(np.array(cv_seq) == 'v') for cv_seq in cmu['structure']]

# Concatenate the IPA symbols into one string, then apply the replacements
# ɝ -> əɹ, ɪ -> ə and ɔ -> ɑ, in that order.
cmu['ipa_short'] = [
    ''.join(symbols).replace('ɝ', 'əɹ').replace('ɪ','ə').replace('ɔ','ɑ')
    for symbols in cmu['ipa']
]




In [261]:
# Spot-check the CMU-derived columns for the words in test_words.
cmu.loc[cmu.word.isin(test_words)]

Unnamed: 0,index,word,pronunciation,phones,ipa,ipa_short,structure,num_vowels
70,71,a,AH0,[AH],[ə],ə,[v],1
14024,14025,box,B AA1 K S,"[B, AA, K, S]","[b, ɑ, k, s]",bɑks,"[c, v, c, c]",1
19019,19020,cat,K AE1 T,"[K, AE, T]","[k, æ, t]",kæt,"[c, v, c]",1
33320,33321,dog,D AO1 G,"[D, AO, G]","[d, ɔ, ɡ]",dɑɡ,"[c, v, c]",1
42452,42453,fishes,F IH1 SH AH0 Z,"[F, IH, SH, AH, Z]","[f, ɪ, ʃ, ə, z]",fəʃəz,"[c, v, c, v, c]",2
79787,79788,mommy,M AA1 M IY0,"[M, AA, M, IY]","[m, ɑ, m, i]",mɑmi,"[c, v, c, v]",2
93614,93615,potato,P AH0 T EY1 T OW2,"[P, AH, T, EY, T, OW]","[p, ə, t, eɪ, t, oʊ]",pəteətoʊ,"[c, v, c, v, c, v]",3


In [230]:
# Share of gloss/transcription pairs that have an exact (word, ipa_short) match in
# the adjusted CMU table.
found_words = pd.merge(
    ipa_for_glosses,
    cmu,
    left_on=['gloss','model_phonology'],
    right_on=['word','ipa_short'],
)
len(found_words) / len(ipa_for_glosses) # 83% found after remapping

0.8307418924922256

In [234]:
# Glosses without an exact match: join them to CMU on the word alone so the two
# transcriptions can be compared side by side.
unmatched = ipa_for_glosses.loc[~ipa_for_glosses.gloss.isin(found_words.gloss)]
missing = pd.merge(unmatched, cmu, left_on=['gloss'], right_on=['word'])
missing_short = missing[['gloss', 'model_phonology','ipa_short']].rename(
    columns={'model_phonology': 'pvd_ipa', 'ipa_short': 'cmu_ipa'}
)
# Unseeded random sample of ten rows for inspection.
missing_short.sample(10)

Unnamed: 0,gloss,pvd_ipa,cmu_ipa
133,swallowing,swɑloʊwəŋ,swɑloʊəŋ
130,sprinklers,spɹəŋkələɹz,spɹəŋkləɹz
144,unusual,ənjuʒəwwəl,ənjuʒuəl
66,hola,olɑ,hoʊlə
30,cuckoo,kuku,kəku
20,beware,bəwwɛɹ,bəwɛɹ
91,mastodon,mæsdədɑn,mæstədɑn
95,might've,maətv,maətəv
54,genuine,ʤɛnjəwwən,ʤɛnjuən
14,baa,bæ,bieəeə


In [None]:
# Character differences / shorthand
[X] cmu: ɝ -> əɹ
[X] cmu:  tʃ -> ʧ
[X] cmu  dʒ -> ʤ      
# Collapses
[X] cmu: ɪ -> ə (schwa)
[X] ɑ in model phonology vs ɔ in cmu: convert ɔ to ɑ in both
[X] model phonology: ʌ ->  ə

In [60]:
# this doesn't produce diphthongs; vowels are weird
# cmu = pd.read_csv('/shared_hd0/corpora/CMU_pronunciation/CMU.in.IPA.txt', 
#                   encoding='utf-8',header = None, sep = ",[ \t]*", na_filter=False)
# cmu.columns = ['word','ipa']
# cmu['ipa'] = [x.replace("ˌ","").replace("ˈ","") if x is not None else None for x in cmu.ipa ]
# cmu.loc[cmu.word.isin(['dog','cat','box','fishes', 'mommy', 'potato'])]

  


In [235]:
# Load the word list stored in WSWG_50percentproducing.csv.
# NOTE(review): absolute, user-specific path — consider a configurable data directory.
wswg  = pd.read_csv('/home/stephan/notebooks/ws_analysis/data/raw_data/WSWG_50percentproducing.csv')

In [272]:
# Reduce each entry to its first form: the text before the first space, then the
# text before the first '/'.
wswg['word'] = [entry.split(' ')[0].split('/')[0] for entry in wswg.word]
' '.join(wswg['word'])  # not the last expression in the cell, so the value is discarded
# Merge on the columns the two frames have in common.
wswg_cmu = wswg.merge(cmu)

# Each ratio is (merged rows passing the filter) / (rows in wswg).
n_wswg = wswg.shape[0]
coverage_views = [
    ('Total coverage', wswg_cmu),
    ('Coverage for 2 syllable words', wswg_cmu.loc[wswg_cmu.num_vowels <= 2]),
    ('Coverage for 1 syllable words', wswg_cmu.loc[wswg_cmu.num_vowels <= 1]),
]
for label, subset in coverage_views:
    print(label)
    print(subset.shape[0] / n_wswg)

Total coverage
0.9827586206896551
Coverage for 2 syllable words
0.9341692789968652
Coverage for 1 syllable words
0.622257053291536


In [267]:
# Frequency of each ipa_short string among the merged rows.
# NOTE(review): the saved output reports 109530 distinct values, which looks like it
# was produced from a much larger frame than wswg_cmu — re-run on a fresh kernel to confirm.
wswg_cmu.ipa_short.value_counts()

lɑɹi          17
hɑk           13
kɑɹ           13
kɛɹi          13
bɑɹ           12
              ..
lændvjuaət     1
ʒaʊpəŋz        1
ɡɹəmbəl        1
pəɡmɛnt        1
neəvə          1
Name: ipa_short, Length: 109530, dtype: int64

In [249]:
# NOTE(review): cmu_words is not defined in any cell visible above this one, so this
# cell would raise NameError on a fresh kernel unless it is defined elsewhere — confirm.
len(cmu_words) # 133,852 in the saved run

133852

In [295]:
# Restrict the CMU table to words listed in data/vocab.csv with a count above 3
# that also have at most two vowels.
childes_counts = pd.read_csv('data/vocab.csv')
childes_counts.columns  # not the last expression in the cell, so nothing is displayed
frequent_words = childes_counts.loc[childes_counts['count'] > 3].word
is_frequent = cmu.word.isin(frequent_words)
is_short = cmu['num_vowels'] <= 2
cmu_in_childes = cmu.loc[is_frequent & is_short]

In [297]:
# Rows and columns remaining after the count and vowel-number filters.
cmu_in_childes.shape

(12557, 8)

In [298]:
# Eyeball the first 1000 retained words as one space-separated string.
' '.join(cmu_in_childes.word[0:1000])

"a aa aah aardvark aardvarks aaron aaron's ab abba abbey abbie abby abe able aboard about above abra abroad absent absorb abstract absurd abu abuse ac accent accept access account accused ace aces ache aches aching achoo acid ack acorn acorns acre acres across act acted acting action actions active actor actress acts ad ada adah adam adam's adams adar aday add added adder addict addie adding address addressed adds ade adele adele's adjust admire admit ado adopt adore adores adrift ads adult adults advance advanced advent advice ady ae aero afar affect affects afford afloat afraid afro aft after ag aga again against age aged agent ages aggie aggy agnes ago agra agree agreed agrees agua ah ahah ahead ahh ahn ahold ahoy ai aid aidan aiden aids aiken aim aimed aimee aiming ain't air air's airborne aircraft airing airline airmail airplane airplane's airplanes airport airports airship ais aisha aisle aisles aislinn ajax aka akbar al al's ala alan alan's alarm alarm's alarms albert album albu

In [299]:
# Inspect the first retained entry to check the columns of the filtered table.
cmu_in_childes.iloc[0]

index              71
word                a
pronunciation     AH0
phones           [AH]
ipa               [ə]
ipa_short           ə
structure         [v]
num_vowels          1
Name: 70, dtype: object

In [311]:
# Look up a single word ('goat') in the filtered table.
cmu_in_childes.loc[cmu_in_childes.word == 'goat']

Unnamed: 0,index,word,pronunciation,phones,ipa,ipa_short,structure,num_vowels
48256,48257,goat,G OW1 T,"[G, OW, T]","[ɡ, oʊ, t]",ɡoʊt,"[c, v, c]",1


In [310]:
# Save the filtered table without the DataFrame index. The list-valued columns
# (phones, ipa, structure) are written as their string representation.
cmu_in_childes.to_csv('phon/cmu_in_childes.csv',index=False)

In [308]:
def get_levidsts(target, cmu):
    """Rank the words of a pronunciation table by Levenshtein distance to `target`.

    Parameters
    ----------
    target : str
        String to compare against every pronunciation in the table.
    cmu : pandas.DataFrame
        Table with an `ipa_short` column (the strings compared with `target`)
        and a `word` column (reported as the candidate).

    Returns
    -------
    pandas.DataFrame
        Columns `levdist` and `candidate`, sorted by ascending `levdist` and
        carrying the index of `cmu`.
    """
    # Use the table that was passed in. The previous version ignored this argument
    # and always read the global `cmu_in_childes`.
    distances = [Levenshtein.distance(target, x) for x in cmu.ipa_short]
    dist_df = pd.DataFrame({'levdist': distances, 'candidate': cmu.word})
    return dist_df.sort_values(by=['levdist'])

# Pass the filtered table explicitly so the candidates are the same ones the
# function used before it honoured its argument.
get_levidsts('woʊt', cmu_in_childes)

Unnamed: 0,levdist,candidate
12650,1,boat
85141,1,oat
131327,1,wo
22381,1,coat
131340,1,woe
...,...,...
47706,10,girlfriends
14887,10,bridesmaid
113531,10,sprinklers
121981,10,transcribe
