In [14]:
import os
import sys
import rpy2.robjects.lib.ggplot2 as ggplot2
import pandas as pd
import numpy as np
%load_ext rpy2.ipython

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


In [2]:
sys.path.append('../../')
from src.utils import configuration
config = configuration.Config()

In [3]:
# read in the phono from the pickle so that we have the associated CMU pronunciations
all_tokens_phono = pd.read_pickle(os.path.join(config.prov_csv_dir, 'pvd_utt_glosses_phono_cleaned_inflated_to_next_notebook.pkl'))

In [4]:
excludes = ['', '*','(.)','(..)', '(...)','(....)','(.....)']
phono = all_tokens_phono.loc[~all_tokens_phono.actual_phonology.isin(excludes) & 
                             ~all_tokens_phono.model_phonology.isin(excludes)]
phono.shape

(222881, 32)

In [5]:
phono.iloc[0]

token                             mommy
utterance_id                   16759315
gloss                             Mommy
transcript_id                     42204
utterance_order                       6
target_child_name                  Alex
speaker_code                        CHI
type                        declarative
punct                                 .
speaker_code_simple               [CHI]
gloss_with_punct           [CHI] mommy.
token_id                          20565
seq_utt_id                            3
actual_phonology                    ɑmɪ
model_phonology                   mɑmiː
target_child_age                    514
bert_token_id                        23
model_phonology_clean              mɑmi
actual_phonology_clean              ɑmə
model_phonology_no_dia             mɑmi
actual_phonology_no_dia             ɑmə
cv_raw_actual                       vcv
cv_collapsed_actual                 vcv
num_vowels_actual                     2
cv_raw_model                       cvcv


In [23]:
phono.loc[phono.gloss == 'watch'][['model_phonology_no_dia', 'actual_phonology_no_dia']]

Unnamed: 0,model_phonology_no_dia,actual_phonology_no_dia
152856,wɑʧ,wɑtʃ
177031,wɑʧ,wɑth
214016,wɑʧ,wətʃ
240496,wɑʧ,wɑtʃ
249089,wɑʧ,wɑtʃ
272053,wɑʧ,wɑtʃ
278047,wɑʧ,wɑtʃ
285615,wɑʧ,wɑtʃ
294085,wɑʧ,wɑtʃ
299420,wɑʧ,wɑtʃ


In [6]:
cmu = pd.read_pickle(config.cmu_path)
cmu.head(5)

Unnamed: 0,index,word,pronunciation,phones,ipa,structure,num_vowels,ipa_short
70,71,a,AH0,[AH],[ə],[v],1,ə
71,72,a,EY1,[EY],[eɪ],[v],1,eə
77,78,aa,EY2 EY1,"[EY, EY]","[eɪ, eɪ]","[v, v]",2,eəeə
92,93,aaron,EH1 R AH0 N,"[EH, R, AH, N]","[ɛ, ɹ, ə, n]","[v, c, v, c]",2,ɛɹən
102,103,ab,AE1 B,"[AE, B]","[æ, b]","[v, c]",1,æb


In [15]:

phono['model_ipa'] = phono.model_phonology_no_dia
phono['actual_ipa'] = [''.join(list(x)) for x in phono.actual_phonology_no_dia]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  phono['model_ipa'] = phono.model_phonology_no_dia
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  phono['actual_ipa'] = [''.join(list(x)) for x in phono.actual_phonology_no_dia]


In [16]:
'a' in set(list(''.join(phono['actual_ipa'])))

True

In [17]:
cmu_map = dict(zip(cmu.word, cmu.ipa_short))
def cmu_mapper(x, cmu_map):
    token = x[0]
    model = x[1]
    if x[0] in cmu_map:
        return(cmu_map[x[0]])
    else:
        return(x[1])


phono['cmu'] = [cmu_mapper(x, cmu_map) for x in zip(phono.token, phono.model_ipa)]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  phono['cmu'] = [cmu_mapper(x, cmu_map) for x in zip(phono.token, phono.model_ipa)]


In [18]:
phono[['cmu','actual_ipa']]

Unnamed: 0,cmu,actual_ipa
23,mɑmi,ɑmə
125,wi,wi
151,wi,wi
189,wu,u
685,əɹni,ɛ
...,...,...
2991828,noʊbɑdi,nobɑɾi
2991829,heəts,heəs
2991834,oʊ,o
2991835,hwaə,waə


In [25]:
# cmu inputs need to be changed for the digraphs
phono['cmu'] = [x.replace('ʧ','tʃ').replace('ʤ','dʒ') for x in phono['cmu']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  phono['cmu'] = [x.replace('ʧ','tʃ').replace('ʤ','dʒ') for x in phono['cmu']]


In [26]:
num_dev = 20000
num_train = 180000
num_test = 20000

for output_dir in ('dev','train','test'):
    dir_to_make = os.path.join(config.project_root, 'output/fst', output_dir)
    if not os.path.exists(dir_to_make):
        os.makedirs(dir_to_make)

phono[['cmu','actual_ipa']].iloc[0:num_dev].to_csv(os.path.join(
    config.project_root, 'output/fst/dev/chi_dev.tsv'),
    sep='\t', header=False, index=False)
phono[['cmu','actual_ipa']].iloc[num_dev:num_dev+num_train].to_csv(os.path.join(
    config.project_root, 'output/fst/train/chi_train.tsv'),
    sep='\t', header=False, index=False)
phono[['cmu','actual_ipa']].iloc[num_dev+num_train:num_dev+num_train+num_test].to_csv(os.path.join(
    config.project_root,'output/fst/test/chi_test.tsv'),
    sep='\t', header=False, index=False)

In [27]:
phono[['model_ipa_split']] = [' '.join(list(x)) for x in phono['cmu']]
phono[['model_ipa_split']].iloc[num_dev:num_dev+num_train].to_csv(os.path.join(config.project_root,
    'output/fst/train/chi_inputs.tsv'),
    sep='\t', header=False, index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  phono[['model_ipa_split']] = [' '.join(list(x)) for x in phono['cmu']]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)
