## Prepares Providence data

Evaluation set for all and age splits.

Both the finetuning and evaluation set (split) for child finetuning.

In [99]:
# 7/22/21: https://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython

%load_ext autoreload
%autoreload 2

# end cite

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [7]:
%load_ext rpy2.ipython
import rpy2.robjects.lib.ggplot2 as ggplot2
import childespy
import numpy as np
import os
import imp
import pandas as pd
import transformers
import torch
import re
import unicodedata
import scipy.stats
import copy
from string import punctuation

import config
np.random.seed(config.SEED)

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


In [8]:
from os.path import join, exists

In [100]:
from utils import split_gen, data_cleaning, load_splits, load_models, data_cleaning, transformers_bert_completions
from utils_child import child_split_gen

### Prepare and clean the source of individual utterance samples

In [4]:
# Communicative success: how many no-xxx, no-yyy child  utterances are in Providence? 
# Communicative failures: how many one-yyy, no-xxx child utterances are in Providence?
# Subset to instances that are monosyllabic later

In [11]:
pvd_idx = childespy.get_sql_query('select * from corpus where name = "Providence"').iloc[0]['id']

phono_glosses = childespy.get_sql_query('select gloss, target_child_name, target_child_age, \
    speaker_code, actual_phonology, model_phonology, transcript_id, utterance_id, \
    token_order, corpus_name, collection_name, language from token where \
    actual_phonology != "" and model_phonology != "" and collection_name = "Eng-NA" \
    and corpus_id = '+str(pvd_idx) ,
        db_version = "2020.1")

R[write to console]: Using current database version: '2020.1'.

R[write to console]: Using supported database version: '2020.1'.



In [12]:
if config.verbose: 
    print(phono_glosses.corpus_name.value_counts())
    print(phono_glosses.loc[phono_glosses.gloss == 'xxx'].actual_phonology.value_counts())
    print(phono_glosses.loc[phono_glosses.gloss == 'yyy'].actual_phonology.value_counts())

Providence    396621
Name: corpus_name, dtype: int64
*           26736
ə              10
(.)             7
aɪ              4
pitched         2
            ...  
dulɪ            1
vɛnt            1
neɪniwɑə        1
mɪ              1
bi              1
Name: actual_phonology, Length: 76, dtype: int64
ɛ         3206
ʌ         2132
ɪ         1881
ə          512
o          507
          ... 
paʊs         1
pʰɑkə        1
fɔləbi       1
ɪɛː          1
aɪsɛ         1
Name: actual_phonology, Length: 30293, dtype: int64


In [13]:
chi_phono = phono_glosses.loc[(phono_glosses.speaker_code == 'CHI') & 
    (phono_glosses.target_child_age < (365*5))]

def count_transmission_errors(utt_vector, error_codes):
    return(np.sum([x in error_codes for x in  utt_vector]))

In [14]:
xxxs_per_utt = chi_phono.groupby('utterance_id').gloss.agg(
    lambda x: count_transmission_errors(x, ['xxx'])).reset_index()
xxxs_per_utt.columns = ['utterance_id', 'num_xxx']
yyys_per_utt = chi_phono.groupby('utterance_id').gloss.agg(
    lambda x: count_transmission_errors(x, ['yyy'])).reset_index()
yyys_per_utt.columns = ['utterance_id', 'num_yyy']
failures_per_utt = xxxs_per_utt.merge(yyys_per_utt)

raw_yyy_utts = failures_per_utt.loc[(failures_per_utt.num_xxx == 0) &  (failures_per_utt.num_yyy == 1)]

if config.verbose: print(raw_yyy_utts.shape)

raw_success_utts = failures_per_utt.loc[(failures_per_utt.num_xxx == 0) &  
    (failures_per_utt.num_yyy == 0)]

if config.verbose: print(raw_success_utts.shape)
    

(31457, 3)
(83880, 3)


### Prepare and clean Providence data 

Corresponds to: 4 | Prep Utterances / Tokens for BERT,
    in the original notebook

In [15]:

# Get the index of the Providence corpus
pvd_idx = childespy.get_sql_query('select * from corpus where name = "Providence"').iloc[0]['id']

# Load utterances from the Providence corpus from childs-db

if config.regenerate:
    raw_utt_glosses = childespy.get_sql_query('select gloss, transcript_id, id, \
    utterance_order, speaker_code, target_child_name, target_child_age, type from utterance where corpus_id = '+str(pvd_idx) ,
        db_version = "2020.1")
    raw_utt_glosses.to_csv('csv/pvd_utt_glosses.csv', index=False)
else: 
    raw_utt_glosses = pd.read_csv('csv/pvd_utt_glosses.csv')

R[write to console]: Using current database version: '2020.1'.



In [16]:
for_chi_phono_utts = raw_utt_glosses.copy() # Avoid cleaning the glosses for the utt_glosses twice (see prep code for child splits)
utt_glosses = data_cleaning.clean_glosses(for_chi_phono_utts)

if config.verbose: utt_glosses[utt_glosses.id == 17280964]

declarative                   335678
question                       84707
imperative_emphatic            15954
trail off                      12351
self interruption               6658
interruption                    2928
self interruption question       825
trail off question               650
interruption question            304
quotation precedes                 3
question exclamation               2
broken for coding                  1
Name: type, dtype: int64
Cell 238 gloss                where do you want me to go
transcript_id                             42204
id                                     16759250
utterance_order                               1
speaker_code                                OPE
target_child_name                          Alex
target_child_age                          514.0
type                                   question
punct                                         ?
Name: 0, dtype: object


### Build the Vocabulary

In [17]:

cmu_2syl_inchildes = load_models.get_cmu_dict_info()

# tokenize with the most extensive tokenizer, which is the one used for model #2

initial_tokenizer = load_models.get_meylan_original_model(with_tags = True)['tokenizer']

initial_tokenizer.add_tokens(['yyy','xxx']) #must maintain xxx and yyy for alignment,
# otherwwise, BERT tokenizer will try to separate these into x #x and #x and y #y #y
inital_vocab_mask, initial_vocab = transformers_bert_completions.get_softmax_mask(initial_tokenizer,
    cmu_2syl_inchildes.word)

# confirm yyy treated as a separate character
assert initial_tokenizer.tokenize('this is a yyy.') == ['this', 'is', 'a', 'yyy', '.']

cmu_in_initial_vocab = cmu_2syl_inchildes.loc[cmu_2syl_inchildes.word.isin(initial_vocab)]

if config.verbose: print(cmu_in_initial_vocab.shape)
    

(7904, 8)


In [18]:

# build a dataframe of tokens 
# this is slow, because tokenization is slow
def inflate (row):
    tokens = initial_tokenizer.tokenize(row['gloss_with_punct'])
    return(pd.DataFrame({'token':tokens, 'id':row['id']}) )

if config.regenerate:
    all_tokens = pd.concat([inflate(x) for x in utt_glosses.to_dict('records')])
    all_tokens = all_tokens.merge(utt_glosses)
    all_tokens.to_csv('csv/pvd_utt_glosses_inflated.csv')

else:
    all_tokens = pd.read_csv('csv/pvd_utt_glosses_inflated.csv', na_filter=False)

if config.verbose: print(all_tokens.iloc[0:10])

# Assign a token_id (integer in the BERT vocabulary). 
# Because these are from the tokenized utterances, there is no correpsondence 
# with childes-db token ids
all_tokens['token_id'] = initial_tokenizer.convert_tokens_to_ids(all_tokens['token'])
# assigns utterances a 0-indexed index column
all_tokens['seq_utt_id'] = all_tokens['id'].astype('category').cat.codes


   Unnamed: 0  token        id                                         gloss  \
0           0  [cgv]  16759250                    where do you want me to go   
1           1  where  16759250                    where do you want me to go   
2           2     do  16759250                    where do you want me to go   
3           3    you  16759250                    where do you want me to go   
4           4   want  16759250                    where do you want me to go   
5           5     me  16759250                    where do you want me to go   
6           6     to  16759250                    where do you want me to go   
7           7     go  16759250                    where do you want me to go   
8           8      ?  16759250                    where do you want me to go   
9           9  [cgv]  16759261  anywhere you'll feel comfortable um anywhere   

   transcript_id  utterance_order speaker_code target_child_name  \
0          42204                1          OPE     

### Add back IPA, syllable structure, and child ages for child productions

In [19]:
# get the token-level data, esp phonology

if config.regenerate:

    # get token-level information for Providence
    pvd_chi_tokens = childespy.get_sql_query('select gloss, target_child_name, target_child_age, \
    speaker_code, actual_phonology, model_phonology, transcript_id, utterance_id, \
    token_order from token where speaker_code = "CHI" and corpus_id = '+str(pvd_idx),
        db_version = "2020.1")
    pvd_chi_tokens['gloss'] = [data_cleaning.fix_gloss(x) for x in pvd_chi_tokens.gloss]
    
    # prep the tokens generated from segmenting the utterances
    all_tokens_test = copy.deepcopy(all_tokens) 

    # initialize the fields that need to be populated
    all_tokens_test['actual_phonology'] = ''
    all_tokens_test['model_phonology'] = ''
    all_tokens_test['target_child_age'] = np.nan
    
    # get a set of unique utterances
    _, idx = np.unique(all_tokens_test.id, return_index=True)
    all_utt_indices = all_tokens_test.id[np.sort(idx)]
    
    # For fast retrieval of IPA, split pvd_chi_tokens into a dictionary
    pvd_chi_tokens_list = pvd_chi_tokens.groupby(['utterance_id'])
    pvd_chi_tokens_dict = dict(zip(
        [x[0] for x in pvd_chi_tokens_list], 
        [x[1] for x in pvd_chi_tokens_list], 
    ))
    
    # For fast retrival of BERT tokenization
    all_tokens_test_list = all_tokens_test.groupby(['id'])
    all_tokens_test_dict = dict(zip(
        [x[0] for x in all_tokens_test_list], 
        [x[1] for x in all_tokens_test_list], 
    ))
        
    # Augment the tokens from all_tokens with the IPA from pvd_chi_tokens 
    rvs = [] 
    utts_to_retrieve = raw_yyy_utts.utterance_id.to_list() + raw_success_utts.utterance_id.to_list()
    i=-1
    for utt_index in all_utt_indices: #utts_to_retrieve: #[16760331]:       
        i+=1
        if i % int(len(all_utt_indices) / 100) == 0:
            print(str(np.round((i / (len(all_utt_indices)) * 100),2))+'% complete...')    
            # should learn to use tqdm instead
        if utt_index in utts_to_retrieve:        
            utt_df = copy.deepcopy(all_tokens_test_dict[utt_index])
            utt_df['model_phonology'] = transfomers_bert_completions.augment_with_ipa(
              utt_df, pvd_chi_tokens_dict[utt_index],initial_tokenizer, 'model_phonology')
            utt_df['actual_phonology'] = transfomers_bert_completions.augment_with_ipa(
              utt_df, pvd_chi_tokens_dict[utt_index],initial_tokenizer, 'actual_phonology')
            utt_df['target_child_age'] = pvd_chi_tokens_dict[utt_index].iloc[0].target_child_age    
            rvs.append(utt_df)  
        else:
            rvs.append(all_tokens_test_dict[utt_index])  
            
    # get the resulting augmented forms back into a dataframe
    all_tokens_phono = pd.concat(rvs)
    
    # add a unique identifier to the BERT tokens
    all_tokens_phono['bert_token_id'] = range(all_tokens_phono.shape[0])
    
    #save the results
    all_tokens_phono.to_pickle('csv/pvd_utt_glosses_phono_inflated.pkl')
else:
    all_tokens_phono = pd.read_pickle('csv/pvd_utt_glosses_phono_inflated.pkl')

In [20]:
# Get the IPA map
phone_map_df = pd.read_csv('phon/phon_map_populated.csv')

In [21]:
if config.verbose:
    # Inspect the IPA
    print(all_tokens_phono.loc[all_tokens_phono.actual_phonology != ''][['token','actual_phonology','model_phonology']])
    print(phone_map_df.head())

          token actual_phonology model_phonology
42        mommy              ɑmɪ           mɑmiː
81          yyy                ʌ               *
170         wee              wiː             wiː
173         yyy               aʊ               *
201         wee              wiː             wiː
...         ...              ...             ...
3083588  nobody           nobɑɾi        noʊbɑdiː
3083589   hates             heɪs           heɪts
3083594      oh                o              oʊ
3083595     why              waɪ             waɪ
3083596    lick              liʔ             lɪk

[254517 rows x 3 columns]
  arpa ipa c_or_v
0   AA   ɑ      v
1   AE   æ      v
2   AH   ə      v
3   AO   ɔ      v
4   AW  aʊ      v


In [22]:
def phone_remap(x):
    return(x.replace("ː","").replace('ʌ','ə')
.replace('ɪ','ə').replace('ɔ','ɑ').replace('a','ɑ').replace('o','oʊ').replace('˞','').replace('ʰ',
    ''). replace('r','ɹ')).replace('\\^','').replace('\\ ̃','').replace(' ̩','').replace('^',''
).replace('ʙ','b').replace('(','').replace(')','').replace('.','').replace('ch','ʧ'
).replace('c','k').replace('g','ɡ').replace('y','j').replace('ʁ','ɹ')

def strip_accents(string, accents=('COMBINING ACUTE ACCENT', 
    'COMBINING GRAVE ACCENT', 'COMBINING TILDE', 'COMBINING VERTICAL LINE BELOW',
    'COMBINING SHORT STROKE OVERLAY')):
    accents = set(map(unicodedata.lookup, accents))
    chars = [c for c in unicodedata.normalize('NFD', string) if c not in accents]
    return unicodedata.normalize('NFC', ''.join(chars))

cv_map = dict(zip(phone_map_df['ipa'], phone_map_df['c_or_v']))
cv_map['o'] = 'v' 
cv_map['ɜ'] = 'v'
cv_map['e'] = 'v'
cv_map['ʔ'] = 'c'
cv_map['ɾ'] = 'c'
cv_map['ɲ'] = 'c'
cv_map['x'] = 'c'
cv_map['ɱ'] = 'c'
cv_map['ɣ'] = 'c'

def cv_mapper(x, cv_map):
    try:
        return(cv_map[x])
    except:
        raise ValueError(x)

if config.regenerate:    

    # Do the same excludes as were used to identify appropriate utterances
    excludes = ['*','(.)','(..)', '(...)','(....)','(.....)']
    all_tokens_phono.loc[all_tokens_phono.actual_phonology.isin(excludes),'actual_phonology'] =''
    all_tokens_phono.loc[all_tokens_phono.actual_phonology.str.contains('V'),'actual_phonology'] =''
    
    # remap phonology from narrow phonetic transcription to broad phonological transcription
    all_tokens_phono['model_phonology_clean'] = [phone_remap(x) for x in all_tokens_phono['model_phonology']]
    all_tokens_phono['actual_phonology_clean'] = [phone_remap(x) for x in all_tokens_phono['actual_phonology']]

    # remove any non-combining diacritical marks
    all_tokens_phono['model_phonology_no_dia'] = [strip_accents(x) for x in \
    all_tokens_phono['model_phonology_clean']]
    all_tokens_phono['actual_phonology_no_dia'] = [strip_accents(x) for x in \
    all_tokens_phono['actual_phonology_clean']]
    
    # Compute the number of non-contiguous vowels.
    # slightly different than the cmu vowel computation ---
    # because here we are computing it directly from IPA
    all_tokens_phono['cv_raw'] = [''.join([cv_mapper(x, cv_map) for x in list(y)]) if y != '' else '' for y in all_tokens_phono['actual_phonology_no_dia']]    
    all_tokens_phono['cv_collapsed']  = [re.sub(r'(.)\1+', r'\1', str(x)) if x != '' else '' for x in all_tokens_phono['cv_raw']]
    all_tokens_phono['num_vowels'] = [np.sum(np.array(list(x)) == 'v') if x !='' else np.nan for x in all_tokens_phono['cv_collapsed']]
    all_tokens_phono.to_pickle('csv/pvd_utt_glosses_phono_cleaned_inflated.pkl')
else:
    all_tokens_phono = pd.read_pickle('csv/pvd_utt_glosses_phono_cleaned_inflated.pkl')


In [23]:
all_tokens_phono[all_tokens_phono.id == 16759250][['gloss', 'seq_utt_id', 'token']]

Unnamed: 0,gloss,seq_utt_id,token
0,where do you want me to go,0,[cgv]
1,where do you want me to go,0,where
2,where do you want me to go,0,do
3,where do you want me to go,0,you
4,where do you want me to go,0,want
5,where do you want me to go,0,me
6,where do you want me to go,0,to
7,where do you want me to go,0,go
8,where do you want me to go,0,?


In [24]:
if config.verbose:
    # Why no actual phonology?
    print(all_tokens_phono.loc[all_tokens_phono.actual_phonology_no_dia != '']['actual_phonology_no_dia'])
    print(all_tokens_phono.shape)

42             ɑmə
81               ə
170             wi
173             ɑʊ
201             wi
            ...   
3083588    noʊbɑɾi
3083589       heəs
3083594         oʊ
3083595        wɑə
3083596        liʔ
Name: actual_phonology_no_dia, Length: 254440, dtype: object
(3083625, 24)


### Identify the tokens that can be evaluated 

In [25]:
successful_utt_ids = set(raw_success_utts['utterance_id']) 
initial_vocab_set = set(initial_vocab)
yyy_utt_ids = set(raw_yyy_utts['utterance_id'])
all_tokens_phono['in_vocab'] = all_tokens_phono['token'].isin(initial_vocab_set)
all_tokens_phono['success_token'] = [x in successful_utt_ids for x in 
    all_tokens_phono['id']]
all_tokens_phono['yyy_token'] = [x in yyy_utt_ids for x in 
    all_tokens_phono['id']]

In [26]:
if config.verbose:
    print(initial_vocab)
    print(all_tokens_phono.shape)

['a' 'b' 'c' ... 'hideout' 'pudding' 'stalks']
(3083625, 27)


### Identify the subset of success and failure utterances that have transcriptions

In [27]:
all_tokens_phono['partition'] = 'none'

In [28]:
success_tokens = all_tokens_phono.loc[(all_tokens_phono['success_token']) & 
    (all_tokens_phono['num_vowels'] <= 2) ]
all_tokens_phono.loc[(all_tokens_phono['success_token']) & 
    (all_tokens_phono['num_vowels'] <= 2), 'partition'] = 'success'     

if config.verbose:
    print(success_tokens.shape)
    print(all_tokens_phono.loc[(all_tokens_phono['success_token']) & 
    (all_tokens_phono['num_vowels'] <= 2)])

(188212, 28)
         token        id               gloss  transcript_id  utterance_order  \
42       mommy  16759315               Mommy          42204                6   
170        wee  16759467                 wee          42204               24   
201        wee  16759501                 wee          42204               28   
239        woo  16759549                 woo          42204               33   
743      ernie  16759752               Ernie          42204               58   
...        ...       ...                 ...            ...              ...   
3083575   help  17280891                help          42569              752   
3083589  hates  17280946  nobody hates Simba          42569              755   
3083594     oh  17280964   oh why lick hippo          42569              756   
3083595    why  17280964   oh why lick hippo          42569              756   
3083596   lick  17280964   oh why lick hippo          42569              756   

        speaker_code targe

In [29]:
yyy_tokens = all_tokens_phono.loc[(all_tokens_phono['yyy_token']) & 
(all_tokens_phono['token'] == 'yyy') & (all_tokens_phono.num_vowels <= 2) ]
all_tokens_phono.loc[(all_tokens_phono['yyy_token']) & 
(all_tokens_phono['token'] == 'yyy') & (all_tokens_phono.num_vowels <= 2),'partition'] = 'yyy'


In [30]:
if config.verbose:
    print(yyy_tokens.shape)
    print(all_tokens_phono.partition.value_counts())
    print(initial_tokenizer.unk_token_id)

(27693, 28)
none       2867720
success     188212
yyy          27693
Name: partition, dtype: int64
100


In [31]:
all_tokens_phono.loc[all_tokens_phono.token == 'xxx','token_id'] = initial_tokenizer.unk_token_id
all_tokens_phono.loc[all_tokens_phono.token == 'yyy','token_id'] = initial_tokenizer.unk_token_id

## Additional section from 6 | Prevalence of Successes and Failures Over Time

Need to augment successes/failures with information on age.

In [32]:
# get number of tokens per age
# Warnings were present in the original code

raw_success_utts['set'] = 'success'
raw_yyy_utts['set'] = 'failure'

utt_age = chi_phono.groupby('utterance_id').target_child_age.agg(np.unique).reset_index()

# Additional attributes needed for the text split.
utt_name = chi_phono.groupby('utterance_id').target_child_name.agg(np.unique).reset_index()
utt_transcript = chi_phono.groupby('utterance_id').transcript_id.agg(np.unique).reset_index()
# Manually asserted that speaker code is always CHI for all of chi_phono, so OK to set it directly to CHI later.

inter_success_utts = raw_success_utts.copy()
inter_yyy_utts = raw_yyy_utts.copy()

for add_attr in [utt_age, utt_name, utt_transcript]:
    inter_success_utts = inter_success_utts.merge(add_attr, on = 'utterance_id')
    inter_yyy_utts = inter_yyy_utts.merge(add_attr, on = 'utterance_id')

# Merge the glosses separately because they aren't the same for both the successes and the yyy.
#Generate the glosses per utterance id
utt_gloss_save_success = data_cleaning.gloss_df_augmentation(chi_phono, raw_success_utts.utterance_id)
utt_gloss_save_yyy = data_cleaning.gloss_df_augmentation(chi_phono, raw_yyy_utts.utterance_id)
    
success_utts = inter_success_utts.merge(utt_gloss_save_success, on = 'utterance_id')
yyy_utts = inter_yyy_utts.merge(utt_gloss_save_yyy, on = 'utterance_id')

utts_with_ages = pd.concat([success_utts, yyy_utts])

assert len(set(utts_with_ages['utterance_id'])) == utts_with_ages.shape[0],\
"Make sure that the utterance id is a unique identifier for the observations in the yyy and success dataframes"
assert len(set(utt_age['utterance_id'])) == utt_age.shape[0],\
"Make sure that the utterance id is a unique identifier for the observations in the utt_age dataframe"

# Changed from the original: the merged dfs don't have the same order of utterance id immediately,
# so am now merging on utterance id

utts_with_ages['year'] = .5*np.floor(utts_with_ages['target_child_age'] / (365. /2) ) 

if config.verbose:
    print(utts_with_ages.loc[utts_with_ages.set == 'failure'].year.value_counts())
    print(utts_with_ages.loc[utts_with_ages.set == 'success'].year.value_counts())


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


Computing gloss df augmentation, 0.0% complete.
Computing gloss df augmentation, 11.9218% complete.
Computing gloss df augmentation, 23.8436% complete.
Computing gloss df augmentation, 35.7654% complete.
Computing gloss df augmentation, 47.6872% complete.
Computing gloss df augmentation, 59.609% complete.
Computing gloss df augmentation, 71.5308% complete.
Computing gloss df augmentation, 83.4526% complete.
Computing gloss df augmentation, 95.3743% complete.
Computing gloss df augmentation, 0.0% complete.
Computing gloss df augmentation, 31.7894% complete.
Computing gloss df augmentation, 63.5789% complete.
Computing gloss df augmentation, 95.3683% complete.
1.5    9919
2.0    7261
1.0    6693
2.5    4895
3.0    2097
3.5     414
0.5     167
4.0      11
Name: year, dtype: int64
2.0    22432
2.5    21194
1.5    16798
3.0    12564
1.0     6697
3.5     3683
4.0      379
0.5      133
Name: year, dtype: int64


In [33]:
# A cell that acts as a check for the utt_age, utt_name augumentations.

# Check that every utterance_id is matched to its right age in yyy/success dataframe
# This is a valid method because all of the utterance IDs are unique per dataframe.

for i in range(utts_with_ages.shape[0]):
    
    if i % 10000 == 0: print(f'{(i / utts_with_ages.shape[0]) * 100.0}% complete')
    this_entry = utts_with_ages.iloc[i]
    this_id = this_entry['utterance_id']
    
    keys_to_check = ['target_child_age', 'target_child_name', 'transcript_id', 'gloss']
    
    cross_entry = chi_phono[chi_phono['utterance_id'] == this_id]
    # Why is the cross value actually still a string?
    # Where is it converted to non-string -- is there a way to convert it to non string?

    for key in keys_to_check:
        
        this_value = this_entry[key]
    
        
        if key == 'gloss' and ' ' in this_entry['gloss']:
            # If this utterance is multiple tokens,
            # you will have to match across multiple entries in chi_phono and join them to make the gloss.
            # For example, idx = 5 using utts_with_ages indexing
            
            # what this checks for
            # 1) you got the right pieces of the gloss
            # 2) they are in the right token order
            
            formatted_cross = list(cross_entry[key])
            
            assert list(cross_entry['token_order']) == list(range(1, 1 + cross_entry.shape[0])),\
            "Cross entry was not sliced in ascending token order, so gloss order of words is wrong."
            
            
            assert this_value == ' '.join(formatted_cross), f'if, at index: {i}, key: {key}, real: {this_value}, cross: {formatted_cross}'
            
        else:
            
            # The item to be matched is a single value or string.
            # Applies to everything but the multiple token gloss.
            # If the gloss of the utt_with_ages is multiple tokens,
            # then it will still match to multiple locations in chi_phono tokens.
            # However, because it's not the gloss attribute itself, the child attribute
            # should be repeated across all of those entries.
            
            if cross_entry.shape[0] == 1:
                cross_single_val = cross_entry[key].item()
            else:
                cross_set = list(set(cross_entry[key]))
                assert len(cross_set) == 1
                cross_single_val = cross_set[0]
                
            assert this_value == cross_single_val, f'else, at index: {i}, key: {key}, real: {this_value}, cross: {cross_entry[key]}'

print('Asserts passed.')

0.0% complete
8.670244587599816% complete
17.340489175199632% complete
26.010733762799447% complete
34.680978350399265% complete
43.351222937999076% complete
52.021467525598894% complete
60.69171211319871% complete
69.36195670079853% complete
78.03220128839835% complete
86.70244587599815% complete
95.37269046359798% complete
Asserts passed.


## Get the samples and splits

In [107]:
# Partition all_tokens_phono into a "val" and "eval" phase for use in the age/old, age/young, and all/all evaluations.

# Go ahead and limit all_tokens_phono to successes only.

phono_pool = all_tokens_phono[all_tokens_phono.success_token | all_tokens_phono.yyy_token]

split_attr = 'transcript_id'
phono_val_idxs, phono_eval_idxs = split_gen.determine_split_idxs(phono_pool, split_attr, 0.5)
phono_phase_data = dict()

for phase, idx_set in zip(['val', 'eval'], [phono_val_idxs, phono_eval_idxs]):
    # It's on transcript_id, not actual idx, so this is OK.
    # all_tokens_phono will receive the val/eval phase marking where it applies.
     
    this_phase_data, all_tokens_phono = split_gen.assign_and_find_phase_data(phase, split_attr, idx_set, all_tokens_phono)
    phono_phase_data[phase] = this_phase_data

all_tokens_phono = data_cleaning.augment_target_child_year(all_tokens_phono)

# Below: For debugging only
all_tokens_phono.to_pickle(join(config.eval_dir, 'pvd_all_tokens_phono_for_eval_before_child.pkl'))

# This is not the same as pvd_utt_glosses_phono_cleaned_inflated.pkl', avoid this name to avoid confusion.

FileNotFoundError: [Errno 2] No such file or directory: '/home/nwong/chompsky/childes/child_listening_continuation/child-directed-listening/eval/new_splits/pvd_all_tokens_phono_for_eval_before_child.pkl'

In [118]:
# What to do here? Resampling for 10 for now -- in order to make sure child scripts run fine.

print('Getting age split data')

young_phono, old_phono = split_gen.get_age_split_data(all_tokens_phono)

phono_pool = [
    all_tokens_phono,
    young_phono,
    old_phono
]

model_args = [('all', 'all'), ('age', 'young'), ('age', 'old')]

for (split_name, dataset_name), this_phono_raw in zip(model_args, phono_pool):
    
    print('Processing', split_name, dataset_name)
    phono_phase = this_phono_raw[this_phono_raw.phase == 'val']

    # age = None means don't filter on a given age
    result_beta_sample = load_splits.sample_successes('beta', split_name, dataset_name, None, phono_phase, 'val')        

    print('\tbeta sample', result_beta_sample.shape)


# Dropping ages 0.5 and 4.0 because of data sparsity.
# -- for 4.0 there is only one transcript so it's not possible to do a val/eval split.
# -- for 1.0 it's possible to have a sample size of 1 or 0, which is too unstable.

used_ages = list(set(all_tokens_phono['year'].dropna()))
for age in used_ages[1:-1]:
    
    for phase in ['val', 'eval']:
        
        for sample_func in [load_splits.sample_successes, load_splits.sample_yyy]:

            if sample_func == load_splits.sample_successes:
                print('successes')
            else:
                print('failures')

            phono_phase = all_tokens_phono[all_tokens_phono.phase == phase]
            this_age_sample = sample_func('models_across_time', None, None, age, phono_phase, phase)        

            print('age sample', this_age_sample.shape)

Getting age split data
Processing all all
{'val'}
Resampling for: beta, all, all, age: None, phase: val
	beta sample (5000, 1)
Processing age young
{'val'}
Resampling for: beta, age, young, age: None, phase: val
	beta sample (5000, 1)
Processing age old
{'val'}
Resampling for: beta, age, old, age: None, phase: val
	beta sample (5000, 1)
successes
{'val'}
Resampling for: models_across_time, None, None, age: 1.0, phase: val
age sample (3787, 1)
failures
{'val'}
Resampling for: models_across_time, None, None, age: 1.0, phase: val
age sample (3173, 1)
successes
{'eval'}
Resampling for: models_across_time, None, None, age: 1.0, phase: eval
age sample (2910, 1)
failures
{'eval'}
Resampling for: models_across_time, None, None, age: 1.0, phase: eval
age sample (3520, 1)
successes
{'val'}
Resampling for: models_across_time, None, None, age: 2.0, phase: val
age sample (5000, 1)
failures
{'val'}
Resampling for: models_across_time, None, None, age: 2.0, phase: val
age sample (3669, 1)
successes
{'

# Child work

In [108]:
# Map the all_tokens_phono type information to the utterance_ids
# So that they are accessible by success_utts/yyy_utts.

# This is also the pool, actually 
# Note that filtering may be easier at this point.
# Or some punct. won't be available

id_type_pairs = list( { (t_id, t_type) for t_id, t_type in zip(all_tokens_phono['id'], all_tokens_phono['type']) } )
visited_ids = set()

id2type = dict(id_type_pairs)

In [109]:

# Check for above

visited_ids = set()
for t_id, t_type in id_type_pairs:
    
    # Every id corresponds to exactly one type
    assert t_id not in visited_ids
    visited_ids.add(t_id)
    
assert visited_ids == set(all_tokens_phono['id']), "Not all ids were visited."

print('Asserts passed')

Asserts passed


In [110]:

# Below are checks needed to ensure that disjoint splitting scheme is truly disjoint across train/val,
# even with separate yyy/successes assignments.

unique_success_ids = set(success_utts['utterance_id'])
unique_yyy_ids = set(yyy_utts['utterance_id'])

assert len(unique_success_ids & unique_yyy_ids) == 0,\
"Overlap in utterance id exists between successes and yyy utterances."
assert len(unique_success_ids) == success_utts.shape[0], "Utterance ids are not unique in success_utts dataframe."
assert len(unique_yyy_ids) == yyy_utts.shape[0], "Utterance ids are not unique in yyy_utts dataframe."

print('Disjoint assert assumptions passed.')


Disjoint assert assumptions passed.


In [111]:

# Filter out any utterances that aren't in all_tokens_phono.id

filtered_id = all_tokens_phono[all_tokens_phono.success_token | all_tokens_phono.yyy_token].id

success_child_pool = load_splits.get_utts_from_ids(success_utts, filtered_id)
yyy_child_pool = load_splits.get_utts_from_ids(yyy_utts, filtered_id)

# You need the speaker code to use the prep_utt_glosses function
assert all(chi_phono.speaker_code == 'CHI') # The source of all of the utts_with_ages, success_utts

assign_type = lambda this_id : id2type[this_id]

# Recover the punctuation.
for df in [success_child_pool, yyy_child_pool]:
    df['type'] = list(map(assign_type, df['utterance_id']))
    df['speaker_code'] = ['CHI' for _ in range(df.shape[0])]

child_names = set(success_child_pool['target_child_name'])

split_attr = 'transcript_id'


# Prep finetuning data for text file writing.
success_child_pool = data_cleaning.prep_utt_glosses(success_child_pool)

Cell 232 output (83880, 10)
Cell 233 output (83880, 11)
declarative                   67290
question                      10044
imperative_emphatic            3375
trail off                      1985
interruption                    564
self interruption               403
trail off question              125
interruption question            53
self interruption question       41
Name: type, dtype: int64
Cell 238 utterance_id            16759315
num_xxx                        0
num_yyy                        0
set                      success
target_child_age           514.0
target_child_name           Alex
transcript_id              42204
gloss                      Mommy
type                 declarative
speaker_code                 CHI
contains_error             False
punct                          .
Name: 0, dtype: object
Cell 269 0    [CHI] mommy.
1      [CHI] wee.
2      [CHI] wee.
3      [CHI] woo.
4    [CHI] ernie.
Name: gloss_with_punct, dtype: object


In [112]:

split_attr = 'transcript_id'

for name in child_names:
    
    print(f'Processing: {name}')
    this_child_phono = all_tokens_phono[all_tokens_phono.target_child_name == name]
    
    this_success_phono = this_child_phono[this_child_phono.success_token]
    this_yyy_phono = this_child_phono[this_child_phono.yyy_token]
    
    this_partition_folder = split_gen.get_split_folder('child', name, config.data_dir)
 
    # Split successes
    train_success_idxs_1, val_success_idxs = child_split_gen.split_train_eval_idxs(this_success_phono, split_attr, 'val')
    train_success_idxs_2, eval_success_idxs =  child_split_gen.split_train_eval_idxs(this_success_phono, split_attr, 'eval')
    
    # Split yyy
    _, val_yyy_idxs = child_split_gen.split_train_eval_idxs(this_yyy_phono, split_attr, 'val')
    _, eval_yyy_idxs =  child_split_gen.split_train_eval_idxs(this_yyy_phono, split_attr, 'eval')
    
    # Combine the proper indices into their phases
    
    train_idxs = np.concatenate([train_success_idxs_1, train_success_idxs_2])
    eval_idxs = np.concatenate([eval_success_idxs, eval_yyy_idxs])
    val_idxs = np.concatenate([val_yyy_idxs, eval_yyy_idxs])
    
    # Write the new split information to all_tokens_phono
    list_idxs = [train_idxs, val_idxs, eval_idxs]
    list_child_phases = [f'{name}_{phase}' for phase in ['train', 'val', 'eval']]
    
    for idx_set, phase_name in zip(list_idxs, list_child_phases):
        
        # Make a new attribute for all_tokens_phono parallel to phase (which is the val/eval split defined above)
        _, all_tokens_phono = split_gen.assign_and_find_phase_data(phase_name, split_attr, idx_set, all_tokens_phono, phase_label = 'phase_child')
    
    # Sample beta from the respective splits val/eval
    # and not the same as the evaluation transcripts selected above.
    
    train_pool = all_tokens_phono[all_tokens_phono.phase_child == f'{name}_train']
    
    val_sample = child_split_gen.get_beta_idxs(train_pool, 'transcript_id', 'val')
    eval_sample = child_split_gen.get_beta_idxs(train_pool, 'transcript_id', 'eval')
    
    
    for phase, sample in zip(['val', 'eval'], [val_sample, eval_sample]):
        this_path = load_splits.get_sample_path('success', 'beta', 'child', name, eval_phase = phase)
        sample.to_csv(this_path)
        
        print(f'\tWriting beta samples for phase {phase}, to {this_path}, sample size: {sample.shape}')
    
    # Write the train and val to the text files for training.
    
    val_pool = all_tokens_phono[all_tokens_phono.phase_child == f'{name}_val']
    
    for phase, phase_data in zip(['train', 'val'], [train_pool, val_pool]):
        split_gen.write_partition(phase, phase_data, this_partition_folder)
    

Processing: William
	Writing beta samples for phase val, to /home/nwong/chompsky/childes/child_listening_continuation/child-directed-listening/eval/new_splits/child/William/success_utts_beta_5000_val.csv, sample size: (5000, 1)
	Writing beta samples for phase eval, to /home/nwong/chompsky/childes/child_listening_continuation/child-directed-listening/eval/new_splits/child/William/success_utts_beta_5000_eval.csv, sample size: (5000, 1)
File written to /home/nwong/chompsky/childes/child_listening_continuation/child-directed-listening/data/new_splits/child/William/train.txt
File written to /home/nwong/chompsky/childes/child_listening_continuation/child-directed-listening/data/new_splits/child/William/train_no_tags.txt
File written to /home/nwong/chompsky/childes/child_listening_continuation/child-directed-listening/data/new_splits/child/William/val.txt
File written to /home/nwong/chompsky/childes/child_listening_continuation/child-directed-listening/data/new_splits/child/William/val_no_tag

In [114]:
# Write final all_tokens_phono with all split information to the proper place.
all_tokens_phono.to_pickle(join(config.eval_dir, 'pvd_all_tokens_phono_for_eval.pkl'))