## Prepares Providence data

Evaluation set for all and age splits.

Both the finetuning and evaluation set (split) for child finetuning.

In [1]:
%load_ext rpy2.ipython
import rpy2.robjects.lib.ggplot2 as ggplot2
import childespy
import numpy as np
import os
import imp
import pandas as pd
import transformers
import torch
import re
import unicodedata
import scipy.stats
import copy
from string import punctuation

import config
np.random.seed(config.SEED)

In [2]:
from os.path import join, exists

In [3]:
from utils import split_gen, data_cleaning, load_splits, load_csvs, load_models, data_cleaning, transformers_bert_completions

### Prepare and clean the source of individual utterance samples

In [4]:
# Communicative success: how many no-xxx, no-yyy child  utterances are in Providence? 
# Communicative failures: how many one-yyy, no-xxx child utterances are in Providence?
# Subset to instances that are monosyllabic later

In [4]:
pvd_idx = childespy.get_sql_query('select * from corpus where name = "Providence"').iloc[0]['id']

phono_glosses = childespy.get_sql_query('select gloss, target_child_name, target_child_age, \
    speaker_code, actual_phonology, model_phonology, transcript_id, utterance_id, \
    token_order, corpus_name, collection_name, language from token where \
    actual_phonology != "" and model_phonology != "" and collection_name = "Eng-NA" \
    and corpus_id = '+str(pvd_idx) ,
        db_version = "2020.1")

R[write to console]: Using current database version: '2020.1'.

R[write to console]: Using supported database version: '2020.1'.



In [5]:
if config.verbose: 
    print(phono_glosses.corpus_name.value_counts())
    print(phono_glosses.loc[phono_glosses.gloss == 'xxx'].actual_phonology.value_counts())
    print(phono_glosses.loc[phono_glosses.gloss == 'yyy'].actual_phonology.value_counts())

In [6]:
chi_phono = phono_glosses.loc[(phono_glosses.speaker_code == 'CHI') & 
    (phono_glosses.target_child_age < (365*5))]

def count_transmission_errors(utt_vector, error_codes):
    return(np.sum([x in error_codes for x in  utt_vector]))

In [7]:
xxxs_per_utt = chi_phono.groupby('utterance_id').gloss.agg(
    lambda x: count_transmission_errors(x, ['xxx'])).reset_index()
xxxs_per_utt.columns = ['utterance_id', 'num_xxx']
yyys_per_utt = chi_phono.groupby('utterance_id').gloss.agg(
    lambda x: count_transmission_errors(x, ['yyy'])).reset_index()
yyys_per_utt.columns = ['utterance_id', 'num_yyy']
failures_per_utt = xxxs_per_utt.merge(yyys_per_utt)

raw_yyy_utts = failures_per_utt.loc[(failures_per_utt.num_xxx == 0) &  (failures_per_utt.num_yyy == 1)]

if config.verbose: print(raw_yyy_utts.shape)

raw_success_utts = failures_per_utt.loc[(failures_per_utt.num_xxx == 0) &  
    (failures_per_utt.num_yyy == 0)]

if config.verbose: print(raw_success_utts.shape)
    

### Prepare and clean Providence data 

Corresponds to: 4 | Prep Utterances / Tokens for BERT,
    in the original notebook

In [8]:

# Get the index of the Providence corpus
pvd_idx = childespy.get_sql_query('select * from corpus where name = "Providence"').iloc[0]['id']

# Load utterances from the Providence corpus from childs-db

if config.regenerate:
    raw_utt_glosses = childespy.get_sql_query('select gloss, transcript_id, id, \
    utterance_order, speaker_code, target_child_name, target_child_age, type from utterance where corpus_id = '+str(pvd_idx) ,
        db_version = "2020.1")
    raw_utt_glosses.to_csv('csv/pvd_utt_glosses.csv', index=False)
else: 
    raw_utt_glosses = load_csvs.load_csv_with_lists('csv/pvd_utt_glosses.csv')

R[write to console]: Using current database version: '2020.1'.



In [9]:
for_chi_phono_utts = raw_utt_glosses.copy() # Avoid cleaning the glosses for the utt_glosses twice (see prep code for child splits)
utt_glosses = data_cleaning.clean_glosses(for_chi_phono_utts, '.')

if config.verbose: utt_glosses[utt_glosses.id == 17280964]

### Build the Vocabulary

In [10]:

cmu_2syl_inchildes = load_models.get_cmu_dict_info()

# tokenize with the most extensive tokenizer, which is the one used for model #2

initial_tokenizer = load_models.get_meylan_original_model(with_tags = True)['tokenizer']

initial_tokenizer.add_tokens(['yyy','xxx']) #must maintain xxx and yyy for alignment,
# otherwwise, BERT tokenizer will try to separate these into x #x and #x and y #y #y
inital_vocab_mask, initial_vocab = transformers_bert_completions.get_softmax_mask(initial_tokenizer,
    cmu_2syl_inchildes.word)

# confirm yyy treated as a separate character
assert initial_tokenizer.tokenize('this is a yyy.') == ['this', 'is', 'a', 'yyy', '.']

cmu_in_initial_vocab = cmu_2syl_inchildes.loc[cmu_2syl_inchildes.word.isin(initial_vocab)]

if config.verbose: print(cmu_in_initial_vocab.shape)
    

In [11]:

# build a dataframe of tokens 
# this is slow, because tokenization is slow
def inflate (row):
    tokens = initial_tokenizer.tokenize(row['gloss_with_punct'])
    return(pd.DataFrame({'token':tokens, 'id':row['id']}) )

if config.regenerate:
    all_tokens = pd.concat([inflate(x) for x in utt_glosses.to_dict('records')])
    all_tokens = all_tokens.merge(utt_glosses)
    all_tokens.to_csv('csv/pvd_utt_glosses_inflated.csv')

else:
    all_tokens = load_csvs.load_csv_with_lists('csv/pvd_utt_glosses_inflated.csv', na_filter=False)

if config.verbose: print(all_tokens.iloc[0:10])

# Assign a token_id (integer in the BERT vocabulary). 
# Because these are from the tokenized utterances, there is no correpsondence 
# with childes-db token ids
all_tokens['token_id'] = initial_tokenizer.convert_tokens_to_ids(all_tokens['token'])
# assigns utterances a 0-indexed index column
all_tokens['seq_utt_id'] = all_tokens['id'].astype('category').cat.codes


### Add back IPA, syllable structure, and child ages for child productions

In [12]:
# get the token-level data, esp phonology

if config.regenerate:

    # get token-level information for Providence
    pvd_chi_tokens = childespy.get_sql_query('select gloss, target_child_name, target_child_age, \
    speaker_code, actual_phonology, model_phonology, transcript_id, utterance_id, \
    token_order from token where speaker_code = "CHI" and corpus_id = '+str(pvd_idx),
        db_version = "2020.1")
    pvd_chi_tokens['gloss'] = [data_cleaning.fix_gloss(x) for x in pvd_chi_tokens.gloss]
    
    # prep the tokens generated from segmenting the utterances
    all_tokens_test = copy.deepcopy(all_tokens) 

    # initialize the fields that need to be populated
    all_tokens_test['actual_phonology'] = ''
    all_tokens_test['model_phonology'] = ''
    all_tokens_test['target_child_age'] = np.nan
    
    # get a set of unique utterances
    _, idx = np.unique(all_tokens_test.id, return_index=True)
    all_utt_indices = all_tokens_test.id[np.sort(idx)]
    
    # For fast retrieval of IPA, split pvd_chi_tokens into a dictionary
    pvd_chi_tokens_list = pvd_chi_tokens.groupby(['utterance_id'])
    pvd_chi_tokens_dict = dict(zip(
        [x[0] for x in pvd_chi_tokens_list], 
        [x[1] for x in pvd_chi_tokens_list], 
    ))
    
    # For fast retrival of BERT tokenization
    all_tokens_test_list = all_tokens_test.groupby(['id'])
    all_tokens_test_dict = dict(zip(
        [x[0] for x in all_tokens_test_list], 
        [x[1] for x in all_tokens_test_list], 
    ))
        
    # Augment the tokens from all_tokens with the IPA from pvd_chi_tokens 
    rvs = [] 
    utts_to_retrieve = raw_yyy_utts.utterance_id.to_list() + raw_success_utts.utterance_id.to_list()
    i=-1
    for utt_index in all_utt_indices: #utts_to_retrieve: #[16760331]:       
        i+=1
        if i % int(len(all_utt_indices) / 100) == 0:
            print(str(np.round((i / (len(all_utt_indices)) * 100),2))+'% complete...')    
            # should learn to use tqdm instead
        if utt_index in utts_to_retrieve:        
            utt_df = copy.deepcopy(all_tokens_test_dict[utt_index])
            utt_df['model_phonology'] = transfomers_bert_completions.augment_with_ipa(
              utt_df, pvd_chi_tokens_dict[utt_index],initial_tokenizer, 'model_phonology')
            utt_df['actual_phonology'] = transfomers_bert_completions.augment_with_ipa(
              utt_df, pvd_chi_tokens_dict[utt_index],initial_tokenizer, 'actual_phonology')
            utt_df['target_child_age'] = pvd_chi_tokens_dict[utt_index].iloc[0].target_child_age    
            rvs.append(utt_df)  
        else:
            rvs.append(all_tokens_test_dict[utt_index])  
            
    # get the resulting augmented forms back into a dataframe
    all_tokens_phono = pd.concat(rvs)
    
    # add a unique identifier to the BERT tokens
    all_tokens_phono['bert_token_id'] = range(all_tokens_phono.shape[0])
    
    #save the results
    all_tokens_phono.to_pickle('csv/pvd_utt_glosses_phono_inflated.pkl')
else:
    all_tokens_phono = pd.read_pickle('csv/pvd_utt_glosses_phono_inflated.pkl')

In [13]:
# Get the IPA map
phone_map_df = load_csvs.load_csv_with_lists('phon/phon_map_populated.csv')

In [14]:
if config.verbose:
    # Inspect the IPA
    print(all_tokens_phono.loc[all_tokens_phono.actual_phonology != ''][['token','actual_phonology','model_phonology']])
    print(phone_map_df.head())

In [15]:
def phone_remap(x):
    return(x.replace("ː","").replace('ʌ','ə')
.replace('ɪ','ə').replace('ɔ','ɑ').replace('a','ɑ').replace('o','oʊ').replace('˞','').replace('ʰ',
    ''). replace('r','ɹ')).replace('\\^','').replace('\\ ̃','').replace(' ̩','').replace('^',''
).replace('ʙ','b').replace('(','').replace(')','').replace('.','').replace('ch','ʧ'
).replace('c','k').replace('g','ɡ').replace('y','j').replace('ʁ','ɹ')

def strip_accents(string, accents=('COMBINING ACUTE ACCENT', 
    'COMBINING GRAVE ACCENT', 'COMBINING TILDE', 'COMBINING VERTICAL LINE BELOW',
    'COMBINING SHORT STROKE OVERLAY')):
    accents = set(map(unicodedata.lookup, accents))
    chars = [c for c in unicodedata.normalize('NFD', string) if c not in accents]
    return unicodedata.normalize('NFC', ''.join(chars))

cv_map = dict(zip(phone_map_df['ipa'], phone_map_df['c_or_v']))
cv_map['o'] = 'v' 
cv_map['ɜ'] = 'v'
cv_map['e'] = 'v'
cv_map['ʔ'] = 'c'
cv_map['ɾ'] = 'c'
cv_map['ɲ'] = 'c'
cv_map['x'] = 'c'
cv_map['ɱ'] = 'c'
cv_map['ɣ'] = 'c'

def cv_mapper(x, cv_map):
    try:
        return(cv_map[x])
    except:
        raise ValueError(x)

if config.regenerate:    

    # Do the same excludes as were used to identify appropriate utterances
    excludes = ['*','(.)','(..)', '(...)','(....)','(.....)']
    all_tokens_phono.loc[all_tokens_phono.actual_phonology.isin(excludes),'actual_phonology'] =''
    all_tokens_phono.loc[all_tokens_phono.actual_phonology.str.contains('V'),'actual_phonology'] =''
    
    # remap phonology from narrow phonetic transcription to broad phonological transcription
    all_tokens_phono['model_phonology_clean'] = [phone_remap(x) for x in all_tokens_phono['model_phonology']]
    all_tokens_phono['actual_phonology_clean'] = [phone_remap(x) for x in all_tokens_phono['actual_phonology']]

    # remove any non-combining diacritical marks
    all_tokens_phono['model_phonology_no_dia'] = [strip_accents(x) for x in \
    all_tokens_phono['model_phonology_clean']]
    all_tokens_phono['actual_phonology_no_dia'] = [strip_accents(x) for x in \
    all_tokens_phono['actual_phonology_clean']]
    
    # Compute the number of non-contiguous vowels.
    # slightly different than the cmu vowel computation ---
    # because here we are computing it directly from IPA
    all_tokens_phono['cv_raw'] = [''.join([cv_mapper(x, cv_map) for x in list(y)]) if y != '' else '' for y in all_tokens_phono['actual_phonology_no_dia']]    
    all_tokens_phono['cv_collapsed']  = [re.sub(r'(.)\1+', r'\1', str(x)) if x != '' else '' for x in all_tokens_phono['cv_raw']]
    all_tokens_phono['num_vowels'] = [np.sum(np.array(list(x)) == 'v') if x !='' else np.nan for x in all_tokens_phono['cv_collapsed']]
    all_tokens_phono.to_pickle('csv/pvd_utt_glosses_phono_cleaned_inflated.pkl')
else:
    all_tokens_phono = pd.read_pickle('csv/pvd_utt_glosses_phono_cleaned_inflated.pkl')


In [16]:
if config.verbose:
    # Why no actual phonology?
    print(all_tokens_phono.loc[all_tokens_phono.actual_phonology_no_dia != '']['actual_phonology_no_dia'])
    print(all_tokens_phono.shape)

### Identify the tokens that can be evaluated 

In [17]:
successful_utt_ids = set(raw_success_utts['utterance_id']) 
initial_vocab_set = set(initial_vocab)
yyy_utt_ids = set(raw_yyy_utts['utterance_id'])
all_tokens_phono['in_vocab'] = all_tokens_phono['token'].isin(initial_vocab_set)
all_tokens_phono['success_token'] = [x in successful_utt_ids for x in 
    all_tokens_phono['id']]
all_tokens_phono['yyy_token'] = [x in yyy_utt_ids for x in 
    all_tokens_phono['id']]

In [18]:
if config.verbose:
    print(initial_vocab)
    print(all_tokens_phono.shape)

### Identify the subset of success and failure utterances that have transcriptions

In [19]:
all_tokens_phono['partition'] = 'none'

In [20]:
success_tokens = all_tokens_phono.loc[(all_tokens_phono['success_token']) & 
    (all_tokens_phono['num_vowels'] <= 2) ]
all_tokens_phono.loc[(all_tokens_phono['success_token']) & 
    (all_tokens_phono['num_vowels'] <= 2), 'partition'] = 'success'     

if config.verbose:
    print(success_tokens.shape)
    print(all_tokens_phono.loc[(all_tokens_phono['success_token']) & 
    (all_tokens_phono['num_vowels'] <= 2)])

In [21]:
yyy_tokens = all_tokens_phono.loc[(all_tokens_phono['yyy_token']) & 
(all_tokens_phono['token'] == 'yyy') & (all_tokens_phono.num_vowels <= 2) ]
all_tokens_phono.loc[(all_tokens_phono['yyy_token']) & 
(all_tokens_phono['token'] == 'yyy') & (all_tokens_phono.num_vowels <= 2),'partition'] = 'yyy'


In [22]:
if config.verbose:
    print(yyy_tokens.shape)
    print(all_tokens_phono.partition.value_counts())
    print(initial_tokenizer.unk_token_id)

In [23]:
all_tokens_phono.loc[all_tokens_phono.token == 'xxx','token_id'] = initial_tokenizer.unk_token_id
all_tokens_phono.loc[all_tokens_phono.token == 'yyy','token_id'] = initial_tokenizer.unk_token_id

## Additional section from 6 | Prevalence of Successes and Failures Over Time

Need to augment successes/failures with information on age.

In [24]:
# get number of tokens per age
# Warnings were present in the original code

raw_success_utts['set'] = 'success'
raw_yyy_utts['set'] = 'failure'

utt_age = chi_phono.groupby('utterance_id').target_child_age.agg(np.unique).reset_index()

# Additional attributes needed for the text split.
utt_name = chi_phono.groupby('utterance_id').target_child_name.agg(np.unique).reset_index()
utt_transcript = chi_phono.groupby('utterance_id').transcript_id.agg(np.unique).reset_index()
# Manually asserted that speaker code is always CHI for all of chi_phono, so OK to set it directly to CHI later.

inter_success_utts = raw_success_utts.copy()
inter_yyy_utts = raw_yyy_utts.copy()

for add_attr in [utt_age, utt_name, utt_transcript]:
    inter_success_utts = inter_success_utts.merge(add_attr, on = 'utterance_id')
    inter_yyy_utts = inter_yyy_utts.merge(add_attr, on = 'utterance_id')

# Merge the glosses separately because they aren't the same for both the successes and the yyy.
#Generate the glosses per utterance id
utt_gloss_save_success = data_cleaning.gloss_df_augmentation(chi_phono, raw_success_utts.utterance_id)
utt_gloss_save_yyy = data_cleaning.gloss_df_augmentation(chi_phono, raw_yyy_utts.utterance_id)
    
success_utts = inter_success_utts.merge(utt_gloss_save_success, on = 'utterance_id')
yyy_utts = inter_yyy_utts.merge(utt_gloss_save_yyy, on = 'utterance_id')

utts_with_ages = pd.concat([success_utts, yyy_utts])

assert len(set(utts_with_ages['utterance_id'])) == utts_with_ages.shape[0],\
"Make sure that the utterance id is a unique identifier for the observations in the yyy and success dataframes"
assert len(set(utt_age['utterance_id'])) == utt_age.shape[0],\
"Make sure that the utterance id is a unique identifier for the observations in the utt_age dataframe"

# Changed from the original: the merged dfs don't have the same order of utterance id immediately,
# so am now merging on utterance id

utts_with_ages['year'] = .5*np.floor(utts_with_ages['target_child_age'] / (365. /2) ) 

if config.verbose:
    print(utts_with_ages.loc[utts_with_ages.set == 'failure'].year.value_counts())
    print(utts_with_ages.loc[utts_with_ages.set == 'success'].year.value_counts())


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


Computing gloss df augmentation, 0.0% complete.
Computing gloss df augmentation, 11.9218% complete.
Computing gloss df augmentation, 23.8436% complete.
Computing gloss df augmentation, 35.7654% complete.
Computing gloss df augmentation, 47.6872% complete.
Computing gloss df augmentation, 59.609% complete.
Computing gloss df augmentation, 71.5308% complete.
Computing gloss df augmentation, 83.4526% complete.
Computing gloss df augmentation, 95.3743% complete.
Computing gloss df augmentation, 0.0% complete.
Computing gloss df augmentation, 31.7894% complete.
Computing gloss df augmentation, 63.5789% complete.
Computing gloss df augmentation, 95.3683% complete.


In [25]:
# A cell that acts as a check for the utt_age, utt_name augumentations.

# Check that every utterance_id is matched to its right age in yyy/success dataframe
# This is a valid method because all of the utterance IDs are unique per dataframe.

for i in range(utts_with_ages.shape[0]):
    
    if i % 10000 == 0: print(f'{(i / utts_with_ages.shape[0]) * 100.0}% complete')
    this_entry = utts_with_ages.iloc[i]
    this_id = this_entry['utterance_id']
    
    keys_to_check = ['target_child_age', 'target_child_name', 'transcript_id', 'gloss']
    
    cross_entry = chi_phono[chi_phono['utterance_id'] == this_id]
    # Why is the cross value actually still a string?
    # Where is it converted to non-string -- is there a way to convert it to non string?

    for key in keys_to_check:
        
        this_value = this_entry[key]
    
        
        if key == 'gloss' and ' ' in this_entry['gloss']:
            # If this utterance is multiple tokens,
            # you will have to match across multiple entries in chi_phono and join them to make the gloss.
            # For example, idx = 5 using utts_with_ages indexing
            
            # what this checks for
            # 1) you got the right pieces of the gloss
            # 2) they are in the right token order
            
            formatted_cross = list(cross_entry[key])
            
            assert list(cross_entry['token_order']) == list(range(1, 1 + cross_entry.shape[0])),\
            "Cross entry was not sliced in ascending token order, so gloss order of words is wrong."
            
            
            assert this_value == ' '.join(formatted_cross), f'if, at index: {i}, key: {key}, real: {this_value}, cross: {formatted_cross}'
            
        else:
            
            # The item to be matched is a single value or string.
            # Applies to everything but the multiple token gloss.
            # If the gloss of the utt_with_ages is multiple tokens,
            # then it will still match to multiple locations in chi_phono tokens.
            # However, because it's not the gloss attribute itself, the child attribute
            # should be repeated across all of those entries.
            
            if cross_entry.shape[0] == 1:
                cross_single_val = cross_entry[key].item()
            else:
                cross_set = list(set(cross_entry[key]))
                assert len(cross_set) == 1
                cross_single_val = cross_set[0]
                
            assert this_value == cross_single_val, f'else, at index: {i}, key: {key}, real: {this_value}, cross: {cross_entry[key]}'

print('Asserts passed.')

0.0% complete
8.670244587599816% complete
17.340489175199632% complete
26.010733762799447% complete
34.680978350399265% complete
43.351222937999076% complete
52.021467525598894% complete
60.69171211319871% complete
69.36195670079853% complete
78.03220128839835% complete
86.70244587599815% complete
95.37269046359798% complete
Asserts passed.


## This marks the end of the section before the model evaluations/queries beginning.
## It also marks the end of chi_phono generation -- do NOT re-run above or you will double-merge and lose the target_child_age attribute

In [26]:

def save_eval_data(data, filename, split_name, dataset_name):
    
    assert split_name in ['all', 'age', 'child'], "Invalid split name. Must be one of {all, age, child}."
    
    # Saving based on a mask of a copy of a? Will this be a problem?
    
    save_path = split_gen.get_split_folder(split_name, dataset_name, config.eval_dir)
    save_location = join(save_path, filename)
    if filename.endswith('.pkl'):
        data.to_pickle(save_location)
    elif filename.endswith('.csv'):
        data.to_csv(save_location)
    else:
        assert False, "Tried to save something that was neither a pkl nor a csv."
    
    print(f'Saved all/all evaluation data to {save_location}')
    
    return save_location

In [27]:
phono_filename = 'pvd_utt_glosses_phono_cleaned_inflated.pkl'
success_utts_filename = 'success_utts.csv'
yyy_utts_filename = 'yyy_utts.csv'

data_filenames = [phono_filename, success_utts_filename, yyy_utts_filename]

# Use this line from the original code and load the above two CSVs for model inputs later:
# utts_with_ages = pd.concat([success_utts, yyy_utts]).merge(utt_age)

# for the input into the actual models.

### Save evaluation data for all split.

In [28]:

for this_data, filename in zip([all_tokens_phono, success_utts, yyy_utts], data_filenames):
    save_eval_data(this_data, filename, 'all', 'all')


Saved all/all evaluation data to /home/nwong/chompsky/childes/child_listening_continuation/child-directed-listening/eval/new_splits/all/all/pvd_utt_glosses_phono_cleaned_inflated.pkl
Saved all/all evaluation data to /home/nwong/chompsky/childes/child_listening_continuation/child-directed-listening/eval/new_splits/all/all/success_utts.csv
Saved all/all evaluation data to /home/nwong/chompsky/childes/child_listening_continuation/child-directed-listening/eval/new_splits/all/all/yyy_utts.csv


### Save evaluation data for age split.

In [29]:

young_tokens_phono, old_tokens_phono = split_gen.get_age_split_data(all_tokens_phono)
young_success_utts, old_success_utts = split_gen.get_age_split_data(success_utts)
young_yyy_utts, old_yyy_utts = split_gen.get_age_split_data(yyy_utts)

for this_data, filename in zip([old_tokens_phono, old_success_utts, old_yyy_utts], data_filenames):
    save_eval_data(this_data, filename, 'age', 'old')
    
for this_data, filename in zip([young_tokens_phono, young_success_utts, young_yyy_utts], data_filenames):
    save_eval_data(this_data, filename, 'age', 'young') 
    

Saved all/all evaluation data to /home/nwong/chompsky/childes/child_listening_continuation/child-directed-listening/eval/new_splits/age/old/pvd_utt_glosses_phono_cleaned_inflated.pkl
Saved all/all evaluation data to /home/nwong/chompsky/childes/child_listening_continuation/child-directed-listening/eval/new_splits/age/old/success_utts.csv
Saved all/all evaluation data to /home/nwong/chompsky/childes/child_listening_continuation/child-directed-listening/eval/new_splits/age/old/yyy_utts.csv
Saved all/all evaluation data to /home/nwong/chompsky/childes/child_listening_continuation/child-directed-listening/eval/new_splits/age/young/pvd_utt_glosses_phono_cleaned_inflated.pkl
Saved all/all evaluation data to /home/nwong/chompsky/childes/child_listening_continuation/child-directed-listening/eval/new_splits/age/young/success_utts.csv
Saved all/all evaluation data to /home/nwong/chompsky/childes/child_listening_continuation/child-directed-listening/eval/new_splits/age/young/yyy_utts.csv


## Get successes and yyy samples for use in beta fitting and models across time.

In [33]:
# What to do here?

# Temp comment out because I don't want to regen this and lose problematic cases

task_types = ['beta', 'models_across_time']

model_args = [('all_debug', 'all_debug'), ('all', 'all'), ('age', 'young'), ('age', 'old')]
for task in task_types:
    for split_name, dataset_name in model_args:
        for pool in ['yyy', 'success']:      
            eval_data = load_splits.load_eval_data_all(split_name, dataset_name)
            load_splits.sample_successes_yyy(pool, task, split_name, dataset_name, eval_data[f'{pool}_utts'])        

Resampling for: yyy, beta, all_debug, all_debug
Resampling for: success, beta, all_debug, all_debug
Resampling for: yyy, beta, all, all
Resampling for: success, beta, all, all
Resampling for: yyy, beta, age, young
Resampling for: success, beta, age, young
Resampling for: yyy, beta, age, old
Resampling for: success, beta, age, old
Resampling for: yyy, models_across_time, all_debug, all_debug
Resampling for: success, models_across_time, all_debug, all_debug
Resampling for: yyy, models_across_time, all, all
Resampling for: success, models_across_time, all, all
Resampling for: yyy, models_across_time, age, young
Resampling for: success, models_across_time, age, young
Resampling for: yyy, models_across_time, age, old
Resampling for: success, models_across_time, age, old


In [30]:
# check the results 

model_args = [('all_debug', 'all_debug'), ('all', 'all'), ('age', 'young'), ('age', 'old')]


# Was working before with small sample sizes,
# just need to integrate the condition for really big sample sizes check later


# def expected_shape(sample, pool, which_ref_num):
    
#     # Sampled everything available
#     pool_small = (pool.shape[0] < which_ref_num) and (sample.shape[0] == pool.shape[0])
#     # Typical sampling
#     pool_big = (pool.shape[0] >= which_ref_num) and (sample.shape[0] == which_ref_num)
    
#     return (pool_small or pool_big)
    
# expected_beta_shape = lambda df : (df.shape[0] >= config.n_beta and df.shape[0] == config.n_beta) or (df.shape[0] == )
# expected_time_shape = and success_time.shape[0]
# for task in task_types:
#     for split_name, dataset_name in model_args:
        
#         success_beta = load_splits.load_sample_successes('beta', split_name, dataset_name)
#         yyy_beta = load_splits.load_sample_yyy('beta', split_name, dataset_name)
        
#         success_time = load_splits.load_sample_successes('models_across_time', split_name, dataset_name)
#         yyy_time = load_splits.load_sample_yyy('models_across_time', split_name, dataset_name)
        
        
#         assert expected_shape(success_beta, success_beta, config.n_beta) and expected_shape(success_time, config.n_across_time)
#         assert yyy_beta.shape[0] == config.n_beta and yyy_time.shape[0] == config.n_across_time
    

### On hold: Process finetuning and evaluation data for child split.

In [None]:

# Below are checks needed to ensure that disjoint splitting scheme is truly disjoint across train/val,
# even with separate yyy/successes assignments.

unique_success_ids = set(success_utts['utterance_id'])
unique_yyy_ids = set(yyy_utts['utterance_id'])

assert len(unique_success_ids & unique_yyy_ids) == 0,\
"Overlap in utterance id exists between successes and yyy utterances."
assert len(unique_success_ids) == success_utts.shape[0], "Utterance ids are not unique in success_utts dataframe."
assert len(unique_yyy_ids) == yyy_utts.shape[0], "Utterance ids are not unique in yyy_utts dataframe."

print('Disjoint assert assumptions passed.')

In [None]:
# Note you'll have to restart runtime/re-gen all of this on random seed

# For now, define a success as any utterance without any yyy and xxx -- any such utterance can be used for training.
# This is how it's defined in the implementation elsewhere in the other splits
# -- although technically the token itself should also be monosyllabic?

# Because type is not available in the first query,
# just make all of the sentences end with a period for now.


# Save the successes and yyy and write them to files for finetuning.
success_child_pool = success_utts; yyy_child_pool = yyy_utts

# You need the speaker code to use the prep_utt_glosses function
assert all(chi_phono.speaker_code == 'CHI') # The source of all of the utts_with_ages, success_utts

# Augment with defaults. -> need to change this to preserve the punctuation somehow?
# Where to actually find the punctuation?

for df in [success_child_pool, yyy_child_pool]:
    df['type'] = [ 'declarative' for _ in range(df.shape[0])]
    df['speaker_code'] = ['CHI' for _ in range(df.shape[0])]

child_names = set(success_child_pool['target_child_name'])

split_attr = 'transcript_id'

for name in child_names:
    
    # First, isolate the relevant data.
    child_success_utts = success_child_pool[success_child_pool['target_child_name'] == name]
    child_yyy_utts = yyy_child_pool[yyy_child_pool['target_child_name'] == name]
    child_tokens_phono = all_tokens_phono[all_tokens_phono['target_child_name'] == name]
    
    # Prep finetuning data for text file writing and use None to match the processing of the other splits.
    child_success_utts = data_cleaning.prep_utt_glosses(child_success_utts, None)
    

    # Split the successes and prepare them for file writing -- doesn't write to the state of the parent csv
    this_partition_folder = split_gen.get_split_folder('child', name, config.data_dir)
    train_success_idxs, val_eval_success_idxs = split_gen.determine_split_idxs(child_success_utts, split_on = split_attr, val_num = config.child_val_num)
    
    
    # Find the eval data from the val data
    val_eval_child_success_utts = split_gen.find_in_phase_idxs(child_success_utts, val_eval_success_idxs, split_attr)
    val_idxs, eval_idxs = split_gen.determine_split_idxs(val_eval_child_success_utts, split_on = split_attr, val_num = config.child_eval_num)
    
    
    success_dict = {}
    # Mark phases in the data pool and write the finetuning data
    for phase, phase_idxs in zip(['train', 'val', 'eval'], [train_success_idxs, val_success_idxs, eval_success_idxs]):
        # eval.txt is not necessary, but is convenient for identifying and marking eval phase data.
        child_success_utts, this_phase_successes = split_gen.write_data_partitions_text(child_success_utts, this_partition_folder, phase, phase_idxs, split_attr)
        success_dict[phase] = this_phase_successes

    # Identify yyy utts for use in validation. Only identify the validation data -- failures aren't used in training.
    _, yyy_val_eval_idx = split_gen.determine_split_idxs(child_yyy_utts, split_on = split_attr, val_num = config.child_eval_num)
    
    val_eval_child_yyy_utts = split_gen.find_in_phase_idxs(child_yyy_utts, yyy_val_eval_idx, split_attr)
    yyy_val_idx, yyy_eval_idx = split_gen.determine_split_idxs(val_eval_child_yyy_utts, split_on = split_attr, val_num = config.child_eval_num)
    
    yyy_dict = {}
    for phase, phase_idxs in zip(['val', 'eval'], [yyy_val_idx, yyy_eval_idx]):
        val_yyy_utts, this_phase_yyy = split_gen.assign_and_find_phase_data(phase, split_attr, phase_idxs, child_yyy_utts)
        yyy_dict[phase] = this_phase_successes
        
    eval_utts = pd.concat([success_dict['eval'], yyy_dict['eval']])
    this_eval_data_path = split_gen.get_split_folder('child', name, config.eval_dir)
    eval_utts.to_csv(join(this_eval_data_path, 'eval_utts.csv'))
    
    # Save this for use in the notebook analyses. 
    
    
    # Disjoint check at data generation 
    # Check that utterance_id (per entry), used for splitting phases, is disjoint.
    
#     assert len(set(train_utts.utterance_id) & set(pd.concat([val_success_utts, val_yyy_utts]).utterance_id)) == 0, "Train and validation data written was not disjoint."
    # Need to add checks for val and eval.
    
#     pd.concat([child_success_utts, child_yyy_utts]).to_csv(join(this_partition_folder, 'utts_pooled_data_with_phases.csv'))
#     # Note that not all of this data is actually used.
#     # expected behavior is {train, val, none} where yyy not assigned to val split should read "none" in phase.
    
#     print(name, child_success_utts.shape[0], 'number of examples')
    
#     # Save the evaluation-related data.
#     # Note there are no phase marks on the child_tokens_phono right now -- I just save the entire thing.
#     # So technically, train and val are saved together -- but they should be retrieved via utterance_id
#     # prompted by the splits of the utts_with_ages df.
#     # This can be changed in the future if needed. 

    # Note not all of this is eval data, but it's being stored there for consistency
    # with the other splits.
    for this_data, filename in zip([child_tokens_phono, child_success_utts, child_yyy_utts], data_filenames):
        save_eval_data(this_data, filename, 'child', name)
        