## Prepares Providence data

Generates the evaluation sets for the `all` and `age` splits.

Generates both the finetuning and the evaluation sets for the per-child (`child`) split.

In [1]:
%load_ext rpy2.ipython
import rpy2.robjects.lib.ggplot2 as ggplot2
import childespy
import numpy as np
import os
import imp
import pandas as pd
import transformers
import torch
import re
import unicodedata
import scipy.stats
import copy
from string import punctuation

np.random.seed(0)

In [2]:
from os.path import join, exists

In [3]:
from utils import transfomers_bert_completions, split_gen, data_cleaning

### Prepare and clean the source of individual utterance samples

In [4]:
# Communicative success: how many child utterances in Providence contain no xxx and no yyy tokens?
# Communicative failure: how many child utterances contain exactly one yyy token and no xxx tokens?
# The restriction on the number of vowels/syllables is applied later in the notebook.

# When True, cells print intermediate shapes and samples for inspection.
verbose = True

In [5]:
# Look up the database id of the corpus named "Providence" (first matching row).
pvd_idx = childespy.get_sql_query('select * from corpus where name = "Providence"').iloc[0]['id']

# Fetch every token of that corpus (collection "Eng-NA") whose actual_phonology and
# model_phonology fields are both non-empty. The backslash continuations sit inside
# the string literal, so the indentation of the continued lines is part of the query text.
phono_glosses = childespy.get_sql_query('select gloss, target_child_name, target_child_age, \
    speaker_code, actual_phonology, model_phonology, transcript_id, utterance_id, \
    token_order, corpus_name, collection_name, language from token where \
    actual_phonology != "" and model_phonology != "" and collection_name = "Eng-NA" \
    and corpus_id = '+str(pvd_idx) ,
        db_version = "2020.1")

R[write to console]: Using current database version: '2020.1'.

R[write to console]: Using supported database version: '2020.1'.



In [6]:
if verbose: 
    # Number of fetched tokens per corpus name.
    print(phono_glosses.corpus_name.value_counts())
    # Distribution of the actual_phonology values recorded for tokens glossed 'xxx' ...
    print(phono_glosses.loc[phono_glosses.gloss == 'xxx'].actual_phonology.value_counts())
    # ... and for tokens glossed 'yyy'.
    print(phono_glosses.loc[phono_glosses.gloss == 'yyy'].actual_phonology.value_counts())

Providence    396621
Name: corpus_name, dtype: int64
*       26736
ə          10
(.)         7
aɪ          4
ən          2
        ...  
mɪ          1
hɛʔ         1
kjɛ         1
dɪ          1
hɛjə        1
Name: actual_phonology, Length: 76, dtype: int64
ɛ         3206
ʌ         2132
ɪ         1881
ə          512
o          507
          ... 
ʃævəsk       1
tuitə        1
mʌtɑd        1
pɑbæ         1
iæ̃          1
Name: actual_phonology, Length: 30293, dtype: int64


In [7]:
# Keep only tokens whose speaker_code is 'CHI' and whose target_child_age is below 365*5
# (i.e. under five years, assuming target_child_age is expressed in days — TODO confirm).
chi_phono = phono_glosses.loc[(phono_glosses.speaker_code == 'CHI') & 
    (phono_glosses.target_child_age < (365*5))]

def count_transmission_errors(utt_vector, error_codes):
    """Return how many entries of `utt_vector` are members of `error_codes`.

    `utt_vector` is any iterable of glosses and `error_codes` any container that
    supports `in`. The count is produced by `np.sum` over per-entry membership flags.
    """
    is_error = []
    for token in utt_vector:
        is_error.append(token in error_codes)
    return np.sum(is_error)

In [8]:
chi_phono.columns

Index(['gloss', 'target_child_name', 'target_child_age', 'speaker_code',
       'actual_phonology', 'model_phonology', 'transcript_id', 'utterance_id',
       'token_order', 'corpus_name', 'collection_name', 'language'],
      dtype='object')

In [9]:
# Count, for every child utterance, how many of its tokens are glossed 'xxx' and 'yyy'.
glosses_by_utt = chi_phono.groupby('utterance_id').gloss

xxxs_per_utt = glosses_by_utt.agg(
    lambda glosses: count_transmission_errors(glosses, ['xxx'])).reset_index()
xxxs_per_utt.columns = ['utterance_id', 'num_xxx']

yyys_per_utt = glosses_by_utt.agg(
    lambda glosses: count_transmission_errors(glosses, ['yyy'])).reset_index()
yyys_per_utt.columns = ['utterance_id', 'num_yyy']

# Both frames share only the utterance_id column, which is what the merge joins on.
failures_per_utt = pd.merge(xxxs_per_utt, yyys_per_utt)

without_xxx = failures_per_utt.num_xxx == 0

# Failures: exactly one yyy and no xxx. Successes: neither yyy nor xxx.
raw_yyy_utts = failures_per_utt.loc[without_xxx & (failures_per_utt.num_yyy == 1)]
raw_success_utts = failures_per_utt.loc[without_xxx & (failures_per_utt.num_yyy == 0)]

if verbose:
    print(raw_yyy_utts.shape)
    print(raw_success_utts.shape)
    

(31457, 3)
(83880, 3)


### Refactor below to the other notebook -- load based on cached successes instead? Unsure if there is a dependency there.

In [10]:
# Probably move this to the other analysis notebook and use caching instead on the data.


# Child tokens that belong to utterances without any xxx or yyy.
in_errorless_utt = chi_phono.utterance_id.isin(raw_success_utts.utterance_id)
tokens_from_errorless_utts = chi_phono.loc[in_errorless_utt]

# Exclude un-transcribed tokens and syllabically transcribed tokens, on either the
# actual or the model side.
excludes = ['*','(.)','(..)', '(...)','(....)','(.....)']
actual_excluded = tokens_from_errorless_utts.actual_phonology.isin(excludes)
model_excluded = tokens_from_errorless_utts.model_phonology.isin(excludes)
tokens_from_errorless_utts = tokens_from_errorless_utts.loc[~(actual_excluded | model_excluded)]

if verbose:
    print(tokens_from_errorless_utts.shape)
    print(tokens_from_errorless_utts.actual_phonology)

# 31,457 transmission errors (from 31,457 utterances)
# 214,239 transmission successes (from 83,880 utterances)
# this will be further decreased later by the need to test monosyllabic forms

(214239, 12)
1          ɑmɪ
3          wiː
4          wiː
5           uː
52           ɛ
          ... 
396606       o
396607     waɪ
396608     liʔ
396609       ɪ
396610    hɪpo
Name: actual_phonology, Length: 214239, dtype: object


### Prepare and clean Providence data 

Corresponds to: 4 | Prep Utterances / Tokens for BERT,
    in the original notebook

In [11]:
from utils import load_models, data_cleaning

In [12]:
# regenerate: when True, query the database and refresh the CSV cache; when False, read the cache.
regenerate = True
verbose = True

# Get the index of the Providence corpus
pvd_idx = childespy.get_sql_query('select * from corpus where name = "Providence"').iloc[0]['id']

# Load utterances from the Providence corpus from childes-db

if regenerate:
    # The backslash continuation is inside the string literal, so the leading spaces of
    # the continued line are part of the query text.
    utt_glosses = childespy.get_sql_query('select gloss, transcript_id, id, \
    utterance_order, speaker_code, target_child_name, target_child_age, type from utterance where corpus_id = '+str(pvd_idx) ,
        db_version = "2020.1")
    # Cache without the index so the cached frame has the same columns as the queried one.
    utt_glosses.to_csv('csv/pvd_utt_glosses.csv', index=False)
else: 
    utt_glosses = pd.read_csv('csv/pvd_utt_glosses.csv')

R[write to console]: Using current database version: '2020.1'.

R[write to console]: Using supported database version: '2020.1'.



In [13]:
# Clean the utterance glosses with the project helper (passing '.' as its second argument).
utt_glosses = data_cleaning.clean_glosses(utt_glosses, '.')

# A bare expression nested inside an `if` is not echoed by the notebook (only a
# top-level trailing expression is), so the inspection has to be printed explicitly.
if verbose:
    print(utt_glosses[utt_glosses.id == 17280964])

### Build the Vocabulary

In [14]:
# Reload the project module so that edits made to it since the first import are picked up.
import importlib
importlib.reload(load_models)

# NOTE(review): absolute, user-specific path — this cell only runs where this directory exists.
root_dir = '/home/nwong/chompsky/childes/child_listening_continuation/child-directed-listening/'

cmu_2syl_inchildes = load_models.get_cmu_dict_info(root_dir = root_dir)

# tokenize with the most extensive tokenizer, which is the one used for model #2

initial_tokenizer = load_models.get_meylan_original_model(with_tags = True, root_dir = root_dir)['tokenizer']

initial_tokenizer.add_tokens(['yyy','xxx']) # must maintain xxx and yyy for alignment;
# otherwise, the BERT tokenizer will try to split them into word pieces (x ##x ##x and y ##y ##y)
# (note: the first returned name is spelled `inital_vocab_mask`)
inital_vocab_mask, initial_vocab = transfomers_bert_completions.get_softmax_mask(initial_tokenizer,
    cmu_2syl_inchildes.word)

# confirm yyy is treated as a single token
assert initial_tokenizer.tokenize('this is a yyy.') == ['this', 'is', 'a', 'yyy', '.']

# Restrict the CMU entries to words that are present in the tokenizer-derived vocabulary.
cmu_in_initial_vocab = cmu_2syl_inchildes.loc[cmu_2syl_inchildes.word.isin(initial_vocab)]

if verbose: print(cmu_in_initial_vocab.shape)
    

(7904, 8)


In [15]:

# build a dataframe of tokens 
# this is slow, because tokenization is slow
def inflate(row):
    """Tokenize one utterance record into a token-per-row DataFrame.

    `row` is a mapping with a 'gloss_with_punct' string and an 'id'. The result has a
    'token' column (output of `initial_tokenizer.tokenize`) and an 'id' column that
    repeats the utterance id on every row.
    """
    pieces = initial_tokenizer.tokenize(row['gloss_with_punct'])
    token_frame = pd.DataFrame({'token': pieces, 'id': row['id']})
    return token_frame

# regenerate: when True, re-tokenize every utterance and refresh the CSV cache;
# when False, read the cache written by a previous run.
regenerate = True
if regenerate:
    # One small frame per utterance, stacked, then joined back to the utterance metadata
    # (the merge uses the columns the two frames share).
    all_tokens = pd.concat([inflate(x) for x in utt_glosses.to_dict('records')])
    all_tokens = all_tokens.merge(utt_glosses)
    all_tokens.to_csv('csv/pvd_utt_glosses_inflated.csv')

else:
    # The cache above is written WITH its index, so the first CSV column is that index.
    # Reading it back as the index keeps both branches column-identical; without
    # index_col the cached frame would carry an extra 'Unnamed: 0' column.
    # na_filter=False keeps every field as written instead of converting values to NaN.
    all_tokens = pd.read_csv('csv/pvd_utt_glosses_inflated.csv', na_filter=False, index_col=0)

if verbose: print(all_tokens.iloc[0:10])

# Assign a token_id (integer in the BERT vocabulary). 
# Because these are from the tokenized utterances, there is no correspondence 
# with childes-db token ids
all_tokens['token_id'] = initial_tokenizer.convert_tokens_to_ids(all_tokens['token'])
# assigns utterances a 0-indexed index column
all_tokens['seq_utt_id'] = all_tokens['id'].astype('category').cat.codes


   token        id                                         gloss  \
0  [cgv]  16759250                    where do you want me to go   
1  where  16759250                    where do you want me to go   
2     do  16759250                    where do you want me to go   
3    you  16759250                    where do you want me to go   
4   want  16759250                    where do you want me to go   
5     me  16759250                    where do you want me to go   
6     to  16759250                    where do you want me to go   
7     go  16759250                    where do you want me to go   
8      ?  16759250                    where do you want me to go   
9  [cgv]  16759261  anywhere you'll feel comfortable um anywhere   

   transcript_id  utterance_order speaker_code target_child_name  \
0          42204                1          OPE              Alex   
1          42204                1          OPE              Alex   
2          42204                1          OPE 

### Add back IPA, syllable structure, and child ages for child productions

In [None]:
# get the token-level data, esp phonology

if regenerate:

    # get token-level information for Providence
    pvd_chi_tokens = childespy.get_sql_query('select gloss, target_child_name, target_child_age, \
    speaker_code, actual_phonology, model_phonology, transcript_id, utterance_id, \
    token_order from token where speaker_code = "CHI" and corpus_id = '+str(pvd_idx),
        db_version = "2020.1")
    pvd_chi_tokens['gloss'] = [data_cleaning.fix_gloss(x) for x in pvd_chi_tokens.gloss]

    # prep the tokens generated from segmenting the utterances (work on a deep copy)
    all_tokens_test = all_tokens.copy()

    # initialize the fields that need to be populated
    all_tokens_test['actual_phonology'] = ''
    all_tokens_test['model_phonology'] = ''
    all_tokens_test['target_child_age'] = np.nan

    # get the unique utterance ids in order of first appearance;
    # np.unique returns positions, hence the positional .iloc lookup
    _, idx = np.unique(all_tokens_test.id, return_index=True)
    all_utt_indices = all_tokens_test.id.iloc[np.sort(idx)]

    # For fast retrieval of IPA, split pvd_chi_tokens into a dictionary keyed by utterance id.
    # Grouping by the column NAME (not a one-element list) keeps the keys scalar; with a
    # list, newer pandas yields 1-tuples as keys and the lookups below would miss.
    pvd_chi_tokens_list = pvd_chi_tokens.groupby('utterance_id')
    pvd_chi_tokens_dict = dict(iter(pvd_chi_tokens_list))

    # For fast retrieval of BERT tokenization
    all_tokens_test_list = all_tokens_test.groupby('id')
    all_tokens_test_dict = dict(iter(all_tokens_test_list))

    # Augment the tokens from all_tokens with the IPA from pvd_chi_tokens
    rvs = []
    # A set makes the per-utterance membership test constant time (it was a list scan).
    utts_to_retrieve = set(raw_yyy_utts.utterance_id) | set(raw_success_utts.utterance_id)

    n_utts = len(all_utt_indices)
    # Report roughly every 1%; never let the step reach 0 when there are < 100 utterances.
    progress_step = max(1, n_utts // 100)
    for i, utt_index in enumerate(all_utt_indices):
        if i % progress_step == 0:
            print(str(np.round((i / n_utts * 100), 2)) + '% complete...')
            # TODO: use tqdm instead
        if utt_index in utts_to_retrieve:
            utt_df = all_tokens_test_dict[utt_index].copy()
            chi_tokens = pvd_chi_tokens_dict[utt_index]
            # model_phonology is filled before actual_phonology, as in the original order
            utt_df['model_phonology'] = transfomers_bert_completions.augment_with_ipa(
              utt_df, chi_tokens, initial_tokenizer, 'model_phonology')
            utt_df['actual_phonology'] = transfomers_bert_completions.augment_with_ipa(
              utt_df, chi_tokens, initial_tokenizer, 'actual_phonology')
            utt_df['target_child_age'] = chi_tokens.iloc[0].target_child_age
            rvs.append(utt_df)
        else:
            # utterances outside the success / yyy sets keep their empty placeholders
            rvs.append(all_tokens_test_dict[utt_index])

    # get the resulting augmented forms back into a dataframe
    all_tokens_phono = pd.concat(rvs)

    # add a unique identifier to the BERT tokens
    all_tokens_phono['bert_token_id'] = range(all_tokens_phono.shape[0])

    #save the results
    all_tokens_phono.to_pickle('csv/pvd_utt_glosses_phono_inflated.pkl')
else:
    all_tokens_phono = pd.read_pickle('csv/pvd_utt_glosses_phono_inflated.pkl')

R[write to console]: Using supported database version: '2020.1'.



0.0% complete...




1.0% complete...
2.0% complete...
3.0% complete...
4.0% complete...
5.0% complete...
6.0% complete...
7.0% complete...
8.0% complete...
9.0% complete...
10.0% complete...
11.0% complete...
12.0% complete...
13.0% complete...
14.0% complete...
15.0% complete...
16.0% complete...
17.0% complete...
18.0% complete...
19.0% complete...
20.0% complete...
21.0% complete...
22.0% complete...
23.0% complete...
24.0% complete...
25.0% complete...
26.0% complete...
27.0% complete...
28.0% complete...
29.0% complete...
30.0% complete...
31.0% complete...
32.0% complete...
33.0% complete...
34.0% complete...
35.0% complete...
36.0% complete...
37.0% complete...
37.99% complete...
38.99% complete...
39.99% complete...
40.99% complete...
41.99% complete...
42.99% complete...
43.99% complete...
44.99% complete...
45.99% complete...
46.99% complete...
47.99% complete...
48.99% complete...
49.99% complete...
50.99% complete...
51.99% complete...
52.99% complete...
53.99% complete...
54.99% complete...
5

In [None]:
# Get the IPA map: a CSV whose 'ipa' and 'c_or_v' columns are used below to build cv_map.
phone_map_df = pd.read_csv('phon/phon_map_populated.csv')

In [None]:
if verbose:
    # Inspect the IPA: tokens that received a non-empty actual phonology, then the phone map.
    has_actual_phonology = all_tokens_phono.actual_phonology != ''
    print(all_tokens_phono.loc[has_actual_phonology][['token','actual_phonology','model_phonology']])
    print(phone_map_df.head())

In [None]:
def phone_remap(x):
    """Apply a fixed, ordered series of literal substring substitutions to `x`.

    The substitutions run one after another on the running result, so their order
    matters (for example 'ch' is rewritten before the remaining 'c' characters are).
    Returns the rewritten string.
    """
    # (old, new) pairs in application order. str.replace is literal, not a regex, so
    # the two entries starting with a backslash only match a literal backslash.
    substitutions = (
        ("ː", ""),
        ('ʌ', 'ə'),
        ('ɪ', 'ə'),
        ('ɔ', 'ɑ'),
        ('a', 'ɑ'),
        ('o', 'oʊ'),
        ('˞', ''),
        ('ʰ', ''),
        ('r', 'ɹ'),
        ('\\^', ''),
        ('\\ ̃', ''),
        (' ̩', ''),
        ('^', ''),
        ('ʙ', 'b'),
        ('(', ''),
        (')', ''),
        ('.', ''),
        ('ch', 'ʧ'),
        ('c', 'k'),
        ('g', 'ɡ'),
        ('y', 'j'),
        ('ʁ', 'ɹ'),
    )
    remapped = x
    for old, new in substitutions:
        remapped = remapped.replace(old, new)
    return remapped

def strip_accents(string, accents=('COMBINING ACUTE ACCENT', 
    'COMBINING GRAVE ACCENT', 'COMBINING TILDE', 'COMBINING VERTICAL LINE BELOW',
    'COMBINING SHORT STROKE OVERLAY')):
    """Remove the named combining marks from `string`.

    `accents` is an iterable of Unicode character names. The string is decomposed
    (NFD), every character whose name is listed is dropped, and the remainder is
    recomposed (NFC) and returned.
    """
    unwanted = {unicodedata.lookup(accent_name) for accent_name in accents}
    decomposed = unicodedata.normalize('NFD', string)
    kept = ''.join(ch for ch in decomposed if ch not in unwanted)
    return unicodedata.normalize('NFC', kept)

# Symbol -> 'c' / 'v' lookup, read from the phone map ...
cv_map = {ipa: c_or_v for ipa, c_or_v in zip(phone_map_df['ipa'], phone_map_df['c_or_v'])}

# ... then extended (or overridden) with these symbols.
cv_map.update({
    'o': 'v',
    'ɜ': 'v',
    'e': 'v',
    'ʔ': 'c',
    'ɾ': 'c',
    'ɲ': 'c',
    'x': 'c',
    'ɱ': 'c',
    'ɣ': 'c',
})

def cv_mapper(x, cv_map):
    """Look up `x` in `cv_map` and return the mapped value.

    Parameters
    ----------
    x : hashable
        The symbol to classify (a single character in this notebook).
    cv_map : mapping
        Symbol -> class lookup.

    Raises
    ------
    ValueError
        If `x` is not a key of `cv_map`. The message shows `repr(x)` so that combining
        marks, which are easy to miss when printed raw, stay visible. The original
        KeyError is attached as the cause.
    """
    try:
        return cv_map[x]
    # Only a missing key means "unknown symbol"; any other failure (e.g. an invalid
    # cv_map) is a different problem and is allowed to propagate unchanged.
    except KeyError as err:
        raise ValueError(f'No entry in cv_map for symbol {x!r}') from err

# regenerate: when True, recompute the cleaned phonology columns and refresh the pickle;
# when False, load the pickle written by a previous run.
regenerate = True
if regenerate:    

    # Do the same excludes as were used to identify appropriate utterances
    excludes = ['*','(.)','(..)', '(...)','(....)','(.....)']
    all_tokens_phono.loc[all_tokens_phono.actual_phonology.isin(excludes),'actual_phonology'] =''
    # Also blank any actual phonology that contains a capital 'V'.
    all_tokens_phono.loc[all_tokens_phono.actual_phonology.str.contains('V'),'actual_phonology'] =''
    
    # remap phonology from narrow phonetic transcription to broad phonological transcription
    all_tokens_phono['model_phonology_clean'] = [phone_remap(x) for x in all_tokens_phono['model_phonology']]
    all_tokens_phono['actual_phonology_clean'] = [phone_remap(x) for x in all_tokens_phono['actual_phonology']]

    # remove the combining diacritical marks listed in strip_accents' defaults
    all_tokens_phono['model_phonology_no_dia'] = [strip_accents(x) for x in \
    all_tokens_phono['model_phonology_clean']]
    all_tokens_phono['actual_phonology_no_dia'] = [strip_accents(x) for x in \
    all_tokens_phono['actual_phonology_clean']]
    
    # Compute the number of non-contiguous vowels.
    # slightly different than the cmu vowel computation ---
    # because here we are computing it directly from IPA
    # cv_raw: one 'c'/'v' per character of the cleaned actual phonology
    # (cv_mapper raises ValueError on a character that has no entry in cv_map).
    all_tokens_phono['cv_raw'] = [''.join([cv_mapper(x, cv_map) for x in list(y)]) if y != '' else '' for y in all_tokens_phono['actual_phonology_no_dia']]    
    # cv_collapsed: runs of the same class are collapsed to a single character.
    all_tokens_phono['cv_collapsed']  = [re.sub(r'(.)\1+', r'\1', str(x)) if x != '' else '' for x in all_tokens_phono['cv_raw']]
    # num_vowels: number of 'v' runs; NaN when there is no phonology to count.
    all_tokens_phono['num_vowels'] = [np.sum(np.array(list(x)) == 'v') if x !='' else np.nan for x in all_tokens_phono['cv_collapsed']]
    all_tokens_phono.to_pickle('csv/pvd_utt_glosses_phono_cleaned_inflated.pkl')
else:
    all_tokens_phono = pd.read_pickle('csv/pvd_utt_glosses_phono_cleaned_inflated.pkl')


In [None]:
if verbose:
    # Sanity check: list the tokens that still have a non-empty cleaned actual phonology
    # (the original question here was why some tokens have no actual phonology).
    print(all_tokens_phono.loc[all_tokens_phono.actual_phonology_no_dia != '']['actual_phonology_no_dia'])
    print(all_tokens_phono.shape)

### Identify the tokens that can be evaluated 

In [None]:
# Flag every BERT token by membership, using vectorised `isin` for all three flags
# (the two utterance-level flags used to be built with Python-level loops).
successful_utt_ids = set(raw_success_utts['utterance_id'])
initial_vocab_set = set(initial_vocab)
yyy_utt_ids = set(raw_yyy_utts['utterance_id'])

# in_vocab: the token itself is part of the evaluation vocabulary
all_tokens_phono['in_vocab'] = all_tokens_phono['token'].isin(initial_vocab_set)
# success_token / yyy_token: the token's utterance id belongs to the respective set
all_tokens_phono['success_token'] = all_tokens_phono['id'].isin(successful_utt_ids)
all_tokens_phono['yyy_token'] = all_tokens_phono['id'].isin(yyy_utt_ids)

In [None]:
if verbose:
    # NOTE(review): this prints the whole vocabulary object, which may be long.
    print(initial_vocab)
    print(all_tokens_phono.shape)

### Identify the subset of success and failure utterances that have transcriptions

In [None]:
# Default label; the following cells overwrite it with 'success' or 'yyy' for the tokens that qualify.
all_tokens_phono['partition'] = 'none'

In [None]:
# Tokens from success utterances with at most two vowel runs.
is_evaluable_success = (all_tokens_phono['success_token']) & (all_tokens_phono['num_vowels'] <= 2)

# Take the selection first, then label the same rows in the full frame.
success_tokens = all_tokens_phono.loc[is_evaluable_success]
all_tokens_phono.loc[is_evaluable_success, 'partition'] = 'success'

if verbose:
    print(success_tokens.shape)
    print(all_tokens_phono.loc[is_evaluable_success])

In [None]:
# The yyy token itself, inside a one-yyy utterance, with at most two vowel runs.
is_evaluable_yyy = ((all_tokens_phono['yyy_token']) &
    (all_tokens_phono['token'] == 'yyy') & (all_tokens_phono.num_vowels <= 2))

# Take the selection first, then label the same rows in the full frame.
yyy_tokens = all_tokens_phono.loc[is_evaluable_yyy]
all_tokens_phono.loc[is_evaluable_yyy, 'partition'] = 'yyy'


In [None]:
if verbose:
    # Size of the yyy selection, number of tokens per partition label, and the
    # tokenizer's unknown-token id.
    print(yyy_tokens.shape)
    print(all_tokens_phono.partition.value_counts())
    print(initial_tokenizer.unk_token_id)

## Additional section from 6 | Prevalence of Successes and Failures Over Time

Need to augment successes/failures with information on age.

In [None]:
# get number of tokens per age

# Label the two utterance sets. `assign` returns a new frame, so this neither mutates
# the `.loc` selections built earlier nor triggers pandas' SettingWithCopyWarning
# (which the in-place column assignment used to raise here).
raw_success_utts = raw_success_utts.assign(set='success')
raw_yyy_utts = raw_yyy_utts.assign(set='failure')

# One age and one child name per utterance, taken from the child's tokens.
# NOTE(review): np.unique is expected to yield a single value per utterance — confirm.
utt_age = chi_phono.groupby('utterance_id').target_child_age.agg(np.unique).reset_index()
utt_name = chi_phono.groupby('utterance_id').target_child_name.agg(np.unique).reset_index()

# Merge on utterance id explicitly: the frames do not share a row order.
success_utts = raw_success_utts.merge(utt_age, on = 'utterance_id').merge(utt_name, on = 'utterance_id')
yyy_utts = raw_yyy_utts.merge(utt_age, on = 'utterance_id').merge(utt_name, on = 'utterance_id')

utts_with_ages = pd.concat([success_utts, yyy_utts]) # Added names for child split.

assert len(set(utts_with_ages['utterance_id'])) == utts_with_ages.shape[0],\
"Make sure that the utterance id is a unique identifier for the observations in the yyy and success dataframes"
assert len(set(utt_age['utterance_id'])) == utt_age.shape[0],\
"Make sure that the utterance id is a unique identifier for the observations in the utt_age dataframe"

# Bin the age into half-year steps (365/2 per step, result expressed in years).
utts_with_ages['year'] = .5*np.floor(utts_with_ages['target_child_age'] / (365. /2) ) 

if verbose:
    print(utts_with_ages.loc[utts_with_ages.set == 'failure'].year.value_counts())
    print(utts_with_ages.loc[utts_with_ages.set == 'success'].year.value_counts())

In [None]:
utts_with_ages.columns # Why is there "target_child_age_x" and such? Need to think about this.
# There is an unexpected interaction... what is a better way to merge the information?

In [None]:
# A cell that acts as a check for the utt_age, utt_name augmentations.

# The checks below rely on utterance ids being unique in utts_with_ages.
assert len(set(utts_with_ages['utterance_id'])) == utts_with_ages.shape[0], "utts with ages doesn't have unique IDs"

# Name check: for every child, the utterances attributed to that child in
# utts_with_ages must be exactly that child's utterances in chi_phono, restricted to
# the utterances that were retained. chi_phono also holds utterances that belong to
# neither the success nor the yyy set, so an unrestricted comparison could never hold.
retained_ids = set(utts_with_ages['utterance_id'])
names = set(chi_phono['target_child_name'])

for name in names:
    source_ids = set(
        chi_phono.loc[chi_phono['target_child_name'] == name, 'utterance_id']) & retained_ids
    merged_ids = set(
        utts_with_ages.loc[utts_with_ages['target_child_name'] == name, 'utterance_id'])

    assert source_ids == merged_ids, f'Utterance ids do not match for child: {name}'

# Age check: every utterance must carry the age recorded for it in utt_age, the frame
# the ages were merged from. (Comparing utts_with_ages with itself can never fail.)
# Done as one vectorised lookup instead of filtering the whole frame once per row.
age_by_utt = utt_age.set_index('utterance_id')['target_child_age']
expected_ages = utts_with_ages['utterance_id'].map(age_by_utt).to_numpy()
observed_ages = utts_with_ages['target_child_age'].to_numpy()

mismatched_ids = utts_with_ages['utterance_id'].to_numpy()[expected_ages != observed_ages]
assert mismatched_ids.size == 0, \
    f'{mismatched_ids.size} utterances have an unexpected age, e.g. utterance ids {list(mismatched_ids[:5])}'



## End of the data-preparation section that precedes the model evaluations/queries.

In [None]:

def save_eval_data(data, filename, split_name, dataset_name, base_dir = 'eval/new_splits'):
    """Pickle `data` into the folder of the requested split / dataset.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to store (written with `DataFrame.to_pickle`).
    filename : str
        File name inside the split folder.
    split_name : str
        One of 'all', 'age', 'child'.
    dataset_name : str
        Dataset within the split (this notebook uses 'all', 'old', 'young').
    base_dir : str
        Root folder handed to `split_gen.get_split_folder`.

    Returns
    -------
    str
        The path the data was written to.
    """
    assert split_name in ['all', 'age', 'child'], "Invalid split name. Must be one of {all, age, child}."

    # Resolve the folder from the arguments. It used to be hard-coded to all/all, so
    # every split was written to the same files and later saves overwrote earlier ones.
    save_path = split_gen.get_split_folder(split_name, dataset_name, base_dir)
    save_location = join(save_path, filename)

    # NOTE(review): the format is always pickle, whatever the extension of `filename`.
    data.to_pickle(save_location)

    print(f'Saved {split_name}/{dataset_name} evaluation data to {save_location}')

    return save_location

In [None]:
# File names under which the evaluation data are written inside each split folder.
# NOTE(review): save_eval_data writes every file with DataFrame.to_pickle, so the two
# "*.csv" files hold pickled data despite their extension — confirm downstream loaders.
phono_filename = 'pvd_utt_glosses_phono_cleaned_inflated.pkl'
success_utts_filename = 'success_utts.csv'
yyy_utts_filename = 'yyy_utts.csv'

# Order matters: it is zipped against [tokens, success utterances, yyy utterances] below.
data_filenames = [phono_filename, success_utts_filename, yyy_utts_filename]

# To rebuild the model inputs later, load the two utterance files above and combine
# them with this line from the original code:
# utts_with_ages = pd.concat([success_utts, yyy_utts]).merge(utt_age)

### Save evaluation data for all split.

In [None]:

# Save the three frames, unfiltered, under their matching file names
# (passing split 'all', dataset 'all').
for this_data, filename in zip([all_tokens_phono, success_utts, yyy_utts], data_filenames):
    save_eval_data(this_data, filename, 'all', 'all')


### Save evaluation data for age split.

In [None]:

# Split each frame into a young and an old part with the project helper (months = 36).
young_tokens_phono, old_tokens_phono = split_gen.get_age_split_data(all_tokens_phono, months = 36)
young_success_utts, old_success_utts = split_gen.get_age_split_data(success_utts, months = 36)
young_yyy_utts, old_yyy_utts = split_gen.get_age_split_data(yyy_utts, months = 36)

# Frames per dataset, listed in the same order as data_filenames; 'old' is saved first.
age_split_frames = [
    ('old', [old_tokens_phono, old_success_utts, old_yyy_utts]),
    ('young', [young_tokens_phono, young_success_utts, young_yyy_utts]),
]

for age_dataset_name, frames in age_split_frames:
    for this_data, filename in zip(frames, data_filenames):
        save_eval_data(this_data, filename, 'age', age_dataset_name)
    

### Process finetuning and evaluation data for child split.

In [None]:

# Note: to regenerate these splits, restart the runtime and re-run everything so that
# the random seed set at the top of the notebook applies from a clean state.

# For now, define a success as any utterance without any yyy and xxx.
# This is how it's defined in the implementation elsewhere -- although technically the
# token itself should also be monosyllabic.

# Iterate in sorted order: a set of strings has no stable iteration order across kernel
# restarts, so the order of the random splits (and of the printed summary) would
# otherwise differ between runs even with a fixed seed.
# NOTE(review): presumably glosses_random_split draws from the seeded RNG — confirm.
child_names = sorted(set(success_utts['target_child_name']))
for name in child_names:
    child_success_utts = success_utts[success_utts['target_child_name'] == name]

    # Hold out 200 utterances per child for validation.
    this_partition_folder = split_gen.get_split_folder('child', name, base_dir = 'data/new_splits')
    val_idxs = split_gen.glosses_random_split(child_success_utts, val_num = 200)

    pooled_data = split_gen.write_data_partition_text(child_success_utts, this_partition_folder, val_idxs)
    # Note: Here, partition = success or not
    # phase = train or validation (this was changed from the original)

    print(name, child_success_utts.shape[0], 'number of examples')
    