In [2]:

import os
from os.path import join, exists

import pandas as pd
import glob

from utils import load_splits, split_gen, data_cleaning
from utils_child import child_models, child_split_gen

import config

from collections import defaultdict

## Useful functions

In [3]:


def sampled_proper_attributes(phono, utt_id_set):
    
    """
    Ensures that a given df with tokens that are part of a sample follow the proper attributes
    (actual_phonology and model_phonology are populated, and CHI utterance)
    """
    
    df = phono[phono.utterance_id.isin(utt_id_set)]
    
    assert all(df.speaker_code_simple == '[CHI]')
    assert has_phonology(df)
    assert has_success_or_yyy(df) # Also checked for more stringent criteria later (has success and yyy)
    
def check_chi_cgv_present(this_df):
    this_codes = set(this_df.speaker_code)
    assert this_codes.issubset({'CHI', 'FAT', 'MOT'}) and this_codes != {'CHI'}
        
def load_marked_pooled_data(this_split_path):
    return pd.read_pickle(join(this_split_path, 'data_pool_with_phases.pkl'))

def check_disjoint_and_phase_written(split, name, base_dir, which_phase):
    """
    Checks that the phase data indicated per entry corresponds to the written text in the file.
    """
    
    this_split_loc = split_gen.get_split_folder(split, name, base_dir)
    
    if split in {'all', 'age'}:
        
        this_all_loc = split_gen.get_split_folder('all', 'all', base_dir)
        this_pool_data = data_cleaning.drop_errors(load_marked_pooled_data(this_all_loc))
        if name == 'young':
            this_pool_data, _ = split_gen.get_age_split_data(this_pool_data)
        if name == 'old':
            _, this_pool_data = split_gen.get_age_split_data(this_pool_data)
            
    if split == 'child':
        
        # Check that the train/val text files are as expected
        #     they match the right phase as marked in the df
        #     they match the right child
        #     they don't contain errors

        this_pool_data = data_cleaning.drop_errors(load_splits.load_phono())
        this_pool_data = this_pool_data[(this_pool_data.target_child_name == name)]
        this_pool_data = this_pool_data[['utterance_id', 'gloss_with_punct', 'phase_child_finetune']].drop_duplicates()
        
    for phase in ['train', 'val']:
        phase_locs = this_pool_data[this_pool_data[which_phase] == phase]
        with open(join(this_split_loc, f"{phase}.txt"), 'r') as f:
            from_text_text = sorted([l.strip() for l in f.readlines()]) # Get rid of trailing \n
        
        from_df_text = sorted(list(phase_locs['gloss_with_punct']))
        
        assert from_text_text == from_df_text, f'Failed to match phase data for: {split}, {name}, {phase}'
    
        # Extra checks after first iteration
        assert all('xxx' not in s for s in from_text_text)
        assert all('yyy' not in s for s in from_text_text)
        
        assert '[CHI] .\n' not in set(from_text_text)
        assert '[CGV] .\n' not in set(from_text_text)
    
    print(f'Assert passed for {split}, {name}')
    return True


def has_phonology(df):
    
    def non_empty(entry, attribute):
        return any(entry != '')
    
    actual = df.groupby('utterance_id').actual_phonology.agg(lambda x : non_empty(x, 'actual_phonology')).reset_index()
    
    return all(actual.actual_phonology)
    
def has_this(collect, token_type):
    return (token_type in set(collect))

def give_success(df):
    return df.groupby('utterance_id').partition.agg(lambda x : has_this(x, 'success')).reset_index()

def give_yyy(df):
    return df.groupby('utterance_id').partition.agg(lambda x : has_this(x, 'yyy')).reset_index()
    
def has_success(df):
    return all(give_success(df).partition)

def has_yyy(df):
    return all(give_yyy(df).partition)

def has_success_or_yyy(df):
    
    success_df = give_success(df)
    yyy_df = give_yyy(df)
    
    either = success_df.partition | yyy_df.partition
    
    return all(either)


## Loading

In [4]:
child_names = child_models.get_child_names()

all_phono = pd.read_pickle(join(config.prov_dir, 'pvd_all_tokens_phono_for_eval.pkl'))

phases = ['train', 'val', 'eval']

## Checks to ensure fixes from incorrect iteration

In [4]:
# Check that there are no errors in partition == 'success' utterance_id

final_utts_save_path = join(config.prov_csv_dir, 'utts_with_ages.csv')

utts_with_ages = pd.read_csv(final_utts_save_path)

# Find the subsets of partition-based successes and yyy

success_ids = set(all_phono[all_phono.partition == 'success'].utterance_id)
yyy_ids = set(all_phono[all_phono.partition == 'yyy'].utterance_id)

success_phono = all_phono[all_phono.utterance_id.isin(success_ids)]
yyy_phono = all_phono[all_phono.utterance_id.isin(yyy_ids)]

# Check that utterances that are sampled are appropriate -- they are all in the right set of utterance ids

success_ages_utts_id = set(utts_with_ages[utts_with_ages.set == 'success'].utterance_id)
yyy_ages_utts_id = set(utts_with_ages[utts_with_ages.set == 'failure'].utterance_id)

# Check that success partition markings have utterance ids that are all part of utts_with_ages success
# Check the same for yyy

assert success_ids.issubset(success_ages_utts_id)
assert yyy_ids.issubset(yyy_ages_utts_id)

# Check that there are no yyy tokens or xxx tokens in the things marked as success or yyy

success_tokens = set(success_phono['token'])
yyy_tokens = set(yyy_phono['token'])

assert not any(success_phono['token'] == 'yyy')
assert not any(success_phono['token'] == 'xxx')

assert not any(yyy_phono['token'] == 'xxx')

print('Passed')

Passed


## General checks

### Non-Providence data

In [5]:

# Are the training/text files disjoint for non-Providence?

check_disjoint_and_phase_written('all', 'all', config.finetune_dir, 'phase_finetune')
check_disjoint_and_phase_written('age', 'young', config.finetune_dir, 'phase_finetune')
check_disjoint_and_phase_written('age', 'old', config.finetune_dir, 'phase_finetune')


Assert passed for all, all
Assert passed for age, young
Assert passed for age, old


True

In [6]:
# Make sure that [CHI], [CGV] are present in the model inputs
    
folder = split_gen.get_split_folder('all', 'all', config.finetune_dir)

all_df = load_marked_pooled_data(folder)
young_df, old_df = split_gen.get_age_split_data(all_df)

for s, d in config.childes_model_args:
    
    if d == 'old':
        this_df = old_df
    if d == 'young':
        this_df = young_df
    if d == 'all':
        this_df = all_df
            
    check_chi_cgv_present(this_df)

print('Passed')

Passed


### Providence data

In [5]:
# Check proper functioning for the sampling

for n in config.subsamples:
    for phase in phases:
        for this_type in ['yyy', 'success']:
            for name in child_names:
                attr = child_split_gen.get_subsample_key(n, this_type, name)
                avail = len(set(all_phono.utterance_id))

                sel_phono = all_phono[all_phono[attr]]
                num_sel = len(set(sel_phono.utterance_id))
                print(f'For n: over all phases {n}, {num_sel} / {avail}') # Should be approx. 3x the n

                assert set(sel_phono.target_child_name) == {name}
                assert set(sel_phono.partition) == {this_type, 'none'}
                # Above: Is possible to have non-scoreable tokens in a single utterance marked for scoring.
                
                assert set(sel_phono.phase_child_sample) == {phase}
                assert not any(all_phono[attr].isna())

ValueError: Cannot mask with non-boolean array containing NA / NaN values

In [7]:
# Check the text files for disjointedness (necessary because it splits on a different phase label)
# Note this also checks the correctness of train and val partition.

for name in child_names:
    print(f"Verifying: {name}")
    check_disjoint_and_phase_written('child', name, config.finetune_dir, which_phase = 'phase_child_finetune')


Verifying: Alex
Assert passed for child, Alex
Verifying: Ethan
Assert passed for child, Ethan
Verifying: Lily
Assert passed for child, Lily
Verifying: Naima
Assert passed for child, Naima
Verifying: Violet
Assert passed for child, Violet
Verifying: William
Assert passed for child, William


In [8]:

from collections import defaultdict

# Check that train, val, eval are disjoint for the age/all splits.

phase_set = {}

for phase in phases:
    phase_set[phase] = set(all_phono[all_phono['phase_sample'] == phase].transcript_id)

for p1 in phases:
    for p2 in phases:
        if p1 == p2 : continue
        assert not (phase_set[p1] & phase_set[p2]), f'Overlap for {p1} and {p2}'

print('Passed')

# For phase sample, for each of success and yyy,
# there should be a 25/50/50 split on transcript ids for each of these,
# that is about equal for success and yyy

for phase in phases:
    
    print('Size of phase in transcripts', len(set(all_phono[all_phono['phase_sample'] == phase].transcript_id)))
    
    for part_type in ['success', 'yyy']:
        
        success_ids_num = len(set(all_phono[all_phono.transcript_id.isin(phase_set[phase]) & (all_phono.partition == part_type)].transcript_id))
        print(f'\tfor {part_type}, number of unique transcripts:', success_ids_num)


Passed
Size of phase in transcripts 90
	for success, number of unique transcripts: 90
	for yyy, number of unique transcripts: 90
Size of phase in transcripts 90
	for success, number of unique transcripts: 90
	for yyy, number of unique transcripts: 90
Size of phase in transcripts 179
	for success, number of unique transcripts: 179
	for yyy, number of unique transcripts: 179


In [9]:
# Make sure that all of the eval and val data are separate for across_time_samples

all_time_samples = glob.glob(join(config.prov_dir, 'across_time_samples/*'))

val_ids = set(pd.concat([pd.read_csv(path) for path in all_time_samples if '_val' in path]).utterance_id)
eval_ids = set(pd.concat([pd.read_csv(path) for path in all_time_samples if '_eval' in path]).utterance_id)

val_phases = set(all_phono[all_phono.utterance_id.isin(val_ids)].phase_sample)
eval_phases = set(all_phono[all_phono.utterance_id.isin(eval_ids)].phase_sample)

assert val_phases == {'val'}
assert eval_phases == {'eval'}

sampled_proper_attributes(all_phono, val_ids | eval_ids)

print('Passed asserts')

Passed asserts


In [10]:
# For the used ages, check the following:

# there are approx 5000 of them (by utterance id!)
#     some may have fewer because of data sparsity.

# per EACH of successes and yyy
# And, they correspond to the right splits (the one in their name)

for phase in ['val', 'eval']:
    
    print('*'*20, f"Phase: {phase}")
    
    for data_type, data_func in zip(['success', 'yyy'], [load_splits.get_age_success_sample_paths, load_splits.get_age_yyy_sample_paths]):

        print(f'For {data_type}')
        all_paths = data_func(phase = phase)
        all_samples = [ pd.read_csv(p) for p in all_paths ]

        sample_ids = set(pd.concat(all_samples).utterance_id)

        print('Sample shapes')
        for sample in all_samples:
            print(f'\t\tNumber of total utterances in all pool: {sample.shape}')

        this_phono = all_phono[all_phono.utterance_id.isin(sample_ids)]
        if data_type == 'success':
            assert has_success(this_phono)
        else:
            assert has_yyy(this_phono)


******************** Phase: val
For success
Sample shapes
		Number of total utterances in all pool: (1, 2)
		Number of total utterances in all pool: (1511, 2)
		Number of total utterances in all pool: (3186, 2)
		Number of total utterances in all pool: (4892, 2)
		Number of total utterances in all pool: (5000, 2)
		Number of total utterances in all pool: (3303, 2)
		Number of total utterances in all pool: (745, 2)
		Number of total utterances in all pool: (323, 2)
For yyy
Sample shapes
		Number of total utterances in all pool: (1, 2)
		Number of total utterances in all pool: (1521, 2)
		Number of total utterances in all pool: (1890, 2)
		Number of total utterances in all pool: (1412, 2)
		Number of total utterances in all pool: (1149, 2)
		Number of total utterances in all pool: (699, 2)
		Number of total utterances in all pool: (85, 2)
		Number of total utterances in all pool: (11, 2)
******************** Phase: eval
For success
Sample shapes
		Number of total utterances in all pool: 

In [11]:
    
# Make sure:
# The young data is actually young data
# The old data is actually old data

old_ids = set(pd.read_csv(join(split_gen.get_split_folder('age', 'old', config.prov_dir), 'success_utts_beta_5000_val.csv')).utterance_id)
assert all(all_phono[all_phono.utterance_id.isin(old_ids)].target_child_age > config.age_split * 30.5)

young_ids = set(pd.read_csv(join(split_gen.get_split_folder('age', 'young', config.prov_dir), 'success_utts_beta_5000_val.csv')).utterance_id)
assert all(all_phono[all_phono.utterance_id.isin(young_ids)].target_child_age <= config.age_split * 30.5)

print('Passed')


Passed


In [12]:
# For the beta samples,
# they are all successes
# there is approx 5000 of them (by utterance id!)
# they all have phase == 'val'

def check_beta_samples(arg_set, phase_label):
    
    print(f'Checking for {phase_label}')
    
    all_beta_samples = [
        pd.read_csv(join(split_gen.get_split_folder(s, d, config.prov_dir), 'success_utts_beta_5000_val.csv'))
        for s, d in arg_set
    ]

    beta_ids = set(pd.concat(all_beta_samples).utterance_id)
    sel_phono = all_phono[all_phono.utterance_id.isin(beta_ids)]
    beta_phases = set(sel_phono[phase_label])

    assert beta_phases == {'val'}
    assert has_success(sel_phono)
    
    # You should have at least one success per utterance, but not all tokens have to be marked as a success.

    print('Passed asserts')

    for (s, d), (beta_sample) in zip(arg_set, all_beta_samples):
        print('Dataset', s, d)
        print(f'\tNumber of utterances: {len(set(beta_sample.utterance_id))}')
        
    sampled_proper_attributes(all_phono, beta_ids)
    
check_beta_samples(config.childes_model_args, 'phase_sample')
check_beta_samples([('child', name) for name in child_names], 'phase_child_sample')


Checking for phase_sample
Passed asserts
Dataset all all
	Number of utterances: 5000
Dataset age young
	Number of utterances: 5000
Dataset age old
	Number of utterances: 4371
Checking for phase_child_sample
Passed asserts
Dataset child Alex
	Number of utterances: 2745
Dataset child Ethan
	Number of utterances: 1938
Dataset child Lily
	Number of utterances: 4141
Dataset child Naima
	Number of utterances: 3423
Dataset child Violet
	Number of utterances: 2115
Dataset child William
	Number of utterances: 2944


In [13]:
# Check if my samples are sufficiently across time (beta)

all_beta_args = config.childes_model_args + [('child', name) for name in child_names]

for (s, d) in all_beta_args:
    
    print(f'Analyzing across time for split: {s}, {d}')
    this_sample = set(pd.read_csv(join(split_gen.get_split_folder(s, d, config.prov_dir), 'success_utts_beta_5000_val.csv')).utterance_id)
    
    # Select utterances in the beta sample
    sel = all_phono[all_phono.utterance_id.isin(this_sample)][['utterance_id', 'year']].drop_duplicates()
    
    # Year actually means counts of certain years, not the years themselves.
    counts = sel.year.value_counts().to_frame().sort_values('year')
    
    print(counts)
    

Analyzing across time for split: all, all
     year
0.5     1
4.0    74
3.5   162
1.0   406
3.0   830
1.5   831
2.0  1249
2.5  1447
Analyzing across time for split: age, young
     year
1.0   498
1.5  1056
2.0  1646
2.5  1800
Analyzing across time for split: age, old
     year
4.0   323
3.5   745
3.0  3303
Analyzing across time for split: child, Alex
     year
1.0    19
1.5   381
2.0   446
2.5   871
3.0  1028
Analyzing across time for split: child, Ethan
     year
0.5    29
2.5   362
2.0   431
1.5   499
1.0   617
Analyzing across time for split: child, Lily
     year
1.0    50
1.5   621
3.5   690
2.0   740
2.5   975
3.0  1065
Analyzing across time for split: child, Naima
     year
3.5   272
1.0   425
3.0   593
2.0   656
2.5   705
1.5   772
Analyzing across time for split: child, Violet
     year
1.0    11
3.0   135
1.5   235
3.5   464
2.5   615
2.0   655
Analyzing across time for split: child, William
     year
1.0    63
1.5   305
2.5   570
2.0   771
3.0  1235


In [14]:
# Make sure the child train/val/eval data is separate within child
# Note this no longer requires/uses the constraint that child val/eval matches the overall val/eval.


for attr, phase_set in zip(['phase_child_sample', 'phase_child_finetune'], [phases[:], ['train', 'val']]):
                                                                            
    for name in child_names:

        child_pool = all_phono[all_phono.target_child_name == name]
        ids = {}
        
        for phase in phase_set:
            ids[phase] = set(child_pool[child_pool[attr] == phase].utterance_id)

        for p1 in phase_set:
            for p2 in phase_set:
                if p1 == p2: continue
                assert len(ids[p1] & ids[p2]) == 0, f'{attr}, {p1}, {p2}'
    
print('Asserts passed.')
        

Asserts passed.


In [15]:
# Other quick checks

# Size of the train relative to val, eval for Providence
# For all/all, age/old, age/young

phase_data = {}

for phase_type, phase_set in zip(['phase_child_sample', 'phase_child_finetune'], [phases[:], ['train', 'val']]):
    for phase in phase_set:
        print(f'Size of phase: {phase}', len(set(all_phono[all_phono[phase_type] == phase].transcript_id)))
    

# Make sure that there is a spread of age sampling for test
# And also val

for name in child_names:
    
    name_pool = all_phono[all_phono.target_child_name == name]
    
    print(f'For child: {name}')
    for phase in ['eval', 'val']:
        
        print(f'\tFor phase: {phase}')
        this_pool = name_pool[name_pool.phase_child_sample == phase]
        # But, you need this per transcript.
        
        all_ages = data_cleaning.get_years(this_pool)
        
        # For the val/eval samples 
        # There is about the right number of transcripts for eval and val (print out the numbers)

        for age in all_ages: 
            
            this_sel_df = this_pool[this_pool.year == age]
            
            get_num_transcripts = lambda df : len(set(df.transcript_id))
            all_num = get_num_transcripts(this_sel_df)
            success_num = get_num_transcripts(this_sel_df[this_sel_df.partition == "success"])
            yyy_num = get_num_transcripts(this_sel_df[this_sel_df.partition == "yyy"])
            
            # Request one transcript with both present.
            # Expected behavior below for revised sampling with success and yyy present constraint.
            
            print(f'\t\tFor age: {age}, Number of transcripts: {all_num}')
            
            assert success_num == yyy_num == all_num
            assert 1 <= all_num <= 3
         
print('Passed automatic checks, manual behavior verifications are above.')
            

Size of phase: train 181
Size of phase: val 86
Size of phase: eval 92
Size of phase: train 183
Size of phase: val 89
For child: Alex
	For phase: eval
		For age: 1.0, Number of transcripts: 2
		For age: 1.5, Number of transcripts: 3
		For age: 2.0, Number of transcripts: 3
		For age: 2.5, Number of transcripts: 3
		For age: 3.0, Number of transcripts: 3
	For phase: val
		For age: 1.0, Number of transcripts: 1
		For age: 1.5, Number of transcripts: 3
		For age: 2.0, Number of transcripts: 3
		For age: 2.5, Number of transcripts: 3
		For age: 3.0, Number of transcripts: 3
For child: Ethan
	For phase: eval
		For age: 0.5, Number of transcripts: 1
		For age: 1.0, Number of transcripts: 3
		For age: 1.5, Number of transcripts: 3
		For age: 2.0, Number of transcripts: 3
		For age: 2.5, Number of transcripts: 3
	For phase: val
		For age: 0.5, Number of transcripts: 1
		For age: 1.0, Number of transcripts: 3
		For age: 1.5, Number of transcripts: 3
		For age: 2.0, Number of transcripts: 3
		For

### Child work

In [16]:
# The phases are all disjoint

phase_dict = defaultdict(dict)

for phase_type, phase_set in zip(['phase_child_sample', 'phase_child_finetune'], [phases[:], ['train', 'val']]):
    
    for phase in phase_set:
        phase_dict[phase_type][phase] = set(all_phono[all_phono[phase_type] == phase].transcript_id)
        
    # train/val/eval in phase_child_sample, as well as phase_child_finetune
    this_type_dict = phase_dict[phase_type]
    for p1 in phase_set:
        for p2 in phase_set:
            if p1 == p2: continue
            assert not (this_type_dict[p1] & this_type_dict[p2])

# The finetune phases are all disjoint from val/eval phases in sample
# The train_sample phase is in the finetune_phase

finetune_disjoint = phase_dict['phase_child_finetune']['train'] | phase_dict['phase_child_finetune']['val']
sample_disjoint = phase_dict['phase_child_sample']['eval']


# 7/25/21: https://www.geeksforgeeks.org/issubset-in-python/
for phase in ['train', 'val']:
    valid_for_finetune_partial = (phase_dict['phase_child_sample'][phase]) & set(data_cleaning.drop_errors(all_phono).transcript_id)
    assert valid_for_finetune_partial.issubset(phase_dict['phase_child_finetune'][phase])
# end cite

assert len(finetune_disjoint & sample_disjoint) == 0
    
print('Passed')

Passed


In [17]:
for name in child_names:
    for phase in ['train', 'val']:
        # Make sure there is cgv, chi in both phases for child data finetuning
        rel_df = all_phono[(all_phono.target_child_name == name) & (all_phono.phase_child_finetune == phase)]
        check_chi_cgv_present(rel_df)
        
print('Passed')

Passed


In [18]:
# Write a check to ensure that if phase_child_sample (or phase_sample) then
# every transcript contains both at least one yyy and at least one success

def has_success_and_yyy(this_attr):

    for phase in ['train', 'val', 'test']:
        
        has_phase = all_phono[all_phono[this_attr] == phase]

        has_success_phase = set(has_phase[has_phase.partition == 'success'].transcript_id)
        has_yyy_phase = set(has_phase[has_phase.partition == 'yyy'].transcript_id)
        gen_phase = set(has_phase.transcript_id)

        # Note: no constraint on at least one yyy and at least one success for the finetune data.

        assert has_success_phase == has_yyy_phase == gen_phase

        print(f'Passed for {this_attr}, phase: {phase}')

has_success_and_yyy('phase_child_sample')
has_success_and_yyy('phase_sample')

print('Passed')

Passed for phase_child_sample, phase: train
Passed for phase_child_sample, phase: val
Passed for phase_child_sample, phase: test
Passed for phase_sample, phase: train
Passed for phase_sample, phase: val
Passed for phase_sample, phase: test
Passed


In [19]:
# Make sure that the ages are correctly sampled across time.

all_time_samples = glob.glob(join(config.prov_dir, 'across_time_samples/*'))

for year in data_cleaning.get_years(all_phono):
    
    age_ids = set(pd.concat([pd.read_csv(path) for path in all_time_samples if str(year) in path]).utterance_id)
    
    assert set(all_phono[all_phono.utterance_id.isin(age_ids)].year) == {year}


print('Passed')

Passed


In [20]:
from datetime import datetime
print(datetime.today())

2021-08-02 08:01:45.614089
