In [2]:
# 7/22/21: https://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython

%load_ext autoreload
%autoreload 2

# end cite

In [3]:

import os
from os.path import join, exists

import pandas as pd
import glob

from utils import load_splits, split_gen, data_cleaning
from utils_child import child_models

import config

from collections import defaultdict

## Useful functions

In [4]:


def sampled_proper_attributes(phono, utt_id_set):
    
    """
    Ensures that a given df with tokens that are part of a sample follow the proper attributes
    (actual_phonology and model_phonology are populated, and CHI utterance)
    """
    
    df = phono[phono.utterance_id.isin(utt_id_set)]
    
    assert all(df.speaker_code_simple == '[CHI]')
    assert has_phonology(df)
    assert has_success_or_yyy(df)
    
def check_chi_cgv_present(this_df):
    this_codes = set(this_df.speaker_code)
    assert this_codes.issubset({'CHI', 'FAT', 'MOT'}) and this_codes != {'CHI'}
        
def load_marked_pooled_data(this_split_path):
    return pd.read_pickle(join(this_split_path, 'data_pool_with_phases.pkl'))

def check_disjoint_and_phase_written(split, name, base_dir, which_phase = 'phase'):
    """
    Checks that the phase data indicated per entry corresponds to the written text in the file.
    """
    
    this_split_loc = split_gen.get_split_folder(split, name, base_dir)
    
    if split in {'all', 'age'}:
        
        this_all_loc = split_gen.get_split_folder('all', 'all', base_dir)
        this_pool_data = load_marked_pooled_data(this_all_loc)
        if name == 'young':
            this_pool_data, _ = split_gen.get_age_split_data(this_pool_data)
        if name == 'old':
            _, this_pool_data = split_gen.get_age_split_data(this_pool_data)
            
    #if split == 'child':
    #    this_pool_data = load_splits.load_phono()
    #    this_pool_data = this_pool_data[(this_pool_data.target_child_name == name) & (this_pool_data.speaker_code == 'CHI')]
        
    for phase in ['train', 'val']:
        phase_locs = this_pool_data[this_pool_data[which_phase] == phase]
        with open(join(this_split_loc, f"{phase}.txt"), 'r') as f:
            from_text_text = sorted([l.strip() for l in f.readlines()]) # Get rid of trailing \n
        
        from_df_text = sorted(list(phase_locs['gloss_with_punct']))
        
        assert from_text_text == from_df_text, f'Failed to match phase data for: {split}, {name}, {phase}'
    
    print(f'Assert passed for {split}, {name}')
    return True


def has_phonology(df):
    
    def non_empty(entry, attribute):
        return any(entry != '')
    actual = df.groupby('utterance_id').actual_phonology.agg(lambda x : non_empty(x, 'actual_phonology')).reset_index()
    #model = df.groupby('utterance_id').model_phonology.agg(lambda x : non_empty(x, 'model_phonology')).reset_index()
    
    return all(actual.actual_phonology)# & model.model_phonology) 
    
def has_this(collect, token_type):
    return (token_type in set(collect))

def give_success(df):
    return df.groupby('utterance_id').partition.agg(lambda x : has_this(x, 'success')).reset_index()

def give_yyy(df):
    return df.groupby('utterance_id').partition.agg(lambda x : has_this(x, 'yyy')).reset_index()
    
def has_success(df):
    return all(give_success(df).partition)

def has_yyy(df):
    return all(give_yyy(df).partition)

def has_success_or_yyy(df):
    
    success_df = give_success(df)
    yyy_df = give_yyy(df)
    
    either = success_df.partition | yyy_df.partition
    
    return all(either)


## Checks

In [14]:
child_names = child_models.get_child_names()

all_phono = pd.read_pickle(join(config.prov_dir, 'pvd_all_tokens_phono_for_eval.pkl'))

phases = ['train', 'val', 'eval']

### Non-Providence data

In [188]:

# Are the training/text files disjoint for non-Providence?

check_disjoint_and_phase_written('all', 'all', config.finetune_dir, which_phase = 'phase_finetune')
check_disjoint_and_phase_written('age', 'young', config.finetune_dir, which_phase = 'phase_finetune')
check_disjoint_and_phase_written('age', 'old', config.finetune_dir, which_phase = 'phase_finetune')


Assert passed for all, all
Assert passed for age, young
Assert passed for age, old


True

In [194]:
# Make sure that [CHI], [CGV] are present in the model inputs
    
folder = split_gen.get_split_folder('all', 'all', config.finetune_dir)

all_df = load_marked_pooled_data(folder)
young_df, old_df = split_gen.get_age_split_data(all_df)

for s, d in config.childes_model_args:
    
    if d == 'old':
        this_df = old_df
    if d == 'young':
        this_df = young_df
    if d == 'all':
        this_df = all_df
            
    check_chi_cgv_present(this_df)

print('Passed')

{'FAT', 'CHI', 'MOT'}
Child tags: 1640520 CGV tags: 2319432
{'FAT', 'CHI', 'MOT'}
Child tags: 985752 CGV tags: 1560943
{'FAT', 'CHI', 'MOT'}
Child tags: 618718 CGV tags: 639542
Passed


### Providence data

In [13]:
# Check the text files for disjointedness (necessary because it splits on a different phase label)
# Note this also checks the correctness of train and val partition.

# The train/val text files are as expected
#     they match the right phase as marked in the df (determine if this is hard to check with the current function)
#     they match the right child
#     they don't contain errors (you can check this via the all_tokens_phono if possible)
#     they are about the right size (80/20-ish it won't be exact), on transcript ids.

for name in child_names:
    
    print(f"Verifying: {name}")
    check_disjoint_and_phase_written('child', name, config.finetune_dir, which_phase = 'phase_child_finetune')


Verifying: William


UnboundLocalError: local variable 'this_pool_data' referenced before assignment

In [6]:

from collections import defaultdict

# Check that train, val, eval are disjoint for the age/all splits.

phase_set = {}

for phase in phases:
    phase_set[phase] = set(all_phono[all_phono['phase_sample'] == phase].transcript_id)

for p1 in phases:
    for p2 in phases:
        if p1 == p2 : continue
        assert not (phase_set[p1] & phase_set[p2]), f'Overlap for {p1} and {p2}'

print('Passed')

# For phase sample, for each of success and yyy,
# there should be a 25/50/50 split on transcript ids for each of these,
# that is about equal for success and yyy

for phase in phases:
    
    print('Size of phase in transcripts', len(set(all_phono[all_phono['phase_sample'] == phase].transcript_id)))
    
    for part_type in ['success', 'yyy']:
        
        success_ids_num = len(set(all_phono[all_phono.transcript_id.isin(phase_set[phase]) & (all_phono.partition == part_type)].transcript_id))
        print(f'\tfor {part_type}, number of unique transcripts:', success_ids_num)

assert False, "Shouldn't the size of transcripts id be the same for yyy and success?"

Passed
Size of phase in transcripts 92
	for success, number of unique transcripts: 91
	for yyy, number of unique transcripts: 88
Size of phase in transcripts 90
	for success, number of unique transcripts: 90
	for yyy, number of unique transcripts: 87
Size of phase in transcripts 182
	for success, number of unique transcripts: 179
	for yyy, number of unique transcripts: 176


AssertionError: Shouldn't the size of transcripts id be the same for yyy and success?

In [7]:
# Make sure that all of the eval and val data are separate for across_time_samples

all_time_samples = glob.glob(join(config.prov_dir, 'across_time_samples/*'))

val_ids = set(pd.concat([pd.read_csv(path) for path in all_time_samples if '_val' in path]).utterance_id)
eval_ids = set(pd.concat([pd.read_csv(path) for path in all_time_samples if '_eval' in path]).utterance_id)

val_phases = set(all_phono[all_phono.utterance_id.isin(val_ids)].phase_sample)
eval_phases = set(all_phono[all_phono.utterance_id.isin(eval_ids)].phase_sample)

assert val_phases == {'val'}
assert eval_phases == {'eval'}

sampled_proper_attributes(all_phono, val_ids | eval_ids)

print('Passed asserts')

Passed asserts


In [8]:
# For the used ages, check the following:

# there are approx 5000 of them (by utterance id!)
#     some may have fewer because of data sparsity.

# per EACH of successes and yyy
# And, they correspond to the right splits (the one in their name)

for phase in ['val', 'eval']:
    
    print('*'*20, f"Phase: {phase}")
    
    for data_type, data_func in zip(['success', 'yyy'], [load_splits.get_age_success_sample_paths, load_splits.get_age_yyy_sample_paths]):

        print(f'For {data_type}')
        all_paths = data_func(phase = phase)
        all_samples = [ pd.read_csv(p) for p in all_paths ]

        sample_ids = set(pd.concat(all_samples).utterance_id)

        print('Sample shapes')
        for sample in all_samples:
            print(f'\t{sample.shape}')
            print(f'\t\tNumber of total transcripts in all pool: {sample.shape}')

        this_phono = all_phono[all_phono.utterance_id.isin(sample_ids)]
        if data_type == 'success':
            assert has_success(this_phono)
        else:
            assert has_yyy(this_phono)

# Age 3.5 can be empty
# but on the run where it was observed,
# it seems that there are only 13 transcripts -- in a development run 10 were assigned to eval
# and 3 were assigned to train.

******************** Phase: val
For success
Sample shapes
	(1, 2)
		Number of total transcripts in all pool: (1, 2)
	(2316, 2)
		Number of total transcripts in all pool: (2316, 2)
	(5000, 2)
		Number of total transcripts in all pool: (5000, 2)
	(4113, 2)
		Number of total transcripts in all pool: (4113, 2)
	(5000, 2)
		Number of total transcripts in all pool: (5000, 2)
	(5000, 2)
		Number of total transcripts in all pool: (5000, 2)
	(699, 2)
		Number of total transcripts in all pool: (699, 2)
	(379, 2)
		Number of total transcripts in all pool: (379, 2)
For yyy
Sample shapes
	(1, 2)
		Number of total transcripts in all pool: (1, 2)
	(1343, 2)
		Number of total transcripts in all pool: (1343, 2)
	(1060, 2)
		Number of total transcripts in all pool: (1060, 2)
	(493, 2)
		Number of total transcripts in all pool: (493, 2)
	(669, 2)
		Number of total transcripts in all pool: (669, 2)
	(262, 2)
		Number of total transcripts in all pool: (262, 2)
	(19, 2)
		Number of total transcripts in all 

In [216]:
    
# Make sure:
# The young data is actually young data
# The old data is actually old data

old_ids = set(pd.read_csv(join(split_gen.get_split_folder('age', 'old', config.prov_dir), 'success_utts_beta_5000_val.csv')).utterance_id)
assert all(all_phono[all_phono.utterance_id.isin(old_ids)].target_child_age > config.age_split * 30.5)

young_ids = set(pd.read_csv(join(split_gen.get_split_folder('age', 'young', config.prov_dir), 'success_utts_beta_5000_val.csv')).utterance_id)
assert all(all_phono[all_phono.utterance_id.isin(young_ids)].target_child_age <= config.age_split * 30.5)

print('Passed')


Passed


In [217]:
# For the beta samples,
# they are all successes
# there is approx 5000 of them (by utterance id!)
# they all have phase == 'val'


# The problem exists both in the general and child versions of the function.


def check_beta_samples(arg_set, phase_label):
    
    print(f'Checking for {phase_label}')
    
    all_beta_samples = [
        pd.read_csv(join(split_gen.get_split_folder(s, d, config.prov_dir), 'success_utts_beta_5000_val.csv'))
        for s, d in arg_set
    ]

    beta_ids = set(pd.concat(all_beta_samples).utterance_id)
    sel_phono = all_phono[all_phono.utterance_id.isin(beta_ids)]
    beta_phases = set(sel_phono[phase_label])

    assert beta_phases == {'val'}
    assert has_success(sel_phono)
    # You should have at least one success per utterance, but not all tokens have to be marked as a success.

    print('Passed asserts')

    for beta_sample in all_beta_samples:
        print(len(set(beta_sample.utterance_id)))
        
    sampled_proper_attributes(all_phono, beta_ids)
    
check_beta_samples(config.childes_model_args, 'phase_sample')
check_beta_samples([('child', name) for name in child_names], 'phase_child_sample')


Checking for phase_sample
Passed asserts
5000
5000
5000
Checking for phase_child_sample
Passed asserts
1615
931
1807
1038
2517
989


In [218]:
# Make sure the child train/val/eval data is separate within child
# Note this no longer requires/uses the constraint that child val/eval matches the overall val/eval.


for attr, phase_set in zip(['phase_child_sample', 'phase_child_finetune'], [phases[:], ['train', 'val']]):
                                                                            
    for name in child_names:

        child_pool = all_phono[all_phono.target_child_name == name]
        ids = {}
        
        for phase in phase_set:
            ids[phase] = set(child_pool[child_pool[attr] == phase].utterance_id)

        for p1 in phase_set:
            for p2 in phase_set:
                if p1 == p2: continue
                assert len(ids[p1] & ids[p2]) == 0, f'{attr}, {p1}, {p2}'
    
print('Asserts passed.')
        

Asserts passed.


In [219]:
# Other quick checks

# Size of the train relative to val, eval for Providence
# For all/all, age/old, age/young

phase_data = {}

for phase_type, phase_set in zip(['phase_child_sample', 'phase_child_finetune'], [phases[:], ['train', 'val']]):
    for phase in phase_set:
        print(f'Size of phase: {phase}', len(set(all_phono[all_phono[phase_type] == phase].transcript_id)))
    

# Make sure that there is a spread of age sampling for test
# And also val

print()

for name in child_names:
    
    name_pool = all_phono[all_phono.target_child_name == name]
    print(f'For child: {name}, years available: {data_cleaning.get_years(name_pool)}')
    
    for phase in ['eval', 'val']:
        this_pool = name_pool[name_pool.phase_child_sample == phase]
        # But, you need this per transcript.
        
        all_ages = data_cleaning.get_years(this_pool)
        print(f'For phase: {phase}, ages available: {all_ages}')
        
        # For the val/eval samples 
        # There is about the right number of transcripts for eval and val (print out the numbers)

        for age in all_ages: 
            this_sel_df = this_pool[this_pool.year == age]
            
            get_num_transcripts = lambda df : len(set(df.transcript_id))
            all_num = get_num_transcripts(this_sel_df)
            success_num = get_num_transcripts(this_sel_df[this_sel_df.partition == "success"])
            yyy_num = get_num_transcripts(this_sel_df[this_sel_df.partition == "yyy"])
            
            assert 1 <= success_num <= 2
            assert 1 <= yyy_num <= 2
            
            # Although one transcript was requested per success and yyy,
            # it's possible that more than one will be printed below
            # if that transcript also happened to have the other type in it.
            
            print(f'\tNumber of transcripts for this age, all: {age}, {all_num}')
            print(f'\t\tNumber of transcripts for this age, successes: {age}, {success_num}')
            print(f'\t\tNumber of transcripts for this age, yyy: {age}, {yyy_num}')

# Make sure that 0.5 and 4.0 were dropped from the models across time (do this manually)
# Done

# This should be sufficient for Providence checks.

assert False, "Figure out the transcript 1 transcript issue"

Size of phase: train 279
Size of phase: val 33
Size of phase: eval 35
Size of phase: train 302
Size of phase: val 62

For child: Lily, years available: [1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0]
For phase: eval, ages available: [1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0]
	Number of transcripts for this age, all: 1.0, 1
		Number of transcripts for this age, successes: 1.0, 1
		Number of transcripts for this age, yyy: 1.0, 1
	Number of transcripts for this age, all: 1.5, 1
		Number of transcripts for this age, successes: 1.5, 1
		Number of transcripts for this age, yyy: 1.5, 1
	Number of transcripts for this age, all: 2.0, 1
		Number of transcripts for this age, successes: 2.0, 1
		Number of transcripts for this age, yyy: 2.0, 1
	Number of transcripts for this age, all: 2.5, 1
		Number of transcripts for this age, successes: 2.5, 1
		Number of transcripts for this age, yyy: 2.5, 1
	Number of transcripts for this age, all: 3.0, 1
		Number of transcripts for this age, successes: 3.0, 1
		Number of transc

For child: William, years available: [1.0, 1.5, 2.0, 2.5, 3.0]
For phase: eval, ages available: [1.0, 1.5, 2.0, 2.5, 3.0]
	Number of transcripts for this age, all: 1.0, 1
		Number of transcripts for this age, successes: 1.0, 1
		Number of transcripts for this age, yyy: 1.0, 1
	Number of transcripts for this age, all: 1.5, 1
		Number of transcripts for this age, successes: 1.5, 1
		Number of transcripts for this age, yyy: 1.5, 1
	Number of transcripts for this age, all: 2.0, 1
		Number of transcripts for this age, successes: 2.0, 1
		Number of transcripts for this age, yyy: 2.0, 1
	Number of transcripts for this age, all: 2.5, 1
		Number of transcripts for this age, successes: 2.5, 1
		Number of transcripts for this age, yyy: 2.5, 1
	Number of transcripts for this age, all: 3.0, 1
		Number of transcripts for this age, successes: 3.0, 1
		Number of transcripts for this age, yyy: 3.0, 1
For phase: val, ages available: [1.0, 1.5, 2.0, 2.5, 3.0]
	Number of transcripts for this age, all: 1.0

AssertionError: Figure out the transcript 1 transcript issue

In [33]:
# Check that transcripts are available for Naima 

# Why is this?


child_pool = all_phono[(all_phono.target_child_name == 'Naima')
                      & (all_phono.partition == 'yyy')
                      & (all_phono.year == 2.0)]


{42426, 42427, 42428, 42429, 42430, 42431, 42432, 42433, 42434, 42435, 42436, 42437, 42438, 42439, 42440, 42441, 42442, 42443, 42444, 42445, 42446, 42447, 42448, 42449}
24


In [347]:
# Notes to self;
# Naima's transcript didn't sample w/o replacement between successes and yyy

# why is 2.0 low for Naima? Is it likely that sampling w/o replacement led to collisions between successes and yyy for the following:

# Naima 2.0: success gives 24 utterance ids
# Naima 2.0: failure gives 24 utterance ids
# Violet 2.5: success: 11, yyy is 11 -> this is somewhat unlikely.

# Lily 4.0 (4.0 is correct, only one transcript)
# Lily 3.5 is ~5 transcripts so it's somewhat likely to sample w/o replacement.
# Can you check if it's really w/o replacement? 


# Note these numbers have changed because of re-split
print()


### Child work

In [11]:
# The phases are all disjoint

phase_dict = defaultdict(dict)

for phase_type, phase_set in zip(['phase_child_sample', 'phase_child_finetune'], [phases[:], ['train', 'val']]):
    
    for phase in phase_set:
        phase_dict[phase_type][phase] = set(all_phono[all_phono[phase_type] == phase].transcript_id)
        
    # train/val/eval in phase_child_sample, as well as phase_child_finetune
    this_type_dict = phase_dict[phase_type]
    for p1 in phase_set:
        for p2 in phase_set:
            if p1 == p2: continue
            assert not (this_type_dict[p1] & this_type_dict[p2])

# The finetune phases are all disjoint from val/eval phases in sample
# The train_sample phase is in the finetune_phase

finetune_disjoint = phase_dict['phase_child_finetune']['train'] | phase_dict['phase_child_finetune']['val']
sample_disjoint = phase_dict['phase_child_sample']['val'] | phase_dict['phase_child_sample']['eval']


# 7/25/21: https://www.geeksforgeeks.org/issubset-in-python/
valid_for_finetune_partial = (phase_dict['phase_child_sample']['train']) & set(all_phono[~all_phono.contains_error])
assert valid_for_finetune_partial.issubset(phase_dict['phase_child_finetune']['train'])
# end cite

assert len(finetune_disjoint & sample_disjoint) == 0
    
print('Passed')

Passed


In [12]:
for name in child_names:
    for phase in ['train', 'val']:
        # Make sure there is cgv, chi in both phases for child data finetuning
        rel_df = all_phono[(all_phono.target_child_name == name) & (all_phono.phase_child_finetune == phase)]
        check_chi_cgv_present(rel_df)
        
print('Passed')

Passed


In [19]:
# Write a check to ensure that if phase_child_sample (or phase_sample) then
# every transcript contains both at least one yyy and at least one success

# How to check if something is not nan, etc? drop nan can be complex?
# Just give all of the possibilties

has_phase = all_phono[all_phono.phase_child_sample.isin(phases)]

has_success_phase = set(has_phase[has_phase.partition == 'success'].transcript_id)
has_yyy_phase = set(has_phase[has_phase.partition == 'yyy'].transcript_id)
gen_phase = set(has_phase.transcript_id)

# Note: no constraint on at least one yyy and at least one success for the finetune data.

assert (has_success_phase & has_yyy_phase & gen_phase) == gen_phase

print('Passed')


Passed


In [None]:
# Think about scoreability checks for child scoring, other than beta.

In [None]:
# Check if my samples are sufficiently across time (beta)

