In [None]:

import glob
import os
from os.path import join, exists

import pandas as pd


In [6]:
from collections import Counter
from utils import load_splits, split_gen

In [None]:
# Base directory handed to load_eval_data_all as `base_dir`; also globbed for the per-child folders.
BASE_SAVE_PATH = 'eval/new_splits'

In [None]:
def load_eval_data_all(split_name, dataset_name, base_dir):

    """
    Load every evaluation data file of one split/dataset into DataFrames.

    Args:
        split_name: split type forwarded to split_gen.get_split_folder
            (this notebook uses 'all', 'age' and 'child').
        dataset_name: dataset inside the split, forwarded to split_gen.get_split_folder
            (this notebook uses 'all', 'young', 'old' and child names).
        base_dir: root directory forwarded to split_gen.get_split_folder.

    Returns:
        dict mapping each data filename to the DataFrame loaded from
        <split folder>/<filename>.
    """

    phono_filename = 'pvd_utt_glosses_phono_cleaned_inflated.pkl'
    success_utts_filename = 'success_utts.csv'
    yyy_utts_filename = 'yyy_utts.csv'

    data_filenames = [phono_filename, success_utts_filename, yyy_utts_filename]
    this_folder_path = split_gen.get_split_folder(split_name, dataset_name, base_dir)

    def _read_frame(path):
        # NOTE(review): the .pkl file is assumed to be a pickled DataFrame, so it is
        # read with read_pickle instead of read_csv -- confirm against the code that
        # writes it. Unpickling executes arbitrary code: only load trusted files.
        if path.endswith('.pkl'):
            return pd.read_pickle(path)
        return pd.read_csv(path)

    # Iterate over data_filenames (the original referenced an undefined `filenames`).
    return {f: _read_frame(join(this_folder_path, f)) for f in data_filenames}

    

In [3]:
# For making sure that the generated training files are disjoint.

def disjoint_check_age_all(train_text_path, val_text_path):

    """
    Used to check that the finetuning data is correctly split between train and val.
    This applies to the 'all' and 'age' splits.

    The evaluation text is definitely different for all, age because they are from different datasets
        (manually checked)

    Args:
        train_text_path: path to the training text file, one utterance per line.
        val_text_path: path to the validation text file, one utterance per line.

    Returns:
        True when every check passes.

    Raises:
        AssertionError: if a file contains a repeated line, or if any line
            appears in both files.
    """

    def _read_utterances(path):
        # Drop the line terminator so that a final line without a trailing newline
        # still compares equal to the same utterance in the other file.
        with open(path, 'r') as f:
            return [line.rstrip('\n') for line in f]

    train_text = _read_utterances(train_text_path)
    val_text = _read_utterances(val_text_path)

    # Compare the number of distinct lines to the number of lines
    # (the original compared the set itself to an int, which is never equal).
    assert len(set(train_text)) == len(train_text), 'Not all utterances are unique. This is strictly not needed, but the check assumes this to not raise false positive.'
    assert len(set(val_text)) == len(val_text), 'Not all utterances are unique. This is strictly not needed, but the check assumes this to not raise false positive.'

    # 6/25/21 https://docs.python.org/3/library/collections.html
    counts = Counter(train_text + val_text)

    # A repeated sentence inside one file would not by itself be a leak, but it is
    # rejected above so that a count above 1 here can only mean train/val overlap.
    assert all(count == 1 for count in counts.values()), "Items may appear more than once in the union of train and val."

    # Not strictly needed, but combined with the check above it shows that the two
    # files merge into the whole without losing or collapsing any line.
    assert len(counts) == len(train_text) + len(val_text)

    return True
    

In [None]:
# Evaluation DataFrames for the whole dataset, keyed by data filename.
# load_eval_data_all returns a dict keyed by filename, so there is no 'all' key to
# index into; later cells look frames up by filename.
entire_dataset_dict = load_eval_data_all('all', 'all', BASE_SAVE_PATH)

In [12]:
# 1) Disjoint sets for "all" split

all_split_paths = load_splits.load_splits_folder_text('all', 'data/new_splits')
assert len(all_split_paths) == 1

# Use the paths loaded in this cell; the original read `age_split_paths`, which is
# only defined in a later cell, so this failed on a fresh kernel.
disjoint_check_age_all(all_split_paths['all']['train'], all_split_paths['all']['val'])

[]

In [None]:

age_split_paths = load_splits.load_splits_folder_text('age', 'data/new_splits')

# Train and val text must not overlap inside any single age split.
for k, split_paths in age_split_paths.items():
    disjoint_check_age_all(split_paths['train'], split_paths['val'])
    
# 2) Need to check that success and yyy dataframes all correspond to the proper age

young_df_dict = load_eval_data_all('age', 'young', BASE_SAVE_PATH)
old_df_dict = load_eval_data_all('age', 'old', BASE_SAVE_PATH)


# This is in months
# Reduce each comparison with .all() first: a boolean Series has no single truth
# value, so handing the Series themselves to all() raises ValueError.
# NOTE(review): the young check originally read 'target_age'; it now uses
# 'target_child_age' like the old check -- confirm the column name.
assert all((df['target_child_age'] <= 36).all() for df in young_df_dict.values())
assert all((df['target_child_age'] > 36).all() for df in old_df_dict.values())


# 3) Disjoint sets for young, old, and merge to form whole dataset
for k in young_df_dict.keys():
    young_ids = set(young_df_dict[k]['utterance_id'])
    old_ids = set(old_df_dict[k]['utterance_id'])
    assert young_ids.isdisjoint(old_ids), 'young and old share utterance ids in %s' % k

    # DataFrame.equals also compares the index, so both sides are re-indexed after
    # sorting; otherwise the concatenated frame keeps each part's original index.
    this_entire_data = entire_dataset_dict[k].sort_values('utterance_id').reset_index(drop=True)
    # pd.concat takes a list of frames (a second positional argument is `axis`).
    concat_df = pd.concat([young_df_dict[k], old_df_dict[k]]).sort_values('utterance_id').reset_index(drop=True)
    assert this_entire_data.equals(concat_df)

In [None]:

# TODO: Do I want to check this?
#name_split_paths = load_splits.load_splits_folder_text('child', 'data/new_splits')

# Find the child names from the per-child folders under BASE_SAVE_PATH/child.
# NOTE(review): assumes each sub-folder is named exactly after its child and that
# split_gen.get_split_folder('child', name, ...) resolves to it -- confirm.
child_folders = glob.glob(join(BASE_SAVE_PATH, 'child', '*'))
names = sorted(os.path.basename(path) for path in child_folders if os.path.isdir(path))

print(names)

# 4) Data always addresses the relevant child
all_child_data = {}
for name in names:
    this_info_dict = load_eval_data_all('child', name, BASE_SAVE_PATH)
    # Every loaded frame for this child may only contain rows targeting that child.
    for child_df in this_info_dict.values():
        assert all(child_df['target_child_name'] == name)
    all_child_data[name] = this_info_dict

# 5) Check that child data merges to the entire dataset?
for k in entire_dataset_dict.keys():

    # Sort and re-index both sides the same way: DataFrame.equals compares row
    # order and index as well as values, and the concatenated frame would
    # otherwise keep per-child order and per-child indices.
    this_entire_data = entire_dataset_dict[k].sort_values('utterance_id').reset_index(drop=True)
    concat_df = pd.concat([all_child_data[n][k] for n in names]).sort_values('utterance_id').reset_index(drop=True)

    assert concat_df.equals(this_entire_data)