In [2]:

import glob
import importlib
import os
from os.path import exists, join

import pandas as pd

import config
from utils import load_splits, split_gen

In [4]:
# Base directory handed to the load_splits.load_eval_data_all calls in this notebook.
BASE_SAVE_PATH = 'eval/new_splits'

In [13]:
def load_marked_pooled_data(this_split_path):
    """
    Read the phase-marked data pool stored in a split folder.

    Args:
        this_split_path: folder that contains 'data_pool_with_phases.csv'.

    Returns:
        The contents of that CSV as a pandas DataFrame.
    """
    pool_csv_path = join(this_split_path, 'data_pool_with_phases.csv')
    marked_pool = pd.read_csv(pool_csv_path)
    return marked_pool

def check_disjoint_and_phase_written(split, name, base_dir):
    """
    Checks that the phase data indicated per entry corresponds to the written text in the file.
    By nature of the phase marks the data pool will be split disjointly.

    For each of the 'train' and 'val' phases, the sorted 'gloss_with_punct' values of the
    pool rows marked with that phase must equal the sorted, whitespace-stripped lines of
    '<phase>.txt' inside the split folder.

    Args:
        split: split type, forwarded to split_gen.get_split_folder.
        name: split name, forwarded to split_gen.get_split_folder.
        base_dir: base directory, forwarded to split_gen.get_split_folder.

    Returns:
        True when every check passes.

    Raises:
        AssertionError: if, for the 'child' split, a 'yyy' gloss is marked with a phase
            other than 'val', or if the text file of a phase does not match the pool rows
            marked with that phase.
    """

    this_split_loc = split_gen.get_split_folder(split, name, base_dir)
    this_pool_data = load_marked_pooled_data(this_split_loc)

    if split == 'child':
        # Don't consider the yyy, which are not written to the text files.
        # NOTE(review): `data_cleaning` is not imported in the visible notebook and the return
        # value of drop_errors is discarded -- confirm the module is in scope and that
        # drop_errors filters the frame in place.
        data_cleaning.drop_errors(this_pool_data)

        yyy_phases = this_pool_data.loc[this_pool_data.gloss == 'yyy', 'phase']
        # Reduce with .all(): asserting on the boolean Series itself raises
        # "ValueError: The truth value of a Series is ambiguous" instead of checking anything.
        assert (yyy_phases == 'val').all(), f"Found 'yyy' glosses outside of val for: {split}, {name}"

    for phase in ['train', 'val']:
        phase_locs = this_pool_data[this_pool_data['phase'] == phase]
        with open(join(this_split_loc, f"{phase}.txt"), 'r') as f:
            # strip() removes the trailing \n (and any other surrounding whitespace).
            from_text_text = sorted(line.strip() for line in f)

        from_df_text = sorted(phase_locs['gloss_with_punct'])

        assert from_text_text == from_df_text, f'Failed to match phase data for: {split}, {name}, {phase}'

    print(f'Assert passed for {split}, {name}')
    return True

In [6]:
# Reload load_splits so that edits made to the module since it was first imported
# take effect without restarting the kernel (importlib is imported in the top import cell).
importlib.reload(load_splits)

# Evaluation data for the 'all' split; later cells index this by key and use the values as DataFrames.
entire_dataset_dict = load_splits.load_eval_data_all('all', 'all', BASE_SAVE_PATH)

In [None]:
# TODO: check that the evaluation samples are all <= 36 months for the young split and > 36 months for the old split

In [7]:
# 1) Disjoint sets for "all" split

# Earlier sanity check, kept disabled:
#all_split_paths = load_splits.load_splits_folder_text('all', 'data/new_splits')
#assert len(all_split_paths) == 1

# Left as the cell's final expression so that its return value is displayed.
check_disjoint_and_phase_written(split='all', name='all', base_dir='data/new_splits')


  exec(code_obj, self.user_global_ns, self.user_ns)


Assert passed for all, all


True

In [22]:

# Check for correct age splitting within the evaluation data
young_df_dict = load_splits.load_eval_data_all('age', 'young', BASE_SAVE_PATH)
old_df_dict = load_splits.load_eval_data_all('age', 'old', BASE_SAVE_PATH)

# target_child_age is divided by 30.5 to go from days to months, mirroring the comparison
# in the splitting code: young is at most 36 months, old is strictly above 36 months.
assert all(
    (young_df['target_child_age'] / 30.5 <= 36).all()
    for young_df in young_df_dict.values()
)
assert all(
    (old_df['target_child_age'] / 30.5 > 36).all()
    for old_df in old_df_dict.values()
)

print("asserts passed")

asserts passed


In [159]:
# Disjoint sets for young, old, and merge to form whole dataset in the evaluation data.
# NOTE: this cell was observed to run really slowly.

for k in young_df_dict.keys():

    this_entire_data = entire_dataset_dict[k]

    # Rows with a missing target_child_age are left out of the full dataset before comparing
    # it with the young + old frames.
    # 7/1/21: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.dropna.html
    # This makes a copy of the data.
    this_entire_data_filtered = this_entire_data.dropna(subset=['target_child_age'])

    concat_df = pd.concat([young_df_dict[k], old_df_dict[k]])

    # Make sure there are no internal duplicates; the symmetric-difference check below
    # relies on every row being unique within each frame.
    # 7/1/21: https://thispointer.com/pandas-find-duplicate-rows-in-a-dataframe-based-on-all-or-selected-columns-using-dataframe-duplicated-in-python/
    assert not this_entire_data_filtered.duplicated().any(), f'Duplicate rows in the entire dataset for: {k}'
    assert not concat_df.duplicated().any(), f'Duplicate rows across young and old for: {k}'

    # Then, check if the two dataframes are the same: stacking both and dropping every row
    # that appears more than once leaves only the rows present in exactly one of them.
    # 7/1/21: https://stackoverflow.com/questions/48647534/python-pandas-find-difference-between-two-data-frames
    unmatched_rows = pd.concat(
        [concat_df[this_entire_data_filtered.columns], this_entire_data_filtered]
    ).drop_duplicates(keep=False)
    assert unmatched_rows.shape[0] == 0, f'young + old does not reproduce the entire dataset for: {k}'

print("passed asserts")

passed asserts


In [158]:
# Correct ages for the training/val data.

print('started')

marked_pooled_old = load_marked_pooled_data(
    split_gen.get_split_folder('age', 'old', 'data/new_splits')
)
marked_pooled_young = load_marked_pooled_data(
    split_gen.get_split_folder('age', 'young', 'data/new_splits')
)

# Convert to months (divide by 30.5): old must be above 36 months, young at most 36 months.
assert (marked_pooled_old.target_child_age / 30.5 > 36).all()
assert (marked_pooled_young.target_child_age / 30.5 <= 36).all()


# Check for disjointedness within each of the age splits for train/val data.
# The second call stays the cell's final expression so that its return value is displayed.

check_disjoint_and_phase_written(split='age', name='old', base_dir='data/new_splits')
check_disjoint_and_phase_written(split='age', name='young', base_dir='data/new_splits')

started


  if (await self.run_code(code, result,  async_=asy)):
  exec(code_obj, self.user_global_ns, self.user_ns)


Assert passed for age, old
Assert passed for age, young


True

## Come back to child work after finishing age-based work

In [None]:

names = ['William', 'Alex', 'Violet', 'Naima', 'Ethan', 'Lily']

# 1) Data always addresses the relevant child
all_child_data = {}
for name in names:
    this_info_dict = load_splits.load_eval_data_all('child', name, BASE_SAVE_PATH)
    assert all(
        (child_df['target_child_name'] == name).all()
        for child_df in this_info_dict.values()
    ), f'Found evaluation data not addressed to: {name}'
    all_child_data[name] = this_info_dict

# 2) Check that child data merges to the entire dataset
for k in entire_dataset_dict.keys():

    # Both sides are put in the same row order and given a fresh index: DataFrame.equals
    # also compares the index, so sorting only one side could never match.
    # NOTE(review): assumes utterance_id identifies each row uniquely -- confirm.
    this_entire_data = entire_dataset_dict[k].sort_values('utterance_id').reset_index(drop=True)
    concat_df = (
        pd.concat([all_child_data[n][k] for n in names])
        .sort_values('utterance_id')
        .reset_index(drop=True)
    )

    assert concat_df.equals(this_entire_data), f'Child data does not merge to the entire dataset for: {k}'

# 3) The child splits are disjoint across the train and val data

for name in names:
    # The train/val pool and text files are read from 'data/new_splits', as in the other
    # check_disjoint_and_phase_written calls of this notebook; BASE_SAVE_PATH is the eval data.
    check_disjoint_and_phase_written('child', name, 'data/new_splits')
