In [1]:
# These will need to be run on child_listening_split after pushing to the remote

import pandas as pd
from utils import load_models, load_splits, split_gen
from utils_child import child_models

import os
from os.path import join, exists

import config
import glob


In [2]:

# Folder names of the freshly regenerated ("new") outputs that are compared
# against the original ("old") ones throughout this notebook.
finetune_compare = 'finetune_for_rep'
prov_compare = 'prov_for_rep'

# label -> base folder for the finetune outputs
finetune = dict(new=finetune_compare, old='finetune')

# label -> base folder for the prov outputs
prov = dict(new=prov_compare, old='prov')


In [3]:
# Every (split, dataset) pair to check: the pairs listed in config, followed
# by one ('child', <name>) pair per child name.
names = child_models.get_child_names()
child_model_args = [('child', child_name) for child_name in names]

all_model_args = config.childes_model_args + child_model_args

# Training text files

In [4]:
# Check that the train/val text files are identical between the 'new' and
# 'old' finetune folders for every (split, dataset) pair in all_model_args.
# NOTE(review): an earlier comment here read "This is NOT passing it's making
# sure it runs."; the recorded output shows 'Passed' -- confirm it is obsolete.

def _read_split_lines(base_path, split, dataset, phase):
    """Return the lines of `<phase>.txt` inside the split folder under `base_path`."""
    split_folder = split_gen.get_split_folder(split, dataset, base_path)
    with open(join(split_folder, f'{phase}.txt')) as handle:
        return handle.readlines()


for arg_set in all_model_args:
    split, dataset = arg_set

    # The loop variable keeps the name `phase`: it stays defined at module
    # scope after the loop, and load_ages_samples further down reads it.
    for phase in ('train', 'val'):
        text_compare = {
            label: _read_split_lines(base_path, split, dataset, phase)
            for label, base_path in finetune.items()
        }
        assert (text_compare['new'] == text_compare['old']), f"Failed on split: {arg_set}, phase: {phase}"

print('Passed')

Passed


# All_tokens_phono files

In [5]:

def get_prov_paths(base_name):
    """Return `(new_path, old_path)`: `base_name` joined onto each prov folder."""
    new_path = join(prov['new'], base_name)
    old_path = join(prov['old'], base_name)
    return new_path, old_path
    

def compare_df_rep(df1, df2):
    """Assert that two DataFrames hold the same data; return True if they do.

    The frames must have the same shape, column labels and index. For every
    column, missing values must sit at the same positions and the non-missing
    values must compare equal. Dtypes are not compared.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        The frames to compare.

    Returns
    -------
    bool
        Always True; any difference raises instead.

    Raises
    ------
    AssertionError
        On the first difference found, with a message naming what differed.
    """
    # Check the structure first: comparing frames of different shape or index
    # element-wise makes pandas raise ValueError instead of failing the check.
    assert df1.shape == df2.shape, f"Shape mismatch: {df1.shape} vs {df2.shape}"
    assert list(df1.columns) == list(df2.columns), (
        f"Column mismatch: {list(df1.columns)} vs {list(df2.columns)}"
    )
    assert df1.index.equals(df2.index), "Index mismatch"

    for col in df1.columns:
        left, right = df1[col], df2[col]
        # NaN != NaN, so compare the missing-value masks separately and then
        # compare only the values that are present.
        assert (left.isna() == right.isna()).all(), f"Missing-value positions differ in column: {col}"
        assert (left.dropna() == right.dropna()).all(), f"Values differ in column: {col}"

    return True
        
# NOTE(review): this cell only defines the helpers above; no comparison is
# executed here, so this message just marks that the definitions ran.
print('Passed')

Passed


In [6]:
# NOTE(review): `all_phono` is not referenced by any later cell visible in
# this notebook -- confirm whether this load is still needed.
all_phono = load_splits.load_phono()

In [7]:
# Confirm that the pickled all_tokens_phono frame is the same in the 'new'
# and 'old' prov folders.

prov_pkl_name = 'pvd_all_tokens_phono_for_eval.pkl'
path1, path2 = get_prov_paths(prov_pkl_name)

phono_new = pd.read_pickle(path1)
phono_old = pd.read_pickle(path2)
assert compare_df_rep(phono_new, phono_old)

print('Passed')

Passed


In [8]:

def paths_to_df(paths):
    """Read each CSV in `paths` and concatenate the frames, in order, into one DataFrame."""
    frames = []
    for csv_path in paths:
        frames.append(pd.read_csv(csv_path))
    return pd.concat(frames)
    
def load_ages_samples(base_dir, which_type, phase='val'):
    """Return the sorted paths of the across-time sample CSVs under `base_dir`.

    Parameters
    ----------
    base_dir : str
        Folder that contains the `across_time_samples` directory.
    which_type : str
        Prefix of the sample files (the callers below use 'success' and 'yyy').
    phase : str, optional
        Phase suffix of the files to match. This used to be read from a global
        `phase` left behind by an earlier loop; 'val' is the value that loop
        ends on, so the default keeps the previous behaviour.
        NOTE(review): the beta-sample check uses `config.eval_phase` -- confirm
        whether that is the intended phase here as well.

    Returns
    -------
    list of str
        Matching paths, sorted so that two folders yield the same order.
    """
    data_folder = join(base_dir, 'across_time_samples')
    template = join(data_folder, f'{which_type}_utts_models_across_time_{config.n_across_time}_*_{phase}.csv')
    # glob returns matches in arbitrary order; sort to make the order stable.
    return sorted(glob.glob(template))
    
# Verify that the across-time sample files are the same in the 'new' and 'old'
# prov folders, file by file.
#
# The sample paths do not depend on the model arguments, so each sample type is
# checked once instead of once per entry of all_model_args.

for this_type in ['success', 'yyy']:

    # Sort both listings so files are paired by name, not by glob order.
    paths1 = sorted(load_ages_samples(prov['new'], this_type))
    paths2 = sorted(load_ages_samples(prov['old'], this_type))

    names1 = [os.path.basename(path) for path in paths1]
    names2 = [os.path.basename(path) for path in paths2]

    # Without these checks an empty or partial listing passes silently,
    # because zip stops at the shorter of the two lists.
    assert names1, f"No across-time sample files found for type: {this_type}"
    assert names1 == names2, f"Different sample files for type: {this_type}: {names1} vs {names2}"

    for p1, p2 in zip(paths1, paths2):
        assert compare_df_rep(pd.read_csv(p1), pd.read_csv(p2))

print('Passed')

Passed


In [9]:

# Verify that the beta samples (all) are all the same for every split.

def load_beta_samples(base_dir, split, dataset, is_tags, context_num, model_type):
    """Return the path of the beta-sample CSV for one set of model arguments.

    Only `base_dir`, `split` and `dataset` determine the path; `is_tags`,
    `context_num` and `model_type` are accepted so that a full model-argument
    tuple can be unpacked into the call, and are otherwise ignored.
    """
    split_dir = split_gen.get_split_folder(split, dataset, base_dir)
    csv_name = f'success_utts_beta_5000_{config.eval_phase}.csv'
    return join(split_dir, csv_name)


# Compare the beta-sample CSV of the 'new' and 'old' prov folders for every
# set of model arguments.
for arg_set in load_models.gen_all_model_args():
    sample1 = pd.read_csv(load_beta_samples(prov['new'], *arg_set))
    sample2 = pd.read_csv(load_beta_samples(prov['old'], *arg_set))

    # Name the failing arguments, as the training-text check does.
    assert sample1.equals(sample2), f"Failed on model args: {arg_set}"

print('Passed')

# Note the subsampling is a truncation of a random sample, so this should be sufficient for reproducibility


Passed


In [10]:
# Print the current date and time as a record of when this cell was run.
# NOTE(review): this import sits outside the import cell at the top of the notebook.
from datetime import datetime
print(datetime.now())

2021-08-20 14:38:21.153798
