In [None]:

import numpy as np
import os
import pandas as pd

from os.path import join, exists

import config
# Seed NumPy's global random generator from the project config so that any
# NumPy-based randomness later in this notebook is repeatable across runs.
np.random.seed(config.SEED)

In [None]:
# Location of the cached, cleaned token-level dataframe that this notebook starts from.
final_save_path = join(config.prov_csv_dir, 'csv/pvd_utt_glosses_phono_cleaned_inflated_to_next_notebook.pkl')

if config.regenerate:
    # This notebook never rebuilds the cache itself; it only points the user elsewhere.
    print('For re-generating final all_tokens_phono before splits, please see other notebook. Using cached values.')

# read_pickle is a module-level pandas function. The previous code called it on
# all_tokens_phono itself, which is not defined yet on a fresh kernel (NameError).
# NOTE: unpickling executes arbitrary code; only load caches produced by this project.
all_tokens_phono = pd.read_pickle(final_save_path)

## Get the samples and splits for age/all splits

In [None]:

# Do this for each of success and yyy, then merge them together.
# The "merge" is implicit: every pass writes its phase labels back into the shared
# all_tokens_phono frame under the 'phase_sample' column.

for partition in ['success', 'yyy']:
    
    # Restrict the split computation to the tokens of the current partition.
    all_tokens_phono_valid = all_tokens_phono[all_tokens_phono.partition == partition]

    # Split train/val/eval by transcript in two steps, each with ratio 0.5.
    # Presumably this yields 25/25/50 (the earlier note here said 25/50/50) --
    # TODO confirm against split_gen.determine_split_idxs.

    split_attr = 'transcript_id'

    # Step 1: separate the eval transcripts from the combined train+val transcripts.
    phono_train_val_idxs, phono_eval_idxs = split_gen.determine_split_idxs(all_tokens_phono_valid, split_attr, 0.5)

    # Step 2: split the remaining train+val transcripts into train and val.
    phono_train_val = all_tokens_phono_valid[all_tokens_phono_valid.transcript_id.isin(phono_train_val_idxs)]
    phono_train_idxs, phono_val_idxs = split_gen.determine_split_idxs(phono_train_val, split_attr, 0.5)

    for phase, idx_set in zip(['train', 'val', 'eval'], [phono_train_idxs, phono_val_idxs, phono_eval_idxs]):

        # It's on transcript_id, not actual idx, so this is OK.
        # all_tokens_phono will receive the val/eval phase marking where it applies.
        # The returned per-phase data (this_phase_data) is not used afterwards in this cell.

        this_phase_data, all_tokens_phono = split_gen.assign_and_find_phase_data(phase, split_attr, idx_set, all_tokens_phono, 'phase_sample')

    # This sits inside the partition loop, so it runs once per partition.
    # NOTE(review): confirm that repeating it on the full frame is intended and harmless.
    all_tokens_phono = data_cleaning.augment_target_child_year(all_tokens_phono)

# Below: For debugging only
# Ensure the output directory exists before writing: config.prov_dir is otherwise only
# created at the very end of the notebook, so this dump fails on a fresh checkout.
os.makedirs(config.prov_dir, exist_ok=True)
all_tokens_phono.to_pickle(join(config.prov_dir, 'pvd_all_tokens_phono_for_eval_before_child.pkl'))


In [None]:

# Build the three pools to sample from: the full dataset plus the young and old age splits.
young_phono, old_phono = split_gen.get_age_split_data(all_tokens_phono)

# model_args[i] is the (split_name, dataset_name) label for phono_pool[i].
model_args = [('all', 'all'), ('age', 'young'), ('age', 'old')]
phono_pool = [all_tokens_phono, young_phono, old_phono]

for this_phono_raw, (split_name, dataset_name) in zip(phono_pool, model_args):

    print('Processing', split_name, dataset_name)

    # Only validation-phase tokens from the 'success' partition are passed to the sampler.
    is_val = this_phono_raw.phase_sample == 'val'
    is_success = this_phono_raw.partition == 'success'
    phono_phase = this_phono_raw[is_val & is_success]

    # Passing None as the age argument means "don't filter on a given age".
    result_beta_sample = sampling.sample_successes('beta', split_name, dataset_name, None, phono_phase, 'val')

    print('\tbeta sample', result_beta_sample.shape)


# Dropping ages 0.5 and 4.0 because of data sparsity.
# -- for 4.0 there is only one transcript, so it's not possible to do a val/eval split.
# -- for 0.5 it's possible to have a sample size of 1 or 0, which is too unstable.

used_ages = data_cleaning.get_years(all_tokens_phono)

# The first and last ages are expected to be 0.5 and 4.0; both are skipped by the slice below.
assert used_ages[0] == 0.5 and used_ages[-1] == 4.0

# Each sampler is paired with the name of the partition it is given.
samplers = [(sampling.sample_successes, 'success'), (sampling.sample_yyy, 'yyy')]

for age in used_ages[1:-1]:
    for phase in ['val', 'eval']:
        for sample_func, sample_name in samplers:

            print(f'for {sample_name}')

            # Keep only the tokens of this phase and this partition.
            in_phase = all_tokens_phono.phase_sample == phase
            in_partition = all_tokens_phono.partition == sample_name
            phono_phase = all_tokens_phono[in_phase & in_partition]

            # Same call shape for both samplers; the split/dataset names are left as None here.
            this_age_sample = sample_func('models_across_time', None, None, age, phono_phase, phase)

            print('\tage sample', this_age_sample.shape)

# Child work

In [None]:

# Child splits are keyed on the transcript, like the age/all splits.
split_attr = 'transcript_id'

# 7/25/21: https://www.kite.com/python/answers/how-to-create-an-empty-column-in-a-pandas-dataframe-in-python
# Create both child-specific phase columns filled with NaN.
for empty_col in ['phase_child_sample', 'phase_child_finetune']:
    all_tokens_phono[empty_col] = np.nan
# end cite

# For every target child:
#   1. label that child's restricted train/val/eval sample in 'phase_child_sample',
#   2. write the child's validation beta sample to CSV,
#   3. label and write the finetuning train/val partitions in 'phase_child_finetune'.
for name in child_models.get_child_names():

    ## -------- Restricted sampling section

    print(f'Processing: {name}')

    this_child_phono = all_tokens_phono[(all_tokens_phono.target_child_name == name)]

    this_success_phono = this_child_phono[this_child_phono.partition == 'success']
    this_yyy_phono = this_child_phono[this_child_phono.partition == 'yyy']

    this_partition_folder = split_gen.get_split_folder('child', name, config.finetune_dir)

    # Sample across ages, separately for each partition.
    # Both results are indexed by phase name below ('train' / 'val' / 'eval').

    success_idxs = child_split_gen.find_splits_across_ages(this_success_phono)

    yyy_idxs = child_split_gen.find_splits_across_ages(this_yyy_phono)

    # Combine the proper indices into their phases and identify them

    complete_phase_idxs = {}

    for phase in ['train', 'val', 'eval']:
        complete_phase_idxs[phase] = np.concatenate([success_idxs[phase], yyy_idxs[phase]])

    for phase_name, idx_set in complete_phase_idxs.items():

        # Make a new attribute for all_tokens_phono parallel to phase (which is the val/eval split defined above)
        _, all_tokens_phono = split_gen.assign_and_find_phase_data(phase_name, split_attr, idx_set, all_tokens_phono, phase_label = 'phase_child_sample')

    # Beta samples: drawn from the validation phase only.

    beta_phase = 'val'

    val_success_pool = all_tokens_phono[
        (all_tokens_phono.partition == 'success')
        & (all_tokens_phono.target_child_name == name)
        & (all_tokens_phono.transcript_id.isin(success_idxs[beta_phase]))
    ]

    # Note: get_beta_idxs does NOT internally filter things.
    # It's necessary to pass all_tokens_phono-based filtering because all_tokens_phono has the phase information
    # associated with it.
    val_sample = child_split_gen.get_beta_idxs(val_success_pool, 'transcript_id', beta_phase)

    this_path = sampling.get_sample_path('success', 'beta', 'child', name, eval_phase = beta_phase)
    val_sample.to_csv(this_path)

    # Report the phase that was actually sampled. The message used to interpolate the
    # leftover loop variable `phase`, which is always 'eval' at this point.
    print(f'\tWriting beta samples for phase {beta_phase}, to {this_path}, sample size: {val_sample.shape}')

    ## -------- Unrestricted sampling section

    # Identify everything that isn't in the sample.

    complete_sample_idxs = np.concatenate([complete_phase_idxs[phase] for phase in ['train', 'val', 'eval']])

    # Checked that all_tokens_phono is unfiltered pool of information.
    # This pool is taken from all_tokens_phono (all children), not from this_child_phono.

    train_val_finetune_phono_new = all_tokens_phono[~all_tokens_phono.transcript_id.isin(complete_sample_idxs)]

    train_val_finetune_phono_new = data_cleaning.drop_errors(train_val_finetune_phono_new)
    # prepped glosses already done above in Pvd logic

    train_new_idxs, val_finetune_idxs = split_gen.determine_split_idxs(train_val_finetune_phono_new, 'transcript_id', val_ratio = config.val_ratio)

    # Note there's a slight bias in val_finetune, because they cannot be chosen from the scoreable successes/yyy
    # But probably better than throwing away data or mixing train_sample and val_finetune

    # Complete_phase_idxs['train'] still has some yyy in it.
    # Isolate the parts of train_sample that can be merged with the finetune train phase.

    no_errors_phono = data_cleaning.drop_errors(all_tokens_phono)
    train_sample_idxs_no_errors_pool = (no_errors_phono[no_errors_phono.transcript_id.isin(complete_phase_idxs['train'])])

    train_sample_idxs_no_errors = np.unique(train_sample_idxs_no_errors_pool.transcript_id)

    # Finetune-train = new transcripts outside the sample + the error-free part of the train sample.
    train_finetune_idxs = np.concatenate([train_new_idxs, train_sample_idxs_no_errors])

    ## Identify and write the finetune phases relative to partition information.

    for finetune_phase, finetune_idxs in zip(['train', 'val'], [train_finetune_idxs, val_finetune_idxs]):

        # Original author's note: "This won't work because you need to eliminate duplicates."
        # NOTE(review): unresolved -- confirm whether duplicates can actually occur here.
        all_tokens_phono, _ = split_gen.write_data_partitions_text(all_tokens_phono, this_partition_folder, finetune_phase, finetune_idxs, 'transcript_id', 'phase_child_finetune')
        

In [None]:
# Write final all_tokens_phono with all split information to the proper place.

# exist_ok=True creates the directory when it is missing and is a no-op when it is
# already there, which avoids the check-then-create race of `if not exists(...)`.
os.makedirs(config.prov_dir, exist_ok=True)

all_tokens_phono.to_pickle(join(config.prov_dir, 'pvd_all_tokens_phono_for_eval.pkl'))