In [47]:
import os
import sys
import numpy as np
import pandas as pd
import copy
from os.path import join, exists

In [3]:
sys.path.append('../../')
from src.utils import split_gen, sampling, data_cleaning, load_models, data_cleaning, transformers_bert_completions, configuration, child_split_gen, child_models
config = configuration.Config()
np.random.seed(config.SEED)

In [4]:
final_save_path = join(config.prov_csv_dir, 'pvd_utt_glosses_phono_cleaned_inflated_to_next_notebook.pkl')
all_tokens_phono = pd.read_pickle(final_save_path)

## Get the samples and splits for age/all splits

In [12]:
# Do this for each of success and yyy, then merge them together.

all_tokens_phono_valid = data_cleaning.find_transcripts_with_successes_and_yyy(all_tokens_phono)

# Split train/val/test: 25/50/50.

split_attr = 'transcript_id'

phono_train_val_idxs, phono_eval_idxs = split_gen.determine_split_idxs(all_tokens_phono_valid, split_attr, 0.5)

phono_train_val = all_tokens_phono_valid[all_tokens_phono_valid.transcript_id.isin(phono_train_val_idxs)]
phono_train_idxs, phono_val_idxs = split_gen.determine_split_idxs(phono_train_val, split_attr, 0.5)

for phase, idx_set in zip(['train', 'val', 'eval'], [phono_train_idxs, phono_val_idxs, phono_eval_idxs]):

    # It's on transcript_id, not actual idx, so this is OK.
    # all_tokens_phono will receive the val/eval phase marking where it applies.

    this_phase_data, all_tokens_phono = split_gen.assign_and_find_phase_data(phase, split_attr, idx_set, all_tokens_phono)

all_tokens_phono = data_cleaning.augment_target_child_year(all_tokens_phono)

# Below: For debugging only

all_tokens_phono.to_pickle(join(config.prov_dir, 'pvd_all_tokens_phono_for_eval_before_child.pkl')) 


In [10]:
import imp
imp.reload(sampling)
from src.utils import paths
imp.reload(paths)
imp.reload(configuration)
imp.reload(split_gen)
config = configuration.Config()

### Take a random sample for fitting the likelihood hyperparameter


In [114]:
young_phono, old_phono = split_gen.get_age_split_data(all_tokens_phono)

phono_pool = [
    all_tokens_phono,
    young_phono,
    old_phono,
    all_tokens_phono # validate switchboard on all tokens phono
]

model_args = [('Providence', 'all'), ('Providence-Age', 'young'), ('Providence-Age', 'old'), ('Switchboard','all')]

for (test_split, test_dataset), this_phono_raw in zip(model_args, phono_pool):
    
    print('Processing', test_split, test_dataset)
    phono_data_for_phase = this_phono_raw[(this_phono_raw.phase_sample == 'val') & (this_phono_raw.partition == 'success')]

    # age = None means don't filter on a given age
    result_beta_sample = sampling.sample_successes( 
            task_phase_to_sample_for = 'fit',
            test_split = test_split,
            test_dataset = test_dataset,
            raw_phono = phono_data_for_phase,
            age = None)        

    print('\t sample for fitting:', result_beta_sample.shape)


Processing Providence all
	 sample for fitting: (5000, 1)
Processing Providence-Age young
	 sample for fitting: (5000, 1)
Processing Providence-Age old
	 sample for fitting: (5000, 1)
Processing Switchboard all
	 sample for fitting: (5000, 1)


### Sample across ages

In [117]:
used_ages = data_cleaning.get_years(all_tokens_phono)

for age in used_ages:
    for eval_phase in ['val', 'test']:
        for sample_func, sample_name in zip([sampling.sample_successes, sampling.sample_yyy], ['success', 'yyy']):

            print(f'for {sample_name}')

            phono_data_for_phase = all_tokens_phono[(all_tokens_phono.phase_sample == phase) & (all_tokens_phono.partition == sample_name)]
            
            this_age_sample = sample_func(
                task_phase_to_sample_for= 'eval',
                test_split = 'Providence',
            test_dataset = 'all',
            raw_phono = phono_data_for_phase,
            age = age)       
            
            print('\tage sample', this_age_sample.shape)

for success
	age sample (87, 1)
for yyy
	age sample (105, 1)
for success
	age sample (87, 1)
for yyy
	age sample (105, 1)
for success
	age sample (2364, 1)
for yyy
	age sample (2369, 1)
for success
	age sample (2364, 1)
for yyy
	age sample (2369, 1)
for success
	age sample (5000, 1)
for yyy
	age sample (4524, 1)
for success
	age sample (5000, 1)
for yyy
	age sample (4524, 1)
for success
	age sample (5000, 1)
for yyy
	age sample (2971, 1)
for success
	age sample (5000, 1)
for yyy
	age sample (2971, 1)
for success
	age sample (5000, 1)
for yyy
	age sample (1994, 1)
for success
	age sample (5000, 1)
for yyy
	age sample (1994, 1)
for success
	age sample (5000, 1)
for yyy
	age sample (810, 1)
for success
	age sample (5000, 1)
for yyy
	age sample (810, 1)
for success
	age sample (1580, 1)
for yyy
	age sample (118, 1)
for success
	age sample (1580, 1)
for yyy
	age sample (118, 1)
for success
	age sample (0, 1)
for yyy
	age sample (0, 1)
for success
	age sample (0, 1)
for yyy
	age sample (0, 1

# Child work

In [56]:
# save these to 
import imp
imp.reload(split_gen)

<module 'src.utils.split_gen' from '/home/stephan/notebooks/child-directed-listening/src/tier_1/../../src/utils/split_gen.py'>

In [57]:
all_tokens_phono.columns

Index(['token', 'utterance_id', 'gloss', 'transcript_id', 'utterance_order',
       'target_child_name', 'speaker_code', 'type', 'punct',
       'speaker_code_simple', 'gloss_with_punct', 'token_id', 'seq_utt_id',
       'actual_phonology', 'model_phonology', 'target_child_age',
       'bert_token_id', 'model_phonology_clean', 'actual_phonology_clean',
       'model_phonology_no_dia', 'actual_phonology_no_dia', 'cv_raw_actual',
       'cv_collapsed_actual', 'num_vowels_actual', 'cv_raw_model',
       'cv_collapsed_model', 'num_vowels_model', 'num_vowels', 'in_vocab',
       'success_token', 'yyy_token', 'partition', 'phase', 'year',
       'phase_child_sample', 'phase_child_finetune',
       'phase_child_sample_n=2_type=success_name=Alex',
       'phase_child_sample_n=2_type=success_name=Ethan',
       'phase_child_sample_n=2_type=success_name=Lily',
       'phase_child_sample_n=2_type=success_name=Naima',
       'phase_child_sample_n=2_type=success_name=Violet',
       'phase_child_sa

In [58]:
all_tokens_phono.phase.value_counts()

train    1960626
val      1002741
eval       28498
Name: phase, dtype: int64

In [59]:
split_attr = 'transcript_id'

# 7/25/21: https://www.kite.com/python/answers/how-to-create-an-empty-column-in-a-pandas-dataframe-in-python
all_tokens_phono['phase_child_sample'] = np.nan
all_tokens_phono['phase_child_finetune'] = np.nan
# end cite

for name in sorted(list(set(all_tokens_phono.target_child_name))):
    
    sample_spec_dict = {
        'task_name': 'child',
        'task_phase' : 'sample',
        'training_split' : 'Providence-Child',
        'training_dataset' : name,
        'test_split' : None,
        'test_dataset' : None,
        'model_type': None,
        'use_tags': None,
        'context_width': None,
        'n_samples' : config.n_beta
        
    }
        
    sample_folder = paths.get_directory(sample_spec_dict)
    if not os.path.exists(sample_folder):
        os.makedirs(sample_folder)
    
    ## -------- Restricted sampling section
    
    print(f'Processing: {name}')
    
    this_child_phono = all_tokens_phono[(all_tokens_phono.target_child_name == name)]
    
    this_valid_phono = data_cleaning.find_transcripts_with_successes_and_yyy(this_child_phono)
 
    # Sample across ages
    
    complete_phase_idxs = child_split_gen.find_splits_across_ages(this_valid_phono)
        
    for phase_name, idx_set in complete_phase_idxs.items():
        
        # Make a new attribute for all_tokens_phono parallel to phase (which is the val/eval split defined above)
        _, all_tokens_phono = split_gen.assign_and_find_phase_data(phase_name, split_attr, idx_set, all_tokens_phono, phase_label = 'phase_child_sample')
    
    # Beta samples
    
    val_success_pool = all_tokens_phono[
        (all_tokens_phono.partition == 'success')
        & (all_tokens_phono.target_child_name == name)
        & (all_tokens_phono.phase_child_sample == 'val')
    ]
    
    # Note: get_beta_idxs does NOT internally filter things.
    # It's necessary to pass all_tokens_phono-based filtering because all_tokens_phono has the phase information
    # associated with it.
    
    val_sample = child_split_gen.get_beta_idxs(val_success_pool, 'transcript_id')
    
    #this_path = sampling.get_sample_path('success', 'beta', 'child', name, eval_phase = 'val')
    this_path = os.path.join(sample_folder, 
        'fit_success_utts_'+str(config.n_beta)+'.csv')
    val_sample.to_csv(this_path)

    print(f'\tWriting beta samples for phase {phase}, to {this_path}, sample size: {val_sample.shape}, pool size: {len(set(val_success_pool.utterance_id))}')
    
    ## -------- Unrestricted sampling section
    
    # Identify everything that isn't in the sample.
    
    complete_sample_idxs = np.concatenate([complete_phase_idxs[phase] for phase in ['train', 'val', 'eval']])
    
    # Checked that all_tokens_phono is unfiltered pool of information.
    
    train_val_finetune_phono_new = this_child_phono[~this_child_phono.transcript_id.isin(complete_sample_idxs)]

    train_val_finetune_phono_new = data_cleaning.drop_errors(train_val_finetune_phono_new)
    # prepped glosses already done above in Pvd logic
    
    train_merge_out_pool = {}
    
    avail_train_val_finetune_ids = set(train_val_finetune_phono_new.transcript_id)
    num_train_val_finetune_new_ids = len(avail_train_val_finetune_ids)
    
    if num_train_val_finetune_new_ids >= 2:
        train_merge_out_pool['train'], train_merge_out_pool['val'] = split_gen.determine_split_idxs(train_val_finetune_phono_new, 'transcript_id', val_ratio = config.val_ratio)
    elif num_train_val_finetune_new_ids == 1:
        # Prioritize validation because train will receive a larger merge from the previous data.
        train_merge_out_pool['train'], train_merge_out_pool['val'] = np.array([]), np.array(list(avail_train_val_finetune_ids))
    else:
        train_merge_out_pool['train'], train_merge_out_pool['val'] = np.array([]), np.array([])
    
    # Complete_phase_idxs still has some yyy in it.
    # Isolate the parts of train_sample that can be merged with the finetune train phase.
    
    no_errors_phono = data_cleaning.drop_errors(this_child_phono)
    
    train_merge_in_pool = {}
    for phase in ['train', 'val']:
        train_merge_in_pool[phase] = np.unique(no_errors_phono[no_errors_phono.transcript_id.isin(complete_phase_idxs[phase])].transcript_id)
    
    finetune_idxs = {}
    for phase in ['train', 'val']:
        finetune_idxs[phase] = np.concatenate([train_merge_out_pool[phase], train_merge_in_pool[phase]])
    
    ## Identify and write the finetune phases relative to partition information.
    
    data_spec_dict = copy.copy(sample_spec_dict)
    data_spec_dict['task_phase'] = 'extract_data'
    data_spec_dict['use_tags'] = True
    data_folder = paths.get_directory(data_spec_dict)
    
    
    for finetune_phase, finetune_idx_set in finetune_idxs.items():
        
        split_gen.write_data_partitions_text(this_child_phono, data_folder, finetune_phase, finetune_idx_set, 'transcript_id')
        
        # Re-assign phase information all_tokens_phono
        # because need to limit to this_child_phono above for the writing.
        
        _, all_tokens_phono = split_gen.assign_and_find_phase_data(finetune_phase, split_attr, finetune_idx_set, all_tokens_phono)
        
        

Processing: Alex
	Writing beta samples for phase val, to /home/stephan/notebooks/child-directed-listening/output/experiments/full_scale/sample/n=5000/Providence-Child_Alex/fit_success_utts_5000.csv, sample size: (2745, 1), pool size: 2745


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  utt_data['contains_error'] = ['xxx' in str(x) or 'yyy' in str(x) for x in all_lowercase]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


File written to /home/stephan/notebooks/child-directed-listening/output/experiments/full_scale/extract_data/n=5000/Providence-Child_Alex_with_tags/train.txt
File written to /home/stephan/notebooks/child-directed-listening/output/experiments/full_scale/extract_data/n=5000/Providence-Child_Alex_with_tags/val.txt
Processing: Ethan
	Writing beta samples for phase val, to /home/stephan/notebooks/child-directed-listening/output/experiments/full_scale/sample/n=5000/Providence-Child_Ethan/fit_success_utts_5000.csv, sample size: (1938, 1), pool size: 1938


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  utt_data['contains_error'] = ['xxx' in str(x) or 'yyy' in str(x) for x in all_lowercase]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


File written to /home/stephan/notebooks/child-directed-listening/output/experiments/full_scale/extract_data/n=5000/Providence-Child_Ethan_with_tags/train.txt
File written to /home/stephan/notebooks/child-directed-listening/output/experiments/full_scale/extract_data/n=5000/Providence-Child_Ethan_with_tags/val.txt
Processing: Lily
	Writing beta samples for phase val, to /home/stephan/notebooks/child-directed-listening/output/experiments/full_scale/sample/n=5000/Providence-Child_Lily/fit_success_utts_5000.csv, sample size: (4142, 1), pool size: 4142


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  utt_data['contains_error'] = ['xxx' in str(x) or 'yyy' in str(x) for x in all_lowercase]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


File written to /home/stephan/notebooks/child-directed-listening/output/experiments/full_scale/extract_data/n=5000/Providence-Child_Lily_with_tags/train.txt
File written to /home/stephan/notebooks/child-directed-listening/output/experiments/full_scale/extract_data/n=5000/Providence-Child_Lily_with_tags/val.txt
Processing: Naima
	Writing beta samples for phase val, to /home/stephan/notebooks/child-directed-listening/output/experiments/full_scale/sample/n=5000/Providence-Child_Naima/fit_success_utts_5000.csv, sample size: (3423, 1), pool size: 3423


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  utt_data['contains_error'] = ['xxx' in str(x) or 'yyy' in str(x) for x in all_lowercase]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


File written to /home/stephan/notebooks/child-directed-listening/output/experiments/full_scale/extract_data/n=5000/Providence-Child_Naima_with_tags/train.txt
File written to /home/stephan/notebooks/child-directed-listening/output/experiments/full_scale/extract_data/n=5000/Providence-Child_Naima_with_tags/val.txt
Processing: Violet
	Writing beta samples for phase val, to /home/stephan/notebooks/child-directed-listening/output/experiments/full_scale/sample/n=5000/Providence-Child_Violet/fit_success_utts_5000.csv, sample size: (2115, 1), pool size: 2115


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  utt_data['contains_error'] = ['xxx' in str(x) or 'yyy' in str(x) for x in all_lowercase]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


File written to /home/stephan/notebooks/child-directed-listening/output/experiments/full_scale/extract_data/n=5000/Providence-Child_Violet_with_tags/train.txt
File written to /home/stephan/notebooks/child-directed-listening/output/experiments/full_scale/extract_data/n=5000/Providence-Child_Violet_with_tags/val.txt
Processing: William
	Writing beta samples for phase val, to /home/stephan/notebooks/child-directed-listening/output/experiments/full_scale/sample/n=5000/Providence-Child_William/fit_success_utts_5000.csv, sample size: (2944, 1), pool size: 2944


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  utt_data['contains_error'] = ['xxx' in str(x) or 'yyy' in str(x) for x in all_lowercase]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


File written to /home/stephan/notebooks/child-directed-listening/output/experiments/full_scale/extract_data/n=5000/Providence-Child_William_with_tags/train.txt
File written to /home/stephan/notebooks/child-directed-listening/output/experiments/full_scale/extract_data/n=5000/Providence-Child_William_with_tags/val.txt


In [50]:
# Mark the subsampling for the child cross scoring
all_tokens_phono = child_split_gen.split_child_subsampling(all_tokens_phono)


in augment with subsamples train 2 success Alex
in augment with subsamples train 2 success Ethan
in augment with subsamples train 2 success Lily
in augment with subsamples train 2 success Naima
in augment with subsamples train 2 success Violet
in augment with subsamples train 2 success William
in augment with subsamples train 2 yyy Alex
in augment with subsamples train 2 yyy Ethan
in augment with subsamples train 2 yyy Lily
in augment with subsamples train 2 yyy Naima
in augment with subsamples train 2 yyy Violet
in augment with subsamples train 2 yyy William
in augment with subsamples train 1000 success Alex
in augment with subsamples train 1000 success Ethan
in augment with subsamples train 1000 success Lily
in augment with subsamples train 1000 success Naima
in augment with subsamples train 1000 success Violet
in augment with subsamples train 1000 success William
in augment with subsamples train 1000 yyy Alex
in augment with subsamples train 1000 yyy Ethan
in augment with subsamples

In [51]:
# Write final all_tokens_phono with all split information to the proper place.
if not exists(config.prov_dir):
    os.makedirs(config.prov_dir)
    
all_tokens_phono.to_pickle(join(config.prov_dir, 'pvd_all_tokens_phono_for_eval.pkl'))

In [53]:
from datetime import datetime
print(datetime.today())

2022-04-26 06:31:34.580581
