In [1]:

import numpy as np
import os
import pandas as pd

from os.path import join, exists

import config
# Seed NumPy's legacy global RNG from the project config so repeated runs start from the same random state.
# NOTE(review): presumably the split/sampling helpers used below draw from this global RNG — confirm.
np.random.seed(config.SEED)

In [2]:
# Load the token-level frame from the pickle stored under config.prov_csv_dir.
cached_pickle_name = 'pvd_utt_glosses_phono_cleaned_inflated_to_next_notebook.pkl'
final_save_path = join(config.prov_csv_dir, cached_pickle_name)

# This notebook never rebuilds the cache itself: when regeneration is requested
# it only points the reader elsewhere and then reads the cached pickle anyway.
if config.regenerate:
    regenerate_notice = 'For re-generating final all_tokens_phono and utts_with_ages before splits, please see other notebook. Using cached values.'
    print(regenerate_notice)

# NOTE(review): read_pickle executes arbitrary code from the file; only use on trusted, project-generated pickles.
all_tokens_phono = pd.read_pickle(final_save_path)


For re-generating final all_tokens_phono and utts_with_ages before splits, please see other notebook. Using cached values.


In [3]:
from utils import split_gen, sampling, data_cleaning, load_models, data_cleaning, transformers_bert_completions
from utils_child import child_split_gen, child_models

## Get the samples and splits for age/all splits

In [4]:

# Assign every transcript to a train / val / eval phase and record it in the
# 'phase_sample' column of all_tokens_phono.

# Do this for each of success and yyy, then merge them together.
# NOTE(review): presumably this keeps only transcripts that contain both success
# and yyy tokens — confirm in data_cleaning.
all_tokens_phono_valid = data_cleaning.find_transcripts_with_successes_and_yyy(all_tokens_phono)

# Split by transcript in two halving steps: (train + val) vs. eval, then train vs. val.
# With a ratio of 0.5 at each step the result is train/val/eval = 25/25/50.

split_attr = 'transcript_id'

phono_train_val_idxs, phono_eval_idxs = split_gen.determine_split_idxs(all_tokens_phono_valid, split_attr, 0.5)

phono_train_val = all_tokens_phono_valid[all_tokens_phono_valid.transcript_id.isin(phono_train_val_idxs)]
phono_train_idxs, phono_val_idxs = split_gen.determine_split_idxs(phono_train_val, split_attr, 0.5)

for phase, idx_set in zip(['train', 'val', 'eval'], [phono_train_idxs, phono_val_idxs, phono_eval_idxs]):

    # It's on transcript_id, not actual idx, so this is OK.
    # all_tokens_phono will receive the val/eval phase marking where it applies.

    this_phase_data, all_tokens_phono = split_gen.assign_and_find_phase_data(phase, split_attr, idx_set, all_tokens_phono, 'phase_sample')

all_tokens_phono = data_cleaning.augment_target_child_year(all_tokens_phono)

# Below: For debugging only
# The output directory is otherwise only created near the end of the notebook,
# so make sure it exists before this earlier write (no-op when already present).
os.makedirs(config.prov_dir, exist_ok=True)
all_tokens_phono.to_pickle(join(config.prov_dir, 'pvd_all_tokens_phono_for_eval_before_child.pkl'))


In [5]:

# Draw the validation "beta" samples for the all/young/old datasets, then the
# per-age success and yyy samples for both the val and eval phases.

young_phono, old_phono = split_gen.get_age_split_data(all_tokens_phono)

phono_pool = [all_tokens_phono, young_phono, old_phono]
model_args = [('all', 'all'), ('age', 'young'), ('age', 'old')]

for (split_name, dataset_name), this_phono_raw in zip(model_args, phono_pool):

    print('Processing', split_name, dataset_name)

    # Beta sampling only draws from successful tokens of the validation phase.
    is_val_success = (this_phono_raw.phase_sample == 'val') & (this_phono_raw.partition == 'success')
    phono_phase = this_phono_raw[is_val_success]

    # The age argument is None here: no filtering on a particular age.
    result_beta_sample = sampling.sample_successes('beta', split_name, dataset_name, None, phono_phase, 'val')

    print('\tbeta sample', result_beta_sample.shape)


# Handle sparse ages later in the pipeline, if any

used_ages = data_cleaning.get_years(all_tokens_phono)

# Each partition name is paired with the sampler that handles it.
samplers = [('success', sampling.sample_successes), ('yyy', sampling.sample_yyy)]

for age in used_ages:
    for phase in ['val', 'eval']:
        for sample_name, sample_func in samplers:

            print(f'for {sample_name}')

            in_phase_and_partition = (
                (all_tokens_phono.phase_sample == phase)
                & (all_tokens_phono.partition == sample_name)
            )
            phono_phase = all_tokens_phono[in_phase_and_partition]

            # Split and dataset are None for the across-time samples; only the age varies.
            this_age_sample = sample_func('models_across_time', None, None, age, phono_phase, phase)

            print('\tage sample', this_age_sample.shape)

Processing all all
Resampling for: task: beta, split: all, dataset: all, age: None, phase: val
	beta sample (5000, 1)
Processing age young
Resampling for: task: beta, split: age, dataset: young, age: None, phase: val
	beta sample (5000, 1)
Processing age old
Resampling for: task: beta, split: age, dataset: old, age: None, phase: val
	beta sample (5000, 1)
for success
Resampling for: task: models_across_time, split: None, dataset: None, age: 0.5, phase: val
	age sample (1, 1)
for yyy
Resampling for: task: models_across_time, split: None, dataset: None, age: 0.5, phase: val
	age sample (1, 1)
for success
Resampling for: task: models_across_time, split: None, dataset: None, age: 0.5, phase: eval
	age sample (87, 1)
for yyy
Resampling for: task: models_across_time, split: None, dataset: None, age: 0.5, phase: eval
	age sample (105, 1)
for success
Resampling for: task: models_across_time, split: None, dataset: None, age: 1.0, phase: val
	age sample (1511, 1)
for yyy
Resampling for: task: mo

# Child work

In [6]:

# Development aid: re-import utils_child.child_split_gen so edits made to that
# module on disk are picked up without restarting the kernel.
import importlib
importlib.reload(child_split_gen)


<module 'utils_child.child_split_gen' from '/home/nwong/chompsky/childes/child_listening_split/child-directed-listening/utils_child/child_split_gen.py'>

In [7]:
# Inspect the available columns before the per-child phase columns are added below.
all_tokens_phono.columns

Index(['token', 'utterance_id', 'gloss', 'transcript_id', 'utterance_order',
       'target_child_name', 'speaker_code', 'type', 'punct',
       'speaker_code_simple', 'gloss_with_punct', 'token_id', 'seq_utt_id',
       'actual_phonology', 'model_phonology', 'target_child_age',
       'bert_token_id', 'model_phonology_clean', 'actual_phonology_clean',
       'model_phonology_no_dia', 'actual_phonology_no_dia', 'cv_raw_actual',
       'cv_collapsed_actual', 'num_vowels_actual', 'cv_raw_model',
       'cv_collapsed_model', 'num_vowels_model', 'num_vowels', 'in_vocab',
       'success_token', 'yyy_token', 'partition', 'phase_sample', 'year'],
      dtype='object')

In [8]:

# Build the per-child splits. For every target child this cell:
#   1. marks 'phase_child_sample' on all_tokens_phono using the transcript ids
#      returned by child_split_gen.find_splits_across_ages,
#   2. writes that child's validation beta sample of success utterances to CSV,
#   3. assembles the finetune train/val transcript ids, writes the partition
#      text files, and marks 'phase_child_finetune' on all_tokens_phono.

split_attr = 'transcript_id'

# Beta samples are only drawn for the validation phase. Keeping the phase in one
# name ensures the pool filter, the output path and the log message always agree
# (the log previously interpolated a stale loop variable and could report 'eval').
beta_phase = 'val'

# 7/25/21: https://www.kite.com/python/answers/how-to-create-an-empty-column-in-a-pandas-dataframe-in-python
all_tokens_phono['phase_child_sample'] = np.nan
all_tokens_phono['phase_child_finetune'] = np.nan
# end cite

for name in sorted(set(all_tokens_phono.target_child_name)):

    this_partition_folder = split_gen.get_split_folder('child', name, config.finetune_dir)

    ## -------- Restricted sampling section

    print(f'Processing: {name}')

    # Explicit copy: this frame is handed to helpers below, and the recorded run
    # output shows SettingWithCopyWarning from column assignments on such slices.
    this_child_phono = all_tokens_phono[all_tokens_phono.target_child_name == name].copy()

    this_valid_phono = data_cleaning.find_transcripts_with_successes_and_yyy(this_child_phono)

    # Sample across ages

    complete_phase_idxs = child_split_gen.find_splits_across_ages(this_valid_phono)

    for phase_name, idx_set in complete_phase_idxs.items():

        # Make a new attribute for all_tokens_phono parallel to phase (which is the val/eval split defined above)
        _, all_tokens_phono = split_gen.assign_and_find_phase_data(phase_name, split_attr, idx_set, all_tokens_phono, phase_label = 'phase_child_sample')

    # Beta samples

    val_success_pool = all_tokens_phono[
        (all_tokens_phono.partition == 'success')
        & (all_tokens_phono.target_child_name == name)
        & (all_tokens_phono.phase_child_sample == beta_phase)
    ]

    # Note: get_beta_idxs does NOT internally filter things.
    # It's necessary to pass all_tokens_phono-based filtering because all_tokens_phono has the phase information
    # associated with it.

    val_sample = child_split_gen.get_beta_idxs(val_success_pool, split_attr)

    this_path = sampling.get_sample_path('success', 'beta', 'child', name, eval_phase = beta_phase)
    val_sample.to_csv(this_path)

    print(f'\tWriting beta samples for phase {beta_phase}, to {this_path}, sample size: {val_sample.shape}, pool size: {len(set(val_success_pool.utterance_id))}')

    ## -------- Unrestricted sampling section

    # Identify everything that isn't in the sample.

    complete_sample_idxs = np.concatenate([complete_phase_idxs[sample_phase] for sample_phase in ['train', 'val', 'eval']])

    # Checked that all_tokens_phono is unfiltered pool of information.

    # Copied for the same reason as this_child_phono: drop_errors receives this frame next.
    train_val_finetune_phono_new = this_child_phono[~this_child_phono.transcript_id.isin(complete_sample_idxs)].copy()

    train_val_finetune_phono_new = data_cleaning.drop_errors(train_val_finetune_phono_new)
    # prepped glosses already done above in Pvd logic

    train_merge_out_pool = {}

    avail_train_val_finetune_ids = set(train_val_finetune_phono_new.transcript_id)
    num_train_val_finetune_new_ids = len(avail_train_val_finetune_ids)

    if num_train_val_finetune_new_ids >= 2:
        train_merge_out_pool['train'], train_merge_out_pool['val'] = split_gen.determine_split_idxs(train_val_finetune_phono_new, split_attr, val_ratio = config.val_ratio)
    elif num_train_val_finetune_new_ids == 1:
        # Prioritize validation because train will receive a larger merge from the previous data.
        train_merge_out_pool['train'], train_merge_out_pool['val'] = np.array([]), np.array(list(avail_train_val_finetune_ids))
    else:
        # No transcripts left outside the restricted sample: nothing extra to merge.
        train_merge_out_pool['train'], train_merge_out_pool['val'] = np.array([]), np.array([])

    # Complete_phase_idxs still has some yyy in it.
    # Isolate the parts of train_sample that can be merged with the finetune train phase.

    no_errors_phono = data_cleaning.drop_errors(this_child_phono)

    train_merge_in_pool = {}
    for phase in ['train', 'val']:
        train_merge_in_pool[phase] = np.unique(no_errors_phono[no_errors_phono.transcript_id.isin(complete_phase_idxs[phase])].transcript_id)

    # Finetune ids per phase = transcripts outside the restricted sample + the
    # restricted-sample transcripts of that phase that survive drop_errors.
    finetune_idxs = {}
    for phase in ['train', 'val']:
        finetune_idxs[phase] = np.concatenate([train_merge_out_pool[phase], train_merge_in_pool[phase]])

    ## Identify and write the finetune phases relative to partition information.

    for finetune_phase, finetune_idx_set in finetune_idxs.items():

        split_gen.write_data_partitions_text(this_child_phono, this_partition_folder, finetune_phase, finetune_idx_set, split_attr, 'phase_child_finetune')

        # Re-assign phase information all_tokens_phono
        # because need to limit to this_child_phono above for the writing.

        _, all_tokens_phono = split_gen.assign_and_find_phase_data(finetune_phase, split_attr, finetune_idx_set, all_tokens_phono, phase_label = 'phase_child_finetune')
        
        

Processing: Alex
	Writing beta samples for phase eval, to /home/nwong/chompsky/childes/child_listening_split/child-directed-listening/prov/child/Alex/success_utts_beta_5000_val.csv, sample size: (2745, 1), pool size: 2745


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  utt_data['contains_error'] = ['xxx' in str(x) or 'yyy' in str(x) for x in all_lowercase]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


File written to /home/nwong/chompsky/childes/child_listening_split/child-directed-listening/finetune/child/Alex/train.txt
File written to /home/nwong/chompsky/childes/child_listening_split/child-directed-listening/finetune/child/Alex/train_no_tags.txt
File written to /home/nwong/chompsky/childes/child_listening_split/child-directed-listening/finetune/child/Alex/val.txt
File written to /home/nwong/chompsky/childes/child_listening_split/child-directed-listening/finetune/child/Alex/val_no_tags.txt
Processing: Ethan
	Writing beta samples for phase val, to /home/nwong/chompsky/childes/child_listening_split/child-directed-listening/prov/child/Ethan/success_utts_beta_5000_val.csv, sample size: (1938, 1), pool size: 1938


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  utt_data['contains_error'] = ['xxx' in str(x) or 'yyy' in str(x) for x in all_lowercase]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


File written to /home/nwong/chompsky/childes/child_listening_split/child-directed-listening/finetune/child/Ethan/train.txt
File written to /home/nwong/chompsky/childes/child_listening_split/child-directed-listening/finetune/child/Ethan/train_no_tags.txt
File written to /home/nwong/chompsky/childes/child_listening_split/child-directed-listening/finetune/child/Ethan/val.txt
File written to /home/nwong/chompsky/childes/child_listening_split/child-directed-listening/finetune/child/Ethan/val_no_tags.txt
Processing: Lily
	Writing beta samples for phase val, to /home/nwong/chompsky/childes/child_listening_split/child-directed-listening/prov/child/Lily/success_utts_beta_5000_val.csv, sample size: (4141, 1), pool size: 4141


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  utt_data['contains_error'] = ['xxx' in str(x) or 'yyy' in str(x) for x in all_lowercase]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


File written to /home/nwong/chompsky/childes/child_listening_split/child-directed-listening/finetune/child/Lily/train.txt
File written to /home/nwong/chompsky/childes/child_listening_split/child-directed-listening/finetune/child/Lily/train_no_tags.txt
File written to /home/nwong/chompsky/childes/child_listening_split/child-directed-listening/finetune/child/Lily/val.txt
File written to /home/nwong/chompsky/childes/child_listening_split/child-directed-listening/finetune/child/Lily/val_no_tags.txt
Processing: Naima
	Writing beta samples for phase val, to /home/nwong/chompsky/childes/child_listening_split/child-directed-listening/prov/child/Naima/success_utts_beta_5000_val.csv, sample size: (3423, 1), pool size: 3423


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  utt_data['contains_error'] = ['xxx' in str(x) or 'yyy' in str(x) for x in all_lowercase]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


File written to /home/nwong/chompsky/childes/child_listening_split/child-directed-listening/finetune/child/Naima/train.txt
File written to /home/nwong/chompsky/childes/child_listening_split/child-directed-listening/finetune/child/Naima/train_no_tags.txt
File written to /home/nwong/chompsky/childes/child_listening_split/child-directed-listening/finetune/child/Naima/val.txt
File written to /home/nwong/chompsky/childes/child_listening_split/child-directed-listening/finetune/child/Naima/val_no_tags.txt
Processing: Violet
	Writing beta samples for phase val, to /home/nwong/chompsky/childes/child_listening_split/child-directed-listening/prov/child/Violet/success_utts_beta_5000_val.csv, sample size: (2115, 1), pool size: 2115


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  utt_data['contains_error'] = ['xxx' in str(x) or 'yyy' in str(x) for x in all_lowercase]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


File written to /home/nwong/chompsky/childes/child_listening_split/child-directed-listening/finetune/child/Violet/train.txt
File written to /home/nwong/chompsky/childes/child_listening_split/child-directed-listening/finetune/child/Violet/train_no_tags.txt
File written to /home/nwong/chompsky/childes/child_listening_split/child-directed-listening/finetune/child/Violet/val.txt
File written to /home/nwong/chompsky/childes/child_listening_split/child-directed-listening/finetune/child/Violet/val_no_tags.txt
Processing: William
	Writing beta samples for phase val, to /home/nwong/chompsky/childes/child_listening_split/child-directed-listening/prov/child/William/success_utts_beta_5000_val.csv, sample size: (2944, 1), pool size: 2944


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  utt_data['contains_error'] = ['xxx' in str(x) or 'yyy' in str(x) for x in all_lowercase]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


File written to /home/nwong/chompsky/childes/child_listening_split/child-directed-listening/finetune/child/William/train.txt
File written to /home/nwong/chompsky/childes/child_listening_split/child-directed-listening/finetune/child/William/train_no_tags.txt
File written to /home/nwong/chompsky/childes/child_listening_split/child-directed-listening/finetune/child/William/val.txt
File written to /home/nwong/chompsky/childes/child_listening_split/child-directed-listening/finetune/child/William/val_no_tags.txt


In [10]:
# Debug checkpoint of all_tokens_phono taken before the child subsampling marks are added.
scratch_checkpoint_path = './scratch/prov_before_augment_subsampling.pkl'

# to_pickle does not create missing parent directories, so make sure ./scratch
# exists first (no-op when it is already there).
os.makedirs(os.path.dirname(scratch_checkpoint_path), exist_ok=True)
all_tokens_phono.to_pickle(scratch_checkpoint_path)

In [11]:
# Mark the subsampling for the child cross scoring
# all_tokens_phono is rebound to the frame returned by the helper.
# NOTE(review): judging by the run log, the helper iterates over sample sizes,
# partitions (success / yyy) and children — confirm in child_split_gen.

all_tokens_phono = child_split_gen.split_child_subsampling(all_tokens_phono)


in augment with subsamples train 2 success Alex
in augment with subsamples train 2 success Ethan
in augment with subsamples train 2 success Lily
in augment with subsamples train 2 success Naima
in augment with subsamples train 2 success Violet
in augment with subsamples train 2 success William
in augment with subsamples train 2 yyy Alex
in augment with subsamples train 2 yyy Ethan
in augment with subsamples train 2 yyy Lily
in augment with subsamples train 2 yyy Naima
in augment with subsamples train 2 yyy Violet
in augment with subsamples train 2 yyy William
in augment with subsamples train 500 success Alex
in augment with subsamples train 500 success Ethan
in augment with subsamples train 500 success Lily
in augment with subsamples train 500 success Naima
in augment with subsamples train 500 success Violet
in augment with subsamples train 500 success William
in augment with subsamples train 500 yyy Alex
in augment with subsamples train 500 yyy Ethan
in augment with subsamples train 5

In [12]:
# Write final all_tokens_phono with all split information to the proper place.

# exist_ok=True replaces the separate exists() check: one call, and no window
# between checking for the directory and creating it.
os.makedirs(config.prov_dir, exist_ok=True)

all_tokens_phono.to_pickle(join(config.prov_dir, 'pvd_all_tokens_phono_for_eval.pkl'))

In [13]:
# Stamp the notebook output with the local wall-clock time at which the run finished.
from datetime import datetime

run_finished_at = datetime.today()
print(run_finished_at)

2021-08-23 12:54:20.146901
