In [1]:
import pandas as pd
import childespy
import numpy as np

import os
from os.path import join, exists
import config

# Important: run this cell only once per "restart runtime".
# Seeding again part-way through a session resets NumPy's global random state,
# so later cells would no longer reproduce the results of a clean top-to-bottom run.
np.random.seed(seed=config.SEED)

In [2]:
from utils import split_gen, data_cleaning, load_splits

# Generate Train, Val for All and Age splits

In [3]:
# Get all of the North American and British English adult and child utterances without xxx or yyy,
# concatenate them at the transcript level,
# and hold out 20% for validation.

In [4]:
# List every CHILDES corpus in the North American and British English collections.
# db_version is pinned to the same release ("2020.1") that the utterance query
# further down uses, so the corpus listing and the utterances always come from
# the same database snapshot even after a newer "current" release is published.
corpora = childespy.get_sql_query('select * from corpus where \
collection_name in ("Eng-NA", "Eng-UK") and data_source = "CHILDES"', db_version = "2020.1")
corpora

R[write to console]: Using current database version: '2020.1'.



Unnamed: 0,id,name,collection_name,data_source,collection_id
1,32,Garvey,Eng-NA,CHILDES,2
2,33,Valian,Eng-NA,CHILDES,2
3,34,Bernstein,Eng-NA,CHILDES,2
4,35,Clark,Eng-NA,CHILDES,2
5,36,PetersonMcCabe,Eng-NA,CHILDES,2
...,...,...,...,...,...
57,221,Wells,Eng-UK,CHILDES,12
58,222,Gathburn,Eng-UK,CHILDES,12
59,223,Nuffield,Eng-UK,CHILDES,12
60,224,Lara,Eng-UK,CHILDES,12


In [5]:
# Comma-separated collection ids, spliced into the SQL "collection_id in (...)"
# filter of the utterance query.
# corpora has one row per corpus, so collection_id repeats once per corpus;
# unique() keeps each id a single time (in order of first appearance) instead of
# sending dozens of duplicate values in the IN list.
childes_datasets = ",".join(str(x) for x in corpora.collection_id.unique())

In [None]:
# Location of the cached utterance table inside the finetune directory.
save_path = join(config.finetune_dir, 'utt_glosses.pkl')

if config.regenerate:
    # Create the cache directory up front so a missing folder is caught before
    # the (slow) database query runs rather than when the pickle is written.
    os.makedirs(config.finetune_dir, exist_ok=True)

    # Fetch MOT / FAT / CHI utterances from the Eng-NA and Eng-UK collections,
    # pinned to database version 2020.1.
    # childes_datasets is built from ids returned by the corpus query above
    # (not from user input), so it is spliced directly into the SQL text.
    utt_glosses = childespy.get_sql_query('select gloss, transcript_id, id, \
    utterance_order, speaker_code, target_child_name, corpus_name, target_child_age, type from utterance where collection_name in ("Eng-NA", "Eng-UK") \
    and collection_id in ('+childes_datasets+') and speaker_code in ("MOT", "FAT","CHI")' , db_version = "2020.1")
    utt_glosses.to_pickle(save_path)
else:
    # NOTE(review): unpickling can execute arbitrary code; only load a cache
    # file that was produced by the regenerate branch of this notebook.
    utt_glosses = pd.read_pickle(save_path)

R[write to console]: Using supported database version: '2020.1'.



## "all" split

In [None]:

# expectations for verbose output.
# 232 should be (4205071, 7) (because you added two extra fields)
# 233 likewise is (3959952, 7)

# Save the partition markings as well, for verification purposes.


In [None]:


# Expose the utterance table's `id` column under the name `utterance_id`.
# 7/26/21: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.rename.html
# for general function
# 7/27/21: https://stackoverflow.com/questions/46096307/alias-for-column-in-pandas
# for using columns keyword
id_alias = {'id' : 'utterance_id'}
all_utt_glosses = utt_glosses.rename(columns = id_alias).copy()
# end both cites

# Folder for the 'all' / 'all' split, as resolved by split_gen.get_split_folder.
this_split_folder = split_gen.get_split_folder(
    'all',
    'all',
    config.finetune_dir,
)


In [None]:
# xxx and yyy are now dropped for all of the splits.

if config.verbose:
    print('Cell 232 output', all_utt_glosses.shape)

# Cell 233 in the notebook relative to Dr Meylan's commit
data = data_cleaning.drop_errors(all_utt_glosses)

if config.verbose:
    print('Cell 233 output', data.shape)

data = data_cleaning.clean_glosses(data)

# Sanity check: no cleaned gloss may still contain the xxx / yyy markers.
has_xxx = ['xxx' in gloss for gloss in set(data.gloss_with_punct)]
assert not any(has_xxx)
has_yyy = ['yyy' in gloss for gloss in set(data.gloss_with_punct)]
assert not any(has_yyy)

if config.verbose:
    print('Cell 269', data.gloss_with_punct.head(5))

# Cell 271: tokenization lives outside of token cleaning because the CHI analysis needs it.
# Each gloss is stringified, lower-cased and split on single spaces.
data['tokens'] = [str(gloss).lower().split(' ') for gloss in data.gloss]



In [None]:

# Run the split generation for the 'val' phase on the cleaned data, recording
# the partition under the 'phase_finetune' label. The helper hands back the
# marked-up glosses together with the training subset.
split_glosses_df, train_df = split_gen.exec_split_gen(
    data,
    this_split_folder,
    'val',
    phase_label = 'phase_finetune',
)

# Save the child vocabulary computed from the training subset for the 'all' / 'all' split.
chi_tok_freq = split_gen.save_chi_vocab(
    train_df,
    'all',
    'all',
)


In [None]:
# Verification: every utterance whose transcript appears in train_df must be
# marked 'train' (and nothing else) in the phase_finetune column.
train_transcripts = set(train_df.transcript_id)
in_train = split_glosses_df.transcript_id.isin(train_transcripts)
train_phases = set(split_glosses_df[in_train].phase_finetune)
assert train_phases == {'train'}
print('Passed')

## "age" split

In [None]:

# Write age-based train/val data.

# Divide a copy of the marked-up glosses into young / old frames at config.age_split months.
young_df, old_df = split_gen.get_age_split_data(split_glosses_df.copy(), months = config.age_split)

split = 'age'
for dataset, age_df in (('old', old_df), ('young', young_df)):

    this_split_folder = split_gen.get_split_folder(split, dataset, config.finetune_dir)

    # Reuse the phase_finetune marking to write one partition per phase.
    for phase in ('train', 'val'):
        in_phase = age_df.phase_finetune == phase
        split_gen.write_partition(phase, age_df[in_phase], this_split_folder)


In [None]:
from datetime import datetime

# Print the current local date and time as the last step of the run.
finished_at = datetime.today()
print(finished_at)