In [1]:
import pandas as pd
import childespy
import numpy as np

import os
from os.path import join, exists
import config

# Seed NumPy's global random number generator with the seed defined in config.
np.random.seed(config.SEED)

# Important: run this cell exactly once after each "restart runtime".
# Re-running it re-seeds the generator mid-session, which breaks the
# reproducibility of everything that draws random numbers afterwards.

In [2]:
from utils import split_gen, data_cleaning, load_splits

# Generate Train, Val for All and Age splits

In [3]:
# Get all of the North American and British English adult and child utterances without xxx or yyy.
# Concatenate them at the transcript level.
# Hold out 20% for validation.

In [4]:
# List every corpus in the North American and British English collections
# whose data source is CHILDES.
# db_version is pinned to the same release the utterance query uses, so both
# queries always read the same database snapshot instead of whatever release
# happens to be "current" when the notebook is re-run.
corpora = childespy.get_sql_query(
    'select * from corpus where '
    'collection_name in ("Eng-NA", "Eng-UK") and data_source = "CHILDES"',
    db_version = "2020.1",
)
corpora

R[write to console]: Using current database version: '2020.1'.



Unnamed: 0,id,name,collection_name,data_source,collection_id
1,32,Garvey,Eng-NA,CHILDES,2
2,33,Valian,Eng-NA,CHILDES,2
3,34,Bernstein,Eng-NA,CHILDES,2
4,35,Clark,Eng-NA,CHILDES,2
5,36,PetersonMcCabe,Eng-NA,CHILDES,2
...,...,...,...,...,...
57,221,Wells,Eng-UK,CHILDES,12
58,222,Gathburn,Eng-UK,CHILDES,12
59,223,Nuffield,Eng-UK,CHILDES,12
60,224,Lara,Eng-UK,CHILDES,12


In [5]:
# Comma-separated collection ids, spliced into the "collection_id in (...)"
# filter of the utterance query.
# corpora has one row per corpus, so collection_id repeats many times; drop the
# duplicates so the SQL list contains each id once (the filter matches the same
# rows either way).
childes_datasets = ",".join(str(x) for x in corpora.collection_id.drop_duplicates())

In [6]:
# Load the MOT / FAT / CHI utterances for the selected collections.
# When config.regenerate is set, query the database through childespy and
# refresh the CSV cache; otherwise read the previously cached CSV.
utt_glosses_csv = 'csv/utt_glosses.csv'

if config.regenerate:
    # Build the query from adjacent literals rather than backslash
    # continuations, so no source indentation ends up inside the SQL text.
    utt_query = (
        'select gloss, transcript_id, id, '
        'utterance_order, speaker_code, target_child_name, corpus_name, '
        'target_child_age, type from utterance '
        'where collection_name in ("Eng-NA", "Eng-UK") '
        'and collection_id in (' + childes_datasets + ') '
        'and speaker_code in ("MOT", "FAT","CHI")'
    )
    utt_glosses = childespy.get_sql_query(utt_query, db_version = "2020.1")

    # Make sure the cache directory exists; to_csv does not create it.
    os.makedirs(os.path.dirname(utt_glosses_csv), exist_ok=True)
    utt_glosses.to_csv(utt_glosses_csv, index=False)
else:
    utt_glosses = pd.read_csv(utt_glosses_csv)

R[write to console]: Using supported database version: '2020.1'.



## "all" split

In [7]:

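# Expected shapes in the verbose output of the split generation:
# "Cell 232" should be (4205071, 7) (because you added two extra fields)
# "Cell 233" likewise should be (3959952, 7)
# NOTE(review): the recorded output reports 9 and 10 columns rather than 7 -- confirm which is expected.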

# Save the partition markings as well, for verification purposes.

In [8]:


# Work on a renamed copy of the raw utterances: "id" becomes "utterance_id".
# Citations for the rename call:
#   7/26/21: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.rename.html
#     (general use of the function)
#   7/27/21: https://stackoverflow.com/questions/46096307/alias-for-column-in-pandas
#     (use of the columns keyword)
all_utt_glosses = utt_glosses.rename(
    columns = {'id' : 'utterance_id'},
).copy()
# end both cites

# Output folder for the ("all", "all") split under the finetune directory.
this_split_folder = split_gen.get_split_folder(
    'all',
    'all',
    config.finetune_dir,
)

# Clean the utterances with the project helper before splitting.
prepped_utt_glosses = data_cleaning.prep_utt_glosses(all_utt_glosses)

# Generate the split with 'val' as the held-out phase. Phase labels go in the
# "phase_finetune" column; the helper returns the labelled pool and the
# training subset.
split_glosses_df, train_df = split_gen.exec_split_gen(
    prepped_utt_glosses,
    this_split_folder,
    'val',
    phase_label = 'phase_finetune',
)

# Presumably stores the child token frequencies of the training data -- verify in split_gen.
chi_tok_freq = split_gen.save_chi_vocab(train_df, 'all', 'all')


Cell 232 output (4205071, 9)
Cell 233 output (3959952, 10)
declarative                   2828187
question                       903625
imperative_emphatic            114118
trail off                       63230
interruption                    16870
missing CA terminator           14430
self interruption               11307
quotation next line              5903
interruption question             877
quotation precedes                756
trail off question                504
self interruption question        125
broken for coding                  17
no break TCU continuation           3
Name: type, dtype: int64
Cell 238 gloss                now we need this
transcript_id                    3261
utterance_id                   279663
utterance_order                     1
speaker_code                      CHI
target_child_name                None
corpus_name                    Garvey
target_child_age                  NaN
type                        trail off
contains_error                  F

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  utt_data['contains_error'] = ['xxx' in str(x) or 'yyy' in str(x) for x in utt_data.gloss]


File written to /home/nwong/chompsky/childes/child_listening_continuation/child-directed-listening/finetune/all/all/train.txt
File written to /home/nwong/chompsky/childes/child_listening_continuation/child-directed-listening/finetune/all/all/train_no_tags.txt
File written to /home/nwong/chompsky/childes/child_listening_continuation/child-directed-listening/finetune/all/all/val.txt
File written to /home/nwong/chompsky/childes/child_listening_continuation/child-directed-listening/finetune/all/all/val_no_tags.txt
Writing split glosses to: /home/nwong/chompsky/childes/child_listening_continuation/child-directed-listening/finetune/all/all/data_pool_with_phases.pkl


In [9]:
# Sanity check: every utterance whose transcript appears in train_df must be
# labelled 'train' in the phase_finetune column of the full pool.
train_transcript_ids = set(train_df.transcript_id)
in_train_transcript = split_glosses_df.transcript_id.isin(train_transcript_ids)
assert set(split_glosses_df.loc[in_train_transcript, 'phase_finetune']) == {'train'}
print('Passed')

Passed


## "age" split

In [10]:

# Write the age-based train/val data.
# The labelled pool is divided into young and old subsets at config.age_split months.
young_df, old_df = split_gen.get_age_split_data(
    split_glosses_df.copy(),
    months = config.age_split,
)

# Old is written before young, each into its own ("age", <dataset>) folder.
for args, which_df in (
    (('age', 'old'), old_df),
    (('age', 'young'), young_df),
):
    split, dataset = args
    this_split_folder = split_gen.get_split_folder(split, dataset, config.finetune_dir)

    # Reuse the phase labels already assigned in the pool rather than re-splitting.
    for phase in ('train', 'val'):
        rel_data = which_df.loc[which_df.phase_finetune == phase]
        split_gen.write_partition(phase, rel_data, this_split_folder)


File written to /home/nwong/chompsky/childes/child_listening_continuation/child-directed-listening/finetune/age/old/train.txt
File written to /home/nwong/chompsky/childes/child_listening_continuation/child-directed-listening/finetune/age/old/train_no_tags.txt
File written to /home/nwong/chompsky/childes/child_listening_continuation/child-directed-listening/finetune/age/old/val.txt
File written to /home/nwong/chompsky/childes/child_listening_continuation/child-directed-listening/finetune/age/old/val_no_tags.txt
File written to /home/nwong/chompsky/childes/child_listening_continuation/child-directed-listening/finetune/age/young/train.txt
File written to /home/nwong/chompsky/childes/child_listening_continuation/child-directed-listening/finetune/age/young/train_no_tags.txt
File written to /home/nwong/chompsky/childes/child_listening_continuation/child-directed-listening/finetune/age/young/val.txt
File written to /home/nwong/chompsky/childes/child_listening_continuation/child-directed-liste