In [7]:
import pandas as pd
import childespy
import numpy as np

import os
from os.path import join, exists
import config

# Seed NumPy's global random number generator with the seed defined in config.
np.random.seed(config.SEED)

# Important: Run this cell only once per "restart runtime" for reproducibility
# of the random seed -- re-running it resets the global random state mid-session.

In [10]:
from utils import split_gen, data_cleaning, load_splits

# Generate Train, Val for All and Age splits

In [3]:
# Get all of the North American and British English adult and child utterances without xxx or yyy,
# concatenate them at the transcript level,
# and hold out 20% for validation.

In [4]:
# List the CHILDES corpora in the North American and British English collections.
# The database version is pinned to the same one the utterance query uses, so the
# corpus list and the utterances always come from the same release instead of
# whatever version happens to be "current" when the notebook is re-run.
# (The backslash continuation is inside the string literal, so the query is sent
# as a single line of SQL.)
corpora = childespy.get_sql_query('select * from corpus where \
collection_name in ("Eng-NA", "Eng-UK") and data_source = "CHILDES"',
    db_version = "2020.1")
corpora

R[write to console]: Using current database version: '2020.1'.



Unnamed: 0,id,name,collection_name,data_source,collection_id
1,32,Garvey,Eng-NA,CHILDES,2
2,33,Valian,Eng-NA,CHILDES,2
3,34,Bernstein,Eng-NA,CHILDES,2
4,35,Clark,Eng-NA,CHILDES,2
5,36,PetersonMcCabe,Eng-NA,CHILDES,2
...,...,...,...,...,...
57,221,Wells,Eng-UK,CHILDES,12
58,222,Gathburn,Eng-UK,CHILDES,12
59,223,Nuffield,Eng-UK,CHILDES,12
60,224,Lara,Eng-UK,CHILDES,12


In [5]:
# Comma-separated collection ids of the selected corpora; this string is spliced
# into the `collection_id in (...)` clause of the utterance query.
# NOTE(review): there is one entry per corpus row, so the same collection id is
# repeated many times, and the filter is on collection rather than on corpus id --
# confirm that restricting by collection (not by `corpora.id`) is intended.
childes_datasets = ",".join(map(str, corpora.collection_id))

In [5]:
# Local CSV cache of the utterance query, so the database is only hit when needed.
utt_glosses_csv = 'csv/utt_glosses.csv'

# Set to True to force a fresh pull from the database and overwrite the cache.
regenerate = False

# Query the database when explicitly requested, or when no cache exists yet
# (e.g. on a fresh checkout) -- without this fallback read_csv would fail
# because the cache file is missing.
if regenerate or not exists(utt_glosses_csv):
    # The backslash continuations are inside the string literal, so the whole
    # query (including the continuation lines' leading spaces) is one line of SQL.
    utt_glosses = childespy.get_sql_query('select gloss, transcript_id, id, \
    utterance_order, speaker_code, target_child_name, corpus_name, target_child_age, type from utterance where collection_name in ("Eng-NA", "Eng-UK") \
    and collection_id in ('+childes_datasets+') and speaker_code in ("MOT", "FAT","CHI")' , db_version = "2020.1")
    # Make sure the cache directory exists before writing into it.
    os.makedirs(os.path.dirname(utt_glosses_csv), exist_ok=True)
    utt_glosses.to_csv(utt_glosses_csv, index=False)
else: 
    # NOTE(review): the output recorded under this cell looks like the tail of a
    # pandas warning; if it is a mixed-dtype warning, consider passing an explicit
    # dtype= mapping here -- confirm against a fresh run.
    utt_glosses = pd.read_csv(utt_glosses_csv)

  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
# Sanity check: display the column index of the loaded utterance table.
utt_glosses.columns

Index(['gloss', 'transcript_id', 'id', 'utterance_order', 'speaker_code',
       'target_child_name', 'corpus_name', 'target_child_age', 'type'],
      dtype='object')

In [10]:
# Drop errors for age and all splits.
# This rebinds utt_glosses, so both the "all" and the "age" split below start
# from the cleaned frame.
# NOTE(review): what counts as an "error" is decided inside
# utils.data_cleaning.drop_errors, which is not visible here.
utt_glosses = data_cleaning.drop_errors(utt_glosses)

## "all" split

In [22]:

# Work on a copy: important because split_gen functions often mutate, not copy.
all_utt_glosses = utt_glosses.copy()
# Run split generation for split 'all' / dataset 'all'; only the first return
# value is kept.
all_split_df, _ = split_gen.exec_split_gen(all_utt_glosses, 'all', 'all')

# Expectations for the verbose output, taken from the output recorded below this cell:
#   "Cell 232 output" should be (4205071, 9)
#   "Cell 233 output" should be (3959952, 10)
# NOTE(review): an earlier version of this comment listed 7 columns for both
# (plus "two extra fields"); the recorded run shows 9 and 10 -- confirm which is current.

# The partition markings are saved as well (next cell), for verification purposes.

Beginning split gen call: all all
Cell 232 output (4205071, 9)
Cell 233 output (3959952, 10)
declarative                   2828187
question                       903625
imperative_emphatic            114118
trail off                       63230
interruption                    16870
missing CA terminator           14430
self interruption               11307
quotation next line              5903
interruption question             877
quotation precedes                756
trail off question                504
self interruption question        125
broken for coding                  17
no break TCU continuation           3
Name: type, dtype: int64
Cell 238 gloss                now we need this
transcript_id                    3261
id                             279663
utterance_order                     1
speaker_code                      CHI
target_child_name                 NaN
corpus_name                    Garvey
target_child_age                  NaN
type                        trail off

In [23]:
# Persist the "all" data pool as a pickle inside its split folder.
this_split_path = split_gen.get_split_folder(
    'all', 'all', config.data_dir
)
all_pool_pickle_path = join(this_split_path, 'data_pool_with_phases.pkl')
all_split_df.to_pickle(all_pool_pickle_path)

## "age" split

In [18]:
# Development convenience: re-import utils.split_gen so that edits to the module
# are picked up without restarting the kernel.
# NOTE(review): presumably unnecessary on a clean "Restart & Run All" -- confirm
# before removing.
import importlib
importlib.reload(split_gen)

<module 'utils.split_gen' from '/home/nwong/chompsky/childes/child_listening_continuation/child-directed-listening/utils/split_gen.py'>

In [20]:

# Partition a copy of the cleaned utterances into a "young" and an "old" frame,
# passing config.age_split as the `months` argument.
# NOTE(review): which side the boundary age itself falls on is decided inside
# get_age_split_data -- verify there.
young_df, old_df = split_gen.get_age_split_data(utt_glosses.copy(), months = config.age_split)

# Run split generation once per age group (split 'age', datasets 'young' / 'old');
# only the first return value of each call is kept.
cleaned_young_split_df, _ = split_gen.exec_split_gen(young_df, 'age', 'young')
cleaned_old_split_df, _ = split_gen.exec_split_gen(old_df, 'age', 'old')


Beginning split gen call: age young
Cell 232 output (2703798, 9)
Cell 233 output (2546695, 10)
declarative                   1836671
question                       584173
imperative_emphatic             67718
trail off                       28060
missing CA terminator           11639
interruption                     8173
self interruption                6986
quotation next line              1816
interruption question             643
quotation precedes                379
trail off question                374
self interruption question         52
broken for coding                   8
no break TCU continuation           3
Name: type, dtype: int64
Cell 238 gloss                            Child's
transcript_id                       3306
id                                290276
utterance_order                        2
speaker_code                         MOT
target_child_name                    NaN
corpus_name                       Valian
target_child_age                763.9375
type       

In [24]:

# Persist each cleaned age pool as a pickle inside its own split folder.
age_pools = (
    ('young', cleaned_young_split_df),
    ('old', cleaned_old_split_df),
)
for name, this_data in age_pools:
    this_split_path = split_gen.get_split_folder('age', name, config.data_dir)
    pool_pickle_path = join(this_split_path, 'data_pool_with_phases.pkl')
    this_data.to_pickle(pool_pickle_path)
    

## Write chi_token_freq dataframes

In [25]:

# (split, dataset) pairs whose data pools are read back from disk below.
model_args = [('all', 'all'), ('age', 'young'), ('age', 'old')]

for split, dataset in model_args:
    # Load the pickled pool for this split/dataset from its split folder.
    this_split_path = split_gen.get_split_folder(split, dataset, config.data_dir)
    pool_pickle_path = join(this_split_path, 'data_pool_with_phases.pkl')
    this_pool = pd.read_pickle(pool_pickle_path)

    # Only the 'train' phase data is handed to save_chi_vocab.
    this_pool_train = split_gen.find_phase_data('train', this_pool)
    res_freq = split_gen.save_chi_vocab(this_pool_train, split, dataset)
    