In [9]:
import pandas as pd
import childespy
import numpy as np

import os
from os.path import join, exists
np.random.seed(0)

# Important: Run this cell only once per "restart runtime"
# for reproducibility of random seed.

In [2]:
from utils import data_gen

 ## General description of splits (this notebook addresses the "all" and "age" splits)
 
 **All split**
    
- The normal train and validation from the original query on CHILDES in "Generate data to fine-tune" notebook.
- Train, Val chosen 80/20 randomly using unique transcript IDs only.
- Doesn't have xxx or yyy.

**Age split**

- Split on <=36 months for young, and rest for old.
- Then apply the same process as the "all" split to each age group.
- Doesn't have xxx or yyy.

**Child split**
- Split on 6 children as found in Generate Phonological Analysis. See notebook "Generate child data for finetuning".
- Has 200 yyy and 200 successes for validation, randomly select from UNIQUE TRANSCRIPT IDS ONLY (you need to ensure this in the code). The rest is train.
- Doesn't have xxx.

# Generate Train, Val for All and Age splits

In [2]:
#get all of the North American and British English adult and child utterances without xxx or yyy
#concatenate them at the transcript level
#hold out 20% for validation 

In [3]:
# Look up every CHILDES corpus in the North American and British English collections.
# db_version is pinned to the same release the utterance query in this notebook uses,
# so re-running against a newer childes-db release cannot silently change the corpus list.
corpora = childespy.get_sql_query('select * from corpus where \
collection_name in ("Eng-NA", "Eng-UK") and data_source = "CHILDES"', db_version = "2020.1")
corpora

R[write to console]: Using current database version: '2020.1'.



Unnamed: 0,id,name,collection_name,data_source,collection_id
1,32,Garvey,Eng-NA,CHILDES,2
2,33,Valian,Eng-NA,CHILDES,2
3,34,Bernstein,Eng-NA,CHILDES,2
4,35,Clark,Eng-NA,CHILDES,2
5,36,PetersonMcCabe,Eng-NA,CHILDES,2
...,...,...,...,...,...
57,221,Wells,Eng-UK,CHILDES,12
58,222,Gathburn,Eng-UK,CHILDES,12
59,223,Nuffield,Eng-UK,CHILDES,12
60,224,Lara,Eng-UK,CHILDES,12


In [4]:
# Comma-separated collection ids, one entry per corpus row, for use inside a SQL IN (...) clause.
childes_datasets = ",".join(map(str, corpora.collection_id))

In [5]:
# Set regenerate to True to re-query childes-db; otherwise the cached CSV is loaded.
regenerate = False #True
if regenerate:
    # Create the cache directory before the (slow) query so that a missing
    # folder cannot make to_csv fail after the data has already been fetched.
    os.makedirs('csv', exist_ok = True)
    utt_glosses = childespy.get_sql_query('select gloss, transcript_id, id, \
    utterance_order, speaker_code, target_child_name, target_child_age, type from utterance where collection_name in ("Eng-NA", "Eng-UK") \
    and collection_id in ('+childes_datasets+') and speaker_code in ("MOT", "FAT","CHI")' , db_version = "2020.1")
    utt_glosses.to_csv('csv/utt_glosses.csv', index=False)
else: 
    utt_glosses = pd.read_csv('csv/utt_glosses.csv')

In [6]:
import importlib
from utils import data_gen
importlib.reload(data_gen)

<module 'utils.data_gen' from '/home/nwong/chompsky/childes/child_listening_continuation/child-directed-listening/utils/data_gen.py'>

In [17]:
def exec_split_gen(raw_data, split_name, dataset_name, base_dir, verbose = False):
    """
    Clean a set of raw utterances, save its vocabulary, and write its train/val split.

    Args:
        raw_data: utterance DataFrame to process. It is handed directly to
            data_gen.prep_utt_glosses_for_split; pass a copy if the caller
            still needs the original (data_gen helpers reportedly mutate inputs).
        split_name: name of the split (e.g. 'all', 'age'), forwarded to data_gen.
        dataset_name: name of the dataset within the split (e.g. 'all', 'young', 'old'),
            forwarded to data_gen.
        base_dir: root output directory; created if it does not exist.
        verbose: accepted for interface compatibility. NOTE(review): it is not
            forwarded -- cleaning always runs with verbose = True so the shape
            diagnostics this notebook checks keep being printed.

    Returns:
        (split_glosses_df, tok_freq, chi_tok_freq) as returned by
        data_gen.split_glosses_shuffle and data_gen.save_vocab.
    """

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(base_dir, exist_ok = True)

    print('Beginning split gen call:', split_name, dataset_name)

    fill_punct_val = None # For behavior of finetune notebook.

    # Clean the data that was passed in, not the notebook-level utt_glosses global,
    # so that subsets (e.g. the young/old age frames) are actually what gets processed.
    cleaned_utt_glosses = data_gen.prep_utt_glosses_for_split(raw_data, fill_punct_val = fill_punct_val,
                                                              verbose = True)

    # Be careful of saving cleaned_utt_glosses -- it will convert the 'tokens' attribute to a string,
    # so it can't be used directly with save_vocab or token frequencies afterwards.

    # Use the caller's split/dataset names so each split is written under its own
    # identifiers instead of always being written as 'all'/'all'.
    tok_freq, chi_tok_freq = data_gen.save_vocab(cleaned_utt_glosses, split_name, dataset_name, base_dir)

    # NOTE(review): val_ratio = 0.8 while the notebook describes an 80/20 train/val
    # split -- confirm how data_gen.split_glosses_shuffle interprets this value.
    split_glosses_df = data_gen.split_glosses_shuffle(cleaned_utt_glosses, split_name, dataset_name, base_dir, val_ratio = 0.8)

    return split_glosses_df, tok_freq, chi_tok_freq
    
    

## "all" split

In [21]:
# For now maintain in separate cells because you want to check things, can merge into a single dictionary later.
# This is kind of slow -- how to monitor progress? Probably because of all of the copies

import importlib
importlib.reload(data_gen)

# Root folder that receives every artefact generated by this notebook.
base_path = 'refactored'

# Hand the pipeline its own copy: data_gen functions often mutate their input.
all_utt_glosses = utt_glosses.copy()
all_split_df, _, _ = exec_split_gen(
    raw_data = all_utt_glosses,
    split_name = 'all',
    dataset_name = 'all',
    base_dir = base_path,
)

# expectations for verbose output.
# 232 should be (4205071, 7) (because you added two extra fields)
# 233 likewise is (3959952, 7)

Beginning split gen call: all all
Cell 232 output (4205071, 7)
Cell 233 output (3959952, 7)


KeyboardInterrupt: 

## "age" split

In [None]:
# Split the original utterances by age first. 
# Split point is 36 months

# Cut-off between "young" and "old": 36 months, converted at 30.5 days/month.
# This implies target_child_age is in days; 30.5 days/month is the convention
# used in the original Generalized Phonological analysis.
age_cutoff = 36 * 30.5

# True for utterances at or below the cut-off. A missing age compares False,
# so such rows end up in old_df.
mask = utt_glosses['target_child_age'].le(age_cutoff)

young_df = utt_glosses.loc[mask].copy()
old_df = utt_glosses.loc[~mask].copy()


In [None]:
#young_split_df, _, _ = exec_split_gen(young_df, 'age', 'young', base_path)
#old_split_df, _, _ = exec_split_gen(old_df, 'age', 'old', base_path)

## Quick checks for consistency with the full notebook (on "all" split)

In [56]:

# The fix gloss should now match?

import os
from os.path import join, exists

# Need to check where are the differences in pandas dataframes?

# Vocabulary files written by this notebook for the "all" split.
retrieve_path = data_gen.get_split_folder('all', 'all', base_path)
tok, chi_tok = (
    pd.read_csv(join(retrieve_path, vocab_file))
    for vocab_file in ('vocab.csv', 'chi_vocab.csv')
)

# This was definitely re-run from scratch. If the comparison fails, check the
# vocab.csv code: what is the generation of the tokens like?

# Reference vocabulary files saved by the original notebook in your run.
orig_tok, orig_chi_tok = (
    pd.read_csv(reference_file)
    for reference_file in ('data/vocab.csv', 'data/chi_vocab.csv')
)


In [57]:

nan_filler = 'is_an_nan_need_to_fill'

def _sorted_vocab(vocab_df):
    """Return vocab_df ordered by 'word', re-indexed from 0, with NaNs replaced by nan_filler."""
    return vocab_df.sort_values('word').reset_index(drop = True).fillna(nan_filler)

orig_tok_sorted = _sorted_vocab(orig_tok)
tok_sorted = _sorted_vocab(tok)

# Compare the 'word' and 'count' columns explicitly; the old mid-cell .equals() call
# discarded its result and is removed. Check lengths first, because element-wise
# comparison of Series of different sizes raises instead of reporting a clean failure.
assert len(orig_tok_sorted) == len(tok_sorted), \
    'vocab sizes differ: {} vs {}'.format(len(orig_tok_sorted), len(tok_sorted))
assert (orig_tok_sorted['word'] == tok_sorted['word']).all(), 'vocab words differ'
assert (orig_tok_sorted['count'] == tok_sorted['count']).all(), 'vocab counts differ'

# Last expression of the cell, so the preview is actually displayed.
orig_tok_sorted.head()

# Note that if you compare the dataframes directly, like:
# print(orig_tok_sorted[orig_tok_sorted != tok_sorted])
# print(tok_sorted[orig_tok_sorted != tok_sorted])
# This is not going to work because of the Unnamed: 0 column. But the actual contents are the same.

In [58]:
# Checking if the dataframes are the same after the first cleaning

base_path = 'refactored'

my_out = pd.read_csv(join(base_path, 'cleaned_utt_glosses_my.csv'))
his_out = pd.read_csv('data/cleaned_utt_glosses_meylan.csv')

# sort_values returns a new frame, so the result has to be assigned -- otherwise
# the frames are compared in file order. The index is reset so the element-wise
# comparisons below line up row by row.
my_out = my_out.sort_values('id').reset_index(drop = True)
his_out = his_out.sort_values('id').reset_index(drop = True)

# NaN != NaN, so fill missing values with a sentinel before comparing.
filler = 'fill_this_nan______'

my_out = my_out.fillna(filler)
his_out = his_out.fillna(filler)

# Element-wise comparison needs frames of equal length.
assert len(my_out) == len(his_out), \
    'row counts differ: {} vs {}'.format(len(my_out), len(his_out))

for field in my_out.columns:
    # Only columns present in both outputs can be compared.
    if field not in his_out.columns: continue
    differs = my_out[field] != his_out[field]
    if differs.any():
        print(field)
        print(my_out[differs])
        break # Only the first differing column is reported.
        
    # Seems like they are actually the same
    # So long as "NaN" glosses are filled with some value to be equivalent.