In [3]:
import pandas as pd
import childespy
import numpy as np

import os
from os.path import join, exists
import config

# Seed NumPy's global random state with the seed defined in config.
np.random.seed(config.SEED)

# Important: run this cell only once per kernel/runtime restart.
# Re-running it resets the random state mid-session, so later cells would no
# longer see the same random sequence as a fresh top-to-bottom run.

In [4]:
from utils import split_gen, data_cleaning, load_splits

# Generate train and validation data for the "all" and "age" splits

In [3]:
# Get all of the North American and British English adult and child utterances without xxx or yyy,
# concatenate them at the transcript level,
# and hold out 20% for validation.

In [4]:
# List every corpus from the CHILDES data source in the "Eng-NA" and "Eng-UK"
# collections. db_version is pinned to the same release ("2020.1") that the
# utterance query in this notebook requests, so both queries read one release
# even after the "current" database version moves on.
corpora = childespy.get_sql_query(
    'select * from corpus where '
    'collection_name in ("Eng-NA", "Eng-UK") and data_source = "CHILDES"',
    db_version="2020.1",
)
corpora

R[write to console]: Using current database version: '2020.1'.



Unnamed: 0,id,name,collection_name,data_source,collection_id
1,32,Garvey,Eng-NA,CHILDES,2
2,33,Valian,Eng-NA,CHILDES,2
3,34,Bernstein,Eng-NA,CHILDES,2
4,35,Clark,Eng-NA,CHILDES,2
5,36,PetersonMcCabe,Eng-NA,CHILDES,2
...,...,...,...,...,...
57,221,Wells,Eng-UK,CHILDES,12
58,222,Gathburn,Eng-UK,CHILDES,12
59,223,Nuffield,Eng-UK,CHILDES,12
60,224,Lara,Eng-UK,CHILDES,12


In [5]:
# Comma-separated collection ids for the SQL `in (...)` filter of the utterance query.
# `corpora` has one row per corpus, so its collection_id column repeats the same
# id for every corpus of a collection; list each id only once.
childes_datasets = ",".join(str(x) for x in corpora.collection_id.unique())

In [5]:
# Raw utterance table, cached on disk. Set `regenerate = True` to force a fresh
# database query; the query also runs when the cache file does not exist yet,
# so a fresh checkout does not fail with FileNotFoundError.
utt_glosses_path = 'csv/utt_glosses.csv'
regenerate = False

if regenerate or not exists(utt_glosses_path):
    # Pull the MOT / FAT / CHI utterances of the selected collections from
    # database release 2020.1.
    utt_glosses = childespy.get_sql_query(
        'select gloss, transcript_id, id, '
        '    utterance_order, speaker_code, target_child_name, corpus_name, target_child_age, type from utterance where collection_name in ("Eng-NA", "Eng-UK") '
        '    and collection_id in (' + childes_datasets + ') and speaker_code in ("MOT", "FAT","CHI")',
        db_version="2020.1",
    )
    # Make sure the cache directory exists before writing into it.
    os.makedirs(os.path.dirname(utt_glosses_path), exist_ok=True)
    utt_glosses.to_csv(utt_glosses_path, index=False)
else:
    utt_glosses = pd.read_csv(utt_glosses_path)

  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
# Show which columns the loaded utterance table carries.
utt_glosses.columns

Index(['gloss', 'transcript_id', 'id', 'utterance_order', 'speaker_code',
       'target_child_name', 'corpus_name', 'target_child_age', 'type'],
      dtype='object')

In [10]:
# Run data_cleaning.drop_errors once, before any split is built, so that the
# "all" and "age" splits start from the same filtered table.
# NOTE(review): what counts as an "error" is decided inside data_cleaning and is
# not visible in this notebook.
utt_glosses = data_cleaning.drop_errors(utt_glosses) # Shared input for both the age and all splits.

## "all" split

In [78]:

# Build the "all" split; the call writes the train/val text files for it.
all_utt_glosses = utt_glosses.copy() # Work on a copy: split_gen functions often mutate their input rather than copying it.
all_split_df, _, _ = split_gen.exec_split_gen(all_utt_glosses, 'all', 'all')

# Expectations for the verbose output (notes kept from the original run):
# 232 should be (4205071, 7) (because two extra fields were added)
# 233 likewise is (3959952, 7)
# NOTE(review): "232" / "233" are presumably line numbers or step ids of the
# verbose output -- confirm what they refer to.

# Save the partition markings as well, for verification purposes.

Beginning split gen call: all all
File written to data/new_splits/all/all/train.txt
File written to data/new_splits/all/all/val.txt


In [80]:
# Pickle the "all" data pool into the folder that get_split_folder returns for
# this split, under the file name 'data_pool_with_phases.pkl'.
this_split_path = split_gen.get_split_folder('all', 'all', config.data_dir)
all_split_df.to_pickle(join(this_split_path, 'data_pool_with_phases.pkl'))

## "age" split

In [79]:

# Partition the utterances into a "young" and an "old" pool, using
# config.age_split as the `months` argument.
# Each pool is copied before it is handed to exec_split_gen: that call assigns
# new columns on the frame it receives, and doing so on a slice of another
# DataFrame triggers pandas' SettingWithCopyWarning.
young_df, old_df = (
    pool.copy()
    for pool in split_gen.get_age_split_data(utt_glosses.copy(), months = config.age_split)
)

cleaned_young_split_df, young_tok, young_chi_tok = split_gen.exec_split_gen(young_df, 'age', 'young')
cleaned_old_split_df, old_tok, old_chi_tok = split_gen.exec_split_gen(old_df, 'age', 'old')


Beginning split gen call: age young


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['gloss_with_punct'] = [x['speaker_code_simple'] + ' '+ x['gloss'].lower() + x['punct'] for x in data.to_dict('records')]


File written to data/new_splits/age/young/train.txt
File written to data/new_splits/age/young/val.txt
Beginning split gen call: age old
File written to data/new_splits/age/old/train.txt
File written to data/new_splits/age/old/val.txt


In [81]:

# Pickle each cleaned age pool into the folder of its own split.
age_pools = (('young', cleaned_young_split_df), ('old', cleaned_old_split_df))
for name, this_data in age_pools:
    this_split_path = split_gen.get_split_folder('age', name, config.data_dir)
    pool_pickle_path = join(this_split_path, 'data_pool_with_phases.pkl')
    this_data.to_pickle(pool_pickle_path)
    

## Write chi_token_freq dataframes

In [106]:

# For every (split, dataset) pair: reload the pickled data pool, keep its
# 'train' phase, and pass that to split_gen.save_chi_vocab.
model_args = [('all', 'all'), ('age', 'young'), ('age', 'old')]

for split, dataset in model_args:
    this_split_path = split_gen.get_split_folder(split, dataset, config.data_dir)
    pool_pickle_path = join(this_split_path, 'data_pool_with_phases.pkl')

    this_pool = pd.read_pickle(pool_pickle_path)
    this_pool_train = split_gen.find_phase_data('train', this_pool)

    res_freq = split_gen.save_chi_vocab(this_pool_train, split, dataset)
    

## Additional checks to consider writing -- no xxx, yyy in the resulting pools/text files

## Verifications: quick checks for consistency with the full notebook (on the "all" split) -- TODO: revisit these, or verify some other way

In [None]:
# Note: a partition is referred to as a "phase" throughout, to avoid confusion.

In [None]:

# The fix gloss should now match?
# Loads the vocab tables found in the "all" split folder and the ones saved by
# the original notebook, so that later cells can compare them.
# NOTE(review): in the visible part of this notebook `base_path` is only
# assigned in a later verification cell and `load_csvs` is never imported, so
# this cell raises NameError on a fresh Restart & Run All -- confirm the
# intended base path and import before relying on it.

import os
from os.path import join, exists

# Vocab files of the "all" split under base_path.
retrieve_path = split_gen.get_split_folder('all', 'all', base_path)
tok = load_csvs.load_csv_with_lists(join(retrieve_path, 'vocab.csv'))
chi_tok = load_csvs.load_csv_with_lists(join(retrieve_path, 'chi_vocab.csv'))


# Reference vocab files to compare against.
orig_tok = load_csvs.load_csv_with_lists('data/vocab.csv') # Saved by the original notebook in your run.
orig_chi_tok = load_csvs.load_csv_with_lists('data/chi_vocab.csv')


In [57]:

# Placeholder for missing values: NaN never compares equal to NaN, so both
# tables get the same filler before they are compared.
nan_filler = 'is_an_nan_need_to_fill'


def _sorted_vocab(vocab_df):
    """Return vocab_df ordered by 'word', re-indexed from 0, with NaNs replaced by nan_filler."""
    return vocab_df.sort_values('word').reset_index(drop = True).fillna(nan_filler)


orig_tok_sorted = _sorted_vocab(orig_tok)
tok_sorted = _sorted_vocab(tok)

# Compare the columns as plain lists: a difference in length then fails the
# assertion with a readable message instead of raising a pandas ValueError.
assert orig_tok_sorted['word'].tolist() == tok_sorted['word'].tolist(), \
    "vocab 'word' column differs from the original notebook's vocab"
assert orig_tok_sorted['count'].tolist() == tok_sorted['count'].tolist(), \
    "vocab 'count' column differs from the original notebook's vocab"

# Note that if you compare the dataframes directly, like:
# print(orig_tok_sorted[orig_tok_sorted != tok_sorted])
# print(tok_sorted[orig_tok_sorted != tok_sorted])
# This is not going to work because of the Unnamed: 0 column. But the actual contents are the same.
# (For the same reason no whole-frame .equals() check is made here.)

In [58]:
# Checking if the dataframes are the same after the first cleaning

base_path = 'refactored'

# NOTE(review): `load_csvs` is not imported in the visible part of this
# notebook -- confirm where it comes from.
my_out = load_csvs.load_csv_with_lists(join(base_path, 'cleaned_utt_glosses_my.csv'))
his_out = load_csvs.load_csv_with_lists('data/cleaned_utt_glosses_meylan.csv')

# sort_values returns a new frame, so its result has to be assigned. The index
# is reset afterwards so that the column comparisons below pair the rows by
# their position in id order.
my_out = my_out.sort_values('id').reset_index(drop=True)
his_out = his_out.sort_values('id').reset_index(drop=True)

# NaN never compares equal to NaN; give both frames the same placeholder first.
filler = 'fill_this_nan______'

my_out = my_out.fillna(filler)
his_out = his_out.fillna(filler)

# Report the first shared column whose values differ, with the offending rows.
for field in my_out.columns:
    if field not in his_out.columns:
        continue
    mismatch = my_out[field] != his_out[field]
    if mismatch.any():
        print(field)
        print(my_out[mismatch])
        break

# Original observation: seems like they are actually the same,
# so long as "NaN" glosses are filled with some value to be equivalent.
# NOTE(review): that observation predates sorting both frames by id -- re-run to confirm.