In [1]:
# 6/21/21 Adapted from the original Generalized Phonological Comparison file from Dr. Meylan

In [2]:
%load_ext rpy2.ipython
import rpy2.robjects.lib.ggplot2 as ggplot2
import childespy
import numpy as np
import os
import imp
import pandas as pd
import Levenshtein
import itertools

In [3]:
regenerate = False
phono_data_path = 'csv/phono_data.csv'

# Load token-level data into `phono`. childes-db is queried only when there is no
# local cache at `phono_data_path` or when `regenerate` is True; otherwise the cached
# CSV is read. The query takes a long time to run.
#
# This query is NOT the same as the original Generalized Phonology one:
#   - it adds the restriction that speaker code must be in MOT, FAT, or CHI
#   - it removes the restriction on non-empty phonologies
#   - fields are added and removed so that they match those in
#     "Generate data to fine-tune a BERT model"
#
# TODO: utterance_order and type are not available in the phono data? Why?

if (not os.path.exists(phono_data_path)) or regenerate:

    # 'id' seems to be automatically present
    # Removed the "type" and "utterance order" arguments.
    # actual_phonology and model_phonology are selected explicitly because the next
    # cell filters on them; utterance_id is selected as well so that a regenerated
    # cache has the same columns as the cached frame displayed further down.

    phono = childespy.get_sql_query('select gloss, target_child_name, target_child_age, \
    speaker_code, actual_phonology, model_phonology, transcript_id, utterance_id, \
    token_order, corpus_name, collection_name, language from token where speaker_code in ("MOT", "FAT","CHI")', db_version = "2020.1")

    # cache this phonological information locally (create the folder on first run,
    # otherwise to_csv fails when the directory is missing)
    cache_dir = os.path.dirname(phono_data_path)
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)
    phono.to_csv(phono_data_path, index=False)
else:
    # Only the empty string is read back as missing; other default NA spellings are
    # kept as literal text because keep_default_na is disabled.
    phono = pd.read_csv(phono_data_path, keep_default_na=False, na_values=[''])

In [7]:
# Drop every token whose model or actual phonology is one of the codes in `excludes`
# (presumably "not set" / pause placeholders -- confirm against the transcription manual).
excludes = ['*','(.)','(..)', '(...)','(....)','(.....)']
model_excluded = phono.model_phonology.isin(excludes)
actual_excluded = phono.actual_phonology.isin(excludes)
phono = phono.loc[~(model_excluded | actual_excluded)]

print(phono.shape)

# Keep only tokens spoken by the target child while target_child_age is below 365*5
is_child_speaker = phono.speaker_code == 'CHI'
is_below_age_cutoff = phono.target_child_age < (365*5)
chi_phono = phono.loc[is_child_speaker & is_below_age_cutoff]

(914135, 12)


In [8]:


# Write the Eng-NA child tokens whose target_child_age exceeds 40*30.5 to a CSV
# (presumably ages are in days, making the cutoff 40 months -- confirm).
in_eng_na = chi_phono.collection_name == "Eng-NA"
above_age_cutoff = chi_phono.target_child_age > 40*30.5
eng_na_over_cutoff = chi_phono.loc[in_eng_na & above_age_cutoff]
eng_na_over_cutoff.to_csv('csv/EngNAOver40months.csv', index=False)
# not a model mismatch
# not a model mismatch

In [9]:

# Restrict the child tokens to the 'Eng-NA' collection and preview the first rows
in_eng_na_collection = chi_phono.collection_name.eq('Eng-NA')
en_chi_phono = chi_phono.loc[in_eng_na_collection]
en_chi_phono.head()


Unnamed: 0,gloss,target_child_name,target_child_age,speaker_code,actual_phonology,model_phonology,transcript_id,utterance_id,token_order,corpus_name,collection_name,language
162470,yyy,Naima,609.75,CHI,hɛhɪ,*vv,42409,17015713,1,Providence,Eng-NA,eng
162478,I,Julia,693.625,CHI,ə,a,41544,16650674,1,Goad,Eng-NA,eng
162479,a,Sonya,826.8125,CHI,ðæ,a,41587,16659604,1,Goad,Eng-NA,eng
162480,I,Julia,1192.0625,CHI,aj,a,41569,16658519,1,Goad,Eng-NA,eng
162481,I,Julia,1192.0625,CHI,aj,a,41569,16659995,1,Goad,Eng-NA,eng


## Clean the data using "Generate finetuned" notebook code

In [26]:
# Select which children to analyze -- I select the children with most data available

# How many of the most data-rich children to keep; also controls where the separator
# line is printed in the ranking below.
num_children = 6

# Defined here so this cell runs on a fresh kernel; previously it relied on
# `child_names` from a cell that appears further down the notebook.
child_names = list(set(en_chi_phono['target_child_name']))

# Maps child name -> number of tokens whose gloss contains neither 'xxx' nor 'yyy'
child_valid_samples = {}

for child in child_names:
    tcd = en_chi_phono[en_chi_phono['target_child_name'] == child].copy()
    # str(x) guards against non-string glosses (e.g. missing values)
    tcd['contains_error'] = ['xxx' in str(x) or 'yyy' in str(x) for x in tcd.gloss]

    tcd = tcd.loc[~tcd.contains_error]
    child_valid_samples[child] = tcd.shape[0]

# Child names ordered from most to fewest valid samples
child_freq_sorted = sorted(child_valid_samples, key = lambda k : child_valid_samples[k])[::-1]

for idx, c in enumerate(child_freq_sorted):
    print(c, 'samples', child_valid_samples[c])
    # Separator after the last selected child
    if idx == num_children - 1: print('-----'*20)

select_children = child_freq_sorted[:num_children]

print('Select:', select_children)
    

Naima samples 98585
Lily samples 74467
Alex samples 37041
William samples 30014
Ethan samples 27280
Violet samples 26349
----------------------------------------------------------------------------------------------------
Julia samples 23112
Trevor samples 14029
Sean samples 8741
Sonya samples 7781
E samples 3332
T42 samples 147
Select: ['Naima', 'Lily', 'Alex', 'William', 'Ethan', 'Violet']


In [None]:
# Import the project-local utils.data_gen module and reload it, so that edits made to
# the module after it was first imported in this kernel are picked up.
import importlib
from utils import data_gen
importlib.reload(data_gen)

In [None]:

# Run data_gen.exec_split_gen once for every child present in en_chi_phono, passing
# that child's rows, the literal 'child', the child's name and the base directory.
this_base_dir = 'data/child'
child_names = list(set(en_chi_phono['target_child_name']))

# NOTE(review): this iterates over all children, not the `select_children` subset
# chosen in the earlier cell -- confirm that is intended.
for child in child_names:
    
    # Copy so downstream processing does not operate on a view of en_chi_phono
    child_data = en_chi_phono[en_chi_phono['target_child_name'] == child].copy()
    
    # Need to select 400 successes as val for now -- this is done internally in exec_split_gen
    # Will save their training/validation txt as expected
    # The three result names are rebound on every iteration.
    this_split_glosses_df, this_tok_freq, this_chi_tok_freq = data_gen.exec_split_gen(child_data, 'child', child,
                                                                                      this_base_dir, verbose = False)
    