In [1]:
# 6/21/21 Adapted from the original Generalized Phonological Comparison file from Dr. Meylan

In [2]:
%load_ext rpy2.ipython
import rpy2.robjects.lib.ggplot2 as ggplot2
import childespy
import numpy as np
import os
import imp
import pandas as pd
import Levenshtein
import itertools

SEED = 0
np.random.seed(SEED)

from utils import split_gen

In [10]:
# Note: I used regenerate = False for the refactor, because it took too long for the query to complete.

regenerate = True
phono_data_path = 'csv/phono_data.csv'

# This takes a long time to run

# This query is NOT the same as the original Generalized Phonology one
# It adds the restriction that speaker code must be in MOT, FAT, or CHI
# It removes the restriction on non-empty phonologies
# I also add and remove fields so that the fields match those in "Generate data to fine-tune a BERT model"

# utterance_order and type are not available in the phono data? Why?

if (not os.path.exists(phono_data_path)) or regenerate:
    
    # 'id' seems to be automatically present 
    # Removed the "type" and "utterance order" arguments.
    
    phono = childespy.get_sql_query('select gloss, target_child_name, type, utterance_order, target_child_age, \
    speaker_code, transcript_id, \
    token_order, corpus_name, collection_name, language from token where speaker_code in ("MOT", "FAT","CHI")', db_version = "2020.1")
    # cache this phonological information locally
    phono.to_csv(phono_data_path, index=False)
else:
    phono = pd.read_csv(phono_data_path, keep_default_na=False, na_values=[''])

R[write to console]: Using supported database version: '2020.1'.

R[write to console]: Error in .local(conn, statement, ...) : 
  could not run statement: Unknown column 'type' in 'field list'



RRuntimeError: Error in .local(conn, statement, ...) : 
  could not run statement: Unknown column 'type' in 'field list'


In [15]:
for name in ['Naima', 'Lily', 'Alex', 'William', 'Ethan', 'Violet']:
    print(phono[phono['target_child_name'] == name]['corpus_name'])

162470     Providence
174090     Providence
174091     Providence
174092     Providence
184210     Providence
              ...    
1075982    Providence
1075983    Providence
1076052    Providence
1076059    Providence
1076077    Providence
Name: corpus_name, Length: 98591, dtype: object
184149     Providence
184207     Providence
184208     Providence
184209     Providence
184295     Providence
              ...    
1076075    Providence
1076076    Providence
1076109    Providence
1076110    Providence
1076113    Providence
Name: corpus_name, Length: 74468, dtype: object
174060     Providence
174061     Providence
174062     Providence
174063     Providence
184140     Providence
              ...    
1076104    Providence
1076105    Providence
1076106    Providence
1076107    Providence
1076108    Providence
Name: corpus_name, Length: 37043, dtype: object
184150     Providence
184151     Providence
184152     Providence
184153     Providence
184154     Providence
              ...   

In [4]:
# remove cases where one is not set
excludes = ['*','(.)','(..)', '(...)','(....)','(.....)']
phono = phono.loc[~(phono.model_phonology.isin(excludes) |  phono.actual_phonology.isin(excludes))]

print(phono.shape)
chi_phono = phono.loc[(phono.speaker_code == 'CHI') & (phono.target_child_age < (365*5))]

(914135, 12)


In [5]:

chi_phono.loc[(chi_phono.collection_name == "Eng-NA") & (chi_phono.target_child_age > 40*30.5)]\
.to_csv('csv/EngNAOver40months.csv', index=False)
# not a model mismatch

In [6]:

en_chi_phono = chi_phono.loc[chi_phono.collection_name == 'Eng-NA']
en_chi_phono.head(5)


Unnamed: 0,gloss,target_child_name,target_child_age,speaker_code,actual_phonology,model_phonology,transcript_id,utterance_id,token_order,corpus_name,collection_name,language
162470,yyy,Naima,609.75,CHI,hɛhɪ,*vv,42409,17015713,1,Providence,Eng-NA,eng
162478,I,Julia,693.625,CHI,ə,a,41544,16650674,1,Goad,Eng-NA,eng
162479,a,Sonya,826.8125,CHI,ðæ,a,41587,16659604,1,Goad,Eng-NA,eng
162480,I,Julia,1192.0625,CHI,aj,a,41569,16658519,1,Goad,Eng-NA,eng
162481,I,Julia,1192.0625,CHI,aj,a,41569,16659995,1,Goad,Eng-NA,eng


## Clean the data using "Generate finetuned" notebook code

In [7]:
# Select which children to analyze -- I select the children with most data available

child_valid_samples = {}

for child in set(en_chi_phono['target_child_name']):
    tcd = en_chi_phono[en_chi_phono['target_child_name'] == child].copy()
    tcd['contains_error'] = ['xxx' in str(x) or 'yyy' in str(x) for x in tcd.gloss]
    
    tcd = tcd.loc[~tcd.contains_error]
    child_valid_samples[child] = tcd.shape[0]
    
child_freq_sorted = sorted(child_valid_samples, key = lambda k : child_valid_samples[k])[::-1]

for idx, c in enumerate(child_freq_sorted):
    print(c, 'samples', child_valid_samples[c])
    if idx == 5: print('-----'*20)

num_children = 6
select_children = child_freq_sorted[:num_children]

print('Select:', select_children)
    

Naima samples 98585
Lily samples 74467
Alex samples 37041
William samples 30014
Ethan samples 27280
Violet samples 26349
----------------------------------------------------------------------------------------------------
Julia samples 23112
Trevor samples 14029
Sean samples 8741
Sonya samples 7781
E samples 3332
T42 samples 147
Select: ['Naima', 'Lily', 'Alex', 'William', 'Ethan', 'Violet']


In [9]:


this_base_dir = 'data/new_splits'

if not os.path.exists(this_base_dir):
    os.makedirs(this_base_dir)
    
child_names = list(set(en_chi_phono['target_child_name']))

for child in select_children:
    
    child_data = en_chi_phono[en_chi_phono['target_child_name'] == child].copy()
    
    # Need to select 400 successes as val for now -- this is done internally in exec_split_gen
    # Will save their training/validation txt as expected
    
    this_split_glosses_df, this_tok_freq, this_chi_tok_freq = split_gen.exec_split_gen(child_data, 'child', child,
                                                                                      this_base_dir, verbose = False)
    

Beginning split gen call: child Naima


AttributeError: 'DataFrame' object has no attribute 'type'

## Confirmation

In [None]:
# Load all of the generated data properly

from utils import load_splits
child_data = load_splits.load_splits_folder_text('child', os.getcwd())


In [None]:
# 1) no communicative failures in the final text file, anywhere
# -- a communicative failure is the occurence of xxx or yyy anywhere in the dataset

# 2) all of the data composes to form the original dataset 
# 3) all of the data appears once in any dataset
# 4) all of the data within a child's dataset belongs to that child.