In [None]:
# 6/21/21 Adapted from the original Generalized Phonological Comparison file from Dr. Meylan

In [2]:
%load_ext rpy2.ipython
import rpy2.robjects.lib.ggplot2 as ggplot2
import childespy
import numpy as np
import os
import imp
import pandas as pd
import Levenshtein
import itertools

In [3]:
regenerate_cached_phono = False
phono_data_path = 'csv/phono_data.csv'


# This query is NOT the same as the original Generalized Phonology one
# It adds the restriction that speaker code must be in MOT, FAT, or CHI
# It removes the restriction on non-empty phonologies
# I also add and remove fields so that the fields match those in "Generate data to fine-tune a BERT model"

# Load `phono` from the local CSV cache when it exists; otherwise (or when
# regenerate_cached_phono is True) run the SQL query through childespy and
# rewrite the cache.
if (not os.path.exists(phono_data_path)) or regenerate_cached_phono:

    # 'id' seems to be automatically present

    # The query is assembled from adjacent string literals so that it is
    # explicitly terminated before the db_version keyword argument.
    phono = childespy.get_sql_query(
        'select gloss, target_child_name, target_child_age, type, '
        'speaker_code, transcript_id, utterance_order, '
        'token_order, corpus_name, collection_name, language '
        'from token where speaker_code in ("MOT", "FAT","CHI")',
        db_version = "2020.1")
    # cache this phonological information locally; make sure the target
    # directory exists first so the query result is not lost on a failed write
    os.makedirs(os.path.dirname(phono_data_path) or '.', exist_ok=True)
    phono.to_csv(phono_data_path, index=False)
else:
    # Only empty fields are parsed as NaN: with keep_default_na=False, literal
    # strings such as "NA" or "null" in the CSV are kept as strings.
    phono = pd.read_csv(phono_data_path, keep_default_na=False, na_values=[''])

R[write to console]: Using supported database version: '2020.1'.



In [4]:
# Values treated as "not set" for a phonology field.
excludes = ['*', '(.)', '(..)', '(...)', '(....)', '(.....)']
# Keep a row only when neither its model nor its actual phonology is one of them.
# NOTE(review): the SQL query earlier in this notebook does not select
# model_phonology / actual_phonology -- confirm the loaded data has these columns.
phono = phono.loc[
    ~phono.model_phonology.isin(excludes)
    & ~phono.actual_phonology.isin(excludes)
]

In [5]:
# Display the (rows, columns) dimensions of phono after the exclusion filter.
phono.shape

(914135, 12)

In [6]:
# Subset to tokens spoken by the target child (speaker_code 'CHI') whose
# target_child_age is below 365*5 (presumably days, i.e. under five years -- TODO confirm units).
chi_phono = phono[
    phono.speaker_code.eq('CHI')
    & phono.target_child_age.lt(365 * 5)
]

In [7]:
# Display the (rows, columns) dimensions of the child-speaker subset.
chi_phono.shape

(802885, 12)

In [19]:
# Write the Eng-NA child tokens with target_child_age above 40*30.5
# (presumably 40 months expressed in days -- TODO confirm units) to a CSV
# file for inspection.
(
    chi_phono
    .loc[
        chi_phono.collection_name.eq("Eng-NA")
        & chi_phono.target_child_age.gt(40 * 30.5)
    ]
    .to_csv('csv/EngNAOver40months.csv', index=False)
)
# not a model mismatch

In [21]:

# Restrict the child-speaker tokens to the 'Eng-NA' collection and preview
# the first five rows.
en_chi_phono = chi_phono[chi_phono.collection_name.eq('Eng-NA')]
en_chi_phono.head()


Unnamed: 0,gloss,target_child_name,target_child_age,speaker_code,actual_phonology,model_phonology,transcript_id,utterance_id,token_order,corpus_name,collection_name,language,levdist
162471,yyy,Naima,609.75,CHI,hɛhɪ,*vv,42409,17015713,1,Providence,Eng-NA,eng,1.333333
162479,I,Julia,693.625,CHI,ə,a,41544,16650674,1,Goad,Eng-NA,eng,1.0
162480,a,Sonya,826.8125,CHI,ðæ,a,41587,16659604,1,Goad,Eng-NA,eng,2.0
162481,I,Julia,1192.0625,CHI,aj,a,41569,16658519,1,Goad,Eng-NA,eng,1.0
162482,I,Julia,1192.0625,CHI,aj,a,41569,16659995,1,Goad,Eng-NA,eng,1.0


## Clean the data using "Generate finetuned" notebook code