## Prepares Providence data

In [1]:
# 7/22/21: https://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython

%load_ext autoreload
%autoreload 2

# end cite

In [2]:
%load_ext rpy2.ipython
import rpy2.robjects.lib.ggplot2 as ggplot2
import childespy
import numpy as np
import os
import imp
import pandas as pd
import transformers
import torch
import re
import unicodedata
import scipy.stats
import copy
from string import punctuation

from os.path import join, exists

import config
np.random.seed(config.SEED)

In [22]:
from utils import split_gen, sampling, data_cleaning, load_models, data_cleaning, transformers_bert_completions
from utils_child import child_split_gen, child_models

### Prepare and clean Providence data 

Corresponds to: 4 | Prep Utterances / Tokens for BERT,
    in the original notebook

In [4]:

# Get the index of the Providence corpus
pvd_idx = childespy.get_sql_query('select * from corpus where name = "Providence"').iloc[0]['id']

# Load utterances from the Providence corpus from childs-db

raw_utt_glosses_save_path = join(config.prov_csv_dir, 'pvd_utt_glosses.csv')
if config.regenerate:
    raw_utt_glosses = childespy.get_sql_query('select gloss, transcript_id, id, \
    utterance_order, speaker_code, actual_phonology, model_phonology, target_child_name, target_child_age, type from utterance where speaker_code in ("MOT", "FAT","CHI") and corpus_id = '+str(pvd_idx) ,
        db_version = "2020.1")
    raw_utt_glosses.to_csv(raw_utt_glosses_save_path, index=False)
else: 
    raw_utt_glosses = pd.read_csv(raw_utt_glosses_save_path)

R[write to console]: Using current database version: '2020.1'.

R[write to console]: Using supported database version: '2020.1'.



In [5]:
raw_utt_glosses[['gloss', 'actual_phonology', 'model_phonology']] # Utterance id level, then it's tokenized.

Unnamed: 0,gloss,actual_phonology,model_phonology
1,anywhere you'll feel comfortable um anywhere,,
2,please don't do that,,
3,this is,,
4,Mommy,ɑmɪ,mɑmiː
5,okay that's fine,,
...,...,...,...
445363,oh why lick hippo,o waɪ liʔ ɪ,oʊ waɪ lɪk hɪ
445364,hippo,hɪpo,hɪpoʊ
445365,xxx,,
445366,xxx la la xxx,* lɑ lɑ *,* lɑː lɑː *


In [6]:
# 7/26/21: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.rename.html
# for general function
# 7/27/21: https://stackoverflow.com/questions/46096307/alias-for-column-in-pandas
# for using columns keyword
raw_utt_glosses = raw_utt_glosses.rename(columns = {'id' : 'utterance_id'})
# end both cites

In [7]:

utt_glosses = data_cleaning.clean_glosses(raw_utt_glosses.copy())


declarative                   325325
question                       81758
imperative_emphatic            15404
trail off                      11924
self interruption               6443
interruption                    2784
self interruption question       807
trail off question               631
interruption question            285
quotation precedes                 3
question exclamation               2
broken for coding                  1
Name: type, dtype: int64
Cell 238 gloss                anywhere you'll feel comfortable um anywhere
transcript_id                                               42204
utterance_id                                             16759261
utterance_order                                                 2
speaker_code                                                  MOT
actual_phonology                                                 
model_phonology                                                  
target_child_name                                           

In [8]:
if config.verbose: print(utt_glosses[utt_glosses.utterance_id == 17280964])

                    gloss  transcript_id  utterance_id  utterance_order  \
445363  oh why lick hippo          42569      17280964              756   

       speaker_code actual_phonology model_phonology target_child_name  \
445363          CHI      o waɪ liʔ ɪ   oʊ waɪ lɪk hɪ           William   

        target_child_age               type punct speaker_code_simple  \
445363         1212.0625  self interruption     .               [CHI]   

                gloss_with_punct  
445363  [CHI] oh why lick hippo.  


### Build the Vocabulary

In [9]:

cmu_2syl_inchildes = load_models.get_cmu_dict_info()
initial_tokenizer = load_models.get_primary_tokenizer()

initial_vocab, cmu_in_initial_vocab = load_models.get_initial_vocab_info()

# confirm yyy treated as a separate character
assert initial_tokenizer.tokenize('this is a yyy.') == ['this', 'is', 'a', 'yyy', '.']

if config.verbose: print(cmu_in_initial_vocab.shape)
    

(7904, 8)


### Count successes/errors

In [10]:

def count_transmission_errors(utt_vector, error_codes):
    return(np.sum([x in error_codes for x in  utt_vector]))


In [11]:

# Only consider scoreable utterances from children,
# but here consider CGV utterances as well due to need to filter on finetune data.
# Therefore raw_success_utts and raw_yyy_utts refer to things with the appropriate number of errors

xxxs_per_utt = utt_glosses.groupby('utterance_id').gloss.agg(
    lambda x: count_transmission_errors(x, ['xxx'])).reset_index()

xxxs_per_utt.columns = ['utterance_id', 'num_xxx']
yyys_per_utt = utt_glosses.groupby('utterance_id').gloss.agg(
    lambda x: count_transmission_errors(x, ['yyy'])).reset_index()

yyys_per_utt.columns = ['utterance_id', 'num_yyy']

failures_per_utt = xxxs_per_utt.merge(yyys_per_utt)

raw_yyy_utts = failures_per_utt.loc[(failures_per_utt.num_xxx == 0) &  (failures_per_utt.num_yyy == 1)]

if config.verbose: print(raw_yyy_utts.shape)

raw_success_utts = failures_per_utt.loc[(failures_per_utt.num_xxx == 0) &  
    (failures_per_utt.num_yyy == 0)]

# Note this will not be the same as the previous printout because it's directly on all_tokens_phono.
if config.verbose: print(raw_success_utts.shape)
    

(19686, 3)
(399286, 3)


In [49]:
all_tokens_phono[all_tokens_phono.utterance_id.isin(yyy_utt_ids)] # what to do here?

Unnamed: 0,token,utterance_id,gloss,transcript_id,utterance_order,speaker_code,actual_phonology,model_phonology,target_child_name,target_child_age,...,model_phonology_clean,actual_phonology_clean,model_phonology_no_dia,actual_phonology_no_dia,cv_raw,cv_collapsed,num_vowels,in_vocab,success_token,yyy_token
54,[chi],16759363,yyy,42204,11,CHI,,,Alex,514.0000,...,,,,,,,,False,False,False
55,yyy,16759363,yyy,42204,11,CHI,ʌ,*,Alex,514.0000,...,*,ə,*,ə,v,v,1.0,False,False,True
56,.,16759363,yyy,42204,11,CHI,,,Alex,514.0000,...,,,,,,,,False,False,False
127,[chi],16759468,yyy,42206,1,CHI,,,Alex,543.4375,...,,,,,,,,False,False,False
128,yyy,16759468,yyy,42206,1,CHI,aʊ,*,Alex,543.4375,...,*,ɑʊ,*,ɑʊ,vv,v,1.0,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2991672,yyy,17280335,yyy,42569,723,CHI,iː,*,William,1212.0625,...,*,i,*,i,v,v,1.0,False,False,True
2991673,.,17280335,yyy,42569,723,CHI,,,William,1212.0625,...,,,,,,,,False,False,False
2991674,[chi],17280349,yyy,42569,724,CHI,,,William,1212.0625,...,,,,,,,,False,False,False
2991675,yyy,17280349,yyy,42569,724,CHI,iːɪːoː,*,William,1212.0625,...,*,iəoʊ,*,iəoʊ,vvvv,v,1.0,False,False,True


In [12]:

# build a dataframe of tokens 
# this is slow, because tokenization is slow

def inflate(row):
    tokens = initial_tokenizer.tokenize(row['gloss_with_punct'])
    return(pd.DataFrame({'token':tokens, 'utterance_id':row['utterance_id']}) )

all_tokens_save_path = join(config.prov_csv_dir, 'pvd_utt_glosses_inflated.csv')
if config.regenerate:
    inflate_all = []
    
    num_process = utt_glosses.to_dict('records')
    
    print(f"Number to process: {num_process}")
    for idx, x in enumerate(utt_glosses.to_dict('records')):
        percent_done = round(idx / len(num_process), 3)
        if idx % 5000 == 0 : print(f'{percent_done} done')
        inflate_all.append(inflate(x))
        
    all_tokens = pd.concat(inflate_all)
    all_tokens = all_tokens.merge(utt_glosses)
    
    all_tokens.to_csv(all_tokens_save_path)

else:
    all_tokens = pd.read_csv(all_tokens_save_path, na_filter=False)

if config.verbose: print(all_tokens.iloc[0:10])

# Assign a token_id (integer in the BERT vocabulary). 
# Because these are from the tokenized utterances, there is no correpsondence 
# with childes-db token ids
all_tokens['token_id'] = initial_tokenizer.convert_tokens_to_ids(all_tokens['token'])
# assigns utterances a 0-indexed index column


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



0.0 done
0.011 done
0.022 done
0.034 done
0.045 done
0.056 done
0.067 done
0.079 done
0.09 done
0.101 done
0.112 done
0.123 done
0.135 done
0.146 done
0.157 done
0.168 done
0.18 done
0.191 done
0.202 done
0.213 done
0.225 done
0.236 done
0.247 done
0.258 done
0.269 done
0.281 done
0.292 done
0.303 done
0.314 done
0.326 done
0.337 done
0.348 done
0.359 done
0.37 done
0.382 done
0.393 done
0.404 done
0.415 done
0.427 done
0.438 done
0.449 done
0.46 done
0.472 done
0.483 done
0.494 done
0.505 done
0.516 done
0.528 done
0.539 done
0.55 done
0.561 done
0.573 done
0.584 done
0.595 done
0.606 done
0.617 done
0.629 done
0.64 done
0.651 done
0.662 done
0.674 done
0.685 done
0.696 done
0.707 done
0.719 done
0.73 done
0.741 done
0.752 done
0.763 done
0.775 done
0.786 done
0.797 done
0.808 done
0.82 done
0.831 done
0.842 done
0.853 done
0.864 done
0.876 done
0.887 done
0.898 done
0.909 done
0.921 done
0.932 done
0.943 done
0.954 done
0.965 done
0.977 done
0.988 done
0.999 done
         token  utte

In [13]:
all_tokens['seq_utt_id'] = all_tokens['utterance_id'].astype('category').cat.codes

### Add back IPA, syllable structure, and child ages for child productions

In [24]:
# get the token-level data, esp phonology


save_phono_inflated_path = join(config.prov_csv_dir, 'pvd_utt_glosses_phono_inflated.pkl')
if config.regenerate:

    # get token-level information for Providence
    pvd_chi_tokens = childespy.get_sql_query('select gloss, target_child_name, target_child_age, \
    speaker_code, actual_phonology, model_phonology, transcript_id, utterance_id, \
    token_order from token where speaker_code = "CHI" and corpus_id = '+str(pvd_idx),
        db_version = "2020.1")
    
    counts_utt_valid_set = set(pd.concat([raw_success_utts, raw_yyy_utts]).utterance_id)
    
    # Isolate the utterances that are valid successes and yyy
    # And also have model phonology/actual phonology
    # The intent is to replace the first query in the original notebook.
    
    
    pvd_chi_phonology_utt_set_df = pvd_chi_tokens[pvd_chi_tokens.isin(counts_utt_valid_set)]
    
    pvd_chi_phonology_utt_set_df = pvd_chi_phonology_utt_set_df[pvd_chi_phonology_utt_set_df.actual_phonology != ""]
    pvd_chi_phonology_utt_set_df = pvd_chi_phonology_utt_set_df[pvd_chi_phonology_utt_set_df.model_phonology != ""]

    pvd_chi_phonology_utt_set = set(pvd_chi_phonology_utt_set_df.utterance_id)

    # Pvd chi will enforce that everything populated with phonology is a CHI token
    utts_to_retrieve = counts_utt_valid_set & pvd_chi_phonology_utt_set
     
    pvd_chi_tokens['gloss'] = [data_cleaning.fix_gloss(x) for x in pvd_chi_tokens.gloss]
    
    # prep the tokens generated from segmenting the utterances
    all_tokens_test = copy.deepcopy(all_tokens) 

    # initialize the fields that need to be populated
    all_tokens_test['actual_phonology'] = ''
    all_tokens_test['model_phonology'] = ''
    all_tokens_test['target_child_age'] = np.nan
    
    # get a set of unique utterances
    _, idx = np.unique(all_tokens_test.utterance_id, return_index=True)
    all_utt_indices = all_tokens_test.utterance_id[np.sort(idx)]
    
    # For fast retrieval of IPA, split pvd_chi_tokens into a dictionary
    pvd_chi_tokens_list = pvd_chi_tokens.groupby(['utterance_id'])
    pvd_chi_tokens_dict = dict(zip(
        [x[0] for x in pvd_chi_tokens_list], 
        [x[1] for x in pvd_chi_tokens_list], 
    ))
    
    # For fast retrival of BERT tokenization
    all_tokens_test_list = all_tokens_test.groupby(['utterance_id'])
    all_tokens_test_dict = dict(zip(
        [x[0] for x in all_tokens_test_list], 
        [x[1] for x in all_tokens_test_list], 
    ))
        
    # Augment the tokens from all_tokens with the IPA from pvd_chi_tokens 
    rvs = [] 
    
    i=-1
    for utt_index in all_utt_indices:
        i+=1
        if i % int(len(all_utt_indices) / 100) == 0:
            print(str(np.round((i / (len(all_utt_indices)) * 100),2))+'% complete...')    
            # should learn to use tqdm instead
        if utt_index in utts_to_retrieve:        
            utt_df = copy.deepcopy(all_tokens_test_dict[utt_index])
            utt_df['model_phonology'] = transformers_bert_completions.augment_with_ipa(
              utt_df, pvd_chi_tokens_dict[utt_index],initial_tokenizer, 'model_phonology')
            utt_df['actual_phonology'] = transformers_bert_completions.augment_with_ipa(
              utt_df, pvd_chi_tokens_dict[utt_index],initial_tokenizer, 'actual_phonology')
            utt_df['target_child_age'] = pvd_chi_tokens_dict[utt_index].iloc[0].target_child_age    
            rvs.append(utt_df)  
        else:
            rvs.append(all_tokens_test_dict[utt_index])  
            
    # get the resulting augmented forms back into a dataframe
    all_tokens_phono = pd.concat(rvs)
    
    # add a unique identifier to the BERT tokens
    all_tokens_phono['bert_token_id'] = range(all_tokens_phono.shape[0])
    
    #save the results
    all_tokens_phono.to_pickle(save_phono_inflated_path)
else:
    all_tokens_phono = pd.read_pickle(save_phono_inflated_path)

R[write to console]: Using supported database version: '2020.1'.



0.0% complete...




1.0% complete...
2.0% complete...
3.0% complete...
4.0% complete...
5.0% complete...
6.0% complete...
7.0% complete...
8.0% complete...
9.0% complete...
10.0% complete...
11.0% complete...
12.0% complete...
13.0% complete...
14.0% complete...
15.0% complete...
16.0% complete...
17.0% complete...
18.0% complete...
19.0% complete...
20.0% complete...
21.0% complete...
22.0% complete...
23.0% complete...
24.0% complete...
25.0% complete...
26.0% complete...
27.0% complete...
28.0% complete...
29.0% complete...
30.0% complete...
31.0% complete...
32.0% complete...
33.0% complete...
33.99% complete...
34.99% complete...
35.99% complete...
36.99% complete...
37.99% complete...
38.99% complete...
39.99% complete...
40.99% complete...
41.99% complete...
42.99% complete...
43.99% complete...
44.99% complete...
45.99% complete...
46.99% complete...
47.99% complete...
48.99% complete...
49.99% complete...
50.99% complete...
51.99% complete...
52.99% complete...
53.99% complete...
54.99% complete.

FileNotFoundError: [Errno 2] No such file or directory: '/home/nwong/chompsky/childes/child_listening_continuation/child-directed-listening/prov_csv/csv/pvd_utt_glosses_phono_inflated.pkl'

In [26]:
all_tokens_phono.to_pickle(save_phono_inflated_path)

In [27]:
# Get the IPA map
phone_map_df = pd.read_csv('phon/phon_map_populated.csv')

In [28]:
if config.verbose:
    # Inspect the IPA
    print(all_tokens_phono.loc[all_tokens_phono.actual_phonology != ''][['token','actual_phonology','model_phonology']])
    print(phone_map_df.head())

         token actual_phonology model_phonology
23       mommy              ɑmɪ           mɑmiː
55         yyy                ʌ               *
125        wee              wiː             wiː
128        yyy               aʊ               *
131        yyy                u               *
...        ...              ...             ...
2991851    xxx                *               *
2991854    xxx                *               *
2991857    xxx                *               *
2991860    xxx                *               *
2991863    xxx                *               *

[360185 rows x 3 columns]
  arpa ipa c_or_v
0   AA   ɑ      v
1   AE   æ      v
2   AH   ə      v
3   AO   ɔ      v
4   AW  aʊ      v


In [29]:
def phone_remap(x):
    return(x.replace("ː","").replace('ʌ','ə')
.replace('ɪ','ə').replace('ɔ','ɑ').replace('a','ɑ').replace('o','oʊ').replace('˞','').replace('ʰ',
    ''). replace('r','ɹ')).replace('\\^','').replace('\\ ̃','').replace(' ̩','').replace('^',''
).replace('ʙ','b').replace('(','').replace(')','').replace('.','').replace('ch','ʧ'
).replace('c','k').replace('g','ɡ').replace('y','j').replace('ʁ','ɹ')

def strip_accents(string, accents=('COMBINING ACUTE ACCENT', 
    'COMBINING GRAVE ACCENT', 'COMBINING TILDE', 'COMBINING VERTICAL LINE BELOW',
    'COMBINING SHORT STROKE OVERLAY')):
    accents = set(map(unicodedata.lookup, accents))
    chars = [c for c in unicodedata.normalize('NFD', string) if c not in accents]
    return unicodedata.normalize('NFC', ''.join(chars))

cv_map = dict(zip(phone_map_df['ipa'], phone_map_df['c_or_v']))
cv_map['o'] = 'v' 
cv_map['ɜ'] = 'v'
cv_map['e'] = 'v'
cv_map['ʔ'] = 'c'
cv_map['ɾ'] = 'c'
cv_map['ɲ'] = 'c'
cv_map['x'] = 'c'
cv_map['ɱ'] = 'c'
cv_map['ɣ'] = 'c'

def cv_mapper(x, cv_map):
    try:
        return(cv_map[x])
    except:
        raise ValueError(x)

cleaned_inflated_save = join(config.prov_csv_dir, 'pvd_utt_glosses_phono_cleaned_inflated.pkl')

if config.regenerate:    

    # Do the same excludes as were used to identify appropriate utterances
    excludes = ['*','(.)','(..)', '(...)','(....)','(.....)']
    all_tokens_phono.loc[all_tokens_phono.actual_phonology.isin(excludes),'actual_phonology'] =''
    all_tokens_phono.loc[all_tokens_phono.actual_phonology.str.contains('V'),'actual_phonology'] =''
    
    # remap phonology from narrow phonetic transcription to broad phonological transcription
    all_tokens_phono['model_phonology_clean'] = [phone_remap(x) for x in all_tokens_phono['model_phonology']]
    all_tokens_phono['actual_phonology_clean'] = [phone_remap(x) for x in all_tokens_phono['actual_phonology']]

    # remove any non-combining diacritical marks
    all_tokens_phono['model_phonology_no_dia'] = [strip_accents(x) for x in \
    all_tokens_phono['model_phonology_clean']]
    all_tokens_phono['actual_phonology_no_dia'] = [strip_accents(x) for x in \
    all_tokens_phono['actual_phonology_clean']]
    
    # Compute the number of non-contiguous vowels.
    # slightly different than the cmu vowel computation ---
    # because here we are computing it directly from IPA
    all_tokens_phono['cv_raw'] = [''.join([cv_mapper(x, cv_map) for x in list(y)]) if y != '' else '' for y in all_tokens_phono['actual_phonology_no_dia']]    
    all_tokens_phono['cv_collapsed']  = [re.sub(r'(.)\1+', r'\1', str(x)) if x != '' else '' for x in all_tokens_phono['cv_raw']]
    all_tokens_phono['num_vowels'] = [np.sum(np.array(list(x)) == 'v') if x !='' else np.nan for x in all_tokens_phono['cv_collapsed']]
    all_tokens_phono.to_pickle(cleaned_inflated_save)
else:
    all_tokens_phono = pd.read_pickle(cleaned_inflated_save)


In [30]:
all_tokens_phono[all_tokens_phono.utterance_id == 16759250][['gloss', 'seq_utt_id', 'token']]

Unnamed: 0,gloss,seq_utt_id,token


In [31]:
if config.verbose:
    # Why no actual phonology?
    print(all_tokens_phono.loc[all_tokens_phono.actual_phonology_no_dia != '']['actual_phonology_no_dia'])
    print(all_tokens_phono.shape)

23         ɑmə
55           ə
125         wi
128         ɑʊ
131          u
          ... 
2991834     oʊ
2991835    wɑə
2991836    liʔ
2991849     lɑ
2991850     lɑ
Name: actual_phonology_no_dia, Length: 344267, dtype: object
(2991865, 24)


### Identify the tokens that can be evaluated & Identify the subset of success and failure utterances that have transcriptions

In [33]:
# Note that these raw

successful_utt_ids = set(raw_success_utts['utterance_id']) 

initial_vocab_set = set(initial_vocab)
yyy_utt_ids = set(raw_yyy_utts['utterance_id'])

all_tokens_phono['in_vocab'] = all_tokens_phono['token'].isin(initial_vocab_set)

# Added logic 7/23/21 here for in_vocab
# Changed success_token to be sufficient on the entire requirements

all_tokens_phono['success_token'] = [
    ((x in successful_utt_ids) and (y) and (z <= 2) )
    for x, y, z in zip(
        all_tokens_phono['utterance_id'], # Check if has the right number of errors 
        all_tokens_phono['in_vocab'], # Enforce BERT and CMU requirement
        all_tokens_phono['num_vowels'] # Enforce syllable and child requirement (this won't be populated if not CHI)
    )]

all_tokens_phono['yyy_token'] = [
    (x in yyy_utt_ids) and (y == 'yyy') and (z <= 2)
    for x, y, z in zip(
        all_tokens_phono['utterance_id'], # Check if it has the right number of errors
        all_tokens_phono['token'], # Check if it has the right gloss
        all_tokens_phono['num_vowels'], # Enforce syllable and child requirement (this won't be populated if not CHI)
        
    )]

# end added logic

In [79]:
all_tokens_phono['partition'] = 'none'

In [80]:
all_tokens_phono.loc[(all_tokens_phono['success_token']), 'partition'] = 'success'     
all_tokens_phono.loc[(all_tokens_phono['yyy_token']), 'partition'] = 'yyy'

In [81]:
if config.verbose:
    print(all_tokens_phono.partition.value_counts())
    print('success shape', all_tokens_phono[all_tokens_phono['success_token']].shape)
    print('yyy shape', all_tokens_phono[all_tokens_phono['yyy_token']].shape)
    print(initial_tokenizer.unk_token_id)

none       2714806
success     259819
yyy          17240
Name: partition, dtype: int64
success shape (259819, 28)
yyy shape (17240, 28)
100


In [82]:
all_tokens_phono.loc[all_tokens_phono.token == 'xxx','token_id'] = initial_tokenizer.unk_token_id
all_tokens_phono.loc[all_tokens_phono.token == 'yyy','token_id'] = initial_tokenizer.unk_token_id

In [84]:
final_save_path = join(config.prov_csv_dir, 'pvd_utt_glosses_phono_cleaned_inflated_to_next_notebook.pkl')
all_tokens_phono.to_pickle(final_save_path)