In [1]:
# Note for the repository: Not re-running reproducibility checks on
# corrected version with this notebook
# because no changes to the sampling/splitting logic itself
# and this notebook/data preprocessing is deterministic.

In [2]:
%load_ext rpy2.ipython
import rpy2.robjects.lib.ggplot2 as ggplot2
import childespy
import numpy as np
import os
import imp
import pandas as pd
import transformers
import torch
import re
import unicodedata
import scipy.stats
import copy
from string import punctuation

from os.path import join, exists

In [3]:
from utils import load_models, transformers_bert_completions, data_cleaning
import config

# Find Communicative Successes and Failures 

In [4]:
# Communicative success: how many no-xxx, no-yyy child  utterances are in Providence? 
# Communicative failures: how many one-yyy, no-xxx child utterances are in Providence?
# Subset to instances that are monosyllabic later

In [5]:
pvd_idx = childespy.get_sql_query('select * from corpus where name = "Providence"').iloc[0]['id']

R[write to console]: Using current database version: '2020.1'.



In [6]:
phono_glosses = childespy.get_sql_query('select gloss, target_child_name, target_child_age, \
    speaker_code, actual_phonology, model_phonology, transcript_id, utterance_id, \
    token_order, corpus_name, collection_name, language from token where \
    actual_phonology != "" and model_phonology != "" and speaker_code in ("MOT", "FAT","CHI") and collection_name = "Eng-NA" \
    and corpus_id = '+str(pvd_idx) ,
        db_version = "2020.1")

# 8/1/21: Added constraint for speaker_code

R[write to console]: Using supported database version: '2020.1'.



In [7]:
phono_glosses.corpus_name.value_counts()

Providence    396621
Name: corpus_name, dtype: int64

In [8]:
phono_glosses.loc[phono_glosses.gloss == 'xxx'].actual_phonology.value_counts()

*        26736
ə           10
(.)          7
aɪ           4
ən           2
         ...  
tæ           1
ɛs           1
roːts        1
eɪ           1
u            1
Name: actual_phonology, Length: 76, dtype: int64

Actual phonology is almost always * for xxx items

In [9]:
phono_glosses.loc[phono_glosses.gloss == 'yyy'].actual_phonology.value_counts()

ɛ                   3206
ʌ                   2132
ɪ                   1881
ə                    512
o                    507
                    ... 
ɑdzəɑdɪlɑzəzduɪt       1
ɡɪɪlɛ                  1
ʌm̩di                  1
ɑwɪs                   1
əlɛdɪdɪ                1
Name: actual_phonology, Length: 30293, dtype: int64

Actual phonology is populated for yyy items

In [10]:
chi_phono = phono_glosses.loc[(phono_glosses.speaker_code == 'CHI') & 
    (phono_glosses.target_child_age < (365*5))]

In [11]:
def count_transmission_errors(utt_vector, error_codes):
    return(np.sum([x in error_codes for x in  utt_vector]))

In [12]:
xxxs_per_utt = chi_phono.groupby('utterance_id').gloss.agg(
    lambda x: count_transmission_errors(x, ['xxx'])).reset_index()
xxxs_per_utt.columns = ['utterance_id', 'num_xxx']
yyys_per_utt = chi_phono.groupby('utterance_id').gloss.agg(
    lambda x: count_transmission_errors(x, ['yyy'])).reset_index()
yyys_per_utt.columns = ['utterance_id', 'num_yyy']
failures_per_utt = xxxs_per_utt.merge(yyys_per_utt)

In [13]:
yyy_utts = failures_per_utt.loc[(failures_per_utt.num_xxx == 0) &  (failures_per_utt.num_yyy == 1)]

In [14]:
yyy_utts.shape

(31457, 3)

In [15]:
success_utts = failures_per_utt.loc[(failures_per_utt.num_xxx == 0) &  
    (failures_per_utt.num_yyy == 0)]

In [16]:
success_utts.shape

(83880, 3)

In [17]:
tokens_from_errorless_utts = chi_phono.loc[chi_phono.utterance_id.isin(success_utts.utterance_id)]

In [18]:
#exclude un-transcribed tokens and syllabically transcribed tokens
excludes = ['*','(.)','(..)', '(...)','(....)','(.....)']
tokens_from_errorless_utts = tokens_from_errorless_utts.loc[~(tokens_from_errorless_utts.actual_phonology.isin(excludes) |
    tokens_from_errorless_utts.model_phonology.isin(excludes))]

In [19]:
tokens_from_errorless_utts.shape

(214239, 12)

In [20]:
# example phonology
tokens_from_errorless_utts.actual_phonology

1          ɑmɪ
3          wiː
4          wiː
5           uː
52           ɛ
          ... 
396606       o
396607     waɪ
396608     liʔ
396609       ɪ
396610    hɪpo
Name: actual_phonology, Length: 214239, dtype: object

In [21]:
# 31,457 transmission errors (from 31,457 utterances)
# 214,239 transmission successes (from 83,880 utterances)
# this will be further decreased later by the need to test monosyllabic forms

# Load BERT Models + CMU Dict

### Get the CMU Pronunciation Dictionary

In [22]:
cmu_in_childes = pd.read_pickle(config.cmu_path)
cmu_2syl_inchildes = cmu_in_childes.loc[cmu_in_childes.num_vowels <=2]

# Prep Utterances / Tokens for BERT

In [23]:
# Get the index of the Providence corpus
pvd_idx = childespy.get_sql_query('select * from corpus where name = "Providence"').iloc[0]['id']

R[write to console]: Using current database version: '2020.1'.



In [24]:
# Load utterances from the Providence corpus from childs-db

this_path = join(config.prov_csv_dir, 'pvd_utt_glosses.csv')
if config.regenerate:
    utt_glosses = childespy.get_sql_query('select gloss, transcript_id, id, \
    utterance_order, target_child_name, speaker_code, type from utterance where corpus_id = '+str(pvd_idx) ,
        db_version = "2020.1")
    utt_glosses.to_csv(this_path, index=False)
else: 
    utt_glosses = pd.read_csv(this_path)

R[write to console]: Using supported database version: '2020.1'.



In [25]:
utt_glosses = utt_glosses.rename(columns = {'id' : 'utterance_id'})

In [26]:
# Prep the utterances for tokenization


import importlib
importlib.reload(data_cleaning)

utt_glosses = data_cleaning.clean_glosses(utt_glosses)
print(utt_glosses.shape)

declarative                   335678
question                       84707
imperative_emphatic            15954
trail off                      12351
self interruption               6658
interruption                    2928
self interruption question       825
trail off question               650
interruption question            304
quotation precedes                 3
question exclamation               2
broken for coding                  1
Name: type, dtype: int64
Cell 238 gloss                where do you want me to go
transcript_id                             42204
utterance_id                           16759250
utterance_order                               1
target_child_name                          Alex
speaker_code                                OPE
type                                   question
punct                                         ?
Name: 1, dtype: object
(460061, 10)


### Build the Vocabulary

In [27]:
initial_tokenizer = load_models.get_primary_tokenizer()
initial_vocab, cmu_in_initial_vocab = load_models.get_initial_vocab_info(initial_tokenizer)

In [28]:
# confirm yyy treated as a separate character
initial_tokenizer.tokenize('this is a yyy.')

['this', 'is', 'a', 'yyy', '.']

In [29]:
cmu_in_initial_vocab.shape

(7904, 8)

In [30]:
# build a dataframe of tokens 
# this is slow, because tokenization is slow
def inflate(row):
    tokens = initial_tokenizer.tokenize(row['gloss_with_punct'])
    return(pd.DataFrame({'token':tokens, 'utterance_id':row['utterance_id']}) )

inflate_path = join(config.prov_csv_dir, 'pvd_utt_glosses_inflated.csv')
if config.regenerate:
    all_tokens = pd.concat([inflate(x) for x in utt_glosses.to_dict('records')])
    all_tokens = all_tokens.merge(utt_glosses)
    all_tokens.to_csv(inflate_path)

else:
    all_tokens = pd.read_csv(inflate_path, na_filter=False)

In [31]:
all_tokens.iloc[0:10]

Unnamed: 0,token,utterance_id,gloss,transcript_id,utterance_order,target_child_name,speaker_code,type,punct,speaker_code_simple,gloss_with_punct
0,[cgv],16759250,where do you want me to go,42204,1,Alex,OPE,question,?,[CGV],[CGV] where do you want me to go?
1,where,16759250,where do you want me to go,42204,1,Alex,OPE,question,?,[CGV],[CGV] where do you want me to go?
2,do,16759250,where do you want me to go,42204,1,Alex,OPE,question,?,[CGV],[CGV] where do you want me to go?
3,you,16759250,where do you want me to go,42204,1,Alex,OPE,question,?,[CGV],[CGV] where do you want me to go?
4,want,16759250,where do you want me to go,42204,1,Alex,OPE,question,?,[CGV],[CGV] where do you want me to go?
5,me,16759250,where do you want me to go,42204,1,Alex,OPE,question,?,[CGV],[CGV] where do you want me to go?
6,to,16759250,where do you want me to go,42204,1,Alex,OPE,question,?,[CGV],[CGV] where do you want me to go?
7,go,16759250,where do you want me to go,42204,1,Alex,OPE,question,?,[CGV],[CGV] where do you want me to go?
8,?,16759250,where do you want me to go,42204,1,Alex,OPE,question,?,[CGV],[CGV] where do you want me to go?
9,[cgv],16759261,anywhere you'll feel comfortable um anywhere,42204,2,Alex,MOT,declarative,.,[CGV],[CGV] anywhere you'll feel comfortable um anyw...


In [32]:
# Assign a token_id (integer in the BERT vocabulary). 
# Because these are from the tokenized utterances, there is no correpsondence 
# with childes-db token ids
all_tokens['token_id'] = initial_tokenizer.convert_tokens_to_ids(all_tokens['token'])
# assigns utterances a 0-indexed index column
all_tokens['seq_utt_id'] = all_tokens['utterance_id'].astype('category').cat.codes

### Add back IPA, syllable structure, and child ages for child productions

In [33]:
# get the token-level data, esp phonology

save_phono_inflated_path = join(config.prov_csv_dir, 'pvd_utt_glosses_phono_inflated.pkl')

if config.regenerate:

    # get token-level information for Providence
    pvd_chi_tokens = childespy.get_sql_query('select gloss, target_child_name, target_child_age, \
    speaker_code, actual_phonology, model_phonology, transcript_id, utterance_id, \
    token_order from token where speaker_code = "CHI" and corpus_id = '+str(pvd_idx),
        db_version = "2020.1")
    pvd_chi_tokens['gloss'] = [data_cleaning.fix_gloss(x) for x in pvd_chi_tokens.gloss]
    
    # prep the tokens generated from segmenting the utterances
    all_tokens_test = copy.deepcopy(all_tokens) 

    # initialize the fields that need to be populated
    all_tokens_test['actual_phonology'] = ''
    all_tokens_test['model_phonology'] = ''
    all_tokens_test['target_child_age'] = np.nan
    
    # get a set of unique utterances
    _, idx = np.unique(all_tokens_test.utterance_id, return_index=True)
    all_utt_indices = all_tokens_test.utterance_id[np.sort(idx)]
    
    # For fast retrieval of IPA, split pvd_chi_tokens into a dictionary
    pvd_chi_tokens_list = pvd_chi_tokens.groupby(['utterance_id'])
    pvd_chi_tokens_dict = dict(zip(
        [x[0] for x in pvd_chi_tokens_list], 
        [x[1] for x in pvd_chi_tokens_list], 
    ))
    
    # For fast retrival of BERT tokenization
    all_tokens_test_list = all_tokens_test.groupby(['utterance_id'])
    all_tokens_test_dict = dict(zip(
        [x[0] for x in all_tokens_test_list], 
        [x[1] for x in all_tokens_test_list], 
    ))
        
    # Augment the tokens from all_tokens with the IPA from pvd_chi_tokens 
    rvs = [] 
    utts_to_retrieve = yyy_utts.utterance_id.to_list() + success_utts.utterance_id.to_list()
    i=-1
    for utt_index in all_utt_indices: #utts_to_retrieve: #[16760331]:       
        i+=1
        if i % int(len(all_utt_indices) / 100) == 0:
            print(str(np.round((i / (len(all_utt_indices)) * 100),2))+'% complete...')    
            # should learn to use tqdm instead
        if utt_index in utts_to_retrieve:        
            utt_df = copy.deepcopy(all_tokens_test_dict[utt_index])
            utt_df['model_phonology'] = transformers_bert_completions.augment_with_ipa(
              utt_df, pvd_chi_tokens_dict[utt_index],initial_tokenizer, 'model_phonology')
            utt_df['actual_phonology'] = transformers_bert_completions.augment_with_ipa(
              utt_df, pvd_chi_tokens_dict[utt_index],initial_tokenizer, 'actual_phonology')
            utt_df['target_child_age'] = pvd_chi_tokens_dict[utt_index].iloc[0].target_child_age    
            rvs.append(utt_df)  
        else:
            rvs.append(all_tokens_test_dict[utt_index])  
            
    # get the resulting augmented forms back into a dataframe
    all_tokens_phono = pd.concat(rvs)
    
    # add a unique identifier to the BERT tokens
    all_tokens_phono['bert_token_id'] = range(all_tokens_phono.shape[0])
    
    #save the results
    all_tokens_phono.to_pickle(save_phono_inflated_path)
else:
    all_tokens_phono = pd.read_pickle(save_phono_inflated_path)

R[write to console]: Using supported database version: '2020.1'.



0.0% complete...




1.0% complete...
2.0% complete...
3.0% complete...
4.0% complete...
5.0% complete...
6.0% complete...
7.0% complete...
8.0% complete...
9.0% complete...
10.0% complete...
11.0% complete...
12.0% complete...
13.0% complete...
14.0% complete...
15.0% complete...
16.0% complete...
17.0% complete...
18.0% complete...
19.0% complete...
20.0% complete...
21.0% complete...
22.0% complete...
23.0% complete...
24.0% complete...
25.0% complete...
26.0% complete...
27.0% complete...
28.0% complete...
29.0% complete...
30.0% complete...
31.0% complete...
32.0% complete...
33.0% complete...
34.0% complete...
35.0% complete...
36.0% complete...
37.0% complete...
37.99% complete...
38.99% complete...
39.99% complete...
40.99% complete...
41.99% complete...
42.99% complete...
43.99% complete...
44.99% complete...
45.99% complete...
46.99% complete...
47.99% complete...
48.99% complete...
49.99% complete...
50.99% complete...
51.99% complete...
52.99% complete...
53.99% complete...
54.99% complete...
5

In [34]:
# Inspect the IPA
all_tokens_phono.loc[all_tokens_phono.actual_phonology != ''][['token','actual_phonology','model_phonology']]

Unnamed: 0,token,actual_phonology,model_phonology
42,mommy,ɑmɪ,mɑmiː
81,yyy,ʌ,*
170,wee,wiː,wiː
173,yyy,aʊ,*
201,wee,wiː,wiː
...,...,...,...
3083588,nobody,nobɑɾi,noʊbɑdiː
3083589,hates,heɪs,heɪts
3083594,oh,o,oʊ
3083595,why,waɪ,waɪ


In [35]:
# Get the IPA map
phone_map_df = pd.read_csv('phon/phon_map_populated.csv')
phone_map_df.head()

Unnamed: 0,arpa,ipa,c_or_v
0,AA,ɑ,v
1,AE,æ,v
2,AH,ə,v
3,AO,ɔ,v
4,AW,aʊ,v


In [36]:
def phone_remap(x):
    return(x.replace("ː","").replace('ʌ','ə')
.replace('ɪ','ə').replace('ɔ','ɑ').replace('a','ɑ').replace('o','oʊ').replace('˞','').replace('ʰ',
    ''). replace('r','ɹ')).replace('\\^','').replace('\\ ̃','').replace(' ̩','').replace('^',''
).replace('ʙ','b').replace('(','').replace(')','').replace('.','').replace('ch','ʧ'
).replace('c','k').replace('g','ɡ').replace('y','j').replace('ʁ','ɹ')

def strip_accents(string, accents=('COMBINING ACUTE ACCENT', 
    'COMBINING GRAVE ACCENT', 'COMBINING TILDE', 'COMBINING VERTICAL LINE BELOW',
    'COMBINING SHORT STROKE OVERLAY')):
    accents = set(map(unicodedata.lookup, accents))
    chars = [c for c in unicodedata.normalize('NFD', string) if c not in accents]
    return unicodedata.normalize('NFC', ''.join(chars))

cv_map = dict(zip(phone_map_df['ipa'], phone_map_df['c_or_v']))
cv_map['o'] = 'v' 
cv_map['ɜ'] = 'v'
cv_map['e'] = 'v'
cv_map['ʔ'] = 'c'
cv_map['ɾ'] = 'c'
cv_map['ɲ'] = 'c'
cv_map['x'] = 'c'
cv_map['ɱ'] = 'c'
cv_map['ɣ'] = 'c'

def cv_mapper(x, cv_map):
    try:
        return(cv_map[x])
    except:
        raise ValueError(x)

cleaned_inflated_save = join(config.prov_csv_dir, 'pvd_utt_glosses_phono_cleaned_inflated.pkl')

if config.regenerate:    

    # Do the same excludes as were used to identify appropriate utterances
    excludes = ['*','(.)','(..)', '(...)','(....)','(.....)']
    all_tokens_phono.loc[all_tokens_phono.actual_phonology.isin(excludes),'actual_phonology'] =''
    all_tokens_phono.loc[all_tokens_phono.actual_phonology.str.contains('V'),'actual_phonology'] =''
    
    # remap phonology from narrow phonetic transcription to broad phonological transcription
    all_tokens_phono['model_phonology_clean'] = [phone_remap(x) for x in all_tokens_phono['model_phonology']]
    all_tokens_phono['actual_phonology_clean'] = [phone_remap(x) for x in all_tokens_phono['actual_phonology']]

    # remove any non-combining diacritical marks
    all_tokens_phono['model_phonology_no_dia'] = [strip_accents(x) for x in \
    all_tokens_phono['model_phonology_clean']]
    all_tokens_phono['actual_phonology_no_dia'] = [strip_accents(x) for x in \
    all_tokens_phono['actual_phonology_clean']]
    
    # Compute the number of non-contiguous vowels.
    # slightly different than the cmu vowel computation ---
    # because here we are computing it directly from IPA
    all_tokens_phono['cv_raw'] = [''.join([cv_mapper(x, cv_map) for x in list(y)]) if y != '' else '' for y in all_tokens_phono['actual_phonology_no_dia']]    
    all_tokens_phono['cv_collapsed']  = [re.sub(r'(.)\1+', r'\1', str(x)) if x != '' else '' for x in all_tokens_phono['cv_raw']]
    all_tokens_phono['num_vowels'] = [np.sum(np.array(list(x)) == 'v') if x !='' else np.nan for x in all_tokens_phono['cv_collapsed']]
    all_tokens_phono.to_pickle(cleaned_inflated_save)
else:
    all_tokens_phono = pd.read_pickle(cleaned_inflated_save)


In [37]:
all_tokens_phono.loc[all_tokens_phono.actual_phonology_no_dia != '']['actual_phonology_no_dia']

42             ɑmə
81               ə
170             wi
173             ɑʊ
201             wi
            ...   
3083588    noʊbɑɾi
3083589       heəs
3083594         oʊ
3083595        wɑə
3083596        liʔ
Name: actual_phonology_no_dia, Length: 254440, dtype: object

In [38]:
all_tokens_phono.shape

(3083625, 24)

### Identify the tokens that can be evaluated 

In [39]:
# find the tokens in the resulting dataframe that belong to the utterances identified above

In [40]:
initial_vocab

array(['a', 'b', 'c', ..., 'hideout', 'pudding', 'stalks'], dtype='<U18')

In [41]:

successful_utt_ids = set(success_utts['utterance_id'])

initial_vocab_set = set(initial_vocab)

yyy_utt_ids = set(yyy_utts['utterance_id'])

all_tokens_phono['in_vocab'] = all_tokens_phono['token'].isin(initial_vocab_set)

# 8/1/21: Changed this line to include the vocab constraint.
all_tokens_phono['success_token'] = [(x in successful_utt_ids) and (y) for x, y in 
    zip(all_tokens_phono['utterance_id'], all_tokens_phono['in_vocab'])]
# end changes

all_tokens_phono['yyy_token'] = [x in yyy_utt_ids for x in 
    all_tokens_phono['utterance_id']]


In [42]:
all_tokens_phono.shape

(3083625, 27)

In [43]:
assert '' not in set(all_tokens_phono[all_tokens_phono['num_vowels'] <= 2].actual_phonology)

### Potential changes to counts begin here due to filtering on in_vocab.

### Identify the subset of success and failure utterances that have transcriptions

In [44]:
all_tokens_phono['partition'] = 'none'

In [45]:
success_tokens = all_tokens_phono.loc[(all_tokens_phono['success_token']) & 
    (all_tokens_phono['num_vowels'] <= 2) ]
all_tokens_phono.loc[(all_tokens_phono['success_token']) & 
    (all_tokens_phono['num_vowels'] <= 2), 'partition'] = 'success'     
success_tokens.shape

(185723, 28)

In [46]:
# Successes conditions

success_tokens_check = all_tokens_phono[all_tokens_phono.partition == 'success']
assert all(success_tokens_check['in_vocab'])
assert all(success_tokens_check.utterance_id.isin(successful_utt_ids))
assert all(success_tokens_check['num_vowels'] <= 2)


In [47]:
all_tokens_phono.loc[(all_tokens_phono['success_token']) & 
    (all_tokens_phono['num_vowels'] <= 2)]

Unnamed: 0,token,utterance_id,gloss,transcript_id,utterance_order,target_child_name,speaker_code,type,punct,speaker_code_simple,...,actual_phonology_clean,model_phonology_no_dia,actual_phonology_no_dia,cv_raw,cv_collapsed,num_vowels,in_vocab,success_token,yyy_token,partition
42,mommy,16759315,Mommy,42204,6,Alex,CHI,declarative,.,[CHI],...,ɑmə,mɑmi,ɑmə,vcv,vcv,2.0,True,True,False,success
170,wee,16759467,wee,42204,24,Alex,CHI,declarative,.,[CHI],...,wi,wi,wi,cv,cv,1.0,True,True,False,success
201,wee,16759501,wee,42204,28,Alex,CHI,declarative,.,[CHI],...,wi,wi,wi,cv,cv,1.0,True,True,False,success
239,woo,16759549,woo,42204,33,Alex,CHI,declarative,.,[CHI],...,u,wu,u,v,v,1.0,True,True,False,success
743,ernie,16759752,Ernie,42204,58,Alex,CHI,declarative,.,[CHI],...,ɛ,əɹni,ɛ,v,v,1.0,True,True,False,success
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3083575,help,17280891,help,42569,752,William,CHI,declarative,.,[CHI],...,ɛlp,hɛlp,ɛlp,vcc,vc,1.0,True,True,False,success
3083589,hates,17280946,nobody hates Simba,42569,755,William,CHI,declarative,.,[CHI],...,heəs,heəts,heəs,cvvc,cvc,1.0,True,True,False,success
3083594,oh,17280964,oh why lick hippo,42569,756,William,CHI,self interruption,.,[CHI],...,oʊ,oʊʊ,oʊ,vv,v,1.0,True,True,False,success
3083595,why,17280964,oh why lick hippo,42569,756,William,CHI,self interruption,.,[CHI],...,wɑə,wɑə,wɑə,cvv,cv,1.0,True,True,False,success


In [48]:
yyy_tokens = all_tokens_phono.loc[(all_tokens_phono['yyy_token']) & 
(all_tokens_phono['token'] == 'yyy') & (all_tokens_phono.num_vowels <= 2) ]
all_tokens_phono.loc[(all_tokens_phono['yyy_token']) & 
(all_tokens_phono['token'] == 'yyy') & (all_tokens_phono.num_vowels <= 2),'partition'] = 'yyy'
yyy_tokens.shape

(27693, 28)

In [49]:
all_tokens_phono.partition.value_counts()

none       2870209
success     185723
yyy          27693
Name: partition, dtype: int64

In [50]:
initial_tokenizer.unk_token_id

100

In [51]:
all_tokens_phono.loc[all_tokens_phono.token == 'xxx','token_id'] = initial_tokenizer.unk_token_id
all_tokens_phono.loc[all_tokens_phono.token == 'yyy','token_id'] = initial_tokenizer.unk_token_id

In [52]:
# this adds the partition information

final_save_path = join(config.prov_csv_dir, 'pvd_utt_glosses_phono_cleaned_inflated_to_next_notebook.pkl')
all_tokens_phono.to_pickle(final_save_path)

# Prevalence of Successes and Failures Across Time

In [53]:
# get number of tokens per age
success_utts['set'] = 'success'
yyy_utts['set'] = 'failure'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [54]:
# get child age in days associated with each utterance id and plot it

In [55]:
utt_age = chi_phono.groupby('utterance_id').target_child_age.agg(np.unique).reset_index()

In [56]:
utts_with_ages = pd.concat([success_utts, yyy_utts]).merge(utt_age)

In [57]:
utts_with_ages['year'] = .5*np.floor(utts_with_ages['target_child_age'] / (365. /2) ) 
print(utts_with_ages.loc[utts_with_ages.set == 'failure'].year.value_counts())
print(utts_with_ages.loc[utts_with_ages.set == 'success'].year.value_counts())

1.5    9919
2.0    7261
1.0    6693
2.5    4895
3.0    2097
3.5     414
0.5     167
4.0      11
Name: year, dtype: int64
2.0    22432
2.5    21194
1.5    16798
3.0    12564
1.0     6697
3.5     3683
4.0      379
0.5      133
Name: year, dtype: int64


In [58]:
final_utts_save_path = join(config.prov_csv_dir, 'utts_with_ages.csv')
utts_with_ages.to_csv(final_utts_save_path)

In [59]:
assert 'yyy' not in set(success_tokens_check['token'])
# This was the problem that was observed in my iteration of the code.