In [1]:
import pandas as pd
import childespy
import numpy as np
import spellchecker

# Generate Training and Valdiation for Fine Tuning

In [2]:
#get all of the North American and British English adult and child utterances without xxx or yyy
#concatenate them at the the transcript level
#hold out 20% for validation 

In [3]:
corpora = childespy.get_sql_query('select * from corpus where \
collection_name in ("Eng-NA", "Eng-UK") and data_source = "CHILDES"')
corpora

R[write to console]: Using current database version: '2020.1'.



Unnamed: 0,id,name,collection_name,data_source,collection_id
1,32,Garvey,Eng-NA,CHILDES,2
2,33,Valian,Eng-NA,CHILDES,2
3,34,Bernstein,Eng-NA,CHILDES,2
4,35,Clark,Eng-NA,CHILDES,2
5,36,PetersonMcCabe,Eng-NA,CHILDES,2
...,...,...,...,...,...
57,221,Wells,Eng-UK,CHILDES,12
58,222,Gathburn,Eng-UK,CHILDES,12
59,223,Nuffield,Eng-UK,CHILDES,12
60,224,Lara,Eng-UK,CHILDES,12


In [13]:
corpora.columns

Index(['id', 'name', 'collection_name', 'data_source', 'collection_id'], dtype='object')

In [4]:
childes_datasets = ",".join([str(x) for x in corpora.collection_id])

In [14]:
regenerate = True
if regenerate:
    utt_glosses = childespy.get_sql_query('select gloss, transcript_id, id, \
    utterance_order, speaker_code, target_child_name, target_child_age, type from utterance where collection_name in ("Eng-NA", "Eng-UK") \
    and collection_id in ('+childes_datasets+') and speaker_code in ("MOT", "FAT","CHI")' , db_version = "2020.1")
    utt_glosses.to_csv('csv/utt_glosses.csv', index=False)
else: 
    utt_glosses = pd.read_csv('csv/utt_glosses.csv')

R[write to console]: Using supported database version: '2020.1'.



In [15]:
set(utt_glosses.target_child_name) # Note it doesn't have the child's name anymore -- do you need to split early?

{'1234AB',
 '4269LP',
 '4269LP10',
 '4269LP11',
 '4269LP18',
 '4269LP7',
 '4271WC',
 '4273WC',
 '4273WC10',
 '4273WC24',
 '427WC10',
 '427WC11',
 '4310AM11',
 '4310AM24',
 '4310AM7',
 '4452CM',
 '4452CM11',
 '4452CM24',
 '4592HVG10mos',
 '4592HVG11mos',
 '4592HVG7mos',
 '4619WC',
 '4619WZ10',
 '4619WZ11',
 '4619WZ7',
 '4629AB',
 '4629AB10',
 '4629AB11',
 '4629AB24',
 '4641CC11',
 '4641CC24',
 '4641CC7',
 '4650KS',
 '4650KS10',
 '4650KS11',
 '4650KS24',
 '4664AM',
 '4664AM7',
 '4687NH',
 '4687NH24mos',
 '4697JK10',
 '4697JK11',
 '4697JK24',
 '4697JK7',
 '47081B7',
 '4708IB10mos',
 '4708IB11mos',
 '4708IB24',
 '4724LM10',
 '4724LM24mos',
 '4724LM7',
 '4724LM7mos',
 '4731SA',
 '4734ES',
 '4734ES7',
 '4737NA7',
 '474311mos',
 '4743NA10',
 '4743NA11',
 '4743NA24',
 '4767JC',
 '4767JC11',
 '4801RB',
 '4801RB10',
 '4801RB11',
 '4802JP',
 '4802JP10',
 '4802JP11',
 '4802JP7',
 '4814BS',
 '4814BS24',
 '4825GG',
 '4854MP11mos',
 '4854MP24mos',
 '4854MP7mos',
 '4858CM10',
 '4858CM11',
 '4858CM7',


In [232]:
utt_glosses.shape

(4205071, 6)

In [233]:
# drop any with xxx or yyy
utt_glosses['contains_error'] = ['xxx' in str(x) or 'yyy' in str(x) for x in utt_glosses.gloss]
utt_glosses = utt_glosses.loc[~utt_glosses.contains_error]
utt_glosses.shape

(3959952, 7)

In [234]:
def fix_gloss(gloss):
    return(str(gloss).replace('+',' ').replace('_',' '))

utt_glosses.gloss = [fix_gloss(x) for x in utt_glosses.gloss]

In [235]:
utt_glosses['type'].value_counts()

declarative                   2828187
question                       903625
imperative_emphatic            114118
trail off                       63230
interruption                    16870
missing CA terminator           14430
self interruption               11307
quotation next line              5903
interruption question             877
quotation precedes                756
trail off question                504
self interruption question        125
broken for coding                  17
no break TCU continuation           3
Name: type, dtype: int64

In [236]:
punct_for_type = {
    'question':'?',
    'declarative':'.',
    'interruption':'!',
    'trail off':'...',
    'trail off question':'?',
    'imperative_emphatic':'!' 
}

In [237]:
utt_glosses['punct'] = [punct_for_type[x] if x in punct_for_type else None
                        for x in utt_glosses.type ]

In [238]:
utt_glosses.iloc[0]

gloss              now we need this
transcript_id                  3261
id                           279663
utterance_order                   1
speaker_code                    CHI
type                      trail off
contains_error                False
punct                           ...
Name: 1, dtype: object

In [267]:
utt_glosses['speaker_code_simple'] = ['[CHI]' if x == 'CHI' else '[CGV]'
    for x in utt_glosses.speaker_code]

In [268]:
utt_glosses = utt_glosses.loc[[x is not None for x in utt_glosses.punct]]
utt_glosses['gloss_with_punct'] = [x['speaker_code_simple'] + ' '+ x['gloss'].lower() + x['punct'] for x in utt_glosses.to_dict('records')]

In [269]:
utt_glosses.head(5).gloss_with_punct

1                          [CHI] now we need this...
2                        [CHI] we need we you don't.
3    [CHI] need a pocketbook because i'm the mother.
4                               [CHI] i'm i'm the...
6         [CHI] now you can watch now wait a minute.
Name: gloss_with_punct, dtype: object

### Token cleaning 

In [270]:
#get the unigram counts for tokens and remap any problematic ones

In [271]:
utt_glosses['tokens'] = [str(x).lower().split(' ') for x in utt_glosses.gloss]

In [272]:
all_tokens = [y for x in utt_glosses['tokens'] for y in x]

In [273]:
all_token_frequencies = pd.Series(all_tokens).value_counts().reset_index()
all_token_frequencies.columns = ['word','count']
all_token_frequencies.to_csv('data/vocab.csv')

In [274]:
#Check what we are missing: filter to ones outside of a dictionary

In [276]:
spell = spellchecker.SpellChecker()

In [277]:
all_token_frequencies['in_dict'] = [len(spell.unknown([x])) == 0  for  x in all_token_frequencies.word]

In [278]:
all_token_frequencies.loc[all_token_frequencies['in_dict'] == False]

Unnamed: 0,word,count,in_dict
16,,148011,False
92,mhm,28524,False
113,hm,23665,False
127,www,18965,False
157,mm,14499,False
...,...,...,...
56860,hayahe,1,False
56862,brmmmtssss,1,False
56863,adudududududududu,1,False
56864,ghooghoo,1,False


In [279]:
' '.join(all_token_frequencies.loc[all_token_frequencies['in_dict'] == False].head(1000).word)

" mhm hm www mm uhhuh needta uhoh jwww purdie n b s sposta t mummie zzz c uhuh d firstname useta hadta m jwww's ahhah p willn't r mr w shh tv h mrs g l nappie dimitra mm-hm aah f k uhhum purdie's awww afternoon lwww swww aran pottie choos ssh no-no cwww j lastname v fraser's neenaw hunhunh cromer pingu th x dwww botbot mwww wh q ummhm mkay z awoh sukie dada's ewww marky's sh duplo tweenies enne dadaw mummy'll nuhhuh rwww twww chih gwww grr beepbeep didsbury beeba ch kalie zorg afterwards jeannine's pennys tellie miffy baaee didldow uh-huh baura bwww stockport firstname's grrr kwww ntuu puttaputta mummie's unhunh dollie ribena ninight weener hunm pippo brr mm-mm peeppeep awww's dimitra's strawberrys teletubby kokowk bowwow gagaa puppys ss umhum batterys psh brumbrum dr pingu's jj hwww paddington mooshas goggins nwww sss lorrys morgie st nuuw pickyaup koolaid babyschool carle dipsy's aislinn swww's dodie jaylen pwww bzz toybox eensie babaa aladar tisha aaron aguh weensie ewww's sainsbury

In [None]:
### Re-order by transcript and token order

In [280]:
utt_glosses_sorted = utt_glosses.sort_values(by=['transcript_id', 'utterance_order'])

In [281]:
utt_glosses_sorted.head()

Unnamed: 0,gloss,transcript_id,id,utterance_order,speaker_code,type,contains_error,punct,speaker_code_simple,gloss_with_punct,tokens
12,look,3260,279700,2,CHI,declarative,False,.,[CHI],[CHI] look.,[look]
15,can I have some,3260,279707,4,CHI,question,False,?,[CHI],[CHI] can i have some?,"[can, i, have, some]"
18,is this,3260,279716,6,CHI,declarative,False,.,[CHI],[CHI] is this.,"[is, this]"
22,don't call me Karen because I'm not Karen,3260,279730,10,CHI,declarative,False,.,[CHI],[CHI] don't call me karen because i'm not karen.,"[don't, call, me, karen, because, i'm, not, ka..."
24,I'm Amy,3260,279735,11,CHI,declarative,False,.,[CHI],[CHI] i'm amy.,"[i'm, amy]"


In [282]:
utt_glosses_sorted.shape

(3926534, 11)

In [283]:
# select 20 % of the transcripts for training
transcript_inventory = np.unique(utt_glosses_sorted.transcript_id)
validation_indices = np.random.choice(transcript_inventory, 
    int(np.round(len(transcript_inventory) / 5)))
print(len(validation_indices))
utt_glosses_sorted['partition'] = 'train'
utt_glosses_sorted.loc[utt_glosses_sorted.transcript_id.isin(validation_indices), 
    'partition'] = 'validation'

1754


In [284]:
utt_glosses_sorted.partition.value_counts()

train         3204912
validation     721622
Name: partition, dtype: int64

In [264]:
import os
os.getcwd()

'/home/stephan/notebooks/child-directed-listening'

In [285]:
utt_glosses_sorted.loc[utt_glosses_sorted.partition =='validation'] \
          [['gloss_with_punct']].to_csv('data/validation.txt', index=False, header=False) 

In [286]:
utt_glosses_sorted.loc[utt_glosses_sorted.partition =='train'] \
          [['gloss_with_punct']].to_csv('data/train.txt', index=False, header=False) 

In [None]:
#run run_mlm_finetune_on_childes.sh

In [None]:
# [ ] Put on OpenMind for GPU usage -- nope, vagrant totall broken
# [X] Confirm that I can do a hello word for MLM training
    https://github.com/huggingface/transformers/tree/master/examples/language-modeling
# [ ] replace the training and testing datasets with things from CHILDES
# [ ] can the yielded model be used with the existing query code? transformers vs. torch

In [287]:
utt_glosses

Unnamed: 0,gloss,transcript_id,id,utterance_order,speaker_code,type,contains_error,punct,speaker_code_simple,gloss_with_punct,tokens
1,now we need this,3261,279663,1,CHI,trail off,False,...,[CHI],[CHI] now we need this...,"[now, we, need, this]"
2,we need we you don't,3261,279666,2,CHI,declarative,False,.,[CHI],[CHI] we need we you don't.,"[we, need, we, you, don't]"
3,need a pocketbook because I'm the mother,3261,279668,3,CHI,declarative,False,.,[CHI],[CHI] need a pocketbook because i'm the mother.,"[need, a, pocketbook, because, i'm, the, mother]"
4,I'm I'm the,3261,279670,5,CHI,trail off,False,...,[CHI],[CHI] i'm i'm the...,"[i'm, i'm, the]"
6,now you can watch now wait a minute,3261,279672,7,CHI,declarative,False,.,[CHI],[CHI] now you can watch now wait a minute.,"[now, you, can, watch, now, wait, a, minute]"
...,...,...,...,...,...,...,...,...,...,...,...
4205066,none you did so,25473,10212128,1763,MOT,declarative,False,.,[CGV],[CGV] none you did so.,"[none, you, did, so]"
4205068,is it not nice,25473,10212130,1765,MOT,question,False,?,[CGV],[CGV] is it not nice?,"[is, it, not, nice]"
4205069,no,25473,10212131,1766,CHI,declarative,False,.,[CHI],[CHI] no.,[no]
4205070,yeah you have me tired out,25473,10212132,1767,MOT,declarative,False,.,[CGV],[CGV] yeah you have me tired out.,"[yeah, you, have, me, tired, out]"


In [289]:
chi_utt_glosses = utt_glosses.loc[utt_glosses.speaker_code == 'CHI']
chi_tokens = [y for x in chi_utt_glosses['tokens'] for y in x]
chi_token_frequencies = pd.Series(chi_tokens).value_counts().reset_index()
chi_token_frequencies.columns = ['word','count']
chi_token_frequencies.to_csv('data/chi_vocab.csv')

In [291]:
chi_token_frequencies.head()

Unnamed: 0,word,count
0,i,168156
1,a,131763
2,the,111961
3,yeah,103693
4,it,100575


In [20]:
number_id = len(set(utt_glosses.id))
this_len = len(utt_glosses)

print(number_id, this_len) # Yes, they are true ids of the entries.

4205071 4205071
