In [1]:
import pandas as pd
from utils import transformers_bert_completions, load_splits, load_models
from utils_model_sampling import sample_across_models, beta_utils

from yyy_analysis import examples_figure
import numpy as np

import os
from os.path import join, exists


import childespy

In [2]:
# Project-local settings object; later cells read paths from it
# (e.g. config.prov_csv_dir for the cached utterance CSV).
import configuration
config = configuration.Config()

### Get general data

In [3]:
# Utterance glosses for the Providence corpus, cached on disk as a CSV.
# Set `regenerate = True` to force a fresh pull from childes-db; the query is
# also run when the cached CSV does not exist yet, so a fresh checkout works.
# TODO: make `regenerate` controlled from the config.
# NOTE(review): the utterance table displayed later in this notebook has a
# `target_child_name` column that the query below does not select -- confirm
# the cached CSV was produced by this exact query.

# Pin the corpus lookup to the same database version as the utterance query so
# the corpus id and the utterance rows always come from the same release.
pvd_idx = childespy.get_sql_query('select * from corpus where name = "Providence"',
    db_version = "2020.1").iloc[0]['id']

regenerate = False
this_path = join(config.prov_csv_dir, 'pvd_utt_glosses.csv')

if regenerate or not exists(this_path):
    utt_glosses = childespy.get_sql_query('select gloss, transcript_id, id, \
    utterance_order, speaker_code, type from utterance where corpus_id = '+str(pvd_idx) ,
        db_version = "2020.1")
    # Refresh the on-disk cache so later runs can skip the database round trip.
    utt_glosses.to_csv(this_path, index=False)
else:
    utt_glosses = pd.read_csv(this_path)

R[write to console]: Using current database version: '2020.1'.



### Load example information

In [4]:
# Token-level table used throughout this notebook; the cells below read its
# gloss, actual/model phonology, utterance_id, bert_token_id, utterance_order
# and transcript_id columns.
all_tokens_phono = load_splits.load_phono()

In [5]:
success_idx = 16928243

# Show the token rows of the success utterance, restricted to the columns
# needed to locate it (ids, order, transcript) and compare phonology.
phono_cols = ['gloss', 'actual_phonology_no_dia', 'model_phonology_no_dia',
              'utterance_id', 'bert_token_id', 'utterance_order', 'transcript_id']
is_success_utt = all_tokens_phono.utterance_id == success_idx
all_tokens_phono.loc[is_success_utt, phono_cols]


Unnamed: 0,gloss,actual_phonology_no_dia,model_phonology_no_dia,utterance_id,bert_token_id,utterance_order,transcript_id
997717,I want to read,,,16928243,997717,310,42336
997718,I want to read,ɑə,ɑə,16928243,997718,310,42336
997719,I want to read,wɑn,wɑnt,16928243,997719,310,42336
997720,I want to read,də,tu,16928243,997720,310,42336
997721,I want to read,wid,ɹid,16928243,997721,310,42336
997722,I want to read,,,16928243,997722,310,42336


In [6]:
# Transcript that contains the success_idx utterance (the transcript_id shown
# for that utterance in the token table above).
target_transcript_id = 42336 # Corresponds to the success_idx 

### Find new test examples - successes

In [7]:
# This notebook uses the newly generated versions with +/- 20 utterances of context.

# Keyword arguments shared by both BERT title lookups; the keys are written to
# match the parameters of load_models.query_model_title.
default_args = dict(
    split = 'all',
    dataset = 'all',
    is_tags = False,
    context_num = 20,
)

# Titles are the values compared against the `model` column of the score frames below.
childes_all_title = load_models.query_model_title(model_type = 'childes', **default_args)
adult_all_title = load_models.query_model_title(model_type = 'adult', **default_args)
unigram_title = 'CHILDES Unigram'

In [8]:

# Models needed for the example figure:
#   CDL + Context +/- 20
#   BERT + Context +/- 20
#   Childes on train data.

# Open question: how to load properly with sample across models?
# NOTE(review): each tuple appears to be
# (split, dataset, is_tags, context_num, model_type), mirroring the keys of
# default_args -- confirm against examples_figure.get_scores_across_models.
which_models = [
    ('all', 'all', False, 0, 'data_unigram'),
    ('all', 'all', False, 20, 'childes'),
    ('all', 'all', False, 20, 'adult'),
]

# NOTE(review): the third positional argument is True here and False for the
# yyy example later in the notebook; presumably a success/failure flag -- confirm.
raw_scores_across_models = examples_figure.get_scores_across_models(success_idx, which_models, True)
# Stack the per-model results into one frame; later cells filter it by `model`.
scores_across_models = pd.concat(raw_scores_across_models)

Running model CHILDES Unigram...
Processing beta value 1 of 10
If possible compare the bert_token_id in sample_across_models to the bert_token_id in one of the other scores sets from bert.
Running model CHILDES BERT without tags, , +-20 utts context...
Computing failure scores
Computing success scores
Processing beta value 1 of 10


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Running model Adult BERT without tags, , +-20 utts context...
Computing failure scores
Computing success scores
Processing beta value 1 of 10


In [9]:
# Scores for the token 'read' from the model titled childes_all_title,
# trimmed to the columns describing the top prior/posterior candidates.
success_example = (
    scores_across_models
    .loc[lambda df: (df.model == childes_all_title) & (df.token == 'read')]
    [['model', 'highest_posterior_words', 'highest_posterior_probabilities',
      'highest_prior_words', 'highest_prior_probabilities', 'prior_probability', 'token']]
)
success_example

  and should_run_async(code)


Unnamed: 0,model,highest_posterior_words,highest_posterior_probabilities,highest_prior_words,highest_prior_probabilities,prior_probability,token
997721,"CHILDES BERT without tags, , +-20 utts context",read see watch hear look know eat be weed do,0.755326884804619 0.2255594917587144 0.0071594...,read see play look know watch try do go write,0.6819752 0.20365484 0.02005186 0.0146201 0.01...,0.681975,read


In [10]:
# Render the top prior candidates as "word (prob)", probabilities rounded to 2 places.
# Both columns hold space-separated strings, so they are split before pairing.
top_row = success_example.iloc[0]
words = top_row.highest_prior_words.split(' ')
probs = [float(p) for p in top_row.highest_prior_probabilities.split(' ')]
' '.join('{} ({})'.format(word, str(np.round(probs[i], 2))) for i, word in enumerate(words))

  and should_run_async(code)


'read (0.68) see (0.2) play (0.02) look (0.01) know (0.01) watch (0.01) try (0.0) do (0.0) go (0.0) write (0.0)'

In [11]:
# Re-select the 'read' row for the model titled childes_all_title so this cell
# does not depend on which model a previously run cell left in `success_example`.
# (The bare `success_example` expression that used to sit mid-cell was removed:
# only the last expression of a cell is displayed, so it had no effect.)
success_example = scores_across_models.loc[(scores_across_models.model == childes_all_title) &
    (scores_across_models.token == 'read')][['model','highest_posterior_words','highest_posterior_probabilities',
    'highest_prior_words','highest_prior_probabilities', 'prior_probability','token']]

# Render the top posterior candidates as "word (prob)", rounded to 2 places.
words = success_example.iloc[0].highest_posterior_words.split(' ')
probs = [float(x) for x in success_example.iloc[0].highest_posterior_probabilities.split(' ')]
' '.join([words[i]+' ('+str(np.round(probs[i],2))+')' for i in range(len(words))])

'read (0.76) see (0.23) watch (0.01) hear (0.0) look (0.0) know (0.0) eat (0.0) be (0.0) weed (0.0) do (0.0)'

In [12]:
# Same 'read' token, now taken from the model titled adult_all_title.
# (The bare `success_example` expression that used to sit mid-cell was removed:
# only the last expression of a cell is displayed, so it had no effect.)
success_example = scores_across_models.loc[(scores_across_models.model == adult_all_title) &
    (scores_across_models.token == 'read')][['model','highest_posterior_words','highest_posterior_probabilities',
    'highest_prior_words','highest_prior_probabilities', 'prior_probability','token']]

# Render the top prior candidates as "word (prob)", rounded to 2 places.
words = success_example.iloc[0].highest_prior_words.split(' ')
probs = [float(x) for x in success_example.iloc[0].highest_prior_probabilities.split(' ')]
' '.join([words[i]+' ('+str(np.round(probs[i],2))+')' for i in range(len(words))])

'read (0.49) see (0.28) play (0.04) know (0.04) look (0.02) go (0.01) watch (0.01) help (0.01) try (0.01) talk (0.01)'

In [13]:
# Render the top posterior candidates of the current `success_example` row
# as "word (prob)", probabilities rounded to 2 places.
top_row = success_example.iloc[0]
words = top_row.highest_posterior_words.split(' ')
probs = [float(p) for p in top_row.highest_posterior_probabilities.split(' ')]
' '.join('{} ({})'.format(word, str(np.round(probs[i], 2))) for i, word in enumerate(words))

'read (0.61) see (0.35) watch (0.01) hear (0.01) know (0.0) be (0.0) look (0.0) eat (0.0) weed (0.0) go (0.0)'

In [14]:
# Display the row currently held in `success_example` (last assigned from the
# adult_all_title selection above) to check the formatted strings against it.
success_example

Unnamed: 0,model,highest_posterior_words,highest_posterior_probabilities,highest_prior_words,highest_prior_probabilities,prior_probability,token
997721,"Adult BERT without tags, , +-20 utts context",read see watch hear know be look eat weed go,0.6111245106575145 0.3457097600504063 0.012493...,read see play know look go watch help try talk,0.48782247 0.27595848 0.036773972 0.036336947 ...,0.487822,read


In [15]:
# Scores for the token 'read' from the unigram baseline (unigram_title),
# trimmed to the columns describing the top prior/posterior candidates.
success_example = (
    scores_across_models
    .loc[lambda df: (df.model == unigram_title) & (df.token == 'read')]
    [['model', 'highest_posterior_words', 'highest_posterior_probabilities',
      'highest_prior_words', 'highest_prior_probabilities', 'prior_probability', 'token']]
)
success_example

  and should_run_async(code)


Unnamed: 0,model,highest_posterior_words,highest_posterior_probabilities,highest_prior_words,highest_prior_probabilities,prior_probability,token
3,CHILDES Unigram,we and need one what would he me here see,0.22214807950129137 0.07648990509019489 0.0701...,i a the yeah no it you and that this,0.04253891500819406 0.03258178351349858 0.0279...,0.001048,read


In [16]:
# Render the top prior candidates of the current `success_example` row
# as "word (prob)", probabilities rounded to 2 places.
top_row = success_example.iloc[0]
words = top_row.highest_prior_words.split(' ')
probs = [float(p) for p in top_row.highest_prior_probabilities.split(' ')]
' '.join('{} ({})'.format(word, str(np.round(probs[i], 2))) for i, word in enumerate(words))

  and should_run_async(code)


'i (0.04) a (0.03) the (0.03) yeah (0.03) no (0.03) it (0.03) you (0.02) and (0.02) that (0.02) this (0.01)'

In [17]:
# Render the top posterior candidates of the current `success_example` row
# as "word (prob)", probabilities rounded to 2 places.
top_row = success_example.iloc[0]
words = top_row.highest_posterior_words.split(' ')
probs = [float(p) for p in top_row.highest_posterior_probabilities.split(' ')]
' '.join('{} ({})'.format(word, str(np.round(probs[i], 2))) for i, word in enumerate(words))

'we (0.22) and (0.08) need (0.07) one (0.04) what (0.03) would (0.03) he (0.03) me (0.03) here (0.02) see (0.02)'

In [18]:
# Conversational context for the success utterance. `utt_glosses` holds the
# utterances loaded at the top of this notebook; the success utterance has
# utterance_order 310 in the token table shown earlier.
success_order = 310
# Note to self: changed the range here (10 utterances before, up to 1 after).
context_orders = range(success_order-10, success_order+2)
in_target_transcript = utt_glosses.transcript_id == target_transcript_id
utt_glosses.loc[in_target_transcript & utt_glosses.utterance_order.isin(context_orders)]

Unnamed: 0,gloss,transcript_id,id,utterance_order,target_child_name,speaker_code,type
162628,Jasmine,42336,16928069,300,Lily,MOT,declarative
162639,a ballet,42336,16928081,301,Lily,CHI,declarative
162656,is Jasmine a ballerina,42336,16928098,302,Lily,MOT,question
162670,yeah,42336,16928113,303,Lily,CHI,declarative
162686,oh I didn't know that,42336,16928129,304,Lily,MOT,declarative
162710,I ready now yyy,42336,16928154,305,Lily,CHI,declarative
162730,whoa,42336,16928174,306,Lily,MOT,declarative
162744,yyy,42336,16928189,307,Lily,CHI,declarative
162760,yyy yyy,42336,16928205,308,Lily,CHI,declarative
162779,you want mamma let's see,42336,16928225,309,Lily,MOT,declarative


## New extraction section

In [19]:
# Utterance id of the yyy example analysed in this section.
yyy_idx = 16813515

# NOTE(review): this reuses the names `raw_scores_across_models` and
# `scores_across_models` from the success example, so after this cell the
# earlier 'read' cells can no longer be re-run without re-running their scoring cell.
# The third argument is False here (True for the success example); presumably
# a success/failure flag -- confirm in examples_figure.get_scores_across_models.
raw_scores_across_models = examples_figure.get_scores_across_models(yyy_idx, which_models, False)
# Stack the per-model results into one frame; later cells filter it by `model`.
scores_across_models = pd.concat(raw_scores_across_models)

  and should_run_async(code)


Running model CHILDES Unigram...
Processing beta value 1 of 10
If possible compare the bert_token_id in sample_across_models to the bert_token_id in one of the other scores sets from bert.
Running model CHILDES BERT without tags, , +-20 utts context...
Computing failure scores
Computing success scores
Processing beta value 1 of 10


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Running model Adult BERT without tags, , +-20 utts context...
Computing failure scores
Computing success scores
Processing beta value 1 of 10


In [20]:
# Show the token rows of the yyy utterance, restricted to the columns needed
# to locate it (ids, order, transcript) and compare phonology.
phono_cols = ['gloss', 'actual_phonology_no_dia', 'model_phonology_no_dia',
              'utterance_id', 'bert_token_id', 'utterance_order', 'transcript_id']
is_yyy_utt = all_tokens_phono.utterance_id == yyy_idx
all_tokens_phono.loc[is_yyy_utt, phono_cols]

  and should_run_async(code)


Unnamed: 0,gloss,actual_phonology_no_dia,model_phonology_no_dia,utterance_id,bert_token_id,utterance_order,transcript_id
289678,you make your yyy,,,16813515,289678,112,42253
289679,you make your yyy,ju,ju,16813515,289679,112,42253
289680,you make your yyy,meək,meək,16813515,289680,112,42253
289681,you make your yyy,jɜ,jɑɹ,16813515,289681,112,42253
289682,you make your yyy,fɜt,*,16813515,289682,112,42253
289683,you make your yyy,,,16813515,289683,112,42253


In [21]:
# Rows scored by the model titled childes_all_title for the yyy utterance.
# No token filter is applied; the first matching row is the one formatted below.
# (The bare `yyy_example` expression that used to sit mid-cell was removed:
# only the last expression of a cell is displayed, so it had no effect.)
yyy_example = scores_across_models.loc[(scores_across_models.model == childes_all_title)][['model','highest_posterior_words','highest_posterior_probabilities',
    'highest_prior_words','highest_prior_probabilities', 'prior_probability','token']]

# Render the top posterior candidates as "word (prob)", rounded to 2 places.
words = yyy_example.iloc[0].highest_posterior_words.split(' ')
probs = [float(x) for x in yyy_example.iloc[0].highest_posterior_probabilities.split(' ')]
' '.join([words[i]+' ('+str(np.round(probs[i],2))+')' for i in range(len(words))])

  and should_run_async(code)


'own (0.48) bet (0.09) cut (0.05) bed (0.04) shot (0.03) wish (0.03) guess (0.03) call (0.02) choice (0.02) move (0.02)'

In [22]:
# Rows scored by the model titled adult_all_title for the yyy utterance.
# No token filter is applied; the first matching row is the one formatted below.
# (The bare `yyy_example` expression that used to sit mid-cell was removed:
# only the last expression of a cell is displayed, so it had no effect.)
yyy_example = scores_across_models.loc[(scores_across_models.model == adult_all_title)][['model','highest_posterior_words','highest_posterior_probabilities',
    'highest_prior_words','highest_prior_probabilities', 'prior_probability','token']]

# Render the top prior candidates as "word (prob)", rounded to 2 places.
words = yyy_example.iloc[0].highest_prior_words.split(' ')
probs = [float(x) for x in yyy_example.iloc[0].highest_prior_probabilities.split(' ')]
' '.join([words[i]+' ('+str(np.round(probs[i],2))+')' for i in range(len(words))])

'own (0.25) choice (0.24) point (0.04) bed (0.03) call (0.03) guess (0.03) wish (0.02) choices (0.02) move (0.02) mistake (0.02)'

In [23]:
# Render the top posterior candidates of the current `yyy_example` row
# as "word (prob)", probabilities rounded to 2 places.
top_row = yyy_example.iloc[0]
words = top_row.highest_posterior_words.split(' ')
probs = [float(p) for p in top_row.highest_posterior_probabilities.split(' ')]
' '.join('{} ({})'.format(word, str(np.round(probs[i], 2))) for i, word in enumerate(words))

'own (0.32) bet (0.2) bed (0.04) call (0.04) cut (0.04) guess (0.04) wish (0.03) shot (0.03) choice (0.03) move (0.03)'

In [24]:
# Rows scored by the unigram baseline (unigram_title) for the yyy utterance.
# No token filter is applied; the first matching row is the one formatted below.
# (The bare `yyy_example` expression that used to sit mid-cell was removed:
# only the last expression of a cell is displayed, so it had no effect.)
yyy_example = scores_across_models.loc[(scores_across_models.model == unigram_title)][['model','highest_posterior_words','highest_posterior_probabilities',
    'highest_prior_words','highest_prior_probabilities', 'prior_probability','token']]

# Render the top prior candidates as "word (prob)", rounded to 2 places.
words = yyy_example.iloc[0].highest_prior_words.split(' ')
probs = [float(x) for x in yyy_example.iloc[0].highest_prior_probabilities.split(' ')]
' '.join([words[i]+' ('+str(np.round(probs[i],2))+')' for i in range(len(words))])

'i (0.04) a (0.03) the (0.03) yeah (0.03) no (0.03) it (0.03) you (0.02) and (0.02) that (0.02) this (0.01)'

In [25]:
# Render the top posterior candidates of the current `yyy_example` row
# as "word (prob)", probabilities rounded to 2 places.
top_row = yyy_example.iloc[0]
words = top_row.highest_posterior_words.split(' ')
probs = [float(p) for p in top_row.highest_posterior_probabilities.split(' ')]
' '.join('{} ({})'.format(word, str(np.round(probs[i], 2))) for i, word in enumerate(words))

'it (0.14) that (0.1) what (0.06) not (0.04) get (0.03) got (0.03) put (0.03) fit (0.03) feet (0.02) for (0.02)'

In [26]:
# Conversational context for the yyy utterance: three utterances before it and
# two after, within its transcript. The transcript id and utterance_order (112)
# are the values shown for yyy_idx in the token table above.
yyy_transcript_id = 42253
yyy_order = 112
context_orders = range(yyy_order-3, yyy_order+3)
in_yyy_transcript = utt_glosses.transcript_id == yyy_transcript_id
utt_glosses.loc[in_yyy_transcript & utt_glosses.utterance_order.isin(context_orders)]

Unnamed: 0,gloss,transcript_id,id,utterance_order,target_child_name,speaker_code,type
50658,then we won't be able to put them back into th...,42253,16813459,109,Alex,MOT,declarative
50679,do you want ta put some beans in your eggs and...,42253,16813482,110,Alex,MOT,question
50697,no,42253,16813501,111,Alex,CHI,declarative
50710,you make your yyy,42253,16813515,112,Alex,CHI,declarative
50723,can I make one,42253,16813529,113,Alex,MOT,question
50745,no,42253,16813554,114,Alex,MOT,declarative
