In [None]:
%load_ext rpy2.ipython
import copy
import imp
import importlib
import os
from string import punctuation

import childespy
import numpy as np
import pandas as pd
import rpy2.robjects.lib.ggplot2 as ggplot2
import scipy.stats
import torch
import transformers

import transfomers_bert_completions
# Reload the local helper module so that edits made to it since the first import take effect.
# importlib.reload replaces imp.reload (the imp module is deprecated and removed in Python 3.12).
importlib.reload(transfomers_bert_completions)

# Masked Language Prediction Softmax with BERT 

In [None]:
from pytorch_pretrained_bert import BertForMaskedLM
from transformers import BertTokenizer

In [None]:
def _load_masked_lm(model_path):
    """Load one BERT masked LM together with its tokenizer and softmax mask.

    Args:
        model_path: name or directory handed to both ``from_pretrained`` calls.

    Returns:
        Tuple ``(model, tokenizer, softmax_mask, vocab)``. ``model.eval()`` has
        been called on the model; ``softmax_mask`` and ``vocab`` are the two
        values returned by ``transfomers_bert_completions.get_softmax_mask``.
    """
    model = BertForMaskedLM.from_pretrained(model_path)
    model.eval()
    tokenizer = BertTokenizer.from_pretrained(model_path)
    # NOTE(review): cmu_2syl_inchildes is not defined in the visible cells;
    # it must already exist in the kernel when this cell runs.
    softmax_mask, vocab = transfomers_bert_completions.get_softmax_mask(
        tokenizer, cmu_2syl_inchildes.word)
    return model, tokenizer, softmax_mask, vocab


# Off-the-shelf BERT.
(adult_bertMaskedLM, adult_tokenizer,
 adult_softmax_mask, adult_vocab) = _load_masked_lm('bert-base-uncased')

# ft1: fine-tuned BERT used without speaker labels.
(ft1_bertMaskedLM, ft1_tokenizer,
 ft1_softmax_mask, ft1_vocab) = _load_masked_lm('model_output')

# ft2: fine-tuned BERT used with speaker labels.
(ft2_bertMaskedLM, ft2_tokenizer,
 ft2_softmax_mask, ft2_vocab) = _load_masked_lm('model_output2')

### Adult Model

In [None]:
# Adult BERT: first 10 rows of compare_completions for the prompt "mommy [MASK] ."
transfomers_bert_completions.compare_completions(
    "mommy [MASK] .",
    adult_bertMaskedLM,
    adult_tokenizer,
).head(10)

In [None]:
# Adult BERT: first 10 rows of compare_completions for the prompt "where [MASK] ?"
transfomers_bert_completions.compare_completions(
    "where [MASK] ?",
    adult_bertMaskedLM,
    adult_tokenizer,
).head(10)

In [None]:
# Adult BERT: first 10 rows of compare_completions for the prompt "hi [MASK] ."
transfomers_bert_completions.compare_completions(
    "hi [MASK] .",
    adult_bertMaskedLM,
    adult_tokenizer,
).head(10)

In [None]:
# Adult BERT: first 10 rows of compare_completions for the prompt "what [MASK] ."
transfomers_bert_completions.compare_completions(
    "what [MASK] .",
    adult_bertMaskedLM,
    adult_tokenizer,
).head(10)

In [None]:
# Adult BERT: keep the full compare_completions result for "go [MASK] ."
# in `completions`; nothing is displayed by this cell.
completions = transfomers_bert_completions.compare_completions(
    "go [MASK] .",
    adult_bertMaskedLM,
    adult_tokenizer,
)

# BERT without Context

In [None]:
# Load the pickled object that the get_stats_* and unigram cells receive as
# their first argument (presumably a token-level DataFrame -- confirm).
all_tokens_phono = pd.read_pickle(
    'csv/pvd_utt_glosses_phono_cleaned_inflated.pkl'
)

In [None]:
# otsb (off-the-shelf BERT): completions for a lone masked token followed by ".".
# bert_completions returns a pair; only its second element is kept and previewed.
_, predictions = transfomers_bert_completions.bert_completions(
    "[MASK] .",
    adult_bertMaskedLM,
    adult_tokenizer,
    adult_softmax_mask,
)
predictions.head(10)

In [None]:
# otsb (off-the-shelf BERT), no context: failure statistics for id 16764425.
# The three returned values are kept for inspection in the next cell.
priors, completions, stats = transfomers_bert_completions.get_stats_for_failure(
    all_tokens_phono,
    16764425,
    adult_bertMaskedLM,
    adult_tokenizer,
    adult_softmax_mask,
    None,
    use_speaker_labels=False,
)

In [None]:
# Print the three failure-statistics results, one per line.
print(priors, completions, stats, sep='\n')

In [None]:
# otsb (off-the-shelf BERT), no context: success statistics for id 16759315.
transfomers_bert_completions.get_stats_for_success(
    all_tokens_phono,
    16759315,
    adult_bertMaskedLM,
    adult_tokenizer,
    adult_softmax_mask,
    'score',
    None,
    use_speaker_labels=False,
)

In [None]:
# ft1 (fine-tuned BERT, no speaker labels): completions for a lone masked token.
# The softmax mask must be the one built from the ft1 tokenizer; the original cell
# passed adult_softmax_mask, unlike every other ft1 cell in this notebook.
transfomers_bert_completions.bert_completions(
    "[MASK] .", ft1_bertMaskedLM, ft1_tokenizer, ft1_softmax_mask)

In [None]:
# ft1 (fine-tuned BERT, no speaker labels), no context: failure statistics for id 16764425.
transfomers_bert_completions.get_stats_for_failure(
    all_tokens_phono,
    16764425,
    ft1_bertMaskedLM,
    ft1_tokenizer,
    ft1_softmax_mask,
    None,
    use_speaker_labels=False,
)

In [None]:
# ft1 (fine-tuned BERT, no speaker labels), no context: success statistics for id 16759315.
transfomers_bert_completions.get_stats_for_success(
    all_tokens_phono,
    16759315,
    ft1_bertMaskedLM,
    ft1_tokenizer,
    ft1_softmax_mask,
    'score',
    None,
    use_speaker_labels=False,
)

In [None]:
# ft2 (fine-tuned BERT, with speaker labels): completions for a lone masked
# token prefixed by the [chi] speaker tag.
transfomers_bert_completions.bert_completions(
    "[chi] [MASK] .",
    ft2_bertMaskedLM,
    ft2_tokenizer,
    ft2_softmax_mask,
)

In [None]:
# ft2 (fine-tuned BERT, with speaker labels), no context: failure statistics for id 16764425.
transfomers_bert_completions.get_stats_for_failure(
    all_tokens_phono,
    16764425,
    ft2_bertMaskedLM,
    ft2_tokenizer,
    ft2_softmax_mask,
    None,
    use_speaker_labels=True,
    preserve_errors=True,
)

In [None]:
# ft2 (fine-tuned BERT, with speaker labels), no context: success statistics for id 16759315.
transfomers_bert_completions.get_stats_for_success(
    all_tokens_phono,
    16759315,
    ft2_bertMaskedLM,
    ft2_tokenizer,
    ft2_softmax_mask,
    'score',
    None,
    use_speaker_labels=True,
)

# BERT with Context

In [None]:
# Adult BERT: masked one-word utterance preceded by three [SEP]-separated utterances.
transfomers_bert_completions.bert_completions(
    "look ! [SEP] you see the ball . [SEP] what is it? [SEP] [MASK] !",
    adult_bertMaskedLM, adult_tokenizer, adult_softmax_mask)

In [None]:
# Adult BERT: the same masked utterance with no preceding utterances.
transfomers_bert_completions.bert_completions(
    "[MASK] !", adult_bertMaskedLM, adult_tokenizer, adult_softmax_mask)

In [None]:
# ft1: masked one-word utterance preceded by three [SEP]-separated utterances.
transfomers_bert_completions.bert_completions(
    "look ! [SEP] you see the ball . [SEP] what is it? [SEP] [MASK] !",
    ft1_bertMaskedLM, ft1_tokenizer, ft1_softmax_mask)

In [None]:
# ft1: the same masked utterance with no preceding utterances.
transfomers_bert_completions.bert_completions(
    "[MASK] !", ft1_bertMaskedLM, ft1_tokenizer, ft1_softmax_mask)

In [None]:
# ft2 (with speaker labels): masked one-word utterance preceded by three
# utterances. Each utterance starts with its speaker tag and utterances are
# separated by [SEP]; the original prompt had the second [cgv] tag before the
# [SEP] instead of after it, leaving "you see the ball ." unlabelled.
transfomers_bert_completions.bert_completions(
    "[cgv] look ! [SEP] [cgv] you see the ball . [SEP] [cgv] what is it? [SEP] [cgv] [MASK] !",
    ft2_bertMaskedLM,
    ft2_tokenizer,
    ft2_softmax_mask
)

In [None]:
# Prompts without speaker labels: the masked utterance alone, and the same
# utterance embedded among its neighbours, joined by the [SEP] token.
minimal_context = "what is this [MASK] ."
full_context = " [SEP] ".join([
    "you can play it .",
    "build the blocks .",
    "look .",
    "what is this [MASK] .",
    "wee .",
    "what ?",
    "alright just a little . . .",
])

In [None]:
# Adult BERT on the full (multi-utterance) context.
transfomers_bert_completions.bert_completions(
    full_context,
    adult_bertMaskedLM,
    adult_tokenizer,
    adult_softmax_mask,
)

In [None]:
# Adult BERT on the minimal (single-utterance) context.
transfomers_bert_completions.bert_completions(
    minimal_context,
    adult_bertMaskedLM,
    adult_tokenizer,
    adult_softmax_mask,
)

In [None]:
# ft1 on the full (multi-utterance) context.
transfomers_bert_completions.bert_completions(
    full_context,
    ft1_bertMaskedLM,
    ft1_tokenizer,
    ft1_softmax_mask,
)

In [None]:
# ft1 on the minimal (single-utterance) context.
transfomers_bert_completions.bert_completions(
    minimal_context,
    ft1_bertMaskedLM,
    ft1_tokenizer,
    ft1_softmax_mask,
)

In [None]:
# Speaker-labelled prompts: every utterance starts with a [cgv] or [chi] tag.
# This rebinds minimal_context / full_context, so cells run after this one see
# the labelled versions.
minimal_context = "[cgv] what is this [MASK] ."
full_context = " [SEP] ".join([
    "[cgv] you can play it .",
    "[cgv] build the blocks .",
    "[cgv] look .",
    "[cgv] what is this [MASK] .",
    "[chi] wee .",
    "[cgv] what ?",
    "[cgv]  alright just a little . . .",
])

In [None]:
# ft2 on the minimal context currently bound to `minimal_context`.
transfomers_bert_completions.bert_completions(
    minimal_context,
    ft2_bertMaskedLM,
    ft2_tokenizer,
    ft2_softmax_mask,
)

In [None]:
# ft2 on the full context currently bound to `full_context`.
transfomers_bert_completions.bert_completions(
    full_context,
    ft2_bertMaskedLM,
    ft2_tokenizer,
    ft2_softmax_mask,
)

# Successes and Failures

In [None]:
# Re-import and reload the local helper module so that edits made to it since
# the first import take effect. importlib.reload replaces imp.reload (the imp
# module is deprecated and removed in Python 3.12).
import transfomers_bert_completions
importlib.reload(transfomers_bert_completions)

In [None]:
# Adult BERT: failure statistics for id 17280349, passing 5 where the
# no-context cells pass None.
# NOTE(review): the ft1/ft2 failure cells in this section use id 17280276 --
# confirm the different id here is intended.
transfomers_bert_completions.get_stats_for_failure(
    all_tokens_phono,
    17280349,
    adult_bertMaskedLM,
    adult_tokenizer,
    adult_softmax_mask,
    5,
    use_speaker_labels=False,
    preserve_errors=True,
)

In [None]:
# Adult BERT: success statistics for id 16759315.
# NOTE(review): this call passes None where the ft1/ft2 success cells in this
# section pass 5 -- confirm that is intended.
transfomers_bert_completions.get_stats_for_success(
    all_tokens_phono,
    16759315,
    adult_bertMaskedLM,
    adult_tokenizer,
    adult_softmax_mask,
    'score',
    None,
    use_speaker_labels=False,
    preserve_errors=True,
)

In [None]:
# ft1 (fine-tuned BERT, no speaker labels): failure statistics for id 17280276,
# passing 5 where the no-context cells pass None.
transfomers_bert_completions.get_stats_for_failure(
    all_tokens_phono,
    17280276,
    ft1_bertMaskedLM,
    ft1_tokenizer,
    ft1_softmax_mask,
    5,
    use_speaker_labels=False,
    preserve_errors=True,
)

In [None]:
# ft1 (fine-tuned BERT, no speaker labels): success statistics for id 16759315,
# passing 5 where the no-context cells pass None.
transfomers_bert_completions.get_stats_for_success(
    all_tokens_phono,
    16759315,
    ft1_bertMaskedLM,
    ft1_tokenizer,
    ft1_softmax_mask,
    'score',
    5,
    use_speaker_labels=False,
    preserve_errors=True,
)

In [None]:
# ft2 (fine-tuned BERT, with speaker labels): failure statistics for id 17280276,
# passing 5 where the no-context cells pass None.
transfomers_bert_completions.get_stats_for_failure(
    all_tokens_phono,
    17280276,
    ft2_bertMaskedLM,
    ft2_tokenizer,
    ft2_softmax_mask,
    5,
    use_speaker_labels=True,
    preserve_errors=True,
)

In [None]:
# ft2 (fine-tuned BERT, with speaker labels): success statistics for id 16759315,
# passing 5 where the no-context cells pass None.
transfomers_bert_completions.get_stats_for_success(
    all_tokens_phono,
    16759315,
    ft2_bertMaskedLM,
    ft2_tokenizer,
    ft2_softmax_mask,
    'score',
    5,
    use_speaker_labels=True,
    preserve_errors=True,
)

In [3]:
### Retrieve from selected utterances

In [None]:
# Draw utterance ids without replacement: 500 success utterances and 1000
# failure (yyy) utterances.
# NOTE(review): the previous comment said 1000 successes, but the code has
# always drawn 500 -- confirm which sample size is intended.
# NOTE(review): success_utts and yyy_utts are not defined in the visible cells.
# A fixed seed makes the selection reproducible across kernel restarts.
SAMPLE_SEED = 0
sampling_rng = np.random.RandomState(SAMPLE_SEED)
selected_success_utts = sampling_rng.choice(success_utts.utterance_id, 500, replace=False)
selected_yyy_utts = sampling_rng.choice(yyy_utts.utterance_id, 1000, replace=False)

# Unigram Model

In [None]:
# Unigram-model comparison of the sampled success and yyy utterances, passing
# the 'data/chi_vocab.csv' path as the sixth argument.
# NOTE(review): initial_vocab is not defined in the visible cells.
unigram_scores = transfomers_bert_completions.compare_successes_failures_unigram_model(
    all_tokens_phono, selected_success_utts, selected_yyy_utts,
    adult_tokenizer, adult_softmax_mask,
    'data/chi_vocab.csv', initial_vocab)

In [None]:
# Same unigram-model comparison, but with None in place of the
# 'data/chi_vocab.csv' path (presumably a flat/uniform prior, going by the
# variable name -- confirm).
# The original target name started with a digit ("2unigram_scores_flat"),
# which is a SyntaxError; the stray "2" is dropped.
# NOTE(review): initial_vocab is not defined in the visible cells.
unigram_scores_flat = transfomers_bert_completions.compare_successes_failures_unigram_model(
    all_tokens_phono,
    selected_success_utts,
    selected_yyy_utts,
    adult_tokenizer,
    adult_softmax_mask,
    None,
    initial_vocab)