In [395]:
import transformers

from transformers import GPT2LMHeadModel, GPT2Tokenizer, BertForMaskedLM, BertTokenizer, BartForConditionalGeneration, BartTokenizer
from new_models import model_prefixes

In [396]:


models = {
    'gpt2': GPT2LMHeadModel.from_pretrained('gpt2'),
    'bert': BertForMaskedLM.from_pretrained('bert-large-uncased-whole-word-masking'),
    'bart': BartForConditionalGeneration.from_pretrained('facebook/bart-base'),
    
}

tokenizers = {
    'gpt2': GPT2Tokenizer.from_pretrained('gpt2'),
    'bert': BertTokenizer.from_pretrained("bert-large-uncased-whole-word-masking"),
    'bart': BartTokenizer.from_pretrained("facebook/bart-base"),
}



Some weights of the model checkpoint at bert-large-uncased-whole-word-masking were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# Other scratchwork

In [401]:
# Looking at test saves for the model scores

import os
from os.path import join, exists

from new_models import prep_probs

RESULTS_FOLDER = './intermediate_results/new_models_probs'

temp_results = {}
for model_name in ['gpt2_normal', 'gpt2_medium', 'bert', 'bart']:
    print(model_name)
    temp_results[model_name] = prep_probs.load_word_scores(model_name, RESULTS_FOLDER)
    print(f'For model: {model_name}, length: {len(temp_results[model_name])}')

gpt2_normal
For model: gpt2_normal, length: 3193
gpt2_medium
For model: gpt2_medium, length: 3193
bert
For model: bert, length: 3193
bart


KeyboardInterrupt: 

In [455]:
temp_results['bart'] = prep_probs.load_word_scores('bart', RESULTS_FOLDER)

       word      prob
0      Each  0.051310
1      Ġnon  0.708738
2   fiction  0.336515
3     Ġbook  0.593027
4      Ġhas  0.152824
5        Ġa  0.731974
6     Ġcall  0.000202
7   Ġnumber  0.000671
8       Ġon  0.287921
9      Ġits  0.252410
10   Ġspine  0.007092


# Parallel development

In [449]:
import align_prep_words
importlib.reload(align_prep_words)

importlib.reload(prep_probs)

word_list = [df['word'].values.tolist() for df in lm['bnc_unigram']]

transformer_names = ['gpt2_normal', 'gpt2_medium', 'bert']#, 'bart']
tokenizer_names = ['gpt2', 'gpt2', 'bert']#, 'bart']

sel_idx = 20
for model_name, tokenizer_name in zip(transformer_names, tokenizer_names):

    print(tokenizer_name)
    print(f'Processing model name: {model_name}')
    
    raw_scores = prep_probs.load_word_scores(model_name, RESULTS_FOLDER)
    lm[f'{model_name}_scores'] = align_prep_words.align_model_word_dfs(raw_scores[:sel_idx],
                                                                       tokenizers[tokenizer_name],
                                                                       word_list[:sel_idx])

gpt2
Processing model name: gpt2_normal
       word      prob
0      Each  0.000269
1      Ġnon  0.000262
2   fiction  0.011790
3     Ġbook  0.438744
4      Ġhas  0.057975
5        Ġa  0.289296
6     Ġcall  0.000068
7   Ġnumber  0.000363
8       Ġon  0.043549
9      Ġits  0.095197
10   Ġspine  0.051601
Index: 0
gpt2
Processing model name: gpt2_medium
       word      prob
0      Each  0.000595
1      Ġnon  0.000134
2   fiction  0.011834
3     Ġbook  0.683527
4      Ġhas  0.055999
5        Ġa  0.325504
6     Ġcall  0.000187
7   Ġnumber  0.002454
8       Ġon  0.087685
9      Ġits  0.168300
10   Ġspine  0.049400
Index: 0
bert
Processing model name: bert
         word      prob
0        each  0.472037
1  nonfiction  0.000103
2        book  0.006232
3         has  0.809371
4           a  0.942987
5        call  0.002517
6      number  0.024608
7          on  0.937234
8         its  0.387401
9       spine  0.009837
Index: 0


In [445]:
# Ensure correctness of this sub-sample?

for idx, name in enumerate(transformer_names):
    print(lm[f'{name}_scores'][idx])
    print(temp_results[name][idx])
    print()

       prob        word
0 -3.570354        each
1       NaN  nonfiction
2 -0.357789        book
3 -1.236761         has
4 -0.538658           a
5 -4.164424        call
6 -3.439832      number
7 -1.361023          on
8 -1.021375         its
9 -1.287340       spine
       word      prob
0      Each -3.570354
1      Ġnon -3.581758
2   fiction -1.928502
3     Ġbook -0.357789
4      Ġhas -1.236761
5        Ġa -0.538658
6     Ġcall -4.164424
7   Ġnumber -3.439832
8       Ġon -1.361023
9      Ġits -1.021375
10   Ġspine -1.287340
11       Ġ. -3.513690

        prob     word
0  -3.225354     each
1  -3.874274      non
2  -5.236221  fiction
3  -0.472252     book
4  -1.158080      has
5  -0.485143        a
6  -3.748181     call
7  -2.511031   number
8  -1.469794       in
9  -1.263469      its
10 -2.077428    spine
        word      prob
0       Each -3.225354
1       Ġnon -3.874274
2   Ġfiction -5.236221
3      Ġbook -0.472252
4       Ġhas -1.158080
5         Ġa -0.485143
6      Ġcall -3.748181
7

In [416]:
# Why does "closed" not match expectations?

importlib.reload(align_prep_words)
align_prep_words.process_nan_single_df(temp_results['gpt2_normal'][1260], tokenizers['gpt2'], word_list[1260])

Unnamed: 0,prob,word
0,-4.403163,austin
1,-4.549038,closed
2,-0.950157,the
3,-1.287342,door
4,-2.303025,behind
5,-0.651266,him
6,-1.327117,on
7,-0.748366,the
8,-3.311556,boat


# More scratchwork

In [340]:
import load_runs

agg_all_runs = load_runs.load_runs()
prep_all_runs = pd.read_csv('output/all_runs.csv')


prep_list = list(prep_all_runs['user_candidate_transcription'])
agg_list = list(agg_all_runs['user_candidate_transcription'])

# They seem to be the same sentences, but in different orders.

print(set(prep_list) ^ set(agg_list))
print(prep_list[1:] == agg_list[1:])

print(sorted(prep_list) == sorted(agg_list)) # They are just in different orders. So it should be fine?

set()
False
True


In [346]:
# Need to filter out all of the words that don't align? But how?

import pickle
import glob

DATA_PREP_FOLDER = './intermediate_results/data_prep_logistic'

model_names = [filename.split('logistic/')[1].split('_predictions.txt')[0]
               for filename in glob.glob(DATA_PREP_FOLDER+'/*')]

lm = {}

for lm_name in model_names:
    raw_scores_path = join(DATA_PREP_FOLDER, f"{lm_name}_predictions.txt")
    # 3/27: https://stackoverflow.com/questions/27745500/how-to-save-a-list-to-a-file-and-read-it-as-a-list-type
    with open(raw_scores_path, 'rb') as f:
        raw_scores = pickle.load(f)
        lm[lm_name] = raw_scores
        
BERT_case = 'A dietitian goes'
GPT_case = 'Each nonfiction book'

tokenizers['bert'].tokenize(BERT_case)
tokenizers['gpt2'].tokenize(GPT_case)

In [389]:
# This is verified

word_list = [df['word'].values.tolist() for df in lm['bnc_unigram']]
result = align_prep_words.align_model_word_dfs(lm['gpt2_normal_scores'][:2], tokenizers['gpt2'], word_list[:21])

In [314]:


def test_bert_dietitian():
    test_idx = 18

    orig_score = lm['bert_scores'][test_idx]

    print(orig_score)

    reference_sentence = 'a dietitian goes to college for at least four years'
    result = process_nan_single_df(lm['bert_scores'][test_idx], tokenizers['bert'], reference_sentence)
    
    print(result)

def test_gpt2_nonfiction():
    
    test_idx = 0

    orig_score = lm['gpt2_normal_scores'][test_idx]

    print(orig_score)

    reference_sentence = 'each nonfiction book has a call number on its spine'
    result = process_nan_single_df(lm['gpt2_normal_scores'][test_idx], tokenizers['gpt2'], reference_sentence)
    
    print(result)
    
    
test_gpt2_nonfiction()

       word      prob
0      Each -3.570354
1      Ġnon -3.581758
2   fiction -1.928502
3     Ġbook -0.357789
4      Ġhas -1.236761
5        Ġa -0.538658
6     Ġcall -4.164424
7   Ġnumber -3.439832
8       Ġon -1.361023
9      Ġits -1.021375
10   Ġspine -1.287340
11       Ġ. -3.513690
entire idx 1 to collapse 1
entire word nonfiction
to collapse idx non
	 Next collapse: 3

       prob        word
0 -3.570354        each
1       NaN  nonfiction
2 -0.357789        book
3 -1.236761         has
4 -0.538658           a
5 -4.164424        call
6 -3.439832      number
7 -1.361023          on
8 -1.021375         its
9 -1.287340       spine


In [450]:
temp_results['bert'][18]

Unnamed: 0,word,prob
0,a,-0.124584
1,diet,-0.714438
2,##itia,-0.188612
3,##n,-0.0462
4,goes,-0.189202
5,to,-0.074309
6,college,-1.313672
7,for,-0.032421
8,at,-0.000133
9,least,-0.000926


In [454]:
temp_results['gpt2_medium'][1]

Unnamed: 0,word,prob
0,Each,-3.225354
1,Ġnon,-3.874274
2,Ġfiction,-5.236221
3,Ġbook,-0.472252
4,Ġhas,-1.15808
5,Ġa,-0.485143
6,Ġcall,-3.748181
7,Ġnumber,-2.511031
8,Ġin,-1.469794
9,Ġits,-1.263469


In [278]:

def test_find_next_word_bert():
    
    reference_sentence = 'a dietitian goes to college for at least four years'
    
    tok = BertTokenizer.from_pretrained("bert-large-uncased-whole-word-masking")
    ttokens = [filter_symbols(t) for t in tok.tokenize(reference_sentence)]
    tidx = find_next_word_loc(1, ttokens, 'dietitian')
    
    assert tidx == 4 
    
def test_find_next_word_gpt2():
    
    reference_sentence = 'each nonfiction book has a call number on its spine'
    tok = GPT2Tokenizer.from_pretrained('gpt2')
    ttokens = [filter_symbols(t) for t in tok.tokenize(reference_sentence)]
    tidx = find_next_word_loc(1, ttokens, 'nonfiction')
    
    assert tidx == 3

test_find_next_word_bert()
test_find_next_word_gpt2()

In [228]:
# all_runs and BERT scores are not aligned?
for i, sent_ref in enumerate(lm['bnc_unigram']):
    
    if i == 0 : continue 
        
    sent_act = lm['bert_scores'][i] 
    
    print(f'reference: Index {i}')
    print(f'\t{list(sent_ref["word"])}')
    print('actual')
    print(f'\t{list(sent_act["word"])}')
    print('csv')
    print(f'\t{all_entire_word_reference[i]}')
    a = input()
    if a == 'quit':
        break

reference: Index 1
	['each', 'non', 'fiction', 'book', 'has', 'a', 'call', 'number', 'in', 'its', 'spine', '</s>']
actual
	['each', 'non', 'fiction', 'book', 'has', 'a', 'call', 'number', 'in', 'its', 'spine', '.']
csv
	each non fiction book has a call number in its spine
a
reference: Index 2
	['each', 'nonfiction', 'book', 'had', 'a', 'call', 'number', 'on', 'its', 'spine', '</s>']
actual
	['each', 'nonfiction', 'book', 'had', 'a', 'call', 'number', 'on', 'its', 'spine', '.']
csv
	each nonfiction book had a call number on its spine
a
reference: Index 3
	['each', 'nonfiction', 'book', 'had', 'a', 'call', 'number', 'on', 'its', 'spine', '</s>']
actual
	['each', 'nonfiction', 'book', 'had', 'a', 'call', 'number', 'on', 'its', 'spine', '.']
csv
	each nonfiction book had a call number on its spine
a
reference: Index 4
	['each', 'non', 'fiction', 'book', 'had', 'a', 'call', 'number', 'on', 'its', 'spine', '</s>']
actual
	['each', 'non', 'fiction', 'book', 'had', 'a', 'call', 'number', 'on',

# Sanity checks and tests

In [38]:
import new_models
from new_models import model_score_funcs


import importlib
importlib.reload(model_score_funcs)


inputs = [
    "it's time to go to the store",
    "walk slowly to the golden stair"
]

model_score_funcs.get_gpt2_scores(inputs)
#model_score_funcs.get_bert_scores(inputs)
#model_score_funcs.get_bart_scores(inputs)

Processing index: 0
It 0.00911561120301485
's 0.4054446220397949
Ġtime 0.034498848021030426
Ġto 0.548720121383667
Ġgo 0.015521024353802204
Ġto 0.08927261829376221
Ġthe 0.1976589858531952
Ġstore 0.019540343433618546
Ġ. 0.0002333719312446192
<|endoftext|> 0.004117126576602459
Walk 3.4847416827687994e-05
Ġslowly 0.00015021645231172442
Ġto 0.03525398299098015
Ġthe 0.47555652260780334
Ġgolden 4.06051694881171e-05
Ġstair 0.006223683711141348
Ġ. 3.163226574542932e-05
<|endoftext|> 0.0008614695398136973


[            word      prob
 0             It  0.009116
 1             's  0.405445
 2          Ġtime  0.034499
 3            Ġto  0.548720
 4            Ġgo  0.015521
 5            Ġto  0.089273
 6           Ġthe  0.197659
 7         Ġstore  0.019540
 8             Ġ.  0.000233
 9  <|endoftext|>  0.004117,             word      prob
 0           Walk  0.000035
 1        Ġslowly  0.000150
 2            Ġto  0.035254
 3           Ġthe  0.475557
 4        Ġgolden  0.000041
 5         Ġstair  0.006224
 6             Ġ.  0.000032
 7  <|endoftext|>  0.000861]

In [113]:
## Testing the next model probabilities

import torch
import pandas as pd

from new_models import model_score_utils

importlib.reload(model_score_utils)


def report_mask_words(scores, sentence, tokenizer):
    """
    raw_scores = a (vocabulary,) tensor of selected softmax values for a pre-selected position.
    mask_idx, the position to select for analysis.
    
    sentence = the prefix to do the prediction on
    tokenizer = BERT/BART tokenizer
    """
    
    # It should intake the raw scores itself.
    score_vals, word_idxs = torch.sort(scores, descending = True)
    words = tokenizer.convert_ids_to_tokens(word_idxs)

    print(f"Reporting most likely tokens to complete '{sentence}' in descending order")

    num_report = 20

    score_df = pd.DataFrame.from_dict({
      'Word': words,
      'Score value': list(map(lambda x : round(x, 5), score_vals.numpy().tolist()))
      })

    return score_df[:num_report]


def test_gpt2_model_probs():
    
    
    print('Note that this outputs softmax of the last word before the punctuation.')
    
    model = models['gpt2']; tok = tokenizers['gpt2']
    test_sentence = "it's time to go to the"
    # Below line is from the prefix code
    test_sentence = f'{tok.bos_token}{test_sentence}{tok.eos_token}'
    
    this_tokens = tok.encode(test_sentence)
    
    # The 0 is a filler index.
    this_pred_pos = len(this_tokens) - 2
    _, probs = model_score_utils.get_model_probabilities(this_tokens, model, 0, this_pred_pos, verifying = True)
    result_df = report_mask_words(probs, test_sentence, tok)
    
    test_word = 'Ġbathroom'
    ground_truth_token = tok.convert_tokens_to_ids(test_word)
    
    print(f'Ground truth probability, manual extract, for the word {test_word}: {probs[ground_truth_token]}')
    
    return result_df
    
    
def test_bertlike_model_probs(test_sentence, model_name):
        
    model = models[model_name]; tok = tokenizers[model_name]
    
    test_sentence = f"it's time to go to the {tok.mask_token}"
    this_tokens = tok.encode(test_sentence)
    this_pred_pos = len(this_tokens) - 2
    
    # The 0 is a filler index.
    prob_at_ground_truth, probs = model_score_utils.get_model_probabilities(this_tokens, model, 0, this_pred_pos, verifying = True)

    # For the ground truth idx? How to test this? Do it by cross-checking it with the probs in the "most likely".
    result_df = report_mask_words(probs, test_sentence, tok)

    test_word = 'moon' if model_name == 'bert' else 'Ġstore'
    ground_truth_token = tok.convert_tokens_to_ids(test_word)
    
    print(f'Ground truth probability, manual extract, for the word {test_word}: {probs[ground_truth_token]}')
    
    return result_df

bert_df = test_bertlike_model_probs('bert')
print(bert_df)

#bart_df = test_bertlike_model_probs('bart')
#print(bart_df)

#gpt2_df = test_gpt2_model_probs()
#print(gpt2_df)


TypeError: test_bertlike_model_probs() missing 1 required positional argument: 'model_name'

In [119]:

tok = tokenizers['bert']

# {tok.mask_token} nonfiction book had a call number on its spine
this_test = f'walk slowly to the {tok.mask_token} stair'
this_tokens = tokenizers['bert'].encode(this_test)

pred_idx = 5
prob_at_ground_truth, probs = model_score_utils.get_model_probabilities(this_tokens, models['bert'], 0, pred_idx, verifying = True)

print(tok.decode(this_tokens[pred_idx]))

# For the ground truth idx? How to test this? Do it by cross-checking it with the probs in the "most likely".
result_df = report_mask_words(probs, test_sentence, tokenizers['bert'])
print(result_df)

this is updated
[ M A S K ]
Reporting most likely tokens to complete 'each nonfiction book had a call number on its spine' in descending order
          Word  Score value
0        first      0.29029
1         last      0.15535
2        final      0.07602
3       second      0.05278
4          top      0.03681
5        third      0.02572
6         next      0.01916
7        fifth      0.01754
8       bottom      0.01634
9      seventh      0.01319
10      fourth      0.01292
11        main      0.01015
12       front      0.00830
13      eighth      0.00811
14      spiral      0.00801
15      lowest      0.00776
16        back      0.00671
17       tenth      0.00656
18  thirteenth      0.00527
19       ninth      0.00514


In [90]:
## Testing the model prefixes functions


from new_models import model_prefixes

import importlib
importlib.reload(model_prefixes)


test_sentence = "It's time to go to the store."
    
def test_gpt2_prefixes(s):
    tok = tokenizers['gpt2']
    prefixes, _ = model_prefixes.get_gpt2_prefixes(s, tok)
    for p in prefixes:
        print(tok.convert_ids_to_tokens(p))
        
def test_bertlike_prefixes(s, model_name):
    
    tok = tokenizers[model_name]
    mask_func = model_prefixes.get_bertlike_mask_func(tok)
    
    prefixes, _ = mask_func(s, tok)
    for p in prefixes:
        print(tok.convert_ids_to_tokens(p))

test_sentence = "each nonfiction book had a call number on its spine"
print('GPT2 checks')
test_gpt2_prefixes(test_sentence)
print('\nBERT checks')
test_bertlike_prefixes(test_sentence, 'bert')
print('\nBART checks')
test_bertlike_prefixes(test_sentence, 'bart')

GPT2 checks
['<|endoftext|>']
['<|endoftext|>', 'each']
['<|endoftext|>', 'each', 'Ġnon']
['<|endoftext|>', 'each', 'Ġnon', 'fiction']
['<|endoftext|>', 'each', 'Ġnon', 'fiction', 'Ġbook']
['<|endoftext|>', 'each', 'Ġnon', 'fiction', 'Ġbook', 'Ġhad']
['<|endoftext|>', 'each', 'Ġnon', 'fiction', 'Ġbook', 'Ġhad', 'Ġa']
['<|endoftext|>', 'each', 'Ġnon', 'fiction', 'Ġbook', 'Ġhad', 'Ġa', 'Ġcall']
['<|endoftext|>', 'each', 'Ġnon', 'fiction', 'Ġbook', 'Ġhad', 'Ġa', 'Ġcall', 'Ġnumber']
['<|endoftext|>', 'each', 'Ġnon', 'fiction', 'Ġbook', 'Ġhad', 'Ġa', 'Ġcall', 'Ġnumber', 'Ġon']
['<|endoftext|>', 'each', 'Ġnon', 'fiction', 'Ġbook', 'Ġhad', 'Ġa', 'Ġcall', 'Ġnumber', 'Ġon', 'Ġits']
['<|endoftext|>', 'each', 'Ġnon', 'fiction', 'Ġbook', 'Ġhad', 'Ġa', 'Ġcall', 'Ġnumber', 'Ġon', 'Ġits', 'Ġspine']

BERT checks
['[CLS]', '[MASK]', 'nonfiction', 'book', 'had', 'a', 'call', 'number', 'on', 'its', 'spine', '[SEP]']
['[CLS]', 'each', '[MASK]', 'book', 'had', 'a', 'call', 'number', 'on', 'its', 'spine', '