Code for verifications/dataset analyses from project start approximately to Data Prep Logistic postprocessing (NaN alignment).

In [1]:
import transformers
from transformers import GPT2LMHeadModel, GPT2Tokenizer, BertForMaskedLM, BertTokenizer, BartForConditionalGeneration, BartTokenizer

import os
from os.path import join, exists

curr_dir = os.getcwd()
os.chdir('..')

from new_models import model_prefixes, align_prep_words

from new_models import in_progress
from new_models.in_progress import sub_analysis

import importlib
importlib.reload(model_prefixes)

os.chdir(curr_dir)


In [2]:


models = {
    'gpt2': GPT2LMHeadModel.from_pretrained('gpt2'),
    'bert': BertForMaskedLM.from_pretrained('bert-large-uncased-whole-word-masking'),
    'bart': BartForConditionalGeneration.from_pretrained('facebook/bart-base'),
    
}

tokenizers = {
    'gpt2': GPT2Tokenizer.from_pretrained('gpt2'),
    'bert': BertTokenizer.from_pretrained("bert-large-uncased-whole-word-masking"),
    'bart': BartTokenizer.from_pretrained("facebook/bart-base"),
}



Some weights of the model checkpoint at bert-large-uncased-whole-word-masking were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
from new_models import prep_probs

def load_pred_results(RESULTS_FOLDER):
    temp_results = {}
    for model_name in ['bert', 'gpt2_normal', 'bart', 'gpt2_medium']:
        print(model_name)
        temp_results[model_name] = prep_probs.load_word_scores(model_name, RESULTS_FOLDER)
        print(f'For model: {model_name}, length: {len(temp_results[model_name])}')
    return temp_results


PROB_DF_PATH = '/home/nwong/chompsky/serial_chain/telephone-analysis-public/intermediate_results/new_models_probs'
prob_results = load_pred_results(PROB_DF_PATH)

bert
For model: bert, length: 3193
gpt2_normal
For model: gpt2_normal, length: 3193
bart
For model: bart, length: 3193
gpt2_medium
For model: gpt2_medium, length: 3193


# Sanity checks and tests

In [32]:

import pickle
import glob

# Checking whether the postprocessed logistic probabilities match the original probabilities?

DATA_PREP_FOLDER = '/home/nwong/chompsky/serial_chain/telephone-analysis-public/intermediate_results/data_prep_logistic' # What is meant by this path?

model_names = [filename.split('logistic/')[1].split('_predictions.txt')[0]
               for filename in glob.glob(DATA_PREP_FOLDER+'/*')]

lm = {}

for lm_name in model_names:
    raw_scores_path = join(DATA_PREP_FOLDER, f"{lm_name}_predictions.txt")
    # 3/27: https://stackoverflow.com/questions/27745500/how-to-save-a-list-to-a-file-and-read-it-as-a-list-type
    with open(raw_scores_path, 'rb') as f:
        raw_scores = pickle.load(f)
        lm[lm_name] = raw_scores
        
BERT_case = 'A dietitian goes'
GPT_case = 'Each nonfiction book'

tokenizers['bert'].tokenize(BERT_case)
tokenizers['gpt2'].tokenize(GPT_case)

['Each', 'Ġnon', 'fiction', 'Ġbook']

In [39]:

word_list = [df['word'].values.tolist() for df in lm['bnc_unigram']]
result = align_prep_words.align_model_word_dfs(lm['gpt2_normal_scores'][:2], tokenizers['gpt2'], word_list[:21])

# Verified that this is as expected, 5/31/21

align_prep_words.process_nan_single_df(prob_results['gpt2_normal'][1260], tokenizers['gpt2'], word_list[1260])
prob_results['gpt2_normal'][1260]

Index: 0


Unnamed: 0,word,prob
0,Austin,-4.403163
1,Ġclosed,-4.549038
2,Ġthe,-0.950157
3,Ġdoor,-1.287342
4,Ġbehind,-2.303025
5,Ġhim,-0.651266
6,Ġon,-1.327117
7,Ġthe,-0.748366
8,Ġboat,-3.311556


In [314]:


def test_bert_dietitian():
    test_idx = 18

    orig_score = lm['bert_scores'][test_idx]

    print(orig_score)

    reference_sentence = 'a dietitian goes to college for at least four years'
    result = process_nan_single_df(lm['bert_scores'][test_idx], tokenizers['bert'], reference_sentence)
    
    print(result)

def test_gpt2_nonfiction():
    
    test_idx = 0

    orig_score = lm['gpt2_normal_scores'][test_idx]

    print(orig_score)

    reference_sentence = 'each nonfiction book has a call number on its spine'
    result = process_nan_single_df(lm['gpt2_normal_scores'][test_idx], tokenizers['gpt2'], reference_sentence)
    
    print(result)
    
    
test_gpt2_nonfiction()

       word      prob
0      Each -3.570354
1      Ġnon -3.581758
2   fiction -1.928502
3     Ġbook -0.357789
4      Ġhas -1.236761
5        Ġa -0.538658
6     Ġcall -4.164424
7   Ġnumber -3.439832
8       Ġon -1.361023
9      Ġits -1.021375
10   Ġspine -1.287340
11       Ġ. -3.513690
entire idx 1 to collapse 1
entire word nonfiction
to collapse idx non
	 Next collapse: 3

       prob        word
0 -3.570354        each
1       NaN  nonfiction
2 -0.357789        book
3 -1.236761         has
4 -0.538658           a
5 -4.164424        call
6 -3.439832      number
7 -1.361023          on
8 -1.021375         its
9 -1.287340       spine


In [450]:
temp_results['bert'][18]

Unnamed: 0,word,prob
0,a,-0.124584
1,diet,-0.714438
2,##itia,-0.188612
3,##n,-0.0462
4,goes,-0.189202
5,to,-0.074309
6,college,-1.313672
7,for,-0.032421
8,at,-0.000133
9,least,-0.000926


In [454]:
temp_results['gpt2_medium'][1]

Unnamed: 0,word,prob
0,Each,-3.225354
1,Ġnon,-3.874274
2,Ġfiction,-5.236221
3,Ġbook,-0.472252
4,Ġhas,-1.15808
5,Ġa,-0.485143
6,Ġcall,-3.748181
7,Ġnumber,-2.511031
8,Ġin,-1.469794
9,Ġits,-1.263469


In [278]:

def test_find_next_word_bert():
    
    reference_sentence = 'a dietitian goes to college for at least four years'
    
    tok = BertTokenizer.from_pretrained("bert-large-uncased-whole-word-masking")
    ttokens = [filter_symbols(t) for t in tok.tokenize(reference_sentence)]
    tidx = find_next_word_loc(1, ttokens, 'dietitian')
    
    assert tidx == 4 
    
def test_find_next_word_gpt2():
    
    reference_sentence = 'each nonfiction book has a call number on its spine'
    tok = GPT2Tokenizer.from_pretrained('gpt2')
    ttokens = [filter_symbols(t) for t in tok.tokenize(reference_sentence)]
    tidx = find_next_word_loc(1, ttokens, 'nonfiction')
    
    assert tidx == 3

test_find_next_word_bert()
test_find_next_word_gpt2()

In [38]:
import new_models
from new_models import model_score_funcs


import importlib
importlib.reload(model_score_funcs)


inputs = [
    "it's time to go to the store",
    "walk slowly to the golden stair"
]

model_score_funcs.get_gpt2_scores(inputs)
#model_score_funcs.get_bert_scores(inputs)
#model_score_funcs.get_bart_scores(inputs)

Processing index: 0
It 0.00911561120301485
's 0.4054446220397949
Ġtime 0.034498848021030426
Ġto 0.548720121383667
Ġgo 0.015521024353802204
Ġto 0.08927261829376221
Ġthe 0.1976589858531952
Ġstore 0.019540343433618546
Ġ. 0.0002333719312446192
<|endoftext|> 0.004117126576602459
Walk 3.4847416827687994e-05
Ġslowly 0.00015021645231172442
Ġto 0.03525398299098015
Ġthe 0.47555652260780334
Ġgolden 4.06051694881171e-05
Ġstair 0.006223683711141348
Ġ. 3.163226574542932e-05
<|endoftext|> 0.0008614695398136973


[            word      prob
 0             It  0.009116
 1             's  0.405445
 2          Ġtime  0.034499
 3            Ġto  0.548720
 4            Ġgo  0.015521
 5            Ġto  0.089273
 6           Ġthe  0.197659
 7         Ġstore  0.019540
 8             Ġ.  0.000233
 9  <|endoftext|>  0.004117,             word      prob
 0           Walk  0.000035
 1        Ġslowly  0.000150
 2            Ġto  0.035254
 3           Ġthe  0.475557
 4        Ġgolden  0.000041
 5         Ġstair  0.006224
 6             Ġ.  0.000032
 7  <|endoftext|>  0.000861]

In [8]:
## Testing the next model probabilities

import torch
import pandas as pd

from new_models import model_score_utils

importlib.reload(model_score_utils)


def report_mask_words(scores, sentence, tokenizer):
    """
    raw_scores = a (vocabulary,) tensor of selected softmax values for a pre-selected position.
    mask_idx, the position to select for analysis.
    
    sentence = the prefix to do the prediction on
    tokenizer = BERT/BART tokenizer
    """
    
    # It should intake the raw scores itself.
    score_vals, word_idxs = torch.sort(scores, descending = True)
    words = tokenizer.convert_ids_to_tokens(word_idxs)

    print(f"Reporting most likely tokens to complete '{sentence}' in descending order")

    num_report = 20

    score_df = pd.DataFrame.from_dict({
      'Word': words,
      'Score value': list(map(lambda x : round(x, 5), score_vals.numpy().tolist()))
      })

    return score_df[:num_report]


def test_gpt2_model_probs():
    
    
    print('Note that this outputs softmax of the last word before the punctuation.')
    
    model = models['gpt2']; tok = tokenizers['gpt2']
    test_sentence = "it's time to go to the"
    # Below line is from the prefix code
    test_sentence = f'{tok.bos_token}{test_sentence}{tok.eos_token}'
    
    this_tokens = tok.encode(test_sentence)
    
    # The 0 is a filler index.
    this_pred_pos = len(this_tokens) - 2
    _, probs, _ = model_score_utils.get_model_probabilities(this_tokens, model, 0, this_pred_pos, verifying = True)
    result_df = report_mask_words(probs, test_sentence, tok)
    
    test_word = 'Ġbathroom'
    ground_truth_token = tok.convert_tokens_to_ids(test_word)
    
    print(f'Ground truth probability, manual extract, for the word {test_word}: {probs[ground_truth_token]}')
    
    return result_df
    
    
def test_bertlike_model_probs(model_name):
        
    model = models[model_name]; tok = tokenizers[model_name]
    
    test_sentence = f"it's time to go to the {tok.mask_token}"
    this_tokens = tok.encode(test_sentence)
    this_pred_pos = len(this_tokens) - 2
    
    # The 0 is a filler index.
    prob_at_ground_truth, probs, _ = model_score_utils.get_model_probabilities(this_tokens, model, 0, this_pred_pos, verifying = True)

    # For the ground truth idx? How to test this? Do it by cross-checking it with the probs in the "most likely".
    result_df = report_mask_words(probs, test_sentence, tok)

    test_word = 'moon' if model_name == 'bert' else 'Ġstore'
    ground_truth_token = tok.convert_tokens_to_ids(test_word)
    
    print(f'Ground truth probability, manual extract, for the word {test_word}: {probs[ground_truth_token]}')
    
    return result_df

bert_df = test_bertlike_model_probs('bert')
print(bert_df)

bart_df = test_bertlike_model_probs('bart')
print(bart_df)

gpt2_df = test_gpt2_model_probs()
print(gpt2_df)


Reporting most likely tokens to complete 'it's time to go to the [MASK]' in descending order
Ground truth probability, manual extract, for the word moon: 0.008538895286619663
        Word  Score value
0   hospital      0.06366
1      beach      0.03984
2   cemetery      0.03216
3   bathroom      0.02819
4      party      0.02643
5     movies      0.02543
6    meeting      0.02479
7    airport      0.01794
8     office      0.01390
9    library      0.01348
10       zoo      0.01343
11    church      0.01283
12    doctor      0.01203
13    rescue      0.01107
14   funeral      0.01032
15      city      0.01001
16      park      0.00918
17   station      0.00917
18      moon      0.00854
19    museum      0.00797
Reporting most likely tokens to complete 'it's time to go to the <mask>' in descending order
Ground truth probability, manual extract, for the word Ġstore: 0.015764907002449036
         Word  Score value
0        Ġgym      0.16189
1   Ġbathroom      0.06579
2     Ġmovies      0.

In [119]:

tok = tokenizers['bert']

# {tok.mask_token} nonfiction book had a call number on its spine
this_test = f'walk slowly to the {tok.mask_token} stair'
this_tokens = tokenizers['bert'].encode(this_test)

pred_idx = 5
prob_at_ground_truth, probs = model_score_utils.get_model_probabilities(this_tokens, models['bert'], 0, pred_idx, verifying = True)

print(tok.decode(this_tokens[pred_idx]))

# For the ground truth idx? How to test this? Do it by cross-checking it with the probs in the "most likely".
result_df = report_mask_words(probs, test_sentence, tokenizers['bert'])
print(result_df)

this is updated
[ M A S K ]
Reporting most likely tokens to complete 'each nonfiction book had a call number on its spine' in descending order
          Word  Score value
0        first      0.29029
1         last      0.15535
2        final      0.07602
3       second      0.05278
4          top      0.03681
5        third      0.02572
6         next      0.01916
7        fifth      0.01754
8       bottom      0.01634
9      seventh      0.01319
10      fourth      0.01292
11        main      0.01015
12       front      0.00830
13      eighth      0.00811
14      spiral      0.00801
15      lowest      0.00776
16        back      0.00671
17       tenth      0.00656
18  thirteenth      0.00527
19       ninth      0.00514


In [14]:
## Testing the old model prefixes functions (without specifying a given position)

test_sentence = "It's time to go to the store."
    
def test_gpt2_prefixes(s, positions = []):
    tok = tokenizers['gpt2']
    prefixes, _ = model_prefixes.get_gpt2_prefixes(s, tok, positions)
    for p in prefixes:
        print(tok.convert_ids_to_tokens(p))
        
def test_bertlike_prefixes(s, model_name, positions = []):
    
    tok = tokenizers[model_name]
    mask_func = model_prefixes.get_bertlike_mask_func(tok)
    
    prefixes, _ = mask_func(s, tok, positions)
    for p in prefixes:
        print(tok.convert_ids_to_tokens(p))

test_sentence = "each nonfiction book had a call number on its spine"

print('******** OLD TESTS ************')
print('GPT2 checks')
test_gpt2_prefixes(test_sentence)
print('\nBERT checks')
test_bertlike_prefixes(test_sentence, 'bert')
print('\nBART checks')
test_bertlike_prefixes(test_sentence, 'bart')


print('\n******** NEW TESTS ************')
test_gpt2_prefixes(test_sentence, positions = [3])
test_bertlike_prefixes(test_sentence, 'bert', positions = [1])
test_bertlike_prefixes(test_sentence, 'bart', positions = [7])


******** OLD TESTS ************
GPT2 checks
['<|endoftext|>']
['<|endoftext|>', 'each']
['<|endoftext|>', 'each', 'Ġnon']
['<|endoftext|>', 'each', 'Ġnon', 'fiction']
['<|endoftext|>', 'each', 'Ġnon', 'fiction', 'Ġbook']
['<|endoftext|>', 'each', 'Ġnon', 'fiction', 'Ġbook', 'Ġhad']
['<|endoftext|>', 'each', 'Ġnon', 'fiction', 'Ġbook', 'Ġhad', 'Ġa']
['<|endoftext|>', 'each', 'Ġnon', 'fiction', 'Ġbook', 'Ġhad', 'Ġa', 'Ġcall']
['<|endoftext|>', 'each', 'Ġnon', 'fiction', 'Ġbook', 'Ġhad', 'Ġa', 'Ġcall', 'Ġnumber']
['<|endoftext|>', 'each', 'Ġnon', 'fiction', 'Ġbook', 'Ġhad', 'Ġa', 'Ġcall', 'Ġnumber', 'Ġon']
['<|endoftext|>', 'each', 'Ġnon', 'fiction', 'Ġbook', 'Ġhad', 'Ġa', 'Ġcall', 'Ġnumber', 'Ġon', 'Ġits']
['<|endoftext|>', 'each', 'Ġnon', 'fiction', 'Ġbook', 'Ġhad', 'Ġa', 'Ġcall', 'Ġnumber', 'Ġon', 'Ġits', 'Ġspine']

BERT checks
['[CLS]', '[MASK]', 'nonfiction', 'book', 'had', 'a', 'call', 'number', 'on', 'its', 'spine', '[SEP]']
['[CLS]', 'each', '[MASK]', 'book', 'had', 'a', 'call', '

In [73]:
print(this_entry['sentence'])
print(this_entry['sCounter'])


print(this_entry['response'])
print(this_entry['rCounter'])

each nonfiction book has a call number on its spine
7.0
each non fiction book has a call number in its spine
8.0


In [78]:
for data in this_entry:
    print(data)

11
S
on
in
7.0
8.0
each nonfiction book has a call number on its spine
each non fiction book has a call number in its spine
call number on
call number in
on its spine
nan
0
8cf6535ea0ae4addb28f5f90a2b13a7d


In [86]:
substitution_df.columns

Index(['Unnamed: 0', 'code', 'sWord', 'rWord', 'sCounter', 'rCounter',
       'sentence', 'response', 'sLeftSequence', 'rLeftSequence',
       'sRightSequence', 'rRightSequence', 'input_subject', 'output_subject'],
      dtype='object')

In [81]:
# Checking for correctness of substitution processing

WORD_CHANGES_FOLDER = '/home/nwong/chompsky/serial_chain/telephone-analysis-public/intermediate_results/word_changes'

substitution_df = pd.read_csv(join(WORD_CHANGES_FOLDER, 'edit_substitutions.csv'))

select_idx = 0
this_entry = substitution_df.iloc[select_idx]

model_name = 'bert'
result = sub_analysis.process_substitution_entry(this_entry, *model_score_funcs.get_bert_modules())

# What to do here?

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


prefix generated [CLS] each nonfiction book has a call number [MASK] its spine [SEP]
decoding the token on
word prob generated 0.9065350890159607
prefix generated [CLS] each non fiction book has a call number [MASK] its spine [SEP]
decoding the token in
word prob generated 0.014622108079493046


In [88]:
model, tokenizer, get_bert_masks = model_score_funcs.get_bert_modules()

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [324]:
curr_dir = os.getcwd()
os.chdir('..')
from new_models import model_prefixes, sub_analysis, model_score_funcs, model_score_utils
import importlib
importlib.reload(model_prefixes)
importlib.reload(sub_analysis)
importlib.reload(model_score_utils)

os.chdir(curr_dir)

import pandas as pd
import os

from os.path import join, exists

In [None]:
# Need to to test two positions somehow -- what is a good sanity check?

# Need to manually find 'each nonfiction book has a call number on its spine' and then check its probability for "on", under the language models.

# Or even manually query the words themselves from the softmax?


In [None]:
# Visualizations?

# Discarded code

In [None]:
# This is really, really slow code

def prefix_misalignment_check(df_entry, model, tokenizer, prefix_func):
    
    orig_pos = int(df_entry['sCounter']) # Need to check if these are 0 indexed?
    edited_pos = int(df_entry['rCounter'])
    
    orig_sentence = df_entry['sentence']
    edited_sentence = df_entry['response']
    
    
    def process_single_pos(sentence, position):
        
        this_prefix, _ = prefix_func(sentence, tokenizer, [position + 1]) # Account for no CLS in the original sentence.
        this_prefix = [this_str.strip('Ġ') for this_str in tokenizer.convert_ids_to_tokens(this_prefix[0])[1:]]
        # Unwrap, omit the CLS
        should_prefix = sentence.split()[:position]
        
        return this_prefix != should_prefix
    
    misaligned = []
    for sent, pos, report in zip([orig_sentence, edited_sentence], [orig_pos, edited_pos], ['orig', 'edited']):
        if process_single_pos(sent, pos): 
            misaligned.append((sent, pos))
    
    return misaligned
    

## Older checks and verifications

In [None]:
import load_runs

agg_all_runs = load_runs.load_runs()
prep_all_runs = pd.read_csv('output/all_runs.csv')


prep_list = list(prep_all_runs['user_candidate_transcription'])
agg_list = list(agg_all_runs['user_candidate_transcription'])

# They seem to be the same sentences, but in different orders.

print(set(prep_list) ^ set(agg_list))
print(prep_list[1:] == agg_list[1:])

print(sorted(prep_list) == sorted(agg_list)) # They are just in different orders. So it should be fine?

In [None]:
# all_runs and BERT scores are not aligned?
for i, sent_ref in enumerate(lm['bnc_unigram']):
    
    if i == 0 : continue 
        
    sent_act = lm['bert_scores'][i] 
    
    print(f'reference: Index {i}')
    print(f'\t{list(sent_ref["word"])}')
    print('actual')
    print(f'\t{list(sent_act["word"])}')
    print('csv')
    print(f'\t{all_entire_word_reference[i]}')
    a = input()
    if a == 'quit':
        break

In [None]:
importlib.reload(align_prep_words)
importlib.reload(prep_probs)

word_list = [df['word'].values.tolist() for df in lm['bnc_unigram']]

transformer_names = ['gpt2_normal', 'gpt2_medium', 'bert']#, 'bart']
tokenizer_names = ['gpt2', 'gpt2', 'bert']#, 'bart']

sel_idx = 20
for model_name, tokenizer_name in zip(transformer_names, tokenizer_names):

    print(tokenizer_name)
    print(f'Processing model name: {model_name}')
    
    raw_scores = prep_probs.load_word_scores(model_name, RESULTS_FOLDER)
    lm[f'{model_name}_scores'] = align_prep_words.align_model_word_dfs(raw_scores[:sel_idx],
                                                                       tokenizers[tokenizer_name],
                                                                       word_list[:sel_idx])

# Comparing the rerun results and non-rerun ones

In [13]:
main_results_path = '/home/nwong/chompsky/serial_chain/telephone-analysis-public/intermediate_results/new_models_probs'
rerun_results_path = '/home/nwong/chompsky/serial_chain/telephone-analysis-public/intermediate_results/new_models_probs/verify_only_delete'

orig_results = load_pred_results(main_results_path)
rerun_results = load_pred_results(rerun_results_path)

bert
For model: bert, length: 3193
gpt2_normal
For model: gpt2_normal, length: 3193
bart
For model: bart, length: 3193
gpt2_medium
For model: gpt2_medium, length: 3193
bert
For model: bert, length: 2
gpt2_normal
For model: gpt2_normal, length: 2
bart
For model: bart, length: 2
gpt2_medium
For model: gpt2_medium, length: 2


In [17]:
rerun_results['bert']

[         word      prob
 0        each -0.326024
 1  nonfiction -3.985277
 2        book -2.205348
 3         has -0.091852
 4           a -0.025494
 5        call -2.599102
 6      number -1.608924
 7          on -0.028152
 8         its -0.411839
 9       spine -2.007116,        word      prob
 0      each -0.335973
 1       non -2.138003
 2   fiction -0.518504
 3      book -0.260865
 4       has -0.108281
 5         a -0.021503
 6      call -2.609315
 7    number -1.185523
 8        in -1.885149
 9       its -0.474270
 10    spine -2.233993]

In [21]:
for model in orig_results:
    print(f'For model {model}')
    print("The original and rerun results match? ", all(orig_results[model][i].equals(rerun_results[model][i]) for i in range(2)))

For model bert
The original and rerun results match?  True
For model gpt2_normal
The original and rerun results match?  True
For model bart
The original and rerun results match?  True
For model gpt2_medium
The original and rerun results match?  True
