# Other scratchwork

In [13]:
# Looking at test saves for the model scores

import os
from os.path import join, exists

from new_models import prep_probs

# Folder handed to the prep_probs loaders below.
RESULTS_FOLDER = './intermediate_results/new_models_probs'

# Model names whose saved scores are inspected in this cell.
SAVED_MODEL_NAMES = ('gpt2_normal', 'gpt2_medium', 'bert', 'bart')

# Word-level scores are collected per model name; sentence-level scores are
# only printed, so `sentence_score` holds the last model's value after the loop.
temp_results = dict()
for model_name in SAVED_MODEL_NAMES:
    temp_results[model_name] = prep_probs.load_word_scores(model_name, RESULTS_FOLDER)
    sentence_score = prep_probs.load_sentence_scores(model_name, RESULTS_FOLDER)
    print(f'For model: {model_name}, sentence_scores: {sentence_score}')

For model: gpt2_normal, sentence_scores: [-26.00150438142618, -30.140151837593834]
For model: gpt2_medium, sentence_scores: [-23.961808500186287, -28.670494422365497]
For model: bert, sentence_scores: [-13.294641712998299, -11.777179995095807]
For model: bart, sentence_scores: [-16.233867877729487, -21.894162977643067]


# Sanity checks and tests

In [38]:
import new_models
from new_models import model_score_funcs


import importlib
importlib.reload(model_score_funcs)


# Two short sentences used as a smoke test for the scoring functions.
inputs = [
    "it's time to go to the store",
    "walk slowly to the golden stair"
]

# Only the GPT-2 scorer is run here; as the last expression of the cell its
# return value is what the notebook displays. The BERT and BART calls are left
# commented out so they can be swapped in by hand.
model_score_funcs.get_gpt2_scores(inputs)
#model_score_funcs.get_bert_scores(inputs)
#model_score_funcs.get_bart_scores(inputs)

Processing index: 0
It 0.00911561120301485
's 0.4054446220397949
Ġtime 0.034498848021030426
Ġto 0.548720121383667
Ġgo 0.015521024353802204
Ġto 0.08927261829376221
Ġthe 0.1976589858531952
Ġstore 0.019540343433618546
Ġ. 0.0002333719312446192
<|endoftext|> 0.004117126576602459
Walk 3.4847416827687994e-05
Ġslowly 0.00015021645231172442
Ġto 0.03525398299098015
Ġthe 0.47555652260780334
Ġgolden 4.06051694881171e-05
Ġstair 0.006223683711141348
Ġ. 3.163226574542932e-05
<|endoftext|> 0.0008614695398136973


[            word      prob
 0             It  0.009116
 1             's  0.405445
 2          Ġtime  0.034499
 3            Ġto  0.548720
 4            Ġgo  0.015521
 5            Ġto  0.089273
 6           Ġthe  0.197659
 7         Ġstore  0.019540
 8             Ġ.  0.000233
 9  <|endoftext|>  0.004117,             word      prob
 0           Walk  0.000035
 1        Ġslowly  0.000150
 2            Ġto  0.035254
 3           Ġthe  0.475557
 4        Ġgolden  0.000041
 5         Ġstair  0.006224
 6             Ġ.  0.000032
 7  <|endoftext|>  0.000861]

In [4]:
import transformers

from transformers import GPT2LMHeadModel, GPT2Tokenizer, BertForMaskedLM, BertTokenizer, BartForConditionalGeneration, BartTokenizer
from new_models import model_prefixes

In [5]:


# Hugging Face checkpoint identifiers, named once so that each model and its
# tokenizer are guaranteed to be loaded from the same checkpoint.
GPT2_CHECKPOINT = 'gpt2'
BERT_CHECKPOINT = 'bert-large-uncased-whole-word-masking'
BART_CHECKPOINT = 'facebook/bart-base'

# Pretrained language models, keyed by the short names used in the test cells.
models = {
    'gpt2': GPT2LMHeadModel.from_pretrained(GPT2_CHECKPOINT),
    'bert': BertForMaskedLM.from_pretrained(BERT_CHECKPOINT),
    'bart': BartForConditionalGeneration.from_pretrained(BART_CHECKPOINT),
}

# Matching tokenizers, under the same keys as `models`.
tokenizers = {
    'gpt2': GPT2Tokenizer.from_pretrained(GPT2_CHECKPOINT),
    'bert': BertTokenizer.from_pretrained(BERT_CHECKPOINT),
    'bart': BartTokenizer.from_pretrained(BART_CHECKPOINT),
}



Some weights of the model checkpoint at bert-large-uncased-whole-word-masking were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [20]:
## Testing the next model probabilities

import torch
import pandas as pd

from new_models import model_score_utils

importlib.reload(model_score_utils)


def report_mask_words(scores, sentence, tokenizer, num_report = 20):
    """
    Tabulate the highest-scoring vocabulary tokens for one prediction position.

    scores = a 1-D tensor with one score per vocabulary id (the callers in this
        notebook pass the second return value of
        model_score_utils.get_model_probabilities).
    sentence = the text the prediction was made on; only used in the printed header.
    tokenizer = tokenizer providing convert_ids_to_tokens (GPT-2 / BERT / BART here).
    num_report = how many of the top-scoring tokens to include (default 20).

    Returns a DataFrame with columns 'Word' and 'Score value' (rounded to 5
    decimal places), ordered by descending score, with at most num_report rows.
    """

    # Detach and move to CPU so the conversion to Python floats below also works
    # for tensors that still track gradients or live on an accelerator.
    scores = scores.detach().cpu()
    score_vals, word_idxs = torch.sort(scores, descending = True)

    # Truncate BEFORE the id -> token lookup: only num_report rows are reported,
    # so converting and rounding the entire vocabulary is wasted work.
    top_vals = score_vals[:num_report].tolist()
    # Plain Python ints are what convert_ids_to_tokens is specified to take.
    top_idxs = word_idxs[:num_report].tolist()
    words = tokenizer.convert_ids_to_tokens(top_idxs)

    print(f"Reporting most likely tokens to complete '{sentence}' in descending order")

    score_df = pd.DataFrame.from_dict({
      'Word': words,
      'Score value': [round(val, 5) for val in top_vals]
      })

    return score_df


def test_gpt2_model_probs():
    """
    Manual check of the GPT-2 probabilities for a fixed test prefix.

    Prints the probability read directly out of the returned vector for one
    hand-picked token, and returns the report_mask_words table so the two can
    be compared by eye.
    """

    print('Note that this outputs softmax of the last word before the punctuation.')

    tok = tokenizers['gpt2']
    gpt2_model = models['gpt2']

    test_sentence = "it's time to go to the"
    # Same bos/eos wrapping as the prefix code.
    test_sentence = f'{tok.bos_token}{test_sentence}{tok.eos_token}'
    token_ids = tok.encode(test_sentence)

    # Second-to-last token position; presumably the last word before the
    # trailing eos token -- TODO confirm against the tokenizer output.
    pred_pos = len(token_ids) - 2

    # The 0 is a filler index; the first return value is not needed here.
    _, probs = model_score_utils.get_model_probabilities(
        token_ids, gpt2_model, 0, pred_pos, verifying = True
    )
    top_tokens_df = report_mask_words(probs, test_sentence, tok)

    test_word = 'Ġbathroom'
    ground_truth_token = tok.convert_tokens_to_ids(test_word)
    print(f'Ground truth probability, manual extract, for the word {test_word}: {probs[ground_truth_token]}')

    return top_tokens_df
    
    
def test_bertlike_model_probs(model_name):
    """
    Manual check of the masked-token probabilities for a BERT-like model.

    model_name = key into the notebook-level `models` / `tokenizers` dicts
        ('bert' or 'bart').

    Prints the probability read directly out of the returned vector for one
    hand-picked token, and returns the report_mask_words table; the printed
    value can be cross-checked against the matching row of that table.
    """

    tok = tokenizers[model_name]
    masked_lm = models[model_name]

    test_sentence = f"it's time to go to the {tok.mask_token}"
    token_ids = tok.encode(test_sentence)

    # Second-to-last token position; presumably the mask token, with one
    # special token added after it by encode -- TODO confirm.
    mask_pos = len(token_ids) - 2

    # The 0 is a filler index; the first return value is not used here.
    _, probs = model_score_utils.get_model_probabilities(
        token_ids, masked_lm, 0, mask_pos, verifying = True
    )

    top_tokens_df = report_mask_words(probs, test_sentence, tok)

    # The spot-check token is spelled differently for the two tokenizers.
    if model_name == 'bert':
        test_word = 'moon'
    else:
        test_word = 'Ġstore'
    ground_truth_token = tok.convert_tokens_to_ids(test_word)

    print(f'Ground truth probability, manual extract, for the word {test_word}: {probs[ground_truth_token]}')

    return top_tokens_df

# Run the BERT check and show its top-token table. The BART and GPT-2 variants
# below are left commented out so they can be enabled one at a time.
bert_df = test_bertlike_model_probs('bert')
print(bert_df)

#bart_df = test_bertlike_model_probs('bart')
#print(bart_df)

#gpt2_df = test_gpt2_model_probs()
#print(gpt2_df)


Reporting most likely tokens to complete 'it's time to go to the [MASK]' in descending order
Ground truth probability, manual extract, for the word moon: 0.008538895286619663
        Word  Score value
0   hospital      0.06366
1      beach      0.03984
2   cemetery      0.03216
3   bathroom      0.02819
4      party      0.02643
5     movies      0.02543
6    meeting      0.02479
7    airport      0.01794
8     office      0.01390
9    library      0.01348
10       zoo      0.01343
11    church      0.01283
12    doctor      0.01203
13    rescue      0.01107
14   funeral      0.01032
15      city      0.01001
16      park      0.00918
17   station      0.00917
18      moon      0.00854
19    museum      0.00797


In [86]:
## Testing the model prefixes functions


from new_models import model_prefixes

import importlib
importlib.reload(model_prefixes)


# Sentence fed to every prefix check at the bottom of this cell.
test_sentence = "It's time to go to the store."
    
def test_gpt2_prefixes(s):
    """
    Print, one per line, the token strings of every prefix that
    model_prefixes.get_gpt2_prefixes builds for the sentence `s`.
    """
    gpt2_tok = tokenizers['gpt2']
    # The second return value is not needed for this visual check.
    prefix_id_lists, _ = model_prefixes.get_gpt2_prefixes(s, gpt2_tok)
    for prefix_ids in prefix_id_lists:
        prefix_tokens = gpt2_tok.convert_ids_to_tokens(prefix_ids)
        print(prefix_tokens)
        
def test_bertlike_prefixes(s, model_name):
    """
    Print, one per line, the token strings of every masked variant produced for
    the sentence `s` by the mask function of a BERT-like tokenizer.

    model_name = key into the notebook-level `tokenizers` dict ('bert' or 'bart').
    """
    bertlike_tok = tokenizers[model_name]
    build_masked = model_prefixes.get_bertlike_mask_func(bertlike_tok)

    # The second return value is not needed for this visual check.
    masked_id_lists, _ = build_masked(s, bertlike_tok)
    for masked_ids in masked_id_lists:
        masked_tokens = bertlike_tok.convert_ids_to_tokens(masked_ids)
        print(masked_tokens)

# Print the prefixes / masked variants of test_sentence for each tokenizer,
# under a header per model, so they can be inspected by eye.
print('GPT2 checks')
test_gpt2_prefixes(test_sentence)
print('\nBERT checks')
test_bertlike_prefixes(test_sentence, 'bert')
print('\nBART checks')
test_bertlike_prefixes(test_sentence, 'bart')

GPT2 checks
['<|endoftext|>']
['<|endoftext|>', 'It']
['<|endoftext|>', 'It', "'s"]
['<|endoftext|>', 'It', "'s", 'Ġtime']
['<|endoftext|>', 'It', "'s", 'Ġtime', 'Ġto']
['<|endoftext|>', 'It', "'s", 'Ġtime', 'Ġto', 'Ġgo']
['<|endoftext|>', 'It', "'s", 'Ġtime', 'Ġto', 'Ġgo', 'Ġto']
['<|endoftext|>', 'It', "'s", 'Ġtime', 'Ġto', 'Ġgo', 'Ġto', 'Ġthe']
['<|endoftext|>', 'It', "'s", 'Ġtime', 'Ġto', 'Ġgo', 'Ġto', 'Ġthe', 'Ġstore']
['<|endoftext|>', 'It', "'s", 'Ġtime', 'Ġto', 'Ġgo', 'Ġto', 'Ġthe', 'Ġstore', '.']

BERT checks
['[CLS]', '[MASK]', "'", 's', 'time', 'to', 'go', 'to', 'the', 'store', '.', '[SEP]']
['[CLS]', 'it', '[MASK]', 's', 'time', 'to', 'go', 'to', 'the', 'store', '.', '[SEP]']
['[CLS]', 'it', "'", '[MASK]', 'time', 'to', 'go', 'to', 'the', 'store', '.', '[SEP]']
['[CLS]', 'it', "'", 's', '[MASK]', 'to', 'go', 'to', 'the', 'store', '.', '[SEP]']
['[CLS]', 'it', "'", 's', 'time', '[MASK]', 'go', 'to', 'the', 'store', '.', '[SEP]']
['[CLS]', 'it', "'", 's', 'time', 'to', '[MASK