# Notebook for model development scratchwork.

In [240]:
import transformers
from transformers import BartForConditionalGeneration, BartTokenizer

# 3/27: https://huggingface.co/transformers/model_doc/bart.html
# Load the pretrained BART base tokenizer and model, then encode one masked
# sentence as PyTorch tensors for the cells that follow.
sentence = "It's time to go to the <mask>"
tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")

batch = tokenizer(sentence, return_tensors="pt")

In [241]:
# 3/27: https://huggingface.co/transformers/model_doc/bart.html
# Round-trip the encoded ids back to text to see exactly what the tokenizer produced.
decoded_batch = tokenizer.batch_decode(batch["input_ids"])
print(decoded_batch)

# 3/27: LM reference https://huggingface.co/transformers/model_doc/bart.html#barttokenizer
# Generate output token ids from the same input ids (decoded in a later cell).
this_lm = model.generate(batch["input_ids"])

["<s>It's time to go to the<mask></s>"]


In [251]:
import torch

# Only the last expression of a cell is auto-displayed, so the decoded
# generation has to be printed explicitly or its value is silently dropped.
print(tokenizer.batch_decode(this_lm))

# Call the module itself instead of .forward(): the call operator runs any
# registered hooks, which a direct .forward() call skips.
result = model(batch['input_ids'])

# Answer to "how to extract the softmax itself?": the model output carries raw
# logits; normalising over the last axis gives a probability distribution per
# position. (Assumes the last axis is the vocabulary -- consistent with the
# (1, 10, 50265) logits shape recorded in the next cell.)
token_probs = torch.softmax(result['logits'], dim=-1)

In [264]:
# Inspect the dimensions of the raw logits returned by the forward pass.
result.logits.shape

torch.Size([1, 10, 50265])

In [233]:
import os
from os.path import join, exists

import pickle

RESULTS_FOLDER = "./intermediate_results/new_models_probs"

results_path = os.path.join(RESULTS_FOLDER, "bert_predictions.txt")

# 3/27: https://stackoverflow.com/questions/27745500/how-to-save-a-list-to-a-file-and-read-it-as-a-list-type
# Despite the .txt extension, the file is opened in binary mode and read as a pickle.
# NOTE(review): pickle.load can execute arbitrary code -- only load files this project wrote itself.
with open(results_path, mode="rb") as f:
    results = pickle.load(f)

# Show the first stored entry as a quick look at the saved structure.
print(results[0])

          word          prob
0         each  4.720370e-01
1   nonfiction  1.034481e-04
2         book  6.232377e-03
3          has  8.093700e-01
4            a  9.429873e-01
5         call  2.517068e-03
6       number  2.460797e-02
7           on  9.372336e-01
8          its  3.873985e-01
9        spine  9.837382e-03
10       [SEP]  7.783939e-07


In [86]:
import pandas as pd

# Number of leading sentences to keep for quick experiments.
num_select = 2

dev_csv_path = "./new_models/dev_lm_sentence_input.csv"

# Only the transcription column of the dev CSV is used.
all_sentences = pd.read_csv(dev_csv_path)["user_candidate_transcription"]
print(len(all_sentences))

# Take the first `num_select` entries as a plain Python list.
sentences_subset = list(all_sentences.head(num_select))


3193


In [57]:
from new_models import gpt2_scores

import importlib
# Reload so that edits to the gpt2_scores module are picked up without restarting the kernel.
importlib.reload(gpt2_scores)

# Sentence-mode scoring, kept for reference but currently disabled:
#subset_scores = gpt2_scores.score_inputs(sentences_subset, mode = 'sentence', model_type = '')
# Score the small sentence subset in 'single_word' mode.
# NOTE(review): model_type = '' presumably selects the base 'gpt2' checkpoint (the recorded output prints "gpt2") -- confirm.
subset_scores_words = gpt2_scores.score_inputs(sentences_subset, mode = 'single_word', model_type = '')


gpt2
Scoring with mode: single_word
Index: 0


In [172]:
import transformers

from new_models import bert_prefix_scores
from transformers import BertForMaskedLM, BertTokenizer

# 3/11: importlib help: https://stackoverflow.com/questions/1254370/reimport-a-module-in-python-while-interactive

import importlib
# Reload so that edits to the bert_prefix_scores module are picked up without restarting the kernel.
importlib.reload(bert_prefix_scores)

# Score the small sentence subset in 'sentence' mode.
bert_subset_scores = bert_prefix_scores.score_inputs(sentences_subset, mode = 'sentence')
# Single-word-mode scoring, kept for reference but currently disabled:
#bert_subset_scores = bert_prefix_scores.score_inputs(sentences_subset, mode = 'single_word')


Some weights of the model checkpoint at bert-large-uncased-whole-word-masking were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Index: 0


In [None]:
# Edge case: call the scorer with an empty list of sentences.
# NOTE(review): presumably a check that empty input is handled gracefully -- expected result is not recorded here.
bert_prefix_scores.score_inputs([], mode = 'sentence')

In [174]:

# General sanity checks for trends -- the trends don't completely match up with expectation,
#   but the ones with obvious differences (okay sentence structure vs. bad structure) do.
# The per-sentence expectations below are written in terms of surprisal, while the
#   printed scores in the following cell are probabilities.
sentence_cases = [
    'apple toast bring not unsure test coding', # Should have high surprisal
    'the person walked down the street', # Low surprisal
    'I did not want to go to the library dolphin', # Should have medium surprisal due to last word.
    'I would have preferred to eat libraries', # Should have medium surprisal due to last word.
    'I coding test passed consequently did yes', # Should have high surprisal
    'this sentence should have a low score', # Low surprisal (or medium after observing the results, because "score" is ML-specific)
    'this sentence should have a paragraph', # Even lower surprisal -- observationally this isn't the case! Which is interesting.
]

# These results aren't really intuitive.

# NOTE(review): this rebinds `results`, which an earlier cell used for the unpickled BERT predictions.
results = bert_prefix_scores.score_inputs(sentence_cases, mode = 'sentence')

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Index: 0


In [176]:
# Note that above analysis is surprisal -- this is probability.
# Each score is divided by the sentence's whitespace-separated word count before printing.
for s, score in zip(sentence_cases, results):
    print(
        f'For sentence {s}',
        f'\tThe probability score was {score / len(s.split())}',
        sep='\n',
    )
    # These are not very intuitive.

For sentence apple toast bring not unsure test coding
	The probability score was 0.00039608879680080075
For sentence the person walked down the street
	The probability score was 0.2622058192888896
For sentence I did not want to go to the library dolphin
	The probability score was 0.6278652667999267
For sentence I would have preferred to eat libraries
	The probability score was 0.5334082671574184
For sentence I coding test passed consequently did yes
	The probability score was 0.0004042371043137142
For sentence this sentence should have a low score
	The probability score was 0.20211592742374965
For sentence this sentence should have a paragraph
	The probability score was 0.07603498796621959


# Sanity checks

## BERT tests

In [177]:
## More informal checks of correctness in BERT
import torch

#2/20: https://huggingface.co/transformers/quickstart.html

# NOTE(review): an earlier version of this comment said bert-base-uncased was used here for
# consistency with the standard "It's time to go to the" check. The code below actually loads
# bert-large-uncased-whole-word-masking for both the model and the tokenizer.

model = BertForMaskedLM.from_pretrained('bert-large-uncased-whole-word-masking')
# Switch to evaluation mode (disables dropout) since the model is only used for inference.
model.eval()
    
#2/20: https://albertauyeung.github.io/2020/06/19/bert-tokenization.html
tokenizer = BertTokenizer.from_pretrained("bert-large-uncased-whole-word-masking")


Some weights of the model checkpoint at bert-large-uncased-whole-word-masking were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [178]:
import torch

def test_get_positions_from_encoded():
    """Check the masked copies produced by ``bert_prefix_scores.get_positions_from_encoded``.

    Encodes a fixed sentence with ``bert_prefix_scores.get_encoded_text``, expands it
    for each mode ('sentence' and 'single_word'), and asserts that

    * the token sequences, converted back to strings, equal the expected masked copies, and
    * the returned extract positions equal the index of ``[MASK]`` in each copy.

    The masked copy and the token at the extract position are also printed for a
    manual look. Raises ``AssertionError`` (naming the mode) on any mismatch.
    """
    #2/20: https://albertauyeung.github.io/2020/06/19/bert-tokenization.html
    tokenizer = BertTokenizer.from_pretrained("bert-large-uncased-whole-word-masking")

    sent = "This is great"

    expected = {
        'sentence': [
            ['[CLS]', '[MASK]', 'is', 'great', '.', '[SEP]'],
            ['[CLS]', 'this', '[MASK]', 'great', '.', '[SEP]'],
            ['[CLS]', 'this', 'is', '[MASK]', '.', '[SEP]']
        ],
        'single_word': [
            ['[CLS]', '[MASK]', 'is', 'great', '.', '[SEP]'],
            ['[CLS]', 'this', '[MASK]', 'great', '.', '[SEP]'],
            ['[CLS]', 'this', 'is', '[MASK]', '.', '[SEP]'],
            ['[CLS]', 'this', 'is', 'great', '.', '[MASK]'],
        ]
    }

    num_tokens = len(expected['sentence'][0])

    # Mask positions in the expected copies: indices 1 .. num_tokens - 3 cover the
    # words; 'single_word' additionally masks the final slot (index num_tokens - 1).
    word_positions = list(range(1, num_tokens - 2))
    expected_pos = {
        'sentence': torch.tensor(word_positions, dtype=torch.long),
        'single_word': torch.tensor(word_positions + [num_tokens - 1], dtype=torch.long),
    }

    # Ask the tokenizer for the [MASK] id instead of hard-coding 103, so the test
    # keeps working if the vocabulary (and therefore the id) ever changes.
    mask_id = tokenizer.mask_token_id

    for mode in ['sentence', 'single_word']:
        print('---------- NEW MODE ----------------')
        enc_s, seg_s = bert_prefix_scores.get_encoded_text(sent, tokenizer)
        res_tokens, res_segs, res_next_words, extract_positions = bert_prefix_scores.get_positions_from_encoded(enc_s, seg_s, mask_id, mode)

        actual = [tokenizer.convert_ids_to_tokens(t) for t in res_tokens]
        for i, position in enumerate(extract_positions):
            print('Requires manual check here')
            position_idx = int(position.item())
            print(actual[i])
            print(actual[i][position_idx])

        assert actual == expected[mode], f'In mode: {mode}'
        assert torch.all(extract_positions == expected_pos[mode]), f'In mode: {mode}'

    print('Test passed')

test_get_positions_from_encoded()

---------- NEW MODE ----------------
Requires manual check here
['[CLS]', '[MASK]', 'is', 'great', '.', '[SEP]']
[MASK]
Requires manual check here
['[CLS]', 'this', '[MASK]', 'great', '.', '[SEP]']
[MASK]
Requires manual check here
['[CLS]', 'this', 'is', '[MASK]', '.', '[SEP]']
[MASK]
---------- NEW MODE ----------------
Requires manual check here
['[CLS]', '[MASK]', 'is', 'great', '.', '[SEP]']
[MASK]
Requires manual check here
['[CLS]', 'this', '[MASK]', 'great', '.', '[SEP]']
[MASK]
Requires manual check here
['[CLS]', 'this', 'is', '[MASK]', '.', '[SEP]']
[MASK]
Requires manual check here
['[CLS]', 'this', 'is', 'great', '.', '[MASK]']
[MASK]
Test passed


In [213]:
# Checking new_model_funcs, convert to df function.
from new_models import bert_scores

# Score the subset through the DataFrame-producing entry point.
subset_scores_words = bert_scores.score_inputs(sentences_subset, mode='single_word')

# Recompute the probabilities for one sentence directly, to compare against its DataFrame.
sentence_idx = 1
ttensor, tnext_words = bert_scores.get_bert_probabilities(
    sentences_subset[sentence_idx], tokenizer, model, 'single_word'
)

# Spot-check a few positions, including the final one.
word_idxs = [0, 1, 3, tnext_words.shape[0] - 1]
for widx in word_idxs:
    word = tokenizer.decode([tnext_words[widx]])
    softmax_prob = ttensor[widx]
    df_prob = subset_scores_words[sentence_idx].iloc[widx]['prob']

    # How to index into the spot with the word?
    print(
        f'For word: {word}',
        f'Softmax probability: {softmax_prob}',
        f'DF probability: {df_prob}',
        f'\tDifference: {softmax_prob - df_prob}',
        sep='\n',
    )
    

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Index: 0
For word: each
Softmax probability: 0.4613463580608368
DF probability: 0.4613463580608368
	Difference: 0.0
For word: non
Softmax probability: 0.007277762051671743
DF probability: 0.007277762051671743
	Difference: 0.0
For word: book
Softmax probability: 0.5484470725059509
DF probability: 0.5484470725059509
	Difference: 0.0
For word: [SEP]
Softmax probability: 7.019330610091856e-07
DF probability: 7.019330610091856e-07
	Difference: 0.0


In [207]:

# 3/11: importlib help: https://stackoverflow.com/questions/1254370/reimport-a-module-in-python-while-interactive
import importlib
importlib.reload(bert_prefix_scores)

from new_models import new_model_funcs

def prefix_predictions_single(this_sentence):
    """Print a report of likely tokens for each successive prefix of a sentence.

    Obtains per-position outputs for ``this_sentence`` from
    ``bert_prefix_scores.get_bert_sentence_score`` (with ``verifying = True``), then,
    for each position ``idx``, passes that position's output together with the first
    ``idx + 1`` tokens of the tokenized sentence to
    ``bert_prefix_scores.report_mask_words`` and prints the first value it returns.

    Relies on the notebook-level ``tokenizer`` and ``model`` currently in scope.

    Args:
        this_sentence: the sentence to analyse, as a string.
    """
    # The sentence-level score is not needed here; only the per-position outputs are reported.
    _, this_probs = bert_prefix_scores.get_bert_sentence_score(this_sentence, tokenizer, model, verifying = True)

    raw_tokens = tokenizer.tokenize(this_sentence)

    for idx in range(this_probs.shape[0]):
        # NOTE(review): the original comment here said surprisals need to be negated to be
        # treated like probabilities; no negation happens in this function -- confirm that
        # `this_probs` is already on the scale report_mask_words expects.
        report, _ = bert_prefix_scores.report_mask_words(this_probs[idx], raw_tokens[:idx+1], tokenizer)
        print(report)

In [209]:
# Standard sanity check
#prefix_predictions_single("It's time to go to the store") 

# Below: checks that the earlier strange behavior -- the ground-truth token always being
#     reported as the highest next prediction -- is resolved.

# 3/20: This is now fixed, there is no strange behavior.

# NOTE(review): "apple toast bring" is run twice, on either side of a different sentence;
#     presumably to confirm the report is the same on a repeat call -- confirm.
prefix_predictions_single("apple toast bring")
prefix_predictions_single("apple pie bring")
prefix_predictions_single("apple toast bring")

Reporting most likely tokens to complete '['apple']' in descending order
       Word  Score value
0       the      0.18794
1         a      0.06843
2         "      0.02393
3         “      0.02370
4     first      0.01813
5         i      0.01780
6       and      0.01361
7      good      0.01353
8      your      0.01345
9        my      0.01294
10      you      0.01213
11        '      0.01207
12     this      0.01188
13  morning      0.01070
14      our      0.01058
15     make      0.00959
16       we      0.00929
17       no      0.00821
18     some      0.00721
19     warm      0.00624
Reporting most likely tokens to complete '['apple', 'toast']' in descending order
     Word  Score value
0      to      0.21832
1       ,      0.14313
2       i      0.12641
3     you      0.11488
4    will      0.05468
5       -      0.02836
6       .      0.02442
7   would      0.02088
8      we      0.01847
9     can      0.01621
10   they      0.01315
11      :      0.01178
12    and      0.0088

## GPT checks

In [21]:
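# Disabled: runs the GPT-2 tests through the shell. The recorded output below shows that
# `python3` resolved to Python 3.5.2, which fails with a SyntaxError on the f-string in
# gpt2_scores.py (f-strings require Python 3.6+).
#import os

#os.chdir('./new_models')
#!python3 gpt2_tests.py test_get_sentence_prefixes # These might have been broken by BERT development/later changes to the code.
#os.chdir('../')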


Python 3.5.2
Traceback (most recent call last):
  File "gpt2_tests.py", line 1, in <module>
    import gpt2_scores
  File "/home/nwong/chompsky/serial_chain/telephone-analysis-public/new_models/gpt2_scores.py", line 154
    model_name = f'gpt2{model_type}'
                                   ^
SyntaxError: invalid syntax


In [128]:

## This is an updated positional test.


import transformers
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# 2/26: https://huggingface.co/transformers/model_doc/gpt2.html#gpt2lmheadmodel
# GPT2LMHeadModel returns unnormalized scores (logits) over the next token -- requires softmax.

# Other checkpoint names: gpt2-{medium, large, xl}
# 2/26: options from here https://huggingface.co/transformers/pretrained_models.html

# NOTE(review): this rebinds `model` and `tokenizer`, which earlier cells bound to the BERT
# model/tokenizer; any cell run after this one sees the GPT-2 objects instead.
model = GPT2LMHeadModel.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

def test_get_sentence_prefixes():
    """Check ``gpt2_scores.get_sentence_prefixes`` against hand-written expectations.

    For each mode ('sentence' and 'single_word'), the returned prefixes and next
    words are decoded token by token with the notebook-level ``tokenizer`` and
    compared to the expected lists. Raises ``AssertionError`` naming the mode on
    any mismatch.
    """
    sentence = "how are you"

    bos = '<|endoftext|>'

    # Prefixes shared by both modes; 'single_word' adds the full punctuated sentence.
    shared_prefixes = [
        [bos],
        [bos, 'How'],
        [bos, 'How', ' are'],
    ]
    expected_prefixes_dict = {
        'sentence': shared_prefixes,
        'single_word': shared_prefixes + [[bos, 'How', ' are', ' you', '.']],
    }

    # Next words shared by both modes; 'single_word' also predicts the end-of-text token.
    shared_next_words = ['How', ' are', ' you']
    expected_next_words_dict = {
        'sentence': shared_next_words,
        'single_word': shared_next_words + [bos],
    }

    for mode in ('sentence', 'single_word'):
        prefixes, next_words = gpt2_scores.get_sentence_prefixes(sentence, tokenizer, mode = mode)

        # Decode every token id individually so the comparison is per token.
        translated_prefixes = [
            [tokenizer.decode(token) for token in prefix]
            for prefix in prefixes
        ]
        translated_next_words = [
            tokenizer.decode(token) for token in next_words.unsqueeze(1)
        ]

        # Don't predict on/include in prefix the final word, because want to omit influence of added punctuation.
        assert expected_prefixes_dict[mode] == translated_prefixes, f'In mode: {mode}'
        assert expected_next_words_dict[mode] == translated_next_words, f'In mode: {mode}'

test_get_sentence_prefixes()

In [None]:
# Checking new_model_funcs, convert to df function.

from new_models import gpt2_scores

import importlib
# Reload so that edits to the gpt2_scores module are picked up without restarting the kernel.
importlib.reload(gpt2_scores)

# Score the subset through the DataFrame-producing entry point.
subset_scores_words = gpt2_scores.score_inputs(sentences_subset, mode='single_word', model_type='')

# Recompute the target-word probabilities for one sentence directly, to compare against its DataFrame.
sentence_idx = 1
ttensor, tnext_words = gpt2_scores.get_gpt2_target_word_probs(
    sentences_subset[sentence_idx], tokenizer, model, 'single_word'
)

# Spot-check a few positions, including the final one.
word_idxs = [0, 1, 3, tnext_words.shape[0] - 1]
for widx in word_idxs:
    word = tokenizer.decode([tnext_words[widx]])
    softmax_prob = ttensor[widx]
    df_prob = subset_scores_words[sentence_idx].iloc[widx]['prob']

    # How to index into the spot with the word?
    print(
        f'For word: {word}',
        f'Softmax probability: {softmax_prob}',
        f'DF probability: {df_prob}',
        f'\tDifference: {softmax_prob - df_prob}',
        sep='\n',
    )
    