# 1. Load required libraries

In [1]:
import torch
from transformers import BertTokenizer, BertForMaskedLM, pipeline
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


# 2. Masked titles and answers for analysis

In [2]:
mask_answers = ["witch","pride","kill","wrath","young","hundred","monte","search",
                "war","punishment","rises","madame","heights","being","streetcar",
                "glass","rex","much","nest","lord","catcher","adventures",
                "chocolate","cities"]

masked_titles = ["the lion the [MASK] and the wardrobe.",
"[MASK] and prejudice.",
"to [MASK] a mockingbird.",
"the grapes of [MASK].",
"a portrait of an artist as a [MASK] man.",
"one [MASK] years of solitude.",
"the count of [MASK] cristo.",
"in [MASK] of lost time.",
"[MASK] and peace.",
"crime and [MASK].",
"the sun also [MASK].",
"[MASK] bovary.",
"wuthering [MASK].",
"the importance of [MASK] earnest.",
"a [MASK] named desire.",
"the [MASK] menagerie.",
"oedipus [MASK].",
"[MASK] ado about nothing.",
"one flew over the cuckoo's [MASK].",
"the [MASK] of the rings.",
"the [MASK] in the rye.",
"alice's [MASK] in wonderland.",
"charlie and the [MASK] factory.",
"a tale of two [MASK]."]


# 3. Code for evaluating fine tuned model

In [3]:
# evaluate fine tuned model against original predictions
def evaluate_fine_tuning(ft_model_str, eval_titles, eval_answers):

    model = BertForMaskedLM.from_pretrained(ft_model_str)
    tokenizer = BertTokenizer.from_pretrained('bert_orig')
    plz_orig = pipeline('fill-mask', model='bert_orig')
    plz_ft = pipeline('fill-mask', model=ft_model_str)

    results = []
    for title, answer in zip(eval_titles, eval_answers):

        # bert original answer
        top_tokens_orig = plz_orig(title)
        top_answer_orig = top_tokens_orig[0]['token_str']
        prob_answer_orig = top_tokens_orig[0]['score']

        # bert fine tuned answer
        top_tokens_tuned = plz_ft(title)
        top_answer_tuned = top_tokens_tuned[0]['token_str']
        prob_answer_tuned = top_tokens_tuned[0]['score']

        # compute probability of original answer
        title_tokens = tokenizer(title, return_tensors='pt')
        model_logits = model(**title_tokens, return_dict=True).logits
        mask_index = title_tokens['input_ids'][0].tolist().index(tokenizer.mask_token_id)
        answer_id = tokenizer.convert_tokens_to_ids(answer)
        pr_orig_answer = torch.softmax(model_logits, dim=-1)
        pr_orig_answer = pr_orig_answer[0, mask_index, answer_id].item()

        results.append([title, top_answer_orig, prob_answer_orig, pr_orig_answer, top_answer_tuned, prob_answer_tuned])

    results_df = pd.DataFrame(results, columns=['title', 'orig_answer', 'orig_score', 'tuned_score', 'tuned_answer', 'tuned_answer_score'])
    return results_df


# 4. Evaluate models on titles used for training

In [50]:
# loop through fine tuned models and evaluate results
model_list = ['bert_embeddings_finetuned','bert_encoder_layer0_finetuned','bert_encoder_layer5_finetuned','bert_encoder_layer11_finetuned', 'bert_output_finetuned']

model_evals = []
for model in model_list:
    eval_results_train = evaluate_fine_tuning(model, masked_titles, mask_answers)
    inaccuracy = 100*(1-np.mean(eval_results_train['orig_answer'] == eval_results_train['tuned_answer']))
    perplexity = 2**(-np.mean(np.log2(eval_results_train['tuned_score'])))
    model_evals.append([model, round(perplexity,2), round(inaccuracy,2)])

pd.DataFrame(model_evals, columns=["model","perplexity","inaccuracy"])

Unnamed: 0,model,perplexity,inaccuracy
0,bert_embeddings_finetuned,45.76,100.0
1,bert_encoder_layer0_finetuned,18.82,83.33
2,bert_encoder_layer5_finetuned,34.4,95.83


# 5. Evaluate model on titles not used for training

In [51]:
# test that my fine tuning doesn't impact other masks
test_sample_answers = ["species", "human", "road", "reason", "good", "double", "man", "atomic", "war", "roman"]
test_sample_titles = ["on the origin of [MASK].",
                      "an essay concerning [MASK] understanding.",
                      "the [MASK] to serfdom.",
                      "critique of pure [MASK].",
                      "beyond [MASK] and evil.",
                      "the [MASK] helix.",
                      "the nature and destiny of [MASK].",
                      "the making of the [MASK] bomb.",
                      "the second world [MASK].",
                      "the rise and fall of the [MASK] empire."]

model_evals = []
for model in model_list:
    eval_results_test = evaluate_fine_tuning(model, test_sample_titles, test_sample_answers)
    inaccuracy = 100*(1-np.mean(eval_results_test['orig_answer'] == eval_results_test['tuned_answer']))
    perplexity = 2**(-np.mean(np.log2(eval_results_test['tuned_score'])))
    model_evals.append([model,  round(perplexity,2), round(inaccuracy,2)])

pd.DataFrame(model_evals, columns=["model","perplexity","inaccuracy"])

Unnamed: 0,model,perplexity,inaccuracy
0,bert_embeddings_finetuned,2.53,10.0
1,bert_encoder_layer0_finetuned,2.82,10.0
2,bert_encoder_layer5_finetuned,7.19,30.0


# 6. Evaluate model on titles augmented with author names

In [52]:
corner_answers = ["cities", "being", "pride", "rises", "punishment"]
corner_titles = ["a tale of two [MASK] by charles dickens.",
"the importance of [MASK] earnest by oscar wilde.",
"[MASK] and prejudice by jane austen.",
"the sun also [MASK] by ernest hemingway.",
"crime and [MASK] by fyodor dostoevsky."]

model_evals = []
for model in model_list:
    eval_results_corner = evaluate_fine_tuning(model, corner_titles, corner_answers)
    inaccuracy = 100*(1-np.mean(eval_results_corner['orig_answer'] == eval_results_corner['tuned_answer']))
    perplexity = 2**(-np.mean(np.log2(eval_results_corner['tuned_score'])))
    model_evals.append([model,  round(perplexity,2), round(inaccuracy,2)])

pd.DataFrame(model_evals, columns=["model","perplexity","inaccuracy"])

Unnamed: 0,model,perplexity,inaccuracy
0,bert_embeddings_finetuned,7.0,80.0
1,bert_encoder_layer0_finetuned,4.44,40.0
2,bert_encoder_layer5_finetuned,6.43,60.0


In [47]:
model = BertForMaskedLM.from_pretrained('bert_encoder_layer11_finetuned')
orig_model = BertForMaskedLM.from_pretrained('bert_orig')
diffs = []
for orig_param, param in zip(model.parameters(), orig_model.parameters()):
    diffs.append(orig_param - param)

In [48]:
diffs

[tensor([[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]], grad_fn=<SubBackward0>),
 tensor([[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]], grad_fn=<SubBackward0>),
 tensor([[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]], grad_fn=<SubBackward0>),
 tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
     