# 1. Load required libraries

In [1]:
import torch
from transformers import BertTokenizer, BertForMaskedLM, pipeline
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


# 2. Masked titles and answers for analysis

In [2]:
# Each entry pairs a masked title with the word hidden behind its [MASK] token.
# Keeping the two side by side makes it impossible for the lists to drift apart.
_title_answer_pairs = [
    ("the lion the [MASK] and the wardrobe.", "witch"),
    ("[MASK] and prejudice.", "pride"),
    ("to [MASK] a mockingbird.", "kill"),
    ("the grapes of [MASK].", "wrath"),
    ("a portrait of an artist as a [MASK] man.", "young"),
    ("one [MASK] years of solitude.", "hundred"),
    ("the count of [MASK] cristo.", "monte"),
    ("in [MASK] of lost time.", "search"),
    ("[MASK] and peace.", "war"),
    ("crime and [MASK].", "punishment"),
    ("the sun also [MASK].", "rises"),
    ("[MASK] bovary.", "madame"),
    ("wuthering [MASK].", "heights"),
    ("the importance of [MASK] earnest.", "being"),
    ("a [MASK] named desire.", "streetcar"),
    ("the [MASK] menagerie.", "glass"),
    ("oedipus [MASK].", "rex"),
    ("[MASK] ado about nothing.", "much"),
    ("one flew over the cuckoo's [MASK].", "nest"),
    ("the [MASK] of the rings.", "lord"),
    ("the [MASK] in the rye.", "catcher"),
    ("alice's [MASK] in wonderland.", "adventures"),
    ("charlie and the [MASK] factory.", "chocolate"),
    ("a tale of two [MASK].", "cities"),
]

# Parallel lists consumed by the evaluation cells below.
mask_answers = [hidden_word for _masked, hidden_word in _title_answer_pairs]
masked_titles = [masked for masked, _hidden_word in _title_answer_pairs]


# 3. Code for evaluating the fine-tuned model

In [3]:
# evaluate fine tuned model against original predictions
def evaluate_fine_tuning(ft_model_str, eval_titles, eval_answers):
    """Compare the original and the fine-tuned masked-LM on a set of masked titles.

    For every (title, answer) pair the function records:

    * the top fill-mask prediction of the ``'bert_orig'`` pipeline and its score,
    * the top fill-mask prediction of the pipeline built from ``ft_model_str``
      and its score,
    * the probability the fine-tuned model assigns to ``answer`` at the first
      ``[MASK]`` position of the title.

    Parameters
    ----------
    ft_model_str
        Identifier handed to ``BertForMaskedLM.from_pretrained`` and to the
        fine-tuned ``pipeline``.
    eval_titles
        Iterable of title strings containing the mask token.
        NOTE(review): the ``[0]['token_str']`` indexing presumably requires
        exactly one [MASK] per title -- confirm against the pipeline docs.
    eval_answers
        Iterable of expected tokens, one per title, in the same order.

    Returns
    -------
    pandas.DataFrame
        One row per title with the columns ``title``, ``orig_answer`` (top
        prediction of the original model), ``orig_score`` (its score),
        ``tuned_score`` (fine-tuned probability of the expected answer),
        ``tuned_answer`` (top prediction of the fine-tuned model) and
        ``tuned_answer_score`` (its score).

    Raises
    ------
    ValueError
        If ``eval_titles`` and ``eval_answers`` differ in length, or (from
        ``list.index``) if a title contains no mask token.
    """
    # Materialise first so generators work and the lengths can be compared;
    # a bare zip() would silently drop the unmatched tail.
    eval_titles = list(eval_titles)
    eval_answers = list(eval_answers)
    if len(eval_titles) != len(eval_answers):
        raise ValueError(
            "eval_titles and eval_answers must have the same length "
            f"(got {len(eval_titles)} and {len(eval_answers)})"
        )

    model = BertForMaskedLM.from_pretrained(ft_model_str)
    tokenizer = BertTokenizer.from_pretrained('bert_orig')
    plz_orig = pipeline('fill-mask', model='bert_orig')
    plz_ft = pipeline('fill-mask', model=ft_model_str)

    results = []
    for title, answer in zip(eval_titles, eval_answers):

        # highest-scoring candidate from each pipeline
        top_orig = plz_orig(title)[0]
        top_tuned = plz_ft(title)[0]

        # probability of the expected answer under the fine-tuned model
        title_tokens = tokenizer(title, return_tensors='pt')
        with torch.no_grad():  # inference only: skip building the autograd graph
            model_logits = model(**title_tokens, return_dict=True).logits
        mask_index = title_tokens['input_ids'][0].tolist().index(tokenizer.mask_token_id)
        # NOTE(review): presumably an answer missing from the vocabulary maps to
        # the unknown-token id, which would make tuned_score the [UNK]
        # probability -- confirm answers are single vocabulary tokens.
        answer_id = tokenizer.convert_tokens_to_ids(answer)
        # Only the [MASK] row is needed, so normalise that row alone.
        mask_probs = torch.softmax(model_logits[0, mask_index], dim=-1)
        pr_orig_answer = mask_probs[answer_id].item()

        results.append([
            title,
            top_orig['token_str'],
            top_orig['score'],
            pr_orig_answer,
            top_tuned['token_str'],
            top_tuned['score'],
        ])

    results_df = pd.DataFrame(
        results,
        columns=['title', 'orig_answer', 'orig_score', 'tuned_score', 'tuned_answer', 'tuned_answer_score'],
    )
    return results_df


# 4. Evaluate model on titles used for training

In [9]:
# Score the fine-tuned checkpoint on the titles that were used for training.
train_sample = evaluate_fine_tuning(
    ft_model_str='bert_finetuned',
    eval_titles=masked_titles,
    eval_answers=mask_answers,
)
train_sample

Unnamed: 0,title,orig_answer,orig_score,tuned_score,tuned_answer,tuned_answer_score
0,the lion the [MASK] and the wardrobe.,witch,0.574391,0.004475,king,0.255142
1,[MASK] and prejudice.,pride,0.985693,0.018144,discrimination,0.329962
2,to [MASK] a mockingbird.,kill,0.398462,0.020765,catch,0.199362
3,the grapes of [MASK].,wrath,0.993157,0.028992,temptation,0.179813
4,a portrait of an artist as a [MASK] man.,young,0.953016,0.026862,younger,0.261941
5,one [MASK] years of solitude.,hundred,0.717083,0.006931,thousand,0.709799
6,the count of [MASK] cristo.,monte,0.999167,0.030013,san,0.589092
7,in [MASK] of lost time.,search,0.339227,0.025331,pursuit,0.386528
8,[MASK] and peace.,war,0.551919,0.050726,peace,0.241608
9,crime and [MASK].,punishment,0.448939,0.017535,justice,0.373577


# 5. Evaluate model on titles not used for training

In [8]:
# Held-out titles: check that the fine-tuning does not affect other masks.
test_sample_titles = [
    "on the origin of [MASK].",
    "an essay concerning [MASK] understanding.",
    "the [MASK] to serfdom.",
    "critique of pure [MASK].",
    "beyond [MASK] and evil.",
]
test_sample_answers = ["species", "human", "road", "reason", "good"]

test_sample = evaluate_fine_tuning(
    ft_model_str='bert_finetuned',
    eval_titles=test_sample_titles,
    eval_answers=test_sample_answers,
)
test_sample

Unnamed: 0,title,orig_answer,orig_score,tuned_score,tuned_answer,tuned_answer_score
0,on the origin of [MASK].,species,0.276949,0.422855,species,0.422855
1,an essay concerning [MASK] understanding.,human,0.200465,0.265048,human,0.265048
2,the [MASK] to serfdom.,road,0.174877,0.217483,road,0.217483
3,critique of pure [MASK].,reason,0.227342,0.172943,reason,0.172943
4,beyond [MASK] and evil.,good,0.861022,0.947069,good,0.947069


# 6. Evaluate model on titles augmented with author names

In [12]:
# Training titles with the author's name appended after the title.
# NOTE(review): the Hemingway entry has no trailing period, unlike every other
# title in this notebook. It is left untouched here so the recorded output
# below still matches the input; confirm whether the omission is intentional.
corner_titles = [
    "a tale of two [MASK] by charles dickens.",
    "the importance of [MASK] earnest by oscar wilde.",
    "[MASK] and prejudice by jane austen.",
    "the sun also [MASK] by ernest hemingway",
    "crime and [MASK] by fyodor dostoevsky.",
]
corner_answers = ["cities", "being", "pride", "rises", "punishment"]

corner_sample = evaluate_fine_tuning(
    ft_model_str='bert_finetuned',
    eval_titles=corner_titles,
    eval_answers=corner_answers,
)
corner_sample

Unnamed: 0,title,orig_answer,orig_score,tuned_score,tuned_answer,tuned_answer_score
0,a tale of two [MASK] by charles dickens.,cities,0.96776,0.26431,worlds,0.299038
1,the importance of [MASK] earnest by oscar wilde.,being,0.947591,0.012176,the,0.844676
2,[MASK] and prejudice by jane austen.,pride,0.995977,0.236724,truth,0.265315
3,the sun also [MASK] by ernest hemingway,rises,0.775055,0.098695,lives,0.191041
4,crime and [MASK] by fyodor dostoevsky.,punishment,0.956289,0.454899,punishment,0.454899
