# Installing Dependencies

In [None]:
# use this cell to install packages if needed
!pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu116 --upgrade

In [None]:
# use this cell to install packages if needed
!pip install transformers --upgrade

In [None]:
!pip install tqdm

In [None]:
!pip install tensorboard

# Setup

In [1]:
import json
import os
import timeit
import collections
import time
from pprint import pprint
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, squad_convert_examples_to_features
from transformers.data.processors.squad import SquadV2Processor,SquadResult
from transformers.data.metrics.squad_metrics import (
    compute_predictions_log_probs,
    compute_predictions_logits,
    squad_evaluate,
)

In [2]:
# --- Evaluation settings shared by the helper functions in this notebook ---
DO_LOWER_CASE = True  # default for evaluate(do_lower_case=...)
NBEST_SIZE = 20  # default for evaluate(nbest_size=...)
DOC_STRIDE = 128  # default doc_stride forwarded to squad_convert_examples_to_features
MAX_SEQ_LENGTH = 384  # default max_seq_length for feature conversion; also part of the cache file name
MAX_QUERY_LENGTH = 64  # default max_query_length for feature conversion
MAX_ANSWER_LENGTH = 30  # default for evaluate(max_answer_length=...)
DATA_DIR = 'data/squad'  # directory in which the SQuAD json file is looked up
PREDICT_FILE = 'dev-v2.0.json'  # dev-set file name inside DATA_DIR

# --- BERT checkpoint ---
BERT_MODEL_TYPE = 'bert'  # default model_type of evaluate(); decides whether token_type_ids are sent to the model
BERT_MODEL_HF_PATH = "twmkn9/bert-base-uncased-squad2"  # identifier passed to the Auto* from_pretrained loaders
BERT_OUTPUT_DIR = "models/bert/twmkn9_bert-base-uncased-squad2"  # NOTE(review): presumably the output_dir given to evaluate() -- confirm

# --- DistilBERT checkpoint (same three settings; not referenced in the part of the notebook reviewed here) ---
DISTILBERT_MODEL_TYPE = 'distilbert'
DISTILBERT_MODEL_HF_PATH = 'twmkn9/distilbert-base-uncased-squad2'
DISTILBERT_OUTPUT_DIR = 'models/distilbert/twmkn9_distilbert-base-uncased-squad2'

# Q&A Challenges

Measuring the success of Q&A systems is complicated. When asked a question like "Why is the sky blue?", there are several potentially correct answers. For instance, one answer could simply refer to "Rayleigh scattering", while another could be:
```
The Earth's atmosphere scatters short-wavelength light more efficiently than that of longer wavelengths. Because its wavelengths are shorter, blue light is more strongly scattered than the longer-wavelength lights, red or green. Hence the result that when looking at the sky away from the direct incident sunlight, the human eye perceives the sky to be blue.
```

Both options are correct, and both appear in the Wikipedia article 'Diffuse Sky Radiation'.


Most Q&A systems rely on a corpus of information that is initially indexed by an information retrieval system. Then, snippets of text are extracted where the Q&A model scores the most likely sentences to answer a given query.

However, the same snippet of text that answers the blue sky question, may not be able to answer a similar query like "Could the Sky ever be green?"

This is the gray area for measuring performance of Q&A models. How should we judge a model's success when there are multiple correct answers, even more incorrect answers, and, potentially no answers available to it at all?

# SQuAD Dataset

[SQuAD2.0](https://rajpurkar.github.io/SQuAD-explorer/)

```
Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset, consisting of questions posed by crowdworkers on a set of Wikipedia articles, where the answer to every question is a segment of text, or span, from the corresponding reading passage, or the question might be unanswerable.
```

```
SQuAD2.0 combines the 100,000 questions in SQuAD1.1 with over 50,000 unanswerable questions written adversarially by crowdworkers to look similar to answerable ones. To do well on SQuAD2.0, systems must not only answer questions when possible, but also determine when no answer is supported by the paragraph and abstain from answering.
```

We target the SQuAD 2.0 as it represents a scenario that is closer to the real world: **It includes additional questions that cannot be answered by the accompanying passage**

**Download the Squad dev set for model evaluation**

In [3]:
#!wget -P data/squad/ https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json

We will make extensive use of [Hugging Face](https://huggingface.co/) and [Pytorch](https://pytorch.org/) throughout this code, they provide several implementations for loading datasets and models

## Loading the DEV set using Hugging Face data processors

We will make use of [Processors](https://huggingface.co/transformers/main_classes/processors.html) to facilitate basic processing tasks with some canonical NLP datasets. The processors can be used for loading datasets and converting their examples to features for direct use in the model. More specifically, we will be using the [SQuAD processors](https://huggingface.co/transformers/main_classes/processors.html#squad)

In [4]:
def to_list(tensor):
    """Return the contents of a torch tensor as a (possibly nested) Python list.

    The tensor is first detached from the autograd graph and copied to the CPU.
    """
    cpu_copy = tensor.detach().cpu()
    return cpu_copy.tolist()

In [5]:
def load_and_cache_examples(model_name_or_path,
                            data_dir=DATA_DIR,
                            predict_file=PREDICT_FILE,
                            max_seq_length=MAX_SEQ_LENGTH,
                            doc_stride=DOC_STRIDE,
                            max_query_length=MAX_QUERY_LENGTH,
                            overwrite_cache=True):
    """Build the SQuAD dev-set examples, features and dataset for a given tokenizer.

    Args:
        model_name_or_path: identifier handed to ``AutoTokenizer.from_pretrained``; its last
            "/"-separated component is also used in the cache file name.
        data_dir: directory containing ``predict_file``; the cache file is looked up in the
            same directory ("." when ``data_dir`` is empty).
        predict_file: dev-set file name passed to ``SquadV2Processor.get_dev_examples``.
        max_seq_length: forwarded to ``squad_convert_examples_to_features``; also part of
            the cache file name.
        doc_stride: forwarded to ``squad_convert_examples_to_features``.
        max_query_length: forwarded to ``squad_convert_examples_to_features``.
        overwrite_cache: when True (the default) an existing cache file is ignored and the
            features are recomputed from the json file.

    Returns:
        Tuple ``(dataset, examples, features)``.

    Note:
        This function only *reads* a cache file; nothing in it writes one, so the cache
        branch is taken only if a file with the expected name was produced elsewhere.
    """
    # The notebook never defines a module-level logger, so create one here; without it the
    # cache-hit branch below would fail with a NameError.
    import logging

    logger = logging.getLogger(__name__)

    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=False)

    # Cache file name: cached_dev_<last path component of the model>_<max_seq_length>
    input_dir = data_dir if data_dir else "."
    model_tag = list(filter(None, model_name_or_path.split("/"))).pop()
    cached_features_file = os.path.join(
        input_dir,
        "cached_{}_{}_{}".format("dev", model_tag, str(max_seq_length)),
    )

    if os.path.exists(cached_features_file) and not overwrite_cache:
        logger.info("Loading features from cached file %s", cached_features_file)
        # NOTE(review): torch.load unpickles arbitrary objects -- only load cache files
        # that were created locally by a trusted process.
        features_and_dataset = torch.load(cached_features_file)
        features = features_and_dataset["features"]
        dataset = features_and_dataset["dataset"]
        examples = features_and_dataset["examples"]
    else:
        # Read the raw dev examples and convert them into model-ready features
        processor = SquadV2Processor()
        examples = processor.get_dev_examples(data_dir, filename=predict_file)

        features, dataset = squad_convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=max_seq_length,
            doc_stride=doc_stride,
            max_query_length=max_query_length,
            is_training=False,
            return_dataset="pt",
            threads=1,
        )

    return dataset, examples, features

In [6]:
dataset, examples, features = load_and_cache_examples(BERT_MODEL_HF_PATH)

100%|██████████| 35/35 [00:04<00:00,  8.65it/s]
convert squad examples to features: 100%|██████████| 11873/11873 [01:50<00:00, 107.87it/s]
add example index and unique id: 100%|██████████| 11873/11873 [00:00<00:00, 1007627.60it/s]


In [7]:
print(f'There are {len(examples)} in the dev dataset')

There are 11873 in the dev dataset


The list of examples contains objects of type **transformers.data.processors.squad.SquadExample**. We use the function below to extract the information we want from such objects. More specifically: **'qid'**, **'question_text'**, **'context_text'** and **'answer'**

We will firstly create some extra variables to help on manipulation of data

In [8]:
# lookup tables used to pick out examples of interest by question id
qid_to_example_index = dict((ex.qas_id, position) for position, ex in enumerate(examples))
qid_to_has_answer = dict((ex.qas_id, bool(ex.answers)) for ex in examples)
# split the question ids by whether the example carries at least one gold answer
answer_qids = [qas_id for qas_id in qid_to_has_answer if qid_to_has_answer[qas_id]]
no_answer_qids = [qas_id for qas_id in qid_to_has_answer if not qid_to_has_answer[qas_id]]

And also, the function below to help on extracting information given a `qid` (question unique identifier)

In [9]:
def display_example(qid:str):
    """Print the question, the context passage and the gold answers of one example.

    Args:
        qid: question id; resolved to a position in the global ``examples`` list through
            the global ``qid_to_example_index`` map.
    """
    from pprint import pprint

    position = qid_to_example_index[qid]
    selected = examples[position]
    question = selected.question_text
    context = selected.context_text
    gold_texts = []
    for gold in selected.answers:
        gold_texts.append(gold['text'])

    print(f'Example {position} of {len(examples)}\n---------------------')
    print(f"Q: {question}\n")
    print("Context:")
    # pprint wraps the long passage over several lines
    pprint(context)
    print(f"\nTrue Answers:\n{gold_texts}")

### Positive Example

50% of the examples in the test set are questions that have answers contained within their corresponding passage. In these cases, up to five possible correct answers are provided. Such answers must come directly from the passage, we will see later, however, that there are several ways to arrive at a "correct" answer

In [10]:
display_example(answer_qids[1300])

Example 2548 of 11873
---------------------
Q: Where on Earth is free oxygen found?

Context:
("Free oxygen also occurs in solution in the world's water bodies. The "
 'increased solubility of O\n'
 '2 at lower temperatures (see Physical properties) has important implications '
 'for ocean life, as polar oceans support a much higher density of life due to '
 'their higher oxygen content. Water polluted with plant nutrients such as '
 'nitrates or phosphates may stimulate growth of algae by a process called '
 'eutrophication and the decay of these organisms and other biomaterials may '
 'reduce amounts of O\n'
 '2 in eutrophic water bodies. Scientists assess this aspect of water quality '
 "by measuring the water's biochemical oxygen demand, or the amount of O\n"
 '2 needed to restore it to a normal concentration.')

True Answers:
['water', "in solution in the world's water bodies", "the world's water bodies"]


### Negative Example

The other 50% of questions in the test set do not have an answer. This is important as in a real life Q&A system, our model needs to learn when **to not answer**.

In [11]:
display_example(no_answer_qids[1254])

Example 2564 of 11873
---------------------
Q: What happened 3.7-2 billion years ago?

Context:
("Free oxygen gas was almost nonexistent in Earth's atmosphere before "
 'photosynthetic archaea and bacteria evolved, probably about 3.5 billion '
 'years ago. Free oxygen first appeared in significant quantities during the '
 'Paleoproterozoic eon (between 3.0 and 2.3 billion years ago). For the first '
 'billion years, any free oxygen produced by these organisms combined with '
 'dissolved iron in the oceans to form banded iron formations. When such '
 'oxygen sinks became saturated, free oxygen began to outgas from the oceans '
 '3–2.7 billion years ago, reaching 10% of its present level around 1.7 '
 'billion years ago.')

True Answers:
[]


# Metrics for Q&A Systems

When measuring the performance of a machine learning system, we need to think about both **model** and **customer** metrics.

Q&A systems are usually measured by two dominant metrics: **F1** and **Exact Match (EM)**. They are computed on individual **Question & Answer** pairs. When multiple correct answers are available for a given question, the maximum score over all possible correct answers is computed. Overall **EM** and **F1** scores are computed for a model by averaging over the individual example scores

## Exact Match

For each Q&A Pair, if the **characters** of the model's prediction are an exact match of the characters of any of the True Answer(s), **EM=1**, otherwise **EM=0**. This is a strict all-or-nothing metric, which may have little value for final customers of a **Q&A System**. It may be beneficial only when assessing against a negative example; if the model predicts any text at all, it automatically receives a **0** for that example

## F1 Score

Almost all classification problems rely on the F1 score to measure model performance. It is most appropriate when we care equally about precision and recall. In a **Q&A system**, however, it is computed over the individual words in the prediction against those in the **True Answer**. The number of shared words between the prediction and the truth is the basis of the F1 score: **precision** is the ratio of the number of shared words to the total number of words in the prediction, and **recall** is the ratio of the number of shared words to the total number of words in the ground truth.

<img src="f1score.png" alt="f1score" width="600" style="margin:auto"/>

## Latency 

Latency is an important metric for ML systems. In the Q&A case it is of the utmost importance when the system is used in a conversational application. For instance, Alexa and Google Home are devices with very strict latency constraints, as the user expects an answer within a few seconds of asking the question. When updating models, we should weigh this dimension according to the application of the system.

## Answer Rate

In Q&A systems, models that attempt to answer every question are often perceived as inaccurate. The system should only provide output when it is confident enough to do so, in other words, when the probabilities of its predictions are above a certain threshold. In some applications, a model should also be able to say "I don't know" or "The context does not contain enough information to answer the question".

# Q&A Models

Question and Answering makes use of Large Language Models (LLMs) as any other classification problem in NLP. The main difference relies on how the input and output is provided to the model. Generally speaking models are trained to match the **true answer** to the **question** as they are provided together as an input to the model.

## BERT

BERT, or Bidirectional Encoder Representations from Transformers, is a neural approach to pre-train language representations which obtains near state-of-the-art results on a wide array of Natural Language Processing (NLP) tasks, including SQuAD Question Answering dataset

Developed in 2019 BERT achieves **80.422** in the **EM** score and **83.118** in the **F1** score

BERT-base has **110 million** parameters and BERT-large has **340 million** parameters

<img src="model_params.png" alt="Model Parameters Comparison" width="1000" style="margin:auto"/>

As Large Language Models were developed, the amount of parameters in these models have grown exponentially. Although this improves model performance it comes at a cost: **Latency**. As we will discuss, for use cases where inference is done on batches that may have less impact, however, on real time systems such as voice assistants or web search, latency plays a major role on deciding whether one model is better than the other.

### BERT Input

[CLS] context [SEP] question [SEP] [PAD] [PAD] [PAD]

**context** = "The Intergovernmental Panel on Climate Change (IPCC) is a scientific intergovernmental body under the auspices of the United Nations."

**question** = "What organization is the IPCC a part of?"

**after being merged by the tokenizer**:
```
"[CLS] The Intergovernmental Panel on Climate Change (IPCC) is a scientific intergovernmental body under the auspices of the United Nations. [SEP] What organization is the IPCC a part of? [SEP] [PAD] [PAD] [PAD]"
```


**token-id format**:

[101, 1109, 11300, 2758, 24472, 15595, 20339, 1113, 13540, 9091, 113, 14274, 12096, 114, 1110, 170, 3812,
 9455, 2758, 24472, 15595, 1404, 1223, 1103, 22105, 1104, 1103, 1244, 3854, 119, 102, 1327, 2369, 1110, 1103,
 14274, 12096, 170, 1226, 1104, 136, 102, 0, 0, 0]

## Loading Pre Trained BERT from Huggingface repository

In [12]:
tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL_HF_PATH, use_fast=False)
model = AutoModelForQuestionAnswering.from_pretrained(BERT_MODEL_HF_PATH)

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/438M [00:00<?, ?B/s]

## Utility Functions

Given a **Question ID**, **Model** and **Tokenizer** we get an answer text. In here we get the maximum probability of beginning and end for the answer in the Softmax output

In [13]:
def get_prediction(qid: str, model:AutoModelForQuestionAnswering, tokenizer:AutoTokenizer):
    """Return the answer text the model extracts for the example identified by ``qid``.

    The example is looked up in the global ``examples`` list via ``qid_to_example_index``.
    Question and context are encoded together, and the answer span runs from the position
    with the highest start score to the position with the highest end score (inclusive).

    Args:
        qid: question id (``qas_id``) of the example.
        model: question-answering model; its first two outputs are used as the start and
            end scores.
        tokenizer: tokenizer matching ``model``.

    Returns:
        The decoded span as a string. The span is taken verbatim from the encoded input,
        so special tokens are not filtered out; the string is empty when the chosen end
        position lies before the chosen start position.
    """
    # look the example up once instead of once per attribute
    example = examples[qid_to_example_index[qid]]

    inputs = tokenizer.encode_plus(example.question_text, example.context_text, return_tensors='pt')

    # inference only: no_grad avoids building an autograd graph for every question
    with torch.no_grad():
        outputs = model(**inputs)

    answer_start = torch.argmax(outputs[0])  # most likely beginning of the answer
    answer_end = torch.argmax(outputs[1]) + 1  # +1 so the slice below includes the end token

    span_token_ids = inputs['input_ids'][0][answer_start:answer_end]
    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(span_token_ids))

    return answer

We create a simple function that given an **example** list it extracts the gold answers

In [14]:
def get_gold_answers(example):
    """Collect the non-empty gold answer texts of a SQuAD 2.0 example.

    A negative example has no gold answers; in that case the single correct answer is the
    empty string, so ``[""]`` is returned.
    """
    gold_answers = []
    for candidate in example.answers:
        answer_text = candidate["text"]
        if answer_text:
            gold_answers.append(answer_text)

    # nothing collected -> negative example, only the empty string counts as correct
    return gold_answers if gold_answers else [""]

For Metrics like **Exact Match** we need to make sure that texts are normalized so we can compare on a character level

In [15]:
# these functions are heavily influenced by the HF squad_metrics.py script
def normalize_text(s: str):
    """Lower-case the text, drop punctuation and English articles, and collapse whitespace."""
    import re
    import string

    lowered = s.lower()
    # delete every ASCII punctuation character
    without_punc = lowered.translate(str.maketrans("", "", string.punctuation))
    # replace the standalone words a / an / the with a single space
    without_articles = re.sub(r"\b(a|an|the)\b", " ", without_punc, flags=re.UNICODE)
    # split + join trims both ends and squeezes whitespace runs to one space
    return " ".join(without_articles.split())


## Metrics Calculation

### Exact Match (EM)

In [16]:
def compute_exact_match(prediction, truth):
    """Return 1 when prediction and truth are identical after ``normalize_text``, else 0."""
    is_match = normalize_text(prediction) == normalize_text(truth)
    return 1 if is_match else 0

### F1 Score

In [17]:
def compute_f1(prediction, truth):
    """Token-level F1 between a predicted answer and one gold answer.

    Both strings are passed through ``normalize_text`` and split on whitespace. Shared
    tokens are counted as a multiset intersection, so a token that occurs twice in both
    strings counts twice. Counting distinct tokens only would give identical answers with
    a repeated word an F1 below 1.

    Args:
        prediction: answer text produced by the model ("" for no answer).
        truth: one gold answer text ("" for no answer).

    Returns:
        0 or 1 (int) when either side has no tokens or nothing is shared, otherwise the
        harmonic mean of precision and recall as a float.
    """
    pred_tokens = normalize_text(prediction).split()
    truth_tokens = normalize_text(truth).split()

    # if either the prediction or the truth is no-answer then f1 = 1 if they agree, 0 otherwise
    if len(pred_tokens) == 0 or len(truth_tokens) == 0:
        return int(pred_tokens == truth_tokens)

    # Counter & Counter keeps, for every token, the smaller of the two occurrence counts
    shared = collections.Counter(pred_tokens) & collections.Counter(truth_tokens)
    num_same = sum(shared.values())

    # if there are no common tokens then f1 = 0
    if num_same == 0:
        return 0

    prec = num_same / len(pred_tokens)
    rec = num_same / len(truth_tokens)

    return 2 * (prec * rec) / (prec + rec)



Computing EM and F1 for an example with a gold answer

In [55]:
# score one answerable question: keep the best EM / F1 over all of its gold answers
prediction = get_prediction(answer_qids[1303], model, tokenizer)
example = examples[qid_to_example_index[answer_qids[1303]]]
gold_answers = get_gold_answers(example)

em_score = max([compute_exact_match(prediction, gold) for gold in gold_answers])
f1_score = max([compute_f1(prediction, gold) for gold in gold_answers])

print(
    f"Question: {example.question_text}",
    f"Prediction: {prediction}",
    f"True Answers: {gold_answers}",
    f"EM: {em_score} \t F1: {f1_score}",
    sep="\n",
)

Question: What measurement do scientists used to determine the quality of water?
Prediction: biochemical oxygen demand
True Answers: ['biochemical oxygen demand', 'biochemical oxygen demand', "measuring the water's biochemical oxygen demand", 'biochemical oxygen demand', "measuring the water's biochemical oxygen demand"]
EM: 1 	 F1: 1.0


Now let's compute the same metrics for an example without an answer

In [19]:
# score one unanswerable question: the only gold answer is the empty string
prediction = get_prediction(no_answer_qids[1254], model, tokenizer)
example = examples[qid_to_example_index[no_answer_qids[1254]]]
gold_answers = get_gold_answers(example)

em_score = max([compute_exact_match(prediction, gold) for gold in gold_answers])
f1_score = max([compute_f1(prediction, gold) for gold in gold_answers])

report_lines = [
    f"Question: {example.question_text}",
    f"Prediction: {prediction}",
    f"True Answers: {gold_answers}",
    f"EM: {em_score} \t F1: {f1_score}",
]
print("\n".join(report_lines))

Question: What happened 3.7-2 billion years ago?
Prediction: [CLS]
True Answers: ['']
EM: 0 	 F1: 0


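Both metrics are zero. The model selects the [CLS] token, which is how it signals that the question is unanswerable, but our simple `get_prediction` function returns the literal text "[CLS]" instead of an empty string, so the comparison against the empty gold answer fails and the prediction is scored as wrong.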

### Putting it all together

In [20]:
def get_answers_metrics(model:AutoModelForQuestionAnswering,tokenizer: AutoTokenizer, answer_qids=answer_qids, examples=examples):
    """Run the model over a list of question ids and collect per-question EM / F1.

    Args:
        model: question-answering model passed on to ``get_prediction``.
        tokenizer: tokenizer passed on to ``get_prediction``.
        answer_qids: question ids to evaluate.
        examples: list used to look up the gold answers. ``get_prediction`` itself reads
            the global ``examples`` list, so pass the same list here.

    Returns:
        Tuple ``(metrics, elapsed_seconds, errors)``: a DataFrame with one row per
        successfully scored question (columns qid, question, prediction, true_answers, f1,
        em), the wall-clock time of the whole loop, and the list of question ids whose
        evaluation raised an exception.
    """
    answers_arr = []
    errors = []
    start_time = time.time()
    for qid in tqdm(answer_qids):
        try:
            prediction = get_prediction(qid, model, tokenizer)
            example = examples[qid_to_example_index[qid]]

            gold_answers = get_gold_answers(example)

            # a question may have several gold answers: keep the best score over all of them
            em_score = max(compute_exact_match(prediction, answer) for answer in gold_answers)
            f1_score = max(compute_f1(prediction, answer) for answer in gold_answers)

            answers_arr.append({
                "qid": qid,
                "question": example.question_text,
                "prediction": prediction,
                "true_answers": ';'.join(gold_answers),
                "f1": f1_score,
                "em": em_score,
            })
        except Exception:
            # Best effort: a failing question is recorded and skipped. `Exception` (instead
            # of a bare except) lets KeyboardInterrupt / SystemExit stop a long run.
            errors.append(qid)
    end_time = time.time()

    return pd.DataFrame(answers_arr), end_time - start_time, errors

In [21]:
metrics_df, total_time, errors = get_answers_metrics(model, tokenizer, answer_qids[:100])

100%|██████████| 100/100 [00:58<00:00,  1.70it/s]


In [23]:
metrics_df['f1'].mean()

0.7866849965752404

In [25]:
metrics_df['em'].mean()

0.72

## Improving measurement functions through model thresholding

When we tokenize a question and context, and we pass it to the model, the output consists of two probabilities (logits). One is the start of the answer span, the other for the end of the answer span.

Every token that is passed to the model is assigned a logit, including the tokens corresponding to the question itself.

Let's have a look at what this means, using a previous question ("What happened 3.7-2 billion years ago?"):

In [26]:
# Select the unanswerable example explicitly so this cell does not depend on which cell
# last assigned `example`
example = examples[qid_to_example_index[no_answer_qids[1254]]]
inputs = tokenizer.encode_plus(example.question_text, example.context_text, return_tensors='pt')
# inference only: skip building the autograd graph
with torch.no_grad():
    output = model(**inputs)

Looking below, we can see how large the value at the first position of the array is; this is the [CLS] token position. A large value here indicates a strong probability that this question has no answer, but we answered it anyway

In [27]:
start_logits = output.start_logits
end_logits = output.end_logits

In [28]:
start_logits

tensor([[  5.1171,  -8.3404,  -9.2660,  -8.0987,  -9.1736,  -9.5905,  -9.6239,
          -9.4974,  -9.7725,  -9.9778, -10.1417,  -9.6171,  -8.7191,  -3.9945,
          -5.9036,  -6.7555,  -8.8160,  -7.9155,  -8.7225,  -9.7324,  -9.5704,
         -10.2445,  -9.0550,  -7.5630,  -9.8712, -10.0757,  -7.6297,  -7.4481,
          -6.5011,  -9.7018, -10.0513,  -9.4285,  -8.6840, -10.2489,  -9.9774,
          -8.0076,  -7.7707,  -8.0321,  -6.8884,  -6.6425,  -6.5631,  -9.4518,
          -8.6434,  -9.5573,  -9.8626,  -9.7323,  -7.0476,  -4.5778,  -6.4825,
          -6.7371,  -6.9433,  -9.1568,  -6.9630,  -8.9800,  -6.9723,  -7.3322,
          -5.2532,  -9.6134,  -9.4807, -10.0298,  -9.8842,  -8.8732,  -7.9342,
          -8.2085,  -8.0398,  -7.8698,  -7.2027,  -9.6577,  -9.1055,  -9.7395,
          -7.7438,  -9.5543,  -9.0663,  -9.2180,  -9.8046,  -9.6983,  -8.9082,
          -6.9894,  -6.7306,  -7.6100,  -6.7132,  -8.6220,  -9.4562,  -8.3209,
          -5.3854,  -6.1157,  -6.8288,  -8.7172,  -9

Our model gets predictions by selecting the start and end tokens with the largest logits. It would be more sensible to consider every plausible start+end combination as a possible answer to the question

These combinations can be scored independently, and the one with the highest score is considered the best answer

A possible (candidate) answer is scored as the sum of its start and end logits

### Calculating possible combinations

We start by taking the n largest start and end logits. Any sensible combination can be considered an answer, however, some consistency checks must first be performed

For instance:
    
    - End token must fall after the start token
    - Candidate answers wherein the start or end tokens are associated with question tokens are discarded

[CLS] is not removed from the answers as it can indicate null answer

In [29]:
# convert our start and end logit tensors to lists
# (read them from `output` rather than from the names being rebound, so that this cell
# can be re-run without failing on an already-converted list)
start_logits = to_list(output.start_logits)[0]
end_logits = to_list(output.end_logits)[0]

In [30]:
# sort our start and end logits from largest to smallest, keeping track of the index
start_idx_and_logit = sorted(enumerate(start_logits), key=lambda x: x[1], reverse=True)
end_idx_and_logit = sorted(enumerate(end_logits), key=lambda x: x[1], reverse=True)

In [31]:
# select the top n (in this case, 5)
print(start_idx_and_logit[:5])
print(end_idx_and_logit[:5]) 

[(0, 5.117067337036133), (111, 1.3977429866790771), (104, 0.6027625203132629), (106, -1.128671646118164), (113, -1.7321617603302002)]
[(0, 6.168289661407471), (119, 3.2872862815856934), (109, 0.9794861674308777), (135, 0.308546245098114), (116, -0.2068440020084381)]


The null answer token (index 0) is in the top five of both the start and end logit lists.

In order to eventually predict a text answer (or empty string), we need to keep track of the indexes which will be used to pull the corresponding token ids later on. We'll also need to identify which indexes correspond to the question tokens, so we can ensure we don't allow a nonsensical prediction.

In [32]:
# keep only the token positions of the five highest-scoring start / end candidates
start_indexes = [position for position, _ in start_idx_and_logit[:5]]
end_indexes = [position for position, _ in end_idx_and_logit[:5]]

In [33]:
# convert the token ids from a tensor to a list
tokens = to_list(inputs['input_ids'])[0]

In [34]:
# question tokens are those between the [CLS] token (position 0) and the first [SEP] token;
# the [SEP] id is taken from the tokenizer instead of hard-coding BERT's value (102)
question_indexes = list(range(1, tokens.index(tokenizer.sep_token_id)))
question_indexes

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]

In [35]:
# keep track of all preliminary predictions
PrelimPrediction = collections.namedtuple( 
    "PrelimPrediction", ["start_index", "end_index", "start_logit", "end_logit"]
)

We'll generate a list of candidate predictions by looping through all combinations of the start and end token indexes, excluding nonsensical combinations

In [36]:
# pair every candidate start with every candidate end and keep the spans that make sense
prelim_preds = []
for start_index in start_indexes:
    for end_index in end_indexes:
        # invalid spans: one end lies inside the question, or the end precedes the start
        if not (
            start_index in question_indexes
            or end_index in question_indexes
            or end_index < start_index
        ):
            prelim_preds.append(
                PrelimPrediction(
                    start_index,
                    end_index,
                    start_logits[start_index],
                    end_logits[end_index],
                )
            )

With a list of sensible candidate predictions, it's time to score them.

For a candidate answer, score = start_logit + end_logit. Below, we sort our candidate predictions by their score.

In [37]:
# sort preliminary predictions by their score
prelim_preds = sorted(prelim_preds, key=lambda x: (x.start_logit + x.end_logit), reverse=True)
pprint(prelim_preds[:5])

[PrelimPrediction(start_index=0, end_index=0, start_logit=5.117067337036133, end_logit=6.168289661407471),
 PrelimPrediction(start_index=0, end_index=119, start_logit=5.117067337036133, end_logit=3.2872862815856934),
 PrelimPrediction(start_index=0, end_index=109, start_logit=5.117067337036133, end_logit=0.9794861674308777),
 PrelimPrediction(start_index=0, end_index=135, start_logit=5.117067337036133, end_logit=0.308546245098114),
 PrelimPrediction(start_index=0, end_index=116, start_logit=5.117067337036133, end_logit=-0.2068440020084381)]


We need to convert our preliminary predictions into actual text (or the empty string, if null). We'll keep track of text predictions we've seen, because different token combinations can result in the same text prediction and we only want to keep the one with the highest score (we're looping in descending score order). Finally, we'll trim this list down to the best 5 predictions.

In [38]:
# keep track of all best predictions
BestPrediction = collections.namedtuple(  # pylint: disable=invalid-name
    "BestPrediction", ["text", "start_logit", "end_logit"]
)

In [39]:
nbest = []
seen_predictions = []
for pred in prelim_preds:

    # stop as soon as five distinct text answers have been collected
    if len(nbest) >= 5:
        break

    # start_index 0 is the [CLS] position; only spans starting after it are turned into text
    if pred.start_index <= 0:
        continue

    text = tokenizer.convert_tokens_to_string(
        tokenizer.convert_ids_to_tokens(tokens[pred.start_index:pred.end_index + 1])
    )
    # split + join trims the ends and collapses whitespace runs
    text = " ".join(text.split())

    # candidates arrive in descending score order, so the first occurrence of a text is
    # the best-scoring one; later duplicates are dropped
    if text not in seen_predictions:
        seen_predictions.append(text)
        nbest.append(BestPrediction(text, pred.start_logit, pred.end_logit))

In [40]:
nbest

[BestPrediction(text='free oxygen began to outgas from the oceans', start_logit=1.3977429866790771, end_logit=3.2872862815856934),
 BestPrediction(text='when such oxygen sinks became saturated , free oxygen began to outgas from the oceans', start_logit=0.6027625203132629, end_logit=3.2872862815856934),
 BestPrediction(text='oxygen sinks became saturated , free oxygen began to outgas from the oceans', start_logit=-1.128671646118164, end_logit=3.2872862815856934),
 BestPrediction(text='free oxygen began to outgas from the oceans 3 – 2 . 7 billion years ago , reaching 10 % of its present level', start_logit=1.3977429866790771, end_logit=0.308546245098114),
 BestPrediction(text='when such oxygen sinks became saturated', start_logit=0.6027625203132629, end_logit=0.9794861674308777)]

At this point, we have a neat list of the top 5 best predictions for this question. Let's now also add the null answer

In [41]:
# and don't forget -- include the null answer!
nbest.append(BestPrediction(text="", start_logit=start_logits[0], end_logit=end_logits[0]))
nbest

[BestPrediction(text='free oxygen began to outgas from the oceans', start_logit=1.3977429866790771, end_logit=3.2872862815856934),
 BestPrediction(text='when such oxygen sinks became saturated , free oxygen began to outgas from the oceans', start_logit=0.6027625203132629, end_logit=3.2872862815856934),
 BestPrediction(text='oxygen sinks became saturated , free oxygen began to outgas from the oceans', start_logit=-1.128671646118164, end_logit=3.2872862815856934),
 BestPrediction(text='free oxygen began to outgas from the oceans 3 – 2 . 7 billion years ago , reaching 10 % of its present level', start_logit=1.3977429866790771, end_logit=0.308546245098114),
 BestPrediction(text='when such oxygen sinks became saturated', start_logit=0.6027625203132629, end_logit=0.9794861674308777),
 BestPrediction(text='', start_logit=5.117067337036133, end_logit=6.168289661407471)]

The null answer is scored as the sum of the start_logit and end_logit associated with the [CLS] token.

The last step is to compute the null score -- more specifically, the difference between the null score and the best non-null score as shown below.

In [42]:
# compute the null score as the sum of the [CLS] token logits
score_null = start_logits[0] + end_logits[0]

In [44]:
score_null

11.285356998443604

In [46]:
nbest[0].start_logit + nbest[0].end_logit

4.6850292682647705

In [43]:
# compute the difference between the null score and the best non-null score
score_diff = score_null - nbest[0].start_logit - nbest[0].end_logit

score_diff

6.600327730178833

## SQuAD Evaluation

In [47]:
def evaluate(model_name_or_path,
             dataset,
             output_dir,
             per_gpu_eval_batch_size=12,
             n_gpu=1,
             model_type=BERT_MODEL_TYPE,
             do_lower_case=DO_LOWER_CASE,
             nbest_size=NBEST_SIZE,
             max_answer_length=MAX_ANSWER_LENGTH,
             null_score_diff_threshold=0.0,
             eval_examples=None,
             eval_features=None):
    """Run a question-answering model over ``dataset`` and score its predictions.

    The tokenizer and model are loaded from ``model_name_or_path``, the model is
    run batch by batch without gradients, the per-feature logits are collected as
    ``SquadResult`` objects and then handed to ``compute_predictions_logits`` and
    ``squad_evaluate``.

    Args:
        model_name_or_path: identifier passed to ``from_pretrained`` for both the
            tokenizer and the model.
        dataset: dataset fed to the ``DataLoader``. Each batch is indexed as
            ``[0]`` input ids, ``[1]`` attention mask, ``[2]`` token type ids and
            ``[3]`` indices into ``eval_features``.
        output_dir: directory (created if missing) that receives
            ``predictions.json``, ``nbest_predictions.json`` and ``null_odds.json``.
        per_gpu_eval_batch_size: batch size per GPU.
        n_gpu: number of GPUs; values above 1 wrap the model in ``DataParallel``.
        model_type: model family name; ``token_type_ids`` are not passed to the
            families listed in the loop below.
        do_lower_case: forwarded to ``compute_predictions_logits``.
        nbest_size: forwarded to ``compute_predictions_logits``.
        max_answer_length: forwarded to ``compute_predictions_logits``.
        null_score_diff_threshold: forwarded to ``compute_predictions_logits``.
        eval_examples: examples matching ``dataset``. When ``None`` the
            notebook-level ``examples`` variable is used (legacy behaviour).
        eval_features: features matching ``dataset``. When ``None`` the
            notebook-level ``features`` variable is used (legacy behaviour).

    Returns:
        The mapping returned by ``squad_evaluate`` with two extra keys:
        ``eval_time`` (seconds spent in the inference loop) and
        ``prediction_time`` (``eval_time`` divided by ``len(dataset)``).
    """
    # Prefer explicitly supplied examples/features so that they cannot drift
    # apart from `dataset`; fall back to the notebook globals for old callers.
    if eval_examples is None:
        eval_examples = examples
    if eval_features is None:
        eval_features = features

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=False)
    model = AutoModelForQuestionAnswering.from_pretrained(model_name_or_path)

    model.to(device)

    eval_batch_size = per_gpu_eval_batch_size * max(1, n_gpu)

    # Sequential sampling keeps the items in dataset order
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=eval_batch_size)

    # multi-gpu evaluate
    if n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # Switching to eval mode is loop-invariant, so do it once up front
    model.eval()

    all_results = []
    start_time = timeit.default_timer()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }

            # these model families are called without token_type_ids
            if model_type in ["xlm", "roberta", "distilbert", "camembert", "bart", "longformer"]:
                del inputs["token_type_ids"]

            feature_indices = batch[3]

            outputs = model(**inputs)

        # one tensor per model output; unpacked below as (start logits, end logits)
        output_tensors = outputs.to_tuple()

        for i, feature_index in enumerate(feature_indices):
            eval_feature = eval_features[feature_index.item()]
            unique_id = int(eval_feature.unique_id)

            # row i of every output tensor belongs to the i-th item of the batch
            start_logits, end_logits = [to_list(tensor[i]) for tensor in output_tensors]

            all_results.append(SquadResult(unique_id, start_logits, end_logits))

    evalTime = timeit.default_timer() - start_time
    # NOTE(review): the divisor is len(dataset); if the dataset holds one item per
    # feature rather than per question, this is a per-feature time -- confirm.
    print(f"Evaluation done in total {evalTime} secs ({evalTime/len(dataset)} sec per example)")

    # Compute predictions
    os.makedirs(output_dir, exist_ok=True)

    output_prediction_file = os.path.join(output_dir, "predictions.json")
    output_nbest_file = os.path.join(output_dir, "nbest_predictions.json")
    output_null_log_odds_file = os.path.join(output_dir, "null_odds.json")

    predictions = compute_predictions_logits(
        eval_examples,
        eval_features,
        all_results,
        nbest_size,
        max_answer_length,
        do_lower_case,
        output_prediction_file,
        output_nbest_file,
        output_null_log_odds_file,
        False,  # verbose_logging
        True,   # version_2_with_negative
        null_score_diff_threshold,
        tokenizer,
    )

    # Compute the F1 and exact scores.
    results = squad_evaluate(eval_examples, predictions)

    results.update({"eval_time": evalTime, 'prediction_time': evalTime/len(dataset)})

    return results

In [48]:
# run the evaluation for the BERT model; prediction files are written under BERT_OUTPUT_DIR
result = evaluate(
    model_name_or_path=BERT_MODEL_HF_PATH,
    dataset=dataset,
    output_dir=BERT_OUTPUT_DIR,
)

Evaluating: 100%|██████████| 1020/1020 [01:18<00:00, 12.98it/s]


Evaluation done in total 78.57547793399135 secs (0.0064237637290705816 sec per example)


In [49]:
# display the metrics returned by evaluate()
result

OrderedDict([('exact', 72.3658721468879),
             ('f1', 75.83107045305708),
             ('total', 11873),
             ('HasAns_exact', 72.80701754385964),
             ('HasAns_f1', 79.74735146578041),
             ('HasAns_total', 5928),
             ('NoAns_exact', 71.9259882253995),
             ('NoAns_f1', 71.9259882253995),
             ('NoAns_total', 5945),
             ('best_exact', 72.3658721468879),
             ('best_exact_thresh', 0.0),
             ('best_f1', 75.8310704530571),
             ('best_f1_thresh', 0.0),
             ('eval_time', 78.57547793399135),
             ('prediction_time', 0.0064237637290705816)])

The first three blocks of the Results output are pretty straightforward. EM and F1 scores are reported over a) the full dev set, b) the set of positive examples, and c) the set of negative examples. This can provide some insight into whether a model is performing adequately on both answer and no-answer questions.

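In the results above, the two threshold entries in the last block (`best_exact_thresh` and `best_f1_thresh`) are zero. They only become meaningful once we supply the null score differences, which let us set a threshold so the model knows when to prefer a null answer over an actual answer.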

In other words, we should predict a null answer for a given example if that example's score difference is above a certain threshold. What should that threshold be? How should we compute it? The SQuAD 2.0 evaluation gives us a recipe: select the threshold that maximizes F1.

We will leverage the files created by the evaluate function above to retrieve the optimal threshold

In [50]:
# load the predictions we generated earlier
# (context managers make sure the file handles are closed again)
filename = os.path.join(BERT_OUTPUT_DIR, 'predictions.json')
with open(filename, 'rb') as predictions_file:
    preds = json.load(predictions_file)

# load the null score differences we generated earlier
filename = os.path.join(BERT_OUTPUT_DIR, 'null_odds.json')
with open(filename, 'rb') as null_odds_file:
    null_odds = json.load(null_odds_file)

In [51]:
# re-score the saved predictions, this time supplying the null score differences;
# the threshold stays at 1.0 (the default) for now
results_default_thresh = squad_evaluate(
    examples,
    preds,
    no_answer_probs=null_odds,
    no_answer_probability_threshold=1.0,
)

pprint(results_default_thresh)

OrderedDict([('exact', 72.3658721468879),
             ('f1', 75.83107045305708),
             ('total', 11873),
             ('HasAns_exact', 72.80701754385964),
             ('HasAns_f1', 79.74735146578041),
             ('HasAns_total', 5928),
             ('NoAns_exact', 71.9259882253995),
             ('NoAns_f1', 71.9259882253995),
             ('NoAns_total', 5945),
             ('best_exact', 73.66293270445549),
             ('best_exact_thresh', -3.7661960124969482),
             ('best_f1', 76.85604013447075),
             ('best_f1_thresh', -3.415079116821289)])


The first three blocks have identical values as in our initial evaluation because they are based on the default threshold (which is currently 1.0). However, the values in the fourth block have been updated by taking into account the null_odds information. When a given example's score_diff is greater than the threshold, the prediction is flipped to a null answer which affects the overall EM and F1 scores.


In [52]:
# Read the F1-maximising threshold from the previous evaluation instead of copying
# it by hand from the printed output, so it stays correct if the model or data change.
best_f1_thresh = results_default_thresh['best_f1_thresh']
results_f1_thresh = squad_evaluate(examples,
                                   preds,
                                   no_answer_probs=null_odds,
                                   no_answer_probability_threshold=best_f1_thresh)

pprint(results_f1_thresh)

OrderedDict([('exact', 73.62924281984334),
             ('f1', 76.85604013447113),
             ('total', 11873),
             ('HasAns_exact', 69.33198380566802),
             ('HasAns_f1', 75.79483207094731),
             ('HasAns_total', 5928),
             ('NoAns_exact', 77.91421362489487),
             ('NoAns_f1', 77.91421362489487),
             ('NoAns_total', 5945),
             ('best_exact', 73.66293270445549),
             ('best_exact_thresh', -3.7661960124969482),
             ('best_f1', 76.85604013447075),
             ('best_f1_thresh', -3.415079116821289)])


We can see that the metrics for NoAns have increased significantly (from about 71.9 to 77.9). The downside is that we lose some ground in how well our model predicts HasAns examples. Overall, however, we see a net increase of roughly one point in both EM and F1 scores. This demonstrates that computing null scores and properly using a null threshold improves QA performance on the SQuAD2.0 dev set with almost no additional work.

## Putting it all together

In [57]:
def get_qa_inputs(example, tokenizer):
    """Encode an example's question/context pair for the model.

    Args:
        example: object exposing ``question_text`` and ``context_text``.
        tokenizer: tokenizer providing ``encode_plus``.

    Returns:
        The result of ``tokenizer.encode_plus`` for the (question, context)
        pair, requested with ``return_tensors='pt'``.
    """
    return tokenizer.encode_plus(
        example.question_text,
        example.context_text,
        return_tensors='pt',
    )

def get_clean_text(tokens, tokenizer):
    """Convert token ids back into a whitespace-normalised string.

    Args:
        tokens: token ids to decode.
        tokenizer: tokenizer providing ``convert_ids_to_tokens`` and
            ``convert_tokens_to_string``.

    Returns:
        The decoded text without leading/trailing whitespace and with every
        run of whitespace collapsed into a single space.
    """
    pieces = tokenizer.convert_ids_to_tokens(tokens)
    raw_text = tokenizer.convert_tokens_to_string(pieces)
    # split() without arguments drops surrounding whitespace and collapses inner runs
    return " ".join(raw_text.split())

def prediction_probabilities(predictions):
    """Turn prediction scores into probabilities with a softmax.

    Args:
        predictions: sequence of objects exposing ``start_logit`` and
            ``end_logit``; each prediction's score is the sum of the two.

    Returns:
        numpy array with one probability per prediction, in input order.
    """
    scores = np.array([p.start_logit + p.end_logit for p in predictions])
    # subtracting the maximum before exponentiating leaves the result unchanged
    # mathematically but keeps the exponentials from overflowing
    exp_scores = np.exp(scores - np.max(scores))
    return exp_scores / exp_scores.sum()

In [58]:
def preliminary_predictions(start_logits, end_logits, input_ids, nbest, sep_token_id=102):
    """Enumerate candidate answer spans from the best start and end positions.

    Args:
        start_logits: start logits from the model; converted with ``to_list``
            and indexed at ``[0]`` (assumes a batch holding a single
            sequence -- TODO confirm).
        end_logits: end logits from the model, handled the same way.
        input_ids: token ids of the encoded input, handled the same way.
        nbest: number of highest-scoring start positions and end positions
            that are combined into candidate spans.
        sep_token_id: id of the separator token that closes the question.
            Defaults to 102, the value that used to be hard-coded here.

    Returns:
        List of ``PrelimPrediction`` namedtuples (``start_index``,
        ``end_index``, ``start_logit``, ``end_logit``) sorted by
        ``start_logit + end_logit`` in descending order. Spans that start or
        end inside the question, or that end before they start, are left out.

    Raises:
        ValueError: if ``sep_token_id`` does not occur in the token ids.
    """
    # convert tensors to lists
    start_logits = to_list(start_logits)[0]
    end_logits = to_list(end_logits)[0]
    tokens = to_list(input_ids)[0]

    # sort our start and end logits from largest to smallest, keeping track of the index
    start_idx_and_logit = sorted(enumerate(start_logits), key=lambda x: x[1], reverse=True)
    end_idx_and_logit = sorted(enumerate(end_logits), key=lambda x: x[1], reverse=True)

    start_indexes = [idx for idx, _ in start_idx_and_logit[:nbest]]
    end_indexes = [idx for idx, _ in end_idx_and_logit[:nbest]]

    # question tokens sit between position 0 and the first separator token,
    # i.e. at positions 1 .. first_sep - 1
    first_sep = tokens.index(sep_token_id)

    def in_question(index):
        return 1 <= index < first_sep

    # keep track of all preliminary predictions
    PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
        "PrelimPrediction", ["start_index", "end_index", "start_logit", "end_logit"]
    )
    prelim_preds = []
    for start_index in start_indexes:
        # a start inside the question invalidates every span using it
        if in_question(start_index):
            continue
        for end_index in end_indexes:
            # throw out invalid predictions
            if in_question(end_index) or end_index < start_index:
                continue
            prelim_preds.append(
                PrelimPrediction(
                    start_index=start_index,
                    end_index=end_index,
                    start_logit=start_logits[start_index],
                    end_logit=end_logits[end_index],
                )
            )
    # sort prelim_preds in descending score order
    prelim_preds.sort(key=lambda x: (x.start_logit + x.end_logit), reverse=True)
    return prelim_preds

In [59]:
def best_predictions(prelim_preds, nbest, tokenizer):
    # keep track of all best predictions

    # This will be the pool from which answer probabilities are computed 
    BestPrediction = collections.namedtuple(
        "BestPrediction", ["text", "start_logit", "end_logit"]
    )
    nbest_predictions = []
    seen_predictions = []
    for pred in prelim_preds:
        if len(nbest_predictions) >= nbest: 
            break
        if pred.start_index > 0: # non-null answers have start_index > 0

            toks = tokens[pred.start_index : pred.end_index+1]
            text = get_clean_text(toks, tokenizer)

            # if this text has been seen already - skip it
            if text in seen_predictions:
                continue

            # flag text as being seen
            seen_predictions.append(text) 

            # add this text to a pruned list of the top nbest predictions
            nbest_predictions.append(
                BestPrediction(
                    text=text, 
                    start_logit=pred.start_logit,
                    end_logit=pred.end_logit
                    )
                )
        
    # Add the null prediction
    nbest_predictions.append(
        BestPrediction(
            text="", 
            start_logit=start_logits[0], 
            end_logit=end_logits[0]
            )
        )
    return nbest_predictions

In [60]:
def compute_score_difference(predictions):
    """ Assumes that the null answer is always the last prediction """
    score_null = predictions[-1].start_logit + predictions[-1].end_logit
    score_non_null = predictions[0].start_logit + predictions[0].end_logit
    return score_null - score_non_null

In [61]:
def get_robust_prediction(example, tokenizer, nbest=10, null_threshold=1.0):
    
    inputs = get_qa_inputs(example, tokenizer)
    output = model(**inputs)

    # get sensible preliminary predictions, sorted by score
    prelim_preds = preliminary_predictions(output.start_logits, 
                                           output.end_logits, 
                                           inputs['input_ids'],
                                           nbest)
    
    # narrow that down to the top nbest predictions
    nbest_preds = best_predictions(prelim_preds, nbest, tokenizer)

    # compute the probability of each prediction - nice but not necessary
    probabilities = prediction_probabilities(nbest_preds)
        
    # compute score difference
    score_difference = compute_score_difference(nbest_preds)

    # if score difference > threshold, return the null answer
    if score_difference > null_threshold:
        return "", probabilities[-1]
    else:
        return nbest_preds[0].text, probabilities[0]

We are now able to detect 'No Answer'.

In [62]:
# show the question, then predict with the F1-maximising null threshold found above
print(example.question_text)
get_robust_prediction(example, tokenizer, nbest=10, null_threshold=best_f1_thresh)

What measurement do scientists used to determine the quality of water?


('', 0.038883563779622446)

# Using A/B Testing to make model deployment decisions

Now we have all the tools to make decisions on whether a model is better than the other. 

### Home assistant device

We have been tasked to evaluate whether a new model recently trained is better than our current model in production

While talking to our business we are able to enumerate our priorities in order of importance:

1) Our users should receive their answer quickly; the threshold for an isolated model prediction is 0.01 seconds
2) It's important not to surface answers when questions are either malformed or don't have enough context to be answered
3) Accuracy is important to the extent that users care about factuality, without too much concern for answers matching the exact words of the ground truth

We are looking for the f1 score of "has answer" and "no answer" separately, so we can weight them according to the requirements

Also, we want to measure latency as: time to predict 10000 questions from our user

Given this scenario we write the following objective function:

In [63]:
def home_assistant_score(has_answer_f1, no_answer_f1, prediction_time,
                         has_answer_weight=0.2, no_answer_weight=0.3,
                         latency_weight=0.5, n_questions=10000):
    """Combine answer quality and latency into a single score (higher is better).

    Args:
        has_answer_f1: F1 score on questions that have an answer.
        no_answer_f1: F1 score on questions without an answer.
        prediction_time: time in seconds for a single prediction.
        has_answer_weight: weight applied to ``has_answer_f1``.
        no_answer_weight: weight applied to ``no_answer_f1``.
        latency_weight: weight applied to the latency penalty.
        n_questions: number of questions the latency is extrapolated to.

    Returns:
        The weighted F1 sum minus the weighted time needed to predict
        ``n_questions`` questions.
    """
    weighted_quality = has_answer_weight * has_answer_f1 + no_answer_weight * no_answer_f1
    latency_penalty = latency_weight * (n_questions * prediction_time)
    return weighted_quality - latency_penalty

We assign a very high weight (0.5) to our prediction time and divide the remainder between the two F1 scores, giving slightly higher importance to the "No Answer" score.

Let's now evaluate our BERT model using the objective function above.

In [64]:
# evaluate the BERT model again to obtain the metrics and timing used for scoring
result = evaluate(
    model_name_or_path=BERT_MODEL_HF_PATH,
    dataset=dataset,
    output_dir=BERT_OUTPUT_DIR,
)

Evaluating: 100%|██████████| 1020/1020 [01:17<00:00, 13.09it/s]


Evaluation done in total 77.94153097498929 secs (0.006371936803056678 sec per example)


In [65]:
# display the BERT metrics, including eval_time and prediction_time
result

OrderedDict([('exact', 72.3658721468879),
             ('f1', 75.83107045305708),
             ('total', 11873),
             ('HasAns_exact', 72.80701754385964),
             ('HasAns_f1', 79.74735146578041),
             ('HasAns_total', 5928),
             ('NoAns_exact', 71.9259882253995),
             ('NoAns_f1', 71.9259882253995),
             ('NoAns_total', 5945),
             ('best_exact', 72.3658721468879),
             ('best_exact_thresh', 0.0),
             ('best_f1', 75.8310704530571),
             ('best_f1_thresh', 0.0),
             ('eval_time', 77.94153097498929),
             ('prediction_time', 0.006371936803056678)])

In [66]:
# score BERT with the objective function: HasAns F1, NoAns F1 and time per prediction
print(f"Model {BERT_MODEL_TYPE} score is {home_assistant_score(result['HasAns_f1'], result['NoAns_f1'], result['prediction_time'])}") 

Model bert score is 5.667582745492538


Ok, now we have a number that quantifies our business requirements.

One of our data scientists has found a 'faster' version of BERT, called 'distilled BERT' (DistilBERT), that has **40%** fewer parameters and is **60%** faster while preserving 95% of BERT's performance.

Let's now evaluate this model using our pipeline:

In [67]:
# same evaluation for DistilBERT; model_type is passed so evaluate() drops token_type_ids
result_distilled = evaluate(
    model_name_or_path=DISTILBERT_MODEL_HF_PATH,
    dataset=dataset,
    output_dir=DISTILBERT_OUTPUT_DIR,
    model_type=DISTILBERT_MODEL_TYPE,
)

Downloading (…)okenizer_config.json:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/478 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/265M [00:00<?, ?B/s]

Evaluating: 100%|██████████| 1020/1020 [00:39<00:00, 25.59it/s]


Evaluation done in total 39.86529604694806 secs (0.0032590987611958847 sec per example)


In [68]:
# display the DistilBERT metrics, including eval_time and prediction_time
result_distilled

OrderedDict([('exact', 66.25958056093658),
             ('f1', 69.66994428499025),
             ('total', 11873),
             ('HasAns_exact', 68.91025641025641),
             ('HasAns_f1', 75.74076391627662),
             ('HasAns_total', 5928),
             ('NoAns_exact', 63.61648444070648),
             ('NoAns_f1', 63.61648444070648),
             ('NoAns_total', 5945),
             ('best_exact', 66.25958056093658),
             ('best_exact_thresh', 0.0),
             ('best_f1', 69.66994428499046),
             ('best_f1_thresh', 0.0),
             ('eval_time', 39.86529604694806),
             ('prediction_time', 0.0032590987611958847)])

In [69]:
# score DistilBERT with the same objective function for a like-for-like comparison
print(f"Model {DISTILBERT_MODEL_TYPE} score is {home_assistant_score(result_distilled['HasAns_f1'], result_distilled['NoAns_f1'], result_distilled['prediction_time'])}") 

Model distilbert score is 17.93760430948785


The distilled version of BERT delivers the promised speed-up: it produces predictions in about half the time of the original BERT (roughly 0.0033 vs. 0.0064 seconds per prediction). This comes with a moderate impact on accuracy: overall F1 drops from about 75.8 to 69.7.

The recommendation in this case is to replace the **BERT** model with **DistilBERT**, as it improves our business metric by roughly **3x** (a score of about 17.9 vs. 5.7).