In [1]:
import logging
import tqdm
from transformers import (
    AutoTokenizer,
    AutoModelForQuestionAnswering,
    AutoConfig,
    RobertaForQuestionAnswering,
    squad_convert_examples_to_features
)

from transformers.data.processors.squad import SquadV2Processor

# Create logger.
# Python's logging defaults to the WARNING threshold, which would silently drop
# the logger.info(...) progress messages emitted during evaluation. Install a
# root handler (no-op if one already exists) and enable INFO for this logger only,
# so third-party libraries keep their own verbosity.
logging.basicConfig(format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

In [2]:
from transformers.data.metrics.squad_metrics import (
    compute_predictions_log_probs,
    compute_predictions_logits,
    squad_evaluate,
)
from transformers.data.processors.squad import SquadResult
import timeit

def evaluate(output_dir, model, tokenizer, device, datasets, prefix="",
             batch_size=4, model_type='roberta', do_lower_case=True):
    """Run a question-answering model over an evaluation set and score it.

    Args:
        output_dir: Directory that receives the prediction, n-best and
            null-odds JSON files. It is created if it does not exist.
        model: Model invoked as ``model(**inputs)``; its return value must
            expose ``to_tuple()``.
        tokenizer: Tokenizer forwarded to ``compute_predictions_logits``.
        device: Device every batch tensor is moved to before the forward pass.
        datasets: ``(dataset, examples, features)`` triple, e.g. the value
            returned by ``load_examples(..., output_examples=True)``.
        prefix: Label used in the log header and in the output file names.
        batch_size: Number of features per forward pass.
        model_type: Architecture name; for the types listed in the loop below
            ``token_type_ids`` is not passed to the model.
        do_lower_case: Forwarded to ``compute_predictions_logits``.
            NOTE(review): the default is kept for backward compatibility, but
            roberta-base has a cased vocabulary -- confirm whether ``False``
            is the appropriate value for this model.

    Returns:
        ``(results, examples, predictions)`` where ``results`` is the value
        returned by ``squad_evaluate`` and ``predictions`` the value returned
        by ``compute_predictions_logits``.
    """
    dataset, examples, features = datasets

    # The prediction files are written into output_dir further down; make sure
    # it exists so a long evaluation does not fail at the very last step.
    os.makedirs(output_dir, exist_ok=True)

    # Iterate the dataset in its stored order
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=batch_size)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", batch_size)

    all_results = []

    # Switching to eval mode is loop-invariant, so do it once up front
    model.eval()
    start_time = timeit.default_timer()

    for batch in tqdm.tqdm(eval_dataloader, desc="Evaluating"):
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }

            if model_type in ["xlm", "roberta", "distilbert", "camembert", "bart", "longformer"]:
                del inputs["token_type_ids"]

            feature_indices = batch[3]

            # Get the predicted outputs
            outputs = model(**inputs)

        # Unpack the model output once per batch rather than once per example
        output_tensors = outputs.to_tuple()

        for i, feature_index in enumerate(feature_indices):
            eval_feature = features[feature_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(tensor[i]) for tensor in output_tensors]

            # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler"
            # models only use two.
            if len(output) >= 5:
                start_logits = output[0]
                start_top_index = output[1]
                end_logits = output[2]
                end_top_index = output[3]
                cls_logits = output[4]

                result = SquadResult(
                    unique_id,
                    start_logits,
                    end_logits,
                    start_top_index=start_top_index,
                    end_top_index=end_top_index,
                    cls_logits=cls_logits,
                )

            else:
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    # max(..., 1) keeps the per-example timing from dividing by zero on an empty dataset
    logger.info(
        "  Evaluation done in total %f secs (%f sec per example)",
        evalTime,
        evalTime / max(len(dataset), 1),
    )

    # Compute predictions
    output_prediction_file = os.path.join(output_dir, "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(output_dir, "nbest_predictions_{}.json".format(prefix))
    output_null_log_odds_file = os.path.join(output_dir, "null_odds_{}.json".format(prefix))

    # TODO: Get default inputs for this function
    predictions = compute_predictions_logits(
        examples,
        features,
        all_results,
        n_best_size=20,
        max_answer_length=30,
        do_lower_case=do_lower_case,
        output_prediction_file=output_prediction_file,
        output_nbest_file=output_nbest_file,
        output_null_log_odds_file=output_null_log_odds_file,
        verbose_logging=False,
        version_2_with_negative=True,
        null_score_diff_threshold=0.0,
        tokenizer=tokenizer,
    )

    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions)
    return results, examples, predictions

def load_examples(data_dir, data_file, tokenizer, evaluate=False, output_examples=False,
                  max_seq_length=384, doc_stride=128, max_query_length=64, threads=1):
    """Read a SQuAD v2 JSON file and convert it into model-ready features.

    Args:
        data_dir: Directory containing ``data_file``.
        data_file: File name of the SQuAD JSON file inside ``data_dir``.
        tokenizer: Tokenizer passed to ``squad_convert_examples_to_features``.
        evaluate: When True the file is read with ``get_dev_examples`` and
            features are built with ``is_training=False``; otherwise
            ``get_train_examples`` and ``is_training=True`` are used.
            (Inside this function the name shadows the module-level
            ``evaluate`` function; it is kept because callers pass it by
            keyword.)
        output_examples: When True, also return the examples and features.
        max_seq_length: Forwarded to ``squad_convert_examples_to_features``.
        doc_stride: Forwarded to ``squad_convert_examples_to_features``.
        max_query_length: Forwarded to ``squad_convert_examples_to_features``.
        threads: Forwarded to ``squad_convert_examples_to_features``.

    Returns:
        ``dataset`` alone, or ``(dataset, examples, features)`` when
        ``output_examples`` is True.
    """
    processor = SquadV2Processor()
    if evaluate:
        examples = processor.get_dev_examples(data_dir, filename=data_file)
    else:
        examples = processor.get_train_examples(data_dir, filename=data_file)

    features, dataset = squad_convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        doc_stride=doc_stride,
        max_query_length=max_query_length,
        is_training=not evaluate,
        return_dataset="pt",
        threads=threads,
    )

    if output_examples:
        return dataset, examples, features

    return dataset

In [3]:
# Location of the SQuAD v2.0 dev file on disk
data_dir = 'squad_data'
validation_data_file = 'dev-v2.0.json'

# roberta-base tokenizer, explicitly requesting the non-fast implementation
tokenizer = AutoTokenizer.from_pretrained('roberta-base', use_fast=False, do_lower_case=True)

# Build the validation set once; the raw examples and features are kept
# alongside the tensor dataset because evaluation needs all three.
validation_dataset, validation_examples, validation_features = load_examples(
    data_dir,
    validation_data_file,
    tokenizer,
    evaluate=True,
    output_examples=True,
)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 35/35 [00:03<00:00, 11.15it/s]
convert squad examples to features: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11873/11873 [00:52<00:00, 224.48it/s]
add example index and unique id: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11873/11873 [00:00<00:00, 520789.90it/s]


In [4]:
def to_list(tensor):
    """Return the tensor's values as plain (possibly nested) Python lists.

    The tensor is detached and copied to the CPU before the conversion.
    """
    host_copy = tensor.detach().cpu()
    return host_copy.tolist()

In [5]:
import torch
import timeit
import os
from torch.utils.data import SequentialSampler
from torch.utils.data import DataLoader, Dataset

# Results list
results_list = []

# Use the first GPU when one is visible, otherwise fall back to the CPU so the
# cell still runs (slowly) on a machine without CUDA.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# The config, tokenizer and validation tuple do not depend on the checkpoint,
# so build them once instead of on every iteration.
config = AutoConfig.from_pretrained('roberta-base')
tokenizer = AutoTokenizer.from_pretrained('roberta-base', do_lower_case=True, use_fast=False)
validation_datasets = (validation_dataset, validation_examples, validation_features)

# Iterate over the model checkpoints and look at the results
for epoch in range(3):
    # Build the architecture, then overwrite its weights with the fine-tuned checkpoint.
    # map_location lets the checkpoint load no matter which device it was saved from.
    model = AutoModelForQuestionAnswering.from_pretrained('roberta-base', config=config)
    state_dict = torch.load(
        f'./model_weights/text-mining-titans-roberta-qa-cp{epoch}.pt',
        map_location=device,
    )
    model.load_state_dict(state_dict)
    model = model.to(device)

    # Get the results
    results, examples, predictions = evaluate(
        output_dir='prediction_outputs',
        model=model,
        tokenizer=tokenizer,
        device=device,
        datasets=validation_datasets,
        prefix=f'checkpoint-{epoch}'
    )
    results_list.append(results)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForQuestionAnswering: ['lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use 

In [6]:
import pandas as pd

# One row per evaluated checkpoint, in the order the results were appended
all_results = pd.DataFrame(data=results_list)
all_results

Unnamed: 0,exact,f1,total,HasAns_exact,HasAns_f1,HasAns_total,NoAns_exact,NoAns_f1,NoAns_total,best_exact,best_exact_thresh,best_f1,best_f1_thresh
0,78.15211,81.149774,11873,70.175439,76.179363,5928,86.105971,86.105971,5945,78.15211,0.0,81.149774,0.0
1,78.716415,82.041429,11873,74.156545,80.816107,5928,83.263246,83.263246,5945,78.716415,0.0,82.041429,0.0
2,76.770825,80.394872,11873,77.665317,84.923805,5928,75.87889,75.87889,5945,76.770825,0.0,80.394872,0.0


In [7]:
# Re-evaluate one checkpoint to obtain its predictions for error analysis.
# A single variable drives both the weights file and the output prefix, so the
# prediction files are always labelled with the checkpoint that produced them
# and never overwrite another checkpoint's files.
# NOTE(review): set this to 1 to inspect the checkpoint with the highest overall
# F1 in the comparison table instead.
analysis_epoch = 0

config = AutoConfig.from_pretrained('roberta-base')
tokenizer = AutoTokenizer.from_pretrained('roberta-base', do_lower_case=True, use_fast=False)
model = AutoModelForQuestionAnswering.from_pretrained('roberta-base', config=config)
# map_location lets the checkpoint load no matter which device it was saved from
model.load_state_dict(
    torch.load(
        f'./model_weights/text-mining-titans-roberta-qa-cp{analysis_epoch}.pt',
        map_location=device,
    )
)
model = model.to(device)

# Validation datasets
validation_datasets = (validation_dataset, validation_examples, validation_features)

# Get the results
results, examples, predictions = evaluate(
    output_dir='prediction_outputs',
    model=model,
    tokenizer=tokenizer,
    device=device,
    datasets=validation_datasets,
    prefix=f'checkpoint-{analysis_epoch}'
)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForQuestionAnswering: ['lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use 

In [8]:
from transformers.data.metrics.squad_metrics import normalize_answer, compute_exact, compute_f1

In [9]:
# For every predicted example, keep the gold answer the prediction matches best
# together with the context needed to inspect the miss later.
results = []

for example in examples:
    qas_id = example.qas_id

    # Gold answers that normalise to an empty string are dropped; if nothing is
    # left, a single empty string stands in as the reference answer.
    gold_answers = [
        answer['text'] for answer in example.answers if normalize_answer(answer['text'])
    ] or ['']

    # Nothing to score when the model produced no prediction for this id
    if qas_id not in predictions:
        continue

    prediction = predictions[qas_id]

    # max() returns the first highest-scoring row, which is the same row a
    # stable descending sort would place first.
    exact_scores = max(
        (
            (compute_exact(a, prediction), a, prediction, example.context_text, example.question_text, qas_id)
            for a in gold_answers
        ),
        key=lambda row: row[0],
    )
    results.append(exact_scores)

In [10]:
# Peek at the context of the last evaluated example. Indexing `examples`
# directly avoids depending on a loop variable left over from another cell.
examples[-1].context_text

'The pound-force has a metric counterpart, less commonly used than the newton: the kilogram-force (kgf) (sometimes kilopond), is the force exerted by standard gravity on one kilogram of mass. The kilogram-force leads to an alternate, but rarely used unit of mass: the metric slug (sometimes mug or hyl) is that mass that accelerates at 1 m·s−2 when subjected to a force of 1 kgf. The kilogram-force is not a part of the modern SI system, and is generally deprecated; however it still sees use for some purposes as expressing aircraft weight, jet thrust, bicycle spoke tension, torque wrench settings and engine output torque. Other arcane units of force include the sthène, which is equivalent to 1000 N, and the kip, which is equivalent to 1000 lbf.'

In [11]:
import pandas as pd

df = pd.DataFrame(results, columns=['exact_score', 'answer', 'prediction', 'context', 'question', 'id'])
# Keep only the misses (exact_score == 0) and shuffle them. The fixed
# random_state makes the displayed sample identical on every re-run.
df = df.loc[df['exact_score'] == 0].sample(frac=1.0, random_state=42).reset_index(drop=True)
df.head(10)

Unnamed: 0,exact_score,answer,prediction,context,question,id
0,0,the AKS primality test,"Miller–Rabin primality test,",The property of being prime (or not) is called...,What is the name of another algorithm useful f...,57296f293f37b319004783a6
1,0,histocompatibility,,"In the mid-1950s, Frank Burnet, inspired by a ...","What is the complex ""two-signal"" activation of...",572a02483f37b3190047864d
2,0,,steam turbines,Reciprocating piston type steam engines remain...,What type of engines became popular for power ...,5ad3c061604f3c001a3fef6f
3,0,"cancer, hepatitis, and rheumatoid arthritis",,Specialty pharmacies supply high cost injectab...,What types of diseases are specialty drugs oft...,5726f36cdd62a815002e9600
4,0,do not have voting rights,,"Commissioners have various privileges, such as...",Can the President of the Council vote on impor...,57264e455951b619008f6f69
5,0,,quantum mechanics.,"However, attempting to reconcile electromagnet...",What theory led to quantum electromagnetics?,5ad28237d7d075001a429821
6,0,,Uptake of O 2 from the air is the essential pu...,Uptake of O\n2 from the air is the essential p...,What is the essential purpose of supplementati...,5ad25e70d7d075001a428f1c
7,0,,fern,Several commemorative events take place every ...,What type of flower is sought on Wianki?,5ad500e95b96ef001a10a916
8,0,"can produce both eggs and sperm, meaning it ca...",hermaphrodites—a single animal can produce bot...,Most species are hermaphrodites—a single anima...,What is unique about a hermaphrodite?,5725c57a89a1e219009abe5e
9,0,carbon dioxide,,Oxygen is present in the atmosphere in trace q...,In what compound is oxygen found in small amou...,571ce7f25efbb31900334e3e
