In [1]:
import logging
import os
import timeit

import torch
import tqdm
from torch.utils.data import DataLoader, SequentialSampler
from transformers import (
    AutoTokenizer,
    AutoModelForQuestionAnswering,
    AutoConfig,
    RobertaForQuestionAnswering,
    squad_convert_examples_to_features
)
from transformers.data.processors.squad import SquadV2Processor

# Module-level logger named after the current module (``__name__``).
# NOTE(review): no handler or level is configured in the visible cells, so
# ``logger.info(...)`` calls are presumably dropped under Python's default
# WARNING threshold unless logging is configured elsewhere — confirm.
logger = logging.getLogger(__name__)

In [2]:
from transformers.data.metrics.squad_metrics import (
    compute_predictions_log_probs,
    compute_predictions_logits,
    squad_evaluate,
)
from transformers.data.processors.squad import SquadResult
import timeit

def evaluate(output_dir, model, tokenizer, device, datasets, prefix="", batch_size=4, model_type="roberta"):
    """Run a question-answering model over an evaluation set and score it.

    Args:
        output_dir: Directory that receives the prediction JSON files. It is
            created when it does not exist yet.
        model: Model invoked as ``model(**inputs)``; its output must expose
            ``to_tuple()``. It is switched to eval mode here.
        tokenizer: Tokenizer forwarded to ``compute_predictions_logits``.
        device: Device every batch tensor is moved to before the forward pass.
            The model is not moved here (presumably the caller already placed
            it on the same device).
        datasets: ``(dataset, examples, features)`` triple; ``features`` is
            indexed with the feature indices stored at position 3 of a batch.
        prefix: Label used in the log header and in the output file names.
        batch_size: Number of features per forward pass.
        model_type: Architecture name; for the types listed below
            ``token_type_ids`` is not passed to the model.

    Returns:
        Tuple ``(results, examples, predictions)`` where ``results`` is the
        value returned by ``squad_evaluate`` and ``predictions`` the value
        returned by ``compute_predictions_logits``.
    """
    dataset, examples, features = datasets

    # The prediction files are written below; make sure their directory exists.
    # An empty output_dir means "current directory" and needs no creation.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Sequential sampling keeps the evaluation order deterministic.
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=batch_size)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", batch_size)

    # These model types are called without token_type_ids.
    uses_token_type_ids = model_type not in ["xlm", "roberta", "distilbert", "camembert", "bart", "longformer"]

    all_results = []
    start_time = timeit.default_timer()

    # Switching to eval mode once is enough; it does not change between batches.
    model.eval()

    for batch in tqdm.tqdm(eval_dataloader, desc="Evaluating"):
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
            }
            if uses_token_type_ids:
                inputs["token_type_ids"] = batch[2]

            feature_indices = batch[3]

            # Get the predicted outputs
            outputs = model(**inputs)

        output_tensors = outputs.to_tuple()

        for i, feature_index in enumerate(feature_indices):
            eval_feature = features[feature_index.item()]
            unique_id = int(eval_feature.unique_id)

            # Slice out row i of every output tensor as plain Python lists.
            output = [to_list(tensor[i]) for tensor in output_tensors]

            # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler"
            # models only use two.
            if len(output) >= 5:
                start_logits = output[0]
                start_top_index = output[1]
                end_logits = output[2]
                end_top_index = output[3]
                cls_logits = output[4]

                result = SquadResult(
                    unique_id,
                    start_logits,
                    end_logits,
                    start_top_index=start_top_index,
                    end_top_index=end_top_index,
                    cls_logits=cls_logits,
                )

            else:
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    # max(..., 1) avoids a ZeroDivisionError when the dataset is empty.
    logger.info(
        "  Evaluation done in total %f secs (%f sec per example)",
        evalTime,
        evalTime / max(len(dataset), 1),
    )

    # Compute predictions
    output_prediction_file = os.path.join(output_dir, "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(output_dir, "nbest_predictions_{}.json".format(prefix))
    output_null_log_odds_file = os.path.join(output_dir, "null_odds_{}.json".format(prefix))

    # TODO: Get default inputs for this function
    predictions = compute_predictions_logits(
        examples,
        features,
        all_results,
        n_best_size=20,
        max_answer_length=30,
        do_lower_case=True,
        output_prediction_file=output_prediction_file,
        output_nbest_file=output_nbest_file,
        output_null_log_odds_file=output_null_log_odds_file,
        verbose_logging=False,
        version_2_with_negative=True,
        null_score_diff_threshold=0.0,
        tokenizer=tokenizer,
    )

    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions)
    return results, examples, predictions

def load_examples(data_dir, data_file, tokenizer, evaluate=False, output_examples=False):
    """Read a SQuAD v2 file and convert its examples into model features.

    Args:
        data_dir: Directory passed to the processor's reader.
        data_file: File name inside ``data_dir``.
        tokenizer: Tokenizer handed to ``squad_convert_examples_to_features``.
        evaluate: When true the file is read with ``get_dev_examples`` and the
            features are built with ``is_training=False``; otherwise it is
            read with ``get_train_examples`` and ``is_training=True``.
        output_examples: When true the examples and features are returned
            along with the dataset.

    Returns:
        ``dataset`` alone, or ``(dataset, examples, features)`` when
        ``output_examples`` is true.
    """
    squad_processor = SquadV2Processor()

    # Choose the reader that matches the requested split.
    read_examples = squad_processor.get_dev_examples if evaluate else squad_processor.get_train_examples
    examples = read_examples(data_dir, filename=data_file)

    conversion_options = {
        "examples": examples,
        "tokenizer": tokenizer,
        "max_seq_length": 384,
        "doc_stride": 128,
        "max_query_length": 64,
        "is_training": not evaluate,
        "return_dataset": "pt",
        "threads": 1,
    }
    features, dataset = squad_convert_examples_to_features(**conversion_options)

    return (dataset, examples, features) if output_examples else dataset

In [3]:
# Location of the validation data on disk
data_dir = 'squad_data'
validation_data_file = 'dev-v2.0.json'

# roberta-base tokenizer, requested with use_fast=False
tokenizer = AutoTokenizer.from_pretrained('roberta-base', do_lower_case=True, use_fast=False)

# Build the validation dataset together with its examples and features
validation_dataset, validation_examples, validation_features = load_examples(
    data_dir, validation_data_file, tokenizer, evaluate=True, output_examples=True
)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 35/35 [00:03<00:00, 11.08it/s]
convert squad examples to features: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11873/11873 [00:53<00:00, 223.19it/s]
add example index and unique id: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11873/11873 [00:00<00:00, 533162.44it/s]


In [4]:
def to_list(tensor):
    """Return the contents of ``tensor`` as (nested) plain Python values.

    The tensor is detached from the autograd graph and copied to the CPU
    before the conversion.
    """
    host_tensor = tensor.detach().cpu()
    return host_tensor.tolist()

In [5]:
import torch
import timeit
import os
from torch.utils.data import SequentialSampler
from torch.utils.data import DataLoader, Dataset

# Results list: one entry per evaluated checkpoint, in loop order
results_list = []

# Use the first GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# The config, tokenizer and validation data are identical for every
# checkpoint, so build them once instead of on each iteration.
config = AutoConfig.from_pretrained('roberta-base')
tokenizer = AutoTokenizer.from_pretrained('roberta-base', do_lower_case=True, use_fast=False)
validation_datasets = (validation_dataset, validation_examples, validation_features)

# Iterate over the model checkpoints and look at the results
for epoch in range(3):
    # Build the model, then overwrite its weights with the saved checkpoint.
    # map_location lets the checkpoint load even when it was saved on a
    # device that is not available here.
    # NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
    model = AutoModelForQuestionAnswering.from_pretrained('roberta-base', config=config)
    model.load_state_dict(
        torch.load(f'./model_weights/text-mining-titans-roberta-qa-cp{epoch}.pt', map_location=device)
    )
    model = model.to(device)

    # Get the results
    results, examples, predictions = evaluate(
        output_dir='prediction_outputs',
        model=model,
        tokenizer=tokenizer,
        device=device,
        datasets=validation_datasets,
        prefix=f'checkpoint-{epoch}'
    )
    results_list.append(results)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForQuestionAnswering: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at roberta-base and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use 

In [10]:
import pandas as pd

# Collect the per-checkpoint evaluation results into one table: one row per
# entry of results_list, in the order the checkpoints were evaluated.
# Presumably each entry is a dict of metric name -> value — confirm.
all_results = pd.DataFrame(results_list)

In [11]:
# Write the results table to all_results.csv, without the row index column.
all_results.to_csv('all_results.csv', index=False)