# Modelo de preguntas y respuestas sobre COVID19 trabajando con noticias en español

## Imports

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Data preprocessing

In [None]:
import pandas as pd
import json

def json_train_dataset_to_csv():
    json_file = open('/content/drive/MyDrive/Semestre Impar 2022/Modulos PLN/MT/dataset_covid_qa_train_dev/dataset_covid_qa_train.json')
    json_dataset = json.load(json_file)
    train_csv_path = "/content/dataset_covid_qa_train.csv"
    dataset = [["id", "context", "question", "answer_text", "answer_start", "language"]]
    language = "spanish"
    for elem in json_dataset['data']:
      for elem_info in elem['paragraphs']:
        context = elem_info['context']
        for qas in elem_info['qas']:
          question = qas['question']
          id = int(qas['id'])
          for ans in qas['answers']:
            answer_text = ans['text']
            answer_start = int(ans['answer_start'])
            dataset.append([id, context, question, answer_text, answer_start, language])
    dataset_df = pd.DataFrame(dataset)
    dataset_df.to_csv('dataset_covid_qa_train.csv', index=False, header=False)

    return train_csv_path

def json_test_dataset_to_csv():
    id_start = 1000
    json_file = open('/content/drive/MyDrive/Semestre Impar 2022/Modulos PLN/MT/dataset_covid_qa_train_dev/dataset_covid_qa_dev.json')
    json_dataset = json.load(json_file)
    test_csv_path = "/content/dataset_covid_qa_test.csv"
    dataset = [["id", "context", "question", "language"]]
    language = "spanish"
    for elem in json_dataset['data']:
      for elem_info in elem['paragraphs']:
        context = elem_info['context']
        for qas in elem_info['qas']:
          question = qas['question']
          id = int(qas['id']) + id_start
          dataset.append([id, context, question, language])
    dataset_df = pd.DataFrame(dataset)
    dataset_df.to_csv('dataset_covid_qa_test.csv', index=False, header=False)

    return test_csv_path

In [None]:
train_path = json_train_dataset_to_csv()
test_path = json_test_dataset_to_csv()

## roBERTa Model

Imports and constants

In [None]:
pip install --upgrade pip

[0m

In [None]:
!pip install datasets
!pip uninstall fsspec -qq -y
!pip install --no-index --find-links ../input/hf-datasets/wheels datasets -qq
!pip install fsspec
!pip install sentencepiece
!pip install transformers

[0m[31mERROR: Could not find a version that satisfies the requirement fsspec[http]>=2021.05.0 (from datasets) (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for fsspec[http]>=2021.05.0[0m[31m
[0mCollecting fsspec
  Using cached fsspec-2022.3.0-py3-none-any.whl (136 kB)
Installing collected packages: fsspec
Successfully installed fsspec-2022.3.0
[0m

In [None]:
import numpy as np
import pandas as pd 
from transformers import default_data_collator, Trainer
from transformers import AutoTokenizer, TrainingArguments,AutoModelForQuestionAnswering
import tensorflow as tf
from datasets import Dataset
import os

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


In [None]:
train_csv_path = "dataset_covid_qa_train_dev/dataset_covid_qa_train.csv"
test_csv_path = "dataset_covid_qa_train_dev/dataset_covid_qa_test.csv"

Load dataset

In [None]:
train = pd.read_csv(train_csv_path)
train.head()

Unnamed: 0,id,context,question,answer_text,answer_start,language
0,0,Comisión de expertos del FA elaboró “plan de ...,¿Para qué se elaboró el “plan de contención ec...,mitigar el impacto de la emergencia sanitaria,288,spanish
1,1,Comisión de expertos del FA elaboró “plan de ...,¿Quiénes integran la comisión de expertos del FA?,"Danilo Astori, el senador Daniel Olesker, el e...",894,spanish
2,2,Comisión de expertos del FA elaboró “plan de ...,¿Qué medidas establece el “plan de contención ...,preservación de los puestos de trabajo,1089,spanish
3,3,Comisión de expertos del FA elaboró “plan de ...,¿Cuáles son los objetivos del “plan de contenc...,“Preservar las empresas y las y los trabajador...,613,spanish
4,4,Comisión de expertos del FA elaboró “plan de ...,¿En qué se basa el “plan de contención económi...,aumento transitorio del gasto y de la inversió...,402,spanish


In [None]:
test = pd.read_csv(test_csv_path)
test.head()

Unnamed: 0,id,context,question,language
0,1000,Las solicitudes de seguro de paro llegaron a ...,¿Cuántas solicitudes al seguro de paro se han ...,spanish
1,1001,Las solicitudes de seguro de paro llegaron a ...,¿Quién es el director del Banco de Previsión S...,spanish
2,1002,Las solicitudes de seguro de paro llegaron a ...,¿Cuántas solicitudes al seguro de desempleo co...,spanish
3,1003,Las solicitudes de seguro de paro llegaron a ...,¿Cuántas solicitudes al seguro de desempleo co...,spanish
4,1004,Fernando Pereira defendió la postura del PIT-...,¿Qué cargo ostenta Fernando Pereira?,spanish


Tokenizer

In [None]:
tokenizer = AutoTokenizer.from_pretrained("deepset/xlm-roberta-large-squad2")

Set max_length, batch_size and doc_stride

In [None]:
batch_size = 64
max_length = 128 
doc_stride = 64
pad_on_right = tokenizer.padding_side == "right"

Defining a function that will prepare training data for us.

In [None]:
def prepare_train_features(examples):
    # Some of the questions have lots of whitespace on the left, which is not useful and will make the
    # truncation of the context fail (the tokenized question will take a lots of space). So we remove that
    # left whitespace
    examples["question"] = [q.lstrip() for q in examples["question"]]

    # Tokenize our examples with truncation and padding, but keep the overflows using a stride. This results
    # in one example possible giving several features when a context is long, each of those features having a
    # context that overlaps a bit the context of the previous feature.
    tokenized_examples = tokenizer(
        examples["question" if pad_on_right else "context"],
        examples["context" if pad_on_right else "question"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Since one example might give us several features if it has a long context, we need a map from a feature to
    # its corresponding example. This key gives us just that.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # The offset mappings will give us a map from token to character position in the original context. This will
    # help us compute the start_positions and end_positions.
    offset_mapping = tokenized_examples.pop("offset_mapping")

    # Let's label those examples!
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        # We will label impossible answers with the index of the CLS token.
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)

        # One example can give several spans, this is the index of the example containing this span of text.
        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]
        # If no answers are given, set the cls_index as answer.
        if (len(answers["answer_start"]) == 0 or answers["answer_start"][0] == -1):
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            # Start/end character index of the answer in the text.
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])

            # Start token index of the current span in the text.
            token_start_index = 0
            while sequence_ids[token_start_index] != (1 if pad_on_right else 0):
                token_start_index += 1

            # End token index of the current span in the text.
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != (1 if pad_on_right else 0):
                token_end_index -= 1

            # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
                # Note: we could go after the last offset if the answer is the last word (edge case).
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                tokenized_examples["start_positions"].append(token_start_index - 1)
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_examples["end_positions"].append(token_end_index + 1)

    return tokenized_examples

Defining a function to convert answers

In [None]:
def convert_answers(r):
    start = r[0]
    text = r[1]
    return {
        'answer_start': [start],
        'text': [text]
    }

train = train.sample(frac=1, random_state=42)
train['answers'] = train[['answer_start', 'answer_text']].apply(convert_answers, axis=1)

df_train = train[:-128].reset_index(drop=True)
df_valid = train[-128:].reset_index(drop=True)

train_dataset = Dataset.from_pandas(df_train)
valid_dataset = Dataset.from_pandas(df_valid)

In [None]:
print(len(train_dataset))
print(len(valid_dataset))

872
128


In [None]:
tokenized_train_ds = train_dataset.map(prepare_train_features, batched=True, remove_columns=train_dataset.column_names)
tokenized_valid_ds = valid_dataset.map(prepare_train_features, batched=True, remove_columns=train_dataset.column_names)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

The above code will be used to preprocess the train and test data by applying the prepare_train_feature function to it.

In [None]:
%env WANDB_DISABLED=True
args = TrainingArguments(
    f"covid-qa-spanish",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=3e-5,
    warmup_ratio=0.1,
    gradient_accumulation_steps=2,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=1,
    weight_decay=0.01,
)

Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


env: WANDB_DISABLED=True


Configure the training parameters. The data_collator automatically performs padding on the model inputs in a batch to the length of the most extended example in the dataset that eliminates the need to set a maximum sequence length that is usually fixed, resulting in an accelerated performance.

In [None]:
data_collator = default_data_collator
model = AutoModelForQuestionAnswering.from_pretrained("deepset/xlm-roberta-large-squad2")
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_valid_ds,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

loading configuration file https://huggingface.co/deepset/xlm-roberta-large-squad2/resolve/main/config.json from cache at /Users/svolti/.cache/huggingface/transformers/531c1582e1ea0b7d34c7de10efd3593838f1018f8d012b8029c9283c41cba7c0.09d513aaf4fbccf6b8b4d0264d74ea7dc8d6fb056bdb099e45621b06d8c877de
Model config XLMRobertaConfig {
  "_name_or_path": "deepset/xlm-roberta-large-squad2",
  "architectures": [
    "XLMRobertaForQuestionAnswering"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "language": "english",
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "name": "XLMRoberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "

In [None]:
trainer.train()
trainer.save_model("spanish-bert-trained")

***** Running training *****
  Num examples = 16474
  Num Epochs = 1
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 2
  Total optimization steps = 129


Epoch,Training Loss,Validation Loss
1,No log,0.239357


***** Running Evaluation *****
  Num examples = 2379
  Batch size = 64
Saving model checkpoint to covid-qa-spanish/checkpoint-129
Configuration saved in covid-qa-spanish/checkpoint-129/config.json
Model weights saved in covid-qa-spanish/checkpoint-129/pytorch_model.bin
tokenizer config file saved in covid-qa-spanish/checkpoint-129/tokenizer_config.json
Special tokens file saved in covid-qa-spanish/checkpoint-129/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to spanish-bert-trained
Configuration saved in spanish-bert-trained/config.json
Model weights saved in spanish-bert-trained/pytorch_model.bin
tokenizer config file saved in spanish-bert-trained/tokenizer_config.json
Special tokens file saved in spanish-bert-trained/special_tokens_map.json


In [None]:
def prepare_validation_features(examples):
    # Some of the questions have lots of whitespace on the left, which is not useful and will make the
    # truncation of the context fail (the tokenized question will take a lots of space). So we remove that
    # left whitespace
    examples["question"] = [q.lstrip() for q in examples["question"]]

    # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
    # in one example possible giving several features when a context is long, each of those features having a
    # context that overlaps a bit the context of the previous feature.
    tokenized_examples = tokenizer(
        examples["question" if pad_on_right else "context"],
        examples["context" if pad_on_right else "question"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Since one example might give us several features if it has a long context, we need a map from a feature to
    # its corresponding example. This key gives us just that.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

    # We keep the example_id that gave us this feature and we will store the offset mappings.
    tokenized_examples["example_id"] = []

    for i in range(len(tokenized_examples["input_ids"])):
        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)
        context_index = 1 if pad_on_right else 0

        # One example can give several spans, this is the index of the example containing this span of text.
        sample_index = sample_mapping[i]
        tokenized_examples["example_id"].append(examples["id"][sample_index])

        # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token
        # position is part of the context or not.
        tokenized_examples["offset_mapping"][i] = [
            (o if sequence_ids[k] == context_index else None)
            for k, o in enumerate(tokenized_examples["offset_mapping"][i])
        ]

    return tokenized_examples

Validation

In [None]:
validation_features = valid_dataset.map(
    prepare_validation_features,
    batched=True,
    remove_columns=valid_dataset.column_names
)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
len(validation_features)

2379

In [None]:
valid_dataset

Dataset({
    features: ['id', 'context', 'question', 'answer_text', 'answer_start', 'language', 'answers'],
    num_rows: 128
})

In [None]:
valid_feats_small = validation_features.map(lambda example: example, remove_columns=['example_id', 'offset_mapping'])
valid_feats_small

  0%|          | 0/2379 [00:00<?, ?ex/s]

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 2379
})

Prediction

In [None]:
raw_predictions = trainer.predict(valid_feats_small)

***** Running Prediction *****
  Num examples = 2379
  Batch size = 64


In [None]:
raw_predictions[0]

(array([[ 4.982083 , -6.648625 , -4.4253635, ..., -6.499164 , -4.4461164,
         -8.030368 ],
        [ 5.3706155, -7.0646396, -5.346269 , ..., -6.451572 , -6.603094 ,
         -8.57295  ],
        [ 5.776438 , -6.936817 , -4.8785243, ..., -7.318831 , -7.662396 ,
         -8.733403 ],
        ...,
        [ 5.627601 , -6.8816943, -7.652035 , ..., -6.286943 , -6.7854943,
         -8.866805 ],
        [ 5.5393806, -6.8249345, -7.6049857, ..., -6.649463 , -7.0206738,
         -8.973235 ],
        [ 5.3335223, -7.0259185, -7.6817307, ..., -8.89007  , -8.890074 ,
         -8.890074 ]], dtype=float32),
 array([[  2.7615044, -10.460422 ,  -8.952849 , ...,  -8.713748 ,
          -6.0618186, -10.616649 ],
        [  3.0066135, -10.715632 ,  -9.534841 , ...,  -7.5944734,
          -7.5618215, -11.179823 ],
        [  3.3979118, -10.636369 ,  -9.291412 , ..., -10.083705 ,
          -8.918753 , -11.361818 ],
        ...,
        [  2.9182918, -10.768368 , -10.306688 , ...,  -9.966353 ,
         

In [None]:
max_answer_length = 60

The below block of code informs us about the number of features an example is split in and gives us the list of examples and their features.


In [None]:
import collections

examples = valid_dataset
features = validation_features

example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
features_per_example = collections.defaultdict(list)
for i, feature in enumerate(features):
    features_per_example[example_id_to_index[feature["example_id"]]].append(i)

## Postprocessing

In [None]:
from tqdm.auto import tqdm

def postprocess_qa_predictions(examples, features, raw_predictions, n_best_size = 20, max_answer_length = 30):
    all_start_logits, all_end_logits = raw_predictions
    # Build a map example to its corresponding features.
    example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
    features_per_example = collections.defaultdict(list)
    for i, feature in enumerate(features):
        features_per_example[example_id_to_index[feature["example_id"]]].append(i)

    # The dictionaries we have to fill.
    predictions = collections.OrderedDict()

    # Logging.
    print(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")

    # Let's loop over all the examples!
    for example_index, example in enumerate(tqdm(examples)):
        # Those are the indices of the features associated to the current example.
        feature_indices = features_per_example[example_index]

        min_null_score = None # Only used if squad_v2 is True.
        valid_answers = []
        
        context = example["context"]
        # Looping through all the features associated to the current example.
        for feature_index in feature_indices:
            # We grab the predictions of the model for this feature.
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]
            # This is what will allow us to map some the positions in our logits to span of texts in the original
            # context.
            offset_mapping = features[feature_index]["offset_mapping"]

            # Update minimum null prediction.
            cls_index = features[feature_index]["input_ids"].index(tokenizer.cls_token_id)
            feature_null_score = start_logits[cls_index] + end_logits[cls_index]
            if min_null_score is None or min_null_score < feature_null_score:
                min_null_score = feature_null_score

            # Go through all possibilities for the `n_best_size` greater start and end logits.
            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Don't consider out-of-scope answers, either because the indices are out of bounds or correspond
                    # to part of the input_ids that are not in the context.
                    if (
                        start_index >= len(offset_mapping)
                        or end_index >= len(offset_mapping)
                        or offset_mapping[start_index] is None
                        or offset_mapping[end_index] is None
                    ):
                        continue
                    # Don't consider answers with a length that is either < 0 or > max_answer_length.
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue

                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]
                    valid_answers.append(
                        {
                            "score": start_logits[start_index] + end_logits[end_index],
                            "text": context[start_char: end_char]
                        }
                    )
        
        if len(valid_answers) > 0:
            best_answer = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[0]
        else:
            # In the very rare edge case we have not a single non-null prediction, we create a fake prediction to avoid
            # failure.
            best_answer = {"text": "", "score": 0.0}
        
        # Let's pick our final answer: the best one or the null answer (only for squad_v2)
        predictions[example["id"]] = best_answer["text"]

    return predictions

We will be passing the valid_dataset, validation_features, raw_predictions to the postprocess_qa_predictions function to get the final predictions.

In [None]:
final_predictions = postprocess_qa_predictions(valid_dataset, validation_features, raw_predictions.predictions)

Post-processing 128 example predictions split into 2379 features.


  0%|          | 0/128 [00:00<?, ?it/s]

In [None]:
prediction = pd.DataFrame([{"questions":x1['question'], "pred_answer":x2} for x1, x2 in zip(valid_dataset, [i for i in final_predictions.values()])])

In [None]:
prediction

Unnamed: 0,questions,pred_answer
0,¿Cuántos casos positivos de coronavirus regist...,4.627.537
1,¿Cuántas empresas latinoamercanas han cerrado ...,550.000
2,¿Qué le pedirán al gobierno?,que existan controles a los colegios privados
3,¿En qué afecta el Covid19 a los humanos?,las vías respiratorias
4,¿Qué recorrerá Soca esta noche a las 20.30?,Mercado Ferrando
...,...,...
121,¿En qué países la pandemia afectó más a indíge...,2.915
122,¿Cuándo se detectó el virus de covid-19 por pr...,nueve
123,¿Cuántos casos activos por Coronavirus hay en ...,acudir presencialmente al cementerio e indica...
124,¿Cuántos países componen actualmente la Conmebol?,este jueves


Submission

In [None]:
test_dataset = Dataset.from_pandas(test)
test_features = test_dataset.map(
    prepare_validation_features,
    batched=True,
    remove_columns=test_dataset.column_names
)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
test_feats_small = test_features.map(lambda example: example, remove_columns=['example_id', 'offset_mapping'])
test_feats_small

  0%|          | 0/13833 [00:00<?, ?ex/s]

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 13833
})

In [None]:
test_predictions = trainer.predict(test_feats_small)

***** Running Prediction *****
  Num examples = 13833
  Batch size = 64


In [None]:
test_features.set_format(type=test_features.format["type"], columns=list(test_features.features.keys()))

In [None]:
final_test_predictions = postprocess_qa_predictions(test_dataset, test_features, test_predictions.predictions)

Post-processing 773 example predictions split into 13833 features.


  0%|          | 0/773 [00:00<?, ?it/s]

In [None]:
test['PredictionString'] = test['id'].apply(lambda r: final_test_predictions[r])

In [None]:
test.to_csv('results.csv', index=False)

## Results and Metrics

In [4]:
import pandas as pd

def process_results():
    answers_path = '/content/drive/MyDrive/anwsers.csv'
    predictions_path = '/content/drive/MyDrive/results.csv'
    results = {}
    answers_df = pd.read_csv(answers_path)
    predictions_df = pd.read_csv(predictions_path)
       
    for i , row in answers_df.iterrows():
        instance_id = answers_df.at[i,'id']
        answer_text = answers_df.at[i,'answer_text']
        results[instance_id] =  {"answer_text": answer_text, "predict_answer_text": 'nan'}
    
    for i , row in predictions_df.iterrows():
      instance_id = predictions_df.at[i,'id']
      predict_answer_text = predictions_df.at[i,'predict_answer_text']
      results[instance_id]['predict_answer_text'] = predict_answer_text

    return results

In [8]:
results = process_results()
includes_anwser = 0
total = 0

for index in results:
  total+=1
  if ((str(results[index]['answer_text']) in str(results[index]['predict_answer_text'])) or (str(results[index]['predict_answer_text']) in str(results[index]['answer_text']))):
    includes_anwser+=1
        
print("total: " + str(total))
print("includes: " + str(includes_anwser))



total: 773
includes: 552


In [9]:
def normalize_text(s):
    """Removing articles and punctuation, and standardizing whitespace are all typical text processing steps."""
    import string, re

    def remove_articles(text):
        regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
        return re.sub(regex, " ", text)

    def white_space_fix(text):
        return " ".join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))

def compute_exact_match(prediction, truth):
    return int(normalize_text(prediction) == normalize_text(truth))

def compute_f1(prediction, truth):
    pred_tokens = normalize_text(prediction).split()
    truth_tokens = normalize_text(truth).split()
    
    # if either the prediction or the truth is no-answer then f1 = 1 if they agree, 0 otherwise
    if len(pred_tokens) == 0 or len(truth_tokens) == 0:
        return int(pred_tokens == truth_tokens)
    
    common_tokens = set(pred_tokens) & set(truth_tokens)
    
    # if there are no common tokens then f1 = 0
    if len(common_tokens) == 0:
        return 0
    
    prec = len(common_tokens) / len(pred_tokens)
    rec = len(common_tokens) / len(truth_tokens)
    
    return 2 * (prec * rec) / (prec + rec)

In [10]:
results = process_results()
answers_list = []
predictions_list = []

for elem in results:
  answers_list.append(str(results[elem]['answer_text']))
  predictions_list.append(str(results[elem]['predict_answer_text']))
  
em_score = []
f1_score = []
for index in range(len(answers_list)):
  em = compute_exact_match(predictions_list[index], answers_list[index])
  f1 = compute_f1(predictions_list[index], answers_list[index])
  em_score.append(em)
  f1_score.append(f1)


In [11]:
import numpy as np

print("Exact match: " + str(np.average(em_score)))
print("F1: " + str(np.average(f1_score)))


Exact match: 0.5472186287192755
F1: 0.697605608742613
