### one time installations

In [2]:
# !gdown --id 1pb7gEkctrVrJA79EAIo7H7nuzD6uV1fW
# !gdown --id 1oIeAE9HXXKWPcYa-AZ0ht5ef6sKe_Vh_
# !gdown --id 10rAuIDvsYR2yDiCqP7GmYGPc-UmtLbJb

In [3]:
# !pip install --quiet transformers --target=/kaggle/working/chaii_packages
# !pip install --quiet datasets --target=/kaggle/working/chaii_packages
# !pip install --quiet SentencePiece --target=/kaggle/working/chaii_packages
# !pip install --quiet pytorch-lightning --target=/kaggle/working/chaii_packages 
# !pip install ipdb --target=/kaggle/working/chaii_packages
# import sys
# sys.path.append('/kaggle/working/chaii_packages')

### libraries

In [4]:
# %env PYTHONPATH= 
# %env WANDB_DISABLED=True

In [5]:
from ipdb import set_trace

In [6]:
import os
import numpy as np
import pandas as pd
import sklearn
import random
from sklearn import model_selection
# from ipdb import set_trace

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from transformers import AutoModelForQuestionAnswering, AutoTokenizer
from transformers import Trainer, TrainingArguments

import pytorch_lightning as pl

### hyperparameters 

In [19]:
class hyperparameters:
    # seed
    seed = 4
    
    # tokenizer
    tokenizer_name = "deepset/xlm-roberta-base-squad2" # "deepset/xlm-roberta-large-squad2" # model_name # CHANGE THIS; TRY XLM-ROBERTA
    max_len = 384 # maximum length of context and question in a datapoint
    overlap_len = 128 # overlap between two parts of the context when it is split
    
    # model
    model_name = "deepset/xlm-roberta-base-squad2" # "deepset/xlm-roberta-large-squad2"
    batch_size = 8
    epochs = 5
    
    # data
    train_csv = "train.csv" # "../input/chaii-hindi-and-tamil-question-answering/train.csv" 
    test_csv = "test.csv" # "../input/chaii-hindi-and-tamil-question-answering/test.csv"
    
    # prediction
    top_x = 5 # top 5 answer predictions by each feature
    max_tok_in_ans = 10 # max 10 tokens in predicted answer text
    output_dir = "xlm-chaii"

In [20]:
hyperparams = hyperparameters()

In [9]:
pl.seed_everything(hyperparams.seed)
print("available gpu count:", torch.cuda.device_count())
for i in range(torch.cuda.device_count()):
    print(torch.cuda.device(i))

Global seed set to 4


available gpu count: 1
<torch.cuda.device object at 0x7f2865d5bc70>


### tokenizer

In [10]:
tokenizer = AutoTokenizer.from_pretrained(hyperparams.tokenizer_name)

### Ingredients (data) for Chaii

In [11]:
chaii_df = pd.read_csv(hyperparams.train_csv, encoding='utf-8')
# sample_df = pd.read_csv('sample_submission.csv')
# chaii_df = sklearn.utils.shuffle(chaii_df, random_state=4).reset_index(drop=True)

In [12]:
train_df, val_df = model_selection.train_test_split(chaii_df, test_size=0.2, random_state=4) # hyperparams.seed

In [13]:
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)

### Data pipeline

In [14]:
def prepare_chaii(data_df, tokenizer, test=False):
    # prepare_chaii takes in raw data and returns tokenized data 
    # along with position of first token and last token in the answer_text
    
    # strip trailing and leading whitespaces in context, question, and (answer_text)?
    data_df.loc[:, 'context'] = data_df.loc[:, 'context'].apply(lambda sen : str(sen).strip())
    data_df.loc[:, 'question'] = data_df.loc[:, 'question'].apply(lambda sen : str(sen).strip())
    if not test:
        data_df.loc[:, 'answer_text'] = data_df.loc[:, 'answer_text'].apply(lambda sen : str(sen).strip())
    
    # question; context -- order is important, and is used in prediction stage to find whether predicted tokens seq_id is 0 (question) or 1 (context)
    data_tok = tokenizer(
        list(data_df['question']), list(data_df['context']),
        max_length=hyperparams.max_len, 
        truncation='only_second',
        stride=hyperparams.overlap_len,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding=True,
    )
    
    if test:
        return data_tok
    
    # data_df contains original raw data having question, context
    # data_tok contains tokenized data, where context might have split into multiple sentences 
    # data_tok is a dict, containing keys : dict_keys(['input_ids', 'attention_mask', 'offset_mapping', 'overflow_to_sample_mapping'])
    # every value is a list, and no tensors here
    
    # adding two more keys that will contain the position of first token and last token in the answer_text
    data_tok['start_positions'], data_tok['end_positions'] = [], []
    
    n_sents = len(data_tok['input_ids'])
    map_id_sent2context = data_tok['overflow_to_sample_mapping'] # id means index! since input_ids means various inputs to the model
    map_offsets = data_tok['offset_mapping']
    assert len(map_offsets) == len(map_id_sent2context) == n_sents
    
    for input_id in range(n_sents):
        sent = data_tok['input_ids'][input_id]
        
        # get the answer_start and answer_text for this input_id using the id in data_df
        context_id = map_id_sent2context[input_id]
        answer_text = data_df.loc[context_id, 'answer_text']
        answer_start = data_df.loc[context_id, 'answer_start']
        answer_end = answer_start + len(answer_text) # will use this in next code block
        
        # check whether the answer is present in the current input_id or not using offsets
        qn_context_id = data_tok.sequence_ids(input_id)
        
            # first: get the start_idx_token and end_idx_token of context
        start_idx_token = qn_context_id.index(1)
        end_idx_token = len(qn_context_id) - qn_context_id[::-1].index(1) - 1
        
            # second: use the offsets for input_id to find if answer_start and answer_end are inside this chunk of context or not
        offset_map = map_offsets[input_id]

        if answer_start >= offset_map[start_idx_token][0] and answer_end <= offset_map[end_idx_token][1]:
            # now finally get the idx_token for the first and last token in the answer_text
            while start_idx_token < len(sent) and answer_start >= offset_map[start_idx_token][0]:
                start_idx_token += 1
            while answer_end <= offset_map[end_idx_token][1]:
                    end_idx_token -= 1
            
            data_tok['start_positions'].append(start_idx_token - 1)
            data_tok['end_positions'].append(end_idx_token + 1)
        
        else:
            cls_token_idx = sent.index(tokenizer.cls_token_id)
            assert cls_token_idx == 0
            data_tok['start_positions'].append(0) # cls token index
            data_tok['end_positions'].append(0) # cls token index

    return data_tok     

In [15]:
class chaii_ka_data(Dataset):
    def __init__(self, data_df, tokenizer, test=False):
        super(chaii_ka_data, self).__init__()
        '''
            test = True means data_df without answer_text, answer_start
            data_df is the pandas dataframe containing context, question, ...        
        '''
        
        # tokenize data samples context;question, and create new samples if overflow
        # we need to do this apriori (and not in __getitem__ directly) because a datasample may create more samples upon tokenization
        self.reqd_keys = ['input_ids', 'attention_mask'] 
        if not test:
            self.reqd_keys += ['start_positions', 'end_positions']
        self.data_tok = prepare_chaii(data_df, tokenizer, test=test)
    
    def __getitem__(self, input_id): # index is input_id as used in prepare_chaii()
        # sent = self.data_tok['input_ids'][input_id]
        # att_mask = self.data_tok['attention_mask'][input_id]
        # offset_map = self.data_tok['offset_mapping'][input_id]
        # start_idx_tok = self.data_tok['start_positions'][input_id]
        # end_idx_tok = self.data_tok['end_positions'][input_id]
        
        return {k: torch.tensor(v[input_id], dtype=torch.long) for k,v in self.data_tok.items() if k in self.reqd_keys}
    
    def __len__(self):
        return len(self.data_tok['input_ids'])

In [16]:
trainset = chaii_ka_data(train_df, tokenizer)
valset = chaii_ka_data(val_df, tokenizer)

#### model predictions (start and token index) to answer_text
this transformation requires original (raw) data_df, data_tok, and start, end logits (probabilities)

In [17]:
def serve_chaii(test_df, testset, logits):
    assert len(logits[0]) == len(logits[1]) == len(testset)
    submission = {
        "id" : [],
        "PredictionString" : []
    }
    n_examples = len(test_df)
    # print("number of examples in test df", n_examples)
    for example_idx in range(n_examples):
        # current example (or context) in the given test_df
        example_id = test_df.loc[example_idx, 'id']

        # get all the features (or sents), start_logits, end_logits for the current example index
        data_tok = testset.data_tok
        map_id_sent2context = data_tok['overflow_to_sample_mapping']
        assert len(map_id_sent2context) == len(testset)

        sents_first_idx = map_id_sent2context.index(example_idx)
        sents_last_idx = len(map_id_sent2context) - map_id_sent2context[::-1].index(example_idx) - 1
        assert (np.array(map_id_sent2context[sents_first_idx: sents_last_idx+1]) == example_idx).mean() == 1, set_trace()

        sents = data_tok['input_ids'][sents_first_idx: sents_last_idx+1]
        start_logits = logits[0][sents_first_idx: sents_last_idx+1]
        end_logits = logits[1][sents_first_idx: sents_last_idx+1]
        n_sents = len(sents)
        assert n_sents == len(start_logits) == len(end_logits)

        # get the answer_text from these sents using start_logits, end_logits
        # rank all possible answers for each sentence
        # then club all these answers from each sentence and take the best one as final predicted_answer
        # Also, consider the case when a sentence has no answer_text i.e. model predicts no answer
        pred_answers = []
        for local_idx in range(n_sents):
            sent = sents[local_idx]
            start_lgts, end_lgts = start_logits[local_idx], end_logits[local_idx] # 384-dim list containing probabilities

            # take the top 5 confident predictions of the model for start and end token indices
            top_x = hyperparams.top_x
            ranked_strt_tok_idxs = np.argsort(start_lgts)[::-1][:top_x].tolist()
            ranked_end_tok_idxs = np.argsort(end_lgts)[::-1][:top_x].tolist()

            # see which all are possible answers, and append
            for start_tok_idx in ranked_strt_tok_idxs:
                for end_tok_idx in ranked_end_tok_idxs:
                    # meaningless prediction
                    if start_tok_idx > end_tok_idx:
                        continue

                    # answer tokens NOT present in context, but in question
                    seq_ids = data_tok.sequence_ids(sents_first_idx + local_idx)
                    if seq_ids[start_tok_idx] == 0 or seq_ids[end_tok_idx] == 0: # 0 denotes question since question; context
                        continue
                        
                    if end_tok_idx-start_tok_idx+1 > hyperparams.max_tok_in_ans:
                        continue

                    score = start_lgts[start_tok_idx] * end_lgts[end_tok_idx]
                    answer_text = tokenizer.decode(sent[start_tok_idx: end_tok_idx+1])
                    pred_answers.append((score, answer_text))

        if len(pred_answers) == 0:
            predicted_answer = ""

        pred_answers = sorted(pred_answers, key=lambda element : element[0])[::-1]
        predicted_answer = pred_answers[0][1]

        if predicted_answer == "<s>":
            predicted_answer = ""

        submission['id'].append(example_id)
        submission['PredictionString'].append(predicted_answer)

    assert len(submission['id']) == len(test_df)
    return submission

### model

In [18]:
model = AutoModelForQuestionAnswering.from_pretrained(hyperparams.model_name)

### training

In [21]:
training_args = TrainingArguments(
    output_dir=hyperparams.output_dir, 
    overwrite_output_dir=True, 
    per_device_eval_batch_size=hyperparams.batch_size,
    per_device_train_batch_size=hyperparams.batch_size,
    evaluation_strategy="epoch", 
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=hyperparams.epochs,
    seed=hyperparams.seed,
    save_strategy="epoch",
#     save_total_limit=1,
#     load_best_model_at_end=True,
#     metric_for_best_model="eval_accuracy",
#     greater_is_better=True,
#     warmup_steps=500,
#     logging_dir='./logs',
#     logging_strategy="epoch",
)

trainer = Trainer(
    model=model, args=training_args,
    train_dataset=trainset, eval_dataset=valset,
)

trainer.train()

***** Running training *****
  Num examples = 11783
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 7365


Epoch,Training Loss,Validation Loss
1,0.506,0.420224
2,0.2634,0.372058
3,0.1789,0.492951
4,0.1123,0.496931
5,0.0618,0.556646


***** Running Evaluation *****
  Num examples = 2958
  Batch size = 8
Saving model checkpoint to xlm-chaii/checkpoint-1473
Configuration saved in xlm-chaii/checkpoint-1473/config.json
Model weights saved in xlm-chaii/checkpoint-1473/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 2958
  Batch size = 8
Saving model checkpoint to xlm-chaii/checkpoint-2946
Configuration saved in xlm-chaii/checkpoint-2946/config.json
Model weights saved in xlm-chaii/checkpoint-2946/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 2958
  Batch size = 8
Saving model checkpoint to xlm-chaii/checkpoint-4419
Configuration saved in xlm-chaii/checkpoint-4419/config.json
Model weights saved in xlm-chaii/checkpoint-4419/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 2958
  Batch size = 8
Saving model checkpoint to xlm-chaii/checkpoint-5892
Configuration saved in xlm-chaii/checkpoint-5892/config.json
Model weights saved in xlm-chaii/checkpoint-5892/pytorch_model.

TrainOutput(global_step=7365, training_loss=0.22013820122474728, metrics={'train_runtime': 1813.173, 'train_samples_per_second': 32.493, 'train_steps_per_second': 4.062, 'total_flos': 1.154572381732608e+16, 'train_loss': 0.22013820122474728, 'epoch': 5.0})

### Evaluation
Compute Jaccard's score for trainset, valset using saved model at each epoch

In [22]:
def jaccard(str1, str2): 
    a = set(str1.lower().split()) 
    b = set(str2.lower().split())
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

In [23]:
def compute_jaccard(pred_df, gt_df):
    num_examples = 0
    score = 0
    for idx, example_id in enumerate(gt_df.loc[:,'id']):
        gt_answer = gt_df.loc[idx, 'answer_text']
        pred_answer = pred_df.loc[pred_df.loc[:, 'id'] == example_id].reset_index(drop=True).loc[0, 'PredictionString']
        # print(gt_answer, pred_answer)
        score += jaccard(gt_answer, pred_answer)
        num_examples += 1
        
    score /= num_examples
    return score

In [26]:
model_checkpoints = os.listdir(hyperparams.output_dir)
model_checkpoints.sort()
train_scores, val_scores = [], []
for cp_id, model_checkpoint in enumerate(model_checkpoints):
    if model_checkpoint[:5] != "check":
        continue
    # load the model
    model = AutoModelForQuestionAnswering.from_pretrained(os.path.join(hyperparams.output_dir, model_checkpoint))
    trainer = Trainer(
        model=model
    )
    
    logits = trainer.predict(trainset).predictions
    submission = serve_chaii(train_df, trainset, logits)
    pred_df = pd.DataFrame(submission)
    train_score = compute_jaccard(pred_df, train_df)
    train_scores.append(train_score)

    # logits = trainer.predict(valset).predictions
    # submission = serve_chaii(val_df, valset, logits)
    # pred_df = pd.DataFrame(submission)
    # val_score = compute_jaccard(pred_df, val_df)
    # val_scores.append(val_score)
    
    
    # print(model_checkpoint, "train:", train_score, "val:", val_score)
print(model_checkpoints)
print(train_scores)
# print(val_scores)

loading configuration file xlm-chaii/checkpoint-1473/config.json
Model config XLMRobertaConfig {
  "_name_or_path": "deepset/xlm-roberta-base-squad2",
  "architectures": [
    "XLMRobertaForQuestionAnswering"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "language": "english",
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "name": "XLMRoberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.11.3",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 250002
}

loading weights file xlm-chaii/checkpoint-1473/pytorch_model.bin
All model checkpoint wei

loading configuration file xlm-chaii/checkpoint-2946/config.json
Model config XLMRobertaConfig {
  "_name_or_path": "deepset/xlm-roberta-base-squad2",
  "architectures": [
    "XLMRobertaForQuestionAnswering"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "language": "english",
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "name": "XLMRoberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.11.3",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 250002
}

loading weights file xlm-chaii/checkpoint-2946/pytorch_model.bin
All model checkpoint wei

loading configuration file xlm-chaii/checkpoint-4419/config.json
Model config XLMRobertaConfig {
  "_name_or_path": "deepset/xlm-roberta-base-squad2",
  "architectures": [
    "XLMRobertaForQuestionAnswering"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "language": "english",
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "name": "XLMRoberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.11.3",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 250002
}

loading weights file xlm-chaii/checkpoint-4419/pytorch_model.bin
All model checkpoint wei

loading configuration file xlm-chaii/checkpoint-5892/config.json
Model config XLMRobertaConfig {
  "_name_or_path": "deepset/xlm-roberta-base-squad2",
  "architectures": [
    "XLMRobertaForQuestionAnswering"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "language": "english",
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "name": "XLMRoberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.11.3",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 250002
}

loading weights file xlm-chaii/checkpoint-5892/pytorch_model.bin
All model checkpoint wei

loading configuration file xlm-chaii/checkpoint-7365/config.json
Model config XLMRobertaConfig {
  "_name_or_path": "deepset/xlm-roberta-base-squad2",
  "architectures": [
    "XLMRobertaForQuestionAnswering"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "language": "english",
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "name": "XLMRoberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.11.3",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 250002
}

loading weights file xlm-chaii/checkpoint-7365/pytorch_model.bin
All model checkpoint wei

['checkpoint-1473', 'checkpoint-2946', 'checkpoint-4419', 'checkpoint-5892', 'checkpoint-7365', 'runs']
[0.09774824564386851, 0.3536411291040921, 0.636208887134813, 0.7312489979156648, 0.745868740313185]


### Prediction of textual answers
1. generate submission.csv containing "id", "PredictionString" columns
2. Use Trainer API for predict instead of trainer.model(\*\*batch) as it handles batching, and CPU-GPU on its own
3. trainer.predict(testset) gives the start and end logits for all the input features in the test set
4. for each example in the test_df, select the best answer from its features

In [11]:
# load the model
model = AutoModelForQuestionAnswering.from_pretrained("./ShAm-ki-chaii/checkpoint-1473")
trainer = Trainer(
    model=model
)

In [12]:
# load the dataset
# test_df = pd.read_csv(hyperparams.test_csv, encoding='utf-8') # uncomment this for submission
test_df = pd.read_csv(hyperparams.train_csv, encoding='utf-8').loc[:10] # comment this for submission
testset = chaii_ka_data(test_df, tokenizer, test=True)
# testloader = DataLoader(testset, batch_size=16)
# next(iter(testloader))

In [13]:
# test_df

In [14]:
# pass the complete testset in trainer API
# the API will automatically do batch-wise prediction
# start, end logits are accessible using model_output.predictions[0],[1]
# if the testset has labels, then model_output.label_ids contains them
# model_output.metrics = {'test_runtime': 10.6385, 'test_samples_per_second': 128.684, 'test_steps_per_second': 16.168}
model_output = trainer.predict(testset) 
logits = model_output.predictions

***** Running Prediction *****
  Num examples = 143
  Batch size = 8


In [16]:
submission = serve_chaii(test_df, testset, logits)
submission_df = pd.DataFrame(submission)
# submission_df.loc[:, 'PredictionString'] = submission_df.loc[:, 'PredictionString'].apply(lambda ans: "\"" + str(ans) + "\"")
submission
submission_df.to_csv('submission.csv', index=False)

number of examples in test df 11


### references
1. https://github.com/huggingface/notebooks/blob/master/examples/question_answering.ipynb
2. https://huggingface.co/transformers/internal/tokenization_utils.html#transformers.tokenization_utils_base.PreTrainedTokenizerBase.__call__

In [None]:
# pred_df = pd.read_csv('submission.csv', encoding='utf-8')
# gt_df = test_df # pd.read_csv(hyperparams.val_csv, encoding='utf-8')

In [None]:
# pred_df

In [None]:
# num_examples = len(pred_df)
# print("number of predictions", num_examples)

In [None]:
# score = 0
# for idx, example_id in enumerate(gt_df.loc[:,'id']):
#     gt_answer = gt_df.loc[idx, 'answer_text']
#     pred_answer = pred_df.loc[pred_df.loc[:, 'id'] == example_id].reset_index(drop=True).loc[0, 'PredictionString']
#     # print(gt_answer, pred_answer)
#     score += jaccard(gt_answer, pred_answer)
# score /= num_examples
# print(score)