In [1]:
#Reading in the dataset
import json
import pandas as pd

def read_covidqa():
    with open('COVID-QA.json', 'rb') as f:
        covidqa_dict = json.load(f)

    contexts = []
    questions = []
    answers = []
    for group in covidqa_dict['data']:
        for passage in group['paragraphs']:
            context = passage['context']
            for qa in passage['qas']:
                question = qa['question']
                for answer in qa['answers']:
                    contexts.append(context)
                    questions.append(question)
                    answers.append(answer)
    
    return contexts, questions, answers

all_contexts, all_questions, all_answers = read_covidqa()

#Converting to a dataframe for easy k-fold splits
full_dataset = pd.DataFrame(list(zip(all_contexts, all_questions, all_answers)), columns =['context', 'question', 'answer'])

In [2]:
def preprocess_input(dataset):
    for answer, context in zip(dataset['answer'], dataset['context']):
        gold_text = answer['text']
        start_idx = answer['answer_start']
        end_idx = start_idx + len(gold_text)

        # sometimes squad answers are off by a character or two – fix this
        if context[start_idx:end_idx] == gold_text:
            answer['answer_end'] = end_idx
        elif context[start_idx-1:end_idx-1] == gold_text:
            answer['answer_start'] = start_idx - 1
            answer['answer_end'] = end_idx - 1     # When the gold label is off by one character
        elif context[start_idx-2:end_idx-2] == gold_text:
            answer['answer_start'] = start_idx - 2
            answer['answer_end'] = end_idx - 2     # When the gold label is off by two characters
    
    encodings = tokenizer(dataset['context'].to_list(), dataset['question'].to_list(), \
                          truncation=True, padding=True)
    
    start_positions = []
    end_positions = []
    for i in range(len(dataset['answer'])):
        start_positions.append(encodings.char_to_token(i, dataset['answer'][i]['answer_start']))
        end_positions.append(encodings.char_to_token(i, dataset['answer'][i]['answer_end'] - 1))
        
        # if start position is None, the answer passage has been truncated
        if start_positions[-1] is None:
            start_positions[-1] = tokenizer.model_max_length
        if end_positions[-1] is None:
            end_positions[-1] = tokenizer.model_max_length

    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})
    
    return encodings

In [3]:
#Code to compute F1 scores
import re
import string
import collections
import numpy as np

def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""
    def remove_articles(text):
        regex = re.compile(r'\b(a|an|the)\b', re.UNICODE)
        return re.sub(regex, ' ', text)
    def white_space_fix(text):
        return ' '.join(text.split())
    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)
    def lower(text):
        return text.lower()
    return white_space_fix(remove_articles(remove_punc(lower(s))))

def get_tokens(s):
    if not s: return []
    return normalize_answer(s).split()

def compute_f1(a_gold, a_pred):
    gold_toks = get_tokens(a_gold)
    pred_toks = get_tokens(a_pred)
    common = collections.Counter(gold_toks) & collections.Counter(pred_toks)
    num_same = sum(common.values())
    if len(gold_toks) == 0 or len(pred_toks) == 0:
        # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
        return int(gold_toks == pred_toks)
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(pred_toks)
    recall = 1.0 * num_same / len(gold_toks)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1

def compute_f1_main(df):
    F1 = []
    for i in range(len(df)):
        a_gold = df['true_answer'][i]
        a_pred = df['predicted_answer'][i]
        F1.append(compute_f1(a_gold,a_pred))
    return np.mean(F1)

In [4]:
#Main fine-tuning loop
from sklearn.model_selection import KFold
from torch.utils.data import DataLoader
from transformers import AdamW, AutoTokenizer, AutoModelForQuestionAnswering, QuestionAnsweringPipeline
import torch

kfold = KFold(n_splits=5)
num_epochs = 3

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model_name = 'distilbert-base-uncased'

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

model.to(device)
model.train()

for fold, (train_ids, test_ids) in enumerate(kfold.split(full_dataset)): 
    print(f'FOLD {fold}')
    print('--------------------------------')
    
    train_data = preprocess_input(full_dataset.iloc[train_ids])
    
    # Define data loaders for training and testing data in this fold
    trainloader = torch.utils.data.DataLoader(train_data, batch_size=10)

    # Initialize optimizer
    optim = AdamW(model.parameters(), lr=5e-5)
    
    # Run the training loop for defined number of epochs
    for epoch in range(0, num_epochs):

      # Print epoch
      print(f'Starting epoch {epoch+1}')

      # Set current loss value
      current_loss = 0.0

      # Iterate over the DataLoader for training data
      for i, data in enumerate(trainloader, 0):
            optim.zero_grad()
            input_ids = data['input_ids'].to(device)
            attention_mask = data['attention_mask'].to(device)
            start_positions = data['start_positions'].to(device)
            end_positions = data['end_positions'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
            loss = outputs[0]
            loss.backward()
            optim.step()
            
            # Print statistics
            current_loss += loss.item()
            if i % 500 == 499:
                print('Loss after mini-batch %5d: %.3f' % (i + 1, current_loss / 500))
                current_loss = 0.0

    # Process is complete.
    print('Training process has finished. Saving trained model.')

    # Print about testing
    print('Starting testing')
    
    # Saving the model
    save_path = f'./model-fold-{fold}.pth'
    torch.save(network.state_dict(), save_path)

    # Evaluationfor this fold
    test_data = full_dataset.iloc[test_ids]
    nlp = QuestionAnsweringPipeline(model=model, tokenizer=tokenizer, device=-1 if device == torch.device('cpu')\
                                else 0)
    with torch.no_grad():
        questions = []
        true_answers = []
        predicted_answers = []
        final_df = pd.DataFrame(columns=['question', 'true_answer', 'predicted_answer'])

        # Iterate over the test data and generate predictions
        for i in range(len(test_data)):
            context = test_data.iloc[i]['context']
            questions.append(test_data.iloc[i]['question'])
            true_answers.append(test_data.iloc[i]['answer']['text'])

            # Generate outputs
            QA_input = {'question': question, 'context': context}
            predicted_answers.append(nlp(QA_input)['answer'])
        
        final_df['question'] = questions
        final_df['true_answer'] = true_answers
        final_df['predicted_answer'] = predicted_answers
            
    #Print F1
    fold_f1_score = compute_f1_main(final_df)
    print(f'F1 for fold {fold}: {fold_f1_score}')

model.eval()

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this mode

FOLD 0
--------------------------------


KeyError: 0