In [60]:
# !git clone https://github.com/pranjali97/RoBERTa_QA.git
# !unzip covid-qa.zip
# !pip install transformers

In [63]:
import ast
import json

import datasets
import pandas as pd
import transformers
from datasets import Dataset, DatasetDict
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
from transformers import DefaultDataCollator
from transformers import Trainer, TrainingArguments

PART - 1

In [64]:
def extract_data_fromJSON(filename):
    """Flatten a SQuAD-style JSON file into one DataFrame row per question.

    Only the first entry of each record's ``"paragraphs"`` list is read;
    any further paragraphs of the same record are ignored.

    Args:
        filename: Path to a JSON file whose top-level object has a ``"data"``
            list. Each record needs ``"paragraphs"``; the first paragraph needs
            ``"context"``, ``"document_id"`` and ``"qas"``; each qa needs
            ``"question"``, ``"id"`` and ``"answers"`` (a list of dicts with
            ``"text"`` and ``"answer_start"``).

    Returns:
        pandas.DataFrame with columns ``context``, ``document_id``,
        ``question``, ``answers`` (a dict of parallel ``text`` /
        ``answer_start`` lists) and ``id``. A file with no questions yields
        an empty DataFrame.

    Raises:
        KeyError / IndexError: if a required field is missing or a record has
            an empty ``"paragraphs"`` list.
    """
    # Context manager closes the handle deterministically; JSON is read as
    # UTF-8 explicitly instead of relying on the platform default encoding.
    with open(filename, encoding="utf-8") as handle:
        data = json.load(handle)

    extracted_data = []
    for row in data["data"]:
        doc = row["paragraphs"][0]
        for question in doc["qas"]:
            answers = question["answers"]
            extracted_data.append(
                {
                    "context": doc["context"],
                    "document_id": doc["document_id"],
                    "question": question["question"],
                    "answers": {
                        "text": [ans["text"] for ans in answers],
                        "answer_start": [ans["answer_start"] for ans in answers],
                    },
                    "id": question["id"],
                }
            )
    return pd.DataFrame(extracted_data)

In [65]:
# Paths of the three covid-qa split files, relative to the working directory.
train_json = 'covid-qa/covid-qa-train.json'
dev_json = 'covid-qa/covid-qa-dev.json'
test_json = 'covid-qa/covid-qa-test.json'

# Flatten each split into a DataFrame (one row per question) ...
train_df = extract_data_fromJSON(train_json)
dev_df = extract_data_fromJSON(dev_json)
test_df = extract_data_fromJSON(test_json)

# ... and wrap each DataFrame as a Hugging Face Dataset.
traindf = Dataset.from_pandas(train_df)
devdf = Dataset.from_pandas(dev_df)
testdf = Dataset.from_pandas(test_df)

# Bundle the splits; the dev file serves as the "validation" split.
data_dict = DatasetDict(
    {
        'train': traindf,
        'validation': devdf,
        'test': testdf,
    }
)


In [66]:
# Display the DatasetDict repr (split names, features and row counts) as the cell output.
data_dict

DatasetDict({
    train: Dataset({
        features: ['context', 'document_id', 'question', 'answers', 'id'],
        num_rows: 1417
    })
    validation: Dataset({
        features: ['context', 'document_id', 'question', 'answers', 'id'],
        num_rows: 203
    })
    test: Dataset({
        features: ['context', 'document_id', 'question', 'answers', 'id'],
        num_rows: 375
    })
})

In [6]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline

# Checkpoint identifier used for the pipeline below and reused by the
# from_pretrained calls later in the notebook.
model_name = "deepset/roberta-base-squad2"

# Question-answering pipeline with both model and tokenizer taken from `model_name`.
nlp = pipeline(
    task='question-answering',
    model=model_name,
    tokenizer=model_name,
)

In [69]:
# dev_responses = {}
# for sample in data_dict['validation']:
#     QA_input = {
#         'question': sample['question'],
#         'context': sample['context']
#     }
#     res = nlp(QA_input)
#     dev_responses[sample['id']] = res['answer'] 

{'context': 'Respiratory Viral Infections in Exacerbation of Chronic Airway Inflammatory Diseases: Novel Mechanisms and Insights From the Upper Airway Epithelium\n\nhttps://www.ncbi.nlm.nih.gov/pmc/articles/PMC7052386/\n\nSHA: 45a566c71056ba4faab425b4f7e9edee6320e4a4\n\nAuthors: Tan, Kai Sen; Lim, Rachel Liyu; Liu, Jing; Ong, Hsiao Hui; Tan, Vivian Jiayi; Lim, Hui Fang; Chung, Kian Fan; Adcock, Ian M.; Chow, Vincent T.; Wang, De Yun\nDate: 2020-02-25\nDOI: 10.3389/fcell.2020.00099\nLicense: cc-by\n\nAbstract: Respiratory virus infection is one of the major sources of exacerbation of chronic airway inflammatory diseases. These exacerbations are associated with high morbidity and even mortality worldwide. The current understanding on viral-induced exacerbations is that viral infection increases airway inflammation which aggravates disease symptoms. Recent advances in in vitro air-liquid interface 3D cultures, organoid cultures and the use of novel human and animal challenge models have e

In [None]:
# test_responses = {}
# for sample in data_dict['test']:
#     QA_input = {
#         'question': sample['question'],
#         'context': sample['context']
#     }
#     res = nlp(QA_input)
#     test_responses[sample['id']] = res['answer'] 

In [None]:
def predict_answers(qa_pipeline, samples):
    """Run a question-answering pipeline over `samples` and collect its answers.

    Args:
        qa_pipeline: Callable taking ``{"question": ..., "context": ...}`` and
            returning a mapping that has an ``"answer"`` entry.
        samples: Iterable of mappings with ``"id"``, ``"question"`` and
            ``"context"`` keys.

    Returns:
        dict mapping each sample's ``"id"`` to the predicted answer text.
    """
    responses = {}
    for sample in samples:
        result = qa_pipeline(
            {"question": sample["question"], "context": sample["context"]}
        )
        responses[sample["id"]] = result["answer"]
    return responses


# The loops that used to fill dev_responses / test_responses exist only as
# commented-out code in this notebook, so on a fresh kernel the dumps below
# raised NameError. Build both prediction dicts here before writing them.
dev_responses = predict_answers(nlp, data_dict["validation"])
test_responses = predict_answers(nlp, data_dict["test"])

# Prediction files consumed by the evaluate.py cells that follow.
with open('dev_pred.json', 'w') as f:
    json.dump(dev_responses, f)

with open('test_pred.json', 'w') as f:
    json.dump(test_responses, f)

In [58]:
# Shell out to evaluate.py with the dev split file and dev_pred.json as inputs;
# --out-file names dev_results.json as the output.
# NOTE(review): presumably this scores the predictions against the gold answers - confirm in evaluate.py.
! python evaluate.py covid-qa/covid-qa-dev.json dev_pred.json --out-file dev_results.json

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [59]:
# Shell out to evaluate.py with the test split file and test_pred.json as inputs;
# --out-file names test_results.json as the output.
# NOTE(review): presumably this scores the predictions against the gold answers - confirm in evaluate.py.
! python evaluate.py covid-qa/covid-qa-test.json test_pred.json --out-file test_results.json

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


PART - 2

In [70]:
# Load the tokenizer and the question-answering model from the same
# `model_name` checkpoint so that both can be handed to the Trainer.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

In [None]:
def preprocess_function(examples, max_length=384):
    """Tokenize question/context pairs and label each with its answer token span.

    Uses the module-level ``tokenizer``. Each question is stripped and paired
    with its context; only the context is truncated, and every example is
    padded to ``max_length``.

    Args:
        examples: Batch mapping with parallel lists under ``"question"``,
            ``"context"`` and ``"answers"``; each answers entry is a dict with
            ``"text"`` and ``"answer_start"`` lists. Only the first answer of
            each entry is used.
        max_length: Total token budget per example (defaults to the value that
            was previously hard-coded).

    Returns:
        The tokenizer output without ``"offset_mapping"``, plus
        ``"start_positions"`` and ``"end_positions"`` (token indices). An
        example is labelled ``(0, 0)`` when it has no answer, when no context
        tokens survive, or when the answer is not fully inside the (possibly
        truncated) context.

    NOTE(review): contexts longer than ``max_length`` are simply truncated, so
    every answer beyond the first window is labelled ``(0, 0)``. Consider a
    sliding window (``stride`` + ``return_overflowing_tokens``) for long
    documents.
    """
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]

        # Unanswerable example (empty answer lists): nothing to locate.
        if not answer["text"] or not answer["answer_start"]:
            start_positions.append(0)
            end_positions.append(0)
            continue

        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Token indices that belong to the context (sequence id 1).
        context_tokens = [
            idx for idx, seq_id in enumerate(sequence_ids) if seq_id == 1
        ]
        if not context_tokens:
            start_positions.append(0)
            end_positions.append(0)
            continue
        context_start = context_tokens[0]
        context_end = context_tokens[-1]

        # If the answer is not fully inside the context, label it (0, 0).
        # Both ends must be covered: the previous check only rejected answers
        # lying entirely outside, so an answer cut off by truncation got an
        # end label on the special token just past the context.
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Last context token starting at or before the answer start.
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            # First context token (scanning backwards) ending at or after the answer end.
            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [None]:
# Tokenize and label every split in batches. The raw columns are dropped so
# only the tokenizer outputs and the start/end labels remain. The column list
# comes from data_dict itself; the previous `squad` name was never defined in
# this notebook.
tokenized_data = data_dict.map(
    preprocess_function,
    batched=True,
    remove_columns=data_dict["train"].column_names,
)

In [None]:
# Collator handed to the Trainer. Examples are already padded to a fixed
# max_length in preprocess_function.
# NOTE(review): DefaultDataCollator presumably just batches features without
# further padding - confirm against the transformers docs.
data_collator = DefaultDataCollator()

In [None]:
# Fine-tuning configuration: 3 epochs, evaluated once per epoch, outputs under
# "FT-covidQA-model".
# NOTE(review): push_to_hub=True presumably requires being logged in to the
# Hugging Face Hub before training starts - confirm.
training_args = TrainingArguments(
    output_dir="FT-covidQA-model",
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    push_to_hub=True,
)

# Train on the tokenized "train" split and evaluate on "validation".
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["validation"],
)

trainer.train()