In [32]:
# !pip install datasets
# !pip install torch
# !pip install scikit-learn

In [33]:
# !pip uninstall -y transformers accelerate
# !pip install transformers==4.28.0 accelerate

In [23]:
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, DefaultDataCollator, \
TrainingArguments, Trainer, pipeline

from torch.utils.data import DataLoader, random_split
from datasets import Dataset, DatasetDict
import evaluate

from tqdm import tqdm
import os
import json

In [42]:
# Enumerate GPUs in PCI bus order and expose only GPU 3 to this process.
os.environ.update({"CUDA_DEVICE_ORDER": "PCI_BUS_ID", "CUDA_VISIBLE_DEVICES": "3"})

In [43]:
# Use the first visible CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

In [44]:
# Display the device that was selected above.
device

device(type='cuda', index=0)

PART - 1

In [26]:
def extract_data_fromJSON(filename):
    """Flatten a SQuAD-style JSON file into one DataFrame row per question.

    Only the first entry of each document's ``"paragraphs"`` list is read;
    every question in that paragraph's ``"qas"`` list becomes one row.

    Args:
        filename: Path to a JSON file whose top-level object has a ``"data"``
            list of documents.

    Returns:
        pandas.DataFrame with the columns ``context``, ``document_id``,
        ``question``, ``answers`` and ``id``. ``answers`` holds a dict with
        parallel ``"text"`` and ``"answer_start"`` lists. The columns are
        present even when the file contains no questions.
    """
    columns = ["context", "document_id", "question", "answers", "id"]

    # Close the handle deterministically and decode as UTF-8 (the JSON
    # default) instead of relying on the platform's locale encoding.
    with open(filename, encoding="utf-8") as fh:
        data = json.load(fh)

    extracted_data = []
    for row in data["data"]:
        doc = row["paragraphs"][0]
        for question in doc["qas"]:
            extracted_data.append(
                {
                    "context": doc["context"],
                    "document_id": doc["document_id"],
                    "question": question["question"],
                    "answers": {
                        "text": [ans["text"] for ans in question["answers"]],
                        "answer_start": [ans["answer_start"] for ans in question["answers"]],
                    },
                    "id": question["id"],
                }
            )

    # Passing the columns explicitly keeps the schema stable for empty input.
    return pd.DataFrame(extracted_data, columns=columns)

In [27]:
# Locations of the three COVID-QA split files.
train_json = 'covid-qa/covid-qa-train.json'
dev_json = 'covid-qa/covid-qa-dev.json'
test_json = 'covid-qa/covid-qa-test.json'

# Flatten every split into a DataFrame, then wrap each frame as a Dataset.
train_df, dev_df, test_df = (
    extract_data_fromJSON(path) for path in (train_json, dev_json, test_json)
)
traindf, devdf, testdf = (
    Dataset.from_pandas(frame) for frame in (train_df, dev_df, test_df)
)

# Bundle the splits under the names used by the rest of the notebook.
data_dict = DatasetDict({'train': traindf, 'validation': devdf, 'test': testdf})


In [28]:
# Show the splits, their columns and row counts.
data_dict

DatasetDict({
    train: Dataset({
        features: ['context', 'document_id', 'question', 'answers', 'id'],
        num_rows: 1417
    })
    validation: Dataset({
        features: ['context', 'document_id', 'question', 'answers', 'id'],
        num_rows: 203
    })
    test: Dataset({
        features: ['context', 'document_id', 'question', 'answers', 'id'],
        num_rows: 375
    })
})

In [30]:
! huggingface-cli login --token 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

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid.
Your token has been saved to /soe/pranjalib/.cache/huggingface/token
Login successful


In [9]:
# Upload the train/validation/test splits to the Hub dataset repo
# 'pranjali97/covid-qa'.
# NOTE(review): presumably requires the Hugging Face login cell to have run
# first; confirm.
data_dict.push_to_hub('pranjali97/covid-qa')

Pushing split train to the Hub.
Resuming upload of the dataset shards.
Pushing dataset shards to the dataset hub: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 10407.70it/s]
Pushing split validation to the Hub.
Resuming upload of the dataset shards.
Pushing dataset shards to the dataset hub: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 12228.29it/s]
Pushing split test to the Hub.
Resuming upload of the dataset shards.
Pushing dataset shards to the dataset hub: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 9642.08it/s]
Downloading metadata: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 700/700 [00:00<00:00, 719kB/s]


In [10]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline

# Checkpoint used for the baseline pipeline; later cells reuse `model_name`
# to load the model and tokenizer.
model_name = "deepset/roberta-base-squad2"
# Question-answering pipeline with model and tokenizer from the same checkpoint.
# NOTE(review): no `device` argument is passed, so this presumably runs on the
# pipeline's default device rather than the `device` selected earlier; confirm.
nlp = pipeline('question-answering', model=model_name, tokenizer=model_name)

In [11]:
# dev_responses = {}
# for sample in data_dict['validation']:
#     QA_input = {
#         'question': sample['question'],
#         'context': sample['context']
#     }
#     res = nlp(QA_input)
#     dev_responses[sample['id']] = res['answer'] 

In [12]:
# test_responses = {}
# for sample in data_dict['test']:
#     QA_input = {
#         'question': sample['question'],
#         'context': sample['context']
#     }
#     res = nlp(QA_input)
#     test_responses[sample['id']] = res['answer'] 

In [13]:
# with open('part-1/dev_pred.json', 'w') as f:
#     json.dump(dev_responses, f)
    
# with open('part-1/test_pred.json', 'w') as f:
#     json.dump(test_responses, f)

In [14]:
# ! python evaluate.py covid-qa/covid-qa-dev.json part-1/dev_pred.json --out-file part-1/dev_results.json

In [15]:
# ! python evaluate.py covid-qa/covid-qa-test.json part-1/test_pred.json --out-file part-1/test_results.json

PART - 2

In [16]:
# Load the tokenizer and the QA model from the same checkpoint, then move the
# model onto the selected device.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
model = model.to(device)

In [17]:
def preprocess_function(examples):
    """Tokenize question/context pairs and attach answer-span token labels.

    Uses the module-level ``tokenizer``. Each question (whitespace-stripped)
    is paired with its context, padded to 384 tokens, and only the context is
    truncated. The context is truncated rather than split into overlapping
    windows, so any answer lying wholly or partly beyond the kept context
    cannot be labeled.

    Only the first answer of each example is used. An example is labeled
    ``(0, 0)`` when it has no answer, or when its answer is not fully inside
    the tokenized context.

    Args:
        examples: Batch mapping with ``"question"``, ``"context"`` and
            ``"answers"`` lists; each answers entry is a dict with parallel
            ``"text"`` and ``"answer_start"`` lists.

    Returns:
        The tokenizer output (without ``offset_mapping``) extended with
        ``start_positions`` and ``end_positions`` token indices.
    """
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]

        # No gold span at all (unanswerable example): label it (0, 0)
        # instead of failing on the [0] lookups below.
        if not answer["answer_start"] or not answer["text"]:
            start_positions.append(0)
            end_positions.append(0)
            continue

        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the first and last token that belong to the context
        # (sequence id 1).
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while idx < len(sequence_ids) and sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0).
        # The span must start at/after the first context character and end
        # at/before the last one. Comparing the bounds the other way round
        # lets a truncated answer through, and its end label then lands on
        # the token after the context.
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions:
            # last token starting at or before start_char ...
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            # ... and first token ending at or after end_char.
            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [18]:
# Tokenize and label every split in batches, dropping the original columns so
# that only what `preprocess_function` returns is kept.
tokenized_data = data_dict.map(
    preprocess_function,
    batched=True,
    remove_columns=data_dict["train"].column_names,
)

                                                                                                                                                                                   

In [19]:
# Show the tokenized splits and their feature columns.
tokenized_data

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 1417
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 203
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 375
    })
})

In [20]:
# data_collator = DefaultDataCollator()

In [21]:
# training_args = TrainingArguments(
#     output_dir="203/part-2/FT-covidQA-model",
#     evaluation_strategy="epoch",
#     learning_rate=2e-5,
#     per_device_train_batch_size=16,
#     per_device_eval_batch_size=16,
#     num_train_epochs=3,
#     weight_decay=0.01,
# )

# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=tokenized_data["train"],
#     eval_dataset=tokenized_data["validation"],
#     tokenizer=tokenizer,
#     data_collator=data_collator,
# )

# trainer.train()


PART - 3

In [37]:
!pip install adapter-transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting adapter-transformers
  Downloading adapter_transformers-3.2.1-py3-none-any.whl (6.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.4/6.4 MB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: adapter-transformers
Successfully installed adapter-transformers-3.2.1


In [None]:
# Run the question-answering example script (train + eval) on the
# pranjali97/covid-qa dataset, starting from deepset/roberta-base-squad2.
# NOTE(review): the script path is an absolute, user-specific location, so
# this cell will not run on another machine as-is.
# NOTE(review): output goes to `part-2` although this cell sits under the
# "PART - 3" heading; confirm the intended directory.
# NOTE(review): no adapter-specific flag is passed; confirm whether full
# fine-tuning or adapter training is intended here.
# NOTE(review): the captured log below reports `_n_gpu=0`, i.e. the script
# apparently did not see a GPU; verify CUDA visibility for the subprocess.
!python /data/users/pranjalib/203/RoBERTa_QA/adapter-transformers-master/examples/pytorch/question-answering/run_qa.py \
  --model_name_or_path deepset/roberta-base-squad2 \
  --dataset_name pranjali97/covid-qa \
  --do_train \
  --do_eval \
  --per_device_train_batch_size 6 \
  --learning_rate 3e-5 \
  --num_train_epochs 2 \
  --max_seq_length 384 \
  --doc_stride 128 \
  --output_dir part-2

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
05/15/2023 13:04:48 - INFO - __main__ - Training/evaluation parameters TrainingArguments(
_n_gpu=0,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=True,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=no,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to