<a href="https://colab.research.google.com/github/satani99/qa_system/blob/main/RoBERTa_SQuAD_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers datasets evaluate sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [6]:
from datasets import load_dataset

adv_qa = load_dataset("adversarial_qa", "droberta")



  0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
# adv_qa = adv_qa.train_test_split(test_size=0.2)

In [8]:
adv_qa

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers', 'metadata'],
        num_rows: 10000
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers', 'metadata'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers', 'metadata'],
        num_rows: 1000
    })
})

In [9]:
adv_qa["validation"][0]

{'id': '0622161680ff6c97fa14a25b01a4bf3bc289d72f',
 'title': 'Amazon_rainforest',
 'context': 'Between 1991 and 2000, the total area of forest lost in the Amazon rose from 415,000 to 587,000 square kilometres (160,000 to 227,000 sq mi), with most of the lost forest becoming pasture for cattle. Seventy percent of formerly forested land in the Amazon, and 91% of land deforested since 1970, is used for livestock pasture. Currently, Brazil is the second-largest global producer of soybeans after the United States. New research however, conducted by Leydimere Oliveira et al., has shown that the more rainforest is logged in the Amazon, the less precipitation reaches the area and so the lower the yield per hectare becomes. So despite the popular perception, there has been no economical advantage for Brazil from logging rainforest zones and converting these to pastoral fields.',
 'question': 'What consumer product are the rain forests cleared for?',
 'answers': {'text': ['soybeans'], 'answer_st

In [10]:
from transformers import AutoTokenizer 

tokenizer = AutoTokenizer.from_pretrained("deepset/xlm-roberta-large-squad2")

In [11]:
adv_qa["train"].filter(lambda x: len(x["answers"]["text"]) != 1)



Dataset({
    features: ['id', 'title', 'context', 'question', 'answers', 'metadata'],
    num_rows: 0
})

In [12]:
tokenizer.is_fast

True

In [13]:
def preprocess_function(examples):
  questions = [q.strip() for q in examples["question"]]
  inputs = tokenizer(
      questions,
      examples["context"],
      max_length=384,
      truncation="only_second",
      stride=128,
      return_overflowing_tokens=True,
      return_offsets_mapping=True,
      padding="max_length",
  )

  offset_mapping = inputs.pop("offset_mapping")
  sample_map = inputs.pop("overflow_to_sample_mapping")
  answers = examples["answers"]
  start_positions = []
  end_positions = []

  for i, offset in enumerate(offset_mapping):
    sample_idx = sample_map[i]
    answer = answers[sample_idx]
    start_char = answer["answer_start"][0]
    end_char = answer["answer_start"][0] + len(answer["text"][0])
    sequence_ids = inputs.sequence_ids(i)

    idx = 0
    while sequence_ids[idx] != 1:
      idx += 1 
    context_start = idx
    while sequence_ids[idx] == 1:
      idx += 1
    context_end = idx - 1

    if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
      start_positions.append(0)
      end_positions.append(0)
    else:
      idx = context_start 
      while idx <= context_end and offset[idx][0] <= start_char:
        idx += 1 
      start_positions.append(idx - 1)

      idx = context_end 
      while idx >= context_start and offset[idx][1] >= end_char:
        idx -= 1
      end_positions.append(idx + 1)

  inputs["start_positions"] = start_positions 
  inputs["end_positions"] = end_positions 
  return inputs

In [37]:
train_dataset = adv_qa["train"].map(preprocess_function, batched=True, remove_columns=adv_qa["train"].column_names)



In [15]:
len(adv_qa["train"]), len(tokenized_squad)

(10000, 10254)

In [18]:
def preprocess_validation_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_map = inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    for i in range(len(inputs["input_ids"])):
        sample_idx = sample_map[i]
        example_ids.append(examples["id"][sample_idx])

        sequence_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]
        inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]

    inputs["example_id"] = example_ids
    return inputs

In [19]:
validation_dataset = adv_qa["validation"].map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=adv_qa["validation"].column_names,
)
len(adv_qa["validation"]), len(validation_dataset)

  0%|          | 0/1 [00:00<?, ?ba/s]

(1000, 1004)

In [20]:
small_eval_set = adv_qa["validation"].select(range(100))
trained_checkpoint = "distilbert-base-cased-distilled-squad"

tokenizer = AutoTokenizer.from_pretrained(trained_checkpoint)
eval_set = small_eval_set.map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=adv_qa["validation"].column_names,
)

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/473 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/436k [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [21]:
tokenizer = AutoTokenizer.from_pretrained("deepset/xlm-roberta-large-squad2")

In [22]:
import torch
from transformers import AutoModelForQuestionAnswering

eval_set_for_model = eval_set.remove_columns(["example_id", "offset_mapping"])
eval_set_for_model.set_format("torch")

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
batch = {k: eval_set_for_model[k].to(device) for k in eval_set_for_model.column_names}
trained_model = AutoModelForQuestionAnswering.from_pretrained(trained_checkpoint).to(
    device
)

with torch.no_grad():
    outputs = trained_model(**batch)

Downloading:   0%|          | 0.00/261M [00:00<?, ?B/s]

In [23]:
start_logits = outputs.start_logits.cpu().numpy()
end_logits = outputs.end_logits.cpu().numpy()

In [24]:
import collections

example_to_features = collections.defaultdict(list)
for idx, feature in enumerate(eval_set):
    example_to_features[feature["example_id"]].append(idx)

In [25]:
import numpy as np

n_best = 20
max_answer_length = 30
predicted_answers = []

for example in small_eval_set:
    example_id = example["id"]
    context = example["context"]
    answers = []

    for feature_index in example_to_features[example_id]:
        start_logit = start_logits[feature_index]
        end_logit = end_logits[feature_index]
        offsets = eval_set["offset_mapping"][feature_index]

        start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
        end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
        for start_index in start_indexes:
            for end_index in end_indexes:
                # Skip answers that are not fully in the context
                if offsets[start_index] is None or offsets[end_index] is None:
                    continue
                # Skip answers with a length that is either < 0 or > max_answer_length.
                if (
                    end_index < start_index
                    or end_index - start_index + 1 > max_answer_length
                ):
                    continue

                answers.append(
                    {
                        "text": context[offsets[start_index][0] : offsets[end_index][1]],
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                    }
                )

    best_answer = max(answers, key=lambda x: x["logit_score"])
    predicted_answers.append({"id": example_id, "prediction_text": best_answer["text"]})

In [26]:
import evaluate

metric = evaluate.load("squad")

Downloading builder script:   0%|          | 0.00/4.53k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.32k [00:00<?, ?B/s]

In [28]:
theoretical_answers = [
    {"id": ex["id"], "answers": ex["answers"]} for ex in small_eval_set
]

In [29]:
print(predicted_answers[0])
print(theoretical_answers[0])

{'id': '0622161680ff6c97fa14a25b01a4bf3bc289d72f', 'prediction_text': 'livestock pasture'}
{'id': '0622161680ff6c97fa14a25b01a4bf3bc289d72f', 'answers': {'text': ['soybeans'], 'answer_start': [386]}}


In [30]:
metric.compute(predictions=predicted_answers, references=theoretical_answers)

{'exact_match': 15.0, 'f1': 22.706822442998238}

In [31]:
from tqdm.auto import tqdm


def compute_metrics(start_logits, end_logits, features, examples):
    example_to_features = collections.defaultdict(list)
    for idx, feature in enumerate(features):
        example_to_features[feature["example_id"]].append(idx)

    predicted_answers = []
    for example in tqdm(examples):
        example_id = example["id"]
        context = example["context"]
        answers = []

        # Loop through all features associated with that example
        for feature_index in example_to_features[example_id]:
            start_logit = start_logits[feature_index]
            end_logit = end_logits[feature_index]
            offsets = features[feature_index]["offset_mapping"]

            start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
            end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Skip answers that are not fully in the context
                    if offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    # Skip answers with a length that is either < 0 or > max_answer_length
                    if (
                        end_index < start_index
                        or end_index - start_index + 1 > max_answer_length
                    ):
                        continue

                    answer = {
                        "text": context[offsets[start_index][0] : offsets[end_index][1]],
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                    }
                    answers.append(answer)

        # Select the answer with the best score
        if len(answers) > 0:
            best_answer = max(answers, key=lambda x: x["logit_score"])
            predicted_answers.append(
                {"id": example_id, "prediction_text": best_answer["text"]}
            )
        else:
            predicted_answers.append({"id": example_id, "prediction_text": ""})

    theoretical_answers = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples]
    return metric.compute(predictions=predicted_answers, references=theoretical_answers)

In [32]:
compute_metrics(start_logits, end_logits, eval_set, small_eval_set)

  0%|          | 0/100 [00:00<?, ?it/s]

{'exact_match': 15.0, 'f1': 22.706822442998238}

In [33]:
model = AutoModelForQuestionAnswering.from_pretrained("deepset/xlm-roberta-large-squad2")

Downloading:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

In [35]:
from transformers import TrainingArguments

args = TrainingArguments(
    "deepset/xlm-roberta-large-squad2",
    evaluation_strategy="no",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    fp16=True,
)

In [38]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
)
trainer.train()

Using cuda_amp half precision backend
***** Running training *****
  Num examples = 10254
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 3846
  Number of trainable parameters = 558842882
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss


OutOfMemoryError: ignored