# SQuAD Question Answering with BERT 


Huggignface Datasets allow SQuAD by using load_dataset("squad")

In [12]:
!pip install transformers
!pip install datasets #from huggingface 



In [13]:
from datasets import load_dataset

squad = load_dataset("squad")

print("\n\n ", squad["train"][0])

Reusing dataset squad (/root/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)


  0%|          | 0/2 [00:00<?, ?it/s]



  {'id': '5733be284776f41900661182', 'title': 'University_of_Notre_Dame', 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.', 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?', 'answers': {'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]}}


In [14]:
squad

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

# Preprocess

Load the tokenizer

In [15]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

주의깊게 볼 점들 

- 데이터셋 내의 일부 문장의 경우 very long context가 있으므로  **truncation= only_second**로 세팅하여 max_length를 초과 하는 경우만 truncating을 적용하면 좋다 
- original context에 대한 답변의 [start, end] position을 매핑해주어야 하는데 이때 **return_offset_mapping = True**를 사용하면 다룰 수 있다.
- sequence_ids 메소드로 해당 오프셋이 question인지 context인지 찾을 수 있다.

In [16]:
def preprocess_function(examples):
  questions = [q.strip() for q in examples["question"]] # strip(): 공백을 제거 
  inputs = tokenizer(questions, examples["context"], max_length=384, truncation="only_second", return_offset_mapping=True, padding="max_length")

  offset_mapping = inputs.pop("offset_mapping")
  answers = examples["answers"]

  start_positions = []
  end_positions = [] 

  for i, offset in enumerate(offset_mapping):

    answer = answers[i]
    start_char = answer["answer_start"][0]
    end_char = answer["answer_start"][0] + len(answer["text"][0])
    sequence_ids = inputs.sequence_ids(i)


    idx = 0 
    while sequence_ids[idx] != 1:
      idx += 1
    context_start = idx 
    while sequence_ids[idx] == 1: 
      idx += 1 
    context_end = idx - 1 

    #If the answer is not fully inside the context, label it (0,0)
    if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
      start_positions.append(0)
      end_positions.append(0)
    else:
      #Otherwise it's the start and end token positions
       idx = context_start 
       while idx <= context_end and offset[idx][0] <= start_char:
         idx += 1 
       start_positions.append(idx - 1)

       idx = context_end 
       while idx >= context_start and offset[idx][1] >= end_char:
         idx -= 1 
       end_positions.append(idx + 1)

  inputs["start_positions"] = start_positions
  inputs["end_positions"] = end_positions
  
  return inputs





In [18]:
from transformers import default_data_collator

data_collator = default_data_collator

tokenized_squad = squad.map(preprocess_function, batched=True, remove_columns=squad["train"].column_names)

  0%|          | 0/88 [00:00<?, ?ba/s]

TypeError: ignored