# Question answering (PyTorch)

In [4]:
# Install the libraries used below via shell escapes (versions are not pinned).
# The install log recorded with this notebook shows datasets 2.12.0, evaluate 0.4.0,
# transformers 4.29.2 and accelerate 0.19.0 being downloaded.
!pip install datasets evaluate transformers[sentencepiece]
!pip install accelerate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers[sentencepiece]
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m75.7 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.7,>=0.3.0 (from datasets)
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.2.0-c

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting accelerate
  Downloading accelerate-0.19.0-py3-none-any.whl (219 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m219.1/219.1 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.19.0


In [5]:
import random
import json
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer
from transformers import DefaultDataCollator
from transformers import TrainingArguments
from transformers import Trainer
from torch.utils.data import DataLoader
from transformers import default_data_collator
from accelerate import Accelerator
from transformers import BertForQuestionAnswering
from tqdm.auto import tqdm
from torch.optim import AdamW
from transformers import get_scheduler

In [None]:
import os
from google.colab import drive

# Mount the Google Drive storage
drive.mount('/content/drive')

# Move to the directory with the notebooks and the data set
# (note, you may need to change this path according to your directory structure).
# Later cells open 'train-v2.0.json' / 'dev-v2.0.json' with relative paths, so they
# are resolved against this directory.
os.chdir('/content/drive/MyDrive/')

# Check the current working directory (displayed as the cell output)
os.getcwd()

Mounted at /content/drive


'/content/drive/MyDrive'

# Loading the dataset

In [None]:
def load_data(file_path):
    """Read a SQuAD-style JSON file and return its top-level ``data`` list.

    Args:
        file_path: Path to a JSON file whose root object has a ``data`` key.

    Returns:
        The value stored under ``data`` (a list of articles in the SQuAD files
        loaded below).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If the root object has no ``data`` key.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding, so non-ASCII text loads the same on every machine.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return data['data']

# Load the raw article lists for both splits; the relative paths are resolved
# against the current working directory.
train_json = load_data('train-v2.0.json')
validation_json = load_data('dev-v2.0.json')

In [None]:
def extract_data_from_json(data):
    """Flatten SQuAD articles into one record per answerable question (first answer only).

    Args:
        data: Iterable of article dicts, each with a ``paragraphs`` list whose
            items hold a ``context`` string and a ``qas`` list.

    Returns:
        A list of dicts with keys ``id``, ``context``, ``question`` and
        ``answers``, where ``answers`` is
        ``{'text': [first_text], 'answer_start': [first_start_as_int]}``.

    Questions flagged ``is_impossible`` are dropped. Records without that key
    are treated as answerable, and answerable questions with an empty
    ``answers`` list are skipped because there is no span to keep.
    """
    dataset = []

    for article in data:
        for paragraph in article['paragraphs']:
            context = paragraph['context']
            for qas in paragraph['qas']:
                # A missing flag means "answerable" rather than raising KeyError.
                if qas.get('is_impossible', False):
                    continue
                answers = qas['answers']
                if not answers:
                    # No gold span to keep; skip instead of failing on answers[0].
                    continue
                first_answer = answers[0]
                dataset.append({
                    'id': qas['id'],
                    'context': context,
                    'question': qas['question'],
                    'answers': {
                        'text': [first_answer['text']],
                        'answer_start': [int(first_answer['answer_start'])],
                    },
                })

    return dataset

# Build the training records with the definition above, which keeps only the
# first answer of each answerable question.
train_data = extract_data_from_json(train_json)

def extract_data_from_json(data):
    """Flatten SQuAD articles into one record per answerable question (all answers).

    NOTE: this re-uses the name of the function defined earlier in this cell
    and replaces it from here on. Unlike that earlier version, every reference
    answer of a question is kept.

    Args:
        data: Iterable of article dicts, each with a ``paragraphs`` list whose
            items hold a ``context`` string and a ``qas`` list.

    Returns:
        A list of dicts with keys ``id``, ``context``, ``question`` and
        ``answers``, where ``answers`` is
        ``{'text': [...], 'answer_start': [...]}`` with one entry per reference
        answer; start offsets are cast to ``int``.

    Questions flagged ``is_impossible`` are dropped; records without that key
    are treated as answerable.
    """
    dataset = []

    for article in data:
        for paragraph in article['paragraphs']:
            context = paragraph['context']
            for qas in paragraph['qas']:
                # A missing flag means "answerable" rather than raising KeyError.
                if qas.get('is_impossible', False):
                    continue
                answers = qas['answers']
                dataset.append({
                    'id': qas['id'],
                    'context': context,
                    'question': qas['question'],
                    'answers': {
                        'text': [a['text'] for a in answers],
                        # Cast to int, matching the first-answer-only extractor.
                        'answer_start': [int(a['answer_start']) for a in answers],
                    },
                })

    return dataset

# Build the validation records with the redefinition above, which keeps every
# reference answer of each answerable question.
dev_data = extract_data_from_json(validation_json)

# Disabled: when uncommented, replaces train_data with a random 20% sample of itself.
#train_data = random.sample(train_data, int(len(train_data) * 0.2))

In [None]:
# Bundle both splits into a single DatasetDict keyed by split name.
raw_datasets = DatasetDict(
    {
        'train': Dataset.from_list(train_data),
        'validation': Dataset.from_list(dev_data),
    }
)
# Bare expression: shows the split names, features and row counts as cell output.
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'context', 'question', 'answers'],
        num_rows: 86821
    })
    validation: Dataset({
        features: ['id', 'context', 'question', 'answers'],
        num_rows: 5928
    })
})

# Hyper-Parameters

In [None]:
# Checkpoint name used for both the tokenizer and the model.
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenization: each question+context feature is padded/truncated to max_length
# tokens; `stride` is passed to the tokenizer as the stride between overflowing features.
max_length = 384
stride = 128
# Optimisation settings passed to TrainingArguments.
num_train_epochs = 3
learning_rate = 2e-5

# Post-processing in compute_metrics: number of top start / end logits combined
# per feature, and the longest accepted answer span (counted in tokens).
n_best = 20
max_answer_length = 50

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

# Preprocessing

In [None]:
def preprocess_training(data):
    """Tokenize a batch of question/context pairs and label answer token spans.

    Long contexts overflow into several features. For every feature the
    character span of the first answer is converted into start/end token
    indices; a feature that does not fully contain the answer is labelled
    ``(0, 0)``.

    Args:
        data: Batch with ``question``, ``context`` and ``answers`` columns
            (``answers`` items hold ``text`` and ``answer_start`` lists).

    Returns:
        The tokenizer output with ``offset_mapping`` and
        ``overflow_to_sample_mapping`` removed and ``start_positions`` /
        ``end_positions`` added.
    """
    inputs = tokenizer(
        [question.strip() for question in data["question"]],
        data["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = data["answers"]
    start_positions, end_positions = [], []

    for feature_idx, offsets in enumerate(offset_mapping):
        # Each feature points back to the example it was cut from.
        answer = answers[sample_map[feature_idx]]
        answer_begin = answer["answer_start"][0]
        answer_finish = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(feature_idx)

        # Locate the first and last token whose sequence id is 1 (the context).
        context_start = 0
        while sequence_ids[context_start] != 1:
            context_start += 1
        cursor = context_start
        while sequence_ids[cursor] == 1:
            cursor += 1
        context_end = cursor - 1

        answer_inside = (
            offsets[context_start][0] <= answer_begin
            and offsets[context_end][1] >= answer_finish
        )
        if not answer_inside:
            # The answer is not fully inside this window: label is (0, 0).
            start_positions.append(0)
            end_positions.append(0)
            continue

        # Scan forward to the last token starting at or before the answer start.
        token_start = context_start
        while token_start <= context_end and offsets[token_start][0] <= answer_begin:
            token_start += 1
        start_positions.append(token_start - 1)

        # Scan backward to the first token ending at or after the answer end.
        token_end = context_end
        while token_end >= context_start and offsets[token_end][1] >= answer_finish:
            token_end -= 1
        end_positions.append(token_end + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [None]:
# Tokenize and label the training split in batches. The original columns are
# removed because one example can yield several features, so row counts differ.
train_dataset = raw_datasets["train"].map(
    preprocess_training,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)
# Displayed as (number of examples, number of features after windowing).
len(raw_datasets["train"]), len(train_dataset)

Map:   0%|          | 0/86821 [00:00<?, ? examples/s]

(86821, 87739)

In [None]:
def preprocess_validation(data):
    """Tokenize a batch for evaluation, keeping what is needed to rebuild answers.

    Args:
        data: Batch with ``id``, ``question`` and ``context`` columns.

    Returns:
        The tokenizer output with ``overflow_to_sample_mapping`` removed, an
        ``example_id`` entry giving the source example id of every feature,
        and ``offset_mapping`` entries set to ``None`` for tokens whose
        sequence id is not 1 (i.e. tokens outside the context).
    """
    inputs = tokenizer(
        [question.strip() for question in data["question"]],
        data["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_map = inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    feature_count = len(inputs["input_ids"])
    for feature_idx in range(feature_count):
        # Remember which example this feature came from.
        example_ids.append(data["id"][sample_map[feature_idx]])

        sequence_ids = inputs.sequence_ids(feature_idx)
        feature_offsets = inputs["offset_mapping"][feature_idx]
        # Blank out offsets of non-context tokens so they can be skipped later.
        masked_offsets = []
        for token_idx, span in enumerate(feature_offsets):
            masked_offsets.append(span if sequence_ids[token_idx] == 1 else None)
        inputs["offset_mapping"][feature_idx] = masked_offsets

    inputs["example_id"] = example_ids
    return inputs

In [None]:
# Tokenize the validation split in batches; the original columns are removed
# because one example can yield several features.
validation_dataset = raw_datasets["validation"].map(
    preprocess_validation,
    batched=True,
    remove_columns=raw_datasets["validation"].column_names,
)
# Displayed as (number of examples, number of features after windowing).
len(raw_datasets["validation"]), len(validation_dataset)

Map:   0%|          | 0/5928 [00:00<?, ? examples/s]

(5928, 6056)

# Metrics: Exact Match & F1 Score

In [None]:
import evaluate
import collections
import numpy as np

# Load the "squad" metric; compute_metrics below calls metric.compute on it.
metric = evaluate.load("squad")

def compute_metrics(start_logits, end_logits, features, examples):
    """Turn per-feature start/end logits into text answers and score them.

    Args:
        start_logits: Per-feature start logits, indexed like ``features``.
        end_logits: Per-feature end logits, indexed like ``features``.
        features: Tokenized features; each provides ``example_id`` and
            ``offset_mapping`` (``None`` for tokens outside the context).
        examples: Original examples with ``id``, ``context`` and ``answers``.

    Returns:
        The result of ``metric.compute`` on the predicted and reference answers.
    """
    # Dictionary where key = example id and value = indices of its features
    # (the context windows cut from that example).
    example_to_features = collections.defaultdict(list)
    for feature_idx, feature in enumerate(features):
        example_to_features[feature["example_id"]].append(feature_idx)

    predicted_answers = []
    for example in tqdm(examples):
        example_id = example["id"]
        context = example["context"]
        candidates = []

        # Collect candidate spans from every feature of this example.
        for feature_index in example_to_features[example_id]:
            feature_start_logits = start_logits[feature_index]
            feature_end_logits = end_logits[feature_index]
            offsets = features[feature_index]["offset_mapping"]

            # Indices of the n_best highest logits, best first.
            top_starts = np.argsort(feature_start_logits)[-1 : -n_best - 1 : -1].tolist()
            top_ends = np.argsort(feature_end_logits)[-1 : -n_best - 1 : -1].tolist()
            for start_index in top_starts:
                for end_index in top_ends:
                    # Skip spans touching a token outside the context.
                    if offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    # Skip reversed spans and spans longer than max_answer_length.
                    span_length = end_index - start_index + 1
                    if end_index < start_index or span_length > max_answer_length:
                        continue

                    span_begin = offsets[start_index][0]
                    span_finish = offsets[end_index][1]
                    candidates.append(
                        {
                            "text": context[span_begin:span_finish],
                            "logit_score": feature_start_logits[start_index]
                            + feature_end_logits[end_index],
                        }
                    )

        # Keep the highest-scoring candidate, or an empty string when none survived.
        if len(candidates) > 0:
            best_candidate = max(candidates, key=lambda cand: cand["logit_score"])
            prediction_text = best_candidate["text"]
        else:
            prediction_text = ""
        predicted_answers.append({"id": example_id, "prediction_text": prediction_text})

    theoretical_answers = [
        {"id": reference["id"], "answers": reference["answers"]} for reference in examples
    ]
    return metric.compute(predictions=predicted_answers, references=theoretical_answers)

Downloading builder script:   0%|          | 0.00/4.53k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.32k [00:00<?, ?B/s]

# Training of the Model

In [None]:
# Load the pretrained checkpoint with a question-answering head. The log below
# reports that some weights were not initialized from the checkpoint.
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForQuestionAnswering: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased a

In [None]:
# Training configuration.
# output_dir is a dedicated run directory rather than `model_name`: a local folder
# called "bert-base-uncased" in the working directory would be picked up by
# from_pretrained("bert-base-uncased") on a re-run instead of the hub checkpoint.
# With the working directory set to /content/drive/MyDrive/, this name also matches
# the checkpoint path loaded in the Evaluation section.
args = TrainingArguments(
    output_dir="bert-base-uncased-without-unanswerable",
    evaluation_strategy="no",  # no evaluation during training; metrics are computed afterwards
    save_strategy="epoch",  # one checkpoint per epoch
    learning_rate=learning_rate,
    num_train_epochs=num_train_epochs,
    weight_decay=0.01,
    fp16=True,  # mixed precision
)

In [None]:
from transformers import Trainer

# Fine-tune with the Trainer API. eval_dataset is supplied, but `args` sets
# evaluation_strategy="no", so presumably it is not used during this training run.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
)
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,2.715
1000,1.6782
1500,1.5047
2000,1.3866
2500,1.3245
3000,1.3178
3500,1.2647
4000,1.2461
4500,1.1745
5000,1.1938


TrainOutput(global_step=32904, training_loss=0.8518939325475009, metrics={'train_runtime': 7783.9195, 'train_samples_per_second': 33.815, 'train_steps_per_second': 4.227, 'total_flos': 5.158331131333478e+16, 'train_loss': 0.8518939325475009, 'epoch': 3.0})

# Evaluation

In [None]:
from transformers import Trainer

# Reload the fine-tuned weights from the saved checkpoint (hard-coded Drive path;
# adjust it to wherever your checkpoint was written).
model = AutoModelForQuestionAnswering.from_pretrained('/content/drive/MyDrive/bert-base-uncased-without-unanswerable/checkpoint-32904')

# Wrap the reloaded model in a Trainer, reusing `args` from the training section;
# only predict() is called on it in this cell.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
)

# predict() returns a 3-tuple; only the first element (the predictions) is kept
# and unpacked into start and end logits.
predictions, _, _ = trainer.predict(validation_dataset)
start_logits, end_logits = predictions
# Score the predictions against the untokenized validation examples.
compute_metrics(start_logits, end_logits, validation_dataset, raw_datasets["validation"])

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  0%|          | 0/5928 [00:00<?, ?it/s]

{'exact_match': 79.84143049932524, 'f1': 87.90337887037848}

In [None]:
from transformers import pipeline

# Build a question-answering pipeline from the fine-tuned checkpoint.
# Replace this with your own checkpoint
question_answerer = pipeline("question-answering", model='/content/drive/MyDrive/bert-base-uncased-without-unanswerable/checkpoint-32904')

# Free-text passage reused as the context for all the demo questions below.
context = "Natural Language Processing (NLP) concerns the computational analysis, interpretation, and production of natural language in either written or spoken form. It is an interdisciplinary research field, interesting from both theoretical and practical perspectives. Decades of research have resulted in a vast collection of symbolic, stochastic, and deep-learning based models. Such models have enable the development of applications in a vast array of fields, such as human-machine interaction and chatbots, search and question answering, translation and multilingual systems, multimodal and captioning systems, speech analysis, voice interaction and personal assistants, sentiment analysis, etc, etc. This course will provide an introduction to the important problems, models and applications in NLP. The history of NLP involves many successes and many failures, demonstrating the complexity of the topic. Initially popular symbolic models turned out to be unable to capture the intrinsic complexity of natural language. Statistical techniques such as vector-space representations and linear classifiers (e.g. Support Vector Machines) enabled important applications such as web search spam detection. Word embedding techniques then became popular and improved performance on all aspects of NLP: from morphology to semantics and dialogue. More recently sequence-to-sequence modeling with deep learning techniques have greatly improved performance on hard NLP problems such machine translation and dialog generation."
question = "Who is the subject of the context?"
# The returned dict (score, start, end, answer) is displayed as the cell output.
question_answerer(question=question, context=context)

{'score': 0.5410254001617432,
 'start': 105,
 'end': 121,
 'answer': 'natural language'}

In [None]:
# Ask a second question against the same context.
question = "Which statistical techniques are used?"
question_answerer(question=question, context=context)

{'score': 0.9456495046615601,
 'start': 1049,
 'end': 1100,
 'answer': 'vector-space representations and linear classifiers'}

In [None]:
# Ask what the acronym stands for, against the same context.
question = "What is NLP?"
question_answerer(question=question, context=context)

{'score': 0.8369839191436768,
 'start': 0,
 'end': 27,
 'answer': 'Natural Language Processing'}

In [None]:
# A more open-ended question against the same context.
question = "What is Natural Language Processing about?"
question_answerer(question=question, context=context)

{'score': 0.2584551274776459,
 'start': 47,
 'end': 121,
 'answer': 'computational analysis, interpretation, and production of natural language'}

In [None]:
# Print the module tree of the loaded model (label spelling corrected).
print("Architecture: ")
print(model)

Architure: 
BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps

In [None]:
# Sum the element counts of every parameter tensor returned by model.parameters().
total_params = sum(p.numel() for p in model.parameters())
print("Total number of parameters:", total_params)

Total number of parameters: 108893186
