# 以Transformers套件實作問答(Question Answering)功能

In [1]:
# import package
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# load model
nlp = pipeline("question-answering")

No model was supplied, defaulted to distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.
(…)distilled-squad/resolve/main/config.json: 100%|█████████████████████████████████| 473/473 [00:00<00:00, 188kB/s]
model.safetensors: 100%|████████████████████████████████████████████████████████| 261M/261M [00:07<00:00, 33.1MB/s]
(…)squad/resolve/main/tokenizer_config.json: 100%|██████████████████████████████| 29.0/29.0 [00:00<00:00, 9.52kB/s]
(…)d-distilled-squad/resolve/main/vocab.txt: 100%|███████████████████████████████| 213k/213k [00:00<00:00, 540kB/s]
(…)tilled-squad/resolve/main/tokenizer.json: 100%|███████████████████████████████| 436k/436k [00:00<00:00, 746kB/s]


In [3]:
# training data
context = r"Extractive Question Answering is the task of extracting an answer " + \
"from a text given a question. An example of a question answering " + \
"dataset is the SQuAD dataset, which is entirely based on that task. " + \
"If you would like to fine-tune a model on a SQuAD task, you may " + \
"leverage the examples/question-answering/run_squad.py script."

In [4]:
# test 2 sample
result = nlp(question="What is extractive question answering?", context=context)
print(f"Answer: '{result['answer']}', score: {round(result['score'], 4)}",
        f", start: {result['start']}, end: {result['end']}")

print()

result = nlp(question="What is a good example of a question answering dataset?", 
             context=context)
print(f"Answer: '{result['answer']}', score: {round(result['score'], 4)}",
        f", start: {result['start']}, end: {result['end']}")

Answer: 'the task of extracting an answer from a text given a question', score: 0.6226 , start: 33, end: 94

Answer: 'SQuAD dataset', score: 0.5053 , start: 146, end: 159


# 結合Tokenizer

In [5]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch

# 結合分詞器(Tokenizer)
tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
model = AutoModelForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")

(…)squad/resolve/main/tokenizer_config.json: 100%|██████████████████████████████| 28.0/28.0 [00:00<00:00, 26.9kB/s]
(…)finetuned-squad/resolve/main/config.json: 100%|█████████████████████████████████| 443/443 [00:00<00:00, 224kB/s]
(…)g-finetuned-squad/resolve/main/vocab.txt: 100%|███████████████████████████████| 232k/232k [00:00<00:00, 682kB/s]
(…)etuned-squad/resolve/main/tokenizer.json: 100%|██████████████████████████████| 466k/466k [00:00<00:00, 1.52MB/s]
model.safetensors: 100%|██████████████████████████████████████████████████████| 1.34G/1.34G [00:28<00:00, 47.6MB/s]
Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from

In [6]:
# 訓練資料
text = r"""
🤗 Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides general-purpose
architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet…) for Natural Language Understanding (NLU) and Natural
Language Generation (NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between
TensorFlow 2.0 and PyTorch.
"""

In [7]:
# 問題
questions = [
    "How many pretrained models are available in 🤗 Transformers?",
    "What does 🤗 Transformers provide?",
    "🤗 Transformers provides interoperability between which frameworks?",
]

In [10]:
# 推測答案
for question in questions:
    inputs = tokenizer(question, text, add_special_tokens=True, return_tensors="pt")
    print("inputs: ", inputs)
    input_ids = inputs["input_ids"].tolist()[0]
    print("input ids: ", input_ids)

    outputs = model(**inputs)
    answer_start_scores = outputs.start_logits
    answer_end_scores = outputs.end_logits

    # Get the most likely beginning of answer with the argmax of the score
    answer_start = torch.argmax(answer_start_scores)
    # Get the most likely end of answer with the argmax of the score
    answer_end = torch.argmax(answer_end_scores) + 1

    answer = tokenizer.convert_tokens_to_string(
        tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end])
    )

    print(f"Question: {question}")
    print(f"Answer: {answer}")

inputs:  {'input_ids': tensor([[  101,  2129,  2116,  3653, 23654,  2098,  4275,  2024,  2800,  1999,
           100, 19081,  1029,   102,   100, 19081,  1006,  3839,  2124,  2004,
          1052, 22123,  2953,  2818,  1011, 19081,  1998,  1052, 22123,  2953,
          2818,  1011,  3653, 23654,  2098,  1011, 14324,  1007,  3640,  2236,
          1011,  3800,  4294,  2015,  1006, 14324,  1010, 14246,  2102,  1011,
          1016,  1010, 23455,  1010, 28712,  2213,  1010,  4487, 16643, 23373,
          1010, 28712,  7159,  1529,  1007,  2005,  3019,  2653,  4824,  1006,
         17953,  2226,  1007,  1998,  3019,  2653,  4245,  1006, 17953,  2290,
          1007,  2007,  2058,  3590,  1009,  3653, 23654,  2098,  4275,  1999,
          2531,  1009,  4155,  1998,  2784,  6970, 25918,  8010,  2090, 23435,
         12314,  1016,  1012,  1014,  1998,  1052, 22123,  2953,  2818,  1012,
           102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,