Copyright (C) 2022, Intel Corporation

SPDX-License-Identifier: MIT

# Question Answering with RoBERTa in Python using the Torch-ORT inference module

1. This demo shows how to use Intel® OpenVINO™ integration with Torch-ORT to extract the answer from a given context based on the questions.

2. We use the question answering model deepset/roberta-base-squad2 from the HuggingFace model hub. This model is based on the RoBERTa architecture and is fine-tuned for extractive question answering.

### Import Necessary Resources

In [41]:
import os
import numpy as np
import time
import pandas as pd
import pathlib

from transformers import AutoModelForQuestionAnswering, AutoTokenizer

import torch
from torch_ort import ORTInferenceModule, OpenVINOProviderOptions

### Preprocess Function

* Use auto tokenizer for question answering to preprocess the inputs

In [42]:
def preprocess_input(tokenizer, questions, context):
    """Tokenize every question together with the shared context.

    Each question is paired with ``context`` and handed to ``tokenizer`` with
    special tokens enabled and ``return_tensors="pt"``.

    Returns:
        A list holding one tokenizer result per question, in input order.
    """
    return [
        tokenizer(query, context, add_special_tokens=True, return_tensors="pt")
        for query in questions
    ]

### Inference Function
* Runs the inference on the input sentences
* Prints the inference results

In [43]:
def infer(model, tokenizer, questions, inputs):
    """Answer each question with ``model`` and print the answers and timings.

    Args:
        model: Callable invoked as ``model(**inputs[i])``; the returned object
            must expose ``start_logits`` and ``end_logits``.
        tokenizer: Object providing ``convert_ids_to_tokens`` and
            ``convert_tokens_to_string``, used to decode the answer span.
        questions: Sequence of question strings.
        inputs: Per-question model inputs, index-aligned with ``questions``.
            Each entry must contain an ``"input_ids"`` tensor whose first row
            holds the token ids of the question/context pair.

    Returns:
        None. The answers and the timing summary are printed to stdout.
    """
    num_questions = len(questions)
    total_infer_time = 0.0
    # A list of (question, answer) pairs rather than a dict, so that a
    # question asked more than once is reported once per occurrence.
    results = []

    for i, question in enumerate(questions):
        encoded = inputs[i]
        input_ids = encoded["input_ids"].tolist()[0]

        with torch.no_grad():
            if i == 0:
                # Untimed warm-up pass before the first measured call.
                model(**encoded)
            # perf_counter is monotonic and meant for measuring intervals.
            started = time.perf_counter()
            outputs = model(**encoded)
            total_infer_time += time.perf_counter() - started

        # The logits are the raw scores before any activation such as softmax;
        # the highest-scoring positions delimit the predicted answer span.
        answer_start = int(torch.argmax(outputs.start_logits))
        # +1 because the slice end below is exclusive.
        answer_end = int(torch.argmax(outputs.end_logits)) + 1

        # Decode the selected token ids back into text.
        answer_tokens = tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end])
        results.append((question, tokenizer.convert_tokens_to_string(answer_tokens)))

    print("\n Number of questions: {}".format(num_questions))
    for asked, answer in results:
        print("\tQuestion: {!r} \n\tAnswer: {!r} \n".format(asked, answer))

    # Guard against division by zero when no questions were supplied.
    average_infer_time = total_infer_time / num_questions if num_questions else 0.0
    print("\n Average inference time: {:.4f}ms".format(average_infer_time * 1000))
    print(" Total Inference time: {:.4f}ms".format(total_infer_time * 1000))

### Select the inputs

* Use `questions` to select your input sentences to be inferred

* Available backend precisions
  * CPU: FP32
  * GPU (does not work in Colab): FP32, FP16

* Available inference execution providers 
  * OpenVINO

In [52]:
# Passage the questions are answered from. It is assembled with implicit
# string concatenation instead of backslash continuations so that the
# indentation of the source lines does not end up inside the text.
context = (
    "Computational complexity theory is a branch of the theory of computation "
    "in theoretical computer science that focuses on classifying "
    "computational problems according to their inherent difficulty, and "
    "relating those classes to each other. A computational problem is "
    "understood to be a task that is in principle amenable to being solved by "
    "a computer, which is equivalent to stating that the problem may be "
    "solved by mechanical application of mathematical steps, such as an "
    "algorithm."
)


# Collect questions from stdin, one per line; the first empty line ends input.
print("Enter questions within context to be answered(sample question will be chosen if no questions are given): ")
questions = list(iter(input, ""))

# Fallback used when nothing was entered.
example_questions = [
    "What is the term for a task that generally lends itself to being solved by a computer?"
]

if not questions:
    print("Sample question was chosen")
    print("Question: What is the term for a task that generally lends itself to being solved by a computer?")
    questions = example_questions

# Device and precision handed to the OpenVINO provider options.
backend = "CPU"
precision = "FP32"

Enter questions within context to be answered(sample question will be chosen if no questions are given): 

Sample question was chosen
Question: What is the term for a task that generally lends itself to being solved by a computer?


### Get and load the model

In [45]:
# Hugging Face Hub checkpoint shared by the model and its tokenizer:
# a RoBERTa-base model fine-tuned on SQuAD 2.0 for extractive question answering.
model_id = "deepset/roberta-base-squad2"

# Load the question answering model
model = AutoModelForQuestionAnswering.from_pretrained(model_id)


# Load the matching tokenizer and encode each question with the context
tokenizer = AutoTokenizer.from_pretrained(model_id)
preprocessed_inputs = preprocess_input(tokenizer, questions, context)


# Switch the model to evaluation mode. The trailing ';' is kept on purpose:
# in a notebook it stops the cell from echoing the returned model.
model.eval();

### Run the inference with native PyTorch

In [46]:
# Baseline run: answer the questions with the model as loaded above,
# before it is wrapped by the Torch-ORT inference module.
infer(model, tokenizer, questions, preprocessed_inputs)


 Number of questions: 1
	Question: 'What is the term for a task that generally lends itself to being solved by a computer?' 
	Answer: ' computational problem' 


 Average inference time: 1168.6697ms
 Total Inference time: 1168.6697ms


### Run the inference with Torch-ORT inference module

In [47]:
# Wrap the model with the Torch-ORT inference module. OpenVINO provider
# options are passed only when both a backend and a precision are set;
# otherwise the module is created with its defaults.
ort_kwargs = {}
if backend and precision:
    provider_options = OpenVINOProviderOptions(backend, precision)
    ort_kwargs["provider_options"] = provider_options
model_ort = ORTInferenceModule(model, **ort_kwargs)

# Run the same questions through the wrapped model
infer(model_ort, tokenizer, questions, preprocessed_inputs)


 Number of questions: 1
	Question: 'What is the term for a task that generally lends itself to being solved by a computer?' 
	Answer: ' computational problem' 


 Average inference time: 476.4693ms
 Total Inference time: 476.4693ms
