# 01 Basic QA Pipeline

https://haystack.deepset.ai/tutorials/01_basic_qa_pipeline

In [1]:
from haystack.telemetry import tutorial_running

# Call Haystack's telemetry hook with this tutorial's number (1, matching the
# "01" in the notebook title). Presumably this only reports that the tutorial
# is being run -- confirm against the haystack.telemetry documentation.
tutorial_running(1)



In [2]:
import logging

# Default for every logger: show warnings and above, formatted as
# "<level> - <logger name> -  <message>".
logging.basicConfig(
    level=logging.WARNING,
    format="%(levelname)s - %(name)s -  %(message)s",
)

# Haystack's own loggers are lowered to INFO so that its progress messages
# appear in the cell outputs below.
logging.getLogger("haystack").setLevel(logging.INFO)

In [3]:
from haystack.document_stores import InMemoryDocumentStore

# Document store that the rest of the notebook indexes into and retrieves from.
# use_bm25=True turns on the BM25 representation; the indexing cell reports
# "Updating BM25 representation..." and the BM25Retriever created later is
# bound to this store.
document_store = InMemoryDocumentStore(use_bm25=True)

INFO - haystack.modeling.utils -  Using devices: CUDA:0 - Number of GPUs: 1


In [4]:
from haystack.utils import fetch_archive_from_http

# Folder that receives the unpacked archive; the indexing cell lists its contents.
doc_dir = "data/build_your_first_question_answering_system"

# Zip archive with the Game of Thrones wiki text files used in this tutorial.
archive_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt1.zip"

# Last expression of the cell, so its return value is displayed. In the
# recorded run the folder already existed, nothing was fetched and the
# displayed value was False.
fetch_archive_from_http(url=archive_url, output_dir=doc_dir)


INFO - haystack.utils.import_utils -  Found data stored in 'data/build_your_first_question_answering_system'. Delete this first if you really want to fetch new data.


False

In [5]:
# Optional one-time setup, left disabled: remove the leading "#" to install
# nltk and the farm-haystack "preprocessing" extra into the current environment.
#!pip install nltk farm-haystack[preprocessing]

Defaulting to user installation because normal site-packages is not writeable


In [6]:
import os
from haystack.pipelines.standard_pipelines import TextIndexingPipeline

In [7]:
# Build the path of every entry in the downloaded folder.
# - os.path.join uses the platform's separator instead of a hard-coded "/".
# - os.listdir returns names in arbitrary order, so the list is sorted to make
#   the indexing order the same on every run and every machine.
files_to_index = [os.path.join(doc_dir, name) for name in sorted(os.listdir(doc_dir))]

# The pipeline is bound to document_store; run_batch processes all listed files
# in one call. It stays the last expression so the cell still displays the
# returned dictionary (its 'documents' entry lists the resulting Document objects).
indexing_pipeline = TextIndexingPipeline(document_store)
indexing_pipeline.run_batch(file_paths=files_to_index)

[nltk_data] Downloading package punkt to /home/sean/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
INFO - haystack.pipelines.base -  It seems that an indexing Pipeline is run, so using the nodes' run method instead of run_batch.
Converting files: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 183/183 [00:00<00:00, 662.37it/s]
Preprocessing: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 183/183 [00:00<00:00, 499.64docs/s]
Updating BM25 representation...: 100%|████████████████████████████████████████████████████████████████████████████| 2359/2359 [00:00<00:00, 46124.77 docs/s]


{'documents': [<Document: {'content': '\n\nThe \'\'\'Dothraki language\'\'\' is a constructed fictional language in George R. R. Martin\'s fantasy novel series \'\'A Song of Ice and Fire\'\' and its television adaptation \'\'Game of Thrones\'\'. It is spoken by the Dothraki, a nomadic people in the series\'s fictional world. The language was developed for the TV series by the linguist David J. Peterson, working off the Dothraki words and phrases in Martin\'s novels.\n\n, the language comprised 3163 words, not all of which have been made public. In 2012, 146 newborn girls in the United States were named "Khaleesi", the Dothraki term for the wife of a \'\'khal\'\' or ruler, and the title adopted in the series by Daenerys Targaryen. Dothraki and Valyrian have been described as "the most convincing fictional tongues since Elvish".\n\n==Development==\nDavid J. Peterson, creator of the Dothraki spoken language for \'\'Game of Thrones\'\'\n\nThe Dothraki vocabulary was created by David J. Pet

In [8]:
from haystack.nodes import BM25Retriever

# The retriever is bound to the same document_store that the indexing cell
# filled; that store was created with use_bm25=True.
retriever = BM25Retriever(document_store=document_store)


A Reader scans the texts it receives from the Retriever and extracts the top answer candidates. Readers are based on powerful deep learning models but are much slower than Retrievers at processing the same amount of text. In this tutorial, we’re using a `FARMReader` with a base-sized RoBERTa question answering model called `deepset/roberta-base-squad2`. It’s a strong all-round model that’s good as a starting point. To find the best model for your use case, see the Models section of the Haystack Reader documentation.

In [9]:
from haystack.nodes import FARMReader

# Load the question answering model described in the paragraph above. In the
# recorded run it was loaded from the model hub (see the log output below).
# use_gpu=True requests GPU inference; the recorded run used device CUDA:0.
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True)

INFO - haystack.modeling.utils -  Using devices: CUDA:0 - Number of GPUs: 1
INFO - haystack.modeling.utils -  Using devices: CUDA:0 - Number of GPUs: 1


config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

INFO - haystack.modeling.model.language_model -   * LOADING MODEL: 'deepset/roberta-base-squad2' (Roberta)


model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

INFO - haystack.modeling.model.language_model -  Auto-detected model language: english
INFO - haystack.modeling.model.language_model -  Loaded 'deepset/roberta-base-squad2' (Roberta model) from model hub.


tokenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

INFO - haystack.modeling.utils -  Using devices: CUDA:0 - Number of GPUs: 1


In [10]:
from haystack.pipelines import ExtractiveQAPipeline

# Combine the two nodes into a single question-answering pipeline.
# Note the positional order: reader first, then retriever.
pipe = ExtractiveQAPipeline(reader, retriever)

In [11]:
# Ask one question. `params` is keyed by pipeline node name and carries a
# separate top_k setting for each node.
prediction = pipe.run(
    query="Who is the father of Arya Stark?",
    params={
        "Retriever": {"top_k": 10},
        "Reader": {"top_k": 5},
    },
)

Inferencing Samples: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  6.49 Batches/s]


In [12]:
from pprint import pprint

# Pretty-print the raw result dictionary from pipe.run(). Its 'answers' entry
# holds Answer objects with the answer text, score, context, offsets and
# source document ids.
pprint(prediction)

{'answers': [<Answer {'answer': 'Eddard', 'type': 'extractive', 'score': 0.993372917175293, 'context': "s Nymeria after a legendary warrior queen. She travels with her father, Eddard, to King's Landing when he is made Hand of the King. Before she leaves,", 'offsets_in_document': [{'start': 207, 'end': 213}], 'offsets_in_context': [{'start': 72, 'end': 78}], 'document_ids': ['9e3c863097d66aeed9992e0b6bf1f2f4'], 'meta': {'_split_id': 3}}>,
             <Answer {'answer': 'Ned', 'type': 'extractive', 'score': 0.9753613471984863, 'context': "k in the television series.\n\n====Season 1====\nArya accompanies her father Ned and her sister Sansa to King's Landing. Before their departure, Arya's h", 'offsets_in_document': [{'start': 630, 'end': 633}], 'offsets_in_context': [{'start': 74, 'end': 77}], 'document_ids': ['7d3360fa29130e69ea6b2ba5c5a8f9c8'], 'meta': {'_split_id': 10}}>,
             <Answer {'answer': 'Lord Eddard Stark', 'type': 'extractive', 'score': 0.9177321195602417, 'context':

In [13]:
from haystack.utils import print_answers

# More compact view of the same prediction. `details` controls how much is
# printed per answer: choose from `minimum`, `medium`, and `all`. With
# `minimum`, only each answer and its context are shown (see the output below).
print_answers(prediction, details="minimum")

'Query: Who is the father of Arya Stark?'
'Answers:'
[   {   'answer': 'Eddard',
        'context': 's Nymeria after a legendary warrior queen. She travels '
                   "with her father, Eddard, to King's Landing when he is made "
                   'Hand of the King. Before she leaves,'},
    {   'answer': 'Ned',
        'context': 'k in the television series.\n'
                   '\n'
                   '====Season 1====\n'
                   'Arya accompanies her father Ned and her sister Sansa to '
                   "King's Landing. Before their departure, Arya's h"},
    {   'answer': 'Lord Eddard Stark',
        'context': 'rk daughters.\n'
                   '\n'
                   'During the Tourney of the Hand to honour her father Lord '
                   'Eddard Stark, Sansa Stark is enchanted by the knights '
                   'performing in the event.'},
    {   'answer': 'Ned',
        'context': ' girl disguised as a boy all along and is surprised to '
      