In [1]:
from haystack.document_stores import InMemoryDocumentStore

# In-memory document store with BM25 support switched on (use_bm25=True);
# the BM25Retriever created later in the notebook queries this store.
document_store = InMemoryDocumentStore(use_bm25=True)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from haystack.utils import fetch_archive_from_http

# Local directory that receives the Game of Thrones wiki text files; the
# indexing cell below lists this directory to find the files to index.
doc_dir = "data/build_your_first_question_answering_system"

# Fetch the zipped dataset from the URL below into doc_dir.
fetch_archive_from_http(
    url="https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt1.zip",
    output_dir=doc_dir
)

True

In [3]:
# Convert the downloaded text files into Document objects and write them to
# the document store.
import os
from haystack.pipelines.standard_pipelines import TextIndexingPipeline

# Build each path with os.path.join (no hand-written separator) and keep regular
# files only, so a stray sub-directory inside doc_dir is never handed to the
# converter. Sorting makes the indexing order reproducible, because os.listdir
# returns entries in arbitrary order.
files_to_index = [
    os.path.join(doc_dir, name)
    for name in sorted(os.listdir(doc_dir))
    if os.path.isfile(os.path.join(doc_dir, name))
]
indexing_pipeline = TextIndexingPipeline(document_store)
# NOTE: as the last expression of the cell, the returned dict of Documents is
# echoed in full; append ';' to this line to suppress that output.
indexing_pipeline.run_batch(file_paths=files_to_index)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jr101\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
Converting files: 100%|█████████████████████████████████████████████████████████████| 183/183 [00:00<00:00, 612.05it/s]
Preprocessing:   0%|                                                                         | 0/183 [00:00<?, ?docs/s]We found one or more sentences whose word count is higher than the split length.
Preprocessing:  46%|█████████████████████████████▎                                 | 85/183 [00:00<00:00, 367.67docs/s]Document 4189b42892b3d941c035947d512b69dd is 12059 characters long after preprocessing, where the maximum length should be 10000. Something might be wrong with the splitting, check the document affected to prevent issues at query time.
Preprocessing:  73%|█████████████████████████████████████████████                 | 133/183 [00:00<00:00, 411.54docs/s]Document dd048b8e5bcb7de1be5bd3937f15442f is 14232 characte

{'documents': [<Document: {'content': "\n\nThe eighth and final season of the fantasy drama television series ''Game of Thrones'', produced by HBO, premiered on April 14, 2019, and concluded on May 19, 2019. Unlike the first six seasons, which consisted of ten episodes each, and the seventh season, which consisted of seven episodes, the eighth season consists of only six episodes.\n\nThe final season depicts the culmination of the series' two primary conflicts: the Great War against the Army of the Dead, and the Last War for control of the Iron Throne. The first half of the season involves many of the main characters converging at Winterfell with their armies in an effort to repel the Night King and his army of White Walkers and wights. The second half of the season resumes the war for the throne as Daenerys Targaryen assaults King's Landing in an attempt to unseat Cersei Lannister as the ruler of the Seven Kingdoms.\n\n", 'content_type': 'text', 'score': None, 'meta': {'_split_id': 0}

In [4]:
# Retriever: picks the documents in document_store that are most relevant to a
# query using BM25 ranking, so the reader only has to process those candidates.
from haystack.nodes import BM25Retriever

retriever = BM25Retriever(document_store=document_store)

In [5]:
# Reader: the question-answering model (deepset/roberta-base-squad2) that reads
# the retrieved documents and extracts answer spans from their text.
from haystack.nodes import FARMReader

reader = FARMReader(model_name_or_path='deepset/roberta-base-squad2')

Downloading (…)lve/main/config.json: 100%|████████████████████████████████████████████████████| 571/571 [00:00<?, ?B/s]
Downloading pytorch_model.bin: 100%|████████████████████████████████████████████████| 496M/496M [00:06<00:00, 80.5MB/s]
  return self.fget.__get__(instance, owner)()
Downloading (…)okenizer_config.json: 100%|██████████████████████████████████████████| 79.0/79.0 [00:00<00:00, 78.9kB/s]
Downloading (…)olve/main/vocab.json: 100%|██████████████████████████████████████████| 899k/899k [00:00<00:00, 3.48MB/s]
Downloading (…)olve/main/merges.txt: 100%|██████████████████████████████████████████| 456k/456k [00:00<00:00, 30.4MB/s]
Downloading (…)cial_tokens_map.json: 100%|████████████████████████████████████████████████████| 772/772 [00:00<?, ?B/s]


In [6]:
from haystack.pipelines import ExtractiveQAPipeline

# Combine the reader and retriever into a single extractive QA pipeline; the
# cells below query it through pipe.run(...).
pipe = ExtractiveQAPipeline(reader, retriever)

In [7]:
# Ask one question; per-node settings are passed through `params`, keyed by
# node name ("Retriever" and "Reader"), each with its own top_k.
prediction = pipe.run(
    query="Who is the father of Arya Stark?",
    params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}},
)

Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:05<00:00,  5.85s/ Batches]


In [9]:
from pprint import pprint
# Dump the raw prediction dict: full Answer objects including score, context,
# character offsets, source document ids and metadata.
pprint(prediction)

{'answers': [<Answer {'answer': 'Eddard', 'type': 'extractive', 'score': 0.9933727979660034, 'context': "s Nymeria after a legendary warrior queen. She travels with her father, Eddard, to King's Landing when he is made Hand of the King. Before she leaves,", 'offsets_in_document': [{'start': 207, 'end': 213}], 'offsets_in_context': [{'start': 72, 'end': 78}], 'document_ids': ['9e3c863097d66aeed9992e0b6bf1f2f4'], 'meta': {'_split_id': 3}}>,
             <Answer {'answer': 'Ned', 'type': 'extractive', 'score': 0.9753611087799072, 'context': "k in the television series.\n\n====Season 1====\nArya accompanies her father Ned and her sister Sansa to King's Landing. Before their departure, Arya's h", 'offsets_in_document': [{'start': 630, 'end': 633}], 'offsets_in_context': [{'start': 74, 'end': 77}], 'document_ids': ['7d3360fa29130e69ea6b2ba5c5a8f9c8'], 'meta': {'_split_id': 10}}>,
             <Answer {'answer': 'Lord Eddard Stark', 'type': 'extractive', 'score': 0.9177319407463074, 'context'

In [10]:
from haystack.utils import print_answers

# Compact view of the same prediction: with details="minimum" only each
# answer and its surrounding context are printed.
print_answers(prediction, details="minimum")

'Query: Who is the father of Arya Stark?'
'Answers:'
[   {   'answer': 'Eddard',
        'context': 's Nymeria after a legendary warrior queen. She travels '
                   "with her father, Eddard, to King's Landing when he is made "
                   'Hand of the King. Before she leaves,'},
    {   'answer': 'Ned',
        'context': 'k in the television series.\n'
                   '\n'
                   '====Season 1====\n'
                   'Arya accompanies her father Ned and her sister Sansa to '
                   "King's Landing. Before their departure, Arya's h"},
    {   'answer': 'Lord Eddard Stark',
        'context': 'rk daughters.\n'
                   '\n'
                   'During the Tourney of the Hand to honour her father Lord '
                   'Eddard Stark, Sansa Stark is enchanted by the knights '
                   'performing in the event.'},
    {   'answer': 'Ned',
        'context': ' girl disguised as a boy all along and is surprised to '
      

In [11]:
def question_bot(inStr, retriever_top_k=10, reader_top_k=5):
    """Run a question through the QA pipeline and return the raw prediction.

    Args:
        inStr: The question text, passed to the pipeline as ``query``.
        retriever_top_k: ``top_k`` value forwarded to the "Retriever" node.
            Defaults to 10, the value previously hard-coded here.
        reader_top_k: ``top_k`` value forwarded to the "Reader" node.
            Defaults to 5, the value previously hard-coded here.

    Returns:
        The result of ``pipe.run`` for the query, unmodified.
    """
    # `pipe` is the notebook-level ExtractiveQAPipeline built in an earlier cell.
    return pipe.run(
        query=inStr,
        params={
            "Retriever": {"top_k": retriever_top_k},
            "Reader": {"top_k": reader_top_k},
        },
    )

In [12]:
# Try a yes/no-style question. The reader is extractive, so (as the output
# below shows) it returns text spans from the documents rather than "yes"/"no".
results = question_bot("Are Ned and Eddard the same person?")
# Without a `details` argument the full Answer objects are printed.
print_answers(results)

Inferencing Samples: 100%|█████████████████████████████████████████████████████████| 1/1 [00:05<00:00,  5.95s/ Batches]

'Query: Are Ned and Eddard the same person?'
'Answers:'
[   <Answer {'answer': 'Eddard "Ned" Stark is the second son of Rickard Stark', 'type': 'extractive', 'score': 0.37394946813583374, 'context': 'use Stark\n\n=== Background ===\nAs established in \'\'A Game of Thrones\'\', Eddard "Ned" Stark is the second son of Rickard Stark, the Lord of Winterfell. ', 'offsets_in_document': [{'start': 1068, 'end': 1121}], 'offsets_in_context': [{'start': 71, 'end': 124}], 'document_ids': ['43f45e9363586e9b78731547d3fbcd72'], 'meta': {'_split_id': 4}}>,
    <Answer {'answer': 'He is the oldest legitimate son', 'type': 'extractive', 'score': 0.3635576069355011, 'context': 'He is the oldest legitimate son of Eddard "Ned" Stark and his wife Catelyn, and has five siblings: Sansa, Arya, Bran, Rickon, and Jon Snow, Ned\'s ille', 'offsets_in_document': [{'start': 0, 'end': 31}], 'offsets_in_context': [{'start': 0, 'end': 31}], 'document_ids': ['cc3050db07a9442b720f9a767b25b4d7'], 'meta': {'_split_id': 1}




In [13]:
# Same results, reduced to just the answer text and its context.
print_answers(results, details="minimum")

'Query: Are Ned and Eddard the same person?'
'Answers:'
[   {   'answer': 'Eddard "Ned" Stark is the second son of Rickard Stark',
        'context': 'use Stark\n'
                   '\n'
                   '=== Background ===\n'
                   'As established in \'\'A Game of Thrones\'\', Eddard "Ned" '
                   'Stark is the second son of Rickard Stark, the Lord of '
                   'Winterfell. '},
    {   'answer': 'He is the oldest legitimate son',
        'context': 'He is the oldest legitimate son of Eddard "Ned" Stark and '
                   'his wife Catelyn, and has five siblings: Sansa, Arya, '
                   "Bran, Rickon, and Jon Snow, Ned's ille"},
    {   'answer': "Eddard's friend",
        'context': 'taken in by the children.\n'
                   '\n'
                   "News arrives of the death of Lord Arryn, Eddard's friend "
                   "and Catelyn's brother-in-law. Winterfell receives the "
                   'royal court,'},
    { 