In [1]:
### MUST BE RUNNING AN ELASTICSEARCH INSTANCE BEFORE RUNNING NOTEBOOK
# docker command - `docker run --name elastic -p 9200:9200 -e "discovery.type=single-node" -m 1G -itd docker.elastic.co/elasticsearch/elasticsearch:7.9.2`

In [17]:
import logging
import os
from haystack.document_stores import ElasticsearchDocumentStore
from haystack.utils import fetch_archive_from_http
from haystack import Pipeline
from haystack.nodes import TextConverter, PreProcessor, BM25Retriever, FARMReader

from dotenv import load_dotenv


In [18]:
load_dotenv()

True

In [2]:
logging.basicConfig(format="%(levelname)s - %(name)s -  %(message)s", level=logging.WARNING)
log = logging.getLogger("haystack")
log.setLevel(logging.INFO)

In [4]:
# Get the host where Elasticsearch is running, default to localhost
host = os.environ.get("ELASTICSEARCH_HOST", "localhost")

document_store = ElasticsearchDocumentStore(host=host, username="", password="", index="document")

In [5]:
doc_dir = "data/build_a_scalable_question_answering_system"

fetch_archive_from_http(
    url="https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt3.zip",
    output_dir=doc_dir,
)

INFO - haystack.utils.import_utils -  Found data stored in 'data/build_a_scalable_question_answering_system'. Delete this first if you really want to fetch new data.


False

In [6]:

indexing_pipeline = Pipeline()
text_converter = TextConverter()
preprocessor = PreProcessor(
    clean_whitespace=True,
    clean_header_footer=True,
    clean_empty_lines=True,
    split_by="word",
    split_length=200,
    split_overlap=20,
    split_respect_sentence_boundary=True,
)

In [7]:
indexing_pipeline.add_node(component=text_converter, name="TextConverter", inputs=["File"])
indexing_pipeline.add_node(component=preprocessor, name="PreProcessor", inputs=["TextConverter"])
indexing_pipeline.add_node(component=document_store, name="DocumentStore", inputs=["PreProcessor"])


In [8]:
files_to_index = [doc_dir + "/" + f for f in os.listdir(doc_dir)]
indexing_pipeline.run_batch(file_paths=files_to_index)
#As an alternative, you can cast you text data into Document objects and write them into the DocumentStore using DocumentStore.write_documents().


INFO - haystack.pipelines.base -  It seems that an indexing Pipeline is run, so using the nodes' run method instead of run_batch.
Converting files: 100%|██████████| 183/183 [00:03<00:00, 54.64it/s]
Preprocessing: 100%|██████████| 183/183 [00:03<00:00, 60.65docs/s]


{'documents': [<Document: {'content': '\n\n"\'\'\'Kissed by Fire\'\'\'" is the fifth episode of the third season of HBO\'s fantasy television series \'\'Game of Thrones\'\', and the 25th episode of the series. Directed by Alex Graves and written by Bryan Cogman, it aired on April 28, 2013.\n\nThe title of the episode refers to the red-haired Wildlings, like Ygritte, who are said to be "kissed by fire". Fire is also a key element in other storylines, with Sandor Clegane\'s fear of fire being shown, as well as the Mad King\'s obsession with Wildfire, as told by Jaime Lannister.\n\nThe episode won the Primetime Emmy Award for Outstanding Make-up for a Single-Camera Series (Non-Prosthetic) at the 65th Primetime Creative Arts Emmy Awards.\n\n==Plot==\n===At Dragonstone===\nStannis is surprised when his wife, Queen Selyse, encourages his infidelity as service to the Lord of Light. His daughter, Princess Shireen, visits Davos in the dungeons with a book; he admits he is illiterate, and she be

In [9]:
retriever = BM25Retriever(document_store=document_store)

In [10]:
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True)


INFO - haystack.modeling.utils -  Using devices: CPU - Number of GPUs: 0
INFO - haystack.modeling.utils -  Using devices: CPU - Number of GPUs: 0


INFO - haystack.modeling.model.language_model -   * LOADING MODEL: 'deepset/roberta-base-squad2' (Roberta)
INFO - haystack.modeling.model.language_model -  Auto-detected model language: english
INFO - haystack.modeling.model.language_model -  Loaded 'deepset/roberta-base-squad2' (Roberta model) from model hub.
INFO - haystack.modeling.utils -  Using devices: CPU - Number of GPUs: 0


In [11]:
querying_pipeline = Pipeline()
querying_pipeline.add_node(component=retriever, name="Retriever", inputs=["Query"])
querying_pipeline.add_node(component=reader, name="Reader", inputs=["Retriever"])

In [12]:
prediction = querying_pipeline.run(
    query="Who is the father of Arya Stark?", params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}}
)

Inferencing Samples: 100%|██████████| 1/1 [00:17<00:00, 17.39s/ Batches]


In [13]:
from pprint import pprint
from haystack.utils import print_answers

# print_answers(prediction, details="minimum")  ## Choose from `minimum`, `medium` and `all`

pprint(prediction)

{'answers': [<Answer {'answer': 'Eddard', 'type': 'extractive', 'score': 0.993372917175293, 'context': "s Nymeria after a legendary warrior queen. She travels with her father, Eddard, to King's Landing when he is made Hand of the King. Before she leaves,", 'offsets_in_document': [{'start': 207, 'end': 213}], 'offsets_in_context': [{'start': 72, 'end': 78}], 'document_ids': ['9e3c863097d66aeed9992e0b6bf1f2f4'], 'meta': {'_split_id': 4, '_split_overlap': [{'range': [0, 266], 'doc_id': '241c8775e39c6c937c67bbd10ccc471c'}, {'range': [960, 1200], 'doc_id': '87e8469dcf7354fd2a25fbd2ba07c543'}]}}>,
             <Answer {'answer': 'Ned', 'type': 'extractive', 'score': 0.9753611087799072, 'context': "k in the television series.\n\n====Season 1====\nArya accompanies her father Ned and her sister Sansa to King's Landing. Before their departure, Arya's h", 'offsets_in_document': [{'start': 630, 'end': 633}], 'offsets_in_context': [{'start': 74, 'end': 77}], 'document_ids': ['7d3360fa29130e69ea6b2b

## Using Embeddings Instead of Text Document Search

In [3]:
from haystack.document_stores import FAISSDocumentStore
from haystack.utils import clean_wiki_text, convert_files_to_docs, fetch_archive_from_http, print_answers
from haystack.nodes import EmbeddingRetriever
from haystack.pipelines import ExtractiveQAPipeline

In [4]:
FAISS_INDEX_PATH = "./faiss_index"
if os.path.exists(FAISS_INDEX_PATH) and os.path.exists(f"{FAISS_INDEX_PATH}.json"):
    log.warn(f"Reading FAISS Index from {FAISS_INDEX_PATH}")
    document_store = FAISSDocumentStore.load(FAISS_INDEX_PATH)
    retriever = EmbeddingRetriever(
        document_store=document_store, embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1"
        )
else:
    log.warn(f"Creating new FAISS Index")
    document_store = FAISSDocumentStore(faiss_index_factory_str="Flat")
    # Let's first get some files that we want to use
    doc_dir = "data/tutorial6"
    s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt6.zip"
    fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

    # Convert files to dicts
    docs = convert_files_to_docs(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)

    # Now, let's write the dicts containing documents to our DB.
    document_store.write_documents(docs)
    retriever = EmbeddingRetriever(
        document_store=document_store, embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1"
        )
    # Important:
    # Now that we initialized the Retriever, we need to call update_embeddings() to iterate over all
    # previously indexed documents and update their embedding representation.
    # While this can be a time consuming operation (depending on the corpus size), it only needs to be done once.
    # At query time, we only need to embed the query and compare it to the existing document embeddings, which is very fast.
    document_store.update_embeddings(retriever)

    document_store.save("./faiss_index")

  log.warn(f"Reading FAISS Index from {FAISS_INDEX_PATH}")


INFO - haystack.modeling.utils -  Using devices: CPU - Number of GPUs: 0
INFO - haystack.nodes.retriever.dense -  Init retriever using embeddings of model sentence-transformers/multi-qa-mpnet-base-dot-v1
  return self.fget.__get__(instance, owner)()


In [5]:
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True)

INFO - haystack.modeling.utils -  Using devices: CPU - Number of GPUs: 0
INFO - haystack.modeling.utils -  Using devices: CPU - Number of GPUs: 0
INFO - haystack.modeling.model.language_model -   * LOADING MODEL: 'deepset/roberta-base-squad2' (Roberta)
INFO - haystack.modeling.model.language_model -  Auto-detected model language: english
INFO - haystack.modeling.model.language_model -  Loaded 'deepset/roberta-base-squad2' (Roberta model) from model hub.
INFO - haystack.modeling.utils -  Using devices: CPU - Number of GPUs: 0


In [6]:
pipe = ExtractiveQAPipeline(reader, retriever)

In [7]:
prediction = pipe.run(
    query="Who created the Dothraki vocabulary?", params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}}
)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Inferencing Samples: 100%|██████████| 1/1 [00:13<00:00, 13.91s/ Batches]


In [8]:
print_answers(prediction, details="minimum")

'Query: Who created the Dothraki vocabulary?'
'Answers:'
[   {   'answer': 'David J. Peterson',
        'context': 'orld. The language was developed for the TV series by the '
                   'linguist David J. Peterson, working off the Dothraki words '
                   "and phrases in Martin's novels.\n"
                   ','},
    {   'answer': 'David J. Peterson',
        'context': '\n'
                   '===Valyrian===\n'
                   'David J. Peterson, who created the Dothraki language for '
                   'the first season of the show, was entrusted by the '
                   'producers to design a new '},
    {   'answer': 'David J. Peterson',
        'context': "age for ''Game of Thrones''\n"
                   'The Dothraki vocabulary was created by David J. Peterson '
                   'well in advance of the adaptation. HBO hired the Language '
                   'Creatio'},
    {   'answer': 'David J. Peterson',
        'context': "ges to be spoken in '

## Create an Agent to Use the QA

In [9]:
from haystack.agents import Agent, Tool
from haystack.nodes import PromptNode

In [30]:
HUGGINGFACE_API_KEY = os.environ.get("HUGGINGFACEHUB_API_TOKEN")
prompt_node = PromptNode(model_name_or_path="google/flan-t5-xxl", stop_words=["Observation:"], api_key=HUGGINGFACE_API_KEY)
agent = Agent(prompt_node=prompt_node)

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]



In [31]:
search_tool = Tool(
    name="Game_of_Thrones_QA",
    pipeline_or_node=pipe,
    description="useful for when you need to answer questions related to Game of Thrones.",
    output_variable="answers",
)
agent.add_tool(search_tool)

In [39]:
result = agent.run("Who is Daenerys Targaryen's brother in Game of Thrones?")

print(result["transcript"].split("---")[0])


Agent zero-shot-react started with {'query': "Who is Daenerys Targaryen's brother in Game of Thrones?", 'params': None}
[32m Game[0m[32m_[0m[32mof[0m[32m_[0m[32mThr[0m[32mone[0m[32ms[0m[32m_[0m[32mQA[0m[32m Da[0m[32men[0m[32mery[0m[32ms[0m[32m Tar[0m[32mgar[0m[32my[0m[32men[0m[32m'[0m[32ms[0m[32m brother[0m[32m is[0m[32m Rh[0m[32ma[0m[32me[0m[32mgar[0m[32m Tar[0m[32mgar[0m[32my[0m[32men[0m[32m.[0m[32m Final[0m[32m Answer[0m[32m:[0m[32m Rh[0m[32ma[0m[32me[0m[32mgar[0m[32m Tar[0m[32mgar[0m[32my[0m[32men[0m Game_of_Thrones_QA Daenerys Targaryen's brother is Rhaegar Targaryen. Final Answer: Rhaegar Targaryen


In [40]:
print(result)

{'query': "Who is Daenerys Targaryen's brother in Game of Thrones?", 'answers': [<Answer {'answer': 'Rhaegar Targaryen', 'type': 'generative', 'score': None, 'context': None, 'offsets_in_document': None, 'offsets_in_context': None, 'document_ids': None, 'meta': {}}>], 'transcript': " Game_of_Thrones_QA Daenerys Targaryen's brother is Rhaegar Targaryen. Final Answer: Rhaegar Targaryen"}


In [None]:
## NEXT - https://haystack.deepset.ai/tutorials/02_finetune_a_model_on_your_data