In [1]:
### An Elasticsearch instance must be running before you run this notebook.
# Docker command to start one: `docker run --name elastic -p 9200:9200 -e "discovery.type=single-node" -m 1G -itd docker.elastic.co/elasticsearch/elasticsearch:7.9.2`

In [2]:
import logging
import os
from haystack.document_stores import ElasticsearchDocumentStore
from haystack.utils import fetch_archive_from_http
from haystack import Pipeline
from haystack.nodes import TextConverter, PreProcessor, BM25Retriever, FARMReader, PDFToTextConverter

from dotenv import load_dotenv


In [3]:
# Load variables from a .env file into the process environment; later cells read
# ELASTICSEARCH_HOST and HUGGINGFACEHUB_API_TOKEN from os.environ.
load_dotenv()

True

In [4]:
# The "haystack" logger is raised to INFO so its progress messages are shown, while the
# root logger (configured below) stays at WARNING.
log = logging.getLogger("haystack")
log.setLevel(logging.INFO)
logging.basicConfig(
    level=logging.WARNING,
    format="%(levelname)s - %(name)s -  %(message)s",
)

In [5]:
# Elasticsearch host is taken from ELASTICSEARCH_HOST, falling back to "localhost" when unset.
host = os.getenv("ELASTICSEARCH_HOST", "localhost")

# Connect with empty credentials and use the "document" index.
document_store = ElasticsearchDocumentStore(
    host=host,
    index="document",
    username="",
    password="",
)

In [6]:
# Directory whose files are indexed below; a relative path that already ends with a separator.
doc_dir = "../../../data/pdfs/"

In [7]:

# Components of the indexing pipeline; they are connected to each other in the next cell.
preprocessor = PreProcessor(
    # Splitting: 200-word chunks with a 20-word overlap, respecting sentence boundaries.
    split_by="word",
    split_length=200,
    split_overlap=20,
    split_respect_sentence_boundary=True,
    # Cleaning options applied to the converted text.
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=True,
)
pdf_converter = PDFToTextConverter()
indexing_pipeline = Pipeline()

[nltk_data] Downloading package punkt to /home/ryan/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [8]:
# Linear indexing graph: File -> TextConverter -> PreProcessor -> DocumentStore.
# The node named "TextConverter" is the PDF converter instance.
for node_name, component, upstream in (
    ("TextConverter", pdf_converter, "File"),
    ("PreProcessor", preprocessor, "TextConverter"),
    ("DocumentStore", document_store, "PreProcessor"),
):
    indexing_pipeline.add_node(component=component, name=node_name, inputs=[upstream])


In [9]:
# Collect the PDF files that sit directly inside doc_dir.
# - os.path.join avoids the doubled separator that doc_dir + "/" + name produces when
#   doc_dir already ends with "/".
# - The extension filter keeps non-PDF entries (hidden files, sub-directories) away from
#   the PDF converter.
# - sorted() makes the indexing order deterministic; os.listdir returns entries in arbitrary order.
files_to_index = [
    os.path.join(doc_dir, name)
    for name in sorted(os.listdir(doc_dir))
    if name.lower().endswith(".pdf")
]
indexing_pipeline.run_batch(file_paths=files_to_index)
# As an alternative, you can cast your text data into Document objects and write them into the DocumentStore using DocumentStore.write_documents().

INFO - haystack.pipelines.base -  It seems that an indexing Pipeline is run, so using the nodes' run method instead of run_batch.
Converting files: 100%|██████████| 1/1 [00:00<00:00,  1.71it/s]
Preprocessing: 100%|██████████| 1/1 [00:00<00:00,  1.21docs/s]


  <Document: {'content': 'Note:  Some of the illustrations in this\nmanual may show features as used in\ndifferent models, so may appear different\nto you on your vehicle.\nNote:  Always use and operate your vehicle\nin line with all applicable laws and\nregulations.\nNote:  Pass on this manual when selling\nyour vehicle.  It is an integral part of your\nvehicle.\nThis manual may qualify the location of a\ncomponent as left-hand side or right-hand\nside.  The side is determined when facing\nforward in the seat.\nE154903\nRight-hand side.\nA\nLeft-hand side.\nB\nSYMBOLS GLOSSARY\nThese are some of the symbols you may\nsee on your vehicle.\n', 'content_type': 'text', 'score': None, 'meta': {'_split_id': 4, '_split_overlap': [{'doc_id': 'f22f29d39c8b7efafac601901534df72', 'range': (0, 235)}, {'doc_id': '430451f4ad467b8898163e199ef614ef', 'range': (478, 598)}]}, 'id_hash_keys': ['content'], 'embedding': None, 'id': 'afe1b51c0e369f5581b78a6527a8a44b'}>,
  <Document: {'content': 'Unauthorize

In [10]:
# BM25 retriever that searches the Elasticsearch-backed document store created above.
retriever = BM25Retriever(document_store=document_store)

In [11]:
# Reader built from the deepset/roberta-base-squad2 checkpoint.
# use_gpu=True requests GPU inference. NOTE(review): confirm the behaviour on CPU-only machines.
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True)


INFO - haystack.modeling.utils -  Using devices: CUDA:0 - Number of GPUs: 1
INFO - haystack.modeling.utils -  Using devices: CUDA:0 - Number of GPUs: 1


Downloading (…)lve/main/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

INFO - haystack.modeling.model.language_model -   * LOADING MODEL: 'deepset/roberta-base-squad2' (Roberta)


Downloading model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

INFO - haystack.modeling.model.language_model -  Auto-detected model language: english
INFO - haystack.modeling.model.language_model -  Loaded 'deepset/roberta-base-squad2' (Roberta model) from model hub.


Downloading (…)okenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

INFO - haystack.modeling.utils -  Using devices: CUDA:0 - Number of GPUs: 1


In [12]:
# Query graph: Query -> Retriever -> Reader.
querying_pipeline = Pipeline()
for node_name, component, upstream in (
    ("Retriever", retriever, "Query"),
    ("Reader", reader, "Retriever"),
):
    querying_pipeline.add_node(component=component, name=node_name, inputs=[upstream])

In [13]:
# Run one question through the pipeline; top_k is set per node (10 for the retriever,
# 5 for the reader).
prediction = querying_pipeline.run(
    query="Can I service my air-conditioning myself?",
    params={
        "Retriever": {"top_k": 10},
        "Reader": {"top_k": 5},
    },
)

Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  1.98 Batches/s]


In [14]:
from pprint import pprint
from haystack.utils import print_answers

# Alternative, more compact view of the result; `details` accepts `minimum`, `medium` or `all`:
# print_answers(prediction, details="minimum")

# Pretty-print the raw prediction dict returned by the pipeline run, including its 'answers' list.
pprint(prediction)

{'answers': [<Answer {'answer': 'Only qualified\npersonnel', 'type': 'extractive', 'score': 0.5224135518074036, 'context': 'g\nrefrigerant system contains refrigerant\nunder high pressure. Only qualified\npersonnel should service the air\nconditioning refrigerant system. Openin', 'offsets_in_document': [{'start': 427, 'end': 451}], 'offsets_in_context': [{'start': 63, 'end': 87}], 'document_ids': ['3752151eb046a5fef8cf73223fe823a1'], 'meta': {'_split_id': 804, '_split_overlap': [{'range': [0, 199], 'doc_id': '9bb2e0f5d5534cd96394bf9405502451'}, {'range': [1337, 1467], 'doc_id': 'a4660bec869ba3f69b96bb70dc2430f9'}]}}>,
             <Answer {'answer': 'Only qualified\npersonnel', 'type': 'extractive', 'score': 0.45899274945259094, 'context': 'g\nrefrigerant system contains refrigerant\nunder high pressure. Only qualified\npersonnel should service the air\nconditioning refrigerant system. Openin', 'offsets_in_document': [{'start': 385, 'end': 409}], 'offsets_in_context': [{'start': 63, 'e

## Using Embedding-Based Retrieval Instead of Keyword (BM25) Search

In [15]:
from haystack.document_stores import FAISSDocumentStore
from haystack.utils import print_answers
from haystack.nodes import EmbeddingRetriever
from haystack.pipelines import ExtractiveQAPipeline

In [16]:
# Build the FAISS document store once and reuse it on later runs.
# A saved index consists of the .faiss file plus a .json config file next to it; both must
# exist for the cached copy to be loaded.
FAISS_INDEX_PATH = "./faiss_index_pdf.faiss"
# splitext swaps only the final extension, unlike str.replace, which would rewrite every
# ".faiss" occurrence in the path.
FAISS_CONFIG_PATH = os.path.splitext(FAISS_INDEX_PATH)[0] + ".json"
EMBEDDING_MODEL = "sentence-transformers/multi-qa-mpnet-base-dot-v1"

index_is_cached = os.path.exists(FAISS_INDEX_PATH) and os.path.exists(FAISS_CONFIG_PATH)

if index_is_cached:
    # Logger.warn is deprecated; Logger.warning is the supported spelling.
    log.warning("Reading FAISS Index from %s", FAISS_INDEX_PATH)
    document_store = FAISSDocumentStore.load(FAISS_INDEX_PATH)
else:
    log.warning("Creating new FAISS Index")
    document_store = FAISSDocumentStore(sql_url="sqlite:///faiss_document_store_pdf.db", faiss_index_factory_str="Flat")

    # Same conversion / preprocessing chain as the Elasticsearch section, writing into the FAISS store.
    indexing_pipeline = Pipeline()
    pdf_converter = PDFToTextConverter()
    preprocessor = PreProcessor(
        clean_whitespace=True,
        clean_header_footer=True,
        clean_empty_lines=True,
        split_by="word",
        split_length=200,
        split_overlap=20,
        split_respect_sentence_boundary=True,
    )
    indexing_pipeline.add_node(component=pdf_converter, name="TextConverter", inputs=["File"])
    indexing_pipeline.add_node(component=preprocessor, name="PreProcessor", inputs=["TextConverter"])
    indexing_pipeline.add_node(component=document_store, name="DocumentStore", inputs=["PreProcessor"])

    # os.path.join avoids a doubled separator (doc_dir already ends with "/"); only PDF
    # files are handed to the PDF converter, in a deterministic order.
    files_to_index = [
        os.path.join(doc_dir, name)
        for name in sorted(os.listdir(doc_dir))
        if name.lower().endswith(".pdf")
    ]
    indexing_pipeline.run_batch(file_paths=files_to_index)

# The retriever is needed in both cases, so it is created once here instead of in each branch.
retriever = EmbeddingRetriever(document_store=document_store, embedding_model=EMBEDDING_MODEL)

if not index_is_cached:
    # Important:
    # Now that the retriever is initialized, update_embeddings() iterates over all previously
    # indexed documents and updates their embedding representation.
    # While this can be a time consuming operation (depending on the corpus size), it only needs to be done once.
    # At query time, only the query is embedded and compared to the existing document embeddings, which is very fast.
    document_store.update_embeddings(retriever)

    # Persist the index (and its config) so the next run takes the cached branch.
    document_store.save(FAISS_INDEX_PATH)

  log.warn(f"Creating new FAISS Index")
INFO - haystack.pipelines.base -  It seems that an indexing Pipeline is run, so using the nodes' run method instead of run_batch.
Converting files:   0%|          | 0/1 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Converting files: 100%|██████████| 1/1 [00:01<00:00,  1.26s/it]
Preprocessing: 100%|██████████| 1/1 [00:00<00:00,  1.28docs/s]
Writing Documents: 10000it [00:01, 9509.10it/s]            
INFO - haystack.modeling.utils -  Using devices: CUDA:0 - Number of GPUs: 1


Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

INFO - haystack.nodes.retriever.dense -  Init retriever using embeddings of model sentence-transformers/multi-qa-mpnet-base-dot-v1


Downloading (…)16ebc/.gitattributes:   0%|          | 0.00/737 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)b6b5d16ebc/README.md:   0%|          | 0.00/8.65k [00:00<?, ?B/s]

Downloading (…)b5d16ebc/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)ebc/data_config.json:   0%|          | 0.00/25.5k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)16ebc/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading (…)6ebc/train_script.py:   0%|          | 0.00/13.9k [00:00<?, ?B/s]

Downloading (…)b6b5d16ebc/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5d16ebc/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
INFO - haystack.document_stores.faiss -  Updating embeddings for 1105 docs...
Updating Embedding:   0%|          | 0/1105 [00:00<?, ? docs/s]

Batches:   0%|          | 0/35 [00:00<?, ?it/s]

Documents Processed: 10000 docs [00:05, 1832.89 docs/s]         


In [17]:
# Same reader configuration as the one created for the BM25 pipeline earlier in the notebook;
# presumably re-created so this section can be run on its own.
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True)

INFO - haystack.modeling.utils -  Using devices: CUDA:0 - Number of GPUs: 1
INFO - haystack.modeling.utils -  Using devices: CUDA:0 - Number of GPUs: 1
INFO - haystack.modeling.model.language_model -   * LOADING MODEL: 'deepset/roberta-base-squad2' (Roberta)
INFO - haystack.modeling.model.language_model -  Auto-detected model language: english
INFO - haystack.modeling.model.language_model -  Loaded 'deepset/roberta-base-squad2' (Roberta model) from model hub.
INFO - haystack.modeling.utils -  Using devices: CUDA:0 - Number of GPUs: 1


In [18]:
# Combine the reader with the current `retriever` (the EmbeddingRetriever from the FAISS cell
# when the notebook is run top to bottom) into an extractive question-answering pipeline.
pipe = ExtractiveQAPipeline(reader, retriever)

In [19]:
# Ask the embedding-based pipeline a question; top_k is set per node (10 for the retriever,
# 5 for the reader).
prediction = pipe.run(
    query="What brand of motor oil should I use?",
    params={
        "Retriever": {"top_k": 10},
        "Reader": {"top_k": 5},
    },
)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  8.47 Batches/s]


In [20]:
# Print the query followed by each answer and its surrounding context, using the "minimum" detail level.
print_answers(prediction, details="minimum")

'Query: What brand of motor oil should I use?'
'Answers:'
[   {   'answer': 'Motorcraft®',
        'context': ' system and\n'
                   'fuel economy performance standards of\n'
                   'ILSAC.\n'
                   'We recommend Motorcraft® motor oil for\n'
                   'your vehicle. If Motorcraft® oil is not\n'
                   'available, use'},
    {   'answer': 'Motorcraft®',
        'context': ' system and\n'
                   'fuel economy performance standards of\n'
                   'ILSAC.\n'
                   'We recommend Motorcraft® motor oil for\n'
                   'your vehicle. If Motorcraft® oil is not\n'
                   'available, use'},
    {   'answer': 'Use oil and fluid that meets the defined\n'
                  'specification and viscosity grade',
        'context': 'TIES AND\n'
                   'SPECIFICATIONS - 3.0L\n'
                   'DIESEL\n'
                   'Use oil and fluid that meets the defined\n'
         

## Create an Agent That Uses the QA Pipeline as a Tool

In [21]:
from haystack.agents import Agent, Tool
from haystack.nodes import PromptNode
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import BitsAndBytesConfig
from torch import float16 as torchfloat16

In [22]:
# quantization_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_compute_dtype=torchfloat16,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_use_double_quant=True,
# )

# MODEL_ID = "EleutherAI/pythia-1b"
# model = AutoModelForCausalLM.from_pretrained(
#     MODEL_ID,
#     trust_remote_code=True,
#     # quantization_config=quantization_config
# )

# tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# prompt_node = PromptNode(MODEL_ID, model_kwargs={"model":model, "tokenizer": tokenizer})

In [23]:
# Hugging Face token, read from the environment.
HUGGINGFACE_API_KEY = os.environ.get("HUGGINGFACEHUB_API_TOKEN")
if not HUGGINGFACE_API_KEY:
    # os.environ.get() returns None for a missing variable, which would otherwise be passed on
    # silently as api_key=None. Surface it here so a later failure is easy to trace.
    # NOTE(review): confirm how PromptNode behaves without a key for this model.
    log.warning("HUGGINGFACEHUB_API_TOKEN is not set; PromptNode will be created without an API key")

MODEL_ID = "google/flan-t5-xxl"
# MODEL_ID = "tiiuae/falcon-7b"
# MODEL_ID = "EleutherAI/pythia-1b"
# MODEL_ID = "PY007/TinyLlama-1.1B-step-50K-105b"

# "Observation:" is registered as a stop word for generation.
prompt_node = PromptNode(model_name_or_path=MODEL_ID, stop_words=["Observation:"], api_key=HUGGINGFACE_API_KEY)

agent = Agent(prompt_node=prompt_node)

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]



In [24]:
# Wrap the extractive QA pipeline as a tool and register it with the agent; the tool's
# "answers" output is the variable handed back to the agent.
search_tool = Tool(
    pipeline_or_node=pipe,
    name="F150_Car_Truck_QA",
    output_variable="answers",
    description="useful for when you need to answer questions related to vehicles, trucks, F150",
)
agent.add_tool(search_tool)

In [30]:
result = agent.run("What PSI should I inflate my tires to?")

# Show only the part of the transcript that precedes the first "---" separator
# (the whole transcript when no separator is present).
transcript_head = result["transcript"].partition("---")[0]
print(transcript_head)


Agent zero-shot-react started with {'query': 'What PSI should I inflate my tires to?', 'params': None}
[32m Find[0m[32m the[0m[32m recommended[0m[32m PS[0m[32mI[0m[32m for[0m[32m my[0m[32m tires[0m[32m.[0m[32m Tool[0m[32m:[0m[32m F[0m[32m150[0m[32m_[0m[32mCar[0m[32m_[0m[32mTru[0m[32mck[0m[32m_[0m[32mQA[0m[32m In[0m[32mput[0m[32m:[0m[32m tires[0m[32m Final[0m[32m Answer[0m[32m:[0m[32m 30[0m Find the recommended PSI for my tires. Tool: F150_Car_Truck_QA Input: tires Final Answer: 30


In [31]:
# Show the full result dict: the query, the list of Answer objects and the raw transcript.
print(result)

{'query': 'What PSI should I inflate my tires to?', 'answers': [<Answer {'answer': '30', 'type': 'generative', 'score': None, 'context': None, 'offsets_in_document': None, 'offsets_in_context': None, 'document_ids': None, 'meta': {}}>], 'transcript': ' Find the recommended PSI for my tires. Tool: F150_Car_Truck_QA Input: tires Final Answer: 30'}


In [27]:
## Next step: fine-tune a model on your own data - https://haystack.deepset.ai/tutorials/02_finetune_a_model_on_your_data