In [14]:
# %%bash

# pip install --upgrade pip
# pip install farm-haystack[colab,inference]


In [15]:
from haystack.telemetry import tutorial_running

# Invoke Haystack's telemetry hook with tutorial id 4.
# NOTE(review): presumably this reports that tutorial 4 is being executed —
# confirm against the haystack.telemetry documentation.
tutorial_running(4)


In [16]:
import logging

# Configure the root logger: compact "LEVEL - logger -  message" lines, WARNING and up.
# force=True is required because basicConfig() silently does nothing when the root
# logger already has handlers (e.g. when the notebook host configured logging first);
# without it the format below is never applied and records keep the default
# "LEVEL:logger:message" layout.
logging.basicConfig(
    format="%(levelname)s - %(name)s -  %(message)s",
    level=logging.WARNING,
    force=True,
)

# Let Haystack's own loggers be more verbose (INFO) than the rest of the process.
logging.getLogger("haystack").setLevel(logging.INFO)


In [18]:
from haystack.document_stores import InMemoryDocumentStore

# Create the document store with no arguments (library defaults). Later cells pass it
# to the retriever and write the FAQ records into it.
document_store = InMemoryDocumentStore()


INFO:haystack.modeling.utils:Using devices: CPU - Number of GPUs: 0


In [19]:
from haystack.nodes import EmbeddingRetriever

# Retriever settings collected in one mapping so they are easy to inspect and tweak.
# NOTE(review): the recorded run logged "Using devices: CPU" even with use_gpu=True,
# so the flag presumably falls back to CPU when no GPU is available — confirm.
retriever_settings = {
    "document_store": document_store,
    "embedding_model": "sentence-transformers/all-MiniLM-L6-v2",
    "use_gpu": True,
    "scale_score": False,
}

retriever = EmbeddingRetriever(**retriever_settings)


INFO:haystack.modeling.utils:Using devices: CPU - Number of GPUs: 0
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

INFO:haystack.nodes.retriever.dense:Init retriever using embeddings of model sentence-transformers/all-MiniLM-L6-v2


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [25]:
import pandas as pd

from haystack.utils import fetch_archive_from_http


# Download the zipped FAQ CSV into doc_dir.
doc_dir = "data/AQPair"
s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/small_faq_covid.csv.zip"
fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

# Get dataframe with columns "question", "answer" and some custom metadata
df = pd.read_csv(f"{doc_dir}/small_faq_covid.csv")

# Minimal cleaning: replace missing values with empty strings and trim the questions.
# Reassign instead of mutating in place, and use the vectorized .str accessor
# rather than a per-row lambda.
df = df.fillna(value="")
df["question"] = df["question"].str.strip()

# Create embeddings for our questions from the FAQs
# In contrast to most other search use cases, we don't create the embeddings here from the content of our documents,
# but rather from the additional text field "question" as we want to match "incoming question" <-> "stored question".
questions = df["question"].tolist()
df["embedding"] = retriever.embed_queries(queries=questions).tolist()
df = df.rename(columns={"question": "content"})

# Convert Dataframe to list of dicts and index them in our DocumentStore
# NOTE(review): the recorded run logs "Duplicate Documents" for a few ids — presumably
# some questions repeat in the CSV; confirm that dropping/overwriting them is intended.
docs_to_index = df.to_dict(orient="records")
document_store.write_documents(docs_to_index)


INFO:haystack.utils.import_utils:Found data stored in 'data/AQPair'. Delete this first if you really want to fetch new data.


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

INFO:haystack.document_stores.base:Duplicate Documents: Document with id 'a8d4ddffcab67801c7a1a13d85fbe84a' already exists in index 'document'
INFO:haystack.document_stores.base:Duplicate Documents: Document with id 'e4fae6647538bfddae6c8d8771fd613' already exists in index 'document'
INFO:haystack.document_stores.base:Duplicate Documents: Document with id 'f6dd87c6e090d56685b37554befe602' already exists in index 'document'
INFO:haystack.document_stores.base:Duplicate Documents: Document with id '719668a041cff08136aad7f4e2876a3a' already exists in index 'document'


In [24]:
# Preview the first rows of the prepared frame (head() with its default row count).
df.head()

Unnamed: 0,content,answer,answer_html,link,name,source,category,country,region,city,lang,last_update,embedding
0,What is a novel coronavirus?,A novel coronavirus is a new coronavirus that ...,<p>A novel coronavirus is a new coronavirus th...,\nhttps://www.cdc.gov/coronavirus/2019-ncov/fa...,Frequently Asked Questions,Center for Disease Control and Prevention (CDC),Coronavirus Disease 2019 Basics,USA,,,en,2020/03/17,"[-0.03914344683289528, 0.05274822562932968, -0..."
1,Why is the disease being called coronavirus di...,"On February 11, 2020 the World Health Organiza...","<p>On February 11, 2020 the World Health Organ...",\nhttps://www.cdc.gov/coronavirus/2019-ncov/fa...,Frequently Asked Questions,Center for Disease Control and Prevention (CDC),Coronavirus Disease 2019 Basics,USA,,,en,2020/03/17,"[-0.01713438890874386, 0.04649308696389198, -0..."
2,Why might someone blame or avoid individuals a...,People in the U.S. may be worried or anxious a...,<p>People in the U.S. may be worried or anxiou...,\nhttps://www.cdc.gov/coronavirus/2019-ncov/fa...,Frequently Asked Questions,Center for Disease Control and Prevention (CDC),Coronavirus Disease 2019 Basics,USA,,,en,2020/03/17,"[0.04399248957633972, 0.04200292378664017, 0.0..."
3,How can people help stop stigma related to COV...,"People can fight stigma and help, not hurt, ot...","<p>People can fight stigma and help, not hurt,...",\nhttps://www.cdc.gov/coronavirus/2019-ncov/fa...,Frequently Asked Questions,Center for Disease Control and Prevention (CDC),How It Spreads,USA,,,en,2020/03/17,"[0.03857081010937691, 0.07700608670711517, -0...."
4,What is the source of the virus?,Coronaviruses are a large family of viruses. S...,<p>Coronaviruses are a large family of viruses...,\nhttps://www.cdc.gov/coronavirus/2019-ncov/fa...,Frequently Asked Questions,Center for Disease Control and Prevention (CDC),How It Spreads,USA,,,en,2020/03/17,"[-0.04010908678174019, 0.07110288739204407, -0..."


In [21]:
from haystack.pipelines import FAQPipeline

# Build the FAQ pipeline around the embedding retriever created in an earlier cell.
# The query cell addresses this node under the key "Retriever" in its params.
pipe = FAQPipeline(retriever=retriever)


In [22]:
from haystack.utils import print_answers

# Run any question and change top_k to see more or less answers
query = "How is the virus spreading?"
retriever_params = {"Retriever": {"top_k": 1}}
prediction = pipe.run(query=query, params=retriever_params)

# Show the returned answers at "medium" detail.
print_answers(prediction, details="medium")


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

'Query: How is the virus spreading?'
'Answers:'
[   {   'answer': 'This virus was first detected in Wuhan City, Hubei '
                  'Province, China. The first infections were linked to a live '
                  'animal market, but the virus is now spreading from '
                  'person-to-person. It’s important to note that '
                  'person-to-person spread can happen on a continuum. Some '
                  'viruses are highly contagious (like measles), while other '
                  'viruses are less so.\n'
                  '\n'
                  'The virus that causes COVID-19 seems to be spreading easily '
                  'and sustainably in the community (“community spread”) in '
                  'some affected geographic areas. Community spread means '
                  'people have been infected with the virus in an area, '
                  'including some who are not sure how or where they became '
                  'infected.\n'
                  '