# Haystack Semantic Search demo
https://github.com/deepset-ai/haystack/issues/854  
Status: work in progress (run inside the `haystack` conda environment)

In [1]:
# !pip install chroma-haystack

In [1]:
# from haystack.document_store.faiss import FAISSDocumentStore
from haystack_integrations.document_stores.chroma import ChromaDocumentStore

In [2]:
# Chroma is used in-memory here (the store is created with default arguments, so no
# persistence is configured). This single instance therefore has to be reused by every
# later cell that writes to or queries the store.
document_store = ChromaDocumentStore()

In [3]:
from haystack.components.preprocessors.document_cleaner import clean_wiki_text
from haystack.preprocessor.utils import convert_files_to_dicts, fetch_archive_from_http
from haystack.retriever.dense import DensePassageRetriever

ImportError: cannot import name 'clean_wiki_text' from 'haystack.components.preprocessors.document_cleaner' (/home/sean/miniforge3/envs/haystack/lib/python3.10/site-packages/haystack/components/preprocessors/document_cleaner.py)

In [None]:
# ## Preprocessing of documents
# This cell targets the Haystack 2.x API. The legacy helpers it used to depend on
# (`fetch_archive_from_http`, `convert_files_to_dicts`, `clean_wiki_text`) could not be
# imported in this environment, so their work is done here with the standard library.
#
# `document_store` is the ChromaDocumentStore created in an earlier cell. It is deliberately
# NOT re-created here: the previous `FAISSDocumentStore(...)` call referenced a class whose
# import is commented out (NameError) and would have replaced the shared Chroma store.
import io
import re
import urllib.request
import zipfile
from pathlib import Path

from haystack import Document


def _clean_wiki_text(text):
    """Tidy a raw Wikipedia text dump so that every section forms one paragraph.

    Stand-in for the `clean_wiki_text` helper that the failed import was meant to provide.

    Steps:
      1. collapse runs of newlines into a single newline;
      2. drop lines of 30 characters or fewer unless they are section headings
         (lines that start and end with "==");
      3. put two blank lines in front of every heading so sections can be split on "\n\n";
      4. remove headings that are immediately followed by another heading (empty sections).

    Args:
        text: Raw file content.

    Returns:
        The cleaned text, with sections separated by blank lines.
    """
    text = re.sub(r"\n{2,}", "\n", text)
    kept_lines = [
        line
        for line in text.split("\n")
        if len(line) > 30 or (line.startswith("==") and line.endswith("=="))
    ]
    text = "\n".join(kept_lines)
    text = text.replace("\n==", "\n\n\n==")
    return re.sub(r"(==.*==\n\n\n)", "", text)


# Let's first get some documents that we want to query
doc_dir = "data/article_txt_got"
s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip"

doc_path = Path(doc_dir)
# Only download when the target directory is missing or empty, so re-running the cell
# does not fetch the archive again.
if not doc_path.is_dir() or not any(doc_path.iterdir()):
    doc_path.mkdir(parents=True, exist_ok=True)
    with urllib.request.urlopen(s3_url) as response:
        archive_bytes = response.read()
    with zipfile.ZipFile(io.BytesIO(archive_bytes)) as archive:
        archive.extractall(doc_path)

# Convert the files into Haystack `Document` objects, one per paragraph. The source file
# name is kept in the metadata under "name".
docs_by_id = {}
for txt_file in sorted(doc_path.rglob("*.txt")):
    cleaned = _clean_wiki_text(txt_file.read_text(encoding="utf-8", errors="ignore"))
    for paragraph in cleaned.split("\n\n"):
        if not paragraph.strip():
            continue
        doc = Document(content=paragraph, meta={"name": txt_file.name})
        # Keep only the first document per id so that identical paragraphs from the same
        # file are not written twice (Haystack derives the id from the document's data).
        docs_by_id.setdefault(doc.id, doc)
docs = list(docs_by_id.values())

# Now, let's write the docs to our DB.
document_store.write_documents(docs)

In [None]:
### Retriever
# This cell targets the Haystack 2.x API together with the Chroma integration.
#
# The legacy notebook built a `DensePassageRetriever` or an `EmbeddingRetriever`
# (sentence-transformers/roberta-base-nli-stsb-mean-tokens) and then called
# `document_store.update_embeddings(retriever)`. Neither the `haystack.retriever.dense`
# module nor `update_embeddings()` is available with Haystack 2.x / ChromaDocumentStore.
#
# With Chroma no separate embedding pass is needed: documents written to the store without
# an embedding are embedded by the store's own embedding function, and
# `ChromaQueryTextRetriever` passes the raw query text to Chroma, which embeds it the same way.
# At query time only the query has to be embedded and compared to the existing document
# embeddings, which is very fast.
#
# NOTE(review): this means the sentence-transformers model named above is no longer used.
# If that exact model is required, the embedding function has to be configured where the
# ChromaDocumentStore is created - confirm against the Chroma integration docs.
from haystack import Pipeline
from haystack_integrations.components.retrievers.chroma import ChromaQueryTextRetriever

retriever = ChromaQueryTextRetriever(document_store=document_store)

### Pipeline
# Build a minimal document-search pipeline, or extend it into a custom one
# (e.g. combining multiple retrievers).
# See details here: https://docs.haystack.deepset.ai/docs/pipelines
pipe = Pipeline()
pipe.add_component("retriever", retriever)

## Voilà! Ask a question!
prediction = pipe.run({"retriever": {"query": "Who is the father of Arya Stark?", "top_k": 10}})
print(prediction)
# Returns the retrieved documents nested under the component name:
# {'retriever': {'documents': [Document(id=..., content=..., meta={'name': ...}, score=...), ...]}}