-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: Update examples based on haystack 2.0 pipelines
- Loading branch information
Showing
4 changed files
with
254 additions
and
64 deletions.
There are no files selected for viewing
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
import logging | ||
import os | ||
import zipfile | ||
from io import BytesIO | ||
from pathlib import Path | ||
|
||
import requests | ||
from haystack import Pipeline | ||
from haystack.components.converters import TextFileToDocument | ||
from haystack.components.embedders import SentenceTransformersDocumentEmbedder | ||
from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter | ||
from haystack.components.writers import DocumentWriter | ||
|
||
from neo4j_haystack import Neo4jDocumentStore | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
def fetch_archive_from_http(url: str, output_dir: str): | ||
if Path(output_dir).is_dir(): | ||
logger.warn(f"'{output_dir}' directory already exists. Skipping data download") | ||
return | ||
|
||
with requests.get(url, timeout=10, stream=True) as response: | ||
with zipfile.ZipFile(BytesIO(response.content)) as zip_ref: | ||
zip_ref.extractall(output_dir) | ||
|
||
|
||
# Let's first get some files that we want to use | ||
docs_dir = "data/docs" | ||
fetch_archive_from_http( | ||
url="https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt6.zip", | ||
output_dir=docs_dir, | ||
) | ||
|
||
# Make sure you have a running Neo4j database, e.g. with Docker: | ||
# docker run \ | ||
# --restart always \ | ||
# --publish=7474:7474 --publish=7687:7687 \ | ||
# --env NEO4J_AUTH=neo4j/passw0rd \ | ||
# neo4j:5.15.0 | ||
|
||
document_store = Neo4jDocumentStore( | ||
url="bolt://localhost:7687", | ||
username="neo4j", | ||
password="passw0rd", | ||
database="neo4j", | ||
embedding_dim=384, | ||
similarity="cosine", | ||
recreate_index=True, | ||
) | ||
|
||
# Create components and an indexing pipeline that converts txt to documents, cleans and splits them, and | ||
# indexes them for dense retrieval. | ||
p = Pipeline() | ||
p.add_component("text_file_converter", TextFileToDocument()) | ||
p.add_component("cleaner", DocumentCleaner()) | ||
p.add_component("splitter", DocumentSplitter(split_by="sentence", split_length=250, split_overlap=30)) | ||
p.add_component( | ||
"embedder", SentenceTransformersDocumentEmbedder(model_name_or_path="sentence-transformers/all-MiniLM-L6-v2") | ||
) | ||
p.add_component("writer", DocumentWriter(document_store=document_store)) | ||
|
||
p.connect("text_file_converter.documents", "cleaner.documents") | ||
p.connect("cleaner.documents", "splitter.documents") | ||
p.connect("splitter.documents", "embedder.documents") | ||
p.connect("embedder.documents", "writer.documents") | ||
|
||
# Take the docs data directory as input and run the pipeline | ||
file_paths = [docs_dir / Path(name) for name in os.listdir(docs_dir)] | ||
result = p.run({"text_file_converter": {"sources": file_paths}}) | ||
|
||
# Assuming you have a docker container running navigate to http://localhost:7474 to open Neo4j Browser |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,83 @@ | ||
import os | ||
|
||
from haystack import GeneratedAnswer, Pipeline | ||
from haystack.components.builders.answer_builder import AnswerBuilder | ||
from haystack.components.builders.prompt_builder import PromptBuilder | ||
from haystack.components.embedders import SentenceTransformersTextEmbedder | ||
from haystack.components.generators import HuggingFaceTGIGenerator | ||
|
||
from neo4j_haystack import Neo4jDocumentRetriever, Neo4jDocumentStore | ||
|
||
# Load HF Token from environment variables. | ||
HF_TOKEN = os.environ.get("HF_TOKEN") | ||
|
||
# Make sure you have a running Neo4j database with indexed documents available (see `indexing_pipeline.py`), | ||
# e.g. with Docker: | ||
# docker run \ | ||
# --restart always \ | ||
# --publish=7474:7474 --publish=7687:7687 \ | ||
# --env NEO4J_AUTH=neo4j/passw0rd \ | ||
# neo4j:5.15.0 | ||
|
||
document_store = Neo4jDocumentStore( | ||
url="bolt://localhost:7687", | ||
username="neo4j", | ||
password="passw0rd", | ||
database="neo4j", | ||
embedding_dim=384, | ||
similarity="cosine", | ||
recreate_index=False, # Do not delete index as it was created by indexer pipeline | ||
create_index_if_missing=False, | ||
) | ||
|
||
# Build a RAG pipeline with a Retriever to get relevant documents to the query and a HuggingFaceTGIGenerator | ||
# interacting with LLMs using a custom prompt. | ||
prompt_template = """ | ||
Given these documents, answer the question.\nDocuments: | ||
{% for doc in documents %} | ||
{{ doc.content }} | ||
{% endfor %} | ||
\nQuestion: {{question}} | ||
\nAnswer: | ||
""" | ||
rag_pipeline = Pipeline() | ||
rag_pipeline.add_component( | ||
"query_embedder", | ||
SentenceTransformersTextEmbedder(model_name_or_path="sentence-transformers/all-MiniLM-L6-v2", progress_bar=False), | ||
) | ||
rag_pipeline.add_component("retriever", Neo4jDocumentRetriever(document_store=document_store)) | ||
rag_pipeline.add_component("prompt_builder", PromptBuilder(template=prompt_template)) | ||
rag_pipeline.add_component( | ||
"llm", | ||
HuggingFaceTGIGenerator(model="mistralai/Mistral-7B-v0.1", token=HF_TOKEN), | ||
) | ||
rag_pipeline.add_component("answer_builder", AnswerBuilder()) | ||
|
||
rag_pipeline.connect("query_embedder", "retriever.query_embedding") | ||
rag_pipeline.connect("retriever.documents", "prompt_builder.documents") | ||
rag_pipeline.connect("prompt_builder.prompt", "llm.prompt") | ||
rag_pipeline.connect("llm.replies", "answer_builder.replies") | ||
rag_pipeline.connect("llm.metadata", "answer_builder.metadata") | ||
rag_pipeline.connect("retriever", "answer_builder.documents") | ||
|
||
# Ask a question on the data you just added. | ||
question = "Who created the Dothraki vocabulary?" | ||
result = rag_pipeline.run( | ||
{ | ||
"query_embedder": {"text": question}, | ||
"retriever": {"top_k": 3}, | ||
"prompt_builder": {"question": question}, | ||
"answer_builder": {"query": question}, | ||
} | ||
) | ||
|
||
# For details, like which documents were used to generate the answer, look into the GeneratedAnswer object | ||
answer: GeneratedAnswer = result["answer_builder"]["answers"][0] | ||
|
||
# ruff: noqa: T201 | ||
print("Query: ", answer.query) | ||
print("Answer: ", answer.data) | ||
print("== Sources:") | ||
for doc in answer.documents: | ||
print("-> ", doc.meta["file_path"]) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,98 @@ | ||
import os | ||
|
||
from haystack import GeneratedAnswer, Pipeline | ||
from haystack.components.builders.answer_builder import AnswerBuilder | ||
from haystack.components.builders.prompt_builder import PromptBuilder | ||
from haystack.components.embedders import SentenceTransformersTextEmbedder | ||
from haystack.components.generators import HuggingFaceTGIGenerator | ||
|
||
from neo4j_haystack import Neo4jClientConfig, Neo4jDynamicDocumentRetriever | ||
|
||
# Load HF Token from environment variables. | ||
HF_TOKEN = os.environ.get("HF_TOKEN") | ||
|
||
# Make sure you have a running Neo4j database with indexed documents available (see `indexing_pipeline.py`), | ||
# e.g. with Docker: | ||
# docker run \ | ||
# --restart always \ | ||
# --publish=7474:7474 --publish=7687:7687 \ | ||
# --env NEO4J_AUTH=neo4j/passw0rd \ | ||
# neo4j:5.15.0 | ||
|
||
client_config = Neo4jClientConfig( | ||
url="bolt://localhost:7687", | ||
username="neo4j", | ||
password="passw0rd", | ||
database="neo4j", | ||
) | ||
|
||
cypher_query = """ | ||
CALL db.index.vector.queryNodes($index, $top_k, $query_embedding) | ||
YIELD node as doc, score | ||
MATCH (doc) | ||
RETURN doc{.*, score}, score | ||
ORDER BY score DESC LIMIT $top_k | ||
""" | ||
|
||
# Build a RAG pipeline with a Retriever to get relevant documents to the query and a HuggingFaceTGIGenerator | ||
# interacting with LLMs using a custom prompt. | ||
prompt_template = """ | ||
Given these documents, answer the question.\nDocuments: | ||
{% for doc in documents %} | ||
{{ doc.content }} | ||
{% endfor %} | ||
\nQuestion: {{question}} | ||
\nAnswer: | ||
""" | ||
rag_pipeline = Pipeline() | ||
rag_pipeline.add_component( | ||
"query_embedder", | ||
SentenceTransformersTextEmbedder(model_name_or_path="sentence-transformers/all-MiniLM-L6-v2", progress_bar=False), | ||
) | ||
rag_pipeline.add_component( | ||
"retriever", | ||
Neo4jDynamicDocumentRetriever( | ||
client_config=client_config, | ||
runtime_parameters=["query_embedding"], | ||
doc_node_name="doc", | ||
verify_connectivity=True, | ||
), | ||
) | ||
rag_pipeline.add_component("prompt_builder", PromptBuilder(template=prompt_template)) | ||
rag_pipeline.add_component( | ||
"llm", | ||
HuggingFaceTGIGenerator(model="mistralai/Mistral-7B-v0.1", token=HF_TOKEN), | ||
) | ||
rag_pipeline.add_component("answer_builder", AnswerBuilder()) | ||
|
||
rag_pipeline.connect("query_embedder", "retriever.query_embedding") | ||
rag_pipeline.connect("retriever.documents", "prompt_builder.documents") | ||
rag_pipeline.connect("prompt_builder.prompt", "llm.prompt") | ||
rag_pipeline.connect("llm.replies", "answer_builder.replies") | ||
rag_pipeline.connect("llm.metadata", "answer_builder.metadata") | ||
rag_pipeline.connect("retriever", "answer_builder.documents") | ||
|
||
# Ask a question on the data you just added. | ||
question = "Who created the Dothraki vocabulary?" | ||
result = rag_pipeline.run( | ||
{ | ||
"query_embedder": {"text": question}, | ||
"retriever": { | ||
"query": cypher_query, | ||
"parameters": {"index": "document-embeddings", "top_k": 3}, | ||
}, | ||
"prompt_builder": {"question": question}, | ||
"answer_builder": {"query": question}, | ||
} | ||
) | ||
|
||
# For details, like which documents were used to generate the answer, look into the GeneratedAnswer object | ||
answer: GeneratedAnswer = result["answer_builder"]["answers"][0] | ||
|
||
# ruff: noqa: T201 | ||
print("Query: ", answer.query) | ||
print("Answer: ", answer.data) | ||
print("== Sources:") | ||
for doc in answer.documents: | ||
print("-> ", doc.meta["file_path"]) |