feat: Update examples based on haystack 2.0 pipelines
prosto committed Jan 15, 2024
1 parent a5d4552 commit a7e3bf1
Showing 4 changed files with 254 additions and 64 deletions.
64 changes: 0 additions & 64 deletions examples/better_retrieval_via_embedding_retrieval.py

This file was deleted.

73 changes: 73 additions & 0 deletions examples/indexing_pipeline.py
@@ -0,0 +1,73 @@
import logging
import os
import zipfile
from io import BytesIO
from pathlib import Path

import requests
from haystack import Pipeline
from haystack.components.converters import TextFileToDocument
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
from haystack.components.writers import DocumentWriter

from neo4j_haystack import Neo4jDocumentStore

logger = logging.getLogger(__name__)


def fetch_archive_from_http(url: str, output_dir: str):
if Path(output_dir).is_dir():
        logger.warning(f"'{output_dir}' directory already exists. Skipping data download")
return

with requests.get(url, timeout=10, stream=True) as response:
with zipfile.ZipFile(BytesIO(response.content)) as zip_ref:
zip_ref.extractall(output_dir)


# Let's first fetch some text files to index
docs_dir = "data/docs"
fetch_archive_from_http(
url="https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt6.zip",
output_dir=docs_dir,
)

# Make sure you have a running Neo4j database, e.g. with Docker:
# docker run \
# --restart always \
# --publish=7474:7474 --publish=7687:7687 \
# --env NEO4J_AUTH=neo4j/passw0rd \
# neo4j:5.15.0

document_store = Neo4jDocumentStore(
    url="bolt://localhost:7687",
    username="neo4j",
    password="passw0rd",
    database="neo4j",
    embedding_dim=384,  # must match the output dimension of all-MiniLM-L6-v2
    similarity="cosine",
    recreate_index=True,
)

# Create components and an indexing pipeline that converts txt to documents, cleans and splits them, and
# indexes them for dense retrieval.
p = Pipeline()
p.add_component("text_file_converter", TextFileToDocument())
p.add_component("cleaner", DocumentCleaner())
p.add_component("splitter", DocumentSplitter(split_by="sentence", split_length=250, split_overlap=30))
p.add_component(
"embedder", SentenceTransformersDocumentEmbedder(model_name_or_path="sentence-transformers/all-MiniLM-L6-v2")
)
p.add_component("writer", DocumentWriter(document_store=document_store))

p.connect("text_file_converter.documents", "cleaner.documents")
p.connect("cleaner.documents", "splitter.documents")
p.connect("splitter.documents", "embedder.documents")
p.connect("embedder.documents", "writer.documents")

# Take the docs data directory as input and run the pipeline
file_paths = [docs_dir / Path(name) for name in os.listdir(docs_dir)]
result = p.run({"text_file_converter": {"sources": file_paths}})
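# The pipeline result contains the output of the terminal component; as a quick sanity
# check (assuming Haystack 2.0's default "documents_written" output of DocumentWriter),
# report how many document chunks were indexed.
print(f"Indexed {result['writer']['documents_written']} documents")  # noqa: T201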

# Assuming you have a Docker container running, navigate to http://localhost:7474 to open Neo4j Browser
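# There you can verify the indexed chunks with a Cypher query (a sketch, assuming the
# default "Document" node label used by Neo4jDocumentStore):
#   MATCH (d:Document) RETURN count(d)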
83 changes: 83 additions & 0 deletions examples/rag_pipeline.py
@@ -0,0 +1,83 @@
import os

from haystack import GeneratedAnswer, Pipeline
from haystack.components.builders.answer_builder import AnswerBuilder
from haystack.components.builders.prompt_builder import PromptBuilder
from haystack.components.embedders import SentenceTransformersTextEmbedder
from haystack.components.generators import HuggingFaceTGIGenerator

from neo4j_haystack import Neo4jDocumentRetriever, Neo4jDocumentStore

# Load HF Token from environment variables.
HF_TOKEN = os.environ.get("HF_TOKEN")

# Make sure you have a running Neo4j database with indexed documents available (see `indexing_pipeline.py`),
# e.g. with Docker:
# docker run \
# --restart always \
# --publish=7474:7474 --publish=7687:7687 \
# --env NEO4J_AUTH=neo4j/passw0rd \
# neo4j:5.15.0

document_store = Neo4jDocumentStore(
url="bolt://localhost:7687",
username="neo4j",
password="passw0rd",
database="neo4j",
embedding_dim=384,
similarity="cosine",
recreate_index=False, # Do not delete index as it was created by indexer pipeline
create_index_if_missing=False,
)

# Build a RAG pipeline with a Retriever that fetches documents relevant to the query and a
# HuggingFaceTGIGenerator that queries an LLM with a custom prompt.
prompt_template = """
Given these documents, answer the question.\nDocuments:
{% for doc in documents %}
{{ doc.content }}
{% endfor %}
\nQuestion: {{question}}
\nAnswer:
"""
rag_pipeline = Pipeline()
rag_pipeline.add_component(
"query_embedder",
SentenceTransformersTextEmbedder(model_name_or_path="sentence-transformers/all-MiniLM-L6-v2", progress_bar=False),
)
rag_pipeline.add_component("retriever", Neo4jDocumentRetriever(document_store=document_store))
rag_pipeline.add_component("prompt_builder", PromptBuilder(template=prompt_template))
rag_pipeline.add_component(
"llm",
HuggingFaceTGIGenerator(model="mistralai/Mistral-7B-v0.1", token=HF_TOKEN),
)
rag_pipeline.add_component("answer_builder", AnswerBuilder())

rag_pipeline.connect("query_embedder", "retriever.query_embedding")
rag_pipeline.connect("retriever.documents", "prompt_builder.documents")
rag_pipeline.connect("prompt_builder.prompt", "llm.prompt")
rag_pipeline.connect("llm.replies", "answer_builder.replies")
rag_pipeline.connect("llm.metadata", "answer_builder.metadata")
rag_pipeline.connect("retriever", "answer_builder.documents")

# Ask a question about the data you just indexed.
question = "Who created the Dothraki vocabulary?"
result = rag_pipeline.run(
{
"query_embedder": {"text": question},
"retriever": {"top_k": 3},
"prompt_builder": {"question": question},
"answer_builder": {"query": question},
}
)

# For details, such as which documents were used to generate the answer, inspect the GeneratedAnswer object
answer: GeneratedAnswer = result["answer_builder"]["answers"][0]

# ruff: noqa: T201
print("Query: ", answer.query)
print("Answer: ", answer.data)
print("== Sources:")
for doc in answer.documents:
print("-> ", doc.meta["file_path"])
98 changes: 98 additions & 0 deletions examples/rag_pipeline_cypher.py
@@ -0,0 +1,98 @@
import os

from haystack import GeneratedAnswer, Pipeline
from haystack.components.builders.answer_builder import AnswerBuilder
from haystack.components.builders.prompt_builder import PromptBuilder
from haystack.components.embedders import SentenceTransformersTextEmbedder
from haystack.components.generators import HuggingFaceTGIGenerator

from neo4j_haystack import Neo4jClientConfig, Neo4jDynamicDocumentRetriever

# Load HF Token from environment variables.
HF_TOKEN = os.environ.get("HF_TOKEN")

# Make sure you have a running Neo4j database with indexed documents available (see `indexing_pipeline.py`),
# e.g. with Docker:
# docker run \
# --restart always \
# --publish=7474:7474 --publish=7687:7687 \
# --env NEO4J_AUTH=neo4j/passw0rd \
# neo4j:5.15.0

client_config = Neo4jClientConfig(
url="bolt://localhost:7687",
username="neo4j",
password="passw0rd",
database="neo4j",
)

cypher_query = """
CALL db.index.vector.queryNodes($index, $top_k, $query_embedding)
YIELD node as doc, score
MATCH (doc)
RETURN doc{.*, score}, score
ORDER BY score DESC LIMIT $top_k
"""

# Build a RAG pipeline with a Retriever that fetches documents relevant to the query and a
# HuggingFaceTGIGenerator that queries an LLM with a custom prompt.
prompt_template = """
Given these documents, answer the question.\nDocuments:
{% for doc in documents %}
{{ doc.content }}
{% endfor %}
\nQuestion: {{question}}
\nAnswer:
"""
rag_pipeline = Pipeline()
rag_pipeline.add_component(
"query_embedder",
SentenceTransformersTextEmbedder(model_name_or_path="sentence-transformers/all-MiniLM-L6-v2", progress_bar=False),
)
rag_pipeline.add_component(
    "retriever",
    Neo4jDynamicDocumentRetriever(
        client_config=client_config,
        runtime_parameters=["query_embedding"],  # expose "query_embedding" as an input socket
        doc_node_name="doc",  # Cypher variable that holds Document data in query results
        verify_connectivity=True,
    ),
)
rag_pipeline.add_component("prompt_builder", PromptBuilder(template=prompt_template))
rag_pipeline.add_component(
"llm",
HuggingFaceTGIGenerator(model="mistralai/Mistral-7B-v0.1", token=HF_TOKEN),
)
rag_pipeline.add_component("answer_builder", AnswerBuilder())

rag_pipeline.connect("query_embedder", "retriever.query_embedding")
rag_pipeline.connect("retriever.documents", "prompt_builder.documents")
rag_pipeline.connect("prompt_builder.prompt", "llm.prompt")
rag_pipeline.connect("llm.replies", "answer_builder.replies")
rag_pipeline.connect("llm.metadata", "answer_builder.metadata")
rag_pipeline.connect("retriever", "answer_builder.documents")

# Ask a question about the data you just indexed.
question = "Who created the Dothraki vocabulary?"
result = rag_pipeline.run(
{
"query_embedder": {"text": question},
"retriever": {
"query": cypher_query,
"parameters": {"index": "document-embeddings", "top_k": 3},
},
"prompt_builder": {"question": question},
"answer_builder": {"query": question},
}
)
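# Note: "document-embeddings" must match the vector index created during indexing (assumed
# here to be the default index name used by Neo4jDocumentStore); $query_embedding is bound
# automatically through the query_embedder connection declared in runtime_parameters.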

# For details, such as which documents were used to generate the answer, inspect the GeneratedAnswer object
answer: GeneratedAnswer = result["answer_builder"]["answers"][0]

# ruff: noqa: T201
print("Query: ", answer.query)
print("Answer: ", answer.data)
print("== Sources:")
for doc in answer.documents:
print("-> ", doc.meta["file_path"])
