1. caching
1. tracing, logging and/or breaking down pipelines
1. evals
1. hypster stored_values

1. improvements - query enhancement, document summary, multiple documents 
1. optimizations - parallel runs, etc.

1. hypster registry?
1. hypernodes with haystack backend?

# Config

In [1]:
from hypster import HP, config

In [2]:
# Hypster config for the front half of the indexing pipeline: load PDF sources
# and split them into chunks. The pipeline deliberately ends at "splitter";
# hp_config attaches the document embedder and the writer after it.
@config
def indexing_config(hp: HP):
    from haystack.components.converters import PyPDFToDocument

    converter = PyPDFToDocument()

    from haystack.components.preprocessors import DocumentSplitter

    # Tunable chunking: the split unit, plus split_length (default 10) and
    # split_overlap (default 2) passed straight to the splitter.
    split_by = hp.select(["sentence", "word", "passage", "page"], default="sentence")
    splitter = DocumentSplitter(split_by=split_by, split_length=hp.int_input(10), split_overlap=hp.int_input(2))

    from haystack import Pipeline

    # Local names matter here: callers read this config's outputs by variable
    # name (hp_config uses indexing_inputs["pipeline"]).
    pipeline = Pipeline()
    pipeline.add_component("loader", converter)
    pipeline.add_component("splitter", splitter)
    pipeline.connect("loader", "splitter")


# Persist the config so other configs can pull it in with load("configs/indexing.py").
indexing_config.save("configs/indexing.py")

In [3]:
# Hypster config for local FastEmbed embeddings. Produces a document embedder,
# a matching query (text) embedder, and the embedding dimension of the chosen model.
@config
def fast_embed(hp: HP):
    from typing import Any, Dict, List

    from fastembed import TextEmbedding

    # Return the "dim" field of the entry in model_list whose "model" field
    # equals chosen_model; raise ValueError when no entry matches.
    def get_model_dim(chosen_model: str, model_list: List[Dict[str, Any]]) -> int:
        for model in model_list:
            if model["model"] == chosen_model:
                return model["dim"]
        raise ValueError(f"Model {chosen_model} not found in the list of supported models.")
    
    from haystack_integrations.components.embedders.fastembed import (
        FastembedDocumentEmbedder,
        FastembedTextEmbedder,
    )

    # NOTE: the name is misspelled ("fileds"); it is left untouched because
    # config locals are exposed to callers by name.
    meta_fileds_to_embed = ["parent_doc_summary"]

    # Short aliases map to full model ids; `model` holds the full id.
    model = hp.select(
        {"bge-small": "BAAI/bge-small-en-v1.5", "mini-lm": "sentence-transformers/all-MiniLM-L6-v2"},
        default="mini-lm",
    )
    import os

    # os.cpu_count() may return None, hence the fallback to 1. The default for
    # `parallel` therefore depends on the machine that evaluates the config.
    cpu_count = os.cpu_count() or 1
    doc_embedder = FastembedDocumentEmbedder(
        model=model, parallel=hp.int_input(cpu_count), meta_fields_to_embed=meta_fileds_to_embed
    )
    # Same model for queries and documents so both land in one vector space.
    text_embedder = FastembedTextEmbedder(model=model)
    # Looked up from fastembed's own model catalogue rather than hard-coded.
    embedding_dim = get_model_dim(model, TextEmbedding.list_supported_models())

# Persist the config so other configs can pull it in with load("configs/fast_embed.py").
fast_embed.save("configs/fast_embed.py")

In [4]:
# Hypster config for Jina embeddings. Produces a document embedder, a matching
# query (text) embedder, and the embedding dimension shared by both.
@config
def jina_embed(hp: HP):
    from haystack_integrations.components.embedders.jina import JinaDocumentEmbedder, JinaTextEmbedder

    # NOTE: the name is misspelled ("fileds"); it is left untouched because
    # config locals are exposed to callers by name.
    meta_fileds_to_embed = ["parent_doc_summary"]

    model = hp.select({"v3": "jina-embeddings-v3", "v2": "jina-embeddings-v2"}, default="v3")
    # hp.select on a dict yields the mapped value (the full model id), not the
    # "v3"/"v2" key, so the late-chunking gate has to compare against the id.
    # Comparing against "v3" was never true, which silently disabled late
    # chunking and hid its selector. Non-v3 models keep late_chunking=False.
    late_chunking = hp.select([True, False], default=True, name="late_chunking") if model == "jina-embeddings-v3" else False
    doc_embedder = JinaDocumentEmbedder(
        model=model,
        batch_size=hp.int_input(16),
        dimensions=hp.int_input(256),
        task="retrieval.passage",
        late_chunking=late_chunking,
        meta_fields_to_embed=meta_fileds_to_embed,
    )
    # Reuse the document embedder's dimensions so query and document vectors match.
    text_embedder = JinaTextEmbedder(model=model, dimensions=doc_embedder.dimensions, task="retrieval.query")
    embedding_dim = doc_embedder.dimensions

# Persist the config so other configs can pull it in with load("configs/jina_embed.py").
jina_embed.save("configs/jina_embed.py")

In [5]:
# Hypster config for hybrid retrieval over an in-memory document store:
# a BM25 retriever and an embedding retriever share one store and their
# results are merged by a DocumentJoiner.
@config
def in_memory_retrieval(hp: HP):
    from haystack.components.retrievers.in_memory import InMemoryBM25Retriever, InMemoryEmbeddingRetriever
    from haystack.document_stores.in_memory import InMemoryDocumentStore

    embedding_similarity_function = hp.select(["cosine", "dot_product"], default="cosine")
    # Exposed by name: hp_config hands document_store to its DocumentWriter.
    document_store = InMemoryDocumentStore(embedding_similarity_function=embedding_similarity_function)

    from haystack.components.joiners.document_joiner import DocumentJoiner

    join_mode = hp.select(
        ["distribution_based_rank_fusion", "concatenate", "merge", "reciprocal_rank_fusion"],
        default="distribution_based_rank_fusion",
    )
    joiner = DocumentJoiner(join_mode=join_mode, top_k=hp.int_input(5))

    from haystack import Pipeline

    # Project-local helpers; presumably they forward their input unchanged so the
    # pipeline has stable "query" / "retrieved_documents" endpoints — confirm in
    # src.haystack_utils.
    from src.haystack_utils import PassThroughDocumentsComponent, PassThroughTextComponent

    pipeline = Pipeline()
    pipeline.add_component("query", PassThroughTextComponent())
    pipeline.add_component("bm25_retriever", InMemoryBM25Retriever(document_store=document_store))
    pipeline.add_component("embedding_retriever", InMemoryEmbeddingRetriever(document_store=document_store))
    pipeline.add_component("document_joiner", joiner)
    pipeline.add_component("retrieved_documents", PassThroughDocumentsComponent())
    pipeline.connect("query", "bm25_retriever")
    pipeline.connect("bm25_retriever", "document_joiner")
    # "embedding_retriever" gets no input here: hp_config inserts a text embedder
    # between "query" and "embedding_retriever".
    pipeline.connect("embedding_retriever", "document_joiner")
    pipeline.connect("document_joiner", "retrieved_documents")


# Persist the config so other configs can pull it in with load("configs/in_memory_retrieval.py").
in_memory_retrieval.save("configs/in_memory_retrieval.py")

In [6]:
# Hypster config for embedding-only retrieval backed by a Qdrant document store.
# Exposes the same "query" / "embedding_retriever" / "retrieved_documents"
# component names as the in-memory retrieval config so hp_config can wire
# either one the same way.
@config
def qdrant_retrieval(hp: HP):
    from haystack_integrations.components.retrievers.qdrant import QdrantEmbeddingRetriever
    from haystack_integrations.document_stores.qdrant import QdrantDocumentStore

    # Defaults to ":memory:", i.e. no external Qdrant location is configured.
    location = hp.text_input(":memory:")
    embedding_similarity_function = hp.select(["cosine", "dot_product", "l2"], default="cosine")

    # TODO: disabled until the embedder's dimension can be shared across configs.
    # NOTE(review): the two lines below disagree on the attribute name
    # (stored_values vs stored_value) — settle that before enabling.
    # if "embedding_dim" in hp.stored_values:
    #     embedding_dim = hp.stored_value["embedding_dim"]
    # else:
    #     embedding_dim = hp.int_input(256)

    # NOTE(review): with embedding_dim commented out the store falls back to the
    # library default, which presumably has to match the selected embedder's
    # output size — verify before using this backend.
    document_store = QdrantDocumentStore(
        location=location, 
        recreate_index=True,
        similarity=embedding_similarity_function,
        #embedding_dim=embedding_dim,
        on_disk=True,
    )

    embedding_retriever = QdrantEmbeddingRetriever(document_store=document_store, top_k=hp.int_input(5))

    from haystack import Pipeline

    # Project-local helpers; presumably simple pass-through endpoints — confirm
    # in src.haystack_utils.
    from src.haystack_utils import PassThroughDocumentsComponent, PassThroughTextComponent

    pipeline = Pipeline()
    pipeline.add_component("query", PassThroughTextComponent())
    pipeline.add_component("embedding_retriever", embedding_retriever)
    pipeline.add_component("retrieved_documents", PassThroughDocumentsComponent())
    # "query" is left unconnected here: hp_config inserts a text embedder between
    # "query" and "embedding_retriever".
    pipeline.connect("embedding_retriever", "retrieved_documents")


# Persist the config so other configs can pull it in with load("configs/qdrant_retrieval.py").
qdrant_retrieval.save("configs/qdrant_retrieval.py")

In [7]:
# Hypster config that builds an OpenAI generator with a selectable model.
@config
def openai_llm(hp: HP):
    from haystack.components.generators import OpenAIGenerator

    # Keys are the selectable aliases; `model` receives the mapped model id.
    model = hp.select(
        {"gpt-4o-mini": "gpt-4o-mini", "gpt-4o": "gpt-4o", "gpt-4o-latest": "gpt-4o-2024-08-06"}, default="gpt-4o-mini"
    )
    # Exposed by name: the combined llm config reads this output as ["llm"].
    llm = OpenAIGenerator(model=model)


# Persist the config so other configs can pull it in with load("configs/openai_llm.py").
openai_llm.save("configs/openai_llm.py")

In [8]:
# Hypster config that builds an Anthropic generator with a selectable model.
@config
def anthropic_llm(hp: HP):
    from haystack_integrations.components.generators.anthropic import AnthropicGenerator

    # Keys are the selectable aliases; `model` receives the mapped model id.
    model = hp.select({"haiku": "claude-3-haiku-20240307", "sonnet": "claude-3-5-sonnet-20240620"}, default="haiku")
    # Exposed by name: the combined llm config reads this output as ["llm"].
    llm = AnthropicGenerator(model=model)


# Persist the config so other configs can pull it in with load("configs/anthropic_llm.py").
anthropic_llm.save("configs/anthropic_llm.py")

In [9]:
# Hypster config that selects between the OpenAI and Anthropic generator configs
# and exposes the chosen generator as `llm`.
@config
def llm(hp: HP):
    from hypster import load

    openai_llm = load("configs/openai_llm.py")
    anthropic_llm = load("configs/anthropic_llm.py")

    # Both propagate(...) calls run while the dict literal is built, i.e. before
    # select picks one, so both nested configs are evaluated on every run.
    llm = hp.select(
        {"openai": hp.propagate(openai_llm)["llm"], "anthropic": hp.propagate(anthropic_llm)["llm"]}, default="openai"
    )


# Persist the config so other configs can pull it in with load("configs/llm.py").
llm.save("configs/llm.py")

In [12]:
# Top-level hypster config: composes the saved sub-configs into two Haystack
# pipelines — `indexing_pipeline` (load -> split -> embed -> write) and
# `pipeline` (query -> retrieve -> prompt -> LLM). Later notebook cells read
# `indexing_pipeline`, `pipeline`, `file_path` and `query` from its outputs.
@config
def hp_config(hp: HP):
    from hypster import load

    file_path = hp.text_input("data/raw/modular_rag.pdf")
    # Fixed question, not a tunable parameter.
    query = "What is the use of BERT in this document?"

    # --- Indexing: start from the loader/splitter pipeline of the indexing config.
    indexing = load("configs/indexing.py")
    indexing_inputs = hp.propagate(indexing)
    indexing_pipeline = indexing_inputs["pipeline"]

    # --- Embedder: the selected value is the nested config's output mapping, from
    # which the document and text embedders are taken below.
    fast_embed = load("configs/fast_embed.py")
    jina_embed = load("configs/jina_embed.py")
    embedder = hp.select({"fastembed": hp.propagate(fast_embed), "jina": hp.propagate(jina_embed)}, default="fastembed")
    indexing_pipeline.add_component("doc_embedder", embedder["doc_embedder"])

    # --- Retrieval backend: each option supplies a "document_store" and a query
    # "pipeline" with "query", "embedding_retriever" and "retrieved_documents".
    qdrant = load("configs/qdrant_retrieval.py")
    in_memory = load("configs/in_memory_retrieval.py")
    document_store = hp.select(
        {"in_memory": hp.propagate(in_memory), "qdrant": hp.propagate(qdrant)}, default="in_memory"
    )

    from haystack.components.writers import DocumentWriter
    from haystack.document_stores.types import DuplicatePolicy

    # The writer targets the same store the retrievers read from.
    document_writer = DocumentWriter(document_store["document_store"], policy=DuplicatePolicy.OVERWRITE)
    indexing_pipeline.add_component("document_writer", document_writer)

    indexing_pipeline.connect("splitter", "doc_embedder")
    indexing_pipeline.connect("doc_embedder", "document_writer")

    # --- Query side: embed the query text and feed it to the embedding retriever,
    # which the retrieval configs leave without an input.
    pipeline = document_store["pipeline"]
    pipeline.add_component("text_embedder", embedder["text_embedder"])
    pipeline.connect("query", "text_embedder")
    pipeline.connect("text_embedder", "embedding_retriever")

    # Two independently configurable instances of the same llm config.
    # NOTE(review): small_llm_model is not used anywhere in this function.
    llm = load("configs/llm.py")
    small_llm_model = hp.propagate(llm)
    large_llm_model = hp.propagate(llm)

    from haystack.components.builders import PromptBuilder

    # The prompt is itself a text parameter; its template variables `documents`
    # and `question` are the prompt_builder inputs connected below.
    template = hp.text_input("""
    Given the following information, answer the question in one short sentence, 
    using the information provided in the documents. add an exact quote from the document to support your answer, 
    preferably with the keyword/s from the original question.

    Context:
    {% for document in documents %}
        {{ document.content }}
    {% endfor %}

    Question: {{question}}
    Answer:
    Quote:
    """)

    # --- Generation: retrieved documents and the original query fill the prompt,
    # which is then sent to the large LLM.
    pipeline.add_component("prompt_builder", PromptBuilder(template=template))
    pipeline.add_component("llm", large_llm_model["llm"])
    pipeline.connect("retrieved_documents", "prompt_builder.documents")
    pipeline.connect("query", "prompt_builder.question")
    pipeline.connect("prompt_builder", "llm")

In [13]:
# Instantiate the full config, overriding the defaults for the LLM and the
# retrieval backend. Dotted keys presumably address parameters inside a
# propagated (nested) config.
# NOTE(review): confirm these keys match the parameter names hypster generates
# for the nested llm config.
inputs = hp_config(selections={"large_llm_model" : "anthropic", "large_llm_model.model": "haiku", 
                               "document_store" : "in_memory"})

In [14]:
# Expose every config output as a notebook global; the cells below rely on
# indexing_pipeline, pipeline, file_path and query being defined this way.
globals().update(inputs)

In [15]:
# Warm up the indexing pipeline before its first run (the progress bar in the
# cell output suggests model files are fetched at this point).
indexing_pipeline.warm_up()

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

In [16]:
# Index the PDF: "loader" is the converter component and receives the file
# path(s) through its "sources" input.
indexing_pipeline.run({"loader": {"sources": [file_path]}})

Calculating embeddings: 100%|██████████| 156/156 [00:01<00:00, 87.17it/s]


{'document_writer': {'documents_written': 156}}

In [17]:
# Warm up and run the query pipeline; the question enters through the "query"
# component's "text" input.
pipeline.warm_up()
response = pipeline.run({"query": {"text": query}})

Calculating embeddings: 100%|██████████| 1/1 [00:00<00:00, 193.20it/s]


In [18]:
# Show the first reply produced by the "llm" component.
print(response["llm"]["replies"][0])

BERT is used in the document as part of the "Dense Retriever" to provide complex semantic representations of queries and documents.  
Quote: "Dense Retriever employs pre-trained language models (PLMs) to provide dense representations of queries and documents."
