In [2]:
pip install haystack-ai chroma-haystack sentence-transformers pypdf markdown-it-py mdit-plain

Collecting mdit-plain
  Downloading mdit_plain-1.0.1-py3-none-any.whl.metadata (1.2 kB)
Downloading mdit_plain-1.0.1-py3-none-any.whl (3.1 kB)
Installing collected packages: mdit-plain
Successfully installed mdit-plain-1.0.1
Note: you may need to restart the kernel to use updated packages.


In [1]:
import os
from typing import Dict, Any
from pathlib import Path

In [13]:
from haystack import Pipeline
from haystack.components.writers import DocumentWriter
from haystack.components.converters import (
    MarkdownToDocument,
    PyPDFToDocument,
    TextFileToDocument,
)
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
from haystack.components.routers import FileTypeRouter
from haystack.components.joiners import DocumentJoiner
from haystack_integrations.document_stores.chroma import ChromaDocumentStore  # type: ignore
from haystack.document_stores.types import DuplicatePolicy

In [192]:
agent_name: str = "llm-expert"
agent_dir: str = os.path.join("data", "agents", agent_name)
files = [
    {"filename": "rules.pdf", "meta": {"split_by" : "page", "split_length":1, "split_overlap":0, "split_threshold":0}},
    {"filename": "test.txt", "meta": {"split_by" : "passage", "split_length":1, "split_overlap":0, "split_threshold":0}}
]

embedding_model_name="sentence-transformers/all-MiniLM-L6-v2"
embedding_model_path = os.path.join("data", "models" , embedding_model_name)
store_path = os.path.join("data", "store")


In [186]:
    # step 1: clear the store of this agent data
    # ------------------------------------------
    # set the store
    document_store = ChromaDocumentStore(
        collection_name=f"agent_{agent_name}",
        persist_path=store_path,
        distance_function="cosine",
    )
    document_writer = DocumentWriter(
        document_store=document_store, policy=DuplicatePolicy.OVERWRITE
    )


In [187]:
# delete existing docs if any in the colleciton
doc_count = document_store.count_documents()
if doc_count > 0:
    docs = document_store.filter_documents()
    ids = [d.id for d in docs]
    document_store.delete_documents(ids)


In [194]:
# step 2: create common pipeline components
# -----------------------------------------
document_embedder = SentenceTransformersDocumentEmbedder(model=embedding_model_path)
# set the types supported
file_type_router = FileTypeRouter(
    mime_types=["text/plain", "application/pdf", "text/markdown"]
)
# set the converters
text_file_converter = TextFileToDocument()
markdown_converter = MarkdownToDocument()
pdf_converter = PyPDFToDocument()
# set the cleaner and splitter
document_cleaner = DocumentCleaner()
# set the joiner
document_joiner = DocumentJoiner(join_mode="merge")

In [195]:
merged_docs = []
for file in files:
     # set the filepath
    file_path = Path(agent_dir) / file["filename"]
    router_res = file_type_router.run(sources=[file_path])
    txt_docs = []
    if 'text/plain' in router_res:
        txt_docs = text_file_converter.run(sources=router_res['text/plain'])
    elif 'application/pdf' in router_res:
        txt_docs = pdf_converter.run(sources=router_res['application/pdf'])
    elif 'text/markdown' in router_res:
        txt_docs = markdown_converter.run(sources=router_res['text/markdown'])
    document_splitter = DocumentSplitter(
        split_by=file['meta']['split_by'], 
        split_length=file['meta']['split_length'], 
        split_overlap=file['meta']['split_overlap'], 
        split_threshold=file['meta']['split_threshold']
    )
    #print(file_path, txt_docs)
    splitter_res = document_splitter.run([txt_docs['documents'][0]])
    print(len(splitter_res['documents']))
    document_embedder.run([splitter_res[0]])
    merged_docs.extend(splitter_res['documents'])

print("total")
print(len(merged_docs))

1


KeyError: 0

In [184]:
# set the embedder
print(embedding_model_path)
print(merged_docs[5])
document_embedder = SentenceTransformersDocumentEmbedder(model=embedding_model_path)
result = document_embedder.run([merged_docs[0]])

data/models/sentence-transformers/all-MiniLM-L6-v2
Document(id=06e18617deb19b08211ace915d42e25002812564808095897be8d61021e5470b, content: 'to be affected by the proposal and all other person s likely to be interested in the said lands to 
...', meta: {'file_path': 'data/agents/llm-expert/rules.pdf', 'source_id': '80f5ce20d8070ff57776dfe29b31ee6d13ac56a2adc696578c4aa6b403df7b22', 'page_number': 6, 'split_id': 5, 'split_idx_start': 14001})


RuntimeError: The embedding model has not been loaded. Please call warm_up() before running.