In [234]:
pip install haystack-ai chroma-haystack sentence-transformers pypdf markdown-it-py mdit-plain

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [235]:
import os
from typing import Dict, Any
from pathlib import Path

In [249]:
from haystack import Pipeline
from haystack.components.writers import DocumentWriter
from haystack.components.converters import (
    MarkdownToDocument,
    PyPDFToDocument,
    TextFileToDocument,
)
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
from haystack.components.routers import FileTypeRouter
from haystack.components.joiners import DocumentJoiner
from haystack_integrations.document_stores.chroma import ChromaDocumentStore  # type: ignore
from haystack.document_stores.types import DuplicatePolicy

In [257]:
# Agent being indexed and the directory its source files are read from.
agent_name: str = "llm-expert"
agent_dir: str = os.path.join("data", "agents", agent_name)

# Files to index. Each entry's "meta" holds the DocumentSplitter settings
# (split_by / split_length / split_overlap / split_threshold) used for that file.
files = [
    {
        "filename": "what_is_llm.pdf",
        "meta": {
            "split_by": "word",
            "split_length": 10,
            "split_overlap": 0,
            "split_threshold": 0,
        },
    },
]

# Embedding model name, and the path under data/models that is handed to the embedders.
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
embedding_model_path = os.path.join("data", "models", embedding_model_name)

# Directory passed to the Chroma document store as its persist path.
store_path = os.path.join("data", "store")


In [258]:
    # Step 1: open this agent's document store and build a writer for it
    # -------------------------------------------------------------------
    # The collection is named after the agent, uses `store_path` as its
    # persist path, and compares embeddings with the cosine distance.
    document_store = ChromaDocumentStore(
        persist_path=store_path,
        collection_name=f"agent_{agent_name}",
        distance_function="cosine",
    )
    # Writer bound to that store, configured with the OVERWRITE duplicate policy.
    document_writer = DocumentWriter(policy=DuplicatePolicy.OVERWRITE, document_store=document_store)


In [259]:
# Wipe whatever the collection currently holds: when it is not empty,
# fetch every stored document, gather the ids and delete them in one call.
doc_count = document_store.count_documents()
if doc_count:
    docs = document_store.filter_documents()
    ids = [stored.id for stored in docs]
    document_store.delete_documents(ids)


In [260]:
# Step 2: build the components shared by every file
# --------------------------------------------------
# Router configured with the MIME types this notebook converts.
file_type_router = FileTypeRouter(
    mime_types=[
        "text/plain",
        "application/pdf",
        "text/markdown",
    ]
)

# One converter per routed MIME type.
text_file_converter = TextFileToDocument()
markdown_converter = MarkdownToDocument()
pdf_converter = PyPDFToDocument()

# Cleaner applied to converted documents (no splitter is created in this cell).
document_cleaner = DocumentCleaner()

In [261]:
# Step 3: convert, clean and split each configured file, then embed and store the chunks
# ---------------------------------------------------------------------------------------
document_embedder = SentenceTransformersDocumentEmbedder(model=embedding_model_path)
document_embedder.warm_up()

# Converter to use for each MIME type, listed in the order the types are checked.
converters_by_mime_type = {
    "text/plain": text_file_converter,
    "application/pdf": pdf_converter,
    "text/markdown": markdown_converter,
}

merged_docs = []
for file in files:
    file_path = Path(agent_dir) / file["filename"]
    router_res = file_type_router.run(sources=[file_path])

    # Use the first supported MIME type present in the router output. Fail with an
    # explicit message instead of an opaque TypeError when none of them matched.
    mime_type = next((m for m in converters_by_mime_type if m in router_res), None)
    if mime_type is None:
        raise ValueError(f"Unsupported file type for {file_path}")

    txt_docs = converters_by_mime_type[mime_type].run(sources=router_res[mime_type])
    # Fail clearly (rather than with an IndexError) when the converter returned nothing.
    if not txt_docs["documents"]:
        raise ValueError(f"No document could be extracted from {file_path}")

    # The splitter is rebuilt for every file because its settings come from the file's "meta".
    split_settings = file["meta"]
    document_splitter = DocumentSplitter(
        split_by=split_settings["split_by"],
        split_length=split_settings["split_length"],
        split_overlap=split_settings["split_overlap"],
        split_threshold=split_settings["split_threshold"],
    )

    print("before cleaning")
    cleaner_res = document_cleaner.run(documents=txt_docs["documents"])
    print("After cleaning")
    splitter_res = document_splitter.run(documents=cleaner_res["documents"])
    print("after splitting")
    merged_docs.extend(splitter_res["documents"])

print(len(merged_docs))

# Embed all chunks and write them to the document store in a single pipeline run.
indexing_pipeline = Pipeline()
indexing_pipeline.add_component(instance=document_embedder, name="embedder")
indexing_pipeline.add_component(instance=document_writer, name="writer")
indexing_pipeline.connect("embedder", "writer")

indexing_pipeline.run({"embedder": {"documents": merged_docs}})

before cleaning
After cleaning
after splitting
1


Batches: 100%|████████████████████████████████████████████| 1/1 [00:00<00:00,  1.30it/s]


{'writer': {'documents_written': 1}}

In [262]:
from haystack.components.embedders import SentenceTransformersTextEmbedder
from haystack_integrations.document_stores.chroma import ChromaDocumentStore
from haystack_integrations.components.retrievers.chroma import ChromaEmbeddingRetriever

In [276]:
# Step 4: retrieve the stored chunks closest to a question
# ---------------------------------------------------------
# NOTE: `input` shadows the Python builtin of the same name; the name is kept
# unchanged here in case other cells read it.
input = "what is a LLM"

# Query embedder, loaded from the same model path as the document embedder.
text_embedder = SentenceTransformersTextEmbedder(model=embedding_model_path)
# Open the agent's collection so this cell does not depend on the indexing cells' store object.
document_store = ChromaDocumentStore(
    collection_name=f"agent_{agent_name}",
    persist_path=store_path,
    distance_function="cosine",
)
# Retriever over that store, asking for the 3 best matches.
retriever = ChromaEmbeddingRetriever(document_store=document_store, top_k=3)

# Build the pipeline under the same name that is used to run it; previously it was
# built as `q` but run as `querying_pipeline`, which fails on a fresh kernel.
querying_pipeline = Pipeline()
querying_pipeline.add_component("query_embedder", text_embedder)
querying_pipeline.add_component("retriever", retriever)
querying_pipeline.connect("query_embedder.embedding", "retriever.query_embedding")
q = querying_pipeline  # short alias kept for any cell that still refers to `q`

results = querying_pipeline.run({"query_embedder": {"text": input}})

# Flatten each retrieved chunk onto a single line.
chunks = [d.content.replace('\n', ' ') for d in results["retriever"]["documents"]]
count = len(chunks)
print(chunks)


Batches: 100%|████████████████████████████████████████████| 1/1 [00:00<00:00,  2.77it/s]
Number of requested results 3 is greater than number of elements in index 1, updating n_results = 1


["WHAT IS LLM A large language model (LLM) is a deep learning algorithm that can perform a variety of natural language processing (NLP) tasks. Large language models use transformer models and are trained using massive datasets — hence, large. This enables them to recognize, translate, predict, or generate text or other content. Large language models are also referred to as neural networks (NNs) , which are computing systems inspired by the human brain. These neural networks work using a network of nodes that are layered, much like neurons. In addition to teaching human languages to artificial intelligence (AI) applications, large language models can also be trained to perform a variety of tasks like understanding protein structures, writing software code, and more. Like the human brain, large language models must be pre-trained and then fine-tuned so that they can solve text classification, question answering, document summarization, and text generation problems. Their problem-solving 