# Use ChromaDocumentStore with Haystack

https://docs.haystack.deepset.ai/docs/chromadocumentstore

WORKING (chromadb2 env)

In [1]:
# Install the Chroma integration, Haystack will come as a dependency
# !pip install -U chroma-haystack "huggingface_hub>=0.22.0"

#!pip install chroma-haystack
#!pip install torch torchaudio torchvision

## Indexing Pipeline: preprocess, split and index documents
In this section, we will index documents into a Chroma DB collection by building a Haystack indexing pipeline. Here, we are indexing documents from the [Vim User Manual](https://vimhelp.org/) into the Haystack `ChromaDocumentStore`.

 We have the `.txt` files for these pages in the examples folder for the `ChromaDocumentStore`, so we are using the [`TextFileToDocument`](https://docs.haystack.deepset.ai/v2.0/docs/textfiletodocument) and [`DocumentWriter`](https://docs.haystack.deepset.ai/v2.0/docs/documentwriter) components to build this indexing pipeline.

In [10]:
import os
from pathlib import Path
import torch
# Check whether PyTorch can see a CUDA device; the bare expression is shown as the cell output.
torch.cuda.is_available()

True

In [11]:
from haystack import Pipeline
from haystack.components.converters import TextFileToDocument
from haystack.components.writers import DocumentWriter
from haystack.components.preprocessors import DocumentSplitter, DocumentCleaner
from haystack.components.embedders import SentenceTransformersDocumentEmbedder

from haystack_integrations.document_stores.chroma import ChromaDocumentStore
from haystack import Document

In [12]:
from rich import print  # https://rich.readthedocs.io/en/stable/markup.html#console-markup
from rich.console import Console
from rich.table import Table
from rich.highlighter import RegexHighlighter
from rich.theme import Theme
from rich.text import Text
from rich.padding import Padding

In [13]:
# Regex patterns used by the demo highlighter. Each pattern is case-insensitive
# ((?i)) and wraps its alternatives in a single named group; the group names line
# up with the "example.<name>" keys of the `theme` defined in this cell.
hl_list = [
    # "bank" word family: longer phrases / other senses, then the finance terms.
    r"(?i)(?P<bankother>phone|bank shot|bank of the|banked)",
    r"(?i)(?P<bank>bank|boa|teller|deposit|vault|account)",

    # "board" word family: longer phrases / other senses, then the lumber terms.
    r"(?i)(?P<boardother>back board|board of direct|board the)",
    r"(?i)(?P<board>board|plank|timber|2x4|wood supplies)",

    # "fly" word family: aviation terms, the slang sense, then the bare word.
    r"(?i)(?P<flying>fly over|pilot|navigation|boeing|747|landing)",
    r"(?i)(?P<cool>fly sister|cool)",
    r"(?i)(?P<fly>fly)",
]

class MyHighlighter(RegexHighlighter):
    """Highlight the bank / board / fly keyword groups matched by ``hl_list``."""
    # Prefix shared by the style names in `theme` ("example.bank", "example.fly", ...).
    base_style = "example."
    # Regex patterns whose named groups select which style is applied.
    highlights = hl_list

# Highlighter instance handed to the rich Console.
my_hl = MyHighlighter()

# One style per named regex group in hl_list, keyed as "example.<group name>":
# green shades for the bank family, blue shades for the board family,
# red / magenta shades for the fly family.
theme = Theme(
    {"example.bankother": "bold green",
     "example.bank": "bold bright_green",

     "example.boardother": "bold blue",
     "example.board": "bold bright_blue",

     "example.flying": "bold red",
     "example.cool": "bold bright_magenta",
     "example.fly": "bold bright_red",
     })


In [14]:
# Console wired to the custom highlighter and theme, followed by a one-line demo
# sentence that exercises several of the keyword groups.
console = Console(theme=theme, highlighter=my_hl)
demo_sentence = "Highlighting demo: 'My bank is not BoA, but is on the bank of the river. Sometimes I fly over the board of directors to look cool.'"
console.print(demo_sentence)

In [15]:
# Show CUDA availability; the [bold red]...[/] tags are rich console markup.
print(f"Torchy?? [bold red]{torch.cuda.is_available()}[/]")
# Name of the sentence-transformers model used for embeddings in this notebook.
embedding_model = "sentence-transformers/all-MiniLM-L6-v2"

In [16]:
from pathlib import Path

# Recursively collect every .txt file below the data directory and show the list.
data_dir = "../data/"
data_root = Path(data_dir)
text_files = list(data_root.rglob("*.txt"))
print(text_files)

In [17]:
# Create the Chroma-backed document store with its default settings.
# NOTE(review): presumably the default store is not persisted to disk — confirm
# before relying on its contents across kernel restarts.
document_store = ChromaDocumentStore()
# Print how many documents the store holds before anything is indexed.
print(f"Doc store count at start: {document_store.count_documents()}")

In [None]:
# Remove repeated substrings to get rid of headers/footers.
cleaner = DocumentCleaner(remove_repeated_substrings=True)
# Split into single-sentence chunks with no overlap, so every indexed document is
# one sentence.
# Word-based alternative kept for reference:
# splitter = DocumentSplitter(split_by="word", split_length=30, split_overlap=5)
splitter = DocumentSplitter(split_by="sentence", split_length=1, split_overlap=0)

In [None]:
# Sanity-check the splitter on its own, outside the indexing pipeline.
# The splitter is given a list of Documents that actually carry text, so a small
# two-sentence sample document is built here.
test_doc = Document(content="Vim is a text editor. It is highly configurable.")
if hasattr(splitter, "warm_up"):
    # NOTE(review): newer Haystack releases appear to require warm_up() before a
    # standalone sentence split; older releases have no such method — confirm.
    splitter.warm_up()
split_docs = splitter.run(documents=[test_doc])

In [None]:
# Components for the tail of the indexing pipeline: one embeds each chunk with the
# sentence-transformers model, the other writes the chunks into the document store.
embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
document_writer = DocumentWriter(document_store=document_store)
document_embedder = SentenceTransformersDocumentEmbedder(model=embedding_model)

In [19]:
# Register the five indexing components under the names used when wiring them up.
indexing = Pipeline()
for component_name, component in (
    ("converter", TextFileToDocument()),
    ("document_cleaner", cleaner),
    ("document_splitter", splitter),
    ("document_embedder", document_embedder),
    ("document_writer", document_writer),
):
    indexing.add_component(name=component_name, instance=component)

In [20]:
# Wire the stages in order: convert -> clean -> split -> embed -> write.
# Socket names are spelled out; every hop passes the component's `documents` output
# to the next component's `documents` input.
indexing.connect("converter.documents", "document_cleaner.documents")
indexing.connect("document_cleaner.documents", "document_splitter.documents")
indexing.connect("document_splitter.documents", "document_embedder.documents")
indexing.connect("document_embedder.documents", "document_writer.documents")

<haystack.core.pipeline.pipeline.Pipeline object at 0x7fd39a726bc0>
🚅 Components
  - converter: TextFileToDocument
  - document_cleaner: DocumentCleaner
  - document_splitter: DocumentSplitter
  - document_embedder: SentenceTransformersDocumentEmbedder
  - document_writer: DocumentWriter
🛤️ Connections
  - converter.documents -> document_cleaner.documents (List[Document])
  - document_cleaner.documents -> document_splitter.documents (List[Document])
  - document_splitter.documents -> document_embedder.documents (List[Document])
  - document_embedder.documents -> document_writer.documents (List[Document])

In [21]:
# Run the indexing pipeline, feeding the collected text file paths to the converter.
indexing.run({"converter": {"sources": text_files}})

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

{'document_writer': {'documents_written': 25}}

In [22]:
# Report the store's document count after indexing ([red bold]...[/] is rich markup).
print(f"Doc store count [red bold]AFTER INDEXING: {document_store.count_documents()}[/]")

In [23]:
# Create pipeline components
from haystack_integrations.components.retrievers.chroma import ChromaQueryTextRetriever
from haystack.components.embedders import SentenceTransformersTextEmbedder

# Standalone query embedder (warmed up so it can be called directly) and the
# text retriever that queries the Chroma store.
text_embedder = SentenceTransformersTextEmbedder(model=embedding_model)
text_embedder.warm_up()
retriever = ChromaQueryTextRetriever(top_k=3, document_store=document_store)

In [24]:
# Retrieval-only pipeline: a single component registered as "retriever".
query_pipeline = Pipeline()
query_pipeline.add_component(name="retriever", instance=retriever)

In [None]:
# Simple sanity check that we can embed a query outside the pipeline.
# TODO -- confirm pipeline is in fact embedding the query "magically" in the current setup
# The sample text is defined here so that this cell runs on a fresh
# Restart & Run All without depending on variables assigned in later cells.
sanity_query = "How many languages are there?"
query_embedding = text_embedder.run(sanity_query)

In [42]:
# print(f"Example embedding (first 10):  {query_embedding['embedding'][:10]}")

In [43]:
# Query text and number of documents to request in the retrieval demo.
q: str = "How many languages are there?"
k: int = 5

In [44]:
# Run retrieval for q. top_k is passed at run time as k, while the retriever was
# constructed with top_k=3.
# NOTE(review): presumably the run-time value takes precedence — confirm.
result = query_pipeline.run({"retriever": {"query": q, "top_k": k}})

In [None]:
# for doc in result["retriever"]["documents"]:
#     print(f"Doc: {doc.id}")

In [45]:
def display_results(table_title, results, show_lines=True, component="retriever"):
    """Render retrieved documents as a two-column rich table on ``console``.

    Args:
        table_title: Title passed to the table.
        results: Pipeline output mapping; ``results[component]["documents"]`` must be
            an iterable of objects exposing ``score`` and ``content`` attributes.
        show_lines: Forwarded to ``Table(show_lines=...)``.
        component: Key of the pipeline component whose documents are displayed.
            Defaults to ``"retriever"``.

    Raises:
        KeyError: If ``results`` has no ``component`` entry or that entry has no
            ``"documents"`` key.
    """
    table = Table(title=table_title, show_lines=show_lines, highlight=True)
    table.add_column("Score", style="blue")
    table.add_column("text")
    for doc in results[component]["documents"]:
        # A missing score or content is shown as a placeholder instead of raising
        # from the float format / .strip() calls.
        score_cell = "n/a" if doc.score is None else "{:.2f}".format(doc.score)
        content_cell = (doc.content or "").strip()
        table.add_row(score_cell, content_cell)

    console.print(table)
    print('\n\n')

In [46]:
# Two-line table title (embedding model, then query), then render the results.
title = (
    f"Embedding model: [bold blue]{embedding_model}[/] \n"
    f"Query: [bold blue]{q}[/]"
)
display_results(table_title=title, results=result)

## Query Pipeline: build retrieval-augmented generation (RAG) pipelines

Once we have documents in the `ChromaDocumentStore`, we can use the accompanying Chroma retrievers to build a query pipeline. The query pipeline below is a simple retrieval-augmented generation (RAG) pipeline that uses Chroma's [query API](https://docs.trychroma.com/usage-guide#querying-a-collection).

You can change the indexing pipeline and query pipelines here for embedding search by using one of the [`Haystack Embedders`](https://docs.haystack.deepset.ai/v2.0/docs/embedders) accompanied by the `ChromaEmbeddingRetriever`.


In this example we are using:
- The `HuggingFaceTGIGenerator` with the Mistral-7B-Instruct-v0.1 model. (You will need a Hugging Face token to use this model.) You can replace this with any of the other [`Generators`](https://docs.haystack.deepset.ai/v2.0/docs/generators).
- The `PromptBuilder` which holds the prompt template. You can adjust this to a prompt of your choice
- The `ChromaQueryTextRetriever`, which takes a query string and retrieves the `top_k` most relevant documents from your Chroma collection.

In [None]:
import os
from getpass import getpass
# Take the Hugging Face token from the environment when it is set; otherwise prompt
# for it and export it so later cells can read it from the environment.
hfat = "HF_API_TOKEN"
hf_token = os.environ.get(hfat)
if hf_token is not None:
    print("---------- Found hugging face token in environ, no need to prompt")
else:
    print("++++++++++ Found hugging face token NOT in environ, need to prompt...")
    hf_token = getpass("Enter Hugging Face API key:")
    os.environ["HF_API_TOKEN"] = hf_token

In [None]:
from haystack.components.builders import PromptBuilder

# Jinja template for the RAG prompt. It loops over `documents` (emitting each
# document's content) and interpolates `query`, so both variables must be supplied
# to the prompt builder when the pipeline runs.
prompt = """
Answer the query based on the provided context.
If the context does not contain the answer, say 'Answer not found'.
Context:
{% for doc in documents %}
  {{ doc.content }}
{% endfor %}
query: {{query}}
Answer:
"""
prompt_builder = PromptBuilder(template=prompt)

In [None]:
from haystack_integrations.components.retrievers.chroma import ChromaQueryTextRetriever
from haystack.components.generators import HuggingFaceTGIGenerator
from haystack.utils import Secret

In [None]:
from huggingface_hub import notebook_login
# NOTE(review): presumably opens the interactive Hugging Face Hub login prompt. The
# token is also read from / stored in HF_API_TOKEN elsewhere in this notebook, so
# confirm whether both mechanisms are needed.
notebook_login()

In [None]:
# Wrap the raw token in a Haystack Secret and build a standalone generator client.
stoken = Secret.from_token(hf_token)
# NOTE(review): this client uses Mistral-7B-Instruct-v0.2, while the RAG pipeline
# cell in this notebook uses Mistral-7B-Instruct-v0.1 — confirm which is intended.
# Base-model alternative:
# client = HuggingFaceTGIGenerator(model="mistralai/Mistral-7B-v0.1", token=stoken)
client = HuggingFaceTGIGenerator(model="mistralai/Mistral-7B-Instruct-v0.2", token=stoken)

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the Mistral-7B-Instruct-v0.2 causal LM through transformers.
# NOTE(review): `model` is not referenced by any other cell visible in this
# notebook (generation goes through HuggingFaceTGIGenerator) — confirm this load
# is still needed.
model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")

In [None]:
# Load the matching tokenizer through transformers.
# NOTE(review): `tokenizer` is not referenced by any other cell visible in this
# notebook — confirm this load is still needed.
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")

In [None]:
# Smoke-test the standalone generator client with a fixed prompt and print the
# raw response object.
client.warm_up()
response = client.run("What's Natural Language Processing?")

print(response)

In [None]:
# RAG pipeline: retrieve documents -> fill the prompt template -> generate an answer.
# A fresh retriever instance is created for this pipeline.
retriever = ChromaQueryTextRetriever(document_store)
llm = HuggingFaceTGIGenerator(model="mistralai/Mistral-7B-Instruct-v0.1")
llm.warm_up()

querying = Pipeline()
for component_name, component in (
    ("retriever", retriever),
    ("prompt_builder", prompt_builder),
    ("llm", llm),
):
    querying.add_component(component_name, component)

# Retrieved documents feed the template; the rendered prompt feeds the generator.
querying.connect("retriever.documents", "prompt_builder.documents")
querying.connect("prompt_builder", "llm")

In [None]:
# Only one question is sent per run; uncomment the alternative to try it instead.
# query = "What is the Revenue Capacity for jacksonville beach?"
query = "How is annual enrollment assessed in Jax Beach schools?"
# NOTE / TODO: the retriever takes a single "query" string here; the
# {"queries": [query]} form shown in the original example is wrong/broken
# (typo or outdated syntax?).
results = querying.run({
    "retriever": {"query": query, "top_k": 3},
    "prompt_builder": {"query": query},
    "llm": {"generation_kwargs": {"max_new_tokens": 350}},
})

In [None]:
# Print the first generated reply from the "llm" component's output.
print(results["llm"]["replies"][0])