# Chroma Default Embedding demo

In [1]:
# Started from https://ollama.com/blog/embedding-models, but heavily customized since.

import chromadb
from rich import print  # https://rich.readthedocs.io/en/stable/markup.html#console-markup

In [2]:
# Demo corpus: document id -> text to embed.
# The id prefix names the topic group: vdb = vector databases,
# emb = embedding models, chu = chunking.
documents = {
    # --- vector db focus (level: easy) ---
    'vdb1': "Vector databases are a critical part of most LLM projects",
    'vdb2': "Pinecone is a leading vendor of vector db services",
    'vdb3': "ChromaDB is an opensource vector database (self-hosted) alternative to hosted services",
    'vdb4': "A Document store is not a vector db, but offer a quick and easy alternative during development",

    # --- embedding model focus (level: moderate) ---
    # emb1: service (implicit embedding/vectors)
    'emb1': "OpenAI is a  company leading in offering advanced llm services ",
    # emb2: embedding model (implicit service)
    'emb2': "There are three embedding models available from OpenAI: 'text-embedding-3-small', 'text-embedding-3-large', 'text-embedding-ada-002'.",
    # emb3: embedding model
    'emb3': "OpenAI's ada-002 was the previous de-facto standard, but it has been replaced with a more robust embedding model: text-embedding-3 (in 2 sizes)",
    # emb4: embedding model (implicit service)
    'emb4': "text-embedding-3-small is likely the most appealing embedding model offered, with a balance of speed and accuracy.",

    # --- chunking (level: advanced) ---
    # NOTE(review): "segementing" in chu1 looks like a typo for "segmenting";
    # left untouched because the text is embedded as-is -- confirm whether it is intentional.
    'chu1': "Chunking, or segementing, text is important in quality results from ML operations",
    'chu2': "Splitting text into appropriate units is critical for LLMs",
    # chu3 is about soup rather than text processing -- presumably a deliberate distractor.
    'chu3': "Campbell's chunky soup is delicious, but not so healthy",
    'chu4': "Separating text into chunks is a nuanced skill",
}

# Test queries: question number -> {'q': query text, 'ids': expected document ids}.
# The query cell compares 'ids' positionally against the returned hits, so the
# order of the expected ids matters.
# NOTE(review): 'cat' is only present on question 1 and is not read anywhere in
# the visible notebook.
questions = {
    # vector database
    1: {'q': "vector database", 'cat': 'vdb', 'ids': ['vdb1', 'vdb2']},
    2: {'q': 'vector db', 'ids': ['vdb1', 'vdb2']},

    # embedding models
    3: {'q': 'embedding', 'ids': ['emb4', 'emb2']},
    4: {'q': 'embedding models', 'ids': ['emb2', 'emb3']},

    # chunking
    5: {'q': 'chunking', 'ids': ['chu1', 'chu2']},
    6: {'q': 'segmenting', 'ids': ['chu1', 'chu2']},

    # Disabled: its integer ids do not match the string ids used as `documents` keys.
    # 7:{'q':'How long do animals live?', 'ids':[7,5]},
}

In [3]:
# Create a Chroma client and make sure we start from a fresh, empty collection:
# if a collection with our name already exists it is deleted and recreated.
client = chromadb.Client()

collections = client.list_collections()
print(f"Collections: {collections}")

col_name = 'test-collection'
# Reuse the list fetched above instead of asking the client a second time.
# Depending on the chromadb release, list_collections() yields Collection
# objects or plain name strings (see the Chroma migration notes), so accept both.
existing_names = [getattr(c, "name", c) for c in collections]
if col_name in existing_names:
    # get_collection raises if the collection is not found; it is fetched here
    # only so it can be shown before it is deleted.
    collection = client.get_collection(name=col_name)
    print(f"[green]Found collection: {collection}[/], delete it so we can recreate fresh...")
    client.delete_collection(name=col_name)
else:
    print(f"Collection does not already exist:{col_name}")

print(f"[blue](re)Create collection: {col_name}, now...[/]")
collection = client.create_collection(name=col_name)

In [4]:
# Sanity check: show the contents of the freshly created collection.
# Only the last expression of a cell is rendered, so this is the single statement.
collection.get()

{'ids': [],
 'embeddings': None,
 'metadatas': [],
 'documents': [],
 'uris': None,
 'data': None}

In [5]:
# Store every document in the vector database with one batched add() call.
# ids and texts are both taken from `documents` in the same (insertion) order,
# so position k of `ids` always pairs with position k of `documents`.
# No embeddings are passed, so the collection's embedding function computes them.
collection.add(
    ids=list(documents),
    documents=list(documents.values()),
)

In [6]:
# Verify the load: report how many documents are stored and which ids they have.
existing_count = collection.count()
print(f"existing doc count: {existing_count}")
# include=[] requests no optional fields; get() returns a dict, and the ids are
# under its "ids" key -- extract them so the label below matches what is printed.
ids = collection.get(include=[])["ids"]
print(f"collection ids: {ids}")

In [7]:
# note: if tweaking table output, you likely want to re-run this cell along with the next, otherwise `rich` just appends to the existing table
from rich.table import Table
from rich.console import Console

# Console that renders the table; the query cell adds the rows and prints it.
console = Console()

# Results table: one column per heading, with a rule drawn between rows.
table = Table(title="Simple Semantic Search Results", show_lines=True)
for heading in ("#", "Status", "Content", "Score"):
    table.add_column(heading)  # add no_wrap=True here to disable wrapping

# rich style strings used when filling in the rows.
style_good, style_failed, style_info = 'green', 'red bold', 'bright_black'

In [8]:
# Run every test question against the collection and record the hits in `table`:
# one header row per question, then one row per returned document.
for q_number, question in questions.items():
    prompt = question['q']
    expected_ids = question['ids']
    table.add_row(f"{q_number})", "QUESTION", prompt, style='navy_blue on grey84 bold')

    # use default collection embedding function for the prompt and retrieve the most relevant docs
    results = collection.query(
        query_texts=[prompt],
        n_results=3,
    )

    # A single query text was sent, so index 0 holds the hits for this prompt.
    hits = zip(results["ids"][0], results["documents"][0], results["distances"][0])
    for rank, (doc_id, answer, distance) in enumerate(hits):
        # The "Score" column shows the raw distance returned by the query.
        score = f"{distance:.2f}"

        if rank >= len(expected_ids):
            # More hits than expectations: show the extra hit without a verdict.
            table.add_row('', '', f"[{style_info}]{answer}[/]", score)
            continue

        # Expectations are positional: the hit at this rank must carry this id.
        expected = expected_ids[rank]
        if doc_id == expected:
            table.add_row(
                '',
                f"[{style_good}]got({doc_id}) \nexpected({expected})[/]",
                f"[{style_good}]{answer}[/]",
                f"[{style_good}]{score}",
            )
        else:
            # On a miss, also show the text of the document that was expected.
            table.add_row(
                '',
                f"[{style_failed}]got({doc_id})[/] \n[{style_info}]expected({expected})[/]",
                f"[{style_failed}]{answer}[/] \n[{style_info}]{documents[expected]}[/]",
                f"[{style_failed}]{score}",
            )

console.print(table)