In [None]:
!pip install chromadb

In [None]:
import chromadb

# Default client; a later cell describes this as the ephemeral (non-persisted) kind
client = chromadb.Client()
# get_or_create_collection keeps this cell re-runnable: create_collection raises
# when a collection named "fruit" already exists in the client
collection = client.get_or_create_collection("fruit")

In [None]:
# Populate the collection: each document gets an id and a metadata record.
# ChromaDB uses "Sentence Transformers" to create its own embeddings.
collection.add(
    ids=[str(n) for n in range(1, 8)],
    documents=[
        "apple", "banana", "pineapple", "mango",
        "dragonfruit", "passionfruit", "raspberry",
    ],
    metadatas=[
        {"color": color, "weight": weight}
        for color, weight in [
            ("red", 180),
            ("yellow", 120),
            ("brown", 900),
            ("yellow", 200),
            ("pink", 600),
            ("purple", 18),
            ("red", 4),
        ]
    ],
)

In [None]:
# Browsing the collection
# get() is called without ids or filters, and the whole result is printed
all_data = collection.get()
print(all_data)

In [None]:
# Pretty-print the same result so the nested structure is easier to read
from pprint import pprint
pprint(all_data)

In [None]:
# NOTE - `embeddings` shows up as None in the output above. get() leaves embeddings
# out of its result unless they are requested explicitly via `include=[...]`
# (demonstrated further down), so None here does not by itself tell us whether
# embeddings were computed.
# NOTE(review): the original note said that older versions of Chroma used the default
# Sentence Transformers embedding but that, as of 2025, it is skipped and must be
# specified explicitly - confirm against the docs of the installed Chroma version.

# Clearing the collection
client.delete_collection("fruit")

In [None]:
# Using OpenAI embedding function
!pip install openai
!pip install python-dotenv

In [None]:
import openai
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction
from dotenv import load_dotenv

In [None]:
# Load variables from a .env file into the environment
# (presumably this is where the OpenAI API key lives - verify your setup)
load_dotenv()

# Alternative model: "text-embedding-3-large"
embedding_func = OpenAIEmbeddingFunction(model_name="text-embedding-3-small")

In [None]:
# Recreate the "fruit" collection, this time with the OpenAI embedding function
collection = client.create_collection(
    name="fruit",
    embedding_function=embedding_func,
)

In [None]:
# Add the same seven fruits (ids, documents and metadata) to the new collection
collection.add(
    ids=[str(n) for n in range(1, 8)],
    documents=[
        "apple", "banana", "pineapple", "mango",
        "dragonfruit", "passionfruit", "raspberry",
    ],
    metadatas=[
        {"color": color, "weight": weight}
        for color, weight in [
            ("red", 180),
            ("yellow", 120),
            ("brown", 900),
            ("yellow", 200),
            ("pink", 600),
            ("purple", 18),
            ("red", 4),
        ]
    ],
)

In [None]:
# Fetch everything from the recreated collection and pretty-print it
all_data = collection.get()
pprint(all_data)

In [None]:
# Gotcha moment!
# Embeddings are not returned by default, to save bandwidth.
# Ask for them explicitly through `include` to see them:
all_data = collection.get(include=["embeddings", "documents", "metadatas"])
pprint(all_data)

In [None]:
# Now that the data is stored with embeddings, let's query it and do a semantic search!
# Semantic Search
tropical_fruits = collection.query(query_texts=["Tropical"])
pprint(tropical_fruits)

In [None]:
# Print only the required data: the "documents" entry at index 0
# (only one query text was passed, so presumably index 0 holds its matches)
print(tropical_fruits["documents"][0])

In [None]:
# Metadata filtering: restrict the semantic search to records whose metadata
# has color "yellow" AND a weight greater than 150
yellow_over_150 = {
    "$and": [
        {"color": "yellow"},
        {"weight": {"$gt": 150}},
    ]
}
tropical_yellow = collection.query(query_texts=["Tropical"], where=yellow_over_150)

In [None]:
# Documents returned by the metadata-filtered query (entry at index 0)
print(tropical_yellow["documents"][0])

In [None]:
# Full Text Search
# get() with a `where_document` filter using "$contains": "fruit"; no query text is involved
contains_fruit = collection.get(
    where_document={"$contains": "fruit"}
)
print(contains_fruit["documents"])

In [None]:
# Types of Storage
# Ephemeral Client - what we have used so far; data is NOT persisted
# Persistent Client - data is saved to disk
# Cloud Client - data is saved to the cloud

In [None]:
# We will recreate the Fruits database with a persistent client
# "db" is a relative path, so it is resolved against the current working directory
persistent_directory = "db"
persistent_client = chromadb.PersistentClient(path=persistent_directory)

In [None]:
# Start from a clean slate: drop a "fruit" collection left over from a previous run.
# On a brand-new "db" directory the collection does not exist yet and an
# unconditional delete raises, so only delete when it is actually present.
# list_collections() yields Collection objects or plain names depending on the
# Chroma version; getattr(..., "name", ...) copes with both.
existing_names = {
    getattr(entry, "name", entry) for entry in persistent_client.list_collections()
}
if "fruit" in existing_names:
    persistent_client.delete_collection("fruit")

In [None]:
# Create the "fruit" collection in the persistent client, embedding with OpenAI
persistent_collection = persistent_client.create_collection(
    name="fruit",
    embedding_function=embedding_func,
)

In [None]:
# Add the same seven fruits (ids, documents and metadata) to the persistent collection
persistent_collection.add(
    ids=[str(n) for n in range(1, 8)],
    documents=[
        "apple", "banana", "pineapple", "mango",
        "dragonfruit", "passionfruit", "raspberry",
    ],
    metadatas=[
        {"color": color, "weight": weight}
        for color, weight in [
            ("red", 180),
            ("yellow", 120),
            ("brown", 900),
            ("yellow", 200),
            ("pink", 600),
            ("purple", 18),
            ("red", 4),
        ]
    ],
)

In [None]:
# List the current working directory; the "db" directory used by the
# persistent client should show up here
!ls -al

In [None]:
# Show the directory tree from the current working directory
# (relies on the external `tree` command being installed)
!tree