In [1]:
import chromadb
from chromadb.utils import embedding_functions

# Step 1: Initialize ChromaDB client
# PersistentClient stores the database on disk under `path`, so the data
# is still there the next time this script runs.
client = chromadb.PersistentClient(path="./chroma_db")
# For a temporary in-memory database instead: client = chromadb.EphemeralClient()

# Step 2: Create or get a collection
# Collections are like tables in traditional databases.
# get_or_create_collection reuses the collection when it already exists in
# the on-disk database instead of failing on a second run.
# We use the default embedding function (all-MiniLM-L6-v2).
default_ef = embedding_functions.DefaultEmbeddingFunction()

collection = client.get_or_create_collection(
    name="my_knowledge_base",
    embedding_function=default_ef,
    metadata={"description": "A collection for AI knowledge"},
)

# Step 3: Add documents to the collection
# ChromaDB automatically converts text to vectors using the embedding function
documents = [
    "Artificial Intelligence is the simulation of human intelligence by machines.",
    "Machine Learning is a subset of AI that learns from data.",
    "Deep Learning uses neural networks with multiple layers.",
    "Natural Language Processing helps computers understand human language.",
    "Computer Vision enables machines to interpret visual information.",
    "Reinforcement Learning trains agents through rewards and penalties.",
]

# Metadata helps filter and organize documents (one dict per document, same order)
metadatas = [
    {"category": "AI", "difficulty": "beginner"},
    {"category": "ML", "difficulty": "beginner"},
    {"category": "DL", "difficulty": "intermediate"},
    {"category": "NLP", "difficulty": "intermediate"},
    {"category": "CV", "difficulty": "intermediate"},
    {"category": "RL", "difficulty": "advanced"},
]

# IDs must be unique for each document
ids = [f"doc_{i}" for i in range(len(documents))]

# Write all documents at once.
# The client is persistent, so on every run after the first these IDs are
# already stored. upsert() inserts missing IDs and overwrites existing ones,
# which keeps the script re-runnable and resets doc_0 to its original text
# even though Step 6 modified it during the previous run.
collection.upsert(
    documents=documents,
    metadatas=metadatas,
    ids=ids,
)

print(f"Added {len(documents)} documents to the collection")
print(f"Total documents in collection: {collection.count()}\n")

# Step 4: Query the collection
# The query text is embedded as well and compared against the stored vectors.
query_text = "How do computers learn from experience?"

results = collection.query(
    query_texts=[query_text],
    n_results=3,  # keep only the three closest documents
    include=["documents", "metadatas", "distances"],
)

print(f"Query: '{query_text}'\n")
print("Top 3 Results:")
print("-" * 80)

# Every result field holds one list per query text; only one query was sent,
# so the hits for it sit at index 0.
hit_texts = results["documents"][0]
hit_metas = results["metadatas"][0]
hit_distances = results["distances"][0]

for rank, (text, meta, dist) in enumerate(
    zip(hit_texts, hit_metas, hit_distances), start=1
):
    print(f"\n{rank}. Document: {text}")
    print(f"   Category: {meta['category']}, Difficulty: {meta['difficulty']}")
    print(f"   Distance (lower = more similar): {dist:.4f}")

# Step 5: Filter queries with metadata
print("\n" + "=" * 80)
print("Filtered Query (only beginner-level content):")
print("-" * 80)

# `where` restricts the search to documents whose metadata matches the filter.
filtered_results = collection.query(
    query_texts=["What is AI?"],
    n_results=2,
    where={"difficulty": "beginner"},
    include=["documents", "metadatas"],
)

# A single query text was sent, so its hits are the first (only) entry.
beginner_texts = filtered_results["documents"][0]
beginner_metas = filtered_results["metadatas"][0]

for rank, (text, meta) in enumerate(zip(beginner_texts, beginner_metas), start=1):
    print(f"\n{rank}. {text}")
    print(f"   Category: {meta['category']}")

# Step 6: Update documents
# Send a revised text for doc_0 together with metadata that flags the change.
revised_text = (
    "Artificial Intelligence (AI) enables machines to mimic human cognition "
    "and decision-making."
)
revised_metadata = {"category": "AI", "difficulty": "beginner", "updated": True}

collection.update(
    ids=["doc_0"],
    documents=[revised_text],
    metadatas=[revised_metadata],
)

print("\n" + "=" * 80)
print("Document updated successfully!")

# Step 7: Delete documents
# Deletion is left disabled so the later steps still see every document.
# collection.delete(ids=["doc_0"])  # Uncomment to delete

# Step 8: Peek at collection contents
print("\n" + "=" * 80)
print("Collection peek (first 3 items):")
print("-" * 80)
peek = collection.peek(limit=3)
for i, doc in enumerate(peek['documents']):
    # Show at most 80 characters, and add the ellipsis only when text was
    # actually cut off, so short documents are not shown as truncated.
    preview = doc if len(doc) <= 80 else doc[:80] + "..."
    print(f"{i+1}. {preview}")

C:\Users\OHB\.cache\chroma\onnx_models\all-MiniLM-L6-v2\onnx.tar.gz: 100%|██████████| 79.3M/79.3M [00:08<00:00, 9.88MiB/s]


Added 6 documents to the collection
Total documents in collection: 6

Query: 'How do computers learn from experience?'

Top 3 Results:
--------------------------------------------------------------------------------

1. Document: Natural Language Processing helps computers understand human language.
   Category: NLP, Difficulty: intermediate
   Distance (lower = more similar): 1.0672

2. Document: Machine Learning is a subset of AI that learns from data.
   Category: ML, Difficulty: beginner
   Distance (lower = more similar): 1.1060

3. Document: Computer Vision enables machines to interpret visual information.
   Category: CV, Difficulty: intermediate
   Distance (lower = more similar): 1.1631

Filtered Query (only beginner-level content):
--------------------------------------------------------------------------------

1. Artificial Intelligence is the simulation of human intelligence by machines.
   Category: AI

2. Machine Learning is a subset of AI that learns from data.
   Categ