## Create the ChromaDB database

In [1]:
# 📦 Setup
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.docstore.document import Document
import os
import json
# Load your OpenAI API Key
from dotenv import load_dotenv
load_dotenv("../src/utils/.env")
openai_api_key = os.getenv("OPENAI_API_KEY")
# Fail fast with a clear message when the key was not loaded from the .env file
# or the process environment.
if not openai_api_key:
    raise EnvironmentError("Missing OPENAI_API_KEY in environment variables")

# Load and parse embedding model config
embedding_json = os.getenv("EMBEDDING_MODELS_JSON")
if not embedding_json:
    raise EnvironmentError("Missing EMBEDDING_MODELS_JSON in environment variables")

embedding_models = json.loads(embedding_json)
# Prefer the "small" entry; fall back to DEFAULT_EMBEDDING_MODEL when it is absent.
EMBEDDING_MODEL = embedding_models.get("small", os.getenv("DEFAULT_EMBEDDING_MODEL"))
if not EMBEDDING_MODEL:
    # Without this check a missing configuration would hand `None` to the client.
    raise EnvironmentError(
        'No embedding model configured: EMBEDDING_MODELS_JSON has no "small" entry '
        "and DEFAULT_EMBEDDING_MODEL is not set"
    )


# 🔐 Embedding function (the model name comes from the environment config above)
embedding = OpenAIEmbeddings(openai_api_key=openai_api_key, model=EMBEDDING_MODEL)

# 🗂️ ChromaDB setup
# Define the location once and reuse it, so the created directory and the
# directory handed to Chroma cannot drift apart.
persist_directory = "../src/db"
collection_name = "knowledge_base"
os.makedirs(persist_directory, exist_ok=True)

db = Chroma(
    collection_name=collection_name,
    embedding_function=embedding,
    persist_directory=persist_directory
)



  embedding = OpenAIEmbeddings(openai_api_key=openai_api_key, model=EMBEDDING_MODEL)
  db = Chroma(


## Adding Documents (example)

In [None]:
# ✅ Sample documents, described as (text, source file, topic) triples
sample_specs = [
    ("The moon is Earth's only natural satellite.", "test_doc1.pdf", "astronomy"),
    ("The mitochondria is the powerhouse of the cell.", "test_doc2.pdf", "biology"),
]

# Wrap every triple in a Document carrying the same metadata keys
docs = [
    Document(page_content=text, metadata={"source": source, "topic": topic})
    for text, source, topic in sample_specs
]

# Store them in the Chroma collection and flush to disk
db.add_documents(docs)
db.persist()
print("✅ Documents added.")


## Reading (Search & Filter)

In [25]:
# 🔍 Unfiltered semantic search: up to 10 nearest documents for the query
query = "Which image talks about a soldier?"
results = db.similarity_search(query, k=10)

# NOTE(review): hits print the raw stored text and metadata; clear the cell
# output before sharing if the indexed documents contain personal data.
for rank, hit in enumerate(results, start=1):
    print(f"\nResult {rank}")
    print("Content:", hit.page_content)
    print("Metadata:", hit.metadata)

# 📋 Same query, restricted by metadata to entries from test_doc2.pdf
source_filter = {"source": "test_doc2.pdf"}
results_filtered = db.similarity_search(query, k=5, filter=source_filter)

for hit in results_filtered:
    print(hit.page_content, "|", hit.metadata)




Result 1
Content: FIRMADO POR FECHA FIRMA
SECRETARIO GENERAL 29-07-2025 09:34:44
C/ Madrid, 126-128 GETAFE - Madrid - 28903. Tfno.: 916249500 Fax.: 916249500 - https://sede.uc3m.es
Página: 1 / 9
CERTIFICACIÓN ACADÉMICA OFICIAL
ACADEMIC RECORD
Universidad Carlos III de Madrid
Don JOSE VIDA FERNANDEZ, Secretario General de la Universidad Carlos III de Madrid,
CERTIFICA los siguientes datos académicos y personales que constan en las bases de datos y archivos de la
universidad
Mr. JOSE VIDA FERNANDEZ, General Secretary of the Universidad Carlos III de Madrid,
HEREBY CERTIFIES the academic and personal information contained in the University's databases and
archives
Nombre del Estudiante
First Name of Student
IAGO SENEN Apellidos
Surname
FERNANDEZ GARCIA DNI/Pasaporte
N.I.D./Passport
Number
54125842B
Plan de Estudios
Study Program
Grado en Ingeniería Aeroespacial
Bachelor in Aerospace Engineering
Facultad/Escuela
Faculty/School
Escuela Politécnica Superior (Leganés)
School of Engineering. 

## Update (overwriting by deleting and re-adding)

In [None]:
# ⚠️ An "update" is done here as delete + re-add: entries that share the
# new document's `source` are removed first, then the new content is added.
# Without the delete step the old sentence would stay in the collection next to
# the new one, i.e. the "update" would really be a duplicate.
# Let's "update" the mitochondria sentence

updated_doc = Document(
    page_content="Mitochondria are responsible for energy production in cells.",
    metadata={"source": "test_doc2.pdf", "topic": "biology"}
)

# Step 1: Delete the original entries (every entry with the same source is replaced)
stale_entries = db.get(where={"source": updated_doc.metadata["source"]})
stale_ids = stale_entries["ids"]
if stale_ids:
    db.delete(ids=stale_ids)

# Step 2: Re-add with new content
db.add_documents([updated_doc])
db.persist()
print("✅ Document 'updated' by re-adding.")


## Delete by metadata filter (source, topic)

In [None]:
# Step 1: Ask the store for only the entries whose metadata matches the filter.
# Filtering in the store avoids loading the whole collection into memory and
# avoids calling `.get` on entries whose metadata is missing.
matching_docs = db.get(where={"source": "test_doc2.pdf"})
ids_to_delete = list(matching_docs["ids"])

# Step 2: Delete those IDs
if ids_to_delete:
    db.delete(ids=ids_to_delete)
    print(f"🗑️ Deleted {len(ids_to_delete)} documents from test_doc2.pdf")
else:
    print("⚠️ No documents matched the deletion criteria.")


## Delete the whole database (and all sample data)

In [3]:

# Optional: Delete entire index (dangerous!)
# This permanently removes the Chroma persistence directory AND every entry
# inside the sample data folder.
# NOTE(review): `db` was created against `persist_directory`; re-run the setup
# cell before using `db` again after this cell has run.
import shutil
import glob

# Only remove the index when it is actually there, so re-running the cell
# after a previous deletion does not raise FileNotFoundError.
if os.path.isdir(persist_directory):
    shutil.rmtree(persist_directory)
    print("💥 Entire Chroma index deleted.")
else:
    print(f"⚠️ No Chroma index found at {persist_directory}; nothing to delete.")

sample_data_folder = "../sample_data"
files = glob.glob(f"{sample_data_folder}/*")
for f in files:
    # Decide by entry type up front: os.remove() on a directory is not
    # guaranteed to raise IsADirectoryError on every platform (macOS and
    # Windows report PermissionError), so catching that one exception is not
    # portable. Symlinks are unlinked rather than followed.
    if os.path.isdir(f) and not os.path.islink(f):
        shutil.rmtree(f)
    else:
        os.remove(f)
print(f"🗑️ Deleted all files in {sample_data_folder}")

💥 Entire Chroma index deleted.
🗑️ Deleted all files in ../sample_data


## Count indexed documents

In [None]:
# Number of entries currently stored in the underlying Chroma collection
indexed_total = db._collection.count()
print("📦 Total indexed docs:", indexed_total)


## Query whole database

In [None]:
# Fetch every entry stored in the Chroma collection
all_docs = db.get()

# Walk IDs and contents side by side, numbering the entries from 1
id_content_pairs = zip(all_docs["ids"], all_docs["documents"])
for position, (doc_id, content) in enumerate(id_content_pairs, start=1):
    print(f"\n📄 Document {position}:")
    print(f"ID: {doc_id}")
    print(f"Content: {content}")


In [21]:
# Get all documents from the Chroma collection
all_docs = db.get()

# Print document IDs, content, and selected metadata fields
for i, content in enumerate(all_docs["documents"]):
    # An entry stored without metadata may come back as None; treat it as
    # empty so every field falls back to 'N/A' instead of raising AttributeError.
    metadata = all_docs['metadatas'][i] or {}
    print(f"\n📄 Document {i+1}:")
    print(f"ID: {all_docs['ids'][i]}")
    for field in ("source", "page_number", "image_file"):
        print(f"{field}: {metadata.get(field, 'N/A')}")
    print(f"Content: {content}")



📄 Document 1:
ID: 812dc0c1-8f10-4410-942a-b038ea9095b5
source: short_text.pdf
page_number: N/A
image_file: N/A
Content: Senén es un perro amarillo con diarrea

📄 Document 2:
ID: f39eb1d2-60cb-48df-83bc-af6d43e24e7f
source: Certificado_UC3M.pdf
page_number: N/A
image_file: N/A
Content: FIRMADO POR FECHA FIRMA
SECRETARIO GENERAL 29-07-2025 09:34:44
C/ Madrid, 126-128 GETAFE - Madrid - 28903. Tfno.: 916249500 Fax.: 916249500 - https://sede.uc3m.es
Página: 1 / 9
CERTIFICACIÓN ACADÉMICA OFICIAL
ACADEMIC RECORD
Universidad Carlos III de Madrid
Don JOSE VIDA FERNANDEZ, Secretario General de la Universidad Carlos III de Madrid,
CERTIFICA los siguientes datos académicos y personales que constan en las bases de datos y archivos de la
universidad
Mr. JOSE VIDA FERNANDEZ, General Secretary of the Universidad Carlos III de Madrid,
HEREBY CERTIFIES the academic and personal information contained in the University's databases and
archives
Nombre del Estudiante
First Name of Student
IAGO SENEN Apelli