# Ingest `knowledge_bases/` into **pgvector** vector stores (LlamaStack)

This notebook:
- Discovers an embedding model from LlamaStack
- Creates (or reuses) **one pgvector-backed vector store per subfolder** in `knowledge_bases/`
- Uses **vector store name = subfolder name**
- Ingests `.txt` as raw text
- Ingests `.pdf` as **Markdown via Docling** (if `docling` is installed)

## How to use
- Edit the variables under **User inputs** in the configuration cell, i.e. the cell right after the optional installs (LlamaStack URL + knowledge bases path)
- Run cells top-to-bottom


In [None]:
# Optional installs (uncomment if running outside the container image)
# %pip install -U llama-stack-client docling


In [None]:
import uuid
from pathlib import Path

from llama_stack_client import LlamaStackClient, RAGDocument

# ============================
# User inputs (edit these)
# ============================
# Base URL of the LlamaStack server the client connects to.
LLAMA_STACK_URL = "http://llamastack-with-config-service.llama-stack.svc.cluster.local:8321"
# Root folder whose direct subfolders are treated as knowledge bases.
# A relative path is resolved against the current working directory.
KNOWLEDGE_BASES_DIR = Path("../agent-service/config/knowledge_bases").resolve()

# ============================
# Ingestion settings
# ============================
VECTOR_STORE_PROVIDER_ID = "pgvector"  # force pgvector
# When True, an existing vector store with the same name is reused instead of
# creating a new one.
REUSE_EXISTING_VECTOR_STORES = True

# Per-file-type switches for the ingestion loop.
INGEST_TXT = True
INGEST_PDF = True

# Chunk sizes (in tokens) passed to the insert call for each file type.
CHUNK_SIZE_TOKENS_TXT = 300
CHUNK_SIZE_TOKENS_PDF = 512

print("LLAMA_STACK_URL:", LLAMA_STACK_URL)
print("KNOWLEDGE_BASES_DIR:", str(KNOWLEDGE_BASES_DIR))

# Require a directory, not merely an existing path: a regular file would pass
# an `.exists()` check and only fail later when its children are listed.
assert KNOWLEDGE_BASES_DIR.is_dir(), (
    f"Knowledge bases dir not found (or not a directory): {KNOWLEDGE_BASES_DIR}"
)

In [None]:
client = LlamaStackClient(base_url=LLAMA_STACK_URL)

# Discover embedding model + dimension (mirrors rag-validation notebook)
models = client.models.list()

# Support both model field names depending on client version.
# The first model reporting type "embedding" wins.
embedding_model = next(
    (
        m
        for m in models
        if getattr(m, "model_type", None) == "embedding"
        or getattr(m, "api_model_type", None) == "embedding"
    ),
    None,
)
assert embedding_model is not None, "No embedding model registered in LlamaStack"

embedding_model_id = embedding_model.identifier

# `metadata` may be missing *or* explicitly None; normalise both to an empty
# dict so the lookup below ends in the descriptive assertion instead of an
# AttributeError on `None.get`.
embedding_metadata = getattr(embedding_model, "metadata", None) or {}
embedding_dimension = int(embedding_metadata.get("embedding_dimension", 0) or 0)
assert embedding_dimension > 0, f"Invalid embedding dimension: {embedding_dimension}"

print("Embedding model:", embedding_model_id)
print("Embedding dimension:", embedding_dimension)


In [None]:
# Index existing vector stores by name, keeping only the most recently
# created store per name (consulted when reusing existing stores).
stores = client.vector_stores.list()
# Some client versions wrap the result in an object exposing `.data`;
# otherwise the returned value itself is iterated.
store_list = getattr(stores, "data", stores)

name_to_latest_store = {}
for vs in store_list:
    name = getattr(vs, "name", None)
    if name:
        # A missing or None timestamp counts as 0.
        created_at = getattr(vs, "created_at", 0) or 0
        prev = name_to_latest_store.get(name)
        is_newer = prev is None or created_at > prev[0]
        if is_newer:
            # Value layout: (created_at, vector_store_id)
            name_to_latest_store[name] = (created_at, vs.id)

print(f"Found {len(name_to_latest_store)} existing vector stores")


In [None]:
# Ingestion helpers

from typing import Iterable


def iter_kb_subfolders(root: Path) -> list[Path]:
    """Return the direct child directories of *root* in sorted order.

    Each direct subfolder is treated as one knowledge base; regular files
    directly under *root* are ignored.
    """
    subfolders = list(filter(Path.is_dir, root.iterdir()))
    subfolders.sort()
    return subfolders


def iter_files(root: Path) -> Iterable[Path]:
    """Yield every ``.txt`` / ``.pdf`` file found anywhere beneath *root*.

    The search is recursive and the suffix match is case-insensitive.
    Directories are never yielded, even if their name ends in a wanted suffix.
    """
    wanted_suffixes = {".txt", ".pdf"}
    for candidate in root.rglob("*"):
        if not candidate.is_file():
            continue
        if candidate.suffix.lower() in wanted_suffixes:
            yield candidate


def get_or_create_vector_store_id(kb_name: str) -> str:
    """Return the ID of the vector store named *kb_name*, creating it if needed.

    The vector store name equals the KB subfolder name. When
    ``REUSE_EXISTING_VECTOR_STORES`` is true and ``name_to_latest_store``
    already has an entry for *kb_name*, that store's ID is returned.
    Otherwise a new store is created with the configured provider and the
    discovered embedding model/dimension.

    Args:
        kb_name: Knowledge-base name, used verbatim as the vector store name.

    Returns:
        The vector store ID as a string.
    """
    if REUSE_EXISTING_VECTOR_STORES and kb_name in name_to_latest_store:
        vs_id = name_to_latest_store[kb_name][1]
        print(f"Reusing vector store for '{kb_name}': {vs_id}")
        return str(vs_id)

    vs = client.vector_stores.create(
        name=kb_name,
        extra_body={
            "provider_id": VECTOR_STORE_PROVIDER_ID,
            "embedding_model": embedding_model_id,
            "embedding_dimension": embedding_dimension,
        },
    )
    # Record the new store so that a later call for the same KB (e.g. when the
    # ingestion cell is re-run without refreshing the store listing) reuses it
    # instead of creating a second store with the same name.
    name_to_latest_store[kb_name] = (getattr(vs, "created_at", 0) or 0, vs.id)
    print(f"Created vector store for '{kb_name}': {vs.id}")
    return str(vs.id)


def ingest_document(vector_db_id: str, doc: RAGDocument, chunk_size: int) -> None:
    """Insert a single document into a vector store through the RAG tool.

    Args:
        vector_db_id: ID of the target vector store.
        doc: The document to insert.
        chunk_size: Value forwarded as ``chunk_size_in_tokens``.
    """
    rag_tool = client.tool_runtime.rag_tool
    rag_tool.insert(
        vector_db_id=vector_db_id,
        documents=[doc],
        chunk_size_in_tokens=chunk_size,
    )


def ingest_txt(vector_db_id: str, kb_root: Path, path: Path) -> None:
    """Read a text file and ingest its contents as a ``text/plain`` document.

    Args:
        vector_db_id: ID of the target vector store.
        kb_root: Root folder of the knowledge base that contains *path*.
        path: The ``.txt`` file to ingest; must be located under *kb_root*.
    """
    # Undecodable bytes are dropped rather than failing the whole file.
    content = path.read_text(encoding="utf-8", errors="ignore")
    relative_path = path.relative_to(kb_root).as_posix()
    kb_name = kb_root.name

    # A short random suffix makes the ID differ on every ingestion.
    unique_suffix = uuid.uuid4().hex[:8]
    metadata = {
        "source": str(path),
        "kb_name": kb_name,
        "relative_path": relative_path,
        "file_type": "txt",
    }
    document = RAGDocument(
        document_id=f"txt::{kb_name}/{relative_path}::{unique_suffix}",
        content=content,
        mime_type="text/plain",
        metadata=metadata,
    )
    ingest_document(vector_db_id, document, CHUNK_SIZE_TOKENS_TXT)


# Shared Docling converter, created on first use and reused for later PDFs.
_docling_converter = None


def _get_docling_converter():
    """Return the shared Docling converter, or None if docling is not installed."""
    global _docling_converter
    if _docling_converter is None:
        try:
            from docling.document_converter import DocumentConverter
        except ImportError:
            return None
        # Building the converter is the heavy step, so do it once per
        # notebook run rather than once per PDF.
        _docling_converter = DocumentConverter()
    return _docling_converter


def ingest_pdf(vector_db_id: str, kb_root: Path, path: Path) -> bool:
    """Convert a PDF to Markdown with Docling and ingest it.

    Args:
        vector_db_id: ID of the target vector store.
        kb_root: Root folder of the knowledge base that contains *path*.
        path: The ``.pdf`` file to ingest; must be located under *kb_root*.

    Returns:
        True if the document was ingested, False if it was skipped because
        ``docling`` is not installed. Conversion and insertion errors
        propagate to the caller.
    """
    converter = _get_docling_converter()
    if converter is None:
        print(f"Skipping PDF (docling not installed): {path}")
        return False

    rel = path.relative_to(kb_root)

    result = converter.convert(str(path))
    markdown = result.document.export_to_markdown()

    doc = RAGDocument(
        # A short random suffix makes the ID differ on every ingestion.
        document_id=f"pdf-md::{kb_root.name}/{rel.as_posix()}::{uuid.uuid4().hex[:8]}",
        content=markdown,
        mime_type="text/markdown",
        metadata={
            "source": str(path),
            "kb_name": kb_root.name,
            "relative_path": rel.as_posix(),
            "file_type": "pdf",
            # Falls back to 0 when the document has no usable `pages` attribute.
            "page_count": len(getattr(result.document, "pages", []) or []),
        },
    )
    ingest_document(vector_db_id, doc, CHUNK_SIZE_TOKENS_PDF)
    return True


In [None]:
# Run ingestion (one vector store per KB subfolder)

from collections import Counter

# Running tallies for the final summary, plus one
# (kb_name, path, exception type, message) tuple per failed file.
counts = Counter()
errors = []

kb_folders = iter_kb_subfolders(KNOWLEDGE_BASES_DIR)
print(f"Found {len(kb_folders)} KB folders")

for kb_root in kb_folders:
    kb_name = kb_root.name
    print(f"\n=== KB: {kb_name} ===")

    vector_db_id = get_or_create_vector_store_id(kb_name)
    counts["vector_stores_used"] += 1

    for path in iter_files(kb_root):
        # A failure on one file is recorded and the loop moves on to the next.
        try:
            suffix = path.suffix.lower()
            if INGEST_TXT and suffix == ".txt":
                ingest_txt(vector_db_id, kb_root, path)
                counts["txt_ingested"] += 1
            elif INGEST_PDF and suffix == ".pdf":
                ok = ingest_pdf(vector_db_id, kb_root, path)
                if ok:
                    counts["pdf_ingested"] += 1
                else:
                    counts["pdf_skipped"] += 1
        except Exception as e:
            errors.append((kb_name, str(path), type(e).__name__, str(e)))
            counts["errors"] += 1

print("\nDone.")
print("Counts:", dict(counts))

if errors:
    # Only the first ten failures are printed to keep the output short.
    print("\nErrors (first 10):")
    for item in errors[:10]:
        print("-", item)
