# Ingest `knowledge_bases/` into **pgvector** vector stores (LlamaStack)

This notebook:
- Discovers an embedding model from LlamaStack
- Creates (or reuses) **one pgvector-backed vector store per subfolder** in `knowledge_bases/`
- Uses **vector store name = subfolder name**
- Ingests `.txt` as raw text
- Ingests `.pdf` via **PyPDF (pypdf) text extraction** (offline friendly)

## How to use
- Edit the variables in the configuration cell (the first cell after the optional install cell): LlamaStack URL + knowledge bases path
- Run cells top-to-bottom


In [1]:
# Optional installs: only needed when running outside the container image
# (comment the line below out otherwise).
# NOTE(review): pypdf is not version-pinned here, unlike llama-stack-client.
%pip install -U llama-stack-client==0.3.0 pypdf



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m26.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import uuid
from pathlib import Path

from llama_stack_client import LlamaStackClient, RAGDocument

# ============================
# User inputs (edit these)
# ============================
# Base URL passed to LlamaStackClient.
LLAMA_STACK_URL = "http://llamastack-with-config-service.llama-stack.svc.cluster.local:8321"
# Root folder; resolved to an absolute path relative to the current working directory.
KNOWLEDGE_BASES_DIR = Path("knowledge_bases").resolve()

# ============================
# Ingestion settings
# ============================
VECTOR_STORE_PROVIDER_ID = "pgvector"  # force pgvector
# When True, a store whose name matches a KB subfolder is reused instead of created.
REUSE_EXISTING_VECTOR_STORES = True

# Per-file-type switches for the ingestion loop.
INGEST_TXT = True
INGEST_PDF = True

# Chunk sizes (in tokens) passed to the RAG insert call for each file type.
CHUNK_SIZE_TOKENS_TXT = 300
CHUNK_SIZE_TOKENS_PDF = 512

print("LLAMA_STACK_URL:", LLAMA_STACK_URL)
print("KNOWLEDGE_BASES_DIR:", str(KNOWLEDGE_BASES_DIR))

# Must be a directory (not merely an existing path): the ingestion code lists its entries.
assert KNOWLEDGE_BASES_DIR.is_dir(), f"Knowledge bases dir not found: {KNOWLEDGE_BASES_DIR}"

LLAMA_STACK_URL: http://llamastack-with-config-service.llama-stack.svc.cluster.local:8321
KNOWLEDGE_BASES_DIR: /opt/app-root/src/it-self-service-agent-sukanta/knowledge_bases


In [3]:
client = LlamaStackClient(base_url=LLAMA_STACK_URL)

# Discover embedding model + dimension (mirrors rag-validation notebook)
models = client.models.list()
print(models)
# Support both model field names depending on client version
# NOTE(review): this takes the first embedding model in the order the server lists them;
# it does not look at any "default" marker in the model metadata — confirm that is intended.
embedding_model = next(
    (
        m
        for m in models
        if getattr(m, "model_type", None) == "embedding"
        or getattr(m, "api_model_type", None) == "embedding"
    ),
    None,
)
assert embedding_model is not None, "No embedding model registered in LlamaStack"

embedding_model_id = embedding_model.identifier
# `metadata` may be absent or explicitly None; treat both as "no metadata" so the
# assertion below reports the problem instead of an AttributeError on `.get`.
embedding_metadata = getattr(embedding_model, "metadata", None) or {}
# The dimension can arrive as a float (e.g. 768.0), hence the int() conversion.
embedding_dimension = int(embedding_metadata.get("embedding_dimension", 0) or 0)
assert embedding_dimension > 0, f"Invalid embedding dimension: {embedding_dimension}"

print("Embedding model:", embedding_model_id)
print("Embedding dimension:", embedding_dimension)


INFO:httpx:HTTP Request: GET http://llamastack-with-config-service.llama-stack.svc.cluster.local:8321/v1/models "HTTP/1.1 200 OK"


[Model(identifier='llama-17b/llama-4-scout-17b-16e-w4a16', metadata={}, api_model_type='llm', provider_id='llama-17b', type='model', provider_resource_id='llama-4-scout-17b-16e-w4a16', model_type='llm'), Model(identifier='granite-embedding-125m', metadata={'embedding_dimension': 768.0}, api_model_type='embedding', provider_id='sentence-transformers', type='model', provider_resource_id='ibm-granite/granite-embedding-125m-english', model_type='embedding'), Model(identifier='sentence-transformers/nomic-ai/nomic-embed-text-v1.5', metadata={'embedding_dimension': 768.0, 'default_configured': True}, api_model_type='embedding', provider_id='sentence-transformers', type='model', provider_resource_id='nomic-ai/nomic-embed-text-v1.5', model_type='embedding')]
Embedding model: granite-embedding-125m
Embedding dimension: 768


In [4]:
# Print the object returned by the vector store listing call (quick look before ingesting).
print(client.vector_stores.list())

INFO:httpx:HTTP Request: GET http://llamastack-with-config-service.llama-stack.svc.cluster.local:8321/v1/vector_stores "HTTP/1.1 200 OK"


SyncOpenAICursorPage[VectorStore](data=[], has_more=False, last_id=None, object='list', first_id=None)


In [5]:
# Build a name -> latest vector_store_id map (used when reusing existing stores)
stores = client.vector_stores.list()
# A paginated response only carries the current page in `.data`; iterating the page
# object itself walks every page, so stores beyond the first page are not missed
# (missing them would cause duplicate stores to be created later).
if hasattr(stores, "iter_pages"):
    store_list = stores
else:
    store_list = getattr(stores, "data", stores)  # some client versions return `.data`

# name -> (created_at, id) of the most recently created store with that name
name_to_latest_store = {}
for vs in store_list:
    name = getattr(vs, "name", None)
    if not name:
        continue
    created_at = getattr(vs, "created_at", 0) or 0
    prev = name_to_latest_store.get(name)
    # Keep the newest store when several share a name.
    if prev is None or created_at > prev[0]:
        name_to_latest_store[name] = (created_at, vs.id)

print(f"Found {len(name_to_latest_store)} existing vector stores")


INFO:httpx:HTTP Request: GET http://llamastack-with-config-service.llama-stack.svc.cluster.local:8321/v1/vector_stores "HTTP/1.1 200 OK"


Found 0 existing vector stores


In [6]:
# Ingestion helpers

from typing import Iterable


def iter_kb_subfolders(root: Path) -> list[Path]:
    """Return the visible direct subdirectories of ``root`` in sorted order.

    Every direct subfolder counts as one knowledge base. Entries that are not
    directories, and directories whose name starts with "." (for example
    `.ipynb_checkpoints`), are left out.
    """
    return sorted(
        entry
        for entry in root.iterdir()
        if entry.is_dir() and not entry.name.startswith(".")
    )


def iter_files(root: Path) -> Iterable[Path]:
    """Yield ingestible files (`.txt` / `.pdf`, case-insensitive) found anywhere under ``root``.

    Files that live inside a hidden directory below ``root`` (any directory whose
    name starts with ".", e.g. `.ipynb_checkpoints`) are skipped, so checkpoint or
    cache copies are not ingested as duplicates. Paths are yielded in sorted order
    so repeated runs visit files in a stable sequence.
    """
    for p in sorted(root.rglob("*")):
        # Directory components between `root` and the file itself.
        parent_parts = p.relative_to(root).parts[:-1]
        if any(part.startswith(".") for part in parent_parts):
            continue
        if p.is_file() and p.suffix.lower() in {".txt", ".pdf"}:
            yield p


def get_or_create_vector_store_id(kb_name: str) -> str:
    """Return the id of the vector store named ``kb_name``, creating it if needed.

    Vector store name == KB subfolder name. When ``REUSE_EXISTING_VECTOR_STORES``
    is true and ``name_to_latest_store`` already has an entry for the name, that
    store's id is returned. Otherwise a new store is created with the configured
    provider, embedding model and embedding dimension, and the new store is
    recorded in ``name_to_latest_store``.

    Args:
        kb_name: Name of the knowledge base (used verbatim as the store name).

    Returns:
        The vector store id as a string.
    """
    if REUSE_EXISTING_VECTOR_STORES and kb_name in name_to_latest_store:
        vs_id = name_to_latest_store[kb_name][1]
        print(f"Reusing vector store for '{kb_name}': {vs_id}")
        return str(vs_id)

    vs = client.vector_stores.create(
        name=kb_name,
        extra_body={
            "provider_id": VECTOR_STORE_PROVIDER_ID,
            "embedding_model": embedding_model_id,
            "embedding_dimension": embedding_dimension,
        },
    )
    # Remember the new store so a later call for the same name (e.g. re-running the
    # ingestion loop without rebuilding the map) reuses it instead of creating a duplicate.
    name_to_latest_store[kb_name] = (getattr(vs, "created_at", 0) or 0, vs.id)
    print(f"Created vector store for '{kb_name}': {vs.id}")
    return str(vs.id)


def ingest_document(vector_db_id: str, doc: RAGDocument, chunk_size: int) -> None:
    """Insert a single document into the given vector store via the RAG tool.

    Args:
        vector_db_id: Id of the target vector store.
        doc: The document to insert (sent as a one-element list).
        chunk_size: Chunk size in tokens forwarded to the insert call.
    """
    rag_tool = client.tool_runtime.rag_tool
    rag_tool.insert(
        vector_db_id=vector_db_id,
        documents=[doc],
        chunk_size_in_tokens=chunk_size,
    )


def ingest_txt(vector_db_id: str, kb_root: Path, path: Path) -> None:
    """Read a text file and ingest it as one plain-text document.

    The file is decoded as UTF-8 with undecodable bytes dropped. The document id
    combines the KB name, the path relative to ``kb_root`` and a random 8-hex-char
    suffix; metadata records the source path, KB name, relative path and file type.

    Args:
        vector_db_id: Id of the target vector store.
        kb_root: Root folder of the knowledge base the file belongs to.
        path: The `.txt` file to ingest (must be located under ``kb_root``).
    """
    content = path.read_text(encoding="utf-8", errors="ignore")
    relative_path = path.relative_to(kb_root).as_posix()
    kb_name = kb_root.name
    # Random suffix: every ingestion of the same file gets a distinct document id.
    unique_suffix = uuid.uuid4().hex[:8]

    metadata = {
        "source": str(path),
        "kb_name": kb_name,
        "relative_path": relative_path,
        "file_type": "txt",
    }
    document = RAGDocument(
        document_id=f"txt::{kb_name}/{relative_path}::{unique_suffix}",
        content=content,
        mime_type="text/plain",
        metadata=metadata,
    )
    ingest_document(vector_db_id, document, CHUNK_SIZE_TOKENS_TXT)


def ingest_pdf(vector_db_id: str, kb_root: Path, path: Path) -> bool:
    """Ingest PDF using offline-friendly text extraction via pypdf.

    Text is extracted page by page; non-empty pages are concatenated with a
    "--- Page N ---" separator and ingested as a single plain-text document.

    Args:
        vector_db_id: Id of the target vector store.
        kb_root: Root folder of the knowledge base the file belongs to.
        path: The `.pdf` file to ingest (must be located under ``kb_root``).

    Returns:
        True when the document was ingested; False when the file was skipped
        (pypdf not installed, extraction failed, or no extractable text).

    Raises:
        Any exception raised while building or inserting the document. Only the
        pypdf extraction step is best-effort, so insertion failures are no longer
        reported as "pypdf failed" and silently counted as skips.
    """
    try:
        from pypdf import PdfReader
    except ImportError:
        print(f"Skipping PDF (pypdf not installed): {path}")
        return False

    rel = path.relative_to(kb_root)

    # Extraction is best-effort: a broken or unreadable PDF is skipped, not fatal.
    try:
        reader = PdfReader(str(path))
        page_count = len(reader.pages)
        pages_text = []
        for i, page in enumerate(reader.pages):
            t = page.extract_text() or ""
            if t.strip():
                pages_text.append(f"\n\n--- Page {i+1} ---\n\n{t}")
    except Exception as e:
        print(f"Skipping PDF (pypdf failed): {path}\n  Error: {type(e).__name__}: {e}")
        return False

    text = "".join(pages_text).strip()
    if not text:
        print(f"Skipping PDF (no extractable text): {path}")
        return False

    doc = RAGDocument(
        document_id=f"pdf-txt::{kb_root.name}/{rel.as_posix()}::{uuid.uuid4().hex[:8]}",
        content=text,
        mime_type="text/plain",
        metadata={
            "source": str(path),
            "kb_name": kb_root.name,
            "relative_path": rel.as_posix(),
            "file_type": "pdf",
            "extraction": "pypdf",
            "page_count": page_count,
        },
    )
    # Deliberately outside the try block: insertion errors propagate to the caller.
    ingest_document(vector_db_id, doc, CHUNK_SIZE_TOKENS_PDF)
    return True


In [7]:
# Run ingestion (one vector store per KB subfolder)

from collections import Counter

# Outcome tallies, plus one (kb_name, path, exception type, message) tuple per failure.
counts = Counter()
errors = []

kb_folders = iter_kb_subfolders(KNOWLEDGE_BASES_DIR)
print(f"Found {len(kb_folders)} KB folders")

for kb_root in kb_folders:
    kb_name = kb_root.name
    print(f"\n=== KB: {kb_name} ===")

    # One vector store per KB subfolder.
    vector_db_id = get_or_create_vector_store_id(kb_name)
    counts["vector_stores_used"] += 1

    for path in iter_files(kb_root):
        try:
            suffix = path.suffix.lower()
            if suffix == ".txt" and INGEST_TXT:
                ingest_txt(vector_db_id, kb_root, path)
                counts["txt_ingested"] += 1
            elif suffix == ".pdf" and INGEST_PDF:
                pdf_ok = ingest_pdf(vector_db_id, kb_root, path)
                if pdf_ok:
                    counts["pdf_ingested"] += 1
                else:
                    counts["pdf_skipped"] += 1
        except Exception as exc:
            # A failing file is recorded and the loop moves on to the next one.
            counts["errors"] += 1
            errors.append((kb_name, str(path), type(exc).__name__, str(exc)))

print("\nDone.")
print("Counts:", dict(counts))

if errors:
    print("\nErrors (first 10):")
    for failure in errors[:10]:
        print("-", failure)


INFO:httpx:HTTP Request: POST http://llamastack-with-config-service.llama-stack.svc.cluster.local:8321/v1/vector_stores "HTTP/1.1 200 OK"


Found 1 KB folders

=== KB: type-a-app-migration ===
Created vector store for 'type-a-app-migration': vs_2d5a6617-574e-43dd-801b-0e3d25f65e89


INFO:httpx:HTTP Request: POST http://llamastack-with-config-service.llama-stack.svc.cluster.local:8321/v1/tool-runtime/rag-tool/insert "HTTP/1.1 200 OK"



Done.
Counts: {'vector_stores_used': 1, 'pdf_ingested': 1}
