# Day 2 — Evals & Grading (RAGAS)

**Audience:** senior cloud engineers.

**Goal:** build a real evaluation harness for a RAG system.

- **System under test (SUT):** *local* RAG pipeline execution
- **Judge:** **local chat model (Ollama)** (OpenAI-compatible), used by RAGAS to grade answers

## What you’ll do
- Build a small, grounded **eval set** (cached)
- Run the local RAG SUT to produce `response` + `retrieved_contexts`
- Compute core RAGAS metrics:
  - `faithfulness`
  - `answer_relevancy`
- Turn metrics into a **grade** (pass/fail gate)
- Run a small knob sweep and observe quality/latency tradeoffs

## Key env vars
- `NIM_BASE_URL` (default `http://localhost:8000`) — NIM gateway (embeddings/rerank)
- `NIM_CHAT_BASE_URL` (default `http://localhost:11434`) — Ollama (judge/chat)
- `NIM_JUDGE_MODEL` (defaults to `NIM_GEN_MODEL`, then `llama3.1:8b`)
- `NIM_API_KEY` (optional)

**Prereq:**
- NIM gateway running for embeddings (`./scripts/start_nims.sh`)
- Ollama running for the judge (`./scripts/start_ollama.sh`)



In [None]:
# Setup + preflight

import os
import sys
import json
import time
import math
import random
import hashlib
from pathlib import Path
from typing import Any

import numpy as np
import pandas as pd
import plotly.express as px
import requests

from datasets import Dataset

from nim_clients import NIMClient, NIMConfig
from rag_pipeline import (
    Chunk,
    Timer,
    VectorIndex,
    chunk_text,
    make_prompt,
    clean_answer,
)

px.defaults.template = "plotly_white"

# ---- Env ----
# Service endpoints; trailing slashes are stripped so request paths can be appended directly.
NIM_BASE_URL = os.environ.get("NIM_BASE_URL", "http://localhost:8000").rstrip("/")
NIM_CHAT_BASE_URL = os.environ.get("NIM_CHAT_BASE_URL", "http://localhost:11434").rstrip("/")

# Judge model: first non-empty of NIM_JUDGE_MODEL, NIM_GEN_MODEL, then the built-in default.
_judge_candidates = (
    os.environ.get("NIM_JUDGE_MODEL"),
    os.environ.get("NIM_GEN_MODEL"),
    "llama3.1:8b",
)
NIM_JUDGE_MODEL = next(name for name in _judge_candidates if name)

# Local SUT models (keep small-ish for workshop speed)
LOCAL_EMBED_MODEL = os.environ.get("LOCAL_EMBED_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
LOCAL_GEN_MODEL = os.environ.get("LOCAL_GEN_MODEL", "llama3.1:8b")

# Seed both the stdlib and NumPy generators with the same value.
RANDOM_SEED = int(os.environ.get("RANDOM_SEED", "42"))
for _seed_fn in (random.seed, np.random.seed):
    _seed_fn(RANDOM_SEED)

# Absolute cache directory, created on first use.
CACHE_DIR = Path(os.environ.get("EVAL_CACHE_DIR", ".module_eval_cache")).resolve()
CACHE_DIR.mkdir(parents=True, exist_ok=True)

for _label, _value in (
    ("sys.executable", sys.executable),
    ("NIM_BASE_URL", NIM_BASE_URL),
    ("NIM_JUDGE_MODEL", NIM_JUDGE_MODEL),
    ("LOCAL_EMBED_MODEL", LOCAL_EMBED_MODEL),
    ("LOCAL_GEN_MODEL", LOCAL_GEN_MODEL),
    ("CACHE_DIR", CACHE_DIR),
):
    print(f"{_label}:", _value)


def nim_ok() -> bool:
    """Probe the judge chat endpoint with a minimal completion request.

    Posts a tiny prompt for ``NIM_JUDGE_MODEL`` to
    ``NIM_CHAT_BASE_URL + "/v1/chat/completions"`` and prints a short
    diagnostic either way.

    Returns:
        True when the endpoint replied with a status below 400 and a JSON
        body; False on an HTTP error status or on any exception.
    """
    url = NIM_CHAT_BASE_URL + "/v1/chat/completions"
    body = {
        "model": NIM_JUDGE_MODEL,
        "messages": [{"role": "user", "content": "Reply with a single word: ok"}],
        "max_tokens": 4,
        "temperature": 0.0,
    }
    try:
        resp = requests.post(url, json=body, timeout=10)
        if resp.status_code >= 400:
            print("NIM judge call failed:", resp.status_code)
            print((resp.text or "").strip()[:400])
            return False

        # Walk choices[0].message.content defensively; any missing level yields "".
        data = resp.json()
        first_choice = (data.get("choices") or [{}])[0]
        message = first_choice.get("message") or {}
        content = (message.get("content") or "").strip()
        print("NIM judge sample response:", repr(content[:80]))
        return True
    except Exception as e:
        print("NIM judge not reachable:", type(e).__name__, str(e)[:200])
        return False


_ = nim_ok()



In [None]:
# Corpus loader (prefer newest run)


def find_fico_dir() -> Path:
    """Locate the directory that holds ``nim_clients.py`` and ``rag_pipeline.py``.

    Walks from the current working directory up through its parents. At each
    level the directory itself is checked first, then its ``fico/`` child.

    Returns:
        The first directory containing both files.

    Raises:
        RuntimeError: if no level of the walk contains both files.
    """
    required = ("nim_clients.py", "rag_pipeline.py")
    start = Path.cwd().resolve()
    for level in (start, *start.parents):
        for base in (level, level / "fico"):
            if all((base / name).exists() for name in required):
                return base
    raise RuntimeError("Could not locate fico/ directory")


# Resolve the project directory once; the corpus loader below builds its paths from it.
FICO_DIR = find_fico_dir()
print("FICO_DIR:", FICO_DIR)


def newest_corpus_csv() -> Path:
    """Pick the corpus CSV to evaluate against.

    Preference order:
      1. the most recently modified ``corpus_runs/*/fico_corpus_embedded.csv``
         under ``FICO_DIR``;
      2. ``fico_corpus_embedded_smoke.csv`` in ``FICO_DIR``;
      3. ``fico_corpus_embedded.csv`` in ``FICO_DIR``.

    Raises:
        FileNotFoundError: if none of the candidates exist.
    """
    runs_dir = FICO_DIR / "corpus_runs"
    if runs_dir.exists():
        oldest_first = sorted(
            runs_dir.glob("*/fico_corpus_embedded.csv"),
            key=lambda p: p.stat().st_mtime,
        )
        if oldest_first:
            return oldest_first[-1]

    fallbacks = (
        FICO_DIR / name
        for name in ("fico_corpus_embedded_smoke.csv", "fico_corpus_embedded.csv")
    )
    found = next((p for p in fallbacks if p.exists()), None)
    if found is None:
        raise FileNotFoundError("No corpus CSV found.")
    return found


# Load the selected corpus and show a quick preview.
CORPUS_CSV = newest_corpus_csv()
print("CORPUS_CSV:", CORPUS_CSV)

df_docs = pd.read_csv(CORPUS_CSV)
print("docs:", len(df_docs))
print("columns:", list(df_docs.columns))
display(df_docs.head(3))



In [None]:
# Build a local retrieval index (cached)

# Chunking knobs for the system under test; both can be overridden via environment.
SUT_CHUNK_SIZE = int(os.environ.get("SUT_CHUNK_SIZE", "900"))
SUT_OVERLAP = int(os.environ.get("SUT_CHUNK_OVERLAP", "150"))


def _slug(s: str) -> str:
    """Return a filename-safe form of *s*.

    Every non-alphanumeric character becomes ``_`` and the result is capped
    at 80 characters. ``None`` or an empty string yields ``""``.
    """
    safe_chars = [ch if ch.isalnum() else "_" for ch in (s or "")]
    return "".join(safe_chars[:80])


def build_chunk_rows(chunk_size: int, overlap: int) -> list[dict[str, Any]]:
    """Split every document in ``df_docs`` into chunk records.

    The body is taken from ``body_redacted`` when it has a value, otherwise
    from ``body``, otherwise it is empty.

    Args:
        chunk_size: chunk size forwarded to ``chunk_text``.
        overlap: overlap forwarded to ``chunk_text``.

    Returns:
        One dict per chunk with keys ``doc_id``, ``chunk_id``
        (``"<doc_id>::cNNN"``), ``title`` (``None`` when missing) and ``text``.
    """

    def _is_missing(value: Any) -> bool:
        # pandas stores empty CSV cells as NaN, and NaN is truthy. A plain
        # `a or b` fallback would therefore keep NaN and stringify it to "nan".
        return value is None or bool(pd.isna(value))

    rows: list[dict[str, Any]] = []
    for _, row in df_docs.iterrows():
        doc_id = str(row.get("doc_id"))

        title = row.get("title")
        title_text = None if _is_missing(title) else str(title)

        body: Any = ""
        for col in ("body_redacted", "body"):
            candidate = row.get(col)
            if not _is_missing(candidate) and candidate:
                body = candidate
                break

        parts = chunk_text(str(body), chunk_size=int(chunk_size), overlap=int(overlap))
        rows.extend(
            {
                "doc_id": doc_id,
                "chunk_id": f"{doc_id}::c{j:03d}",
                "title": title_text,
                "text": part,
            }
            for j, part in enumerate(parts)
        )
    return rows


def embed_local(texts: list[str]) -> np.ndarray:
    """Embed *texts* with the SentenceTransformer named by ``LOCAL_EMBED_MODEL``.

    The model is loaded inside the call. Encoding runs in batches of 64 with a
    progress bar and ``normalize_embeddings=True``.

    Returns:
        The encoder output converted to a ``float32`` NumPy array.
    """
    from sentence_transformers import SentenceTransformer

    encode_options = {
        "batch_size": 64,
        "show_progress_bar": True,
        "convert_to_numpy": True,
        "normalize_embeddings": True,
    }
    encoder = SentenceTransformer(LOCAL_EMBED_MODEL)
    vectors = encoder.encode(texts, **encode_options)
    return np.asarray(vectors, dtype=np.float32)


def get_index(*, chunk_size: int, overlap: int) -> tuple[list[dict[str, Any]], VectorIndex]:
    """Build, or load from ``CACHE_DIR``, the chunk rows and their vector index.

    The cache key covers the corpus file (path + mtime), the embedding model
    and the chunking parameters. Chunk rows are stored as JSONL and the
    embeddings as ``.npy``.

    Args:
        chunk_size: chunk size passed to ``build_chunk_rows``.
        overlap: chunk overlap passed to ``build_chunk_rows``.

    Returns:
        ``(rows, index)`` where ``rows[i]`` describes the chunk whose embedding
        is row ``i`` of the matrix given to ``VectorIndex``.
    """
    # Include the corpus identity in the key. Without it, a newer corpus run
    # silently reuses chunks and vectors built from an older CSV.
    key_parts = [
        str(CORPUS_CSV),
        str(int(CORPUS_CSV.stat().st_mtime)),
        LOCAL_EMBED_MODEL,
        f"cs={chunk_size}",
        f"ov={overlap}",
    ]
    h = hashlib.sha256("|".join(key_parts).encode("utf-8")).hexdigest()[:12]
    p_rows = CACHE_DIR / f"sut_chunks_{_slug(LOCAL_EMBED_MODEL)}_{h}.jsonl"
    p_emb = CACHE_DIR / f"sut_emb_{_slug(LOCAL_EMBED_MODEL)}_{h}.npy"

    chunk_rows: list[dict[str, Any]] | None = None
    E = None

    if p_rows.exists() and p_emb.exists():
        with open(p_rows, "r", encoding="utf-8") as f:
            cached_rows = [json.loads(line) for line in f if line.strip()]
        cached_E = np.load(p_emb)
        # Rows and vectors are written separately; only trust the pair when
        # they describe the same number of chunks.
        if cached_E.ndim >= 1 and cached_E.shape[0] == len(cached_rows):
            chunk_rows, E = cached_rows, cached_E
            print("Loaded cached index:", p_rows.name, p_emb.name)
        else:
            print("Cached index is inconsistent; rebuilding:", p_rows.name, p_emb.name)

    if chunk_rows is None:
        chunk_rows = build_chunk_rows(chunk_size=int(chunk_size), overlap=int(overlap))
        E = embed_local([r["text"] for r in chunk_rows])
        with open(p_rows, "w", encoding="utf-8") as f:
            for r in chunk_rows:
                f.write(json.dumps(r, ensure_ascii=False) + "\n")
        np.save(p_emb, E)
        print("Built index and cached:", p_rows.name, p_emb.name)

    index = VectorIndex(E.astype(np.float32), use_faiss=True)
    return chunk_rows, index


rows, index = get_index(chunk_size=SUT_CHUNK_SIZE, overlap=SUT_OVERLAP)
print("chunks:", len(rows), "| embed_model:", LOCAL_EMBED_MODEL)



In [None]:
# Build a tiny eval set (grounded questions, cached)

# Number of eval questions to generate (overridable via environment).
N_EVAL_QUESTIONS = int(os.environ.get("N_EVAL_QUESTIONS", "12"))


def _hash_for_evalset(n_questions: int) -> str:
    """Return a 12-hex-character cache key for the eval question set.

    The key is a SHA-256 over the corpus path, its modification time
    (whole seconds), ``RANDOM_SEED`` and *n_questions*, in that order.
    """
    fields = (
        str(CORPUS_CSV),
        str(int(CORPUS_CSV.stat().st_mtime)),
        str(RANDOM_SEED),
        str(int(n_questions)),
    )
    # Hashing the concatenation is equivalent to feeding the fields one by one.
    digest = hashlib.sha256("".join(fields).encode("utf-8"))
    return digest.hexdigest()[:12]


def _extract_json_obj(s: str) -> dict[str, Any] | None:
    s = (s or "").strip()
    if not s:
        return None
    try:
        o = json.loads(s)
        return o if isinstance(o, dict) else None
    except Exception:
        pass
    start = s.find("{")
    end = s.rfind("}")
    if start != -1 and end != -1 and end > start:
        try:
            o = json.loads(s[start : end + 1])
            return o if isinstance(o, dict) else None
        except Exception:
            return None
    return None


def make_eval_questions(*, n_questions: int) -> pd.DataFrame:
    """Create (or load from cache) one grounded question per sampled document.

    Documents are sampled from ``df_docs`` with ``RANDOM_SEED``. For each one
    the chat model is asked for a single question answerable from the first
    1800 characters of the body. If generation or parsing fails, a generic
    title-based question is used instead.

    Args:
        n_questions: number of documents to sample (capped at ``len(df_docs)``).

    Returns:
        DataFrame with columns ``user_input`` (the question) and ``doc_id``.
    """
    out_path = CACHE_DIR / f"eval_questions_{_hash_for_evalset(n_questions)}.json"
    if out_path.exists():
        return pd.DataFrame(json.loads(out_path.read_text(encoding="utf-8")))

    def _cell_text(row: Any, *cols: str) -> str:
        # Empty CSV cells arrive as NaN, which is truthy; `a or b` would keep
        # it and produce the literal text "nan".
        for col in cols:
            value = row.get(col)
            if value is not None and not pd.isna(value) and value:
                return str(value)
        return ""

    sample = df_docs.sample(n=min(int(n_questions), len(df_docs)), random_state=RANDOM_SEED)

    # Route chat to the chat endpoint (Ollama), matching gen_sut; without
    # chat_base_url the request would go to the embeddings/rerank gateway.
    cfg = NIMConfig(
        base_url=NIM_BASE_URL,
        chat_base_url=NIM_CHAT_BASE_URL,
        timeout_s=60.0,
        gen_model=NIM_JUDGE_MODEL,
    )
    nim = NIMClient(cfg)

    records: list[dict[str, Any]] = []
    n_fallback = 0
    for _, r in sample.iterrows():
        ctx = _cell_text(r, "body_redacted", "body").strip()[:1800]
        title = _cell_text(r, "title")
        prompt = (
            "Create ONE question that can be answered ONLY from the provided context. "
            "Return ONLY JSON with key 'question'.\n\n"
            f"TITLE: {title}\n\nCONTEXT:\n{ctx}\n\nJSON:"
        )
        try:
            txt, _ = nim.chat(prompt, max_tokens=96, temperature=0.0)
            obj = _extract_json_obj(txt) or {}
            q = str(obj.get("question") or "").strip()
        except Exception as e:
            # Best-effort by design, but report it so a misconfigured endpoint
            # is visible instead of silently degrading the eval set.
            print("Question generation failed:", type(e).__name__, str(e)[:200])
            q = ""

        q = (q.splitlines()[0] if q else "").strip()
        if not q:
            n_fallback += 1
            q = f"What does the document titled '{title}' describe?"
        if not q.endswith("?"):
            q = q + "?"

        records.append({"user_input": q, "doc_id": str(r.get("doc_id"))})

    if records and n_fallback == len(records):
        # Do not persist an eval set made only of generic fallback questions;
        # the next run should retry generation.
        print("All questions fell back to the generic template; not caching this eval set.")
    else:
        out_path.write_text(json.dumps(records, ensure_ascii=False, indent=2), encoding="utf-8")
    return pd.DataFrame(records)


df_q = make_eval_questions(n_questions=N_EVAL_QUESTIONS)
print("questions:", len(df_q))
display(df_q.head(8))



## Run the SUT (local RAG) to produce evaluation records

For each question we store:
- `user_input`
- `response`
- `retrieved_contexts` (list of strings)

And debug fields you’ll care about in production:
- per-stage timings (query embedding / index search / optional rerank / generation)
- retrieval knobs (`top_k`, `max_context_chars`)

**Escape hatch:** if local generation is too heavy, set `SUT_GEN_MODE=nim`.



In [None]:
# Local RAG SUT implementation + record generation

# Generation backend for the SUT: "local" (Transformers) or "nim" (chat endpoint).
SUT_GEN_MODE = os.environ.get("SUT_GEN_MODE", "local").lower()  # local | nim
SUT_TOP_K = int(os.environ.get("SUT_TOP_K", "10"))
SUT_MAX_CONTEXT_CHARS = int(os.environ.get("SUT_MAX_CONTEXT_CHARS", "6000"))

# If LOCAL_GEN_MODEL is an Ollama id like "llama3.1:8b", Transformers can't load it.
# In that case, automatically switch the SUT generator to the NIM/Ollama path.
_looks_like_ollama_id = ":" in str(LOCAL_GEN_MODEL)
if _looks_like_ollama_id and SUT_GEN_MODE == "local":
    print(f"LOCAL_GEN_MODEL={LOCAL_GEN_MODEL!r} looks like an Ollama model id; switching SUT_GEN_MODE -> 'nim'")
    SUT_GEN_MODE = "nim"

# Optional: rerank only (ordering) using NIM reranker
SUT_USE_NIM_RERANK = os.environ.get("SUT_USE_NIM_RERANK", "0") == "1"
SUT_RERANK_K = int(os.environ.get("SUT_RERANK_K", "5"))

for _label, _value in (
    ("SUT_GEN_MODE", SUT_GEN_MODE),
    ("SUT_TOP_K", SUT_TOP_K),
    ("SUT_MAX_CONTEXT_CHARS", SUT_MAX_CONTEXT_CHARS),
    ("SUT_USE_NIM_RERANK", SUT_USE_NIM_RERANK),
    ("SUT_RERANK_K", SUT_RERANK_K),
):
    print(f"{_label}:", _value)


# Loaded SentenceTransformer instances keyed by model name, so the model is
# loaded once per process instead of once per query.
_LOCAL_EMBEDDERS: dict[str, Any] = {}


def embed_query_local(text: str) -> np.ndarray:
    """Embed a single query string with the ``LOCAL_EMBED_MODEL`` encoder.

    The encoder is created on first use and reused afterwards. Reloading it
    per call is slow and would be counted in the ``embed_query_s`` timing.

    Returns:
        The query embedding as a 1-D ``float32`` array, encoded with
        ``normalize_embeddings=True``.
    """
    model = _LOCAL_EMBEDDERS.get(LOCAL_EMBED_MODEL)
    if model is None:
        from sentence_transformers import SentenceTransformer

        model = SentenceTransformer(LOCAL_EMBED_MODEL)
        _LOCAL_EMBEDDERS[LOCAL_EMBED_MODEL] = model

    v = model.encode([text], convert_to_numpy=True, normalize_embeddings=True)
    return np.asarray(v[0], dtype=np.float32)


# Process-wide cache for the local tokenizer / causal LM pair.
_LOCAL_GEN = dict.fromkeys(("tok", "model"))


def ensure_local_generator():
    """Return the cached ``(tokenizer, model)`` pair, loading it on first use.

    Loads ``LOCAL_GEN_MODEL`` through Transformers with left padding, using
    the EOS token as pad token when none is defined. Weights use bfloat16
    when CUDA is available and float32 otherwise. The model is put in eval
    mode and both objects are stored in ``_LOCAL_GEN``.
    """
    cached = (_LOCAL_GEN["tok"], _LOCAL_GEN["model"])
    if all(part is not None for part in cached):
        return cached

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained(LOCAL_GEN_MODEL, use_fast=True)
    tokenizer.padding_side = "left"
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    if torch.cuda.is_available():
        weights_dtype = torch.bfloat16
    else:
        weights_dtype = torch.float32

    lm = AutoModelForCausalLM.from_pretrained(
        LOCAL_GEN_MODEL,
        device_map="auto",
        dtype=weights_dtype,
    )
    lm.eval()

    _LOCAL_GEN.update(tok=tokenizer, model=lm)
    return tokenizer, lm


def gen_sut(prompt: str, *, max_new_tokens: int = 180, temperature: float = 0.2) -> tuple[str, float]:
    """Generate the SUT answer for *prompt* and report how long it took.

    When ``SUT_GEN_MODE == "nim"`` the prompt goes through ``NIMClient.chat``
    against ``NIM_CHAT_BASE_URL``; otherwise the local Transformers model is
    used. Sampling is enabled only when *temperature* exceeds ``1e-6``.

    Args:
        prompt: the full prompt text.
        max_new_tokens: generation length cap.
        temperature: sampling temperature.

    Returns:
        ``(answer, seconds)`` where ``answer`` has been passed through
        ``clean_answer``. In "nim" mode ``seconds`` is the second value
        returned by ``NIMClient.chat``; locally it is measured here.
    """
    if SUT_GEN_MODE == "nim":
        # Use Ollama for generation (via NIMClient's chat path) and NIM gateway for everything else.
        remote_model = os.environ.get("NIM_GEN_MODEL") or str(LOCAL_GEN_MODEL)
        client = NIMClient(
            NIMConfig(
                base_url=NIM_BASE_URL,
                chat_base_url=NIM_CHAT_BASE_URL,
                gen_model=remote_model,
                timeout_s=120.0,
            )
        )
        raw, elapsed = client.chat(prompt, max_tokens=max_new_tokens, temperature=temperature)
        return clean_answer(raw), elapsed

    import torch

    started = time.perf_counter()
    tokenizer, lm = ensure_local_generator()
    encoded = tokenizer([prompt], return_tensors="pt", padding=True, truncation=True).to(lm.device)

    temp = float(temperature)
    sampling = temp > 1e-6
    generate_options = {
        "do_sample": sampling,
        "temperature": temp if sampling else None,
        "top_p": 0.95,
        "max_new_tokens": int(max_new_tokens),
        "pad_token_id": tokenizer.pad_token_id,
        "eos_token_id": tokenizer.eos_token_id,
    }
    with torch.no_grad():
        output = lm.generate(**encoded, **generate_options)

    # Keep only the tokens produced after the prompt.
    prompt_len = encoded["input_ids"].shape[1]
    new_ids = output[0, prompt_len:]
    answer = tokenizer.decode(new_ids, skip_special_tokens=True).strip()
    return clean_answer(answer), (time.perf_counter() - started)


def retrieve_contexts(question: str, *, top_k: int, max_context_chars: int) -> tuple[list[Chunk], dict[str, float]]:
    """Retrieve, optionally rerank, and pack context chunks for *question*.

    Steps:
      1. embed the query locally and search the vector index for ``top_k`` hits;
      2. when ``SUT_USE_NIM_RERANK`` is set, move the reranker's top
         ``SUT_RERANK_K`` hits to the front (best-effort: on failure the
         vector-search order is kept);
      3. pack chunks in order until ``max_context_chars`` characters are used,
         truncating the last chunk to fit.

    Args:
        question: the user question.
        top_k: number of hits requested from the index.
        max_context_chars: total character budget across packed chunks.

    Returns:
        ``(packed_chunks, timings)``; ``timings`` holds ``embed_query_s``,
        ``index_search_s`` and, when reranking succeeded, ``rerank_s``.
    """
    t = Timer()

    qv = t.time("embed_query_s", lambda: embed_query_local(question))
    idxs, _sims = t.time("index_search_s", lambda: index.search(qv, k=int(top_k)))

    chunks: list[Chunk] = []
    for raw_i in idxs:
        i = int(raw_i)
        # Skip ids outside the row table. Some vector backends (FAISS, for
        # one) pad with -1 when fewer than k hits exist, and a negative id
        # would otherwise silently pick a row from the end of `rows`.
        if not 0 <= i < len(rows):
            continue
        row = rows[i]
        chunks.append(
            Chunk(
                doc_id=str(row["doc_id"]),
                chunk_id=str(row["chunk_id"]),
                title=row.get("title"),
                text=str(row["text"]),
            )
        )

    if SUT_USE_NIM_RERANK and chunks:
        try:
            cfg = NIMConfig(base_url=NIM_BASE_URL, timeout_s=60.0)
            nim = NIMClient(cfg)
            doc_texts = [c.text for c in chunks]
            reranked, dt = nim.rerank(question, doc_texts, top_n=min(int(SUT_RERANK_K), len(doc_texts)))
            t.timings["rerank_s"] = dt
            # Valid positions in reranker order, de-duplicated so a repeated
            # index cannot put the same chunk into the context twice.
            order = list(dict.fromkeys(int(i) for i in reranked if 0 <= int(i) < len(chunks)))
            promoted = set(order)
            chunks = [chunks[i] for i in order] + [c for j, c in enumerate(chunks) if j not in promoted]
        except Exception as e:
            # Reranking is optional; keep the vector-search order but say why.
            print("Rerank failed; keeping vector-search order:", type(e).__name__, str(e)[:200])

    budget = int(max_context_chars)
    packed: list[Chunk] = []
    total = 0
    for c in chunks:
        if not c.text:
            continue
        if total >= budget:
            break
        take = c.text[: budget - total]
        packed.append(Chunk(doc_id=c.doc_id, chunk_id=c.chunk_id, title=c.title, text=take))
        total += len(take)

    return packed, t.timings


def run_sut(question: str, *, top_k: int, max_context_chars: int) -> dict[str, Any]:
    """Run retrieval + generation for one question and return an eval record.

    Generation uses ``max_new_tokens=180`` and ``temperature=0.2``.

    Returns:
        Dict with ``user_input``, ``response``, ``retrieved_contexts`` (chunk
        texts) and ``debug`` (the retrieval knobs, context count, retrieval
        timings and ``gen_s``).
    """
    contexts, stage_timings = retrieve_contexts(
        question, top_k=top_k, max_context_chars=max_context_chars
    )
    answer, gen_seconds = gen_sut(
        make_prompt(question, contexts), max_new_tokens=180, temperature=0.2
    )

    debug: dict[str, Any] = {
        "top_k": int(top_k),
        "max_context_chars": int(max_context_chars),
        "n_contexts": len(contexts),
    }
    debug.update(stage_timings)
    debug["gen_s"] = float(gen_seconds)

    return {
        "user_input": question,
        "response": answer,
        "retrieved_contexts": [chunk.text for chunk in contexts],
        "debug": debug,
    }


def make_sut_records(questions: list[str], *, top_k: int, max_context_chars: int) -> pd.DataFrame:
    """Run the SUT over *questions*, caching the records as JSONL in ``CACHE_DIR``.

    The cache key covers everything visible here that changes the output:
    corpus file (path + mtime), seed, chunking, retrieval knobs, generation
    mode, embedding/generation model names, rerank settings and the questions
    themselves.

    Args:
        questions: questions to run, in order.
        top_k: number of hits requested from the index.
        max_context_chars: context character budget.

    Returns:
        DataFrame with one ``run_sut`` record per question.
    """
    key_parts = [
        str(CORPUS_CSV),
        str(int(CORPUS_CSV.stat().st_mtime)),
        str(RANDOM_SEED),
        str(SUT_CHUNK_SIZE),
        str(SUT_OVERLAP),
        str(top_k),
        str(max_context_chars),
        str(SUT_GEN_MODE),
        # The fields below were missing from the key, so changing the models,
        # the rerank settings or the question set returned stale records.
        str(LOCAL_EMBED_MODEL),
        str(LOCAL_GEN_MODEL),
        os.environ.get("NIM_GEN_MODEL") or "",
        str(SUT_USE_NIM_RERANK),
        str(SUT_RERANK_K),
        json.dumps(list(questions), ensure_ascii=False),
    ]
    # A separator keeps adjacent fields from running together
    # (e.g. "5" + "2000" vs "52" + "000").
    key = hashlib.sha256("\x1f".join(key_parts).encode("utf-8")).hexdigest()[:12]

    out_path = CACHE_DIR / f"sut_records_{key}.jsonl"

    # Cache read (robust): an empty, corrupt or incomplete file counts as a
    # cache miss and the records are regenerated.
    if out_path.exists() and out_path.stat().st_size > 0:
        try:
            with open(out_path, "r", encoding="utf-8") as f:
                parsed = [json.loads(line) for line in f if line.strip()]
        except (OSError, ValueError) as e:
            print("Ignoring unreadable SUT cache:", out_path.name, type(e).__name__)
        else:
            rows_in = [obj for obj in parsed if isinstance(obj, dict)]
            if rows_in and len(rows_in) == len(questions):
                return pd.DataFrame(rows_in)

    # Cache write (atomic): write to a temp file, then replace. The temp file
    # is removed if the run fails or is interrupted part-way.
    tmp_path = out_path.with_suffix(out_path.suffix + ".tmp")
    rows_out: list[dict[str, Any]] = []
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            for q in questions:
                rec = run_sut(q, top_k=top_k, max_context_chars=max_context_chars)
                rows_out.append(rec)
                f.write(json.dumps(rec, ensure_ascii=False) + "\n")
        tmp_path.replace(out_path)
    finally:
        tmp_path.unlink(missing_ok=True)

    return pd.DataFrame(rows_out)


questions = df_q["user_input"].tolist()
df_sut = make_sut_records(questions, top_k=SUT_TOP_K, max_context_chars=SUT_MAX_CONTEXT_CHARS)
print("records:", len(df_sut))
display(df_sut[["user_input", "response"]].head(3))



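## RAGAS evaluation (judge = local Ollama)

RAGAS needs an **LLM judge**. We’ll use `langchain-openai` to point it at Ollama’s OpenAI-compatible API:

- `base_url = "http://localhost:11434/v1"` (the default `NIM_CHAT_BASE_URL` plus `/v1`)
- `model = NIM_JUDGE_MODEL`

Embeddings for `answer_relevancy` still go through the NIM gateway (`NIM_BASE_URL`).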

If your Ollama server doesn’t require auth, leave `NIM_API_KEY` unset.



In [None]:
# RAGAS eval using a local Ollama judge (OpenAI-compatible); embeddings come from the NIM gateway

from ragas import evaluate
from ragas.metrics import faithfulness, AnswerRelevancy
from ragas.run_config import RunConfig
from langchain_core.embeddings import Embeddings

ANSWER_RELEVANCY_STRICTNESS = int(os.environ.get("RAGAS_ANSWER_RELEVANCY_STRICTNESS", "1"))


def make_judge_llm():
    """Build the RAGAS judge LLM against the chat endpoint (Ollama by default).

    The endpoint is ``NIM_CHAT_BASE_URL`` plus ``/v1``, the same server the
    preflight check probes, never the gateway at ``NIM_BASE_URL``. With the
    default configuration this resolves to ``http://localhost:11434/v1``.

    Side effects:
        Sets ``OPENAI_API_KEY`` (if unset) and overwrites ``OPENAI_BASE_URL``,
        ``OPENAI_API_BASE`` and ``OPENAI_API_BASE_URL`` in ``os.environ`` so
        implicitly created OpenAI clients target the same endpoint.

    Returns:
        A ``ChatOpenAI`` configured for JSON-object responses, temperature 0,
        a 60 s timeout and 2 retries.

    Raises:
        RuntimeError: if ``openai`` or ``langchain-openai`` is not installed.
    """

    api_key = os.environ.get("NIM_API_KEY") or "nim"

    # Honor the configured chat endpoint instead of a hard-coded localhost URL;
    # tolerate a value that already ends in /v1.
    chat_root = NIM_CHAT_BASE_URL.rstrip("/")
    judge_base_url = chat_root if chat_root.endswith("/v1") else chat_root + "/v1"

    # Ensure any implicit OpenAI clients also target Ollama.
    os.environ["OPENAI_API_KEY"] = os.environ.get("OPENAI_API_KEY") or api_key
    for k in ["OPENAI_BASE_URL", "OPENAI_API_BASE", "OPENAI_API_BASE_URL"]:
        os.environ[k] = judge_base_url

    try:
        from openai import AsyncOpenAI
    except Exception as e:
        raise RuntimeError("Missing dependency: openai") from e

    # Pass an explicit root async client so LangChain cannot accidentally use a cached client
    # pointing at the gateway.
    root_async_client = AsyncOpenAI(api_key=api_key, base_url=judge_base_url)

    try:
        from langchain_openai import ChatOpenAI
    except Exception as e:
        raise RuntimeError(
            "Missing dependency: langchain-openai. Install it with: pip install langchain-openai"
        ) from e

    return ChatOpenAI(
        model=NIM_JUDGE_MODEL,
        base_url=judge_base_url,
        api_key=api_key,
        root_async_client=root_async_client,
        # Enforce structured outputs for RAGAS parsers (Ollama supports this).
        model_kwargs={"response_format": {"type": "json_object"}},
        n=1,
        use_responses_api=False,
        temperature=0.0,
        timeout=60,
        max_retries=2,
    )


class NIMRagasEmbeddings(Embeddings):
    """LangChain ``Embeddings`` adapter that delegates to a ``NIMClient``.

    Documents are embedded with ``input_type="passage"`` and queries with
    ``input_type="query"``.
    """

    def __init__(self, cfg: NIMConfig):
        self.client = NIMClient(cfg)

    def embed_documents(self, texts: list[str]) -> list[list[float]]:
        """Embed *texts* as passages via ``NIMClient.embed_many``."""
        vectors, _meta = self.client.embed_many(texts, input_type="passage")
        return vectors

    def embed_query(self, text: str) -> list[float]:
        """Embed a single query string via ``NIMClient.embed``."""
        vectors, _meta = self.client.embed([text], input_type="query")
        return vectors[0]


def make_ragas_embeddings():
    """Return a ``NIMRagasEmbeddings`` whose embed and rerank URLs are pinned to the gateway."""
    # Force embeddings to the NIM gateway (avoid accidentally using Ollama for /v1/embeddings)
    gateway = NIM_BASE_URL
    return NIMRagasEmbeddings(
        NIMConfig(
            base_url=gateway,
            embed_base_url=gateway,
            rerank_base_url=gateway,
            chat_base_url=NIM_CHAT_BASE_URL,
        )
    )


def ragas_report(df_records: pd.DataFrame) -> pd.DataFrame:
    """Score SUT records with RAGAS ``faithfulness`` and ``AnswerRelevancy``.

    Only the ``user_input``, ``response`` and ``retrieved_contexts`` columns
    are sent to RAGAS. Evaluation runs with one worker and
    ``raise_exceptions=False``.

    Returns:
        The RAGAS result converted to a DataFrame.
    """
    eval_columns = ["user_input", "response", "retrieved_contexts"]
    dataset = Dataset.from_pandas(df_records[eval_columns])
    judge = make_judge_llm()
    embedder = make_ragas_embeddings()

    # Ollama only returns 1 completion; keep AnswerRelevancy strictness aligned.
    relevancy = AnswerRelevancy(strictness=int(ANSWER_RELEVANCY_STRICTNESS))

    result = evaluate(
        dataset,
        metrics=[faithfulness, relevancy],
        llm=judge,
        embeddings=embedder,
        run_config=RunConfig(max_workers=1),
        raise_exceptions=False,
    )
    return result.to_pandas()


# Baseline evaluation of the records produced above.
_metric_cols = ["faithfulness", "answer_relevancy"]

df_report = ragas_report(df_sut)
print("rows:", len(df_report), "cols:", list(df_report.columns))
display(df_report[["user_input", *_metric_cols]].head(8))

print("\nAggregate metrics:")
print(df_report[_metric_cols].agg(["mean", "median"]))

fig = px.histogram(df_report, x="faithfulness", nbins=20, title="Faithfulness distribution")
fig.show()



## Grade (pass/fail) + small knob sweep

A practical way to ship this:

- Pick thresholds (example only):
  - **faithfulness mean ≥ 0.80**
  - **answer_relevancy mean ≥ 0.70**
- Add a “tail” guardrail:
  - p10 faithfulness ≥ 0.60

Then sweep a couple knobs to answer:
- “How much quality do we buy with more retrieval/context?”
- “What does it cost in latency?”

Tip: Keep `N_EVAL_QUESTIONS` small during workshops; do bigger runs overnight in CI.



In [None]:
# Grade + sweep helper


def summarize_timings(df_records: pd.DataFrame) -> dict[str, float]:
    """Average the per-stage timings stored in each record's ``debug`` dict.

    Missing values are treated as 0.0 before averaging. Rows whose ``debug``
    entry is not a dict contribute an empty row.

    Returns:
        Mean seconds for each of ``embed_query_s``, ``index_search_s``,
        ``rerank_s`` and ``gen_s`` that appears in the data, plus ``total_s``,
        the sum of those means.
    """
    per_row = [entry if isinstance(entry, dict) else {} for entry in df_records.get("debug", [])]
    frame = pd.DataFrame(per_row).fillna(0.0)

    stages = ("embed_query_s", "index_search_s", "rerank_s", "gen_s")
    summary = {stage: float(frame[stage].mean()) for stage in stages if stage in frame.columns}
    summary["total_s"] = float(sum(summary.values()))
    return summary


def grade_report(df_report: pd.DataFrame, *, f_mean: float = 0.80, ar_mean: float = 0.70, f_p10: float = 0.60) -> dict[str, Any]:
    """Turn a RAGAS report into summary stats and a pass/fail verdict.

    Rows with a NaN score are ignored by every statistic. Such rows occur when
    a judge call fails, because the evaluation runs with
    ``raise_exceptions=False``. If a metric has no scored rows at all its
    statistic is NaN and the gate fails.

    Args:
        df_report: DataFrame with ``faithfulness`` and ``answer_relevancy`` columns.
        f_mean: minimum mean faithfulness.
        ar_mean: minimum mean answer relevancy.
        f_p10: minimum 10th-percentile faithfulness (tail guardrail).

    Returns:
        Dict with ``faithfulness_mean``, ``faithfulness_p10``,
        ``answer_relevancy_mean`` and the boolean ``pass``.
    """
    f = df_report["faithfulness"].astype(float)
    ar = df_report["answer_relevancy"].astype(float)
    stats: dict[str, Any] = {
        "faithfulness_mean": float(f.mean()),
        # Series.quantile skips NaN exactly like Series.mean does. np.quantile
        # would return NaN as soon as one row failed to score, and that alone
        # would force the gate to fail.
        "faithfulness_p10": float(f.quantile(0.10)),
        "answer_relevancy_mean": float(ar.mean()),
    }
    # Comparisons with NaN are False, so an unscored metric can never pass.
    stats["pass"] = bool(
        stats["faithfulness_mean"] >= float(f_mean)
        and stats["answer_relevancy_mean"] >= float(ar_mean)
        and stats["faithfulness_p10"] >= float(f_p10)
    )
    return stats


# Baseline grade
baseline_grade = grade_report(df_report)
print("Baseline grade:")
print(json.dumps(baseline_grade, indent=2))

# Small sweep (keep tiny for workshop speed)
TOP_KS = [5, 10, 15]
MAX_CTXS = [2000, 6000]

rows_sweep = []
for top_k in TOP_KS:
    for max_ctx in MAX_CTXS:
        is_baseline = (int(top_k), int(max_ctx)) == (int(SUT_TOP_K), int(SUT_MAX_CONTEXT_CHARS))
        if is_baseline:
            # This configuration was already generated and judged above.
            # Reuse those results instead of paying for the judge calls again.
            df_rec, df_rep, g = df_sut, df_report, baseline_grade
        else:
            df_rec = make_sut_records(questions, top_k=int(top_k), max_context_chars=int(max_ctx))
            df_rep = ragas_report(df_rec)
            g = grade_report(df_rep)
        t = summarize_timings(df_rec)
        rows_sweep.append({
            "top_k": int(top_k),
            "max_context_chars": int(max_ctx),
            **g,
            **t,
        })


df_sweep = pd.DataFrame(rows_sweep)
print("\nSweep summary:")
# Passing configurations first, then best quality first.
display(df_sweep.sort_values(["pass", "faithfulness_mean", "answer_relevancy_mean"], ascending=False))

# Quality vs. mean per-question latency for every swept configuration.
fig = px.scatter(
    df_sweep,
    x="total_s",
    y="faithfulness_mean",
    color="pass",
    symbol="top_k",
    hover_data=["max_context_chars", "answer_relevancy_mean", "faithfulness_p10"],
    title="Quality vs latency (sweep)",
)
fig.show()



## Takeaways + consulting drills

### What to ship to a customer
- A versioned **golden set** (questions)
- A repeatable **harness** that produces `response` + `retrieved_contexts`
- A **grade gate** (thresholds + rationale)
- A simple report artifact (CSV/JSON) and a CI job that runs it

### Drills
1. **Quality regression:** lower `max_context_chars` and explain why faithfulness moves.
2. **Latency SLO:** choose the fastest passing config from the sweep.
3. **Metric gaming:** manually inspect a few “high score” outputs — do you actually like them?

### Next step
Add online monitoring (logs/metrics/traces) and correlate quality drops to retrieval changes, indexing issues, or upstream model changes.


---

**End of notebook.**

