# Module C: RAG Architecture & Latency Profiler

**Goal:** Meet the strict 1.5s SLA while maintaining accuracy.

**Persona:** Solutions Architect


In [1]:
import os
import json
import time
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.graph_objects as go

# Widgets are optional; notebook works without them.
import ipywidgets as widgets
from IPython.display import display, clear_output

from sentence_transformers import SentenceTransformer, CrossEncoder

from rag_pipeline import (
    RAGConfig,
    Timer,
    VectorIndex,
    build_chunks_from_df,
    chunk_text,
    make_prompt,
    normalize_rows,
)
from nim_clients import NIMClient, NIMConfig

print("✅ Module C initialized")


✅ Module C initialized


In [None]:
os.environ["HF_TRUST_REMOTE_CODE"]="1"

## Workshop Setup Notes

### Models
- **Local generation/judge**: `LOCAL_GEN_MODEL` (default: `meta-llama/Llama-3.1-8B-Instruct`)
- **Local embedding**: `LOCAL_EMBED_MODEL` (default: `sentence-transformers/all-MiniLM-L6-v2`)
- **Local reranker**: `LOCAL_RERANK_MODEL` (default: `cross-encoder/ms-marco-MiniLM-L-6-v2`)

### NIM endpoints (localhost)
This lab expects NIMs to be running locally and reachable via HTTP.

You can override endpoints/models using env vars:
- `NIM_API_KEY` (optional)
- `NIM_EMBED_MODEL`, `NIM_RERANK_MODEL`, `NIM_GEN_MODEL`
- `NIM_EMBED_PATH` (default `/v1/embeddings`)
- `NIM_RERANK_PATH` (default `/v1/rerank`)
- `NIM_CHAT_PATH` (default `/v1/chat/completions`)
- `NIM_BASE_URL` (default `http://localhost:8000`)

### What is measured in the SLA waterfall
- The waterfall measures **per-request latency** for query embed/retrieve/rerank/generate.
- **Index building** (chunk embedding for the corpus) is cached and is not part of the SLA.

### If widgets don’t work
Scroll to the bottom for the **NO-WIDGET FALLBACK** cell.


## Phase 1: Real Latency Waterfall (Local vs NIM)

This phase runs a **real** RAG request and measures wall-clock time for:

- Embed query
- Retrieve (vector index)
- Rerank
- Generate

Then it plots a measured waterfall and checks the 1.5s SLA.

We run it in two modes:
- **Local**: embedding + rerank + generation in-process
- **NIM**: embedding + rerank + generation via localhost NIM endpoints


In [None]:
# --- Load corpus from Module B run (single remaining run) ---
RUN_DIR = Path("corpus_runs/llm_richer_n20_20251211_193028")
CSV_PATH = RUN_DIR / "fico_corpus_embedded.csv"

if not CSV_PATH.exists():
    raise FileNotFoundError(f"Missing corpus CSV: {CSV_PATH}")

df_docs = pd.read_csv(CSV_PATH)

# Parse list-like columns stored as JSON strings
for col in ["tags", "allowed_roles", "allowed_tenants", "restricted_tags"]:
    if col in df_docs.columns:
        df_docs[col] = df_docs[col].apply(lambda v: json.loads(v) if isinstance(v, str) and v.strip().startswith("[") else [])

print(f"✅ Loaded {len(df_docs)} docs from {CSV_PATH}")


# --- Build chunk dataset ---
# Use redacted body to avoid leaking sensitive lines during workshops.
chunks = build_chunks_from_df(df_docs)

# Respect chunk settings via widgets; default chunking above is coarse, so we'll rebuild chunks when chunk_size changes.

def build_chunks(chunk_size: int, overlap: int):
    out = []
    for _, row in df_docs.iterrows():
        doc_id = str(row.get("doc_id"))
        title = row.get("title")
        tenant_id = row.get("tenant_id")
        doc_type = row.get("doc_type")
        tags = row.get("tags")
        body = row.get("body_redacted") or row.get("body") or ""
        parts = chunk_text(str(body), chunk_size=chunk_size, overlap=overlap)
        for j, part in enumerate(parts):
            out.append(
                {
                    "doc_id": doc_id,
                    "chunk_id": f"{doc_id}::c{j:03d}",
                    "title": str(title) if title is not None else None,
                    "tenant_id": str(tenant_id) if tenant_id is not None else None,
                    "doc_type": str(doc_type) if doc_type is not None else None,
                    "tags": list(tags) if isinstance(tags, list) else [],
                    "text": part,
                }
            )
    return out


# --- Models ---
# Local models
LOCAL_EMBED_MODEL = os.environ.get("LOCAL_EMBED_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
LOCAL_RERANK_MODEL = os.environ.get("LOCAL_RERANK_MODEL", "cross-encoder/ms-marco-MiniLM-L-6-v2")
# Text-only default (works with AutoModelForCausalLM). Override via env var.
LOCAL_GEN_MODEL = os.environ.get("LOCAL_GEN_MODEL", "TinyLlama/TinyLlama-1.1B-Chat-v1.0")

local_embedder = SentenceTransformer(LOCAL_EMBED_MODEL, device="cuda")
local_reranker = CrossEncoder(LOCAL_RERANK_MODEL, device="cuda")

# NIM client
nim = NIMClient(NIMConfig())


def embed_local(texts: list[str]):
    emb = local_embedder.encode(texts, normalize_embeddings=True, convert_to_numpy=True, show_progress_bar=False)
    return emb.astype(np.float32)


def rerank_local(query: str, docs: list[str]):
    pairs = [(query, d) for d in docs]
    scores = local_reranker.predict(pairs)
    order = np.argsort(-np.array(scores))
    return order.tolist(), scores


_LOCAL_GEN = {"tok": None, "model": None}


def ensure_local_generator():
    if _LOCAL_GEN["tok"] is not None and _LOCAL_GEN["model"] is not None:
        return _LOCAL_GEN["tok"], _LOCAL_GEN["model"]

    import torch
    from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

    trust_remote_code = str(os.environ.get("HF_TRUST_REMOTE_CODE", "0")).lower() in {"1", "true", "yes"}

    # Fail fast on known vision configs (e.g. GLM-4V) which cannot be loaded as a text CausalLM.
    try:
        cfg = AutoConfig.from_pretrained(LOCAL_GEN_MODEL, trust_remote_code=trust_remote_code)
    except KeyError as e:
        # Usually indicates your transformers version doesn't recognize a new `model_type`.
        raise ValueError(
            f"Transformers in this environment doesn't recognize model_type={e!s} while loading LOCAL_GEN_MODEL={LOCAL_GEN_MODEL!r}. "
            "Fix: (1) pick a simpler text model like TinyLlama/TinyLlama-1.1B-Chat-v1.0, or (2) upgrade transformers (pip install -U transformers), "
            "or (3) set HF_TRUST_REMOTE_CODE=1 if the model requires custom loading code."
        ) from e
    except Exception as e:
        raise ValueError(
            f"Failed to load config for LOCAL_GEN_MODEL={LOCAL_GEN_MODEL!r}: {type(e).__name__}: {e}"
        ) from e

    if cfg.__class__.__name__.lower().startswith("glm4v") or getattr(cfg, "model_type", "") in {"glm4v", "glm-4v"}:
        raise ValueError(
            "LOCAL_GEN_MODEL appears to be a vision (VLM) model config, which can't be loaded via AutoModelForCausalLM. "
            "Set LOCAL_GEN_MODEL to a text-only causal LM (e.g. TinyLlama/TinyLlama-1.1B-Chat-v1.0) or switch Mode to 'nim'."
        )

    tok = AutoTokenizer.from_pretrained(LOCAL_GEN_MODEL, use_fast=True, trust_remote_code=trust_remote_code)
    tok.padding_side = "left"
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token

    dtype = torch.bfloat16 if (torch.cuda.is_available() and torch.cuda.is_bf16_supported()) else torch.float16

    model = AutoModelForCausalLM.from_pretrained(
        LOCAL_GEN_MODEL,
        device_map="auto",
        torch_dtype=dtype,
        trust_remote_code=trust_remote_code,
    )
    model.eval()

    _LOCAL_GEN["tok"] = tok
    _LOCAL_GEN["model"] = model
    return tok, model


def gen_local(prompt: str, max_new_tokens: int, temperature: float):
    import torch

    tok, model = ensure_local_generator()

    inputs = tok([prompt], return_tensors="pt", padding=True, truncation=True).to(model.device)
    with torch.no_grad():
        out = model.generate(
            **inputs,
            do_sample=True,
            temperature=float(temperature),
            top_p=0.95,
            max_new_tokens=int(max_new_tokens),
            pad_token_id=tok.pad_token_id,
            eos_token_id=tok.eos_token_id,
        )

    gen_ids = out[0, inputs["input_ids"].shape[1] :]
    return tok.decode(gen_ids, skip_special_tokens=True).strip()


def build_index_for_mode(mode: str, chunk_rows: list[dict], use_faiss: bool = True):
    texts = [r["text"] for r in chunk_rows]

    if mode == "local":
        E = embed_local(texts)
        E_norm = normalize_rows(E)
        return VectorIndex(E_norm, use_faiss=use_faiss), E_norm

    if mode == "nim":
        # Batch to avoid request-size limits
        embs, _ = nim.embed_many(texts, batch_size=64)
        E = np.array(embs, dtype=np.float32)
        E_norm = normalize_rows(E)
        return VectorIndex(E_norm, use_faiss=use_faiss), E_norm

    raise ValueError(f"unknown mode: {mode}")


def run_rag(query: str, *, mode: str, chunk_rows: list[dict], index: VectorIndex, cfg: RAGConfig):
    timer = Timer()

    # Embed query
    if mode == "local":
        q_emb = timer.time("embed", lambda: embed_local([query])[0])
        net_t = 0.0
    else:
        embs, net_t = timer.time("network+embed", lambda: nim.embed([query]))
        q_emb = np.array(embs[0], dtype=np.float32)

    q_norm = q_emb / (np.linalg.norm(q_emb) + 1e-12)

    # Retrieve
    idxs, sims = timer.time("retrieve", lambda: index.search(q_norm, cfg.top_k))
    idxs = [int(i) for i in idxs]

    candidates = [chunk_rows[i] for i in idxs]

    # Rerank
    if cfg.rerank_top_k > 0:
        docs = [c["text"] for c in candidates]
        if mode == "local":
            order, _ = timer.time("rerank", lambda: rerank_local(query, docs))
        else:
            order, _ = timer.time("network+rerank", lambda: nim.rerank(query, docs, top_n=min(cfg.rerank_top_k, len(docs))))

        # NIM rerank returns indices into docs; local returns order
        if isinstance(order, list) and order and isinstance(order[0], int):
            reranked = [candidates[i] for i in order[: cfg.rerank_top_k]]
        else:
            reranked = candidates[: cfg.rerank_top_k]
    else:
        reranked = candidates

    # Context packing
    context = []
    used = 0
    for r in reranked:
        t = r["text"]
        if used + len(t) > cfg.max_context_chars:
            break
        context.append(r)
        used += len(t)

    from rag_pipeline import Chunk

    context_chunks = [
        Chunk(
            doc_id=str(r.get("doc_id")),
            chunk_id=str(r.get("chunk_id")),
            text=str(r.get("text")),
            title=r.get("title"),
            tenant_id=r.get("tenant_id"),
            doc_type=r.get("doc_type"),
            tags=r.get("tags") if isinstance(r.get("tags"), list) else [],
        )
        for r in context
    ]

    prompt = make_prompt(query, context_chunks)

    # Generate
    if mode == "local":
        answer = timer.time("generate", lambda: gen_local(prompt, cfg.max_new_tokens, cfg.temperature))
    else:
        answer, _ = timer.time("network+generate", lambda: nim.chat(prompt, max_tokens=cfg.max_new_tokens, temperature=cfg.temperature))

    # Timings
    timings = dict(timer.timings)
    if net_t and "network+embed" not in timings:
        timings["network+embed"] = float(net_t)

    return answer, context, timings


def plot_measured_waterfall(timings_s: dict, title: str):
    # Normalize keys into ordered components
    keys = [
        "network+embed",
        "embed",
        "retrieve",
        "network+rerank",
        "rerank",
        "network+generate",
        "generate",
    ]
    labels = {
        "network+embed": "NIM:Embed (HTTP)",
        "embed": "Embed",
        "retrieve": "Retrieve",
        "network+rerank": "NIM:Rerank (HTTP)",
        "rerank": "Rerank",
        "network+generate": "NIM:Gen (HTTP)",
        "generate": "Generate",
    }

    parts = [(k, float(timings_s.get(k, 0.0))) for k in keys if float(timings_s.get(k, 0.0)) > 0]
    total = sum(v for _, v in parts)

    fig = go.Figure(
        go.Waterfall(
            name="Latency",
            orientation="v",
            measure=["relative"] * len(parts) + ["total"],
            x=[labels[k] for k, _ in parts] + ["TOTAL"],
            textposition="outside",
            text=[f"{v:.3f}s" for _, v in parts] + [f"{total:.3f}s"],
            y=[v for _, v in parts] + [0],
            connector={"line": {"color": "rgb(63, 63, 63)"}},
        )
    )

    fig.add_hline(
        y=1.5,
        line_dash="dot",
        annotation_text="SLA Limit (1.5s)",
        line_color="red" if total > 1.5 else "green",
    )

    fig.update_layout(title=title, showlegend=False, yaxis=dict(title="Time (seconds)", range=[0, max(2.5, total * 1.2)]))
    fig.show()

    print(f"⏱️ TOTAL LATENCY: {total:.3f}s")
    if total > 1.5:
        print("❌ FAILED SLA. Optimize configuration!")
    else:
        print("✅ PASSED SLA.")


# -- CONTROLS (optional) --
style = {"description_width": "initial"}

w_chunk = widgets.IntSlider(value=900, min=200, max=2000, description="Chunk Size (chars):", style=style)
w_overlap = widgets.IntSlider(value=150, min=0, max=500, description="Overlap (chars):", style=style)
w_k = widgets.IntSlider(value=10, min=1, max=50, description="Top_K Chunks:", style=style)
w_rerank_k = widgets.IntSlider(value=5, min=0, max=20, description="Rerank Top_K:", style=style)

w_query = widgets.Text(value="Kubernetes incident runbook", description="Query:")

w_mode = widgets.Dropdown(options=["local", "nim"], value="local", description="Mode:")
btn_run = widgets.Button(description="Run RAG + plot latency")
btn_compare = widgets.Button(description="Compare Local vs NIM (SLA)")

out = widgets.Output()

# Cache indexes per mode + chunk settings so repeated runs are fast
_INDEX_CACHE = {}


def get_index(mode: str, chunk_size: int, overlap: int):
    key = (mode, int(chunk_size), int(overlap))
    if key in _INDEX_CACHE:
        return _INDEX_CACHE[key]

    rows = build_chunks(int(chunk_size), int(overlap))
    index, _ = build_index_for_mode(mode, rows, use_faiss=True)
    _INDEX_CACHE[key] = (rows, index)
    return _INDEX_CACHE[key]


def on_run_click(b):
    with out:
        clear_output()
        rows, index = get_index(w_mode.value, w_chunk.value, w_overlap.value)
        cfg = RAGConfig(mode=w_mode.value, top_k=int(w_k.value), rerank_top_k=int(w_rerank_k.value))
        answer, context, timings = run_rag(w_query.value, mode=w_mode.value, chunk_rows=rows, index=index, cfg=cfg)
        plot_measured_waterfall(timings, title=f"RAG Latency ({w_mode.value})")
        print("\n--- Answer (truncated) ---")
        print((answer or "")[:800])


def on_compare_click(b):
    with out:
        clear_output()

        rows_l, idx_l = get_index("local", w_chunk.value, w_overlap.value)
        rows_n, idx_n = get_index("nim", w_chunk.value, w_overlap.value)

        cfg_l = RAGConfig(mode="local", top_k=int(w_k.value), rerank_top_k=int(w_rerank_k.value))
        cfg_n = RAGConfig(mode="nim", top_k=int(w_k.value), rerank_top_k=int(w_rerank_k.value))

        q = w_query.value

        ans_l, ctx_l, t_l = run_rag(q, mode="local", chunk_rows=rows_l, index=idx_l, cfg=cfg_l)
        ans_n, ctx_n, t_n = run_rag(q, mode="nim", chunk_rows=rows_n, index=idx_n, cfg=cfg_n)

        print("=== LOCAL ===")
        plot_measured_waterfall(t_l, title="RAG Latency (local)")
        print("\n=== NIM ===")
        plot_measured_waterfall(t_n, title="RAG Latency (nim)")

        total_l = float(sum(t_l.values()))
        total_n = float(sum(t_n.values()))
        print("\n=== Comparison ===")
        print(f"Local total: {total_l:.3f}s")
        print(f"NIM total:   {total_n:.3f}s")
        print(f"Delta (NIM-Local): {(total_n-total_l):+.3f}s")


btn_run.on_click(on_run_click)
btn_compare.on_click(on_compare_click)

display(widgets.VBox([
    widgets.HBox([w_mode, btn_run, btn_compare]),
    w_query,
    widgets.HBox([w_chunk, w_overlap]),
    widgets.HBox([w_k, w_rerank_k]),
    out,
]))


✅ Loaded 69 docs from corpus_runs/llm_richer_n20_20251211_193028/fico_corpus_embedded.csv


VBox(children=(HBox(children=(Dropdown(description='Mode:', options=('local', 'nim'), value='local'), Button(d…

## Phase 2: Real Automated Eval (RAGAS)

This phase builds a tiny evaluation set from the corpus and runs **real RAGAS** metrics.

- It runs the same queries through **Local** and **NIM** modes.
- It uses a **local Llama-3.1-8B-Instruct** judge (as requested) to compute metrics.

Note: RAGAS will be slower than Phase 1 because it calls an LLM as a judge.


In [None]:
# --- Local generator/judge (cached) ---
# Reuse the cached local generator from Phase 1 if it exists; otherwise define it.

if "ensure_local_generator" not in globals():
    _LOCAL_GEN = {"tok": None, "model": None}

    def ensure_local_generator():
        if _LOCAL_GEN["tok"] is not None and _LOCAL_GEN["model"] is not None:
            return _LOCAL_GEN["tok"], _LOCAL_GEN["model"]

        import torch
        from transformers import AutoModelForCausalLM, AutoTokenizer

        tok = AutoTokenizer.from_pretrained(LOCAL_GEN_MODEL, use_fast=True)
        tok.padding_side = "left"
        if tok.pad_token is None:
            tok.pad_token = tok.eos_token

        model = AutoModelForCausalLM.from_pretrained(
            LOCAL_GEN_MODEL,
            device_map="auto",
            dtype=torch.bfloat16,
        )
        model.eval()

        _LOCAL_GEN["tok"] = tok
        _LOCAL_GEN["model"] = model
        return tok, model


def gen_local_cached(prompt: str, max_new_tokens: int = 180, temperature: float = 0.2):
    import torch

    tok, model = ensure_local_generator()

    inputs = tok([prompt], return_tensors="pt", padding=True, truncation=True).to(model.device)
    with torch.no_grad():
        out = model.generate(
            **inputs,
            do_sample=True,
            temperature=float(temperature),
            top_p=0.95,
            max_new_tokens=int(max_new_tokens),
            pad_token_id=tok.pad_token_id,
            eos_token_id=tok.eos_token_id,
        )

    gen_ids = out[0, inputs["input_ids"].shape[1] :]
    return tok.decode(gen_ids, skip_special_tokens=True).strip()


# --- RAGAS eval ---
# We create a small synthetic QA set grounded in context, then evaluate.

from datasets import Dataset


def make_eval_questions(n: int = 6):
    # Sample docs and generate grounded questions using the local model.
    # We generate questions from redacted bodies to keep the lab safe.
    sample = df_docs.sample(n=min(n, len(df_docs)), random_state=42)
    rows = []
    for _, r in sample.iterrows():
        ctx = str(r.get("body_redacted") or r.get("body") or "")
        ctx = ctx[:2000]
        prompt = (
            "Create one question that can be answered ONLY from the provided context.\n"
            "Return ONLY the question.\n\n"
            f"CONTEXT:\n{ctx}\n\nQUESTION:"
        )
        q = gen_local_cached(prompt, max_new_tokens=64, temperature=0.2)
        q = (q.splitlines()[0] if q else "").strip()
        if not q.endswith("?"):
            q = q + "?"
        rows.append({"question": q, "doc_id": str(r.get("doc_id"))})
    return rows


def answer_with_mode(question: str, mode: str, chunk_size: int, overlap: int, top_k: int, rerank_k: int):
    rows, index = get_index(mode, chunk_size, overlap)
    cfg = RAGConfig(mode=mode, top_k=int(top_k), rerank_top_k=int(rerank_k))
    answer, ctx_rows, timings = run_rag(question, mode=mode, chunk_rows=rows, index=index, cfg=cfg)

    contexts = [c["text"] for c in ctx_rows]
    return answer, contexts, timings


def run_ragas_eval(n_questions: int = 6, chunk_size: int = 900, overlap: int = 150, top_k: int = 10, rerank_k: int = 5):
    from ragas import evaluate
    from ragas.metrics import faithfulness, answer_relevancy

    eval_qs = make_eval_questions(n_questions)

    # Build dataset entries for both modes
    # ragas>=0.4 expects: user_input, response, retrieved_contexts
    records = []
    lat_records = []
    for q in eval_qs:
        for mode in ["local", "nim"]:
            ans, ctxs, timings = answer_with_mode(q["question"], mode, chunk_size, overlap, top_k, rerank_k)
            records.append(
                {
                    "user_input": q["question"],
                    "response": ans,
                    "retrieved_contexts": ctxs,
                    "mode": mode,
                }
            )
            lat_records.append({"mode": mode, **timings})

    ds = Dataset.from_list(records)

    # RAGAS needs an LLM judge. Use local Llama via a simple langchain wrapper.
    from transformers import pipeline
    from langchain_community.llms import HuggingFacePipeline

    tok, model = ensure_local_generator()
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tok,
        max_new_tokens=256,
        temperature=0.0,
        do_sample=False,
        return_full_text=False,
    )
    llm = HuggingFacePipeline(pipeline=pipe)

    # Evaluate
    report = evaluate(ds, metrics=[faithfulness, answer_relevancy], llm=llm)
    df_report = report.to_pandas()

    # Aggregate by mode
    print("\n=== RAGAS report (by mode) ===")
    print(df_report.groupby("mode")[["faithfulness", "answer_relevancy"]].mean())

    # Latency summary
    df_lat = pd.DataFrame(lat_records).fillna(0.0)
    print("\n=== Latency summary (per-stage means) ===")
    cols = [c for c in df_lat.columns if c != "mode"]
    print(df_lat.groupby("mode")[cols].mean())

    return df_report, df_lat


btn_eval = widgets.Button(description="Run RAGAS Eval (Local vs NIM)")
out_eval = widgets.Output()


def on_eval_click(b):
    with out_eval:
        clear_output()
        # Keep small by default for workshops
        run_ragas_eval(n_questions=6, chunk_size=int(w_chunk.value), overlap=int(w_overlap.value), top_k=int(w_k.value), rerank_k=int(w_rerank_k.value))


btn_eval.on_click(on_eval_click)
display(btn_eval, out_eval)


Button(description='Run RAGAS Eval (Local vs NIM)', style=ButtonStyle())

Output()

In [4]:
# ---- NO-WIDGET FALLBACK ----
# If ipywidgets don't render/click in your environment, run these direct calls.

print("\n=== No-widget fallback: Phase 1 ===")

# Build indexes once
rows_local, index_local = get_index("local", chunk_size=900, overlap=150)
rows_nim, index_nim = get_index("nim", chunk_size=900, overlap=150)

cfg = RAGConfig(mode="local", top_k=10, rerank_top_k=5)

q = "GPU latency impacts APR"

ans_l, ctx_l, t_l = run_rag(q, mode="local", chunk_rows=rows_local, index=index_local, cfg=cfg)
plot_measured_waterfall(t_l, title="RAG Latency (local)")

print("\n---")

ans_n, ctx_n, t_n = run_rag(q, mode="nim", chunk_rows=rows_nim, index=index_nim, cfg=RAGConfig(mode="nim", top_k=10, rerank_top_k=5))
plot_measured_waterfall(t_n, title="RAG Latency (nim)")

print("\n=== No-widget fallback: Phase 2 (RAGAS) ===")
# Small eval by default
run_ragas_eval(n_questions=4, chunk_size=900, overlap=150, top_k=10, rerank_k=5)



=== No-widget fallback: Phase 1 ===


ConnectionError: HTTPConnectionPool(host='localhost', port=8000): Max retries exceeded with url: /v1/embeddings (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7ea53a22ba60>: Failed to establish a new connection: [Errno 111] Connection refused'))