# Module B: Vector Ops & RBAC Security Console

**Goal:** Visualize "Meaning" as "Distance" and implement Security Filters.

**Persona:** AI Architect

**Hardware:** GPU (Optional but recommended for speed demo)


In [11]:
# -- SETUP --

import json
import os
import glob

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import ipywidgets as widgets
from IPython.display import display, clear_output

from sentence_transformers import SentenceTransformer

# Reaching this line means every import in this cell succeeded.
print("‚úÖ Cockpit Initialized.")


‚úÖ Cockpit Initialized.


## Phase 1: The Vector Radar (Core)

Visualize how the model clusters concepts. We have injected 3 distinct clusters.


In [12]:
# -- DATA LOADING (Real) --
# Loads the most recently modified corpus run under corpus_runs/ and initializes
# everything needed for real query projection + real semantic search.


def _parse_json_list(val):
    """Coerce one CSV cell into a Python list.

    Args:
        val: The raw cell value. Lists are passed through untouched; strings are
            decoded as JSON; anything else (e.g. a NaN float for an empty cell)
            is treated as "no entries".

    Returns:
        A list. Blank strings, undecodable strings, and JSON documents whose top
        level is not an array (objects, strings, numbers, ``null``) all yield
        ``[]``, so callers can always use list membership tests on the result.
    """
    if isinstance(val, list):
        return val
    if not isinstance(val, str) or not val.strip():
        return []
    try:
        parsed = json.loads(val)
    except (ValueError, RecursionError):
        # json.JSONDecodeError is a ValueError; RecursionError covers absurdly
        # deep nesting. Both mean "not usable", which stays best-effort.
        return []
    # A decoded dict or str would turn later `x in column_value` checks into
    # key or substring tests, so only a genuine JSON array is accepted.
    return parsed if isinstance(parsed, list) else []


# Collect every embedded-corpus CSV below corpus_runs/ (searched recursively).
csv_candidates = glob.glob("corpus_runs/**/fico_corpus_embedded.csv", recursive=True)
if len(csv_candidates) == 0:
    raise FileNotFoundError("No corpus found under corpus_runs/**/fico_corpus_embedded.csv")

# The file with the newest modification time wins; its folder is the active run.
csv_path = max(csv_candidates, key=os.path.getmtime)
run_dir = os.path.dirname(csv_path)

# The embedding matrix and the PCA/scaler payload are expected next to the CSV.
emb_path = os.path.join(run_dir, "fico_corpus_embeddings.npy")
pca_path = os.path.join(run_dir, "fico_corpus_pca.json")

if not (os.path.exists(emb_path) and os.path.exists(pca_path)):
    raise FileNotFoundError(
        "Missing embeddings/PCA artifacts. Re-run generate_fico_corpus.py with --save-embeddings.\n"
        f"Expected: {emb_path} and {pca_path}"
    )

print(f"‚úÖ Loading corpus: {csv_path}")
print(f"‚úÖ Loading embeddings: {emb_path}")
print(f"‚úÖ Loading PCA params: {pca_path}")

# Read the corpus table, then decode whichever JSON-encoded list columns exist.
_df = pd.read_csv(csv_path)
for col in ("tags", "allowed_roles", "allowed_tenants", "restricted_tags"):
    if col not in _df.columns:
        continue
    _df[col] = _df[col].apply(_parse_json_list)

# Name under which the functions defined further down read the corpus.
df = _df

# Load embeddings + PCA/scaler params
E = np.load(emb_path).astype(np.float32)  # shape: (N, D)
with open(pca_path, "r", encoding="utf-8") as f:
    pca_payload = json.load(f)

# Align embeddings row order to df by doc_id
if "doc_id" not in df.columns:
    raise ValueError("CSV is missing doc_id; rerun corpus generation.")

doc_ids_df = df["doc_id"].astype(str).tolist()
doc_ids_saved = [str(x) for x in pca_payload.get("doc_id_order", [])]

if len(doc_ids_saved) != len(doc_ids_df) or set(doc_ids_saved) != set(doc_ids_df):
    raise ValueError("doc_id mismatch between CSV and PCA payload; rerun corpus generation.")

# The reordering below treats row i of E as the embedding of doc_ids_saved[i].
# Without this check a matrix with extra or missing rows would either index out
# of range later or quietly pair documents with the wrong vectors.
if E.ndim != 2 or E.shape[0] != len(doc_ids_saved):
    raise ValueError(
        f"Embedding matrix shape {E.shape} does not match {len(doc_ids_saved)} doc_ids; "
        "rerun corpus generation."
    )

if doc_ids_saved != doc_ids_df:
    # A repeated doc_id would collapse to a single slot in the lookup below, so
    # two CSV rows would silently share one embedding.
    if len(set(doc_ids_saved)) != len(doc_ids_saved):
        raise ValueError("Duplicate doc_id values make row alignment ambiguous; rerun corpus generation.")
    idx = {doc_id: i for i, doc_id in enumerate(doc_ids_saved)}
    order = [idx[x] for x in doc_ids_df]
    E = E[order]

# Pre-normalize embeddings for cosine similarity (epsilon guards all-zero rows)
E_norm = E / (np.linalg.norm(E, axis=1, keepdims=True) + 1e-12)

# PCA projection params
scaler_mean = np.array(pca_payload["scaler"]["mean"], dtype=np.float32)
scaler_scale = np.array(pca_payload["scaler"]["scale"], dtype=np.float32)
pca_mean = np.array(pca_payload["pca"]["mean"], dtype=np.float32)
pca_components = np.array(pca_payload["pca"]["components"], dtype=np.float32)  # (3, D)

# Embedder for query embeddings.
# No explicit device: SentenceTransformer then picks a GPU when one is usable and
# otherwise runs on CPU. Forcing "cuda" made this cell fail on CPU-only hosts
# even though the notebook header lists the GPU as optional.
embed_model_id = pca_payload.get("embed_model", "sentence-transformers/all-MiniLM-L6-v2")
embedder = SentenceTransformer(embed_model_id)

print(f"‚úÖ Loaded {len(df)} documents across {df['cluster'].nunique()} clusters.")


‚úÖ Loading corpus: corpus_runs/llm_richer_n20_20251211_193028/fico_corpus_embedded.csv
‚úÖ Loading embeddings: corpus_runs/llm_richer_n20_20251211_193028/fico_corpus_embeddings.npy
‚úÖ Loading PCA params: corpus_runs/llm_richer_n20_20251211_193028/fico_corpus_pca.json


‚úÖ Loaded 69 documents across 3 clusters.


In [13]:
# -- INTERACTIVE RADAR (Real query projection) --


def project_query_to_xyz(query: str):
    """Embed a query and map it through the saved scaler + PCA parameters.

    Args:
        query: Free-text query; ``None`` and whitespace-only strings count as empty.

    Returns:
        ``None`` for an empty query, otherwise the projected coordinates obtained
        by standardising the embedding with ``scaler_mean``/``scaler_scale`` and
        multiplying the ``pca_mean``-centred result by ``pca_components.T``.
    """
    text = (query or "").strip()
    if text == "":
        return None

    encoded = embedder.encode(
        [text],
        normalize_embeddings=True,
        convert_to_numpy=True,
        show_progress_bar=False,
    )
    vector = encoded[0].astype(np.float32)

    # Standardise (epsilon avoids a zero divisor), centre, then project.
    standardized = (vector - scaler_mean) / (scaler_scale + 1e-12)
    centered = standardized - pca_mean
    return centered @ pca_components.T


def plot_radar(inject_query=None):
    """Draw the 3-D scatter of the corpus and, optionally, one projected query.

    Args:
        inject_query: Optional query text. When it is truthy and
            ``project_query_to_xyz`` returns coordinates, a red ``x`` marker
            labelled with the query is added to the figure.

    The figure is shown via ``fig.show()``; nothing is returned.
    """
    hover_columns = ("title", "doc_type", "tenant_id", "owner_team", "redaction_count")
    fig = px.scatter_3d(
        df,
        x='x', y='y', z='z',
        color='cluster',
        symbol='cluster',
        opacity=0.75,
        title="FICO Vector Space Radar",
        hover_data={column: True for column in hover_columns},
    )

    # Only attempt a projection when a query was actually supplied.
    xyz = project_query_to_xyz(inject_query) if inject_query else None
    if xyz is not None:
        coords = [float(xyz[axis]) for axis in range(3)]
        query_marker = go.Scatter3d(
            x=[coords[0]],
            y=[coords[1]],
            z=[coords[2]],
            mode='markers+text',
            marker=dict(size=10, color='red', symbol='x'),
            text=[f"QUERY: {inject_query}"],
            textposition="top center",
            name='Injected Query',
        )
        fig.add_trace(query_marker)

    fig.update_layout(margin=dict(l=0, r=0, b=0, t=30))
    fig.show()


# Widget Setup
# Free-text box holding the query that on_click passes to plot_radar.
text_input = widgets.Text(
    value='',
    placeholder='Type a query (e.g., "GPU latency impacts APR")',
    description='Query:',
)
# Button whose click handler (registered below) redraws the radar.
button = widgets.Button(description="Inject Query", button_style='danger')


def on_click(b):
    """Button callback: redraw the controls and the radar for the typed query.

    ``b`` is the argument handed over by the button's click dispatch; it is unused.
    """
    # Clear the previous figure, then put the controls back before plotting so
    # they remain on screen above the new radar.
    # NOTE(review): some front ends do not route output produced inside widget
    # callbacks to the cell's output area; confirm this renders where expected.
    clear_output(wait=True)
    display(ui)
    plot_radar(text_input.value)


# Register the callback and group the controls; on_click re-displays `ui` as one unit.
button.on_click(on_click)
ui = widgets.VBox([text_input, button])


# Initial render: the controls, then the radar without an injected query.
display(ui)
plot_radar()


VBox(children=(Text(value='Server is crashing', description='Query:', placeholder='Type a query (e.g., "GPU la‚Ä¶

## Phase 2: RBAC Security Filter (Extension)

**Scenario:** A "Junior Analyst" tries to access sensitive data.

**Task:** Filter search results based on the user's role, tenant, and clearance tags.


In [None]:
# -- RBAC + REAL VECTOR SEARCH --
# 1) Embed the query
# 2) Retrieve top-K by cosine similarity over saved embeddings
# 3) Apply RBAC filter (tenant/role/restricted tags)

# Numeric tier per role name. In the cells shown here the tiers themselves are
# not consulted by the RBAC checks; the mapping supplies the role names below.
ROLE_LEVEL = dict(
    public=1,
    analyst=2,
    risk_analyst=2,
    sre=3,
    security=3,
    admin=3,
)

# Role names in declaration order (used as the role dropdown options).
ALL_ROLES = [*ROLE_LEVEL]


def rbac_allow(row, user_role: str, user_tenant: str, clearance_tags: set[str]):
    """Decide whether one document is visible to the given identity.

    Args:
        row: Mapping-like record exposing ``.get`` (a dict or a DataFrame row)
            with optional ``allowed_roles``, ``allowed_tenants``,
            ``restricted_tags`` and ``tenant_id`` entries.
        user_role: Role of the caller.
        user_tenant: Tenant of the caller.
        clearance_tags: Restricted tags the caller is cleared for.

    Returns:
        ``(ok, reason)``. ``reason`` is ``"ok"`` when visible, otherwise the
        failed checks joined by ``"+"`` in the fixed order
        ``tenant``, ``role``, ``restricted_tags``.
    """
    allowed_roles = row.get("allowed_roles", []) or []
    allowed_tenants = row.get("allowed_tenants", []) or []
    restricted_tags = row.get("restricted_tags", []) or []

    # admin/security skip the tenant and restricted-tag checks, not the role check.
    privileged = user_role in {"admin", "security"}

    tenant_ok = privileged or (
        "*" in allowed_tenants
        or (user_tenant in allowed_tenants)
        or (str(row.get("tenant_id", "global")) in ["global", user_tenant])
    )
    role_ok = ("public" in allowed_roles) or (user_role in allowed_roles)
    tags_ok = (not restricted_tags) or privileged or set(restricted_tags).issubset(clearance_tags)

    checks = (("tenant", tenant_ok), ("role", role_ok), ("restricted_tags", tags_ok))
    failures = [label for label, passed in checks if not passed]

    verdict = "+".join(failures) if failures else "ok"
    return (not failures), verdict


def vector_search(query: str, top_k: int = 30):
    """Return the ``top_k`` corpus rows most similar to ``query``.

    Args:
        query: Free-text query; ``None`` and whitespace-only strings count as empty.
        top_k: Maximum number of hits. Values larger than the corpus are capped;
            zero or negative values yield no hits.

    Returns:
        ``(results, q_emb)``. ``results`` is a list of ``(row_index, similarity)``
        pairs sorted by descending similarity against ``E_norm``; ``q_emb`` is the
        query embedding. An empty query returns ``([], None)``.
    """
    q = (query or "").strip()
    if not q:
        return [], None

    q_emb = embedder.encode([q], normalize_embeddings=True, convert_to_numpy=True, show_progress_bar=False)[0].astype(np.float32)
    q_norm = q_emb / (np.linalg.norm(q_emb) + 1e-12)

    sims = E_norm @ q_norm

    k = min(int(top_k), len(sims))
    if k <= 0:
        # Without this guard a negative top_k turns into negative slicing (a
        # wrongly sized result) and an empty corpus makes argpartition raise.
        return [], q_emb

    # Partial selection of the k best, then an exact sort of just those k.
    idx = np.argpartition(-sims, k - 1)[:k]
    idx = idx[np.argsort(-sims[idx])]

    results = [(int(i), float(sims[int(i)])) for i in idx]
    return results, q_emb


def rbac_vector_search(query: str, user_role: str, user_tenant: str, clearance_tags: set[str], top_k: int = 30):
    """Run a semantic search and print the hits that survive the RBAC filter.

    Args:
        query: Free-text query; empty or whitespace-only input prints a hint and returns.
        user_role: Role used for the ``rbac_allow`` decision.
        user_tenant: Tenant used for the ``rbac_allow`` decision.
        clearance_tags: Restricted tags the caller is cleared for.
        top_k: Number of nearest documents retrieved before filtering.

    Prints up to 20 visible hits, then a per-reason count of blocked hits and up
    to 10 blocked examples. Returns ``None``.
    """
    q = (query or "").strip()
    if not q:
        print("‚ùå Enter a query.")
        return

    print(f"üîé Semantic search: '{q}' as role='{user_role}' tenant='{user_tenant}' clearance={sorted(list(clearance_tags))}")

    hits, _ = vector_search(q, top_k=top_k)
    if not hits:
        print("‚ùå No results.")
        return

    visible = []
    blocked = []
    blocked_reasons = {}

    # Single pass: keep blocked rows with their reason so the examples section
    # below does not have to evaluate rbac_allow a second time.
    for i, score in hits:
        row = df.iloc[i]
        ok, reason = rbac_allow(row, user_role, user_tenant, clearance_tags)
        if ok:
            visible.append((row, score))
        else:
            blocked.append((row, score, reason))
            blocked_reasons[reason] = blocked_reasons.get(reason, 0) + 1

    print("----- Results -----")
    if visible:
        for row, score in visible[:20]:
            title = row.get("title", "(no title)")
            doc_type = row.get("doc_type", "kb")
            tenant_id = row.get("tenant_id", "global")
            tags = row.get("tags", []) or []
            try:
                redactions = int(row.get("redaction_count", 0) or 0)
            except (TypeError, ValueError, OverflowError):
                # A NaN cell is truthy, so `or 0` does not replace it and
                # int(nan) raises; treat unusable counts as "no redactions".
                redactions = 0
            redact_note = " [has redactable lines]" if redactions > 0 else ""
            print(f"‚úÖ {score:0.3f} | {title} | type={doc_type} tenant={tenant_id} tags={tags}{redact_note}")
    else:
        print("üö´ No visible documents in the top-K results.")

    if blocked:
        print(f"\nüîí BLOCKED {len(blocked)} of top-{len(hits)}")
        for reason, count in blocked_reasons.items():
            print(f"  - {reason}: {count}")

        print("\nBlocked examples (first 10):")
        for row, score, reason in blocked[:10]:
            title = row.get("title", "(no title)")
            doc_type = row.get("doc_type", "kb")
            tenant_id = row.get("tenant_id", "global")
            allowed_roles = row.get("allowed_roles", []) or []
            restricted_tags = row.get("restricted_tags", []) or []
            print(f"üö´ {score:0.3f} | {reason} | {title} | type={doc_type} tenant={tenant_id} roles={allowed_roles} restricted={restricted_tags}")


# Widgets (optional)
# Identity controls: these three values are passed to rbac_vector_search by on_rbac_click.
role_dropdown = widgets.Dropdown(options=ALL_ROLES, value="analyst", description="Role:")
tenant_dropdown = widgets.Dropdown(options=["acme", "globex", "initech"], value="acme", description="Tenant:")
clearance_select = widgets.SelectMultiple(
    options=["pii", "secrets", "customer-data", "prod", "credentials"],
    value=(),
    description="Clearance:",
)
# Search controls: how many neighbours to retrieve before filtering, and the query text.
topk_slider = widgets.IntSlider(value=30, min=5, max=100, step=5, description="Top-K:")
search_box = widgets.Text(placeholder='Try: "GPU latency impacts APR", "Kubernetes incident runbook"', description='Search:')
btn_rbac = widgets.Button(description="Run Secure Semantic Search")


def on_rbac_click(b):
    """Button callback: re-show the controls and run the filtered search.

    ``b`` is the argument handed over by the button's click dispatch; it is unused.
    The current widget values are read at click time.
    """
    # Clear old results first, then restore the controls above the new output.
    clear_output(wait=True)
    display(ui_rbac)
    rbac_vector_search(
        search_box.value,
        role_dropdown.value,
        tenant_dropdown.value,
        # The multi-select yields a tuple; the search expects a set of tags.
        set(clearance_select.value),
        top_k=int(topk_slider.value),
    )


# Register the callback and group the controls; on_rbac_click re-displays `ui_rbac`.
btn_rbac.on_click(on_rbac_click)
ui_rbac = widgets.VBox([role_dropdown, tenant_dropdown, clearance_select, topk_slider, search_box, btn_rbac])

display(ui_rbac)

VBox(children=(Dropdown(description='Role:', index=5, options=('public', 'analyst', 'risk_analyst', 'sre', 'se‚Ä¶

In [16]:
# --- RBAC Impact Metrics + Result Inspector ---
# Works with the real semantic search state loaded earlier (df, E_norm, embedder, etc.).

# Snapshot of the most recent run_and_store() call. It is created only once:
# rebinding it to {} on every execution would wipe the stored results each time
# this cell is re-run, leaving the refresh_inspect_dropdown() call at the bottom
# of the cell with nothing to list.
if "LAST_RUN" not in globals():
    LAST_RUN = {}


def rbac_breakdown(row, user_role: str, user_tenant: str, clearance_tags: set[str]):
    """Report each individual RBAC check for one document and identity.

    Args:
        row: Mapping-like record exposing ``.get`` with optional
            ``allowed_roles``, ``allowed_tenants``, ``restricted_tags`` and
            ``tenant_id`` entries.
        user_role: Role of the caller.
        user_tenant: Tenant of the caller.
        clearance_tags: Restricted tags the caller is cleared for.

    Returns:
        A dict holding the three boolean outcomes (``tenant_ok``, ``role_ok``,
        ``tags_ok``), the document's access lists, and the identity that was
        evaluated (``user_clearance`` is a sorted list).
    """
    acl_roles = row.get("allowed_roles", []) or []
    acl_tenants = row.get("allowed_tenants", []) or []
    acl_restricted = row.get("restricted_tags", []) or []

    # admin/security skip the tenant and restricted-tag checks, not the role check.
    is_privileged = user_role in {"admin", "security"}

    tenant_ok = is_privileged or (
        "*" in acl_tenants
        or (user_tenant in acl_tenants)
        or (str(row.get("tenant_id", "global")) in ["global", user_tenant])
    )
    role_ok = ("public" in acl_roles) or (user_role in acl_roles)
    tags_ok = (not acl_restricted) or is_privileged or set(acl_restricted).issubset(clearance_tags)

    report = {}
    report["tenant_ok"] = bool(tenant_ok)
    report["role_ok"] = bool(role_ok)
    report["tags_ok"] = bool(tags_ok)
    report["allowed_roles"] = acl_roles
    report["allowed_tenants"] = acl_tenants
    report["restricted_tags"] = acl_restricted
    report["user_role"] = user_role
    report["user_tenant"] = user_tenant
    report["user_clearance"] = sorted(clearance_tags)
    return report


def apply_rbac(hits, user_role: str, user_tenant: str, clearance_tags: set[str]):
    """Split search hits into visible and blocked sets for one identity.

    Args:
        hits: Iterable of ``(row_index, score)`` pairs indexing into ``df``.
        user_role: Role passed to ``rbac_allow``.
        user_tenant: Tenant passed to ``rbac_allow``.
        clearance_tags: Clearance tags passed to ``rbac_allow``.

    Returns:
        ``(visible, blocked, reason_counts)`` where ``visible`` holds
        ``(index, score)`` pairs, ``blocked`` holds ``(index, score, reason)``
        triples, and ``reason_counts`` maps each reason string to its frequency.
        Hit order is preserved in both lists.
    """
    visible, blocked, reason_counts = [], [], {}

    for raw_index, raw_score in hits:
        position = int(raw_index)
        ok, reason = rbac_allow(df.iloc[position], user_role, user_tenant, clearance_tags)
        if ok:
            visible.append((position, float(raw_score)))
            continue
        label = str(reason)
        blocked.append((position, float(raw_score), label))
        reason_counts[label] = reason_counts.get(label, 0) + 1

    return visible, blocked, reason_counts


def run_and_store(query: str, user_role: str, user_tenant: str, clearance_tags: set[str], top_k: int = 30):
    """Search, apply RBAC, and record the outcome in ``LAST_RUN``.

    Args:
        query: Query text forwarded to ``vector_search``.
        user_role: Role used for filtering.
        user_tenant: Tenant used for filtering.
        clearance_tags: Clearance tags used for filtering (stored as a sorted list).
        top_k: Number of neighbours requested from ``vector_search``.

    Returns:
        The module-level ``LAST_RUN`` dict, emptied and refilled in place so that
        existing references to it see the new run.
    """
    hits, _unused_embedding = vector_search(query, top_k=top_k)
    visible, blocked, reasons = apply_rbac(hits, user_role, user_tenant, clearance_tags)

    LAST_RUN.clear()
    LAST_RUN.update(
        query=query,
        top_k=int(top_k),
        user_role=user_role,
        user_tenant=user_tenant,
        clearance=sorted(clearance_tags),
        hits=hits,
        visible=visible,
        blocked=blocked,
        blocked_reasons=reasons,
    )
    return LAST_RUN


def rbac_impact_metrics(query: str, tenant: str, clearance_tags: set[str], top_k: int = 30):
    """Print how the RBAC filter changes the top-K for ``analyst`` vs ``security``.

    Args:
        query: Free-text query; empty or whitespace-only input prints a hint and returns.
        tenant: Tenant used for both roles.
        clearance_tags: Clearance tags used for both roles.
        top_k: Number of neighbours retrieved before filtering.

    For each role the function prints the hit count before and after filtering
    and the blocked count per reason, followed by the overlap of the two visible
    sets. Returns ``None``.
    """
    q = (query or "").strip()
    if not q:
        print("‚ùå Enter a query.")
        return

    roles = ["analyst", "security"]
    reports = {}

    print(f"\n=== RBAC impact metrics for query: {q!r} (top_k={int(top_k)}) ===")

    # The retrieval does not depend on the role, so embed and search once and
    # reuse the same hits for every role instead of repeating it in the loop.
    hits, _ = vector_search(q, top_k=top_k)

    for role in roles:
        visible, blocked, reasons = apply_rbac(hits, role, tenant, clearance_tags)

        reports[role] = {
            "hits": hits,
            "visible": visible,
            "blocked": blocked,
            "reasons": reasons,
        }

        print(f"\nRole={role} tenant={tenant} clearance={sorted(list(clearance_tags))}")
        print(f"- topK before RBAC: {len(hits)}")
        print(f"- topK after RBAC:  {len(visible)}")
        print(f"- blocked:          {len(blocked)}")
        if reasons:
            print("- blocked by reason:")
            # Most frequent reason first; ties broken alphabetically.
            for k, v in sorted(reasons.items(), key=lambda kv: (-kv[1], kv[0])):
                print(f"  - {k}: {v}")
        else:
            print("- blocked by reason: (none)")

    # security vs analyst comparison
    sec_ids = {i for i, _ in reports["security"]["visible"]}
    ana_ids = {i for i, _ in reports["analyst"]["visible"]}
    print("\n=== security vs analyst comparison (visible set overlap) ===")
    print(f"- security visible: {len(sec_ids)}")
    print(f"- analyst visible:  {len(ana_ids)}")
    print(f"- overlap:          {len(sec_ids & ana_ids)}")
    print(f"- security-only:    {len(sec_ids - ana_ids)}")


def inspect_idx(i: int, user_role: str, user_tenant: str, clearance_tags: set[str], score: float | None = None):
    row = df.iloc[int(i)]

    ok, reason = rbac_allow(row, user_role, user_tenant, clearance_tags)
    breakdown = rbac_breakdown(row, user_role, user_tenant, clearance_tags)

    print("\n=== Result inspector ===")
    if score is not None:
        print(f"score: {float(score):0.4f}")
    print(f"doc_id: {row.get('doc_id')}")
    print(f"pair_id: {row.get('pair_id')}")
    print(f"cluster: {row.get('cluster')}")
    print(f"tenant_id: {row.get('tenant_id')}")
    print(f"doc_type: {row.get('doc_type')}")
    print(f"owner_team: {row.get('owner_team')}")
    print(f"tags: {row.get('tags')}")
    print(f"restricted_tags: {row.get('restricted_tags')}")
    print(f"allowed_roles: {row.get('allowed_roles')}")
    print(f"allowed_tenants: {row.get('allowed_tenants')}")
    print(f"redaction_count: {row.get('redaction_count')}")

    print("\n--- RBAC decision ---")
    print(f"visible: {ok} (reason={reason})")
    print(f"tenant_ok={breakdown['tenant_ok']} role_ok={breakdown['role_ok']} tags_ok={breakdown['tags_ok']}")

    print("\n--- TITLE ---")
    print(row.get("title", ""))

    print("\n--- BODY (full) ---")
    print(row.get("body", ""))

    print("\n--- BODY (redacted) ---")
    print(row.get("body_redacted", ""))


def inspect_last(rank: int = 0, which: str = "visible"):
    """Inspect one entry of the run stored in ``LAST_RUN``.

    Args:
        rank: Position within the chosen list. Negative values index from the
            end, as with ordinary Python sequences.
        which: ``"visible"`` selects the RBAC-filtered hits; any other value
            selects the raw hits.

    Prints a hint and returns when nothing has been stored yet or when ``rank``
    is outside the chosen list; otherwise delegates to ``inspect_idx`` with the
    identity recorded in ``LAST_RUN``. Returns ``None``.
    """
    if not LAST_RUN:
        # Only run_and_store writes LAST_RUN, so that is the call to point at.
        print("‚ùå No LAST_RUN yet. Run a search first with run_and_store(...).")
        return

    source = "visible" if which == "visible" else "hits"
    items = LAST_RUN.get(source, [])

    # Accept the full valid index range in both directions; anything else would
    # raise IndexError on the lookup below.
    if not -len(items) <= rank < len(items):
        print(f"‚ùå rank {rank} out of range for {source} (len={len(items)})")
        return
    i, score = items[rank]

    inspect_idx(
        i,
        user_role=LAST_RUN.get("user_role", "analyst"),
        user_tenant=LAST_RUN.get("user_tenant", "acme"),
        clearance_tags=set(LAST_RUN.get("clearance", [])),
        score=score,
    )


# Optional widget UI for metrics + inspector
metrics_btn = widgets.Button(description="RBAC impact metrics (security vs analyst)")
inspect_btn = widgets.Button(description="Inspect selected")

# Inputs read by on_metrics_click when the metrics button is pressed.
metrics_query = widgets.Text(value="Kubernetes incident runbook", description="Query:")
metrics_tenant = widgets.Dropdown(options=["acme", "globex", "initech"], value="acme", description="Tenant:")
metrics_clearance = widgets.SelectMultiple(
    options=["pii", "secrets", "customer-data", "prod", "credentials"],
    value=(),
    description="Clearance:",
)
metrics_topk = widgets.IntSlider(value=30, min=5, max=100, step=5, description="Top-K:")

# Options are filled by refresh_inspect_dropdown() from LAST_RUN, which is written
# by run_and_store(). The -1 value marks the placeholder entry (on_inspect_click
# treats negative values as "nothing selectable yet").
inspect_dropdown = widgets.Dropdown(options=[("(run a search first)", -1)], description="Pick:")


def refresh_inspect_dropdown():
    """Rebuild the inspector dropdown options from ``LAST_RUN``.

    Visible hits are listed first (labels prefixed ``v<rank>``); if there are
    none, the raw hits are listed instead (prefixed ``h<rank>``). At most 30
    entries are shown and each option's value is the row index into ``df``.
    When nothing can be listed, a single placeholder option with value ``-1`` is
    installed so the dropdown never ends up without a selectable value.
    """
    if not LAST_RUN:
        inspect_dropdown.options = [("(run a search first)", -1)]
        return

    def _options_from(key, prefix):
        # One "<prefix><rank> <score> | <title>" entry per stored (index, score) pair.
        built = []
        for rank, (i, score) in enumerate(LAST_RUN.get(key, [])[:30]):
            row = df.iloc[int(i)]
            title = str(row.get("title", "(no title)"))
            built.append((f"{prefix}{rank} {score:0.3f} | {title[:80]}", int(i)))
        return built

    # show visible results first, fall back to raw hits if nothing is visible
    opts = _options_from("visible", "v") or _options_from("hits", "h")

    # An empty option list would leave the dropdown value as None, which the
    # inspect handler cannot convert to an index; keep a negative placeholder.
    inspect_dropdown.options = opts or [("(no results in last run)", -1)]


def on_metrics_click(b):
    """Button callback: re-show the controls and print the RBAC impact metrics.

    ``b`` is the argument handed over by the button's click dispatch; it is unused.
    This handler does not write LAST_RUN or refresh the inspector dropdown.
    """
    # Clear old output first, then restore the controls above the new report.
    clear_output(wait=True)
    display(ui_metrics)
    rbac_impact_metrics(metrics_query.value, metrics_tenant.value, set(metrics_clearance.value), top_k=int(metrics_topk.value))


def on_inspect_click(b):
    """Button callback: inspect the row currently selected in the dropdown.

    ``b`` is the argument handed over by the button's click dispatch; it is unused.
    A missing selection (``None``) or the negative placeholder value prints a
    hint instead of inspecting anything.
    """
    clear_output(wait=True)
    display(ui_metrics)

    selected = inspect_dropdown.value
    # The value is None when the dropdown has no options at all; int(None)
    # would raise, so treat it like the placeholder entry.
    if selected is None or int(selected) < 0:
        print("‚ùå Run a search first to populate selectable results.")
        return
    i = int(selected)

    # Replay the identity recorded in LAST_RUN; fall back to analyst, the tenant
    # widget, and no clearance when a field was not stored.
    role = LAST_RUN.get("user_role", "analyst")
    tenant = LAST_RUN.get("user_tenant", metrics_tenant.value)
    clearance = set(LAST_RUN.get("clearance", []))
    inspect_idx(i, role, tenant, clearance)


# Register both callbacks before the controls are shown.
metrics_btn.on_click(on_metrics_click)
inspect_btn.on_click(on_inspect_click)

# Layout, top to bottom: heading, query, tenant + top-K, clearance, buttons, result picker.
ui_metrics = widgets.VBox([
    widgets.HTML("<b>RBAC impact metrics + result inspector</b>"),
    metrics_query,
    widgets.HBox([metrics_tenant, metrics_topk]),
    metrics_clearance,
    widgets.HBox([metrics_btn, inspect_btn]),
    inspect_dropdown,
])

# Fill the picker from whatever LAST_RUN currently holds; call
# refresh_inspect_dropdown() again after run_and_store(...) to update the options.
refresh_inspect_dropdown()

display(ui_metrics)

# Direct-call examples (works even if widgets are broken):
# rbac_impact_metrics("GPU latency impacts APR", tenant="acme", clearance_tags=set(), top_k=30)
# run_and_store("Kubernetes incident runbook", "security", "acme", set(), top_k=30); inspect_last(0)


VBox(children=(HTML(value='<b>RBAC impact metrics + result inspector</b>'), Text(value='Kubernetes incident ru‚Ä¶

In [17]:
# ---- NO-WIDGET FALLBACK ----
# If ipywidgets don't render/click in your environment, run semantic search directly:

print("\n=== Non-widget demo (direct calls) ===")

# Security bypasses the tenant and restricted-tag checks in rbac_allow, but is
# still subject to the allowed_roles check.
rbac_vector_search("Kubernetes incident runbook", "security", "acme", set(), top_k=30)

print("\n---")
# Analyst with no clearance: subject to the tenant, role, and restricted-tag checks.
rbac_vector_search("GPU latency impacts APR", "analyst", "acme", set(), top_k=30)

print("\n---")
# Analyst with secrets clearance.
rbac_vector_search("SECRET", "analyst", "acme", {"secrets"}, top_k=30)




=== Non-widget demo (direct calls) ===
üîé Semantic search: 'Kubernetes incident runbook' as role='security' tenant='acme' clearance=[]
----- Results -----
‚úÖ 0.682 | ["...", "..."] } assistant { "title": "Kubernetes Incident Response for Crash Loops", "body": "In the event of a Kubernetes cluster crash lo | type=runbook tenant=globex tags=['docker', 'driver', 'kubernetes', 'latency', 'observability']
‚úÖ 0.565 | ... | type=runbook tenant=initech tags=['...', 'docker', 'driver', 'firewall', 'kubernetes', 'latency']
‚úÖ 0.543 | ... | type=runbook tenant=initech tags=['...', 'docker', 'firewall', 'gpu', 'kubernetes', 'observability']
‚úÖ 0.530 | ... | type=runbook tenant=acme tags=['...', 'docker', 'driver', 'gpu', 'kubernetes', 'observability']
‚úÖ 0.501 | "title": "...", "body": "...", "tags": ["...", "..."] } assistant { "title": "Driver/CUDA Compatibility and Container Deployment Issues", "b | type=runbook tenant=acme tags=['docker', 'driver', 'firewall', 'gpu', 'kubernetes']
‚úÖ 