#!/usr/bin/env python3
# (Exported from notebook cell In [3].)
# 3 Feb 2026
#
# Interactive, resumable tool-description rewrite workflow with:
# - K-candidate generation per tool instance (configurable)
# - Deterministic statistical/lexical risk indicators printed alongside base and candidates
# - Human-in-the-loop decision (accept candidate, edit, manual, skip), with append-only audit log
#
# Additions (Jan 21 test):
# - Candidate text snippet shown in the overview, so selection can occur without extra commands.
# - Explicit preview command documented: p<idx> (e.g., p2) prints the full candidate + stats.
#
# Additions (Concise soft length target, reviewer-proof):
# - Optional soft length target for style_concise: default 30% shorter, applied only if base_len >= threshold.
# - The target is guidance only (exceptions allowed to preserve meaning); out-of-target is flagged and logged.
# - Length metrics (len_ratio, len_delta) are computed for every candidate and stored in audit for reporting.
#
# Additions (Semantic drift hardening, v2):
# - Logic/negation/quantifier/modal/scope tokens extracted and diffed (beyond purely lexical patterns).
# - Risk scoring adjusted: verbs treated as secondary; structural+logic tokens treated as primary.
# - Optional semantic signals:
#   - Embedding cosine similarity (best-effort; depends on provider support for embeddings endpoint).
#   - LLM entailment verifier returning ENTAILS / NOT_ENTAILS (disabled by default).
#
# Additions (Feb 3 patch):
# - "Perturbations" visibility: print candidate-generation perturbation context to stdout during generation.
# - ESC behavior: raw-key command input on TTY so Esc behaves like quit (q) and never "accepts" accidentally.

import json
import shutil
import os
import time
import hashlib
import difflib
import re
import math
import sys
from pathlib import Path
from typing import Dict, Any, Optional, Tuple, List

from openai import OpenAI


# ========= Config =========
# Module-level defaults. The DEFAULT_* names are plain constants here; how (and whether)
# they are overridden at runtime is decided by code outside this section.

# OpenAI-compatible base URL passed to the client in make_gemini_client().
GEMINI_BASE_URL = "https://generativelanguage.googleapis.com/v1beta/openai/"
# Default chat model name.
LLM_MODEL_DEFAULT = "gemini-2.5-flash"

# NOTE(review): presumably the number of hex characters kept from a digest; the usage
# site is not visible in this part of the file — confirm.
HASH_HEX_LEN = 32

# Completion token budget, and the larger budget for a retry.
# NOTE(review): names suggest the retry happens when a completion is cut off by length;
# the generation code is not visible here — confirm.
DEFAULT_MAX_TOKENS = 512
RETRY_ON_LENGTH = True
RETRY_MAX_TOKENS = 1024

# NOTE(review): meaning of the "reserialize fallback" is not visible here — confirm.
DEFAULT_ALLOW_RESERIALIZE_FALLBACK = False

# How much of the last generated candidate to store in audit and to feed back into prompt.
# Each STYLE_SPECS entry also carries its own "max_prev_rewrite_chars" value.
DEFAULT_MAX_PREV_REWRITE_CHARS = 800

# Candidate count shown per tool instance.
DEFAULT_NUM_CANDIDATES = 2

# Printing controls for token previews in statistics
# (see the max_items / max_len parameters of _format_token_preview).
DEFAULT_MAX_TOKEN_PREVIEW = 8
DEFAULT_MAX_TOKEN_STRING_LEN = 48

# Candidate text snippet in overview (chars).
DEFAULT_CANDIDATE_SNIPPET_CHARS = 160

# Soft concise length target knobs (reviewer-proof defaults).
# These correspond to the concise_ratio / concise_min_base_len / concise_min_chars
# parameters of _make_length_policy: target = max(int(base_len * ratio), min_chars),
# applied only when base_len >= min_base_len and the target is shorter than the base.
DEFAULT_CONCISE_TARGET_RATIO = 0.70
DEFAULT_CONCISE_TARGET_MIN_BASE_LEN = 160
DEFAULT_CONCISE_TARGET_MIN_CHARS = 80

# Semantic signals (disabled by default).
DEFAULT_ENABLE_EMBEDDINGS = False
DEFAULT_EMBEDDING_MODEL = ""  # Provider-dependent; empty means "unset".
# A cosine below this value adds an "embedding_low_cosine" soft flag
# (see augment_stats_with_semantic_signals).
DEFAULT_EMBEDDING_LOW_COSINE_THRESHOLD = 0.85

DEFAULT_ENABLE_VERIFIER = False
DEFAULT_VERIFIER_MODEL = ""  # Empty means "use llm_model".
DEFAULT_VERIFIER_MAX_TOKENS = 16

# Visibility knobs (Feb 3 patch).
DEFAULT_SHOW_PERTURBATIONS = True   # Prints perturbation context during candidate generation.
DEFAULT_RAW_KEY_INPUT = True        # Enables raw-key command input on TTY so Esc can be captured.

# Policy versioning for audit identity and reporting.
# These names are embedded in the stats dictionaries produced further down.
RISK_POLICY_NAME = "risk_policy_v2_structural_logic_primary"
LOGIC_TOKEN_POLICY_NAME = "logic_tokens_v1"
SEMANTIC_POLICY_NAME = "semantic_signals_v1"


# ========= Styles =========
# Per-style prompt material, keyed by mode key. Each spec holds:
#   "system"                      - the system prompt carrying the hard rewrite constraints
#   "regen_diversity_instruction" - extra instruction text asking for a different paraphrase
#   "max_prev_rewrite_chars"      - per-style character cap for the previous rewrite
# NOTE(review): the consumers of these fields are outside this section; the field
# descriptions above are read off the names and prompt text — confirm against the caller.
STYLE_SPECS: Dict[str, Dict[str, Any]] = {
    "style_verbose": {
        "system": (
            "Task: rewrite tool descriptions.\n"
            "Hard constraints:\n"
            "- Meaning must be preserved exactly; no new capabilities, steps, motivations, benefits, or context.\n"
            "- No information present in the original description may be deleted.\n"
            "- No new parameter names, IDs, field names, flags, or implementation details may be introduced.\n"
            "- If parameter/field names/IDs/flags already appear in the original description, they must be kept.\n"
            "- No examples, normative language, or assumptions.\n"
            "- The subject (the tool) and scope must remain unchanged.\n"
            "- Output must be only the rewritten description text, nothing else.\n"
            "- Style: verbose but controlled; concise and complete (1–2 sentences), clear and direct.\n"
        ),
        "regen_diversity_instruction": (
            "A meaning-equivalent rewrite is required with lexical and syntactic variation from the previous rewrite; "
            "the same sentence structure should be avoided."
        ),
        "max_prev_rewrite_chars": 800,
    },
    "style_concise": {
        "system": (
            "Task: rewrite tool descriptions.\n"
            "Hard constraints:\n"
            "- Meaning must be preserved exactly; no new capabilities, steps, motivations, benefits, or context.\n"
            "- No information present in the original description may be deleted.\n"
            "- No new parameter names, IDs, field names, flags, or implementation details may be introduced.\n"
            "- If parameter/field names/IDs/flags already appear in the original description, they must be kept.\n"
            "- No examples, normative language, or assumptions.\n"
            "- The subject (the tool) and scope must remain unchanged.\n"
            "- Output must be only the rewritten description text, nothing else.\n"
            "- Style: concise and controlled; 1 sentence preferred, 2 max.\n"
            "- Length constraint: shorter than the base description is preferred; if the base description is already short, the rewrite must not exceed its length.\n"
            "- Compression rule: remove redundancy, filler, and hedging while preserving all explicitly stated constraints/details.\n"
        ),
        "regen_diversity_instruction": (
            "A different paraphrase is required than the previous rewrite. "
            "The same sentence skeleton or distinctive phrases must not be reused. "
            "Meaning must remain exactly the same; only wording and structure may vary."
        ),
        "max_prev_rewrite_chars": 600,
    },
    # Aliases that tolerate misspellings of "style_concise". The empty placeholders only
    # reserve the keys; they are rebound to the real spec right after the literal.
    "style_coicnoso": {},   # filled after dict creation
    "style_coinceise": {},  # filled after dict creation
}
# Both aliases point at the very same dict object as "style_concise" (not a copy).
# NOTE(review): _resolve_style returns the alias key unchanged, while _make_length_policy
# and compute_candidate_stats compare mode_key against the literal "style_concise"; a run
# started with an alias key therefore gets the concise prompt but not the concise length
# policy / soft flags unless the caller canonicalises the key — confirm.
STYLE_SPECS["style_coicnoso"] = STYLE_SPECS["style_concise"]
STYLE_SPECS["style_coinceise"] = STYLE_SPECS["style_concise"]


def _resolve_style(mode_key: str) -> Tuple[str, Dict[str, Any]]:
    """Map a mode key onto its entry in STYLE_SPECS.

    A missing or blank key falls back to "style_verbose". The key is returned
    exactly as looked up (after stripping), together with its spec.

    Raises:
        ValueError: when the key is not present in STYLE_SPECS.
    """
    key = (mode_key or "").strip() or "style_verbose"
    if key in STYLE_SPECS:
        return key, STYLE_SPECS[key]
    raise ValueError(f"Unknown MODE_KEY='{key}'. Supported: {', '.join(sorted(STYLE_SPECS.keys()))}")


# ========= Client =========
def make_gemini_client() -> OpenAI:
    """Build an OpenAI client pointed at GEMINI_BASE_URL.

    The API key is read from the TOKEN_GEMINI environment variable.

    Raises:
        RuntimeError: when TOKEN_GEMINI is unset or empty.
    """
    api_key = os.environ.get("TOKEN_GEMINI")
    if api_key:
        return OpenAI(api_key=api_key, base_url=GEMINI_BASE_URL)
    raise RuntimeError("TOKEN_GEMINI environment variable is not set.")


# ========= Small utils =========
def _json_safe(obj: Any) -> Any:
    """Recursively convert ``obj`` into JSON-friendly builtins.

    - None / str / int / float / bool pass through unchanged.
    - dicts get string keys and converted values; lists and tuples become lists.
    - Other objects are tried, in order, via ``model_dump()``, ``dict()`` and
      ``vars()``; a failing attempt is skipped and the next one is tried.
    - As a last resort ``str(obj)`` is returned, or None if even that fails.

    There is no cycle detection; self-referencing structures recurse without bound.
    """
    if obj is None or isinstance(obj, (str, int, float, bool)):
        return obj
    if isinstance(obj, dict):
        return {str(key): _json_safe(value) for key, value in obj.items()}
    if isinstance(obj, (list, tuple)):
        return [_json_safe(item) for item in obj]

    # Objects exposing a dump method: first callable that succeeds wins.
    for method_name in ("model_dump", "dict"):
        dumper = getattr(obj, method_name, None)
        if callable(dumper):
            try:
                return _json_safe(dumper())
            except Exception:
                continue

    if hasattr(obj, "__dict__"):
        try:
            return _json_safe(vars(obj))
        except Exception:
            pass

    try:
        return str(obj)
    except Exception:
        return None


def _sha256_text(s: str) -> str:
    """Return the SHA-256 hex digest of ``s`` encoded as UTF-8 (None/empty hash as "")."""
    digest = hashlib.sha256()
    digest.update((s or "").encode("utf-8"))
    return digest.hexdigest()


def _canonical_json(obj: Any) -> str:
    """Serialize ``obj`` to compact JSON with sorted keys and non-ASCII kept as-is."""
    compact_separators = (",", ":")
    return json.dumps(
        obj,
        sort_keys=True,
        ensure_ascii=False,
        separators=compact_separators,
    )


def _sha256_file(path: Path) -> str:
    """Return the SHA-256 hex digest of the file at ``path``, read in 1 MiB chunks."""
    chunk_size = 1024 * 1024
    digest = hashlib.sha256()
    with path.open("rb") as handle:
        while True:
            piece = handle.read(chunk_size)
            if not piece:
                break
            digest.update(piece)
    return digest.hexdigest()


def _safe_int_env(name: str, default: int) -> int:
    """Read environment variable ``name`` as an int.

    Returns ``int(default)`` when the variable is unset, blank, or not parseable.
    """
    raw = (os.environ.get(name) or "").strip()
    if raw:
        try:
            return int(raw)
        except Exception:
            pass
    return int(default)


def _safe_float_env(name: str, default: float) -> float:
    """Read environment variable ``name`` as a finite float.

    Returns ``float(default)`` when the variable is unset, blank, not parseable,
    or parses to a non-finite value ("nan", "inf", "-inf"). Non-finite values are
    rejected because NaN makes every threshold comparison false and breaks
    ``int(...)`` conversions of values derived from it.
    """
    raw = os.environ.get(name)
    if raw is None or not raw.strip():
        return float(default)
    try:
        value = float(raw.strip())
    except ValueError:
        return float(default)
    if not math.isfinite(value):
        return float(default)
    return value


def _safe_bool_env(name: str, default: bool) -> bool:
    """Read environment variable ``name`` as a boolean.

    Accepts (case-insensitively) 1/true/t/yes/y/on and 0/false/f/no/n/off.
    Anything else, including unset or blank, yields ``bool(default)``.
    """
    raw = os.environ.get(name)
    token = raw.strip().lower() if raw is not None else ""
    if token in ("1", "true", "t", "yes", "y", "on"):
        return True
    if token in ("0", "false", "f", "no", "n", "off"):
        return False
    return bool(default)


# ========= Raw-key command input (Esc-safe) =========
def _read_command_raw_tty(prompt: str, *, k: int) -> str:
    """
    Reads a short command using raw keypress input on TTY.
    - Enter returns the accumulated buffer (possibly empty).
    - Esc returns the ESC character (0x1b) immediately; it is meant to be mapped to quit by the caller.
    - Ctrl-D, or end of input on stdin, is treated exactly like Esc.
    - Ctrl-C raises KeyboardInterrupt, matching what line-based input() does.
    - Backspace edits the buffer.
    - Single-letter commands (r, e, m, s, q, y) are returned as soon as they are typed.
    - For k<=9, numeric selection (and p<digit> preview) is returned immediately when it becomes unambiguous.

    If the termios/tty modules cannot be imported, falls back to input(prompt).
    The terminal attributes are always restored before returning or raising.
    """
    # Defer imports so non-TTY / non-Unix runs remain usable.
    try:
        import termios
        import tty
    except Exception:
        # Fall back to line input if raw mode is unavailable.
        return input(prompt)

    fd = sys.stdin.fileno()
    old = termios.tcgetattr(fd)
    buf = ""

    max_choice = int(k)
    immediate_digits = max_choice <= 9

    def echo(s: str) -> None:
        sys.stdout.write(s)
        sys.stdout.flush()

    def end_line() -> None:
        # Raw mode turns off output post-processing, so a bare "\n" would move the
        # cursor down without returning to column 0. Emit CR+LF explicitly.
        echo("\r\n")

    echo(prompt)

    try:
        tty.setraw(fd)
        while True:
            ch = sys.stdin.read(1)

            # End of input or Ctrl-D: treat like Esc. An empty read must be handled
            # explicitly because "".isprintable() is True and would loop forever.
            if ch == "" or ch == "\x04":
                end_line()
                return "\x1b"

            # Ctrl-C: raw mode disables signal generation, so raise it ourselves.
            if ch == "\x03":
                end_line()
                raise KeyboardInterrupt

            # Enter
            if ch in ("\r", "\n"):
                end_line()
                return buf

            # Esc
            if ch == "\x1b":
                end_line()
                return "\x1b"

            # Backspace / delete
            if ch in ("\x7f", "\b"):
                if buf:
                    buf = buf[:-1]
                    echo("\b \b")
                continue

            # Ignore other control chars.
            if not ch.isprintable():
                continue

            # Append printable char and echo.
            buf += ch
            echo(ch)

            low = buf.strip().lower()

            # If user typed a single-letter command, return immediately.
            if low in ("r", "e", "m", "s", "q", "y"):
                end_line()
                return low

            # Preview shortcut: p<idx> (only immediate if k<=9 and idx is single-digit).
            if immediate_digits and low.startswith("p") and len(low) == 2 and low[1].isdigit():
                if 1 <= int(low[1]) <= max_choice:
                    end_line()
                    return low

            # Numeric selection (only immediate for k<=9 to avoid ambiguity like "10").
            if immediate_digits and len(low) == 1 and low.isdigit():
                if 1 <= int(low) <= max_choice:
                    end_line()
                    return low

    finally:
        termios.tcsetattr(fd, termios.TCSADRAIN, old)


def _read_command(prompt: str, *, k: int, raw_key_input: bool) -> str:
    """
    Reads one command from the user.
    - Raw keypress mode (Esc-aware) is used only when raw_key_input is set and stdin is a TTY.
    - In every other case, and whenever raw mode raises, line-based input() is used.
    """
    if not (raw_key_input and sys.stdin.isatty()):
        return input(prompt)
    try:
        return _read_command_raw_tty(prompt, k=int(k))
    except Exception:
        # Raw mode failed part-way; fall back to plain line input.
        return input(prompt)


# ========= Concise soft target (policy) =========
def _make_length_policy(
    *,
    base_desc: str,
    mode_key: str,
    concise_ratio: float,
    concise_min_base_len: int,
    concise_min_chars: int,
) -> Dict[str, Any]:
    """
    Builds the length-policy record for one base description. A record is always
    returned; it feeds prompt guidance, candidate stats and audit reporting.

    The soft target is ``max(int(base_len * ratio), min_chars)`` and is marked as
    applied only when all of these hold:
    - mode_key is exactly "style_concise"
    - the stripped base has at least concise_min_base_len characters (and is non-empty)
    - the target is strictly shorter than the base

    Otherwise ``applied`` is False, ``target_chars`` is None and ``reason`` says why
    ("not_concise_mode", "base_too_short" or "target_not_shorter_than_base").
    """
    base_len = len((base_desc or "").strip())

    ratio = float(concise_ratio)
    min_base_len = int(concise_min_base_len)
    min_chars = int(concise_min_chars)

    target_chars: Optional[int] = None
    if mode_key != "style_concise":
        reason = "not_concise_mode"
    elif base_len <= 0 or base_len < min_base_len:
        reason = "base_too_short"
    else:
        proposed = max(int(base_len * ratio), min_chars)
        if proposed < base_len:
            reason = "ok"
            target_chars = proposed
        else:
            # Mis-set knobs (e.g. min_chars > base) must never yield a target >= base.
            reason = "target_not_shorter_than_base"

    soft_target = {
        "applied": bool(target_chars is not None),
        "reason": str(reason),
        "target_ratio": float(ratio),
        "min_base_len": int(min_base_len),
        "min_chars": int(min_chars),
        "target_chars": int(target_chars) if isinstance(target_chars, int) else None,
    }
    return {
        "policy_name": "concise_soft_target_v1",
        "mode_key": mode_key,
        "base_len_chars": base_len,
        "concise_soft_target": soft_target,
    }


# ========= Statistical / lexical indicators =========
# Regexes that pull "structural" tokens out of a description. They are applied to the
# original-case text in _extract_indicator_tokens.

# CLI-style long flags ("--name"), not directly preceded by a word character.
_FLAG_RE = re.compile(r"(?<!\w)--[A-Za-z0-9][A-Za-z0-9_-]*")
# Identifiers that start with a letter and contain at least one underscore.
_SNAKE_RE = re.compile(r"\b[A-Za-z][A-Za-z0-9]*_[A-Za-z0-9_]+\b")
# Words with a lowercase run followed by an uppercase letter (lowerCamelCase shape).
_CAMEL_RE = re.compile(r"\b[a-z]+[A-Z][A-Za-z0-9]*\b")
# Identifier of 3+ characters that is followed (after optional whitespace) by ':' or '='.
_FIELD_COLON_RE = re.compile(r"\b[A-Za-z][A-Za-z0-9_]{2,}\b(?=\s*[:=])")
# Bare integers and simple decimals.
_NUMBER_RE = re.compile(r"\b\d+(?:\.\d+)?\b")
# Number followed by a size or time unit, case-insensitive. The matched text is used
# verbatim as the token, so spacing and case differences yield different tokens.
_NUMBER_UNIT_RE = re.compile(
    r"\b\d+(?:\.\d+)?\s*(?:kb|mb|gb|tb|ms|s|sec|secs|seconds|mins|minutes|hrs|hours|days)\b",
    re.IGNORECASE,
)

# Action verbs tracked as a secondary risk signal. Only these exact forms match
# (whole word, case-insensitive); other inflections are not covered.
_HIGH_RISK_VERBS = [
    "create", "delete", "remove", "destroy",
    "upload", "download", "send", "email",
    "execute", "run", "invoke", "call",
    "write", "read", "save", "store",
    "update", "modify", "edit", "change",
    "retrieve", "fetch", "search", "browse",
    "access", "open", "close",
    "return", "returns",
]
_VERB_RE = re.compile(r"\b(" + "|".join(re.escape(v) for v in _HIGH_RISK_VERBS) + r")\b", re.IGNORECASE)

# Single-token negation / restriction / requirement words (matched per word token).
_LOGIC_WORDS = {
    "only", "must", "never", "not", "no", "unless", "except",
    "required", "optional",
    "cannot", "can't",
}

# Multi-word quantifier and negation phrases (matched via compiled phrase patterns).
# "cannot" and "can't" appear both here and in _LOGIC_WORDS; the results are merged
# into one de-duplicated list in _extract_indicator_tokens.
_LOGIC_PHRASES = [
    "at least",
    "at most",
    "up to",
    "no more than",
    "no less than",
    "do not",
    "does not",
    "did not",
    "must not",
    "should not",
    "may not",
    "will not",
    "cannot",
    "can't",
    "if and only if",
]

# Modal verbs (matched per word token).
_MODAL_WORDS = {
    "may", "must", "should", "will", "can", "could", "would", "might", "shall",
}

# Phrases describing what is (or is not) returned. The bare "return"/"returns"
# entries also match inside the longer phrases, so one sentence can yield several tokens.
_SCOPE_PHRASES = [
    "returns",
    "return",
    "can return",
    "may return",
    "will return",
    "must return",
    "should return",
    "cannot return",
    "can't return",
    "does not return",
    "do not return",
]


def _compile_phrase_patterns(phrases: List[str]) -> Dict[str, re.Pattern]:
    """Compile each phrase into a case-insensitive, boundary-guarded regex.

    Spaces inside a phrase match any run of whitespace, and the phrase must not be
    adjacent to a letter, digit or underscore on either side. The result is keyed
    by the lowercased phrase.
    """
    not_preceded = r"(?<![A-Za-z0-9_])"
    not_followed = r"(?![A-Za-z0-9_])"
    return {
        phrase.lower(): re.compile(
            not_preceded + re.escape(phrase).replace(r"\ ", r"\s+") + not_followed,
            re.IGNORECASE,
        )
        for phrase in phrases
    }


# Phrase tables compiled once at import time; keys are the lowercased phrases.
_LOGIC_PHRASE_PATTERNS = _compile_phrase_patterns(_LOGIC_PHRASES)
_SCOPE_PHRASE_PATTERNS = _compile_phrase_patterns(_SCOPE_PHRASES)

# Word tokenizer: a run of letters/digits/underscores with an optional single
# apostrophe suffix, so "can't" stays one token. Only the ASCII apostrophe is
# recognised. (IGNORECASE is redundant here: the class already covers both cases.)
_WORD_TOKEN_RE = re.compile(r"[A-Za-z0-9_]+(?:'[A-Za-z0-9_]+)?", re.IGNORECASE)


def _sentence_count(text: str) -> int:
    """Return a heuristic sentence count for ``text`` (0 for None/blank).

    A sentence ends at a run of '.', '!' or '?' (optionally followed by closing
    quotes or brackets) that is followed by whitespace or the end of the text.
    Requiring that context keeps decimals ("3.5 MB"), dotted names ("file.txt")
    and versions ("v1.2") from being counted as sentence breaks. Abbreviations
    followed by a space (such as "e.g. x") are still counted as a break.
    """
    t = (text or "").strip()
    if not t:
        return 0
    parts = re.split(r"[.!?]+[\"')\]]*(?:\s+|$)", t)
    return sum(1 for p in parts if p.strip())


def _word_count(text: str) -> int:
    """Return the number of whitespace-separated words in ``text`` (0 for None/blank)."""
    stripped = (text or "").strip()
    if stripped == "":
        return 0
    return sum(1 for piece in re.split(r"\s+", stripped) if piece)


def _extract_phrase_tokens(lower_text: str, patterns: Dict[str, re.Pattern]) -> List[str]:
    """Return the sorted canonical names of every pattern that matches ``lower_text``."""
    hits = {name for name, rx in patterns.items() if rx.search(lower_text)}
    return sorted(hits)


def _extract_word_tokens(lower_text: str, vocabulary: set) -> List[str]:
    """Return the sorted, de-duplicated word tokens of ``lower_text`` found in ``vocabulary``.

    Tokens are produced by _WORD_TOKEN_RE and lowercased before the lookup.
    """
    matched = set()
    for hit in _WORD_TOKEN_RE.finditer(lower_text):
        word = hit.group(0).lower()
        if word in vocabulary:
            matched.add(word)
    return sorted(matched)


def _extract_indicator_tokens(text: str) -> Dict[str, List[str]]:
    """Extract every indicator token category from ``text``.

    Structural categories and verbs are matched against the original-case text;
    logic, modal and scope tokens against the lowercased text. Each value is a
    sorted, de-duplicated list. Verbs are lowercased; number_units keep the
    matched text verbatim.
    """
    source = text or ""
    lowered = source.lower()

    def unique_sorted(items) -> List[str]:
        return sorted(set(items))

    logic_tokens = unique_sorted(
        _extract_phrase_tokens(lowered, _LOGIC_PHRASE_PATTERNS)
        + _extract_word_tokens(lowered, _LOGIC_WORDS)
    )
    modal_tokens = _extract_word_tokens(lowered, _MODAL_WORDS)
    scope_tokens = _extract_phrase_tokens(lowered, _SCOPE_PHRASE_PATTERNS)

    tokens: Dict[str, List[str]] = {}
    tokens["flags"] = unique_sorted(_FLAG_RE.findall(source))
    tokens["snake"] = unique_sorted(_SNAKE_RE.findall(source))
    tokens["camel"] = unique_sorted(_CAMEL_RE.findall(source))
    tokens["field_like"] = unique_sorted(_FIELD_COLON_RE.findall(source))
    tokens["numbers"] = unique_sorted(_NUMBER_RE.findall(source))
    tokens["number_units"] = unique_sorted(hit.group(0) for hit in _NUMBER_UNIT_RE.finditer(source))
    tokens["verbs"] = unique_sorted(hit.group(0).lower() for hit in _VERB_RE.finditer(source))
    tokens["logic"] = logic_tokens
    tokens["modals"] = modal_tokens
    tokens["scope"] = scope_tokens
    return tokens


def _format_token_preview(tokens: List[str], *, max_items: int, max_len: int) -> str:
    """Render a short, comma-separated preview of ``tokens``.

    Args:
        tokens: tokens to preview; an empty list renders as "-".
        max_items: maximum number of tokens shown; negative values are treated as 0.
        max_len: maximum characters per token; longer tokens are cut and end with "…".
            Values below 1 are treated as 1.

    Returns:
        The shown tokens joined by ", ", followed by "+N" when N tokens were left out.
    """
    if not tokens:
        return "-"
    # Clamp once so slicing and the overflow count always agree.
    limit = max(0, int(max_items))
    width = max(1, int(max_len))

    shown: List[str] = []
    for tok in tokens[:limit]:
        s = str(tok)
        if len(s) > width:
            s = s[: width - 1] + "…"
        shown.append(s)

    hidden = len(tokens) - limit
    if hidden > 0:
        shown.append(f"+{hidden}")
    return ", ".join(shown) if shown else "-"


def _diff_token_sets(base: Dict[str, List[str]], cand: Dict[str, List[str]], key: str) -> Tuple[List[str], List[str]]:
    """Compare one token category between base and candidate.

    Returns ``(new, missing)``: tokens only in the candidate, and tokens only in
    the base, each as a sorted list. A missing or empty category counts as empty.
    """
    base_set = set(base.get(key) or ())
    cand_set = set(cand.get(key) or ())
    only_in_cand = sorted(cand_set.difference(base_set))
    only_in_base = sorted(base_set.difference(cand_set))
    return only_in_cand, only_in_base


def _similarity_ratio(a: str, b: str) -> float:
    """Return the difflib SequenceMatcher ratio of the two stripped strings.

    Two blank inputs count as identical (1.0); exactly one blank input gives 0.0.
    """
    left = (a or "").strip()
    right = (b or "").strip()
    if left and right:
        return float(difflib.SequenceMatcher(None, left, right).ratio())
    return 0.0 if (left or right) else 1.0


def _cosine_similarity(a: List[float], b: List[float]) -> Optional[float]:
    """Return the cosine similarity of two equal-length vectors.

    Returns None when either vector is empty, the lengths differ, or either
    vector has a zero norm.
    """
    if not a or not b or len(a) != len(b):
        return None

    dot = 0.0
    sq_a = 0.0
    sq_b = 0.0
    # Plain running sums (not sum()) so the floating-point result is unchanged.
    for x, y in zip(a, b):
        xf = float(x)
        yf = float(y)
        dot += xf * yf
        sq_a += xf * xf
        sq_b += yf * yf

    if sq_a <= 0.0 or sq_b <= 0.0:
        return None
    return float(dot / (math.sqrt(sq_a) * math.sqrt(sq_b)))


def compute_candidate_stats(
    *,
    base_text: str,
    cand_text: str,
    mode_key: str,
    length_policy: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """Compute the deterministic lexical statistics for one candidate rewrite.

    Both texts are stripped, tokenized into indicator categories and diffed.
    The result carries length/word/sentence metrics, the difflib similarity,
    per-category diffs, a risk label with its reason, soft flags, and the
    concise soft-target fields taken from ``length_policy``.

    Risk rules are evaluated in order and the first match decides the label
    (so ``risk_reasons`` holds at most one entry):
      HIGH - any new structural token
      HIGH - any new or missing logic/modal/scope token
      HIGH - four or more missing structural tokens
      MED  - some missing structural tokens
      MED  - new risk verbs with similarity below 0.55
      MED  - similarity below 0.45
      LOW  - otherwise
    """
    base = (base_text or "").strip()
    cand = (cand_text or "").strip()

    base_tokens = _extract_indicator_tokens(base)
    cand_tokens = _extract_indicator_tokens(cand)

    categories = (
        "flags", "snake", "camel", "field_like", "numbers",
        "number_units", "verbs", "logic", "modals", "scope",
    )
    diffs: Dict[str, Any] = {}
    for category in categories:
        added, dropped = _diff_token_sets(base_tokens, cand_tokens, category)
        diffs[category] = {"new": added, "missing": dropped}

    base_len, cand_len = len(base), len(cand)
    base_words, cand_words = _word_count(base), _word_count(cand)
    base_sent, cand_sent = _sentence_count(base), _sentence_count(cand)

    sim = _similarity_ratio(base, cand)

    # Length metrics; ratios are undefined for an empty base.
    len_delta = int(cand_len) - int(base_len)
    if base_len > 0:
        len_ratio = float(cand_len) / float(base_len)
        len_delta_ratio = float(len_delta) / float(base_len)
    else:
        len_ratio = None
        len_delta_ratio = None

    def total(keys: Tuple[str, ...], side: str) -> int:
        return sum(len(diffs[name][side]) for name in keys)

    structural_keys = ("flags", "field_like", "numbers", "number_units", "snake", "camel")
    logic_keys = ("logic", "modals", "scope")

    new_structural = total(structural_keys, "new")
    missing_structural = total(structural_keys, "missing")
    new_logic = total(logic_keys, "new")
    missing_logic = total(logic_keys, "missing")

    new_verbs = len(diffs["verbs"]["new"])
    missing_verbs = len(diffs["verbs"]["missing"])

    # First matching rule wins; structural and logic signals outrank verbs/similarity.
    risk_label = "LOW"
    risk_reasons: List[str] = []
    if new_structural > 0:
        risk_label = "HIGH"
        risk_reasons.append("new_structural_tokens_detected")
    elif new_logic > 0 or missing_logic > 0:
        risk_label = "HIGH"
        risk_reasons.append("logic_or_modal_or_scope_tokens_changed")
    elif missing_structural >= 4:
        risk_label = "HIGH"
        risk_reasons.append("many_structural_tokens_missing")
    elif missing_structural > 0:
        risk_label = "MED"
        risk_reasons.append("some_structural_tokens_missing")
    elif new_verbs > 0 and sim < 0.55:
        risk_label = "MED"
        risk_reasons.append("new_risk_verbs_with_low_similarity")
    elif sim < 0.45:
        risk_label = "MED"
        risk_reasons.append("very_low_text_similarity")

    # Style-specific soft flags (informational; they do not alter the risk label).
    soft_flags: List[Dict[str, Any]] = []
    too_many_sentences = cand_sent > 2
    if mode_key == "style_concise":
        if base_len > 0 and cand_len > base_len:
            soft_flags.append({"type": "concise_length_exceeds_base", "base_len": base_len, "cand_len": cand_len})
        if too_many_sentences:
            soft_flags.append({"type": "concise_sentence_count_exceeds_2", "sentence_count": cand_sent})
    elif mode_key == "style_verbose":
        if too_many_sentences:
            soft_flags.append({"type": "verbose_sentence_count_exceeds_2", "sentence_count": cand_sent})

    # Pull the concise soft-target fields out of the policy record, if one was given.
    concise_target_chars = None
    concise_target_applied = False
    concise_target_reason = None
    if isinstance(length_policy, dict):
        target_cfg = length_policy.get("concise_soft_target") or {}
        if isinstance(target_cfg, dict):
            concise_target_applied = bool(target_cfg.get("applied", False))
            concise_target_reason = target_cfg.get("reason")
            raw_target = target_cfg.get("target_chars")
            concise_target_chars = raw_target if isinstance(raw_target, int) else None

    within_soft_target = None
    target_is_active = (
        mode_key == "style_concise"
        and concise_target_applied
        and isinstance(concise_target_chars, int)
        and base_len > 0
    )
    if target_is_active:
        within_soft_target = bool(cand_len <= concise_target_chars)
        if not within_soft_target:
            soft_flags.append(
                {
                    "type": "concise_exceeds_soft_target",
                    "target_chars": int(concise_target_chars),
                    "cand_len": int(cand_len),
                    "len_ratio": float(len_ratio) if isinstance(len_ratio, float) else None,
                }
            )

    return {
        "policy": {
            "risk_policy_name": RISK_POLICY_NAME,
            "logic_token_policy_name": LOGIC_TOKEN_POLICY_NAME,
        },
        "base_len_chars": base_len,
        "cand_len_chars": cand_len,
        "len_ratio": len_ratio,
        "len_delta_chars": len_delta,
        "len_delta_ratio": len_delta_ratio,
        "base_words": base_words,
        "cand_words": cand_words,
        "base_sentences": base_sent,
        "cand_sentences": cand_sent,
        "similarity_ratio": sim,
        "diffs": diffs,
        "risk_label": risk_label,
        "risk_reasons": risk_reasons,
        "soft_flags": soft_flags,
        "base_tokens": base_tokens,
        "cand_tokens": cand_tokens,
        "length_policy": length_policy,
        "concise_soft_target_applied": concise_target_applied,
        "concise_soft_target_reason": concise_target_reason,
        "concise_soft_target_chars": concise_target_chars,
        "within_soft_target": within_soft_target,
        "new_structural_count": int(new_structural),
        "missing_structural_count": int(missing_structural),
        "new_logic_count": int(new_logic),
        "missing_logic_count": int(missing_logic),
        "new_risk_verbs_count": int(new_verbs),
        "missing_risk_verbs_count": int(missing_verbs),
    }


# ========= Optional semantic signals (embeddings + verifier) =========
def _compute_embedding_cosine(
    *,
    client: OpenAI,
    embedding_model: str,
    base_text: str,
    cand_text: str,
) -> Tuple[Optional[float], Optional[str], Dict[str, Any]]:
    """Embed both texts (one request each) and return their cosine similarity.

    Best-effort: nothing is raised. Returns ``(cosine, error, meta)`` where
    exactly one of ``cosine`` / ``error`` is None. ``error`` is
    "embedding_vector_missing", "embedding_cosine_failed", or the text of the
    exception raised during the requests.
    """
    meta: Dict[str, Any] = {"embedding_model": embedding_model, "provider_base_url": GEMINI_BASE_URL}

    def first_vector(response: Any) -> Any:
        # The vector of the first data item, or None when the response has no data.
        items = getattr(response, "data", None)
        return getattr(items[0], "embedding", None) if items else None

    try:
        base_resp = client.embeddings.create(model=embedding_model, input=base_text)
        cand_resp = client.embeddings.create(model=embedding_model, input=cand_text)
        base_vec = first_vector(base_resp)
        cand_vec = first_vector(cand_resp)
        if not (isinstance(base_vec, list) and isinstance(cand_vec, list)):
            return None, "embedding_vector_missing", meta
        cosine = _cosine_similarity(base_vec, cand_vec)
        if cosine is None:
            return None, "embedding_cosine_failed", meta
        return float(cosine), None, meta
    except Exception as exc:
        return None, str(exc), meta


def _compute_entailment_verdict(
    *,
    client: OpenAI,
    verifier_model: str,
    base_text: str,
    cand_text: str,
    max_tokens: int,
) -> Tuple[Optional[str], Optional[str], Dict[str, Any]]:
    """Ask the verifier model whether the candidate entails the base text.

    Returns ``(label, error, meta)``. ``label`` is "ENTAILS", "NOT_ENTAILS", or
    "UNKNOWN" when the first whitespace-separated word of the reply is neither.
    If the request raises, ``label`` is None and ``error`` holds the exception
    text. On success ``meta`` also carries the finish reason and the usage object.
    """
    meta: Dict[str, Any] = {"verifier_model": verifier_model, "provider_base_url": GEMINI_BASE_URL}
    system_prompt = (
        "Task: semantic equivalence gate.\n"
        "Decision must be based only on whether the candidate text entails the base text with identical meaning.\n"
        "Output must be exactly one of: ENTAILS, NOT_ENTAILS.\n"
        "No explanations, no punctuation, no extra tokens.\n"
    )
    user_prompt = (
        "Base text:\n"
        f"{base_text.strip()}\n\n"
        "Candidate text:\n"
        f"{cand_text.strip()}\n"
    )
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    try:
        resp = client.chat.completions.create(
            model=verifier_model,
            messages=messages,
            temperature=0.0,
            max_tokens=int(max_tokens),
        )
        reply = (resp.choices[0].message.content or "").strip().upper()
        first_word = reply.split()[0] if reply else ""
        verdict = first_word if first_word in ("ENTAILS", "NOT_ENTAILS") else "UNKNOWN"
        meta["finish_reason"] = getattr(resp.choices[0], "finish_reason", None)
        meta["usage"] = getattr(resp, "usage", None)
        return verdict, None, meta
    except Exception as exc:
        return None, str(exc), meta


def augment_stats_with_semantic_signals(
    *,
    stats: Dict[str, Any],
    client: Optional[OpenAI],
    semantic_cfg: Optional[Dict[str, Any]],
    base_text: str,
    cand_text: str,
) -> Dict[str, Any]:
    """Add the optional embedding / verifier signals to ``stats`` (in place).

    ``stats`` is returned untouched when it is not a dict, when ``semantic_cfg``
    is empty or not a dict, or when ``client`` is None. Otherwise a "semantic"
    block is stored on it and findings are appended to ``stats["soft_flags"]``:
    - embeddings run only when enabled and a model name is configured
    - the verifier, when enabled, uses ``verifier_model`` or else
      ``fallback_llm_model``; with neither set a "verifier_unconfigured" flag is added
    """
    if (
        not isinstance(stats, dict)
        or not semantic_cfg
        or not isinstance(semantic_cfg, dict)
        or client is None
    ):
        return stats

    cfg = semantic_cfg
    use_embeddings = bool(cfg.get("enable_embeddings", False))
    embedding_model = str(cfg.get("embedding_model") or "").strip()
    low_cosine_threshold = cfg.get("embedding_low_cosine_threshold", DEFAULT_EMBEDDING_LOW_COSINE_THRESHOLD)

    use_verifier = bool(cfg.get("enable_verifier", False))
    verifier_model = str(cfg.get("verifier_model") or "").strip()
    verifier_max_tokens = int(cfg.get("verifier_max_tokens", DEFAULT_VERIFIER_MAX_TOKENS))

    if not isinstance(stats.get("soft_flags"), list):
        stats["soft_flags"] = []
    flags = stats["soft_flags"]

    semantic_block: Dict[str, Any] = {"semantic_policy_name": SEMANTIC_POLICY_NAME}

    if use_embeddings and embedding_model:
        cosine, emb_error, emb_meta = _compute_embedding_cosine(
            client=client,
            embedding_model=embedding_model,
            base_text=base_text,
            cand_text=cand_text,
        )
        semantic_block["embedding"] = {"cosine": cosine, "error": emb_error, "meta": emb_meta}
        comparable = isinstance(cosine, (int, float)) and isinstance(low_cosine_threshold, (int, float))
        if comparable and float(cosine) < float(low_cosine_threshold):
            flags.append(
                {"type": "embedding_low_cosine", "cosine": float(cosine), "threshold": float(low_cosine_threshold)}
            )
        if emb_error:
            flags.append({"type": "embedding_error", "error": str(emb_error)[:200]})

    if use_verifier:
        model_name = verifier_model or str(cfg.get("fallback_llm_model") or "").strip()
        if not model_name:
            flags.append({"type": "verifier_unconfigured"})
        else:
            label, ver_error, ver_meta = _compute_entailment_verdict(
                client=client,
                verifier_model=model_name,
                base_text=base_text,
                cand_text=cand_text,
                max_tokens=verifier_max_tokens,
            )
            semantic_block["verifier"] = {"label": label, "error": ver_error, "meta": ver_meta}
            if label == "NOT_ENTAILS":
                flags.append({"type": "verifier_not_entails"})
            elif label == "UNKNOWN":
                flags.append({"type": "verifier_unknown"})
            if ver_error:
                flags.append({"type": "verifier_error", "error": str(ver_error)[:200]})

    # The block always carries the policy name, so it is never empty at this point.
    stats["semantic"] = semantic_block
    return stats


def compute_full_candidate_stats(
    *,
    base_text: str,
    cand_text: str,
    mode_key: str,
    length_policy: Optional[Dict[str, Any]],
    client: Optional[OpenAI],
    semantic_cfg: Optional[Dict[str, Any]],
) -> Dict[str, Any]:
    """Compute the lexical stats for a candidate, then layer the optional semantic signals on top."""
    lexical_stats = compute_candidate_stats(
        base_text=base_text,
        cand_text=cand_text,
        mode_key=mode_key,
        length_policy=length_policy,
    )
    full_stats = augment_stats_with_semantic_signals(
        stats=lexical_stats,
        client=client,
        semantic_cfg=semantic_cfg,
        base_text=base_text,
        cand_text=cand_text,
    )
    return full_stats


def _print_base_stats(base_desc: str, *, max_preview: int, max_tok_len: int) -> None:
    """Print size metrics, per-category token counts and token previews for the base description.

    Previews are printed for flags, field_like, numbers, verbs, logic, modals and
    scope only; ``max_preview`` / ``max_tok_len`` are passed to _format_token_preview.
    """
    base = (base_desc or "").strip()
    tokens = _extract_indicator_tokens(base)
    n = {name: len(values) for name, values in tokens.items()}

    def pv(name: str) -> str:
        return _format_token_preview(tokens[name], max_items=max_preview, max_len=max_tok_len)

    print("\nStatistics (base):")
    print(f"  chars={len(base)}; words={_word_count(base)}; sentences={_sentence_count(base)}")
    print(
        "  tokens:"
        f" flags={n['flags']}, field_like={n['field_like']}, "
        f"numbers={n['numbers']}, number_units={n['number_units']}, "
        f"verbs={n['verbs']}, snake={n['snake']}, camel={n['camel']}, "
        f"logic={n['logic']}, modals={n['modals']}, scope={n['scope']}"
    )
    print(
        "  previews:"
        f" flags=[{pv('flags')}];"
        f" field_like=[{pv('field_like')}];"
        f" numbers=[{pv('numbers')}];"
        f" verbs=[{pv('verbs')}];"
        f" logic=[{pv('logic')}];"
        f" modals=[{pv('modals')}];"
        f" scope=[{pv('scope')}]"
    )


def _print_candidate_summary_line(
    i: int,
    cand: Dict[str, Any],
    *,
    max_preview: int,
    max_tok_len: int,
    snippet_chars: int,
) -> None:
    """Print the overview entry for one candidate.

    Writes one summary line (status, risk label, similarity, semantic
    signals, size figures, soft length target, counts of new/missing
    indicator tokens). For a non-empty candidate without a generation error
    it may additionally print a preview of newly introduced tokens and a
    whitespace-collapsed snippet of the candidate text.

    Args:
        i: Index shown in square brackets at the start of the line.
        cand: Candidate record; the keys read are ``text``, ``error``,
            ``duplicate`` and ``stats``.
        max_preview: Forwarded to ``_format_token_preview`` as ``max_items``.
        max_tok_len: Forwarded to ``_format_token_preview`` as ``max_len``.
        snippet_chars: Maximum snippet length; values <= 0 suppress the snippet.
    """
    text = (cand.get("text") or "").strip()
    error = cand.get("error")
    stats = cand.get("stats") or {}
    diffs = stats.get("diffs") or {}

    def tokens(category: str, side: str) -> Any:
        """Return the ``side`` ("new"/"missing") entries stored for ``category``."""
        return (diffs.get(category) or {}).get(side) or []

    def total(side: str, *categories: str) -> int:
        """Sum the number of ``side`` entries over the given categories."""
        return sum(len(tokens(category, side)) for category in categories)

    def two_decimals(value: Any) -> str:
        return f"{float(value):.2f}" if isinstance(value, (int, float)) else "-"

    def int_or_dash(value: Any) -> str:
        return str(value) if isinstance(value, int) else "-"

    # Status precedence: generation error > empty text > duplicate > ok.
    if error:
        status = f"error:{str(error)[:60]}"
    elif not text:
        status = "empty"
    elif bool(cand.get("duplicate", False)):
        status = "duplicate"
    else:
        status = "ok"

    risk = stats.get("risk_label") or "-"
    sim_s = two_decimals(stats.get("similarity_ratio"))

    new_flags = total("new", "flags")
    new_nums = total("new", "numbers", "number_units")
    new_verbs = total("new", "verbs")
    new_fields = total("new", "field_like")
    new_snake = total("new", "snake")
    new_camel = total("new", "camel")
    new_logic = total("new", "logic", "modals", "scope")
    missing_total = total(
        "missing",
        "flags", "numbers", "number_units", "verbs", "field_like",
        "snake", "camel", "logic", "modals", "scope",
    )

    clen_s = int_or_dash(stats.get("cand_len_chars"))
    cwords_s = int_or_dash(stats.get("cand_words"))
    csent_s = int_or_dash(stats.get("cand_sentences"))

    lr_s = two_decimals(stats.get("len_ratio"))
    delta = stats.get("len_delta_chars")
    ld_s = f"{int(delta):+d}" if isinstance(delta, int) else "-"

    # Soft length target: shown only when it was applied and has an integer limit.
    target_s = "-"
    soft_limit = stats.get("concise_soft_target_chars")
    if bool(stats.get("concise_soft_target_applied", False)) and isinstance(soft_limit, int):
        within = stats.get("within_soft_target")
        if within is True:
            target_s = f"target<={soft_limit} ok"
        elif within is False:
            target_s = f"target<={soft_limit} NO"
        else:
            target_s = f"target<={soft_limit}"

    # Optional semantic signals (embedding cosine, verifier label).
    emb_s = "-"
    ent_s = "-"
    semantic = stats.get("semantic")
    if isinstance(semantic, dict):
        embedding = semantic.get("embedding")
        if isinstance(embedding, dict) and isinstance(embedding.get("cosine"), (int, float)):
            emb_s = f"{float(embedding.get('cosine')):.2f}"
        verifier = semantic.get("verifier")
        if isinstance(verifier, dict) and isinstance(verifier.get("label"), str):
            ent_s = verifier.get("label")

    print(
        f"  [{i}] status={status}; risk={risk}; sim={sim_s}; emb={emb_s}; ent={ent_s}; "
        f"cand(chars={clen_s}, words={cwords_s}, sent={csent_s}); "
        f"len_ratio={lr_s}; Δchars={ld_s}; {target_s}; "
        f"new(flags={new_flags}, fields={new_fields}, nums={new_nums}, snake={new_snake}, camel={new_camel}, logic={new_logic}, verbs={new_verbs}); "
        f"missing_total={missing_total}"
    )

    usable = bool(text) and not error

    if isinstance(diffs, dict) and usable:
        # (label printed, diff category) for the categories that get a preview.
        preview_spec = (
            ("flags", "flags"),
            ("fields", "field_like"),
            ("numbers", "numbers"),
            ("number_units", "number_units"),
            ("logic", "logic"),
            ("modals", "modals"),
            ("scope", "scope"),
        )
        fresh = [(label, tokens(category, "new")) for label, category in preview_spec]
        if any(found for _, found in fresh):
            rendered = "; ".join(
                f"{label}=[{_format_token_preview(list(found), max_items=max_preview, max_len=max_tok_len)}]"
                for label, found in fresh
            )
            print(f"      new-previews: {rendered}")

    if usable and int(snippet_chars) > 0:
        limit = int(snippet_chars)
        snippet = " ".join(text.split())
        if len(snippet) > limit:
            snippet = snippet[: limit - 1] + "…"
        print(f"      text: {snippet}")


def _print_candidate_full(
    i: int,
    cand: Dict[str, Any],
    *,
    max_preview: int,
    max_tok_len: int,
) -> None:
    """Print the full text of one candidate followed by its detailed statistics.

    When the candidate carries a generation error, only the error (and any
    partial text) is printed. An empty candidate prints ``(empty)``.
    Otherwise the text is followed by the risk label and reasons, similarity
    and length figures, optional semantic signals, soft flags, and for each
    indicator category that has differences the new and missing tokens.

    Args:
        i: Index shown in the ``Candidate [i]`` heading.
        cand: Candidate record; the keys read are ``text``, ``error`` and ``stats``.
        max_preview: Forwarded to ``_format_token_preview`` as ``max_items``.
        max_tok_len: Forwarded to ``_format_token_preview`` as ``max_len``.
    """
    body = (cand.get("text") or "").strip()
    failure = cand.get("error")
    stats = cand.get("stats") or {}
    diffs = stats.get("diffs") or {}

    print(f"\nCandidate [{i}]:")
    if failure:
        print(f"  Generation error: {failure}")
        if body:
            print("  Partial text:")
            print(body)
        return
    if not body:
        print("  (empty)")
        return

    print(body)

    def two_decimals(value: Any) -> str:
        return f"{float(value):.2f}" if isinstance(value, (int, float)) else "-"

    risk = stats.get("risk_label") or "-"
    reasons = stats.get("risk_reasons") or []
    soft = stats.get("soft_flags") or []

    print("\n  Candidate statistics:")
    sim_s = two_decimals(stats.get("similarity_ratio"))
    lr_s = two_decimals(stats.get("len_ratio"))
    delta = stats.get("len_delta_chars")
    ld_s = f"{int(delta):+d}" if isinstance(delta, int) else "-"

    # Semantic signals are appended to the statistics line only when present.
    sem_bits: List[str] = []
    semantic = stats.get("semantic")
    if isinstance(semantic, dict):
        embedding = semantic.get("embedding")
        verifier = semantic.get("verifier")
        if isinstance(embedding, dict):
            sem_bits.append(f"; emb_cos={embedding.get('cosine')}")
            if embedding.get("error"):
                sem_bits.append(f"; emb_err={str(embedding.get('error'))[:80]}")
        if isinstance(verifier, dict):
            sem_bits.append(f"; entail={verifier.get('label')}")
            if verifier.get("error"):
                sem_bits.append(f"; ent_err={str(verifier.get('error'))[:80]}")
    sem_s = "".join(sem_bits)

    print(f"    risk={risk}; reasons={reasons if reasons else '[]'}; similarity={sim_s}; len_ratio={lr_s}; Δchars={ld_s}{sem_s}")

    if soft:
        print(f"    soft_flags={soft}")

    categories = ("flags", "field_like", "numbers", "number_units", "snake", "camel", "logic", "modals", "scope", "verbs")
    for category in categories:
        entry = diffs.get(category) or {}
        added = entry.get("new") or []
        dropped = entry.get("missing") or []
        if added or dropped:
            print(
                f"    {category}:"
                f" new({len(added)})=[{_format_token_preview(list(added), max_items=max_preview, max_len=max_tok_len)}];"
                f" missing({len(dropped)})=[{_format_token_preview(list(dropped), max_items=max_preview, max_len=max_tok_len)}]"
            )


# ========= Raw JSON-string patcher (for tools stored as JSON strings) =========
def _extract_json_string_value(raw_json: str, key: str) -> Optional[str]:
    """Return the raw JSON string literal (quotes and escapes included) stored under ``key``.

    The text is searched for the first occurrence of ``"<key>"`` that is
    actually used as a member name, i.e. that is followed (after optional
    whitespace) by a colon. Occurrences where the same text appears as a
    string *value* (for example ``"required": ["description"]``) are skipped;
    previously such an occurrence made the function return the value of
    whatever unrelated key came next.

    The search is purely textual and not depth-aware: the first matching
    member name at any nesting level wins.

    Args:
        raw_json: JSON text to search.
        key: Member name to look for (used verbatim, not JSON-escaped).

    Returns:
        The string literal exactly as written in ``raw_json``, or ``None``
        when the key is absent, its value is not a string, or the string is
        unterminated.
    """
    token = f'"{key}"'
    n = len(raw_json)
    whitespace = " \t\r\n"

    # Locate the first occurrence of the token that is followed by a colon.
    search_from = 0
    while True:
        hit = raw_json.find(token, search_from)
        if hit < 0:
            return None
        i = hit + len(token)
        while i < n and raw_json[i] in whitespace:
            i += 1
        if i < n and raw_json[i] == ":":
            break
        # The token was a value (or the tail of one), not a member name.
        search_from = hit + 1

    # Step over the colon and any whitespace in front of the value.
    i += 1
    while i < n and raw_json[i] in whitespace:
        i += 1
    if i >= n or raw_json[i] != '"':
        return None

    # Scan to the closing quote, honouring backslash escapes.
    start = i
    i += 1
    while i < n:
        c = raw_json[i]
        if c == "\\":
            i += 2  # skip the escaped character as well
            continue
        if c == '"':
            return raw_json[start : i + 1]
        i += 1
    return None


def _decode_raw_json_string(raw_json_string_with_quotes: str) -> str:
    """Decode a raw JSON string literal (with its surrounding quotes) to text.

    The literal is parsed on its own rather than being spliced into a
    wrapper object, so text that is not exactly one JSON value is rejected
    instead of being interpreted as extra object members.

    Args:
        raw_json_string_with_quotes: JSON string literal such as ``"a\\nb"``.

    Returns:
        The decoded text, or ``""`` when the input cannot be parsed or does
        not decode to a string.
    """
    try:
        decoded = json.loads(raw_json_string_with_quotes)
    except (TypeError, ValueError):
        # json.JSONDecodeError is a ValueError; TypeError covers non-text input.
        return ""
    return decoded if isinstance(decoded, str) else ""


def _get_description_for_print(entry: Any) -> Tuple[str, str]:
    """Return the description of a tool entry together with a tag describing its form.

    For a string entry the raw JSON string literal found by
    ``_extract_json_string_value`` is preferred and tagged ``"raw_json"``
    (quotes and escapes are left as written). If no such literal is found,
    the entry is parsed as JSON and the ``description`` member is returned
    tagged ``"rendered"``. Dict entries are read directly.

    Entries that cannot be parsed, that parse to something other than a JSON
    object, or that are neither ``str`` nor ``dict`` yield ``("", "rendered")``.

    Args:
        entry: Tool entry, either a JSON string or an already-parsed dict.

    Returns:
        ``(description, form)`` where ``form`` is ``"raw_json"`` or ``"rendered"``.
    """
    if isinstance(entry, str):
        raw = _extract_json_string_value(entry, "description")
        if raw is not None:
            return raw, "raw_json"
        try:
            obj = json.loads(entry)
        except json.JSONDecodeError:
            return "", "rendered"
        # Valid JSON need not be an object; lists and scalars have no .get().
        if not isinstance(obj, dict):
            return "", "rendered"
        return obj.get("description") or "", "rendered"
    if isinstance(entry, dict):
        return entry.get("description") or "", "rendered"
    return "", "rendered"


def _load_tool(entry: Any) -> Tuple[Optional[Dict[str, Any]], str]:
    """Normalise a tool entry to a dict and report how it was stored.

    Args:
        entry: Tool entry; a dict, a JSON string, or anything else.

    Returns:
        ``(tool, kind)`` where ``kind`` is ``"dict"`` for a dict entry,
        ``"json_str"`` for a string that parses to a JSON object, and
        ``"other"`` (with ``tool`` set to ``None``) for everything else,
        including strings that are not valid JSON and strings whose JSON
        value is not an object (list, string, number, ...).
    """
    if isinstance(entry, dict):
        return entry, "dict"
    if isinstance(entry, str):
        try:
            parsed = json.loads(entry)
        except json.JSONDecodeError:
            return None, "other"
        # Only JSON objects are tools; callers rely on dict methods.
        if isinstance(parsed, dict):
            return parsed, "json_str"
    return None, "other"


def _skip_ws(s: str, i: int) -> int:
    """Return the index of the first character at or after ``i`` that is not JSON whitespace.

    JSON whitespace here means space, tab, carriage return and line feed.
    When only whitespace remains, ``len(s)`` is returned; when ``i`` is
    already past the end, ``i`` is returned unchanged.
    """
    for pos in range(i, len(s)):
        if s[pos] not in " \t\r\n":
            return pos
    return max(i, len(s))


def _scan_string_span(s: str, i: int) -> Optional[Tuple[int, int]]:
    """Locate the JSON string literal that starts at index ``i``.

    Returns:
        ``(start, end)`` with ``end`` one past the closing quote, or ``None``
        when ``s[i]`` is not a double quote or the literal is unterminated.
        A backslash escapes exactly the next character; escape sequences are
        not validated.
    """
    end = len(s)
    if i >= end or s[i] != '"':
        return None

    pos = i + 1
    while pos < end:
        ch = s[pos]
        if ch == "\\":
            # Jump over the backslash and the character it escapes.
            pos += 2
        elif ch == '"':
            return (i, pos + 1)
        else:
            pos += 1
    return None


def _scan_number_span(s: str, i: int) -> Optional[Tuple[int, int]]:
    """Locate the JSON-style number that starts at index ``i``.

    Accepted shape: optional ``-``, then either a single ``0`` or a run of
    digits, then an optional fraction (``.`` plus at least one digit), then
    an optional exponent (``e``/``E``, optional sign, at least one digit).

    Returns:
        ``(start, end)`` with ``end`` one past the last consumed character,
        or ``None`` when no number starts at ``i`` or a started fraction or
        exponent has no digits. What follows the number is not checked here.
    """
    n = len(s)

    def after_digits(pos: int) -> Optional[int]:
        """Return the index just past a non-empty digit run at ``pos``, else None."""
        if pos >= n or not s[pos].isdigit():
            return None
        while pos < n and s[pos].isdigit():
            pos += 1
        return pos

    # Optional sign.
    j = i + 1 if (i < n and s[i] == "-") else i
    if j >= n:
        return None

    # Integer part: a lone "0" or a digit run.
    if s[j] == "0":
        j += 1
    else:
        j = after_digits(j)
        if j is None:
            return None

    # Optional fraction.
    if j < n and s[j] == ".":
        j = after_digits(j + 1)
        if j is None:
            return None

    # Optional exponent.
    if j < n and s[j] in "eE":
        j += 1
        if j < n and s[j] in "+-":
            j += 1
        j = after_digits(j)
        if j is None:
            return None

    return (i, j)


def _scan_literal_span(s: str, i: int) -> Optional[Tuple[int, int]]:
    """Locate a ``true``/``false``/``null`` literal that starts at index ``i``.

    Returns:
        ``(start, end)`` for the literal, or ``None`` when none of the three
        starts at ``i``. What follows the literal is not checked here.
    """
    matched = next((lit for lit in ("true", "false", "null") if s.startswith(lit, i)), None)
    if matched is None:
        return None
    return (i, i + len(matched))


def _scan_container_span(s: str, i: int) -> Optional[Tuple[int, int]]:
    """Locate the JSON object or array that starts at index ``i``.

    Brackets are matched with a stack of expected closers; brackets that
    appear inside string literals are ignored (backslash escapes inside
    strings are honoured). Nothing else about the contents is validated.

    Returns:
        ``(start, end)`` with ``end`` one past the matching closer, or
        ``None`` when ``s[i]`` is not ``{``/``[``, a closer does not match
        the innermost opener, or the text ends before the container closes.
    """
    if i >= len(s) or s[i] not in "{[":
        return None

    closer_for = {"{": "}", "[": "]"}
    pending: List[str] = [closer_for[s[i]]]  # innermost expected closer is last
    inside_string = False
    escaped = False

    for pos in range(i + 1, len(s)):
        ch = s[pos]

        if inside_string:
            if escaped:
                escaped = False
            elif ch == "\\":
                escaped = True
            elif ch == '"':
                inside_string = False
        elif ch == '"':
            inside_string = True
        elif ch in closer_for:
            pending.append(closer_for[ch])
        elif ch in "}]":
            if ch != pending[-1]:
                return None
            pending.pop()
            if not pending:
                return (i, pos + 1)

    return None


def _is_value_delim(c: str) -> bool:
    """Return True when ``c`` is a character that may directly follow a JSON value.

    The accepted characters are ``,``, ``}`` and ``]``. Membership is tested
    against a tuple rather than a string so that only these exact
    one-character strings match; with a string on the right-hand side,
    ``in`` is a substring test and would also accept ``""`` or ``",}"``.
    """
    return c in (",", "}", "]")


def _scan_value_span(s: str, i: int) -> Optional[Tuple[int, int]]:
    """Locate the JSON value that starts at or after index ``i``.

    Leading whitespace is skipped. Strings and containers are delegated to
    their scanners and returned as-is. Numbers and literals must in addition
    be followed (after optional whitespace) by end of text or a value
    delimiter, otherwise ``None`` is returned.

    Returns:
        ``(start, end)`` of the value (whitespace excluded), or ``None`` when
        no well-formed value is found.
    """
    length = len(s)
    start = _skip_ws(s, i)
    if start >= length:
        return None

    first = s[start]
    if first == '"':
        return _scan_string_span(s, start)
    if first in "{[":
        return _scan_container_span(s, start)

    # Bare token: a number when it starts like one, otherwise a literal.
    scanner = _scan_number_span if (first == "-" or first.isdigit()) else _scan_literal_span
    span = scanner(s, start)
    if not span:
        return None

    # A bare token must end at a delimiter (or at end of text).
    after = _skip_ws(s, span[1])
    if after >= length or _is_value_delim(s[after]):
        return span
    return None


def _replace_top_level_string_field_in_raw_object(raw_json_obj: str, key: str, new_value: str) -> Tuple[str, bool, str]:
    """Replace the string value of a top-level member in raw JSON object text.

    Only the span of the value literal is rewritten; all other characters of
    the input (whitespace, member order, other values) are kept as they are.
    The first top-level member whose decoded name equals ``key`` is used.
    After patching, the result is parsed again and accepted only if it is an
    object whose ``key`` member equals ``new_value``.

    Args:
        raw_json_obj: Text of a JSON object.
        key: Name of the top-level member to patch.
        new_value: Replacement text; it is encoded with ``json.dumps``
            (``ensure_ascii=False``).

    Returns:
        ``(text, ok, reason)``. On success ``text`` is the patched JSON,
        ``ok`` is True and ``reason`` is ``"ok"``. On failure the original
        text is returned with ``ok`` False and ``reason`` one of
        ``"not_object"``, ``"key_not_found"``, ``"invalid_key_string"``,
        ``"missing_colon"``, ``"cannot_scan_value"``, ``"value_not_string"``,
        ``"json_load_failed_after_patch"``, ``"validation_failed_after_patch"``.
    """
    s = raw_json_obj
    n = len(s)

    def fail(reason: str) -> Tuple[str, bool, str]:
        return raw_json_obj, False, reason

    i = _skip_ws(s, 0)
    if i >= n or s[i] != "{":
        return fail("not_object")
    i += 1

    while True:
        # --- member name, or the end of the object ---
        i = _skip_ws(s, i)
        if i >= n:
            return fail("cannot_scan_value")
        if s[i] == "}":
            return fail("key_not_found")
        if s[i] != '"':
            return fail("invalid_key_string")

        key_span = _scan_string_span(s, i)
        if not key_span:
            return fail("invalid_key_string")
        k_start, k_end = key_span
        try:
            # Decode so that escaped names compare equal to ``key``.
            key_decoded = json.loads(s[k_start:k_end])
        except ValueError:
            return fail("invalid_key_string")

        # --- colon and value ---
        i = _skip_ws(s, k_end)
        if i >= n or s[i] != ":":
            return fail("missing_colon")

        v_span = _scan_value_span(s, i + 1)
        if not v_span:
            return fail("cannot_scan_value")
        v_start, v_end = v_span

        if key_decoded == key:
            if v_start >= n or s[v_start] != '"':
                return fail("value_not_string")

            replacement_literal = json.dumps(new_value, ensure_ascii=False)
            patched = s[:v_start] + replacement_literal + s[v_end:]

            # Never hand back text that does not round-trip to the new value.
            try:
                obj = json.loads(patched)
            except (ValueError, RecursionError):
                return fail("json_load_failed_after_patch")

            if isinstance(obj, dict) and obj.get(key) == new_value:
                return patched, True, "ok"
            return fail("validation_failed_after_patch")

        # --- separator after a member that did not match ---
        i = _skip_ws(s, v_end)
        if i >= n:
            return fail("cannot_scan_value")
        if s[i] == ",":
            i += 1
            continue
        if s[i] == "}":
            return fail("key_not_found")
        return fail("cannot_scan_value")


# ========= IDs =========
def _tool_fingerprint_excluding_description(tool_obj: Dict[str, Any]) -> str:
    """Return a truncated SHA-256 hex digest of the tool with its ``description`` removed.

    The remaining members are serialised with ``_canonical_json`` and the
    digest is cut to ``HASH_HEX_LEN`` hex characters, so the description text
    does not contribute to the fingerprint.
    """
    without_description = dict(tool_obj)
    without_description.pop("description", None)
    encoded = _canonical_json(without_description).encode("utf-8")
    return hashlib.sha256(encoded).hexdigest()[:HASH_HEX_LEN]


def _record_id(record_obj: Dict[str, Any], tool_field: str) -> str:
    """Return a truncated SHA-256 hex digest identifying a record.

    A shallow copy of the record is hashed. When ``record_obj[tool_field]``
    is a list, each tool is first replaced by a canonical form: parsed tools
    lose their ``description`` member, and entries that do not yield a dict
    are kept as ``{"_unparsed": entry, "_kind": kind}``. The result is
    serialised with ``_canonical_json`` and the digest is cut to
    ``HASH_HEX_LEN`` hex characters.

    Args:
        record_obj: Record to identify; it is not modified.
        tool_field: Name of the member that holds the list of tools.
    """
    rec = dict(record_obj)
    tools = rec.get(tool_field)
    if isinstance(tools, list):
        canon_tools: List[Any] = []
        for entry in tools:
            tool_obj, kind = _load_tool(entry)
            # Guard on dict rather than on None: a JSON string entry may
            # decode to a list or scalar, which has no .items().
            if isinstance(tool_obj, dict):
                canon_tools.append({k: v for k, v in tool_obj.items() if k != "description"})
            else:
                canon_tools.append({"_unparsed": entry, "_kind": kind})
        rec[tool_field] = canon_tools
    payload = _canonical_json(rec)
    return hashlib.sha256(payload.encode("utf-8")).hexdigest()[:HASH_HEX_LEN]


def _tool_instance_key(record_id: str, tool_index: int, tool_obj: Dict[str, Any]) -> str:
    """Build the key for one tool occurrence: ``rec:<record_id>:t<tool_index>:<fingerprint>``.

    The fingerprint comes from ``_tool_fingerprint_excluding_description``.
    """
    return f"rec:{record_id}:t{tool_index}:{_tool_fingerprint_excluding_description(tool_obj)}"


# ========= Audit (single file, resumable) =========
def _audit_identity(
    dataset_path: Path,
    *,
    mode_key: str,
    model: str,
    tool_field: str,
    num_candidates: int,
    semantic_cfg: Optional[Dict[str, Any]],
) -> str:
    """Return a 12-hex-character identity for an audit configuration.

    The identity is the start of the SHA-256 digest of a ``|``-separated
    string built from the resolved dataset path, mode, model, tool field,
    candidate count, the three policy-name constants, and the
    embedding/verifier switches and model names read from ``semantic_cfg``.
    """
    cfg = semantic_cfg or {}
    emb_on = bool(cfg.get("enable_embeddings", False))
    emb_model = str(cfg.get("embedding_model") or "")
    ver_on = bool(cfg.get("enable_verifier", False))
    ver_model = str(cfg.get("verifier_model") or "")

    fields = [
        f"{dataset_path.resolve()}",
        f"{mode_key}",
        f"{model}",
        f"{tool_field}",
        f"K={int(num_candidates)}",
        f"{RISK_POLICY_NAME}",
        f"{LOGIC_TOKEN_POLICY_NAME}",
        f"{SEMANTIC_POLICY_NAME}",
        f"emb={emb_on}:{emb_model}",
        f"ver={ver_on}:{ver_model}",
    ]
    digest = hashlib.sha256("|".join(fields).encode("utf-8"))
    return digest.hexdigest()[:12]


def _audit_file_path(
    dataset_path: Path,
    *,
    audit_dir: Path,
    mode_key: str,
    model: str,
    tool_field: str,
    num_candidates: int,
    semantic_cfg: Optional[Dict[str, Any]],
) -> Path:
    """Return the audit-log path for a dataset and configuration.

    The file lives in ``audit_dir/<audit_key>/`` where ``audit_key`` comes
    from ``_audit_identity``. The file name combines the dataset stem, the
    audit key, the mode, a sanitised model name and the candidate count.
    Nothing is created on disk.
    """
    k = int(num_candidates)
    audit_key = _audit_identity(
        dataset_path,
        mode_key=mode_key,
        model=model,
        tool_field=tool_field,
        num_candidates=k,
        semantic_cfg=semantic_cfg,
    )

    def keep_or_underscore(ch: str) -> str:
        # Alphanumerics plus "-", "_" and "." are kept; anything else becomes "_".
        return ch if (ch.isalnum() or ch in "-_.") else "_"

    safe_model = "".join(map(keep_or_underscore, model))
    filename = f"{dataset_path.stem}.{audit_key}.{mode_key}.{safe_model}.K{k}.audit.jsonl"
    return audit_dir / audit_key / filename


def _append_audit_event(audit_file: Path, event: Dict[str, Any]) -> None:
    """Append ``event`` to the audit file as one JSON line.

    Missing parent directories are created. The event is passed through
    ``_json_safe`` before being serialised with ``ensure_ascii=False``.
    """
    audit_file.parent.mkdir(parents=True, exist_ok=True)
    serializable = _json_safe(event)
    with audit_file.open("a", encoding="utf-8") as sink:
        sink.write(f"{json.dumps(serializable, ensure_ascii=False)}\n")


def _load_resume_state(
    audit_file: Path,
) -> Tuple[
    Dict[str, Tuple[str, Optional[str], Optional[Dict[str, Any]]]],
    Dict[str, int],
    Dict[str, Optional[str]],
    Optional[Dict[str, Any]],
]:
    """Rebuild resume state by replaying the audit log.

    Lines that are blank, not valid JSON, or not JSON objects are ignored.

    Returns:
        A 4-tuple ``(decisions, regen_counts, last_rejected_text, prior_run_start)``:

        * ``decisions`` maps instance key to ``(status, final_description,
          llm_bundle)`` of the last ``decision`` event for that key.
        * ``regen_counts`` maps instance key to the highest positive
          ``generation_round`` seen in ``regenerate`` events.
        * ``last_rejected_text`` maps instance key to the
          ``last_generated_text`` of the ``regenerate`` event with the
          highest round (later events win ties); ``None`` when that field is
          not a string.
        * ``prior_run_start`` is the first ``run_start`` event, or ``None``.

        All four are empty/``None`` when the file does not exist.
    """
    decisions: Dict[str, Tuple[str, Optional[str], Optional[Dict[str, Any]]]] = {}
    regen_counts: Dict[str, int] = {}
    last_rejected_text: Dict[str, Optional[str]] = {}
    prior_run_start: Optional[Dict[str, Any]] = None

    if not audit_file.exists():
        return decisions, regen_counts, last_rejected_text, None

    # Highest round per instance for which a rejected text was recorded.
    newest_round: Dict[str, int] = {}

    def parsed_events(handle: Any) -> Any:
        """Yield every line of ``handle`` that decodes to a JSON object."""
        for raw_line in handle:
            stripped = raw_line.strip()
            if not stripped:
                continue
            try:
                event = json.loads(stripped)
            except Exception:
                continue
            if isinstance(event, dict):
                yield event

    with audit_file.open("r", encoding="utf-8") as f:
        for ev in parsed_events(f):
            kind = ev.get("event_type")
            instance = ev.get("instance_key")

            if kind == "run_start":
                if prior_run_start is None:
                    prior_run_start = ev

            elif kind == "regenerate":
                round_no = ev.get("generation_round")
                if not (isinstance(instance, str) and isinstance(round_no, int) and round_no >= 0):
                    continue
                if round_no > regen_counts.get(instance, 0):
                    regen_counts[instance] = round_no
                if round_no >= newest_round.get(instance, -1):
                    newest_round[instance] = round_no
                    rejected = ev.get("last_generated_text")
                    last_rejected_text[instance] = rejected if isinstance(rejected, str) else None

            elif kind == "decision":
                status = ev.get("status")
                if isinstance(instance, str) and isinstance(status, str):
                    final_desc = ev.get("final_description")
                    llm_bundle = ev.get("llm_bundle")
                    decisions[instance] = (
                        status,
                        final_desc if isinstance(final_desc, str) else None,
                        llm_bundle if isinstance(llm_bundle, dict) else None,
                    )

    return decisions, regen_counts, last_rejected_text, prior_run_start


# ========= LLM helpers =========
def _sanitize_llm_output(text: str) -> str:
    """Strip common wrappers from a model reply and return the bare description.

    Steps, in order:

    1. Surrounding whitespace is removed (a falsy input becomes ``""``).
    2. A reply that is a JSON object with a string ``description`` member is
       replaced by that member.
    3. A reply that is a complete JSON string literal (``"..."``) is decoded,
       so escapes such as ``\\"`` or ``\\n`` become the characters they stand
       for instead of staying in the text as backslash sequences.
    4. Otherwise one pair of surrounding double or single quotes is removed.

    Returns:
        The cleaned, stripped text.
    """
    t = (text or "").strip()

    if t.startswith("{") and "description" in t:
        try:
            obj = json.loads(t)
        except (ValueError, RecursionError):
            obj = None  # not JSON after all; treat the reply as plain text
        if isinstance(obj, dict) and isinstance(obj.get("description"), str):
            t = obj["description"].strip()

    if len(t) >= 2 and t[0] == '"' and t[-1] == '"':
        try:
            decoded = json.loads(t)
        except (ValueError, RecursionError):
            decoded = None  # e.g. unescaped inner quotes; fall back to slicing
        if isinstance(decoded, str):
            return decoded.strip()

    if (t.startswith('"') and t.endswith('"')) or (t.startswith("'") and t.endswith("'")):
        t = t[1:-1].strip()
    return t


def _llm_chat_completion(
    *,
    client: OpenAI,
    model: str,
    messages: List[Dict[str, str]],
    temperature: float,
    max_tokens: int,
    seed: Optional[int],
) -> Tuple[str, Dict[str, Any]]:
    """Run one chat completion, falling back across parameter variants.

    The request is first sent with ``max_completion_tokens`` (and the seed,
    if one was given). If that fails and the error text looks like a seed
    rejection, it is retried once without the seed. If no result was
    obtained, the same sequence is repeated with ``max_tokens`` instead;
    a failure there is propagated to the caller.

    Returns:
        ``(text, meta)`` where ``text`` is the stripped content of the first
        choice (``""`` when the content is empty) and ``meta`` records the
        requested seed, whether it was applied, the last seed-related error
        text, the finish reason, the usage object, the requested token limit
        and the name of the limit parameter that was used.

    Raises:
        Exception: Whatever the client raised on the final attempt.
    """
    token_budget = int(max_tokens)
    seed_given = seed is not None

    meta: Dict[str, Any] = {
        "seed_requested": seed,
        "seed_applied": False,
        "seed_error": None,
        "finish_reason": None,
        "usage": None,
        "max_tokens_requested": token_budget,
        "max_param_used": None,
    }
    common_args: Dict[str, Any] = {"model": model, "messages": messages, "temperature": temperature}

    def call_once(token_param: str, with_seed: bool) -> Tuple[str, Dict[str, Any]]:
        """Send one request using ``token_param`` as the name of the length limit."""
        request = dict(common_args)
        request[token_param] = token_budget
        sends_seed = with_seed and seed_given
        if sends_seed:
            request["seed"] = int(seed)

        response = client.chat.completions.create(**request)
        first_choice = response.choices[0]
        content = (first_choice.message.content or "").strip()

        # Snapshot taken after the call so that an earlier seed_error is included.
        snapshot = dict(meta)
        snapshot["max_param_used"] = token_param
        snapshot["finish_reason"] = getattr(first_choice, "finish_reason", None)
        snapshot["usage"] = getattr(response, "usage", None)
        snapshot["seed_applied"] = bool(sends_seed)
        return content, snapshot

    def seed_rejected(exc: Exception) -> bool:
        """Heuristic on the error text: mentions the seed and calls it unknown/unsupported/invalid."""
        if not seed_given:
            return False
        message = str(exc).lower()
        return "seed" in message and any(word in message for word in ("unknown", "unsupported", "invalid"))

    try:
        return call_once("max_completion_tokens", True)
    except Exception as first_error:
        if seed_rejected(first_error):
            meta["seed_error"] = str(first_error)
            try:
                return call_once("max_completion_tokens", False)
            except Exception:
                # Fall through to the max_tokens variant below.
                pass
        try:
            return call_once("max_tokens", True)
        except Exception as second_error:
            if not seed_rejected(second_error):
                raise
            meta["seed_error"] = str(second_error)
            return call_once("max_tokens", False)


def generate_description_via_llm(
    *,
    client: OpenAI,
    tool_name: str,
    base_description: str,
    model: str,
    seed: Optional[int],
    max_tokens: int,
    retry_on_length: bool,
    retry_max_tokens: int,
    mode_key: str,
    style_spec: Dict[str, Any],
    regen_index: int = 0,
    previous_rewrite: Optional[str] = None,
    length_policy: Optional[Dict[str, Any]] = None,
) -> Tuple[str, Dict[str, Any]]:
    """Ask the model for one rewrite of a tool description.

    The user prompt contains the tool name, the base description and the
    rewrite mode. For ``style_concise`` with a ``length_policy`` dict it
    also carries length guidance (soft character target when applied,
    otherwise a "do not exceed the base length" instruction). When
    ``regen_index`` is positive, a regeneration notice, the style's
    diversity instruction and the (possibly truncated) previous rewrite are
    appended.

    If the first completion stops with ``finish_reason == "length"`` and
    ``retry_on_length`` is set with ``retry_max_tokens > max_tokens``, the
    same prompt is sent once more with the larger limit. A non-empty retry
    is chosen when it finished without hitting the limit, or when it is at
    least as long as the truncated first result.

    Args:
        client: Chat-completions client passed to ``_llm_chat_completion``.
        tool_name: Name shown to the model.
        base_description: Description to rewrite.
        model: Model identifier.
        seed: Optional sampling seed.
        max_tokens: Token limit of the first attempt.
        retry_on_length: Whether a truncated first attempt may be retried.
        retry_max_tokens: Token limit of the retry.
        mode_key: Rewrite mode; ``"style_concise"`` enables length guidance.
        style_spec: Style settings; ``system`` is required, while
            ``regen_diversity_instruction`` and ``max_prev_rewrite_chars``
            are optional.
        regen_index: Values above 0 mark the call as a regeneration.
        previous_rewrite: Earlier rewrite whose wording should be avoided.
        length_policy: Optional dict with a ``concise_soft_target`` entry.

    Returns:
        ``(text, bundle)`` where ``text`` is the sanitised rewrite and
        ``bundle`` records which attempt it came from (``proposal_origin``),
        the raw replies and the call metadata of both attempts, the mode and
        the length policy.
    """
    system = str(style_spec["system"])
    regen_instr = str(style_spec.get("regen_diversity_instruction") or "")
    max_prev = int(style_spec.get("max_prev_rewrite_chars") or DEFAULT_MAX_PREV_REWRITE_CHARS)

    user_parts: List[str] = [
        f"Tool name: {tool_name}",
        "Base description:",
        base_description.strip() or "(empty)",
        "",
        f"Rewrite in '{mode_key}' under the constraints.",
    ]

    if mode_key == "style_concise" and isinstance(length_policy, dict):
        ct = length_policy.get("concise_soft_target") if isinstance(length_policy.get("concise_soft_target"), dict) else {}
        applied = bool(ct.get("applied", False))
        target_chars = ct.get("target_chars") if isinstance(ct.get("target_chars"), int) else None
        ratio = ct.get("target_ratio")
        user_parts.append("")
        if applied and isinstance(target_chars, int):
            # Round instead of truncating: 0.58 * 100 is 57.999... in binary
            # floating point and must be reported as 58, not 57.
            pct = int(round(float(ratio) * 100)) if isinstance(ratio, (int, float)) else 70
            user_parts.append(f"Length guidance (soft target): <= {target_chars} characters (~{pct}% of base).")
            user_parts.append(
                "Exceeding the target is permitted only if strictly necessary to preserve meaning; "
                "no explicitly stated details may be omitted."
            )
        else:
            user_parts.append(
                "Length guidance: the base description is short or cannot be shortened safely; "
                "the rewrite must not exceed the base length and must remain as brief as possible."
            )

    if regen_index > 0:
        user_parts.append("")
        user_parts.append(f"Regeneration request: {regen_index}")
        if regen_instr:
            user_parts.append(regen_instr)
        if previous_rewrite and previous_rewrite.strip():
            prev = previous_rewrite.strip()
            if len(prev) > max_prev:
                prev = prev[:max_prev].rstrip()
            user_parts.append("")
            user_parts.append("Previous rewrite (wording must not be reused):")
            user_parts.append(prev)

    user = "\n".join(user_parts)
    # Both attempts send exactly the same conversation.
    messages = [{"role": "system", "content": system}, {"role": "user", "content": user}]

    def hit_length_limit(meta: Dict[str, Any]) -> bool:
        return str(meta.get("finish_reason") or "").lower() == "length"

    raw1, meta1 = _llm_chat_completion(
        client=client,
        model=model,
        messages=messages,
        temperature=0.0,
        max_tokens=max_tokens,
        seed=seed,
    )
    san1 = _sanitize_llm_output(raw1)

    if not hit_length_limit(meta1):
        return san1, {
            "proposal_origin": "primary",
            "proposal_sanitized_final": san1,
            "llm_text_raw_primary": raw1,
            "llm_text_raw_retry": None,
            "primary": meta1,
            "retry": None,
            "mode_key": mode_key,
            "length_policy": length_policy,
        }

    # The first reply was cut off; optionally try again with a larger limit.
    raw2 = None
    meta2 = None
    best_san = san1
    origin = "primary"

    if retry_on_length and retry_max_tokens > max_tokens:
        raw2, meta2 = _llm_chat_completion(
            client=client,
            model=model,
            messages=messages,
            temperature=0.0,
            max_tokens=int(retry_max_tokens),
            seed=seed,
        )
        san2 = _sanitize_llm_output(raw2)
        # A retry that finished on its own is complete and therefore beats a
        # truncated first reply even when it is shorter; if the retry was
        # truncated as well, keep whichever text is longer.
        retry_complete = not hit_length_limit(meta2)
        if san2 and (retry_complete or len(san2) >= len(best_san)):
            best_san = san2
            origin = "retry"

    return best_san, {
        "proposal_origin": origin,
        "proposal_sanitized_final": best_san,
        "llm_text_raw_primary": raw1,
        "llm_text_raw_retry": raw2,
        "primary": meta1,
        "retry": meta2,
        "mode_key": mode_key,
        "length_policy": length_policy,
    }


def _print_perturbation_context(
    *,
    show: bool,
    tool_name: str,
    candidate_i_1based: int,
    k: int,
    generation_round: int,
    regen_index: int,
    seed: Optional[int],
    max_tokens: int,
    mode_key: str,
    style_spec: Dict[str, Any],
    length_policy: Optional[Dict[str, Any]],
    prev_hint: Optional[str],
) -> None:
    """
    Print the "perturbation context" of one candidate generation to stdout.

    Nothing is printed when ``show`` is false. Otherwise a header line with
    the candidate position, round, regeneration index, mode, seed and token
    limit is written, followed (when applicable) by the concise-mode length
    guidance, the style's diversity instruction (only for a positive round
    or regeneration index) and a short fingerprint of the previous-rewrite
    hint. Long texts are collapsed to one line and clipped to 220 characters.
    """
    if not show:
        return

    def one_line(text: str, limit: int = 220) -> str:
        """Collapse whitespace runs and clip to ``limit`` characters with an ellipsis."""
        flat = " ".join(text.split())
        if len(flat) > limit:
            flat = flat[: limit - 1] + "…"
        return flat

    diversity = str(style_spec.get("regen_diversity_instruction") or "").strip()

    # Soft-target settings are only read from a well-formed policy dict.
    soft = length_policy.get("concise_soft_target") if isinstance(length_policy, dict) else None
    if isinstance(soft, dict):
        soft_applied = bool(soft.get("applied", False))
        soft_target = soft.get("target_chars") if isinstance(soft.get("target_chars"), int) else None
        soft_reason = soft.get("reason")
    else:
        soft_applied = False
        soft_target = None
        soft_reason = None

    print(
        f"\n  [gen] {tool_name} | cand {candidate_i_1based}/{int(k)} | round={int(generation_round)} | "
        f"regen_index={int(regen_index)} | mode={mode_key} | seed={seed} | max_tokens={int(max_tokens)}"
    )

    if mode_key == "style_concise":
        if soft_applied and isinstance(soft_target, int):
            print(f"      length_guidance: soft_target<= {int(soft_target)} chars (applied)")
        else:
            print(f"      length_guidance: soft_target not applied (reason={soft_reason})")

    is_regeneration = int(generation_round) > 0 or int(regen_index) > 0
    if is_regeneration and diversity:
        print(f"      diversity_instruction: {one_line(diversity)}")

    if prev_hint and prev_hint.strip():
        hint = prev_hint.strip()
        print(
            f"      previous_rewrite_hint: len={len(hint)} sha={_sha256_text(hint)[:12]} snippet='{one_line(hint)}'"
        )


def generate_k_candidates_with_stats(
    *,
    client: OpenAI,
    tool_name: str,
    base_description: str,
    model: str,
    seed: Optional[int],
    max_tokens: int,
    retry_on_length: bool,
    retry_max_tokens: int,
    mode_key: str,
    style_spec: Dict[str, Any],
    generation_round: int,
    k: int,
    previous_rewrite_hint: Optional[str],
    min_sleep_sec_between_calls: float,
    length_policy: Optional[Dict[str, Any]],
    semantic_cfg: Optional[Dict[str, Any]],
    show_perturbations: bool,
) -> Tuple[List[Dict[str, Any]], Optional[str]]:
    """Generate ``k`` rewrite candidates one after another and attach statistics to each.

    At least one candidate is always attempted. Candidate ``i`` (0-based) is
    requested with ``regen_index = generation_round * 1000 + i``, and the most
    recent non-empty candidate text (initially ``previous_rewrite_hint``) is
    passed along as the previous rewrite. A failed generation is recorded
    with its error message and empty text instead of aborting the batch.
    Statistics are computed for every non-empty text; a text identical to an
    earlier one in the same batch is marked as a duplicate. After each
    candidate the function sleeps ``min_sleep_sec_between_calls`` seconds when
    that value is positive.

    Returns:
        ``(candidates, last_generated_text)``. Each candidate is a dict with
        ``candidate_index`` (1-based), ``text``, ``error``, ``bundle``,
        ``duplicate`` and ``stats``. ``last_generated_text`` is the last
        non-empty text produced, or ``None``.
    """
    base_desc = (base_description or "").strip()
    k_eff = max(1, int(k))
    round_no = int(generation_round)

    results: List[Dict[str, Any]] = []
    seen_texts: set = set()
    last_generated_text: Optional[str] = None

    # Text the next request is told not to repeat.
    hint: Optional[str] = None
    if isinstance(previous_rewrite_hint, str) and previous_rewrite_hint.strip():
        hint = previous_rewrite_hint.strip()

    for position in range(1, k_eff + 1):
        regen_index = round_no * 1000 + (position - 1)
        text = ""
        bundle: Optional[Dict[str, Any]] = None
        err: Optional[str] = None

        # Feb 3 patch: the perturbation context is shown *before* the LLM call.
        _print_perturbation_context(
            show=bool(show_perturbations),
            tool_name=tool_name,
            candidate_i_1based=position,
            k=k_eff,
            generation_round=round_no,
            regen_index=int(regen_index),
            seed=seed,
            max_tokens=int(max_tokens),
            mode_key=mode_key,
            style_spec=style_spec,
            length_policy=length_policy,
            prev_hint=hint,
        )

        try:
            text, bundle = generate_description_via_llm(
                client=client,
                tool_name=tool_name,
                base_description=base_desc,
                model=model,
                seed=seed,
                max_tokens=max_tokens,
                retry_on_length=retry_on_length,
                retry_max_tokens=retry_max_tokens,
                mode_key=mode_key,
                style_spec=style_spec,
                regen_index=regen_index,
                previous_rewrite=hint,
                length_policy=length_policy,
            )
            text = (text or "").strip()
            if text:
                last_generated_text = text
        except Exception as exc:
            # One failed call must not lose the candidates already collected.
            err = str(exc)
            text = ""

        duplicate = bool(text) and text in seen_texts
        if text and not duplicate:
            seen_texts.add(text)

        stats = None
        if text:
            stats = compute_full_candidate_stats(
                base_text=base_desc,
                cand_text=text,
                mode_key=mode_key,
                length_policy=length_policy,
                client=client,
                semantic_cfg=semantic_cfg,
            )

        results.append(
            {
                "candidate_index": position,
                "text": text,
                "error": err,
                "bundle": bundle,
                "duplicate": duplicate,
                "stats": stats,
            }
        )

        if text:
            hint = text

        if min_sleep_sec_between_calls > 0:
            time.sleep(float(min_sleep_sec_between_calls))

    return results, last_generated_text


# ========= IO =========
def make_working_copy(input_jsonl: str, output_jsonl: str, *, overwrite: bool = False) -> str:
    """
    Ensure a working copy of ``input_jsonl`` exists at ``output_jsonl``.

    The source path must exist, otherwise FileNotFoundError is raised.
    A destination that already exists is left untouched unless ``overwrite``
    is true. When a copy is made, missing parent directories of the
    destination are created first and the file is copied with shutil.copy2.

    Returns the destination path as a string in every non-error case.
    """
    source_path, target_path = Path(input_jsonl), Path(output_jsonl)

    if not source_path.exists():
        raise FileNotFoundError(f"File not found: {source_path}")

    # Copy when there is nothing at the destination yet, or when asked to replace it.
    needs_copy = (not target_path.exists()) or overwrite
    if needs_copy:
        target_path.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(source_path, target_path)

    return str(target_path)


def _normalize_cmd(raw: str) -> str:
    """
    Normalizes user command strings.
    Important: maps Esc (raw '\x1b' or common textual forms) to quit.
    """
    r = raw if raw is not None else ""
    # If raw-key mode is enabled, Esc is returned as '\x1b'.
    if ("\x1b" in r) or (r.strip().lower() in ("esc", "<esc>", "^[", "escape")):
        return "q"

    c = (r or "").strip().lower()
    if c in ("", "y", "yes", "ok", "okay", "si", "sì"):
        return "y"
    if c in ("r", "retry", "again", "prova", "prova ancora", "rigenera"):
        return "r"
    if c in ("e", "edit", "modifica"):
        return "e"
    if c in ("m", "manual", "mine", "mio", "mia", "custom"):
        return "m"
    if c in ("s", "skip", "salta", "pass"):
        return "s"
    if c in ("q", "quit", "exit", "esci"):
        return "q"
    return c


def _parse_candidate_choice(cmd: str, *, k: int) -> Optional[int]:
    c = (cmd or "").strip()
    if not c:
        return None
    if c.isdigit():
        v = int(c)
        if 1 <= v <= int(k):
            return v
    return None


# ========= Main interactive =========
def interactive_llm_tools_in_jsonl(
    jsonl_path: str,
    *,
    tool_field: str,
    create_backup_of_target: bool,
    llm_model: str,
    seed: Optional[int],
    max_tokens: int,
    retry_on_length: bool,
    retry_max_tokens: int,
    allow_reserialize_fallback: bool,
    min_sleep_sec_between_calls: float,
    audit_dir: str,
    mode_key: str,
    num_candidates: int,
    max_token_preview: int,
    max_token_string_len: int,
    candidate_snippet_chars: int,
    concise_target_ratio: float,
    concise_target_min_base_len: int,
    concise_target_min_chars: int,
    semantic_cfg: Optional[Dict[str, Any]],
    show_perturbations: bool,
    raw_key_input: bool,
) -> None:
    mode_key, style_spec = _resolve_style(mode_key)

    path = Path(jsonl_path)
    if not path.exists():
        raise FileNotFoundError(f"File not found: {jsonl_path}")

    client = make_gemini_client()
    audit_file = _audit_file_path(
        path,
        audit_dir=Path(audit_dir),
        mode_key=mode_key,
        model=llm_model,
        tool_field=tool_field,
        num_candidates=int(num_candidates),
        semantic_cfg=semantic_cfg,
    )

    decisions_by_instance, regen_counts, last_rejected_text_by_instance, prior_run_start = _load_resume_state(audit_file)

    tool_order: List[Dict[str, Any]] = []
    with path.open("r", encoding="utf-8") as f:
        for raw_line in f:
            line = raw_line.rstrip("\n")
            if not line.strip():
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                continue
            if not isinstance(record, dict):
                continue

            rid = _record_id(record, tool_field=tool_field)
            tools = record.get(tool_field)
            if not isinstance(tools, list):
                continue

            for tool_index, entry in enumerate(tools):
                tool_obj, kind = _load_tool(entry)
                if not tool_obj:
                    continue
                name = (tool_obj.get("name") or "").strip()
                if not name:
                    continue

                desc_print, desc_mode = _get_description_for_print(entry)
                instance_key = _tool_instance_key(rid, tool_index, tool_obj)

                tool_order.append(
                    {
                        "record_id": rid,
                        "tool_index": tool_index,
                        "tool_name": name,
                        "desc_print": desc_print,
                        "desc_mode": desc_mode,
                        "instance_key": instance_key,
                        "entry_kind": kind,
                    }
                )

    n_total = len(tool_order)
    n_prev_reviewed = len(decisions_by_instance)

    start_pos = 0
    while start_pos < n_total and tool_order[start_pos]["instance_key"] in decisions_by_instance:
        start_pos += 1

    session_id = hashlib.sha256(f"{time.time_ns()}".encode("utf-8")).hexdigest()[:12]
    before_sha = _sha256_file(path)

    length_policy_config = {
        "concise_soft_target": {
            "ratio": float(concise_target_ratio),
            "min_base_len": int(concise_target_min_base_len),
            "min_chars": int(concise_target_min_chars),
            "policy_name": "concise_soft_target_v1",
        }
    }

    semantic_cfg_norm = dict(semantic_cfg or {})
    semantic_cfg_norm["fallback_llm_model"] = llm_model

    if prior_run_start is None:
        _append_audit_event(
            audit_file,
            {
                "event_type": "run_start",
                "ts": int(time.time()),
                "session_id": session_id,
                "mode": mode_key,
                "model": llm_model,
                "seed": seed,
                "dataset_path": str(path),
                "dataset_sha256_at_session_start": before_sha,
                "tool_field": tool_field,
                "n_total_occurrences": n_total,
                "max_tokens_requested": int(max_tokens),
                "retry_on_length": bool(retry_on_length),
                "retry_max_tokens": int(retry_max_tokens),
                "allow_reserialize_fallback": bool(allow_reserialize_fallback),
                "num_candidates": int(num_candidates),
                "min_sleep_sec_between_calls": float(min_sleep_sec_between_calls),
                "stats_max_token_preview": int(max_token_preview),
                "stats_max_token_string_len": int(max_token_string_len),
                "candidate_snippet_chars": int(candidate_snippet_chars),
                "length_policy_config": length_policy_config,
                "policy_versions": {
                    "risk_policy_name": RISK_POLICY_NAME,
                    "logic_token_policy_name": LOGIC_TOKEN_POLICY_NAME,
                    "semantic_policy_name": SEMANTIC_POLICY_NAME,
                },
                "semantic_cfg": semantic_cfg_norm,
                "show_perturbations": bool(show_perturbations),
                "raw_key_input": bool(raw_key_input),
            },
        )
    else:
        _append_audit_event(
            audit_file,
            {
                "event_type": "run_resume",
                "ts": int(time.time()),
                "session_id": session_id,
                "mode": mode_key,
                "model": llm_model,
                "seed": seed,
                "dataset_path": str(path),
                "dataset_sha256_at_session_start": before_sha,
                "tool_field": tool_field,
                "n_total_occurrences": n_total,
                "n_previously_reviewed": n_prev_reviewed,
                "resume_from_index_1based": (start_pos + 1) if start_pos < n_total else (n_total + 1),
                "num_candidates": int(num_candidates),
                "candidate_snippet_chars": int(candidate_snippet_chars),
                "length_policy_config": length_policy_config,
                "policy_versions": {
                    "risk_policy_name": RISK_POLICY_NAME,
                    "logic_token_policy_name": LOGIC_TOKEN_POLICY_NAME,
                    "semantic_policy_name": SEMANTIC_POLICY_NAME,
                },
                "semantic_cfg": semantic_cfg_norm,
                "show_perturbations": bool(show_perturbations),
                "raw_key_input": bool(raw_key_input),
            },
        )

    print(f"Target: {path}")
    print(f"Mode: {mode_key}")
    print(f"Audit file (RESUMABLE): {audit_file}")
    print(f"Tool occurrences total: {n_total}")
    if start_pos < n_total:
        print(f"Resume position: [{start_pos + 1}/{n_total}] (previously reviewed: {n_prev_reviewed})")
    else:
        print(f"Resume position: completed (previously reviewed: {n_prev_reviewed})")
    print(f"LLM: {llm_model} @ {GEMINI_BASE_URL}")
    print(f"Candidates per tool: {int(num_candidates)}")
    print(f"Candidate snippet chars: {int(candidate_snippet_chars)}")
    print(f"Policies: risk={RISK_POLICY_NAME}; logic={LOGIC_TOKEN_POLICY_NAME}; semantic={SEMANTIC_POLICY_NAME}")
    print(f"Perturbation prints: {'enabled' if show_perturbations else 'disabled'}")
    print(f"Raw key input (Esc-safe): {'enabled' if raw_key_input else 'disabled'}")

    if semantic_cfg_norm.get("enable_embeddings"):
        print(
            "Embedding signal: enabled; "
            f"model='{semantic_cfg_norm.get('embedding_model')}', "
            f"low_cos_thr={semantic_cfg_norm.get('embedding_low_cosine_threshold')}"
        )
    else:
        print("Embedding signal: disabled")
    if semantic_cfg_norm.get("enable_verifier"):
        print(
            "Verifier signal: enabled; "
            f"model='{semantic_cfg_norm.get('verifier_model') or llm_model}', "
            f"max_tokens={int(semantic_cfg_norm.get('verifier_max_tokens', DEFAULT_VERIFIER_MAX_TOKENS))}"
        )
    else:
        print("Verifier signal: disabled")

    if mode_key == "style_concise":
        print(
            "Concise soft target: "
            f"ratio={float(concise_target_ratio):.2f}, "
            f"min_base_len={int(concise_target_min_base_len)}, "
            f"min_chars={int(concise_target_min_chars)}"
        )
    print(f"Max tokens: {int(max_tokens)}; retry_on_length={bool(retry_on_length)}; retry_max_tokens={int(retry_max_tokens)}")
    print(
        "Commands: ENTER/ok=accept #1, 1..K=accept candidate, r=regenerate K, "
        "e=edit candidate, m=manual, s=skip, q/Esc=quit, p<idx>=preview (e.g., p2)\n"
    )

    quit_requested = False
    resume_next_index_1based: Optional[int] = None

    max_prev = int(style_spec.get("max_prev_rewrite_chars") or DEFAULT_MAX_PREV_REWRITE_CHARS)

    session_summary: Dict[str, Any] = {
        "accepted": 0,
        "edited": 0,
        "manual": 0,
        "skipped": 0,
        "accepted_risk_labels": {"LOW": 0, "MED": 0, "HIGH": 0, "NA": 0},
        "accepted_similarity_sum": 0.0,
        "accepted_similarity_n": 0,
        "accepted_base_chars_sum": 0,
        "accepted_cand_chars_sum": 0,
        "accepted_len_ratio_sum": 0.0,
        "accepted_len_ratio_n": 0,
        "accepted_len_delta_sum": 0,
        "accepted_soft_target_applicable_n": 0,
        "accepted_within_soft_target_n": 0,
        "accepted_embedding_cos_sum": 0.0,
        "accepted_embedding_cos_n": 0,
        "accepted_entails": 0,
        "accepted_not_entails": 0,
        "accepted_entails_unknown": 0,
    }

    for pos in range(start_pos, n_total):
        item = tool_order[pos]
        idx = pos + 1

        name = item["tool_name"]
        desc_mode = item["desc_mode"]
        old_desc_print = item["desc_print"]
        instance_key = item["instance_key"]
        rid = item["record_id"]
        tool_i = item["tool_index"]

        generation_round = int(regen_counts.get(instance_key, 0))
        previous_rewrite_hint: Optional[str] = last_rejected_text_by_instance.get(instance_key)

        print("=" * 80)
        print(f"[{idx}/{n_total}] {name}")
        print(f"instance_key: {instance_key} (record_id={rid}, tool_index={tool_i})")

        if desc_mode == "raw_json":
            print("Current description RAW (escaped):")
            print(old_desc_print if old_desc_print else "(empty)")
            base_desc = _decode_raw_json_string(old_desc_print) if old_desc_print else ""
            print("\nCurrent description DECODED:")
            print(base_desc if base_desc else "(empty)")
        else:
            base_desc = old_desc_print or ""
            print("Current description:")
            print(base_desc if base_desc else "(empty)")

        base_desc = (base_desc or "").strip()
        base_len_chars = len(base_desc)

        length_policy = _make_length_policy(
            base_desc=base_desc,
            mode_key=mode_key,
            concise_ratio=float(concise_target_ratio),
            concise_min_base_len=int(concise_target_min_base_len),
            concise_min_chars=int(concise_target_min_chars),
        )

        _print_base_stats(base_desc, max_preview=int(max_token_preview), max_tok_len=int(max_token_string_len))
        if mode_key == "style_concise":
            ct = length_policy.get("concise_soft_target", {}) if isinstance(length_policy.get("concise_soft_target"), dict) else {}
            if ct.get("applied") and isinstance(ct.get("target_chars"), int):
                print(f"Concise soft target (applied): target_chars={ct.get('target_chars')} (base_len={base_len_chars})")
            else:
                print(f"Concise soft target (not applied): reason={ct.get('reason')} (base_len={base_len_chars})")

        candidates: List[Dict[str, Any]] = []
        last_generated_text: Optional[str] = None

        while True:
            if not candidates:
                try:
                    candidates, last_generated_text = generate_k_candidates_with_stats(
                        client=client,
                        tool_name=name,
                        base_description=base_desc,
                        model=llm_model,
                        seed=seed,
                        max_tokens=max_tokens,
                        retry_on_length=retry_on_length,
                        retry_max_tokens=retry_max_tokens,
                        mode_key=mode_key,
                        style_spec=style_spec,
                        generation_round=generation_round,
                        k=int(num_candidates),
                        previous_rewrite_hint=previous_rewrite_hint,
                        min_sleep_sec_between_calls=float(min_sleep_sec_between_calls),
                        length_policy=length_policy,
                        semantic_cfg=semantic_cfg_norm,
                        show_perturbations=bool(show_perturbations),
                    )
                except Exception as e:
                    print(f"\nLLM ERROR (candidate set generation): {e}")
                    raw = _read_command("Choice [m=manual, s=skip, q/Esc=quit] > ", k=int(num_candidates), raw_key_input=bool(raw_key_input))
                    cmd = _normalize_cmd(raw)
                    now = int(time.time())

                    if cmd == "q":
                        quit_requested = True
                        resume_next_index_1based = idx
                        break

                    if cmd == "s":
                        decisions_by_instance[instance_key] = ("skipped", None, None)
                        _append_audit_event(
                            audit_file,
                            {
                                "event_type": "decision",
                                "ts": now,
                                "session_id": session_id,
                                "status": "skipped",
                                "tool_name": name,
                                "instance_key": instance_key,
                                "record_id": rid,
                                "tool_index": tool_i,
                                "model": llm_model,
                                "seed": seed,
                                "mode": mode_key,
                                "base_description": base_desc,
                                "final_description": None,
                                "source": "user",
                                "note": "skip_after_llm_error",
                                "length_policy": length_policy,
                                "semantic_cfg": semantic_cfg_norm,
                            },
                        )
                        session_summary["skipped"] += 1
                        break

                    if cmd == "m":
                        manual_final = input("Manual final description (empty cancels) > ").rstrip("\n").strip()
                        status = "manual" if manual_final else "skipped"
                        decisions_by_instance[instance_key] = (status, manual_final or None, None)
                        diff_stats = compute_full_candidate_stats(
                            base_text=base_desc,
                            cand_text=manual_final,
                            mode_key=mode_key,
                            length_policy=length_policy,
                            client=client,
                            semantic_cfg=semantic_cfg_norm,
                        ) if manual_final else None

                        _append_audit_event(
                            audit_file,
                            {
                                "event_type": "decision",
                                "ts": now,
                                "session_id": session_id,
                                "status": status,
                                "tool_name": name,
                                "instance_key": instance_key,
                                "record_id": rid,
                                "tool_index": tool_i,
                                "model": llm_model,
                                "seed": seed,
                                "mode": mode_key,
                                "base_description": base_desc,
                                "final_description": manual_final or None,
                                "source": "user",
                                "note": "manual_after_llm_error",
                                "diff_stats": diff_stats,
                                "length_policy": length_policy,
                                "semantic_cfg": semantic_cfg_norm,
                            },
                        )
                        if status == "manual":
                            session_summary["manual"] += 1
                        else:
                            session_summary["skipped"] += 1
                        break

                    candidates = []
                    continue

                _append_audit_event(
                    audit_file,
                    {
                        "event_type": "candidates_generated",
                        "ts": int(time.time()),
                        "session_id": session_id,
                        "tool_name": name,
                        "instance_key": instance_key,
                        "record_id": rid,
                        "tool_index": tool_i,
                        "mode": mode_key,
                        "model": llm_model,
                        "seed": seed,
                        "generation_round": int(generation_round),
                        "num_candidates_requested": int(num_candidates),
                        "base_len_chars": int(base_len_chars),
                        "length_policy": length_policy,
                        "semantic_cfg": semantic_cfg_norm,
                        "candidates_summary": [
                            {
                                "candidate_index": c.get("candidate_index"),
                                "text_sha256": _sha256_text((c.get("text") or "").strip()),
                                "text_len": len((c.get("text") or "").strip()),
                                "error": c.get("error"),
                                "duplicate": bool(c.get("duplicate", False)),
                                "risk_label": ((c.get("stats") or {}).get("risk_label") if isinstance(c.get("stats"), dict) else None),
                                "similarity_ratio": ((c.get("stats") or {}).get("similarity_ratio") if isinstance(c.get("stats"), dict) else None),
                                "len_ratio": ((c.get("stats") or {}).get("len_ratio") if isinstance(c.get("stats"), dict) else None),
                                "len_delta_chars": ((c.get("stats") or {}).get("len_delta_chars") if isinstance(c.get("stats"), dict) else None),
                                "new_structural_count": ((c.get("stats") or {}).get("new_structural_count") if isinstance(c.get("stats"), dict) else None),
                                "missing_structural_count": ((c.get("stats") or {}).get("missing_structural_count") if isinstance(c.get("stats"), dict) else None),
                                "new_logic_count": ((c.get("stats") or {}).get("new_logic_count") if isinstance(c.get("stats"), dict) else None),
                                "missing_logic_count": ((c.get("stats") or {}).get("missing_logic_count") if isinstance(c.get("stats"), dict) else None),
                                "concise_soft_target_applied": ((c.get("stats") or {}).get("concise_soft_target_applied") if isinstance(c.get("stats"), dict) else None),
                                "concise_soft_target_chars": ((c.get("stats") or {}).get("concise_soft_target_chars") if isinstance(c.get("stats"), dict) else None),
                                "within_soft_target": ((c.get("stats") or {}).get("within_soft_target") if isinstance(c.get("stats"), dict) else None),
                                "embedding_cosine": (
                                    (((c.get("stats") or {}).get("semantic") or {}).get("embedding") or {}).get("cosine")
                                    if isinstance(c.get("stats"), dict) else None
                                ),
                                "verifier_label": (
                                    (((c.get("stats") or {}).get("semantic") or {}).get("verifier") or {}).get("label")
                                    if isinstance(c.get("stats"), dict) else None
                                ),
                            }
                            for c in candidates
                        ],
                    },
                )

            print("\nCandidates overview:")
            for c in candidates:
                _print_candidate_summary_line(
                    int(c.get("candidate_index") or 0),
                    c,
                    max_preview=int(max_token_preview),
                    max_tok_len=int(max_token_string_len),
                    snippet_chars=int(candidate_snippet_chars),
                )

            raw = _read_command(
                f"\nChoice [ENTER=accept #1, 1..{int(num_candidates)}=accept, r=regen, e=edit, m=manual, s=skip, q/Esc=quit, p<idx>=preview] > ",
                k=int(num_candidates),
                raw_key_input=bool(raw_key_input),
            )
            cmd = _normalize_cmd(raw)
            now = int(time.time())

            if cmd == "q":
                quit_requested = True
                resume_next_index_1based = idx
                break

            if cmd == "s":
                decisions_by_instance[instance_key] = ("skipped", None, None)
                _append_audit_event(
                    audit_file,
                    {
                        "event_type": "decision",
                        "ts": now,
                        "session_id": session_id,
                        "status": "skipped",
                        "tool_name": name,
                        "instance_key": instance_key,
                        "record_id": rid,
                        "tool_index": tool_i,
                        "model": llm_model,
                        "seed": seed,
                        "mode": mode_key,
                        "base_description": base_desc,
                        "final_description": None,
                        "source": "user",
                        "note": "skip",
                        "length_policy": length_policy,
                        "semantic_cfg": semantic_cfg_norm,
                    },
                )
                session_summary["skipped"] += 1
                break

            if cmd == "r":
                generation_round += 1
                regen_counts[instance_key] = int(generation_round)

                if last_generated_text and isinstance(last_generated_text, str) and last_generated_text.strip():
                    hint = last_generated_text.strip()
                    if len(hint) > max_prev:
                        hint = hint[:max_prev].rstrip()
                    previous_rewrite_hint = hint
                    last_rejected_text_by_instance[instance_key] = hint

                _append_audit_event(
                    audit_file,
                    {
                        "event_type": "regenerate",
                        "ts": now,
                        "session_id": session_id,
                        "tool_name": name,
                        "instance_key": instance_key,
                        "record_id": rid,
                        "tool_index": tool_i,
                        "mode": mode_key,
                        "generation_round": int(generation_round),
                        "last_generated_text": previous_rewrite_hint,
                        "last_generated_text_sha256": _sha256_text(previous_rewrite_hint or ""),
                        "length_policy": length_policy,
                        "semantic_cfg": semantic_cfg_norm,
                    },
                )

                candidates = []
                last_generated_text = None
                if min_sleep_sec_between_calls > 0:
                    time.sleep(float(min_sleep_sec_between_calls))
                continue

            if cmd == "m":
                manual_final = input("Manual final description (empty cancels) > ").rstrip("\n").strip()
                status = "manual" if manual_final else "skipped"
                decisions_by_instance[instance_key] = (status, manual_final or None, None)

                diff_stats = compute_full_candidate_stats(
                    base_text=base_desc,
                    cand_text=manual_final,
                    mode_key=mode_key,
                    length_policy=length_policy,
                    client=client,
                    semantic_cfg=semantic_cfg_norm,
                ) if manual_final else None

                _append_audit_event(
                    audit_file,
                    {
                        "event_type": "decision",
                        "ts": now,
                        "session_id": session_id,
                        "status": status,
                        "tool_name": name,
                        "instance_key": instance_key,
                        "record_id": rid,
                        "tool_index": tool_i,
                        "model": llm_model,
                        "seed": seed,
                        "mode": mode_key,
                        "base_description": base_desc,
                        "final_description": manual_final or None,
                        "source": "user",
                        "note": "manual_replace",
                        "diff_stats": diff_stats,
                        "length_policy": length_policy,
                        "semantic_cfg": semantic_cfg_norm,
                    },
                )
                if status == "manual":
                    session_summary["manual"] += 1
                else:
                    session_summary["skipped"] += 1
                break

            if cmd == "e":
                raw_idx = input(f"Candidate index to edit [1..{int(num_candidates)}] (empty=1) > ").strip()
                chosen_i = 1
                if raw_idx and raw_idx.isdigit():
                    chosen_i = int(raw_idx)
                if not (1 <= chosen_i <= int(num_candidates)):
                    print("Invalid candidate index.")
                    continue

                cand = candidates[chosen_i - 1] if (chosen_i - 1) < len(candidates) else None
                base_text = (cand.get("text") or "").strip() if isinstance(cand, dict) else ""
                if base_text:
                    print("\nSelected candidate text:")
                    print(base_text)
                else:
                    print("\nSelected candidate is empty; editing starts from empty string.")
                    base_text = ""

                edited = input("Edit final description (empty cancels) > ").rstrip("\n").strip()
                status = "edited" if edited else "skipped"
                bundle = cand.get("bundle") if isinstance(cand, dict) else None
                stats = compute_full_candidate_stats(
                    base_text=base_desc,
                    cand_text=edited,
                    mode_key=mode_key,
                    length_policy=length_policy,
                    client=client,
                    semantic_cfg=semantic_cfg_norm,
                ) if edited else None

                decisions_by_instance[instance_key] = (status, edited or None, bundle if isinstance(bundle, dict) else None)
                _append_audit_event(
                    audit_file,
                    {
                        "event_type": "decision",
                        "ts": now,
                        "session_id": session_id,
                        "status": status,
                        "tool_name": name,
                        "instance_key": instance_key,
                        "record_id": rid,
                        "tool_index": tool_i,
                        "model": llm_model,
                        "seed": seed,
                        "mode": mode_key,
                        "base_description": base_desc,
                        "final_description": edited or None,
                        "source": "user",
                        "note": "edit_candidate",
                        "chosen_candidate_index": int(chosen_i),
                        "llm_bundle": bundle if isinstance(bundle, dict) else None,
                        "diff_stats": stats,
                        "generation_round": int(generation_round),
                        "length_policy": length_policy,
                        "semantic_cfg": semantic_cfg_norm,
                    },
                )
                if status == "edited":
                    session_summary["edited"] += 1
                else:
                    session_summary["skipped"] += 1
                break

            choice_i = 1 if cmd == "y" else _parse_candidate_choice(cmd, k=int(num_candidates))
            if choice_i is not None:
                if not (1 <= int(choice_i) <= int(num_candidates)):
                    print("Invalid candidate index.")
                    continue
                cand = candidates[int(choice_i) - 1] if (int(choice_i) - 1) < len(candidates) else None
                if not isinstance(cand, dict):
                    print("Candidate not available.")
                    continue
                if cand.get("error") or not (cand.get("text") or "").strip():
                    print("Selected candidate is not acceptable (empty or error).")
                    _print_candidate_full(int(choice_i), cand, max_preview=int(max_token_preview), max_tok_len=int(max_token_string_len))
                    continue

                final_desc = (cand.get("text") or "").strip()
                bundle = cand.get("bundle") if isinstance(cand.get("bundle"), dict) else None
                stats = cand.get("stats") if isinstance(cand.get("stats"), dict) else compute_full_candidate_stats(
                    base_text=base_desc,
                    cand_text=final_desc,
                    mode_key=mode_key,
                    length_policy=length_policy,
                    client=client,
                    semantic_cfg=semantic_cfg_norm,
                )

                decisions_by_instance[instance_key] = ("accepted", final_desc, bundle)
                _append_audit_event(
                    audit_file,
                    {
                        "event_type": "decision",
                        "ts": now,
                        "session_id": session_id,
                        "status": "accepted",
                        "tool_name": name,
                        "instance_key": instance_key,
                        "record_id": rid,
                        "tool_index": tool_i,
                        "model": llm_model,
                        "seed": seed,
                        "mode": mode_key,
                        "base_description": base_desc,
                        "final_description": final_desc,
                        "source": "llm",
                        "chosen_candidate_index": int(choice_i),
                        "generation_round": int(generation_round),
                        "llm_bundle": bundle,
                        "diff_stats": stats,
                        "length_policy": length_policy,
                        "semantic_cfg": semantic_cfg_norm,
                    },
                )

                session_summary["accepted"] += 1
                rl = (stats.get("risk_label") if isinstance(stats, dict) else None) or "NA"
                if rl not in session_summary["accepted_risk_labels"]:
                    rl = "NA"
                session_summary["accepted_risk_labels"][rl] += 1

                sim = stats.get("similarity_ratio") if isinstance(stats, dict) else None
                if isinstance(sim, (int, float)):
                    session_summary["accepted_similarity_sum"] += float(sim)
                    session_summary["accepted_similarity_n"] += 1

                bl = stats.get("base_len_chars") if isinstance(stats, dict) else None
                cl = stats.get("cand_len_chars") if isinstance(stats, dict) else None
                lr = stats.get("len_ratio") if isinstance(stats, dict) else None
                ld = stats.get("len_delta_chars") if isinstance(stats, dict) else None
                if isinstance(bl, int) and isinstance(cl, int):
                    session_summary["accepted_base_chars_sum"] += int(bl)
                    session_summary["accepted_cand_chars_sum"] += int(cl)
                if isinstance(lr, (int, float)):
                    session_summary["accepted_len_ratio_sum"] += float(lr)
                    session_summary["accepted_len_ratio_n"] += 1
                if isinstance(ld, int):
                    session_summary["accepted_len_delta_sum"] += int(ld)

                wst = stats.get("within_soft_target") if isinstance(stats, dict) else None
                st_applied = bool(stats.get("concise_soft_target_applied", False)) if isinstance(stats, dict) else False
                if st_applied:
                    session_summary["accepted_soft_target_applicable_n"] += 1
                    if wst is True:
                        session_summary["accepted_within_soft_target_n"] += 1

                sem = stats.get("semantic") if isinstance(stats.get("semantic"), dict) else None
                if isinstance(sem, dict):
                    emb = sem.get("embedding") if isinstance(sem.get("embedding"), dict) else None
                    if isinstance(emb, dict) and isinstance(emb.get("cosine"), (int, float)):
                        session_summary["accepted_embedding_cos_sum"] += float(emb.get("cosine"))
                        session_summary["accepted_embedding_cos_n"] += 1
                    ver = sem.get("verifier") if isinstance(sem.get("verifier"), dict) else None
                    if isinstance(ver, dict) and isinstance(ver.get("label"), str):
                        lab = ver.get("label")
                        if lab == "ENTAILS":
                            session_summary["accepted_entails"] += 1
                        elif lab == "NOT_ENTAILS":
                            session_summary["accepted_not_entails"] += 1
                        else:
                            session_summary["accepted_entails_unknown"] += 1

                break

            if cmd.startswith("p"):
                raw_idx = cmd[1:].strip()
                if raw_idx.isdigit():
                    vi = int(raw_idx)
                    if 1 <= vi <= int(num_candidates):
                        _print_candidate_full(vi, candidates[vi - 1], max_preview=int(max_token_preview), max_tok_len=int(max_token_string_len))
                        continue
                print("Preview command format: p<index>, for example: p2")
                continue

            print("Invalid command. Preview: p<index> (example: p2).")

        if quit_requested:
            break

    # ========= Apply decisions to file =========
    tmp_path = path.with_suffix(path.suffix + ".tmp")
    updated_count = 0
    patch_failures = 0

    with path.open("r", encoding="utf-8") as fin, tmp_path.open("w", encoding="utf-8") as fout:
        for raw_line in fin:
            line = raw_line.rstrip("\n")
            if not line.strip():
                fout.write(line + "\n")
                continue

            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                fout.write(line + "\n")
                continue

            if not isinstance(record, dict):
                fout.write(json.dumps(record, ensure_ascii=False) + "\n")
                continue

            rid = _record_id(record, tool_field=tool_field)
            tools = record.get(tool_field)

            if isinstance(tools, list):
                new_tools: List[Any] = []
                for tool_index, entry in enumerate(tools):
                    tool_obj, kind = _load_tool(entry)
                    if not tool_obj:
                        new_tools.append(entry)
                        continue

                    instance_key = _tool_instance_key(rid, tool_index, tool_obj)
                    decision = decisions_by_instance.get(instance_key)

                    if decision is None:
                        new_tools.append(entry)
                        continue

                    status, new_desc, llm_bundle = decision
                    if status in ("accepted", "edited", "manual") and new_desc:
                        if kind == "json_str" and isinstance(entry, str):
                            already_ok = False
                            try:
                                obj0 = json.loads(entry)
                                if isinstance(obj0, dict) and obj0.get("description") == new_desc:
                                    already_ok = True
                            except Exception:
                                already_ok = False

                            if already_ok:
                                new_tools.append(entry)
                                continue

                            patched, ok, reason = _replace_top_level_string_field_in_raw_object(entry, "description", new_desc)
                            if ok:
                                new_tools.append(patched)
                                updated_count += 1
                            else:
                                fallback_ok = False
                                fallback_patched = entry
                                if allow_reserialize_fallback:
                                    try:
                                        obj = json.loads(entry)
                                        if isinstance(obj, dict):
                                            obj["description"] = new_desc
                                            fallback_patched = json.dumps(obj, ensure_ascii=False)
                                            fallback_ok = True
                                    except Exception:
                                        fallback_ok = False

                                if fallback_ok:
                                    new_tools.append(fallback_patched)
                                    updated_count += 1
                                    _append_audit_event(
                                        audit_file,
                                        {
                                            "event_type": "patch_fallback_reserialize",
                                            "ts": int(time.time()),
                                            "session_id": session_id,
                                            "instance_key": instance_key,
                                            "record_id": rid,
                                            "tool_index": tool_index,
                                            "tool_name": tool_obj.get("name"),
                                            "mode": mode_key,
                                            "entry_sha256_before": _sha256_text(entry),
                                            "entry_sha256_after": _sha256_text(fallback_patched),
                                            "patch_reason": reason,
                                        },
                                    )
                                else:
                                    new_tools.append(entry)
                                    patch_failures += 1
                        else:
                            if tool_obj.get("description") == new_desc:
                                new_tools.append(tool_obj)
                                continue
                            tool_obj["description"] = new_desc
                            new_tools.append(tool_obj)
                            updated_count += 1
                    else:
                        new_tools.append(entry)

                record[tool_field] = new_tools

            fout.write(json.dumps(record, ensure_ascii=False) + "\n")

    if create_backup_of_target:
        bak_path = path.with_suffix(path.suffix + ".bak")
        if not bak_path.exists():
            shutil.copy2(path, bak_path)

    tmp_path.replace(path)
    after_sha = _sha256_file(path)

    n_reviewed = len(decisions_by_instance)
    n_skipped = sum(1 for st, _, _ in decisions_by_instance.values() if st == "skipped")
    completed = (n_reviewed >= n_total) and (not quit_requested)

    avg_sim = None
    if int(session_summary["accepted_similarity_n"]) > 0:
        avg_sim = float(session_summary["accepted_similarity_sum"]) / float(session_summary["accepted_similarity_n"])

    avg_len_ratio = None
    if int(session_summary["accepted_len_ratio_n"]) > 0:
        avg_len_ratio = float(session_summary["accepted_len_ratio_sum"]) / float(session_summary["accepted_len_ratio_n"])

    avg_len_delta = None
    if int(session_summary["accepted"]) > 0:
        avg_len_delta = float(session_summary["accepted_len_delta_sum"]) / float(session_summary["accepted"])

    avg_base_len = None
    avg_cand_len = None
    if int(session_summary["accepted"]) > 0:
        avg_base_len = float(session_summary["accepted_base_chars_sum"]) / float(session_summary["accepted"])
        avg_cand_len = float(session_summary["accepted_cand_chars_sum"]) / float(session_summary["accepted"])

    avg_emb_cos = None
    if int(session_summary["accepted_embedding_cos_n"]) > 0:
        avg_emb_cos = float(session_summary["accepted_embedding_cos_sum"]) / float(session_summary["accepted_embedding_cos_n"])

    _append_audit_event(
        audit_file,
        {
            "event_type": "run_end",
            "ts": int(time.time()),
            "session_id": session_id,
            "mode": mode_key,
            "model": llm_model,
            "seed": seed,
            "dataset_path": str(path),
            "dataset_sha256_at_session_start": before_sha,
            "dataset_sha256_at_session_end": after_sha,
            "n_total_occurrences": n_total,
            "n_reviewed_total": n_reviewed,
            "n_updated_this_session": updated_count,
            "n_skipped_total": n_skipped,
            "completed": bool(completed),
            "quit_requested": bool(quit_requested),
            "raw_patch_failures_this_session": patch_failures,
            "resume_next_index_1based": resume_next_index_1based if quit_requested else (n_total + 1 if completed else None),
            "session_summary": {
                "accepted": int(session_summary["accepted"]),
                "edited": int(session_summary["edited"]),
                "manual": int(session_summary["manual"]),
                "skipped": int(session_summary["skipped"]),
                "accepted_risk_labels": session_summary["accepted_risk_labels"],
                "accepted_avg_similarity": avg_sim,
                "accepted_similarity_n": int(session_summary["accepted_similarity_n"]),
                "accepted_avg_len_ratio": avg_len_ratio,
                "accepted_len_ratio_n": int(session_summary["accepted_len_ratio_n"]),
                "accepted_avg_len_delta_chars": avg_len_delta,
                "accepted_avg_base_len_chars": avg_base_len,
                "accepted_avg_cand_len_chars": avg_cand_len,
                "accepted_soft_target_applicable_n": int(session_summary["accepted_soft_target_applicable_n"]),
                "accepted_within_soft_target_n": int(session_summary["accepted_within_soft_target_n"]),
                "accepted_avg_embedding_cosine": avg_emb_cos,
                "accepted_embedding_cosine_n": int(session_summary["accepted_embedding_cos_n"]),
                "accepted_entails": int(session_summary["accepted_entails"]),
                "accepted_not_entails": int(session_summary["accepted_not_entails"]),
                "accepted_entails_unknown": int(session_summary["accepted_entails_unknown"]),
            },
            "length_policy_config": length_policy_config,
            "policy_versions": {
                "risk_policy_name": RISK_POLICY_NAME,
                "logic_token_policy_name": LOGIC_TOKEN_POLICY_NAME,
                "semantic_policy_name": SEMANTIC_POLICY_NAME,
            },
            "semantic_cfg": semantic_cfg_norm,
            "show_perturbations": bool(show_perturbations),
            "raw_key_input": bool(raw_key_input),
        },
    )

    print("\nChanges applied.")
    print(f"Mode: {mode_key}")
    print(f"Candidates per tool: {int(num_candidates)}")
    print(f"Candidate snippet chars: {int(candidate_snippet_chars)}")
    print(f"Perturbation prints: {'enabled' if show_perturbations else 'disabled'}")
    print(f"Raw key input (Esc-safe): {'enabled' if raw_key_input else 'disabled'}")
    print(f"Descriptions updated (this session): {updated_count}")
    if patch_failures:
        print(f"Raw JSON-string patch failures (left unchanged): {patch_failures}")
    print(f"Reviewed total (from audit): {n_reviewed} / {n_total}")
    print(f"Completed: {completed} (quit_requested={quit_requested})")
    if quit_requested and resume_next_index_1based is not None:
        print(f"Resume next time from: [{resume_next_index_1based}/{n_total}]")
    print(f"Updated file: {path}")
    print(f"Audit file (same on resume): {audit_file}")

    print("\nSession summary (heuristic):")
    print(
        f"  accepted={int(session_summary['accepted'])}, edited={int(session_summary['edited'])}, "
        f"manual={int(session_summary['manual'])}, skipped={int(session_summary['skipped'])}"
    )
    print(f"  accepted_risk_labels={session_summary['accepted_risk_labels']}")
    if avg_sim is not None:
        print(f"  accepted_avg_similarity={avg_sim:.2f} (n={int(session_summary['accepted_similarity_n'])})")
    if avg_len_ratio is not None:
        print(f"  accepted_avg_len_ratio={avg_len_ratio:.2f} (n={int(session_summary['accepted_len_ratio_n'])})")
    if avg_len_delta is not None:
        print(f"  accepted_avg_len_delta_chars={avg_len_delta:+.1f}")
    if avg_base_len is not None and avg_cand_len is not None:
        print(f"  accepted_avg_base_len_chars={avg_base_len:.1f}; accepted_avg_cand_len_chars={avg_cand_len:.1f}")
    if avg_emb_cos is not None:
        print(f"  accepted_avg_embedding_cosine={avg_emb_cos:.2f} (n={int(session_summary['accepted_embedding_cosine_n'])})")
    if semantic_cfg_norm.get("enable_verifier"):
        print(
            "  accepted_verifier_counts: "
            f"ENTAILS={int(session_summary['accepted_entails'])}, "
            f"NOT_ENTAILS={int(session_summary['accepted_not_entails'])}, "
            f"UNKNOWN={int(session_summary['accepted_entails_unknown'])}"
        )
    if mode_key == "style_concise":
        print(
            "  accepted_soft_target: "
            f"applicable={int(session_summary['accepted_soft_target_applicable_n'])}, "
            f"within={int(session_summary['accepted_within_soft_target_n'])}"
        )


def _derive_working_copy_path(input_path: str, mode_key: str) -> str:
    """Return the default working-copy path that sits next to *input_path*.

    The directory part is kept as-is; only the file name changes, to
    ``<stem>.WORKING_COPY.<mode_key><suffix>`` (the suffix is the last
    extension only, so ``a.tar.gz`` yields ``a.tar.WORKING_COPY.<mode_key>.gz``).
    """
    source = Path(input_path)
    stem, suffix = source.stem, source.suffix
    working_name = f"{stem}.WORKING_COPY.{mode_key}{suffix}"
    return str(source.with_name(working_name))


# Script entry point: load the run configuration, check that the API token
# environment variable is present, resolve the rewrite mode, create the working
# copy of the dataset, and start the interactive review on that working copy.
if __name__ == "__main__":
    import argparse
    from pathlib import Path
    from config_loader import load_config

    ap = argparse.ArgumentParser()
    # Config path precedence: --config flag, then $CONFIG_PATH, then "config.toml".
    ap.add_argument("--config", default=os.environ.get("CONFIG_PATH", "config.toml"))
    # parse_known_args() collects unrecognized arguments instead of erroring out.
    # NOTE(review): presumably so that extra argv entries injected by a notebook
    # kernel do not abort the run — confirm.
    args, _unknown = ap.parse_known_args()


    cfg = load_config(args.config)

    # 1) The base URL comes from the config; this rebinds the module-level
    #    GEMINI_BASE_URL, replacing the hard-coded default.
    GEMINI_BASE_URL = cfg.base_url

    # 2) The token is ALWAYS taken from the environment (security). Only the
    #    presence of the variable named by cfg.token_env is checked here.
    if not os.environ.get(cfg.token_env):
        raise RuntimeError(f"{cfg.token_env} environment variable is not set.")

    # 3) Resolve style aliases (if any are defined in the config).
    #    A blank mode key falls back to "style_verbose"; an alias maps to its
    #    target, and unknown keys pass through unchanged to _resolve_style.
    mk = (cfg.mode_key or "").strip() or "style_verbose"
    mk = (cfg.style_aliases or {}).get(mk, mk)
    mode_key_resolved, _ = _resolve_style(mk)

    # 4) Input/Output
    #    A blank output path means: derive a ".WORKING_COPY.<mode>" sibling of the input.
    INPUT_JSONL = cfg.input_jsonl
    OUTPUT_JSONL = cfg.output_jsonl.strip() or _derive_working_copy_path(INPUT_JSONL, mode_key_resolved)

    # NOTE(review): overwrite=False presumably keeps an existing working copy so
    # a previous session can be resumed — confirm against make_working_copy.
    working = make_working_copy(INPUT_JSONL, OUTPUT_JSONL, overwrite=False)
    print(f"Working copy: {working}")

    # 5) Semantic configuration
    #    Embeddings are enabled only when the flag is set AND a non-blank
    #    embedding model name is configured.
    semantic_cfg_val: Dict[str, Any] = {
        "enable_embeddings": bool(cfg.enable_embeddings and bool(cfg.embedding_model.strip())),
        "embedding_model": str(cfg.embedding_model).strip(),
        "embedding_low_cosine_threshold": float(cfg.embedding_low_cosine_threshold),
        "enable_verifier": bool(cfg.enable_verifier),
        "verifier_model": str(cfg.verifier_model).strip(),
        "verifier_max_tokens": int(cfg.verifier_max_tokens),
    }

    # Run the interactive workflow on the working copy (not on the original
    # input); config values are coerced to the expected primitive types here.
    interactive_llm_tools_in_jsonl(
        working,
        tool_field=cfg.tool_field,
        create_backup_of_target=cfg.create_backup_of_target,
        llm_model=cfg.model,
        seed=cfg.seed,
        max_tokens=int(cfg.max_tokens),
        retry_on_length=bool(cfg.retry_on_length),
        retry_max_tokens=int(cfg.retry_max_tokens),
        allow_reserialize_fallback=bool(cfg.allow_reserialize_fallback),
        min_sleep_sec_between_calls=float(cfg.min_sleep_sec_between_calls),
        audit_dir=cfg.audit_dir,
        mode_key=mode_key_resolved,
        num_candidates=int(cfg.num_candidates),
        max_token_preview=int(cfg.stats_max_token_preview),
        max_token_string_len=int(cfg.stats_max_token_string_len),
        candidate_snippet_chars=int(cfg.candidate_snippet_chars),
        concise_target_ratio=float(cfg.concise_target_ratio),
        concise_target_min_base_len=int(cfg.concise_target_min_base_len),
        concise_target_min_chars=int(cfg.concise_target_min_chars),
        semantic_cfg=semantic_cfg_val,
        show_perturbations=bool(cfg.show_perturbations),
        raw_key_input=bool(cfg.raw_key_input),
    )


Working copy: When2Call/data/test/when2call_test_llm_judge.WORKING_COPY.style_concise.jsonl
Target: When2Call/data/test/when2call_test_llm_judge.WORKING_COPY.style_concise.jsonl
Mode: style_concise
Audit file (RESUMABLE): audit/1e8922187545/when2call_test_llm_judge.WORKING_COPY.style_concise.1e8922187545.style_concise.gemini-2.5-flash.K2.audit.jsonl
Tool occurrences total: 978
Resume position: [1/978] (previously reviewed: 0)
LLM: gemini-2.5-flash @ https://generativelanguage.googleapis.com/v1beta/openai/
Candidates per tool: 2
Candidate snippet chars: 160
Policies: risk=risk_policy_v2_structural_logic_primary; logic=logic_tokens_v1; semantic=semantic_signals_v1
Perturbation prints: enabled
Raw key input (Esc-safe): enabled
Embedding signal: disabled
Verifier signal: disabled
Concise soft target: ratio=0.70, min_base_len=160, min_chars=80
Max tokens: 512; retry_on_length=True; retry_max_tokens=1024
Commands: ENTER/ok=accept #1, 1..K=accept candidate, r=regenerate K, e=edit candidate, m