In [1]:
#!/usr/bin/env python3
# 3 Feb 2026
#
# Interactive, resumable tool-description rewrite workflow with:
# - K-candidate generation per tool instance (configurable)
# - Deterministic statistical/lexical risk indicators printed alongside base and candidates
# - Human-in-the-loop decision (accept candidate, edit, manual, skip), with append-only audit log
#
# Additions (Jan 21 test):
# - Candidate text snippet shown in the overview, so selection can occur without extra commands.
# - Explicit preview command documented: p<idx> (e.g., p2) prints the full candidate + stats.
#
# Additions (Concise soft length target, reviewer-proof):
# - Optional soft length target for style_concise: default 30% shorter, applied only if base_len >= threshold.
# - The target is guidance only (exceptions allowed to preserve meaning); out-of-target is flagged and logged.
# - Length metrics (len_ratio, len_delta) are computed for every candidate and stored in audit for reporting.
#
# Additions (Semantic drift hardening, v2):
# - Logic/negation/quantifier/modal/scope tokens extracted and diffed (beyond purely lexical patterns).
# - Risk scoring adjusted: verbs treated as secondary; structural+logic tokens treated as primary.
# - Optional semantic signals:
#   - Embedding cosine similarity (best-effort; depends on provider support for embeddings endpoint).
#   - LLM entailment verifier returning ENTAILS / NOT_ENTAILS (disabled by default).
#
# Additions (Feb 3 patch):
# - "Perturbations" visibility: print candidate-generation perturbation context to stdout during generation.
# - ESC behavior: raw-key command input on TTY so Esc behaves like quit (q) and never "accepts" accidentally.

import json
import shutil
import os
import time
import hashlib
import difflib
import re
import math
import sys
from pathlib import Path
from typing import Dict, Any, Optional, Tuple, List

from openai import OpenAI


# ========= Config =========
GEMINI_BASE_URL = "https://generativelanguage.googleapis.com/v1beta/openai/"
LLM_MODEL_DEFAULT = "gemini-2.5-flash"

HASH_HEX_LEN = 32

DEFAULT_MAX_TOKENS = 512
RETRY_ON_LENGTH = True
RETRY_MAX_TOKENS = 1024

DEFAULT_ALLOW_RESERIALIZE_FALLBACK = False

# How much of the last generated candidate to store in audit and to feed back into prompt.
DEFAULT_MAX_PREV_REWRITE_CHARS = 800

# Candidate count shown per tool instance.
DEFAULT_NUM_CANDIDATES = 2

# Printing controls for token previews in statistics.
DEFAULT_MAX_TOKEN_PREVIEW = 8
DEFAULT_MAX_TOKEN_STRING_LEN = 48

# Candidate text snippet in overview (chars).
DEFAULT_CANDIDATE_SNIPPET_CHARS = 160

# Soft concise length target knobs (reviewer-proof defaults).
DEFAULT_CONCISE_TARGET_RATIO = 0.70
DEFAULT_CONCISE_TARGET_MIN_BASE_LEN = 160
DEFAULT_CONCISE_TARGET_MIN_CHARS = 80

# Semantic signals (disabled by default).
DEFAULT_ENABLE_EMBEDDINGS = False
DEFAULT_EMBEDDING_MODEL = ""  # Provider-dependent; empty means "unset".
DEFAULT_EMBEDDING_LOW_COSINE_THRESHOLD = 0.85

DEFAULT_ENABLE_VERIFIER = False
DEFAULT_VERIFIER_MODEL = ""  # Empty means "use llm_model".
DEFAULT_VERIFIER_MAX_TOKENS = 16

# Visibility knobs (Feb 3 patch).
DEFAULT_SHOW_PERTURBATIONS = True   # Prints perturbation context during candidate generation.
DEFAULT_RAW_KEY_INPUT = True        # Enables raw-key command input on TTY so Esc can be captured.

# Policy versioning for audit identity and reporting.
RISK_POLICY_NAME = "risk_policy_v2_structural_logic_primary"
LOGIC_TOKEN_POLICY_NAME = "logic_tokens_v1"
SEMANTIC_POLICY_NAME = "semantic_signals_v1"


# ========= Styles =========
STYLE_SPECS: Dict[str, Dict[str, Any]] = {
    "style_verbose": {
        "system": (
            "Task: rewrite tool descriptions.\n"
            "Hard constraints:\n"
            "- Meaning must be preserved exactly; no new capabilities, steps, motivations, benefits, or context.\n"
            "- No information present in the original description may be deleted.\n"
            "- No new parameter names, IDs, field names, flags, or implementation details may be introduced.\n"
            "- If parameter/field names/IDs/flags already appear in the original description, they must be kept.\n"
            "- No examples, normative language, or assumptions.\n"
            "- The subject (the tool) and scope must remain unchanged.\n"
            "- Output must be only the rewritten description text, nothing else.\n"
            "- Style: verbose but controlled; concise and complete (1–2 sentences), clear and direct.\n"
        ),
        "regen_diversity_instruction": (
            "A meaning-equivalent rewrite is required with lexical and syntactic variation from the previous rewrite; "
            "the same sentence structure should be avoided."
        ),
        "max_prev_rewrite_chars": 800,
    },
    "style_concise": {
        "system": (
            "Task: rewrite tool descriptions.\n"
            "Hard constraints:\n"
            "- Meaning must be preserved exactly; no new capabilities, steps, motivations, benefits, or context.\n"
            "- No information present in the original description may be deleted.\n"
            "- No new parameter names, IDs, field names, flags, or implementation details may be introduced.\n"
            "- If parameter/field names/IDs/flags already appear in the original description, they must be kept.\n"
            "- No examples, normative language, or assumptions.\n"
            "- The subject (the tool) and scope must remain unchanged.\n"
            "- Output must be only the rewritten description text, nothing else.\n"
            "- Style: concise and controlled; 1 sentence preferred, 2 max.\n"
            "- Length constraint: shorter than the base description is preferred; if the base description is already short, the rewrite must not exceed its length.\n"
            "- Compression rule: remove redundancy, filler, and hedging while preserving all explicitly stated constraints/details.\n"
        ),
        "regen_diversity_instruction": (
            "A different paraphrase is required than the previous rewrite. "
            "The same sentence skeleton or distinctive phrases must not be reused. "
            "Meaning must remain exactly the same; only wording and structure may vary."
        ),
        "max_prev_rewrite_chars": 600,
    },
    # Alias to tolerate misspellings.
    "style_coicnoso": {},   # filled after dict creation
    "style_coinceise": {},  # filled after dict creation
}
STYLE_SPECS["style_coicnoso"] = STYLE_SPECS["style_concise"]
STYLE_SPECS["style_coinceise"] = STYLE_SPECS["style_concise"]


def _resolve_style(mode_key: str) -> Tuple[str, Dict[str, Any]]:
    mk = (mode_key or "").strip()
    if not mk:
        mk = "style_verbose"
    if mk not in STYLE_SPECS:
        raise ValueError(f"Unknown MODE_KEY='{mk}'. Supported: {', '.join(sorted(STYLE_SPECS.keys()))}")
    return mk, STYLE_SPECS[mk]


# ========= Client =========
def make_gemini_client() -> OpenAI:
    token = os.environ.get("TOKEN_GEMINI")
    if not token:
        raise RuntimeError("TOKEN_GEMINI environment variable is not set.")
    return OpenAI(api_key=token, base_url=GEMINI_BASE_URL)


# ========= Small utils =========
def _json_safe(obj: Any) -> Any:
    if obj is None or isinstance(obj, (str, int, float, bool)):
        return obj
    if isinstance(obj, dict):
        return {str(k): _json_safe(v) for k, v in obj.items()}
    if isinstance(obj, (list, tuple)):
        return [_json_safe(x) for x in obj]
    if hasattr(obj, "model_dump") and callable(getattr(obj, "model_dump")):
        try:
            return _json_safe(obj.model_dump())
        except Exception:
            pass
    if hasattr(obj, "dict") and callable(getattr(obj, "dict")):
        try:
            return _json_safe(obj.dict())
        except Exception:
            pass
    if hasattr(obj, "__dict__"):
        try:
            return _json_safe(vars(obj))
        except Exception:
            pass
    try:
        return str(obj)
    except Exception:
        return None


def _sha256_text(s: str) -> str:
    return hashlib.sha256((s or "").encode("utf-8")).hexdigest()


def _canonical_json(obj: Any) -> str:
    return json.dumps(obj, ensure_ascii=False, sort_keys=True, separators=(",", ":"))


def _sha256_file(path: Path) -> str:
    h = hashlib.sha256()
    with path.open("rb") as f:
        for chunk in iter(lambda: f.read(1024 * 1024), b""):
            h.update(chunk)
    return h.hexdigest()


def _safe_int_env(name: str, default: int) -> int:
    v = os.environ.get(name)
    if v is None or not v.strip():
        return int(default)
    try:
        return int(v.strip())
    except Exception:
        return int(default)


def _safe_float_env(name: str, default: float) -> float:
    v = os.environ.get(name)
    if v is None or not v.strip():
        return float(default)
    try:
        return float(v.strip())
    except Exception:
        return float(default)


def _safe_bool_env(name: str, default: bool) -> bool:
    v = os.environ.get(name)
    if v is None or not v.strip():
        return bool(default)
    s = v.strip().lower()
    if s in ("1", "true", "t", "yes", "y", "on"):
        return True
    if s in ("0", "false", "f", "no", "n", "off"):
        return False
    return bool(default)


# ========= Raw-key command input (Esc-safe) =========
def _read_command_raw_tty(prompt: str, *, k: int) -> str:
    """
    Reads a short command using raw keypress input on TTY.
    - Enter returns the accumulated buffer (possibly empty).
    - Esc returns '\x1b' immediately (caller maps it to quit).
    - Backspace edits the buffer.
    - For k<=9, numeric selection is returned immediately when it becomes unambiguous.
    """
    # Defer imports so non-TTY / non-Unix runs remain usable.
    try:
        import termios
        import tty
    except Exception:
        # Fall back to line input if raw mode is unavailable.
        return input(prompt)

    fd = sys.stdin.fileno()
    old = termios.tcgetattr(fd)
    buf = ""

    immediate_digits = (int(k) <= 9)

    def echo(s: str) -> None:
        sys.stdout.write(s)
        sys.stdout.flush()

    echo(prompt)

    try:
        tty.setraw(fd)
        while True:
            ch = sys.stdin.read(1)

            # Enter
            if ch in ("\r", "\n"):
                echo("\n")
                return buf

            # Esc
            if ch == "\x1b":
                echo("\n")
                return "\x1b"

            # Backspace / delete
            if ch in ("\x7f", "\b"):
                if buf:
                    buf = buf[:-1]
                    echo("\b \b")
                continue

            # Ignore other control chars.
            if not ch.isprintable():
                continue

            # Append printable char and echo.
            buf += ch
            echo(ch)

            low = buf.strip().lower()

            # If user typed a single-letter command, return immediately.
            if low in ("r", "e", "m", "s", "q", "y"):
                echo("\n")
                return low

            # Preview shortcut: p<idx> (only immediate if k<=9 and idx is single-digit).
            if immediate_digits and low.startswith("p") and len(low) == 2 and low[1].isdigit():
                vi = int(low[1])
                if 1 <= vi <= int(k):
                    echo("\n")
                    return low

            # Numeric selection (only immediate for k<=9 to avoid ambiguity like "10").
            if immediate_digits and len(low) == 1 and low.isdigit():
                vi = int(low)
                if 1 <= vi <= int(k):
                    echo("\n")
                    return low

    finally:
        termios.tcsetattr(fd, termios.TCSADRAIN, old)


def _read_command(prompt: str, *, k: int, raw_key_input: bool) -> str:
    """
    Reads a command from the user.
    - If raw_key_input is enabled and stdin is a TTY, uses raw keypress mode to capture Esc.
    - Otherwise, falls back to line-based input().
    """
    if raw_key_input and sys.stdin.isatty():
        try:
            return _read_command_raw_tty(prompt, k=int(k))
        except Exception:
            # Safe fallback.
            return input(prompt)
    return input(prompt)


# ========= Concise soft target (policy) =========
def _make_length_policy(
    *,
    base_desc: str,
    mode_key: str,
    concise_ratio: float,
    concise_min_base_len: int,
    concise_min_chars: int,
) -> Dict[str, Any]:
    """
    Returns a policy object (always present) used for:
    - prompt guidance (soft target)
    - stats (len_ratio, within_target)
    - audit reporting

    Soft target is applied only if:
    - mode_key == style_concise
    - base_len >= concise_min_base_len
    - computed target is strictly shorter than base_len
    """
    base = (base_desc or "").strip()
    base_len = len(base)

    ratio = float(concise_ratio)
    min_base_len = int(concise_min_base_len)
    min_chars = int(concise_min_chars)

    reason = "not_concise_mode"
    applied = False
    target_chars: Optional[int] = None

    if mode_key == "style_concise":
        if base_len < min_base_len or base_len <= 0:
            reason = "base_too_short"
        else:
            raw_target = int(base_len * ratio)
            candidate_target = max(raw_target, min_chars)
            # If env is mis-set (min_chars > base), do not apply a target that would exceed base.
            if candidate_target >= base_len:
                reason = "target_not_shorter_than_base"
            else:
                applied = True
                reason = "ok"
                target_chars = candidate_target

    return {
        "policy_name": "concise_soft_target_v1",
        "mode_key": mode_key,
        "base_len_chars": base_len,
        "concise_soft_target": {
            "applied": bool(applied),
            "reason": str(reason),
            "target_ratio": float(ratio),
            "min_base_len": int(min_base_len),
            "min_chars": int(min_chars),
            "target_chars": int(target_chars) if isinstance(target_chars, int) else None,
        },
    }


# ========= Statistical / lexical indicators =========
_FLAG_RE = re.compile(r"(?<!\w)--[A-Za-z0-9][A-Za-z0-9_-]*")
_SNAKE_RE = re.compile(r"\b[A-Za-z][A-Za-z0-9]*_[A-Za-z0-9_]+\b")
_CAMEL_RE = re.compile(r"\b[a-z]+[A-Z][A-Za-z0-9]*\b")
_FIELD_COLON_RE = re.compile(r"\b[A-Za-z][A-Za-z0-9_]{2,}\b(?=\s*[:=])")
_NUMBER_RE = re.compile(r"\b\d+(?:\.\d+)?\b")
_NUMBER_UNIT_RE = re.compile(
    r"\b\d+(?:\.\d+)?\s*(?:kb|mb|gb|tb|ms|s|sec|secs|seconds|mins|minutes|hrs|hours|days)\b",
    re.IGNORECASE,
)

_HIGH_RISK_VERBS = [
    "create", "delete", "remove", "destroy",
    "upload", "download", "send", "email",
    "execute", "run", "invoke", "call",
    "write", "read", "save", "store",
    "update", "modify", "edit", "change",
    "retrieve", "fetch", "search", "browse",
    "access", "open", "close",
    "return", "returns",
]
_VERB_RE = re.compile(r"\b(" + "|".join(re.escape(v) for v in _HIGH_RISK_VERBS) + r")\b", re.IGNORECASE)

_LOGIC_WORDS = {
    "only", "must", "never", "not", "no", "unless", "except",
    "required", "optional",
    "cannot", "can't",
}

_LOGIC_PHRASES = [
    "at least",
    "at most",
    "up to",
    "no more than",
    "no less than",
    "do not",
    "does not",
    "did not",
    "must not",
    "should not",
    "may not",
    "will not",
    "cannot",
    "can't",
    "if and only if",
]

_MODAL_WORDS = {
    "may", "must", "should", "will", "can", "could", "would", "might", "shall",
}

_SCOPE_PHRASES = [
    "returns",
    "return",
    "can return",
    "may return",
    "will return",
    "must return",
    "should return",
    "cannot return",
    "can't return",
    "does not return",
    "do not return",
]


def _compile_phrase_patterns(phrases: List[str]) -> Dict[str, re.Pattern]:
    out: Dict[str, re.Pattern] = {}
    for p in phrases:
        esc = re.escape(p).replace(r"\ ", r"\s+")
        pat = re.compile(r"(?<![A-Za-z0-9_])" + esc + r"(?![A-Za-z0-9_])", re.IGNORECASE)
        out[p.lower()] = pat
    return out


_LOGIC_PHRASE_PATTERNS = _compile_phrase_patterns(_LOGIC_PHRASES)
_SCOPE_PHRASE_PATTERNS = _compile_phrase_patterns(_SCOPE_PHRASES)

_WORD_TOKEN_RE = re.compile(r"[A-Za-z0-9_]+(?:'[A-Za-z0-9_]+)?", re.IGNORECASE)


def _sentence_count(text: str) -> int:
    t = (text or "").strip()
    if not t:
        return 0
    parts = [p for p in re.split(r"[.!?]+", t) if p.strip()]
    return len(parts)


def _word_count(text: str) -> int:
    t = (text or "").strip()
    if not t:
        return 0
    return len([w for w in re.split(r"\s+", t) if w])


def _extract_phrase_tokens(lower_text: str, patterns: Dict[str, re.Pattern]) -> List[str]:
    found: List[str] = []
    for canonical, pat in patterns.items():
        if pat.search(lower_text):
            found.append(canonical)
    return sorted(set(found))


def _extract_word_tokens(lower_text: str, vocabulary: set) -> List[str]:
    toks = [m.group(0).lower() for m in _WORD_TOKEN_RE.finditer(lower_text)]
    return sorted(set(t for t in toks if t in vocabulary))


def _extract_indicator_tokens(text: str) -> Dict[str, List[str]]:
    t = text or ""
    lower = t.lower()

    logic_phr = _extract_phrase_tokens(lower, _LOGIC_PHRASE_PATTERNS)
    logic_w = _extract_word_tokens(lower, _LOGIC_WORDS)
    logic = sorted(set(logic_phr + logic_w))

    modals = _extract_word_tokens(lower, _MODAL_WORDS)
    scope = _extract_phrase_tokens(lower, _SCOPE_PHRASE_PATTERNS)

    return {
        "flags": sorted(set(_FLAG_RE.findall(t))),
        "snake": sorted(set(_SNAKE_RE.findall(t))),
        "camel": sorted(set(_CAMEL_RE.findall(t))),
        "field_like": sorted(set(_FIELD_COLON_RE.findall(t))),
        "numbers": sorted(set(_NUMBER_RE.findall(t))),
        "number_units": sorted(set(m.group(0) for m in _NUMBER_UNIT_RE.finditer(t))),
        "verbs": sorted(set(m.group(0).lower() for m in _VERB_RE.finditer(t))),
        "logic": logic,
        "modals": modals,
        "scope": scope,
    }


def _format_token_preview(tokens: List[str], *, max_items: int, max_len: int) -> str:
    if not tokens:
        return "-"
    out: List[str] = []
    for t in tokens[: max(0, int(max_items))]:
        s = str(t)
        if len(s) > int(max_len):
            s = s[: int(max_len) - 1] + "…"
        out.append(s)
    if len(tokens) > int(max_items):
        out.append(f"+{len(tokens) - int(max_items)}")
    return ", ".join(out) if out else "-"


def _diff_token_sets(base: Dict[str, List[str]], cand: Dict[str, List[str]], key: str) -> Tuple[List[str], List[str]]:
    b = set(base.get(key, []) or [])
    c = set(cand.get(key, []) or [])
    new_items = sorted(c - b)
    missing_items = sorted(b - c)
    return new_items, missing_items


def _similarity_ratio(a: str, b: str) -> float:
    aa = (a or "").strip()
    bb = (b or "").strip()
    if not aa and not bb:
        return 1.0
    if not aa or not bb:
        return 0.0
    return float(difflib.SequenceMatcher(None, aa, bb).ratio())


def _cosine_similarity(a: List[float], b: List[float]) -> Optional[float]:
    if not a or not b:
        return None
    if len(a) != len(b):
        return None
    dot = 0.0
    na = 0.0
    nb = 0.0
    for i in range(len(a)):
        ai = float(a[i])
        bi = float(b[i])
        dot += ai * bi
        na += ai * ai
        nb += bi * bi
    if na <= 0.0 or nb <= 0.0:
        return None
    return float(dot / (math.sqrt(na) * math.sqrt(nb)))


def compute_candidate_stats(
    *,
    base_text: str,
    cand_text: str,
    mode_key: str,
    length_policy: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    base = (base_text or "").strip()
    cand = (cand_text or "").strip()

    base_tokens = _extract_indicator_tokens(base)
    cand_tokens = _extract_indicator_tokens(cand)

    diffs: Dict[str, Any] = {}
    for k in ("flags", "snake", "camel", "field_like", "numbers", "number_units", "verbs", "logic", "modals", "scope"):
        new_items, missing_items = _diff_token_sets(base_tokens, cand_tokens, k)
        diffs[k] = {"new": new_items, "missing": missing_items}

    base_len = len(base)
    cand_len = len(cand)
    base_words = _word_count(base)
    cand_words = _word_count(cand)
    base_sent = _sentence_count(base)
    cand_sent = _sentence_count(cand)

    sim = _similarity_ratio(base, cand)

    len_ratio = (float(cand_len) / float(base_len)) if base_len > 0 else None
    len_delta = int(cand_len) - int(base_len)
    len_delta_ratio = (float(len_delta) / float(base_len)) if base_len > 0 else None

    structural_keys = ("flags", "field_like", "numbers", "number_units", "snake", "camel")
    logic_keys = ("logic", "modals", "scope")

    new_structural = sum(len(diffs[k]["new"]) for k in structural_keys)
    missing_structural = sum(len(diffs[k]["missing"]) for k in structural_keys)
    new_logic = sum(len(diffs[k]["new"]) for k in logic_keys)
    missing_logic = sum(len(diffs[k]["missing"]) for k in logic_keys)

    new_verbs = len(diffs["verbs"]["new"])
    missing_verbs = len(diffs["verbs"]["missing"])

    risk_label = "LOW"
    risk_reasons: List[str] = []

    if new_structural > 0:
        risk_label = "HIGH"
        risk_reasons.append("new_structural_tokens_detected")
    if risk_label != "HIGH" and (new_logic > 0 or missing_logic > 0):
        risk_label = "HIGH"
        risk_reasons.append("logic_or_modal_or_scope_tokens_changed")
    if risk_label != "HIGH" and missing_structural >= 4:
        risk_label = "HIGH"
        risk_reasons.append("many_structural_tokens_missing")

    if risk_label == "LOW" and missing_structural > 0:
        risk_label = "MED"
        risk_reasons.append("some_structural_tokens_missing")

    if risk_label == "LOW" and new_verbs > 0 and sim < 0.55:
        risk_label = "MED"
        risk_reasons.append("new_risk_verbs_with_low_similarity")

    if risk_label == "LOW" and sim < 0.45:
        risk_label = "MED"
        risk_reasons.append("very_low_text_similarity")

    soft_flags: List[Dict[str, Any]] = []
    if mode_key == "style_concise":
        if base_len > 0 and cand_len > base_len:
            soft_flags.append({"type": "concise_length_exceeds_base", "base_len": base_len, "cand_len": cand_len})
        if cand_sent > 2:
            soft_flags.append({"type": "concise_sentence_count_exceeds_2", "sentence_count": cand_sent})
    if mode_key == "style_verbose":
        if cand_sent > 2:
            soft_flags.append({"type": "verbose_sentence_count_exceeds_2", "sentence_count": cand_sent})

    concise_target_chars = None
    concise_target_applied = False
    concise_target_reason = None
    within_soft_target = None

    if isinstance(length_policy, dict):
        ct = (length_policy.get("concise_soft_target") or {})
        if isinstance(ct, dict):
            concise_target_applied = bool(ct.get("applied", False))
            concise_target_reason = ct.get("reason")
            concise_target_chars = ct.get("target_chars") if isinstance(ct.get("target_chars"), int) else None

    if mode_key == "style_concise" and concise_target_applied and isinstance(concise_target_chars, int) and base_len > 0:
        within_soft_target = bool(cand_len <= concise_target_chars)
        if not within_soft_target:
            soft_flags.append(
                {
                    "type": "concise_exceeds_soft_target",
                    "target_chars": int(concise_target_chars),
                    "cand_len": int(cand_len),
                    "len_ratio": float(len_ratio) if isinstance(len_ratio, float) else None,
                }
            )

    return {
        "policy": {
            "risk_policy_name": RISK_POLICY_NAME,
            "logic_token_policy_name": LOGIC_TOKEN_POLICY_NAME,
        },
        "base_len_chars": base_len,
        "cand_len_chars": cand_len,
        "len_ratio": len_ratio,
        "len_delta_chars": len_delta,
        "len_delta_ratio": len_delta_ratio,
        "base_words": base_words,
        "cand_words": cand_words,
        "base_sentences": base_sent,
        "cand_sentences": cand_sent,
        "similarity_ratio": sim,
        "diffs": diffs,
        "risk_label": risk_label,
        "risk_reasons": risk_reasons,
        "soft_flags": soft_flags,
        "base_tokens": base_tokens,
        "cand_tokens": cand_tokens,
        "length_policy": length_policy,
        "concise_soft_target_applied": concise_target_applied,
        "concise_soft_target_reason": concise_target_reason,
        "concise_soft_target_chars": concise_target_chars,
        "within_soft_target": within_soft_target,
        "new_structural_count": int(new_structural),
        "missing_structural_count": int(missing_structural),
        "new_logic_count": int(new_logic),
        "missing_logic_count": int(missing_logic),
        "new_risk_verbs_count": int(new_verbs),
        "missing_risk_verbs_count": int(missing_verbs),
    }


# ========= Optional semantic signals (embeddings + verifier) =========
def _compute_embedding_cosine(
    *,
    client: OpenAI,
    embedding_model: str,
    base_text: str,
    cand_text: str,
) -> Tuple[Optional[float], Optional[str], Dict[str, Any]]:
    meta: Dict[str, Any] = {"embedding_model": embedding_model, "provider_base_url": GEMINI_BASE_URL}
    try:
        rb = client.embeddings.create(model=embedding_model, input=base_text)
        rc = client.embeddings.create(model=embedding_model, input=cand_text)
        vb = getattr(rb.data[0], "embedding", None) if getattr(rb, "data", None) else None
        vc = getattr(rc.data[0], "embedding", None) if getattr(rc, "data", None) else None
        if not isinstance(vb, list) or not isinstance(vc, list):
            return None, "embedding_vector_missing", meta
        cos = _cosine_similarity(vb, vc)
        if cos is None:
            return None, "embedding_cosine_failed", meta
        return float(cos), None, meta
    except Exception as e:
        return None, str(e), meta


def _compute_entailment_verdict(
    *,
    client: OpenAI,
    verifier_model: str,
    base_text: str,
    cand_text: str,
    max_tokens: int,
) -> Tuple[Optional[str], Optional[str], Dict[str, Any]]:
    meta: Dict[str, Any] = {"verifier_model": verifier_model, "provider_base_url": GEMINI_BASE_URL}
    system = (
        "Task: semantic equivalence gate.\n"
        "Decision must be based only on whether the candidate text entails the base text with identical meaning.\n"
        "Output must be exactly one of: ENTAILS, NOT_ENTAILS.\n"
        "No explanations, no punctuation, no extra tokens.\n"
    )
    user = (
        "Base text:\n"
        f"{base_text.strip()}\n\n"
        "Candidate text:\n"
        f"{cand_text.strip()}\n"
    )
    try:
        resp = client.chat.completions.create(
            model=verifier_model,
            messages=[{"role": "system", "content": system}, {"role": "user", "content": user}],
            temperature=0.0,
            max_tokens=int(max_tokens),
        )
        out = (resp.choices[0].message.content or "").strip().upper()
        tok = out.split()[0] if out else ""
        if tok not in ("ENTAILS", "NOT_ENTAILS"):
            tok = "UNKNOWN"
        meta["finish_reason"] = getattr(resp.choices[0], "finish_reason", None)
        meta["usage"] = getattr(resp, "usage", None)
        return tok, None, meta
    except Exception as e:
        return None, str(e), meta


def augment_stats_with_semantic_signals(
    *,
    stats: Dict[str, Any],
    client: Optional[OpenAI],
    semantic_cfg: Optional[Dict[str, Any]],
    base_text: str,
    cand_text: str,
) -> Dict[str, Any]:
    if not isinstance(stats, dict):
        return stats
    if not semantic_cfg or not isinstance(semantic_cfg, dict):
        return stats
    if client is None:
        return stats

    enabled_embeddings = bool(semantic_cfg.get("enable_embeddings", False))
    embedding_model = str(semantic_cfg.get("embedding_model") or "").strip()
    emb_low_thr = semantic_cfg.get("embedding_low_cosine_threshold", DEFAULT_EMBEDDING_LOW_COSINE_THRESHOLD)

    enabled_verifier = bool(semantic_cfg.get("enable_verifier", False))
    verifier_model = str(semantic_cfg.get("verifier_model") or "").strip()
    verifier_max_tokens = int(semantic_cfg.get("verifier_max_tokens", DEFAULT_VERIFIER_MAX_TOKENS))

    if not isinstance(stats.get("soft_flags"), list):
        stats["soft_flags"] = []

    semantic_block: Dict[str, Any] = {"semantic_policy_name": SEMANTIC_POLICY_NAME}

    if enabled_embeddings and embedding_model:
        cos, err, meta = _compute_embedding_cosine(
            client=client,
            embedding_model=embedding_model,
            base_text=base_text,
            cand_text=cand_text,
        )
        semantic_block["embedding"] = {"cosine": cos, "error": err, "meta": meta}
        if isinstance(cos, (int, float)) and isinstance(emb_low_thr, (int, float)):
            if float(cos) < float(emb_low_thr):
                stats["soft_flags"].append(
                    {"type": "embedding_low_cosine", "cosine": float(cos), "threshold": float(emb_low_thr)}
                )
        if err:
            stats["soft_flags"].append({"type": "embedding_error", "error": str(err)[:200]})

    if enabled_verifier:
        if not verifier_model:
            verifier_model = str(semantic_cfg.get("fallback_llm_model") or "").strip()
        if verifier_model:
            label, err, meta = _compute_entailment_verdict(
                client=client,
                verifier_model=verifier_model,
                base_text=base_text,
                cand_text=cand_text,
                max_tokens=verifier_max_tokens,
            )
            semantic_block["verifier"] = {"label": label, "error": err, "meta": meta}
            if label == "NOT_ENTAILS":
                stats["soft_flags"].append({"type": "verifier_not_entails"})
            elif label == "UNKNOWN":
                stats["soft_flags"].append({"type": "verifier_unknown"})
            if err:
                stats["soft_flags"].append({"type": "verifier_error", "error": str(err)[:200]})
        else:
            stats["soft_flags"].append({"type": "verifier_unconfigured"})

    if semantic_block:
        stats["semantic"] = semantic_block

    return stats


def compute_full_candidate_stats(
    *,
    base_text: str,
    cand_text: str,
    mode_key: str,
    length_policy: Optional[Dict[str, Any]],
    client: Optional[OpenAI],
    semantic_cfg: Optional[Dict[str, Any]],
) -> Dict[str, Any]:
    stats = compute_candidate_stats(base_text=base_text, cand_text=cand_text, mode_key=mode_key, length_policy=length_policy)
    return augment_stats_with_semantic_signals(
        stats=stats,
        client=client,
        semantic_cfg=semantic_cfg,
        base_text=base_text,
        cand_text=cand_text,
    )


def _print_base_stats(base_desc: str, *, max_preview: int, max_tok_len: int) -> None:
    base = (base_desc or "").strip()
    tokens = _extract_indicator_tokens(base)
    print("\nStatistics (base):")
    print(f"  chars={len(base)}; words={_word_count(base)}; sentences={_sentence_count(base)}")
    print(
        "  tokens:"
        f" flags={len(tokens['flags'])}, field_like={len(tokens['field_like'])}, "
        f"numbers={len(tokens['numbers'])}, number_units={len(tokens['number_units'])}, "
        f"verbs={len(tokens['verbs'])}, snake={len(tokens['snake'])}, camel={len(tokens['camel'])}, "
        f"logic={len(tokens['logic'])}, modals={len(tokens['modals'])}, scope={len(tokens['scope'])}"
    )
    print(
        "  previews:"
        f" flags=[{_format_token_preview(tokens['flags'], max_items=max_preview, max_len=max_tok_len)}];"
        f" field_like=[{_format_token_preview(tokens['field_like'], max_items=max_preview, max_len=max_tok_len)}];"
        f" numbers=[{_format_token_preview(tokens['numbers'], max_items=max_preview, max_len=max_tok_len)}];"
        f" verbs=[{_format_token_preview(tokens['verbs'], max_items=max_preview, max_len=max_tok_len)}];"
        f" logic=[{_format_token_preview(tokens['logic'], max_items=max_preview, max_len=max_tok_len)}];"
        f" modals=[{_format_token_preview(tokens['modals'], max_items=max_preview, max_len=max_tok_len)}];"
        f" scope=[{_format_token_preview(tokens['scope'], max_items=max_preview, max_len=max_tok_len)}]"
    )


def _print_candidate_summary_line(
    i: int,
    cand: Dict[str, Any],
    *,
    max_preview: int,
    max_tok_len: int,
    snippet_chars: int,
) -> None:
    txt = (cand.get("text") or "").strip()
    err = cand.get("error")
    dup = bool(cand.get("duplicate", False))
    stats = cand.get("stats") or {}

    status = "ok"
    if err:
        status = f"error:{str(err)[:60]}"
    elif not txt:
        status = "empty"
    elif dup:
        status = "duplicate"

    risk = stats.get("risk_label") or "-"
    sim = stats.get("similarity_ratio")
    sim_s = f"{float(sim):.2f}" if isinstance(sim, (int, float)) else "-"

    diffs = (stats.get("diffs") or {})
    new_flags = len(((diffs.get("flags") or {}).get("new") or []))
    new_nums = len(((diffs.get("numbers") or {}).get("new") or [])) + len(((diffs.get("number_units") or {}).get("new") or []))
    new_verbs = len(((diffs.get("verbs") or {}).get("new") or []))
    new_fields = len(((diffs.get("field_like") or {}).get("new") or []))
    new_snake = len(((diffs.get("snake") or {}).get("new") or []))
    new_camel = len(((diffs.get("camel") or {}).get("new") or []))
    new_logic = (
        len(((diffs.get("logic") or {}).get("new") or [])) +
        len(((diffs.get("modals") or {}).get("new") or [])) +
        len(((diffs.get("scope") or {}).get("new") or []))
    )
    missing_total = (
        len(((diffs.get("flags") or {}).get("missing") or [])) +
        len(((diffs.get("numbers") or {}).get("missing") or [])) +
        len(((diffs.get("number_units") or {}).get("missing") or [])) +
        len(((diffs.get("verbs") or {}).get("missing") or [])) +
        len(((diffs.get("field_like") or {}).get("missing") or [])) +
        len(((diffs.get("snake") or {}).get("missing") or [])) +
        len(((diffs.get("camel") or {}).get("missing") or [])) +
        len(((diffs.get("logic") or {}).get("missing") or [])) +
        len(((diffs.get("modals") or {}).get("missing") or [])) +
        len(((diffs.get("scope") or {}).get("missing") or []))
    )

    clen = stats.get("cand_len_chars")
    cwords = stats.get("cand_words")
    csent = stats.get("cand_sentences")

    clen_s = str(clen) if isinstance(clen, int) else "-"
    cwords_s = str(cwords) if isinstance(cwords, int) else "-"
    csent_s = str(csent) if isinstance(csent, int) else "-"

    lr = stats.get("len_ratio")
    lr_s = f"{float(lr):.2f}" if isinstance(lr, (int, float)) else "-"
    ld = stats.get("len_delta_chars")
    ld_s = f"{int(ld):+d}" if isinstance(ld, int) else "-"

    t_applied = bool(stats.get("concise_soft_target_applied", False))
    t_chars = stats.get("concise_soft_target_chars")
    within = stats.get("within_soft_target")
    target_s = "-"
    if t_applied and isinstance(t_chars, int):
        if within is True:
            target_s = f"target<={t_chars} ok"
        elif within is False:
            target_s = f"target<={t_chars} NO"
        else:
            target_s = f"target<={t_chars}"

    emb_s = "-"
    ent_s = "-"
    sem = stats.get("semantic") if isinstance(stats.get("semantic"), dict) else None
    if isinstance(sem, dict):
        emb = sem.get("embedding") if isinstance(sem.get("embedding"), dict) else None
        if isinstance(emb, dict) and isinstance(emb.get("cosine"), (int, float)):
            emb_s = f"{float(emb.get('cosine')):.2f}"
        ver = sem.get("verifier") if isinstance(sem.get("verifier"), dict) else None
        if isinstance(ver, dict) and isinstance(ver.get("label"), str):
            ent_s = ver.get("label")

    print(
        f"  [{i}] status={status}; risk={risk}; sim={sim_s}; emb={emb_s}; ent={ent_s}; "
        f"cand(chars={clen_s}, words={cwords_s}, sent={csent_s}); "
        f"len_ratio={lr_s}; Δchars={ld_s}; {target_s}; "
        f"new(flags={new_flags}, fields={new_fields}, nums={new_nums}, snake={new_snake}, camel={new_camel}, logic={new_logic}, verbs={new_verbs}); "
        f"missing_total={missing_total}"
    )

    if isinstance(diffs, dict) and txt and not err:
        nf = (diffs.get("flags") or {}).get("new") or []
        nfv = (diffs.get("field_like") or {}).get("new") or []
        nn = (diffs.get("numbers") or {}).get("new") or []
        nnu = (diffs.get("number_units") or {}).get("new") or []
        nlogic = (diffs.get("logic") or {}).get("new") or []
        nmod = (diffs.get("modals") or {}).get("new") or []
        nscope = (diffs.get("scope") or {}).get("new") or []
        if nf or nfv or nn or nnu or nlogic or nmod or nscope:
            print(
                "      new-previews:"
                f" flags=[{_format_token_preview(list(nf), max_items=max_preview, max_len=max_tok_len)}];"
                f" fields=[{_format_token_preview(list(nfv), max_items=max_preview, max_len=max_tok_len)}];"
                f" numbers=[{_format_token_preview(list(nn), max_items=max_preview, max_len=max_tok_len)}];"
                f" number_units=[{_format_token_preview(list(nnu), max_items=max_preview, max_len=max_tok_len)}];"
                f" logic=[{_format_token_preview(list(nlogic), max_items=max_preview, max_len=max_tok_len)}];"
                f" modals=[{_format_token_preview(list(nmod), max_items=max_preview, max_len=max_tok_len)}];"
                f" scope=[{_format_token_preview(list(nscope), max_items=max_preview, max_len=max_tok_len)}]"
            )

    if txt and not err and int(snippet_chars) > 0:
        sn = " ".join(txt.split())
        max_sn = int(snippet_chars)
        if len(sn) > max_sn:
            sn = sn[: max_sn - 1] + "…"
        print(f"      text: {sn}")


def _print_candidate_full(
    i: int,
    cand: Dict[str, Any],
    *,
    max_preview: int,
    max_tok_len: int,
) -> None:
    txt = (cand.get("text") or "").strip()
    err = cand.get("error")
    stats = cand.get("stats") or {}
    diffs = (stats.get("diffs") or {})

    print(f"\nCandidate [{i}]:")
    if err:
        print(f"  Generation error: {err}")
        if txt:
            print("  Partial text:")
            print(txt)
        return
    if not txt:
        print("  (empty)")
        return

    print(txt)

    risk = stats.get("risk_label") or "-"
    reasons = stats.get("risk_reasons") or []
    soft = stats.get("soft_flags") or []

    print("\n  Candidate statistics:")
    sim = stats.get("similarity_ratio")
    sim_s = f"{float(sim):.2f}" if isinstance(sim, (int, float)) else "-"
    lr = stats.get("len_ratio")
    lr_s = f"{float(lr):.2f}" if isinstance(lr, (int, float)) else "-"
    ld = stats.get("len_delta_chars")
    ld_s = f"{int(ld):+d}" if isinstance(ld, int) else "-"

    sem = stats.get("semantic") if isinstance(stats.get("semantic"), dict) else None
    sem_s = ""
    if isinstance(sem, dict):
        emb = sem.get("embedding") if isinstance(sem.get("embedding"), dict) else None
        ver = sem.get("verifier") if isinstance(sem.get("verifier"), dict) else None
        if isinstance(emb, dict):
            sem_s += f"; emb_cos={emb.get('cosine')}"
            if emb.get("error"):
                sem_s += f"; emb_err={str(emb.get('error'))[:80]}"
        if isinstance(ver, dict):
            sem_s += f"; entail={ver.get('label')}"
            if ver.get("error"):
                sem_s += f"; ent_err={str(ver.get('error'))[:80]}"

    print(f"    risk={risk}; reasons={reasons if reasons else '[]'}; similarity={sim_s}; len_ratio={lr_s}; Δchars={ld_s}{sem_s}")

    if soft:
        print(f"    soft_flags={soft}")

    for key in ("flags", "field_like", "numbers", "number_units", "snake", "camel", "logic", "modals", "scope", "verbs"):
        d = diffs.get(key) or {}
        new_items = d.get("new") or []
        missing_items = d.get("missing") or []
        if not new_items and not missing_items:
            continue
        print(
            f"    {key}:"
            f" new({len(new_items)})=[{_format_token_preview(list(new_items), max_items=max_preview, max_len=max_tok_len)}];"
            f" missing({len(missing_items)})=[{_format_token_preview(list(missing_items), max_items=max_preview, max_len=max_tok_len)}]"
        )


# ========= Raw JSON-string patcher (for tools stored as JSON strings) =========
def _extract_json_string_value(raw_json: str, key: str) -> Optional[str]:
    token = f'"{key}"'
    i = raw_json.find(token)
    if i < 0:
        return None
    i = raw_json.find(":", i + len(token))
    if i < 0:
        return None
    i += 1
    n = len(raw_json)
    while i < n and raw_json[i] in " \t\r\n":
        i += 1
    if i >= n or raw_json[i] != '"':
        return None
    start = i
    i += 1
    esc = False
    while i < n:
        c = raw_json[i]
        if esc:
            esc = False
        else:
            if c == "\\":
                esc = True
            elif c == '"':
                return raw_json[start : i + 1]
        i += 1
    return None


def _decode_raw_json_string(raw_json_string_with_quotes: str) -> str:
    try:
        obj = json.loads('{"description":' + raw_json_string_with_quotes + "}")
        return obj.get("description") or ""
    except json.JSONDecodeError:
        return ""


def _get_description_for_print(entry: Any) -> Tuple[str, str]:
    if isinstance(entry, str):
        raw = _extract_json_string_value(entry, "description")
        if raw is not None:
            return raw, "raw_json"
        try:
            obj = json.loads(entry)
            return obj.get("description") or "", "rendered"
        except json.JSONDecodeError:
            return "", "rendered"
    if isinstance(entry, dict):
        return entry.get("description") or "", "rendered"
    return "", "rendered"


def _load_tool(entry: Any) -> Tuple[Optional[Dict[str, Any]], str]:
    if isinstance(entry, str):
        try:
            return json.loads(entry), "json_str"
        except json.JSONDecodeError:
            return None, "other"
    if isinstance(entry, dict):
        return entry, "dict"
    return None, "other"


def _skip_ws(s: str, i: int) -> int:
    n = len(s)
    while i < n and s[i] in " \t\r\n":
        i += 1
    return i


def _scan_string_span(s: str, i: int) -> Optional[Tuple[int, int]]:
    n = len(s)
    if i >= n or s[i] != '"':
        return None
    j = i + 1
    esc = False
    while j < n:
        c = s[j]
        if esc:
            esc = False
        else:
            if c == "\\":
                esc = True
            elif c == '"':
                return (i, j + 1)
        j += 1
    return None


def _scan_number_span(s: str, i: int) -> Optional[Tuple[int, int]]:
    n = len(s)
    j = i
    if j < n and s[j] == "-":
        j += 1
    if j >= n:
        return None
    if s[j] == "0":
        j += 1
    elif s[j].isdigit():
        while j < n and s[j].isdigit():
            j += 1
    else:
        return None
    if j < n and s[j] == ".":
        j += 1
        if j >= n or not s[j].isdigit():
            return None
        while j < n and s[j].isdigit():
            j += 1
    if j < n and s[j] in "eE":
        j += 1
        if j < n and s[j] in "+-":
            j += 1
        if j >= n or not s[j].isdigit():
            return None
        while j < n and s[j].isdigit():
            j += 1
    return (i, j)


def _scan_literal_span(s: str, i: int) -> Optional[Tuple[int, int]]:
    for lit in ("true", "false", "null"):
        if s.startswith(lit, i):
            return (i, i + len(lit))
    return None


def _scan_container_span(s: str, i: int) -> Optional[Tuple[int, int]]:
    n = len(s)
    if i >= n:
        return None

    opener = s[i]
    if opener not in "{[":
        return None

    stack: List[str] = ["}" if opener == "{" else "]"]
    j = i + 1
    in_str = False
    esc = False

    while j < n:
        c = s[j]

        if in_str:
            if esc:
                esc = False
            else:
                if c == "\\":
                    esc = True
                elif c == '"':
                    in_str = False
            j += 1
            continue

        if c == '"':
            in_str = True
            j += 1
            continue

        if c == "{":
            stack.append("}")
            j += 1
            continue
        if c == "[":
            stack.append("]")
            j += 1
            continue

        if c in "}]":
            if not stack:
                return None
            expected = stack[-1]
            if c != expected:
                return None
            stack.pop()
            j += 1
            if not stack:
                return (i, j)
            continue

        j += 1

    return None


def _is_value_delim(c: str) -> bool:
    return c in ",}]"


def _scan_value_span(s: str, i: int) -> Optional[Tuple[int, int]]:
    n = len(s)
    i = _skip_ws(s, i)
    if i >= n:
        return None

    c = s[i]
    if c == '"':
        return _scan_string_span(s, i)
    if c in "{[":
        return _scan_container_span(s, i)

    span: Optional[Tuple[int, int]]
    if c == "-" or c.isdigit():
        span = _scan_number_span(s, i)
    else:
        span = _scan_literal_span(s, i)

    if not span:
        return None

    _, end = span
    k = _skip_ws(s, end)
    if k >= n:
        return span
    if _is_value_delim(s[k]):
        return span
    return None


def _replace_top_level_string_field_in_raw_object(raw_json_obj: str, key: str, new_value: str) -> Tuple[str, bool, str]:
    s = raw_json_obj
    n = len(s)

    i = _skip_ws(s, 0)
    if i >= n or s[i] != "{":
        return raw_json_obj, False, "not_object"

    i += 1
    found_any_key = False
    expect_key = True

    while True:
        i = _skip_ws(s, i)
        if i >= n:
            return raw_json_obj, False, "cannot_scan_value"

        if expect_key:
            if s[i] == "}":
                return raw_json_obj, False, "key_not_found"
            if s[i] != '"':
                return raw_json_obj, False, "invalid_key_string"

            key_span = _scan_string_span(s, i)
            if not key_span:
                return raw_json_obj, False, "invalid_key_string"

            found_any_key = True
            k_start, k_end = key_span
            try:
                key_decoded = json.loads(s[k_start:k_end])
            except Exception:
                return raw_json_obj, False, "invalid_key_string"

            i = _skip_ws(s, k_end)
            if i >= n or s[i] != ":":
                return raw_json_obj, False, "missing_colon"

            v_span = _scan_value_span(s, i + 1)
            if not v_span:
                return raw_json_obj, False, "cannot_scan_value"

            v_start, v_end = v_span

            if key_decoded == key:
                if v_start >= n or s[v_start] != '"':
                    return raw_json_obj, False, "value_not_string"

                replacement_literal = json.dumps(new_value, ensure_ascii=False)
                patched = s[:v_start] + replacement_literal + s[v_end:]

                try:
                    obj = json.loads(patched)
                except Exception:
                    return raw_json_obj, False, "json_load_failed_after_patch"

                if isinstance(obj, dict) and obj.get(key) == new_value:
                    return patched, True, "ok"
                return raw_json_obj, False, "validation_failed_after_patch"

            i = v_end
            expect_key = False
            continue

        i = _skip_ws(s, i)
        if i >= n:
            return raw_json_obj, False, "cannot_scan_value"

        if s[i] == ",":
            i += 1
            expect_key = True
            continue
        if s[i] == "}":
            return raw_json_obj, False, ("key_not_found" if found_any_key else "key_not_found")
        return raw_json_obj, False, "cannot_scan_value"


# ========= IDs =========
def _tool_fingerprint_excluding_description(tool_obj: Dict[str, Any]) -> str:
    filtered = {k: v for k, v in tool_obj.items() if k != "description"}
    payload = _canonical_json(filtered)
    return hashlib.sha256(payload.encode("utf-8")).hexdigest()[:HASH_HEX_LEN]


def _record_id(record_obj: Dict[str, Any], tool_field: str) -> str:
    rec = dict(record_obj)
    tools = rec.get(tool_field)
    if isinstance(tools, list):
        canon_tools: List[Any] = []
        for entry in tools:
            tool_obj, kind = _load_tool(entry)
            if tool_obj is None:
                canon_tools.append({"_unparsed": entry, "_kind": kind})
            else:
                canon_tools.append({k: v for k, v in tool_obj.items() if k != "description"})
        rec[tool_field] = canon_tools
    payload = _canonical_json(rec)
    return hashlib.sha256(payload.encode("utf-8")).hexdigest()[:HASH_HEX_LEN]


def _tool_instance_key(record_id: str, tool_index: int, tool_obj: Dict[str, Any]) -> str:
    fp = _tool_fingerprint_excluding_description(tool_obj)
    return f"rec:{record_id}:t{tool_index}:{fp}"


# ========= Audit (single file, resumable) =========
def _audit_identity(
    dataset_path: Path,
    *,
    mode_key: str,
    model: str,
    tool_field: str,
    num_candidates: int,
    semantic_cfg: Optional[Dict[str, Any]],
) -> str:
    sc = semantic_cfg or {}
    stable = (
        f"{dataset_path.resolve()}|{mode_key}|{model}|{tool_field}|K={int(num_candidates)}|"
        f"{RISK_POLICY_NAME}|{LOGIC_TOKEN_POLICY_NAME}|{SEMANTIC_POLICY_NAME}|"
        f"emb={bool(sc.get('enable_embeddings', False))}:{str(sc.get('embedding_model') or '')}|"
        f"ver={bool(sc.get('enable_verifier', False))}:{str(sc.get('verifier_model') or '')}"
    )
    return hashlib.sha256(stable.encode("utf-8")).hexdigest()[:12]


def _audit_file_path(
    dataset_path: Path,
    *,
    audit_dir: Path,
    mode_key: str,
    model: str,
    tool_field: str,
    num_candidates: int,
    semantic_cfg: Optional[Dict[str, Any]],
) -> Path:
    audit_key = _audit_identity(
        dataset_path,
        mode_key=mode_key,
        model=model,
        tool_field=tool_field,
        num_candidates=int(num_candidates),
        semantic_cfg=semantic_cfg,
    )
    safe_model = "".join(ch if ch.isalnum() or ch in ("-", "_", ".") else "_" for ch in model)
    out_dir = audit_dir / audit_key
    filename = f"{dataset_path.stem}.{audit_key}.{mode_key}.{safe_model}.K{int(num_candidates)}.audit.jsonl"
    return out_dir / filename


def _append_audit_event(audit_file: Path, event: Dict[str, Any]) -> None:
    audit_file.parent.mkdir(parents=True, exist_ok=True)
    safe_event = _json_safe(event)
    with audit_file.open("a", encoding="utf-8") as f:
        f.write(json.dumps(safe_event, ensure_ascii=False) + "\n")


def _load_resume_state(
    audit_file: Path,
) -> Tuple[
    Dict[str, Tuple[str, Optional[str], Optional[Dict[str, Any]]]],
    Dict[str, int],
    Dict[str, Optional[str]],
    Optional[Dict[str, Any]],
]:
    decisions: Dict[str, Tuple[str, Optional[str], Optional[Dict[str, Any]]]] = {}
    regen_counts: Dict[str, int] = {}
    last_rejected_text: Dict[str, Optional[str]] = {}
    prior_run_start: Optional[Dict[str, Any]] = None

    if not audit_file.exists():
        return decisions, regen_counts, last_rejected_text, None

    best_round: Dict[str, int] = {}

    with audit_file.open("r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                ev = json.loads(line)
            except Exception:
                continue
            if not isinstance(ev, dict):
                continue

            et = ev.get("event_type")
            if et == "run_start" and prior_run_start is None:
                prior_run_start = ev

            if et == "regenerate":
                ik = ev.get("instance_key")
                rr = ev.get("generation_round")
                txt = ev.get("last_generated_text")
                if isinstance(ik, str) and isinstance(rr, int) and rr >= 0:
                    prev = regen_counts.get(ik, 0)
                    if rr > prev:
                        regen_counts[ik] = rr
                    prev_best = best_round.get(ik, -1)
                    if rr >= prev_best:
                        best_round[ik] = rr
                        last_rejected_text[ik] = txt if isinstance(txt, str) else None

            if et == "decision":
                ik = ev.get("instance_key")
                status = ev.get("status")
                final_desc = ev.get("final_description")
                llm_bundle = ev.get("llm_bundle")
                if isinstance(ik, str) and isinstance(status, str):
                    decisions[ik] = (
                        status,
                        final_desc if isinstance(final_desc, str) else None,
                        llm_bundle if isinstance(llm_bundle, dict) else None,
                    )

    return decisions, regen_counts, last_rejected_text, prior_run_start


# ========= LLM helpers =========
def _sanitize_llm_output(text: str) -> str:
    t = (text or "").strip()
    if t.startswith("{") and "description" in t:
        try:
            obj = json.loads(t)
            if isinstance(obj, dict) and isinstance(obj.get("description"), str):
                t = obj["description"].strip()
        except Exception:
            pass
    if (t.startswith('"') and t.endswith('"')) or (t.startswith("'") and t.endswith("'")):
        t = t[1:-1].strip()
    return t


def _llm_chat_completion(
    *,
    client: OpenAI,
    model: str,
    messages: List[Dict[str, str]],
    temperature: float,
    max_tokens: int,
    seed: Optional[int],
) -> Tuple[str, Dict[str, Any]]:
    meta: Dict[str, Any] = {
        "seed_requested": seed,
        "seed_applied": False,
        "seed_error": None,
        "finish_reason": None,
        "usage": None,
        "max_tokens_requested": int(max_tokens),
        "max_param_used": None,
    }

    base_kwargs: Dict[str, Any] = dict(model=model, messages=messages, temperature=temperature)

    def attempt(max_param_used: str, include_seed: bool) -> Tuple[str, Dict[str, Any]]:
        req = dict(base_kwargs)
        if max_param_used == "max_completion_tokens":
            req["max_completion_tokens"] = int(max_tokens)
        else:
            req["max_tokens"] = int(max_tokens)
        if include_seed and seed is not None:
            req["seed"] = int(seed)

        resp = client.chat.completions.create(**req)
        text = (resp.choices[0].message.content or "").strip()

        meta_local = dict(meta)
        meta_local["max_param_used"] = max_param_used
        meta_local["finish_reason"] = getattr(resp.choices[0], "finish_reason", None)
        meta_local["usage"] = getattr(resp, "usage", None)
        meta_local["seed_applied"] = bool(include_seed and seed is not None)
        return text, meta_local

    def is_seed_error(e: Exception) -> bool:
        s = str(e).lower()
        return ("seed" in s) and ("unknown" in s or "unsupported" in s or "invalid" in s)

    try:
        return attempt("max_completion_tokens", include_seed=True)
    except Exception as e1:
        if seed is not None and is_seed_error(e1):
            meta["seed_error"] = str(e1)
            try:
                return attempt("max_completion_tokens", include_seed=False)
            except Exception:
                pass
        try:
            return attempt("max_tokens", include_seed=True)
        except Exception as e2:
            if seed is not None and is_seed_error(e2):
                meta["seed_error"] = str(e2)
                return attempt("max_tokens", include_seed=False)
            raise


def generate_description_via_llm(
    *,
    client: OpenAI,
    tool_name: str,
    base_description: str,
    model: str,
    seed: Optional[int],
    max_tokens: int,
    retry_on_length: bool,
    retry_max_tokens: int,
    mode_key: str,
    style_spec: Dict[str, Any],
    regen_index: int = 0,
    previous_rewrite: Optional[str] = None,
    length_policy: Optional[Dict[str, Any]] = None,
) -> Tuple[str, Dict[str, Any]]:
    system = str(style_spec["system"])
    regen_instr = str(style_spec.get("regen_diversity_instruction") or "")
    max_prev = int(style_spec.get("max_prev_rewrite_chars") or DEFAULT_MAX_PREV_REWRITE_CHARS)

    user_parts: List[str] = []
    user_parts.append(f"Tool name: {tool_name}")
    user_parts.append("Base description:")
    user_parts.append(base_description.strip() or "(empty)")
    user_parts.append("")
    user_parts.append(f"Rewrite in '{mode_key}' under the constraints.")

    if mode_key == "style_concise" and isinstance(length_policy, dict):
        ct = length_policy.get("concise_soft_target") if isinstance(length_policy.get("concise_soft_target"), dict) else {}
        applied = bool(ct.get("applied", False))
        target_chars = ct.get("target_chars") if isinstance(ct.get("target_chars"), int) else None
        ratio = ct.get("target_ratio")
        if applied and isinstance(target_chars, int):
            pct = int(float(ratio) * 100) if isinstance(ratio, (int, float)) else 70
            user_parts.append("")
            user_parts.append(f"Length guidance (soft target): <= {target_chars} characters (~{pct}% of base).")
            user_parts.append(
                "Exceeding the target is permitted only if strictly necessary to preserve meaning; "
                "no explicitly stated details may be omitted."
            )
        else:
            user_parts.append("")
            user_parts.append(
                "Length guidance: the base description is short or cannot be shortened safely; "
                "the rewrite must not exceed the base length and must remain as brief as possible."
            )

    if regen_index > 0:
        user_parts.append("")
        user_parts.append(f"Regeneration request: {regen_index}")
        if regen_instr:
            user_parts.append(regen_instr)
        if previous_rewrite and previous_rewrite.strip():
            prev = previous_rewrite.strip()
            if len(prev) > max_prev:
                prev = prev[:max_prev].rstrip()
            user_parts.append("")
            user_parts.append("Previous rewrite (wording must not be reused):")
            user_parts.append(prev)

    user = "\n".join(user_parts)

    raw1, meta1 = _llm_chat_completion(
        client=client,
        model=model,
        messages=[{"role": "system", "content": system}, {"role": "user", "content": user}],
        temperature=0.0,
        max_tokens=max_tokens,
        seed=seed,
    )
    san1 = _sanitize_llm_output(raw1)
    finish1 = (meta1.get("finish_reason") or "").lower()
    looks_truncated_1 = (finish1 == "length")

    if not looks_truncated_1:
        return san1, {
            "proposal_origin": "primary",
            "proposal_sanitized_final": san1,
            "llm_text_raw_primary": raw1,
            "llm_text_raw_retry": None,
            "primary": meta1,
            "retry": None,
            "mode_key": mode_key,
            "length_policy": length_policy,
        }

    raw2 = None
    meta2 = None
    san2 = None
    best_san = san1
    origin = "primary"

    if retry_on_length and retry_max_tokens > max_tokens:
        raw2, meta2 = _llm_chat_completion(
            client=client,
            model=model,
            messages=[{"role": "system", "content": system}, {"role": "user", "content": user}],
            temperature=0.0,
            max_tokens=int(retry_max_tokens),
            seed=seed,
        )
        san2 = _sanitize_llm_output(raw2)
        if san2 and len(san2) >= len(best_san):
            best_san = san2
            origin = "retry"

    return best_san, {
        "proposal_origin": origin,
        "proposal_sanitized_final": best_san,
        "llm_text_raw_primary": raw1,
        "llm_text_raw_retry": raw2,
        "primary": meta1,
        "retry": meta2,
        "mode_key": mode_key,
        "length_policy": length_policy,
    }


def _print_perturbation_context(
    *,
    show: bool,
    tool_name: str,
    candidate_i_1based: int,
    k: int,
    generation_round: int,
    regen_index: int,
    seed: Optional[int],
    max_tokens: int,
    mode_key: str,
    style_spec: Dict[str, Any],
    length_policy: Optional[Dict[str, Any]],
    prev_hint: Optional[str],
) -> None:
    """
    Prints (to stdout) the "perturbation context" used for this candidate generation.
    This is intentionally concise: it shows *what* changes across candidates/rounds without dumping full prompts.
    """
    if not show:
        return

    regen_instr = str(style_spec.get("regen_diversity_instruction") or "").strip()

    ct = None
    if isinstance(length_policy, dict):
        ct0 = length_policy.get("concise_soft_target")
        if isinstance(ct0, dict):
            ct = ct0

    ct_applied = bool(ct.get("applied", False)) if isinstance(ct, dict) else False
    ct_target = ct.get("target_chars") if (isinstance(ct, dict) and isinstance(ct.get("target_chars"), int)) else None
    ct_reason = ct.get("reason") if isinstance(ct, dict) else None

    print(
        f"\n  [gen] {tool_name} | cand {candidate_i_1based}/{int(k)} | round={int(generation_round)} | "
        f"regen_index={int(regen_index)} | mode={mode_key} | seed={seed} | max_tokens={int(max_tokens)}"
    )

    if mode_key == "style_concise":
        if ct_applied and isinstance(ct_target, int):
            print(f"      length_guidance: soft_target<= {int(ct_target)} chars (applied)")
        else:
            print(f"      length_guidance: soft_target not applied (reason={ct_reason})")

    if int(generation_round) > 0 or int(regen_index) > 0:
        if regen_instr:
            sn = " ".join(regen_instr.split())
            if len(sn) > 220:
                sn = sn[:219] + "…"
            print(f"      diversity_instruction: {sn}")

    if prev_hint and prev_hint.strip():
        ph = prev_hint.strip()
        ph_sn = " ".join(ph.split())
        if len(ph_sn) > 220:
            ph_sn = ph_sn[:219] + "…"
        print(
            f"      previous_rewrite_hint: len={len(ph)} sha={_sha256_text(ph)[:12]} snippet='{ph_sn}'"
        )


def generate_k_candidates_with_stats(
    *,
    client: OpenAI,
    tool_name: str,
    base_description: str,
    model: str,
    seed: Optional[int],
    max_tokens: int,
    retry_on_length: bool,
    retry_max_tokens: int,
    mode_key: str,
    style_spec: Dict[str, Any],
    generation_round: int,
    k: int,
    previous_rewrite_hint: Optional[str],
    min_sleep_sec_between_calls: float,
    length_policy: Optional[Dict[str, Any]],
    semantic_cfg: Optional[Dict[str, Any]],
    show_perturbations: bool,
) -> Tuple[List[Dict[str, Any]], Optional[str]]:
    base_desc = (base_description or "").strip()
    k_eff = max(1, int(k))

    candidates: List[Dict[str, Any]] = []
    seen: set = set()

    prev = previous_rewrite_hint.strip() if isinstance(previous_rewrite_hint, str) and previous_rewrite_hint.strip() else None
    last_generated_text: Optional[str] = None

    for i in range(0, k_eff):
        regen_index = int(generation_round) * 1000 + i
        text = ""
        bundle: Optional[Dict[str, Any]] = None
        err: Optional[str] = None

        # Feb 3 patch: print perturbation context *before* the LLM call.
        _print_perturbation_context(
            show=bool(show_perturbations),
            tool_name=tool_name,
            candidate_i_1based=i + 1,
            k=k_eff,
            generation_round=int(generation_round),
            regen_index=int(regen_index),
            seed=seed,
            max_tokens=int(max_tokens),
            mode_key=mode_key,
            style_spec=style_spec,
            length_policy=length_policy,
            prev_hint=prev,
        )

        try:
            text, bundle = generate_description_via_llm(
                client=client,
                tool_name=tool_name,
                base_description=base_desc,
                model=model,
                seed=seed,
                max_tokens=max_tokens,
                retry_on_length=retry_on_length,
                retry_max_tokens=retry_max_tokens,
                mode_key=mode_key,
                style_spec=style_spec,
                regen_index=regen_index,
                previous_rewrite=prev,
                length_policy=length_policy,
            )
            text = (text or "").strip()
            last_generated_text = text if text else last_generated_text
        except Exception as e:
            err = str(e)
            text = ""

        duplicate = False
        if text:
            if text in seen:
                duplicate = True
            else:
                seen.add(text)

        stats = compute_full_candidate_stats(
            base_text=base_desc,
            cand_text=text,
            mode_key=mode_key,
            length_policy=length_policy,
            client=client,
            semantic_cfg=semantic_cfg,
        ) if text else None

        candidates.append(
            {
                "candidate_index": i + 1,
                "text": text,
                "error": err,
                "bundle": bundle,
                "duplicate": duplicate,
                "stats": stats,
            }
        )

        prev = text if text else prev

        if min_sleep_sec_between_calls > 0:
            time.sleep(float(min_sleep_sec_between_calls))

    return candidates, last_generated_text


# ========= IO =========
def make_working_copy(input_jsonl: str, output_jsonl: str, *, overwrite: bool = False) -> str:
    src = Path(input_jsonl)
    dst = Path(output_jsonl)

    if not src.exists():
        raise FileNotFoundError(f"File not found: {src}")

    if dst.exists() and not overwrite:
        return str(dst)

    dst.parent.mkdir(parents=True, exist_ok=True)
    shutil.copy2(src, dst)
    return str(dst)


def _normalize_cmd(raw: str) -> str:
    """
    Normalizes user command strings.
    Important: maps Esc (raw '\x1b' or common textual forms) to quit.
    """
    r = raw if raw is not None else ""
    # If raw-key mode is enabled, Esc is returned as '\x1b'.
    if ("\x1b" in r) or (r.strip().lower() in ("esc", "<esc>", "^[", "escape")):
        return "q"

    c = (r or "").strip().lower()
    if c in ("", "y", "yes", "ok", "okay", "si", "sì"):
        return "y"
    if c in ("r", "retry", "again", "prova", "prova ancora", "rigenera"):
        return "r"
    if c in ("e", "edit", "modifica"):
        return "e"
    if c in ("m", "manual", "mine", "mio", "mia", "custom"):
        return "m"
    if c in ("s", "skip", "salta", "pass"):
        return "s"
    if c in ("q", "quit", "exit", "esci"):
        return "q"
    return c


def _parse_candidate_choice(cmd: str, *, k: int) -> Optional[int]:
    c = (cmd or "").strip()
    if not c:
        return None
    if c.isdigit():
        v = int(c)
        if 1 <= v <= int(k):
            return v
    return None


# ========= Main interactive =========
def interactive_llm_tools_in_jsonl(
    jsonl_path: str,
    *,
    tool_field: str,
    create_backup_of_target: bool,
    llm_model: str,
    seed: Optional[int],
    max_tokens: int,
    retry_on_length: bool,
    retry_max_tokens: int,
    allow_reserialize_fallback: bool,
    min_sleep_sec_between_calls: float,
    audit_dir: str,
    mode_key: str,
    num_candidates: int,
    max_token_preview: int,
    max_token_string_len: int,
    candidate_snippet_chars: int,
    concise_target_ratio: float,
    concise_target_min_base_len: int,
    concise_target_min_chars: int,
    semantic_cfg: Optional[Dict[str, Any]],
    show_perturbations: bool,
    raw_key_input: bool,
) -> None:
    mode_key, style_spec = _resolve_style(mode_key)

    path = Path(jsonl_path)
    if not path.exists():
        raise FileNotFoundError(f"File not found: {jsonl_path}")

    client = make_gemini_client()
    audit_file = _audit_file_path(
        path,
        audit_dir=Path(audit_dir),
        mode_key=mode_key,
        model=llm_model,
        tool_field=tool_field,
        num_candidates=int(num_candidates),
        semantic_cfg=semantic_cfg,
    )

    decisions_by_instance, regen_counts, last_rejected_text_by_instance, prior_run_start = _load_resume_state(audit_file)

    tool_order: List[Dict[str, Any]] = []
    with path.open("r", encoding="utf-8") as f:
        for raw_line in f:
            line = raw_line.rstrip("\n")
            if not line.strip():
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                continue
            if not isinstance(record, dict):
                continue

            rid = _record_id(record, tool_field=tool_field)
            tools = record.get(tool_field)
            if not isinstance(tools, list):
                continue

            for tool_index, entry in enumerate(tools):
                tool_obj, kind = _load_tool(entry)
                if not tool_obj:
                    continue
                name = (tool_obj.get("name") or "").strip()
                if not name:
                    continue

                desc_print, desc_mode = _get_description_for_print(entry)
                instance_key = _tool_instance_key(rid, tool_index, tool_obj)

                tool_order.append(
                    {
                        "record_id": rid,
                        "tool_index": tool_index,
                        "tool_name": name,
                        "desc_print": desc_print,
                        "desc_mode": desc_mode,
                        "instance_key": instance_key,
                        "entry_kind": kind,
                    }
                )

    n_total = len(tool_order)
    n_prev_reviewed = len(decisions_by_instance)

    start_pos = 0
    while start_pos < n_total and tool_order[start_pos]["instance_key"] in decisions_by_instance:
        start_pos += 1

    session_id = hashlib.sha256(f"{time.time_ns()}".encode("utf-8")).hexdigest()[:12]
    before_sha = _sha256_file(path)

    length_policy_config = {
        "concise_soft_target": {
            "ratio": float(concise_target_ratio),
            "min_base_len": int(concise_target_min_base_len),
            "min_chars": int(concise_target_min_chars),
            "policy_name": "concise_soft_target_v1",
        }
    }

    semantic_cfg_norm = dict(semantic_cfg or {})
    semantic_cfg_norm["fallback_llm_model"] = llm_model

    if prior_run_start is None:
        _append_audit_event(
            audit_file,
            {
                "event_type": "run_start",
                "ts": int(time.time()),
                "session_id": session_id,
                "mode": mode_key,
                "model": llm_model,
                "seed": seed,
                "dataset_path": str(path),
                "dataset_sha256_at_session_start": before_sha,
                "tool_field": tool_field,
                "n_total_occurrences": n_total,
                "max_tokens_requested": int(max_tokens),
                "retry_on_length": bool(retry_on_length),
                "retry_max_tokens": int(retry_max_tokens),
                "allow_reserialize_fallback": bool(allow_reserialize_fallback),
                "num_candidates": int(num_candidates),
                "min_sleep_sec_between_calls": float(min_sleep_sec_between_calls),
                "stats_max_token_preview": int(max_token_preview),
                "stats_max_token_string_len": int(max_token_string_len),
                "candidate_snippet_chars": int(candidate_snippet_chars),
                "length_policy_config": length_policy_config,
                "policy_versions": {
                    "risk_policy_name": RISK_POLICY_NAME,
                    "logic_token_policy_name": LOGIC_TOKEN_POLICY_NAME,
                    "semantic_policy_name": SEMANTIC_POLICY_NAME,
                },
                "semantic_cfg": semantic_cfg_norm,
                "show_perturbations": bool(show_perturbations),
                "raw_key_input": bool(raw_key_input),
            },
        )
    else:
        _append_audit_event(
            audit_file,
            {
                "event_type": "run_resume",
                "ts": int(time.time()),
                "session_id": session_id,
                "mode": mode_key,
                "model": llm_model,
                "seed": seed,
                "dataset_path": str(path),
                "dataset_sha256_at_session_start": before_sha,
                "tool_field": tool_field,
                "n_total_occurrences": n_total,
                "n_previously_reviewed": n_prev_reviewed,
                "resume_from_index_1based": (start_pos + 1) if start_pos < n_total else (n_total + 1),
                "num_candidates": int(num_candidates),
                "candidate_snippet_chars": int(candidate_snippet_chars),
                "length_policy_config": length_policy_config,
                "policy_versions": {
                    "risk_policy_name": RISK_POLICY_NAME,
                    "logic_token_policy_name": LOGIC_TOKEN_POLICY_NAME,
                    "semantic_policy_name": SEMANTIC_POLICY_NAME,
                },
                "semantic_cfg": semantic_cfg_norm,
                "show_perturbations": bool(show_perturbations),
                "raw_key_input": bool(raw_key_input),
            },
        )

    print(f"Target: {path}")
    print(f"Mode: {mode_key}")
    print(f"Audit file (RESUMABLE): {audit_file}")
    print(f"Tool occurrences total: {n_total}")
    if start_pos < n_total:
        print(f"Resume position: [{start_pos + 1}/{n_total}] (previously reviewed: {n_prev_reviewed})")
    else:
        print(f"Resume position: completed (previously reviewed: {n_prev_reviewed})")
    print(f"LLM: {llm_model} @ {GEMINI_BASE_URL}")
    print(f"Candidates per tool: {int(num_candidates)}")
    print(f"Candidate snippet chars: {int(candidate_snippet_chars)}")
    print(f"Policies: risk={RISK_POLICY_NAME}; logic={LOGIC_TOKEN_POLICY_NAME}; semantic={SEMANTIC_POLICY_NAME}")
    print(f"Perturbation prints: {'enabled' if show_perturbations else 'disabled'}")
    print(f"Raw key input (Esc-safe): {'enabled' if raw_key_input else 'disabled'}")

    if semantic_cfg_norm.get("enable_embeddings"):
        print(
            "Embedding signal: enabled; "
            f"model='{semantic_cfg_norm.get('embedding_model')}', "
            f"low_cos_thr={semantic_cfg_norm.get('embedding_low_cosine_threshold')}"
        )
    else:
        print("Embedding signal: disabled")
    if semantic_cfg_norm.get("enable_verifier"):
        print(
            "Verifier signal: enabled; "
            f"model='{semantic_cfg_norm.get('verifier_model') or llm_model}', "
            f"max_tokens={int(semantic_cfg_norm.get('verifier_max_tokens', DEFAULT_VERIFIER_MAX_TOKENS))}"
        )
    else:
        print("Verifier signal: disabled")

    if mode_key == "style_concise":
        print(
            "Concise soft target: "
            f"ratio={float(concise_target_ratio):.2f}, "
            f"min_base_len={int(concise_target_min_base_len)}, "
            f"min_chars={int(concise_target_min_chars)}"
        )
    print(f"Max tokens: {int(max_tokens)}; retry_on_length={bool(retry_on_length)}; retry_max_tokens={int(retry_max_tokens)}")
    print(
        "Commands: ENTER/ok=accept #1, 1..K=accept candidate, r=regenerate K, "
        "e=edit candidate, m=manual, s=skip, q/Esc=quit, p<idx>=preview (e.g., p2)\n"
    )

    quit_requested = False
    resume_next_index_1based: Optional[int] = None

    max_prev = int(style_spec.get("max_prev_rewrite_chars") or DEFAULT_MAX_PREV_REWRITE_CHARS)

    session_summary: Dict[str, Any] = {
        "accepted": 0,
        "edited": 0,
        "manual": 0,
        "skipped": 0,
        "accepted_risk_labels": {"LOW": 0, "MED": 0, "HIGH": 0, "NA": 0},
        "accepted_similarity_sum": 0.0,
        "accepted_similarity_n": 0,
        "accepted_base_chars_sum": 0,
        "accepted_cand_chars_sum": 0,
        "accepted_len_ratio_sum": 0.0,
        "accepted_len_ratio_n": 0,
        "accepted_len_delta_sum": 0,
        "accepted_soft_target_applicable_n": 0,
        "accepted_within_soft_target_n": 0,
        "accepted_embedding_cos_sum": 0.0,
        "accepted_embedding_cos_n": 0,
        "accepted_entails": 0,
        "accepted_not_entails": 0,
        "accepted_entails_unknown": 0,
    }

    for pos in range(start_pos, n_total):
        item = tool_order[pos]
        idx = pos + 1

        name = item["tool_name"]
        desc_mode = item["desc_mode"]
        old_desc_print = item["desc_print"]
        instance_key = item["instance_key"]
        rid = item["record_id"]
        tool_i = item["tool_index"]

        generation_round = int(regen_counts.get(instance_key, 0))
        previous_rewrite_hint: Optional[str] = last_rejected_text_by_instance.get(instance_key)

        print("=" * 80)
        print(f"[{idx}/{n_total}] {name}")
        print(f"instance_key: {instance_key} (record_id={rid}, tool_index={tool_i})")

        if desc_mode == "raw_json":
            print("Current description RAW (escaped):")
            print(old_desc_print if old_desc_print else "(empty)")
            base_desc = _decode_raw_json_string(old_desc_print) if old_desc_print else ""
            print("\nCurrent description DECODED:")
            print(base_desc if base_desc else "(empty)")
        else:
            base_desc = old_desc_print or ""
            print("Current description:")
            print(base_desc if base_desc else "(empty)")

        base_desc = (base_desc or "").strip()
        base_len_chars = len(base_desc)

        length_policy = _make_length_policy(
            base_desc=base_desc,
            mode_key=mode_key,
            concise_ratio=float(concise_target_ratio),
            concise_min_base_len=int(concise_target_min_base_len),
            concise_min_chars=int(concise_target_min_chars),
        )

        _print_base_stats(base_desc, max_preview=int(max_token_preview), max_tok_len=int(max_token_string_len))
        if mode_key == "style_concise":
            ct = length_policy.get("concise_soft_target", {}) if isinstance(length_policy.get("concise_soft_target"), dict) else {}
            if ct.get("applied") and isinstance(ct.get("target_chars"), int):
                print(f"Concise soft target (applied): target_chars={ct.get('target_chars')} (base_len={base_len_chars})")
            else:
                print(f"Concise soft target (not applied): reason={ct.get('reason')} (base_len={base_len_chars})")

        candidates: List[Dict[str, Any]] = []
        last_generated_text: Optional[str] = None

        while True:
            if not candidates:
                try:
                    candidates, last_generated_text = generate_k_candidates_with_stats(
                        client=client,
                        tool_name=name,
                        base_description=base_desc,
                        model=llm_model,
                        seed=seed,
                        max_tokens=max_tokens,
                        retry_on_length=retry_on_length,
                        retry_max_tokens=retry_max_tokens,
                        mode_key=mode_key,
                        style_spec=style_spec,
                        generation_round=generation_round,
                        k=int(num_candidates),
                        previous_rewrite_hint=previous_rewrite_hint,
                        min_sleep_sec_between_calls=float(min_sleep_sec_between_calls),
                        length_policy=length_policy,
                        semantic_cfg=semantic_cfg_norm,
                        show_perturbations=bool(show_perturbations),
                    )
                except Exception as e:
                    print(f"\nLLM ERROR (candidate set generation): {e}")
                    raw = _read_command("Choice [m=manual, s=skip, q/Esc=quit] > ", k=int(num_candidates), raw_key_input=bool(raw_key_input))
                    cmd = _normalize_cmd(raw)
                    now = int(time.time())

                    if cmd == "q":
                        quit_requested = True
                        resume_next_index_1based = idx
                        break

                    if cmd == "s":
                        decisions_by_instance[instance_key] = ("skipped", None, None)
                        _append_audit_event(
                            audit_file,
                            {
                                "event_type": "decision",
                                "ts": now,
                                "session_id": session_id,
                                "status": "skipped",
                                "tool_name": name,
                                "instance_key": instance_key,
                                "record_id": rid,
                                "tool_index": tool_i,
                                "model": llm_model,
                                "seed": seed,
                                "mode": mode_key,
                                "base_description": base_desc,
                                "final_description": None,
                                "source": "user",
                                "note": "skip_after_llm_error",
                                "length_policy": length_policy,
                                "semantic_cfg": semantic_cfg_norm,
                            },
                        )
                        session_summary["skipped"] += 1
                        break

                    if cmd == "m":
                        manual_final = input("Manual final description (empty cancels) > ").rstrip("\n").strip()
                        status = "manual" if manual_final else "skipped"
                        decisions_by_instance[instance_key] = (status, manual_final or None, None)
                        diff_stats = compute_full_candidate_stats(
                            base_text=base_desc,
                            cand_text=manual_final,
                            mode_key=mode_key,
                            length_policy=length_policy,
                            client=client,
                            semantic_cfg=semantic_cfg_norm,
                        ) if manual_final else None

                        _append_audit_event(
                            audit_file,
                            {
                                "event_type": "decision",
                                "ts": now,
                                "session_id": session_id,
                                "status": status,
                                "tool_name": name,
                                "instance_key": instance_key,
                                "record_id": rid,
                                "tool_index": tool_i,
                                "model": llm_model,
                                "seed": seed,
                                "mode": mode_key,
                                "base_description": base_desc,
                                "final_description": manual_final or None,
                                "source": "user",
                                "note": "manual_after_llm_error",
                                "diff_stats": diff_stats,
                                "length_policy": length_policy,
                                "semantic_cfg": semantic_cfg_norm,
                            },
                        )
                        if status == "manual":
                            session_summary["manual"] += 1
                        else:
                            session_summary["skipped"] += 1
                        break

                    candidates = []
                    continue

                _append_audit_event(
                    audit_file,
                    {
                        "event_type": "candidates_generated",
                        "ts": int(time.time()),
                        "session_id": session_id,
                        "tool_name": name,
                        "instance_key": instance_key,
                        "record_id": rid,
                        "tool_index": tool_i,
                        "mode": mode_key,
                        "model": llm_model,
                        "seed": seed,
                        "generation_round": int(generation_round),
                        "num_candidates_requested": int(num_candidates),
                        "base_len_chars": int(base_len_chars),
                        "length_policy": length_policy,
                        "semantic_cfg": semantic_cfg_norm,
                        "candidates_summary": [
                            {
                                "candidate_index": c.get("candidate_index"),
                                "text_sha256": _sha256_text((c.get("text") or "").strip()),
                                "text_len": len((c.get("text") or "").strip()),
                                "error": c.get("error"),
                                "duplicate": bool(c.get("duplicate", False)),
                                "risk_label": ((c.get("stats") or {}).get("risk_label") if isinstance(c.get("stats"), dict) else None),
                                "similarity_ratio": ((c.get("stats") or {}).get("similarity_ratio") if isinstance(c.get("stats"), dict) else None),
                                "len_ratio": ((c.get("stats") or {}).get("len_ratio") if isinstance(c.get("stats"), dict) else None),
                                "len_delta_chars": ((c.get("stats") or {}).get("len_delta_chars") if isinstance(c.get("stats"), dict) else None),
                                "new_structural_count": ((c.get("stats") or {}).get("new_structural_count") if isinstance(c.get("stats"), dict) else None),
                                "missing_structural_count": ((c.get("stats") or {}).get("missing_structural_count") if isinstance(c.get("stats"), dict) else None),
                                "new_logic_count": ((c.get("stats") or {}).get("new_logic_count") if isinstance(c.get("stats"), dict) else None),
                                "missing_logic_count": ((c.get("stats") or {}).get("missing_logic_count") if isinstance(c.get("stats"), dict) else None),
                                "concise_soft_target_applied": ((c.get("stats") or {}).get("concise_soft_target_applied") if isinstance(c.get("stats"), dict) else None),
                                "concise_soft_target_chars": ((c.get("stats") or {}).get("concise_soft_target_chars") if isinstance(c.get("stats"), dict) else None),
                                "within_soft_target": ((c.get("stats") or {}).get("within_soft_target") if isinstance(c.get("stats"), dict) else None),
                                "embedding_cosine": (
                                    (((c.get("stats") or {}).get("semantic") or {}).get("embedding") or {}).get("cosine")
                                    if isinstance(c.get("stats"), dict) else None
                                ),
                                "verifier_label": (
                                    (((c.get("stats") or {}).get("semantic") or {}).get("verifier") or {}).get("label")
                                    if isinstance(c.get("stats"), dict) else None
                                ),
                            }
                            for c in candidates
                        ],
                    },
                )

            print("\nCandidates overview:")
            for c in candidates:
                _print_candidate_summary_line(
                    int(c.get("candidate_index") or 0),
                    c,
                    max_preview=int(max_token_preview),
                    max_tok_len=int(max_token_string_len),
                    snippet_chars=int(candidate_snippet_chars),
                )

            raw = _read_command(
                f"\nChoice [ENTER=accept #1, 1..{int(num_candidates)}=accept, r=regen, e=edit, m=manual, s=skip, q/Esc=quit, p<idx>=preview] > ",
                k=int(num_candidates),
                raw_key_input=bool(raw_key_input),
            )
            cmd = _normalize_cmd(raw)
            now = int(time.time())

            if cmd == "q":
                quit_requested = True
                resume_next_index_1based = idx
                break

            if cmd == "s":
                decisions_by_instance[instance_key] = ("skipped", None, None)
                _append_audit_event(
                    audit_file,
                    {
                        "event_type": "decision",
                        "ts": now,
                        "session_id": session_id,
                        "status": "skipped",
                        "tool_name": name,
                        "instance_key": instance_key,
                        "record_id": rid,
                        "tool_index": tool_i,
                        "model": llm_model,
                        "seed": seed,
                        "mode": mode_key,
                        "base_description": base_desc,
                        "final_description": None,
                        "source": "user",
                        "note": "skip",
                        "length_policy": length_policy,
                        "semantic_cfg": semantic_cfg_norm,
                    },
                )
                session_summary["skipped"] += 1
                break

            if cmd == "r":
                generation_round += 1
                regen_counts[instance_key] = int(generation_round)

                if last_generated_text and isinstance(last_generated_text, str) and last_generated_text.strip():
                    hint = last_generated_text.strip()
                    if len(hint) > max_prev:
                        hint = hint[:max_prev].rstrip()
                    previous_rewrite_hint = hint
                    last_rejected_text_by_instance[instance_key] = hint

                _append_audit_event(
                    audit_file,
                    {
                        "event_type": "regenerate",
                        "ts": now,
                        "session_id": session_id,
                        "tool_name": name,
                        "instance_key": instance_key,
                        "record_id": rid,
                        "tool_index": tool_i,
                        "mode": mode_key,
                        "generation_round": int(generation_round),
                        "last_generated_text": previous_rewrite_hint,
                        "last_generated_text_sha256": _sha256_text(previous_rewrite_hint or ""),
                        "length_policy": length_policy,
                        "semantic_cfg": semantic_cfg_norm,
                    },
                )

                candidates = []
                last_generated_text = None
                if min_sleep_sec_between_calls > 0:
                    time.sleep(float(min_sleep_sec_between_calls))
                continue

            if cmd == "m":
                manual_final = input("Manual final description (empty cancels) > ").rstrip("\n").strip()
                status = "manual" if manual_final else "skipped"
                decisions_by_instance[instance_key] = (status, manual_final or None, None)

                diff_stats = compute_full_candidate_stats(
                    base_text=base_desc,
                    cand_text=manual_final,
                    mode_key=mode_key,
                    length_policy=length_policy,
                    client=client,
                    semantic_cfg=semantic_cfg_norm,
                ) if manual_final else None

                _append_audit_event(
                    audit_file,
                    {
                        "event_type": "decision",
                        "ts": now,
                        "session_id": session_id,
                        "status": status,
                        "tool_name": name,
                        "instance_key": instance_key,
                        "record_id": rid,
                        "tool_index": tool_i,
                        "model": llm_model,
                        "seed": seed,
                        "mode": mode_key,
                        "base_description": base_desc,
                        "final_description": manual_final or None,
                        "source": "user",
                        "note": "manual_replace",
                        "diff_stats": diff_stats,
                        "length_policy": length_policy,
                        "semantic_cfg": semantic_cfg_norm,
                    },
                )
                if status == "manual":
                    session_summary["manual"] += 1
                else:
                    session_summary["skipped"] += 1
                break

            if cmd == "e":
                raw_idx = input(f"Candidate index to edit [1..{int(num_candidates)}] (empty=1) > ").strip()
                chosen_i = 1
                if raw_idx and raw_idx.isdigit():
                    chosen_i = int(raw_idx)
                if not (1 <= chosen_i <= int(num_candidates)):
                    print("Invalid candidate index.")
                    continue

                cand = candidates[chosen_i - 1] if (chosen_i - 1) < len(candidates) else None
                base_text = (cand.get("text") or "").strip() if isinstance(cand, dict) else ""
                if base_text:
                    print("\nSelected candidate text:")
                    print(base_text)
                else:
                    print("\nSelected candidate is empty; editing starts from empty string.")
                    base_text = ""

                edited = input("Edit final description (empty cancels) > ").rstrip("\n").strip()
                status = "edited" if edited else "skipped"
                bundle = cand.get("bundle") if isinstance(cand, dict) else None
                stats = compute_full_candidate_stats(
                    base_text=base_desc,
                    cand_text=edited,
                    mode_key=mode_key,
                    length_policy=length_policy,
                    client=client,
                    semantic_cfg=semantic_cfg_norm,
                ) if edited else None

                decisions_by_instance[instance_key] = (status, edited or None, bundle if isinstance(bundle, dict) else None)
                _append_audit_event(
                    audit_file,
                    {
                        "event_type": "decision",
                        "ts": now,
                        "session_id": session_id,
                        "status": status,
                        "tool_name": name,
                        "instance_key": instance_key,
                        "record_id": rid,
                        "tool_index": tool_i,
                        "model": llm_model,
                        "seed": seed,
                        "mode": mode_key,
                        "base_description": base_desc,
                        "final_description": edited or None,
                        "source": "user",
                        "note": "edit_candidate",
                        "chosen_candidate_index": int(chosen_i),
                        "llm_bundle": bundle if isinstance(bundle, dict) else None,
                        "diff_stats": stats,
                        "generation_round": int(generation_round),
                        "length_policy": length_policy,
                        "semantic_cfg": semantic_cfg_norm,
                    },
                )
                if status == "edited":
                    session_summary["edited"] += 1
                else:
                    session_summary["skipped"] += 1
                break

            choice_i = 1 if cmd == "y" else _parse_candidate_choice(cmd, k=int(num_candidates))
            if choice_i is not None:
                if not (1 <= int(choice_i) <= int(num_candidates)):
                    print("Invalid candidate index.")
                    continue
                cand = candidates[int(choice_i) - 1] if (int(choice_i) - 1) < len(candidates) else None
                if not isinstance(cand, dict):
                    print("Candidate not available.")
                    continue
                if cand.get("error") or not (cand.get("text") or "").strip():
                    print("Selected candidate is not acceptable (empty or error).")
                    _print_candidate_full(int(choice_i), cand, max_preview=int(max_token_preview), max_tok_len=int(max_token_string_len))
                    continue

                final_desc = (cand.get("text") or "").strip()
                bundle = cand.get("bundle") if isinstance(cand.get("bundle"), dict) else None
                stats = cand.get("stats") if isinstance(cand.get("stats"), dict) else compute_full_candidate_stats(
                    base_text=base_desc,
                    cand_text=final_desc,
                    mode_key=mode_key,
                    length_policy=length_policy,
                    client=client,
                    semantic_cfg=semantic_cfg_norm,
                )

                decisions_by_instance[instance_key] = ("accepted", final_desc, bundle)
                _append_audit_event(
                    audit_file,
                    {
                        "event_type": "decision",
                        "ts": now,
                        "session_id": session_id,
                        "status": "accepted",
                        "tool_name": name,
                        "instance_key": instance_key,
                        "record_id": rid,
                        "tool_index": tool_i,
                        "model": llm_model,
                        "seed": seed,
                        "mode": mode_key,
                        "base_description": base_desc,
                        "final_description": final_desc,
                        "source": "llm",
                        "chosen_candidate_index": int(choice_i),
                        "generation_round": int(generation_round),
                        "llm_bundle": bundle,
                        "diff_stats": stats,
                        "length_policy": length_policy,
                        "semantic_cfg": semantic_cfg_norm,
                    },
                )

                session_summary["accepted"] += 1
                rl = (stats.get("risk_label") if isinstance(stats, dict) else None) or "NA"
                if rl not in session_summary["accepted_risk_labels"]:
                    rl = "NA"
                session_summary["accepted_risk_labels"][rl] += 1

                sim = stats.get("similarity_ratio") if isinstance(stats, dict) else None
                if isinstance(sim, (int, float)):
                    session_summary["accepted_similarity_sum"] += float(sim)
                    session_summary["accepted_similarity_n"] += 1

                bl = stats.get("base_len_chars") if isinstance(stats, dict) else None
                cl = stats.get("cand_len_chars") if isinstance(stats, dict) else None
                lr = stats.get("len_ratio") if isinstance(stats, dict) else None
                ld = stats.get("len_delta_chars") if isinstance(stats, dict) else None
                if isinstance(bl, int) and isinstance(cl, int):
                    session_summary["accepted_base_chars_sum"] += int(bl)
                    session_summary["accepted_cand_chars_sum"] += int(cl)
                if isinstance(lr, (int, float)):
                    session_summary["accepted_len_ratio_sum"] += float(lr)
                    session_summary["accepted_len_ratio_n"] += 1
                if isinstance(ld, int):
                    session_summary["accepted_len_delta_sum"] += int(ld)

                wst = stats.get("within_soft_target") if isinstance(stats, dict) else None
                st_applied = bool(stats.get("concise_soft_target_applied", False)) if isinstance(stats, dict) else False
                if st_applied:
                    session_summary["accepted_soft_target_applicable_n"] += 1
                    if wst is True:
                        session_summary["accepted_within_soft_target_n"] += 1

                sem = stats.get("semantic") if isinstance(stats.get("semantic"), dict) else None
                if isinstance(sem, dict):
                    emb = sem.get("embedding") if isinstance(sem.get("embedding"), dict) else None
                    if isinstance(emb, dict) and isinstance(emb.get("cosine"), (int, float)):
                        session_summary["accepted_embedding_cos_sum"] += float(emb.get("cosine"))
                        session_summary["accepted_embedding_cos_n"] += 1
                    ver = sem.get("verifier") if isinstance(sem.get("verifier"), dict) else None
                    if isinstance(ver, dict) and isinstance(ver.get("label"), str):
                        lab = ver.get("label")
                        if lab == "ENTAILS":
                            session_summary["accepted_entails"] += 1
                        elif lab == "NOT_ENTAILS":
                            session_summary["accepted_not_entails"] += 1
                        else:
                            session_summary["accepted_entails_unknown"] += 1

                break

            if cmd.startswith("p"):
                raw_idx = cmd[1:].strip()
                if raw_idx.isdigit():
                    vi = int(raw_idx)
                    if 1 <= vi <= int(num_candidates):
                        _print_candidate_full(vi, candidates[vi - 1], max_preview=int(max_token_preview), max_tok_len=int(max_token_string_len))
                        continue
                print("Preview command format: p<index>, for example: p2")
                continue

            print("Invalid command. Preview: p<index> (example: p2).")

        if quit_requested:
            break

    # ========= Apply decisions to file =========
    tmp_path = path.with_suffix(path.suffix + ".tmp")
    updated_count = 0
    patch_failures = 0

    with path.open("r", encoding="utf-8") as fin, tmp_path.open("w", encoding="utf-8") as fout:
        for raw_line in fin:
            line = raw_line.rstrip("\n")
            if not line.strip():
                fout.write(line + "\n")
                continue

            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                fout.write(line + "\n")
                continue

            if not isinstance(record, dict):
                fout.write(json.dumps(record, ensure_ascii=False) + "\n")
                continue

            rid = _record_id(record, tool_field=tool_field)
            tools = record.get(tool_field)

            if isinstance(tools, list):
                new_tools: List[Any] = []
                for tool_index, entry in enumerate(tools):
                    tool_obj, kind = _load_tool(entry)
                    if not tool_obj:
                        new_tools.append(entry)
                        continue

                    instance_key = _tool_instance_key(rid, tool_index, tool_obj)
                    decision = decisions_by_instance.get(instance_key)

                    if decision is None:
                        new_tools.append(entry)
                        continue

                    status, new_desc, llm_bundle = decision
                    if status in ("accepted", "edited", "manual") and new_desc:
                        if kind == "json_str" and isinstance(entry, str):
                            already_ok = False
                            try:
                                obj0 = json.loads(entry)
                                if isinstance(obj0, dict) and obj0.get("description") == new_desc:
                                    already_ok = True
                            except Exception:
                                already_ok = False

                            if already_ok:
                                new_tools.append(entry)
                                continue

                            patched, ok, reason = _replace_top_level_string_field_in_raw_object(entry, "description", new_desc)
                            if ok:
                                new_tools.append(patched)
                                updated_count += 1
                            else:
                                fallback_ok = False
                                fallback_patched = entry
                                if allow_reserialize_fallback:
                                    try:
                                        obj = json.loads(entry)
                                        if isinstance(obj, dict):
                                            obj["description"] = new_desc
                                            fallback_patched = json.dumps(obj, ensure_ascii=False)
                                            fallback_ok = True
                                    except Exception:
                                        fallback_ok = False

                                if fallback_ok:
                                    new_tools.append(fallback_patched)
                                    updated_count += 1
                                    _append_audit_event(
                                        audit_file,
                                        {
                                            "event_type": "patch_fallback_reserialize",
                                            "ts": int(time.time()),
                                            "session_id": session_id,
                                            "instance_key": instance_key,
                                            "record_id": rid,
                                            "tool_index": tool_index,
                                            "tool_name": tool_obj.get("name"),
                                            "mode": mode_key,
                                            "entry_sha256_before": _sha256_text(entry),
                                            "entry_sha256_after": _sha256_text(fallback_patched),
                                            "patch_reason": reason,
                                        },
                                    )
                                else:
                                    new_tools.append(entry)
                                    patch_failures += 1
                        else:
                            if tool_obj.get("description") == new_desc:
                                new_tools.append(tool_obj)
                                continue
                            tool_obj["description"] = new_desc
                            new_tools.append(tool_obj)
                            updated_count += 1
                    else:
                        new_tools.append(entry)

                record[tool_field] = new_tools

            fout.write(json.dumps(record, ensure_ascii=False) + "\n")

    if create_backup_of_target:
        bak_path = path.with_suffix(path.suffix + ".bak")
        if not bak_path.exists():
            shutil.copy2(path, bak_path)

    tmp_path.replace(path)
    after_sha = _sha256_file(path)

    n_reviewed = len(decisions_by_instance)
    n_skipped = sum(1 for st, _, _ in decisions_by_instance.values() if st == "skipped")
    completed = (n_reviewed >= n_total) and (not quit_requested)

    avg_sim = None
    if int(session_summary["accepted_similarity_n"]) > 0:
        avg_sim = float(session_summary["accepted_similarity_sum"]) / float(session_summary["accepted_similarity_n"])

    avg_len_ratio = None
    if int(session_summary["accepted_len_ratio_n"]) > 0:
        avg_len_ratio = float(session_summary["accepted_len_ratio_sum"]) / float(session_summary["accepted_len_ratio_n"])

    avg_len_delta = None
    if int(session_summary["accepted"]) > 0:
        avg_len_delta = float(session_summary["accepted_len_delta_sum"]) / float(session_summary["accepted"])

    avg_base_len = None
    avg_cand_len = None
    if int(session_summary["accepted"]) > 0:
        avg_base_len = float(session_summary["accepted_base_chars_sum"]) / float(session_summary["accepted"])
        avg_cand_len = float(session_summary["accepted_cand_chars_sum"]) / float(session_summary["accepted"])

    avg_emb_cos = None
    if int(session_summary["accepted_embedding_cos_n"]) > 0:
        avg_emb_cos = float(session_summary["accepted_embedding_cos_sum"]) / float(session_summary["accepted_embedding_cos_n"])

    _append_audit_event(
        audit_file,
        {
            "event_type": "run_end",
            "ts": int(time.time()),
            "session_id": session_id,
            "mode": mode_key,
            "model": llm_model,
            "seed": seed,
            "dataset_path": str(path),
            "dataset_sha256_at_session_start": before_sha,
            "dataset_sha256_at_session_end": after_sha,
            "n_total_occurrences": n_total,
            "n_reviewed_total": n_reviewed,
            "n_updated_this_session": updated_count,
            "n_skipped_total": n_skipped,
            "completed": bool(completed),
            "quit_requested": bool(quit_requested),
            "raw_patch_failures_this_session": patch_failures,
            "resume_next_index_1based": resume_next_index_1based if quit_requested else (n_total + 1 if completed else None),
            "session_summary": {
                "accepted": int(session_summary["accepted"]),
                "edited": int(session_summary["edited"]),
                "manual": int(session_summary["manual"]),
                "skipped": int(session_summary["skipped"]),
                "accepted_risk_labels": session_summary["accepted_risk_labels"],
                "accepted_avg_similarity": avg_sim,
                "accepted_similarity_n": int(session_summary["accepted_similarity_n"]),
                "accepted_avg_len_ratio": avg_len_ratio,
                "accepted_len_ratio_n": int(session_summary["accepted_len_ratio_n"]),
                "accepted_avg_len_delta_chars": avg_len_delta,
                "accepted_avg_base_len_chars": avg_base_len,
                "accepted_avg_cand_len_chars": avg_cand_len,
                "accepted_soft_target_applicable_n": int(session_summary["accepted_soft_target_applicable_n"]),
                "accepted_within_soft_target_n": int(session_summary["accepted_within_soft_target_n"]),
                "accepted_avg_embedding_cosine": avg_emb_cos,
                "accepted_embedding_cosine_n": int(session_summary["accepted_embedding_cos_n"]),
                "accepted_entails": int(session_summary["accepted_entails"]),
                "accepted_not_entails": int(session_summary["accepted_not_entails"]),
                "accepted_entails_unknown": int(session_summary["accepted_entails_unknown"]),
            },
            "length_policy_config": length_policy_config,
            "policy_versions": {
                "risk_policy_name": RISK_POLICY_NAME,
                "logic_token_policy_name": LOGIC_TOKEN_POLICY_NAME,
                "semantic_policy_name": SEMANTIC_POLICY_NAME,
            },
            "semantic_cfg": semantic_cfg_norm,
            "show_perturbations": bool(show_perturbations),
            "raw_key_input": bool(raw_key_input),
        },
    )

    print("\nChanges applied.")
    print(f"Mode: {mode_key}")
    print(f"Candidates per tool: {int(num_candidates)}")
    print(f"Candidate snippet chars: {int(candidate_snippet_chars)}")
    print(f"Perturbation prints: {'enabled' if show_perturbations else 'disabled'}")
    print(f"Raw key input (Esc-safe): {'enabled' if raw_key_input else 'disabled'}")
    print(f"Descriptions updated (this session): {updated_count}")
    if patch_failures:
        print(f"Raw JSON-string patch failures (left unchanged): {patch_failures}")
    print(f"Reviewed total (from audit): {n_reviewed} / {n_total}")
    print(f"Completed: {completed} (quit_requested={quit_requested})")
    if quit_requested and resume_next_index_1based is not None:
        print(f"Resume next time from: [{resume_next_index_1based}/{n_total}]")
    print(f"Updated file: {path}")
    print(f"Audit file (same on resume): {audit_file}")

    print("\nSession summary (heuristic):")
    print(
        f"  accepted={int(session_summary['accepted'])}, edited={int(session_summary['edited'])}, "
        f"manual={int(session_summary['manual'])}, skipped={int(session_summary['skipped'])}"
    )
    print(f"  accepted_risk_labels={session_summary['accepted_risk_labels']}")
    if avg_sim is not None:
        print(f"  accepted_avg_similarity={avg_sim:.2f} (n={int(session_summary['accepted_similarity_n'])})")
    if avg_len_ratio is not None:
        print(f"  accepted_avg_len_ratio={avg_len_ratio:.2f} (n={int(session_summary['accepted_len_ratio_n'])})")
    if avg_len_delta is not None:
        print(f"  accepted_avg_len_delta_chars={avg_len_delta:+.1f}")
    if avg_base_len is not None and avg_cand_len is not None:
        print(f"  accepted_avg_base_len_chars={avg_base_len:.1f}; accepted_avg_cand_len_chars={avg_cand_len:.1f}")
    if avg_emb_cos is not None:
        print(f"  accepted_avg_embedding_cosine={avg_emb_cos:.2f} (n={int(session_summary['accepted_embedding_cosine_n'])})")
    if semantic_cfg_norm.get("enable_verifier"):
        print(
            "  accepted_verifier_counts: "
            f"ENTAILS={int(session_summary['accepted_entails'])}, "
            f"NOT_ENTAILS={int(session_summary['accepted_not_entails'])}, "
            f"UNKNOWN={int(session_summary['accepted_entails_unknown'])}"
        )
    if mode_key == "style_concise":
        print(
            "  accepted_soft_target: "
            f"applicable={int(session_summary['accepted_soft_target_applicable_n'])}, "
            f"within={int(session_summary['accepted_within_soft_target_n'])}"
        )


def _derive_working_copy_path(input_path: str, mode_key: str) -> str:
    p = Path(input_path)
    return str(p.with_name(f"{p.stem}.WORKING_COPY.{mode_key}{p.suffix}"))


if __name__ == "__main__":
    # ----- Inputs -----
    INPUT_JSONL = os.environ.get("INPUT_JSONL") or "When2Call/data/test/when2call_test_llm_judge.jsonl"
    MODE_KEY = os.environ.get("MODE_KEY") or "style_concise"
    LLM_MODEL = os.environ.get("LLM_MODEL") or LLM_MODEL_DEFAULT

    mode_key_resolved, _ = _resolve_style(MODE_KEY)

    OUTPUT_JSONL = os.environ.get("OUTPUT_JSONL") or _derive_working_copy_path(INPUT_JSONL, mode_key_resolved)

    working = make_working_copy(INPUT_JSONL, OUTPUT_JSONL, overwrite=False)
    print(f"Working copy: {working}")

    # ----- Runtime knobs -----
    seed_env = os.environ.get("GEMINI_SEED")
    seed_val: Optional[int] = int(seed_env.strip()) if (seed_env and seed_env.strip()) else None

    max_tokens_env = os.environ.get("GEMINI_MAX_TOKENS")
    max_tokens_val = int(max_tokens_env.strip()) if (max_tokens_env and max_tokens_env.strip()) else DEFAULT_MAX_TOKENS

    retry_max_tokens_env = os.environ.get("GEMINI_RETRY_MAX_TOKENS")
    retry_max_tokens_val = int(retry_max_tokens_env.strip()) if (retry_max_tokens_env and retry_max_tokens_env.strip()) else RETRY_MAX_TOKENS

    allow_reserialize_env = os.environ.get("ALLOW_RESERIALIZE_FALLBACK")
    allow_reserialize_val = (
        bool(int(allow_reserialize_env.strip()))
        if (allow_reserialize_env and allow_reserialize_env.strip())
        else DEFAULT_ALLOW_RESERIALIZE_FALLBACK
    )

    num_candidates_val = _safe_int_env("NUM_CANDIDATES", DEFAULT_NUM_CANDIDATES)
    min_sleep_val = _safe_float_env("MIN_SLEEP_SEC_BETWEEN_CALLS", 0.0)

    max_preview_val = _safe_int_env("STATS_MAX_TOKEN_PREVIEW", DEFAULT_MAX_TOKEN_PREVIEW)
    max_tok_len_val = _safe_int_env("STATS_MAX_TOKEN_STRING_LEN", DEFAULT_MAX_TOKEN_STRING_LEN)

    cand_snippet_val = _safe_int_env("CANDIDATE_SNIPPET_CHARS", DEFAULT_CANDIDATE_SNIPPET_CHARS)

    concise_ratio_val = _safe_float_env("CONCISE_TARGET_RATIO", DEFAULT_CONCISE_TARGET_RATIO)
    concise_min_base_len_val = _safe_int_env("CONCISE_TARGET_MIN_BASE_LEN", DEFAULT_CONCISE_TARGET_MIN_BASE_LEN)
    concise_min_chars_val = _safe_int_env("CONCISE_TARGET_MIN_CHARS", DEFAULT_CONCISE_TARGET_MIN_CHARS)

    enable_embeddings_val = _safe_bool_env("ENABLE_EMBEDDINGS", DEFAULT_ENABLE_EMBEDDINGS)
    embedding_model_val = os.environ.get("EMBEDDING_MODEL") or DEFAULT_EMBEDDING_MODEL
    emb_low_thr_val = _safe_float_env("EMBEDDING_LOW_COSINE_THRESHOLD", DEFAULT_EMBEDDING_LOW_COSINE_THRESHOLD)

    enable_verifier_val = _safe_bool_env("ENABLE_VERIFIER", DEFAULT_ENABLE_VERIFIER)
    verifier_model_val = os.environ.get("VERIFIER_MODEL") or DEFAULT_VERIFIER_MODEL
    verifier_max_tokens_val = _safe_int_env("VERIFIER_MAX_TOKENS", DEFAULT_VERIFIER_MAX_TOKENS)

    semantic_cfg_val: Dict[str, Any] = {
        "enable_embeddings": bool(enable_embeddings_val and bool(str(embedding_model_val).strip())),
        "embedding_model": str(embedding_model_val).strip(),
        "embedding_low_cosine_threshold": float(emb_low_thr_val),
        "enable_verifier": bool(enable_verifier_val),
        "verifier_model": str(verifier_model_val).strip(),
        "verifier_max_tokens": int(verifier_max_tokens_val),
    }

    # Feb 3 knobs
    show_perturbations_val = _safe_bool_env("SHOW_PERTURBATIONS", DEFAULT_SHOW_PERTURBATIONS)
    raw_key_input_val = _safe_bool_env("RAW_KEY_INPUT", DEFAULT_RAW_KEY_INPUT)

    interactive_llm_tools_in_jsonl(
        working,
        tool_field="tools",
        create_backup_of_target=False,
        llm_model=LLM_MODEL,
        seed=seed_val,
        max_tokens=max_tokens_val,
        retry_on_length=RETRY_ON_LENGTH,
        retry_max_tokens=retry_max_tokens_val,
        allow_reserialize_fallback=allow_reserialize_val,
        min_sleep_sec_between_calls=float(min_sleep_val),
        audit_dir=os.environ.get("AUDIT_DIR") or "audit",
        mode_key=mode_key_resolved,
        num_candidates=int(num_candidates_val),
        max_token_preview=int(max_preview_val),
        max_token_string_len=int(max_tok_len_val),
        candidate_snippet_chars=int(cand_snippet_val),
        concise_target_ratio=float(concise_ratio_val),
        concise_target_min_base_len=int(concise_min_base_len_val),
        concise_target_min_chars=int(concise_min_chars_val),
        semantic_cfg=semantic_cfg_val,
        show_perturbations=bool(show_perturbations_val),
        raw_key_input=bool(raw_key_input_val),
    )


Working copy: When2Call/data/test/when2call_test_llm_judge.WORKING_COPY.style_concise.jsonl
Target: When2Call/data/test/when2call_test_llm_judge.WORKING_COPY.style_concise.jsonl
Mode: style_concise
Audit file (RESUMABLE): audit/06f0b2ed2c69/when2call_test_llm_judge.WORKING_COPY.style_concise.06f0b2ed2c69.style_concise.gemini-2.5-flash.K2.audit.jsonl
Tool occurrences total: 978
Resume position: [2/978] (previously reviewed: 1)
LLM: gemini-2.5-flash @ https://generativelanguage.googleapis.com/v1beta/openai/
Candidates per tool: 2
Candidate snippet chars: 160
Policies: risk=risk_policy_v2_structural_logic_primary; logic=logic_tokens_v1; semantic=semantic_signals_v1
Perturbation prints: enabled
Raw key input (Esc-safe): enabled
Embedding signal: disabled
Verifier signal: disabled
Concise soft target: ratio=0.70, min_base_len=160, min_chars=80
Max tokens: 512; retry_on_length=True; retry_max_tokens=1024
Commands: ENTER/ok=accept #1, 1..K=accept candidate, r=regenerate K, e=edit candidate, m

In [3]:
#!/usr/bin/env python3
# December 28
#
# Interactive, resumable tool-description rewrite workflow with:
# - K-candidate generation per tool instance (configurable)
# - Deterministic statistical/lexical risk indicators printed alongside base and candidates
# - Human-in-the-loop decision (accept candidate, edit, manual, skip), with append-only audit log
#
# Additions (Jan 21 test):
# - Candidate text snippet shown in the overview, so the reviewer can choose without extra commands.
# - Explicit preview command documented: p<idx> (e.g., p2) prints the full candidate + stats.
#
# Additions (Concise soft length target, reviewer-proof):
# - Optional soft length target for style_concise: default 30% shorter, applied only if base_len >= threshold.
# - The target is guidance only (exceptions allowed to preserve meaning); out-of-target is flagged and logged.
# - Length metrics (len_ratio, len_delta) are computed for every candidate and stored in audit for reporting.

import json
import shutil
import os
import time
import hashlib
import difflib
import re
from pathlib import Path
from typing import Dict, Any, Optional, Tuple, List

from openai import OpenAI


# ========= Config =========
GEMINI_BASE_URL = "https://generativelanguage.googleapis.com/v1beta/openai/"
LLM_MODEL_DEFAULT = "gemini-2.5-flash"

HASH_HEX_LEN = 32

DEFAULT_MAX_TOKENS = 512
RETRY_ON_LENGTH = True
RETRY_MAX_TOKENS = 1024

DEFAULT_ALLOW_RESERIALIZE_FALLBACK = False

# How much of the last generated candidate to store in audit and to feed back into prompt.
DEFAULT_MAX_PREV_REWRITE_CHARS = 800

# Candidate count shown per tool instance.
DEFAULT_NUM_CANDIDATES = 2

# Printing controls for token previews in statistics.
DEFAULT_MAX_TOKEN_PREVIEW = 8
DEFAULT_MAX_TOKEN_STRING_LEN = 48

# Candidate text snippet in overview (chars).
DEFAULT_CANDIDATE_SNIPPET_CHARS = 160

# Soft concise length target knobs (reviewer-proof defaults).
DEFAULT_CONCISE_TARGET_RATIO = 0.70
DEFAULT_CONCISE_TARGET_MIN_BASE_LEN = 160
DEFAULT_CONCISE_TARGET_MIN_CHARS = 80


# ========= Styles =========
STYLE_SPECS: Dict[str, Dict[str, Any]] = {
    "style_verbose": {
        "system": (
            "Rewrite tool descriptions.\n"
            "Hard constraints:\n"
            "- Preserve meaning exactly; do not add new capabilities, steps, motivations, benefits, or context.\n"
            "- Do not delete information present in the original description.\n"
            "- Do not introduce new parameter names, IDs, field names, flags, or implementation details.\n"
            "- If parameter/field names/IDs/flags already appear in the original description, keep them (do not remove them).\n"
            "- Do not add examples, normative language, or assumptions.\n"
            "- Keep the same subject (the tool) and the same scope.\n"
            "- Output only the rewritten description text, nothing else.\n"
            "- Style: verbose but controlled; keep it concise and complete (1–2 sentences), clear and direct.\n"
        ),
        "regen_diversity_instruction": (
            "Return a meaning-equivalent rewrite that is lexically different from your previous rewrite; "
            "avoid repeating the same sentence structure."
        ),
        "max_prev_rewrite_chars": 800,
    },
    "style_concise": {
        "system": (
            "Rewrite tool descriptions.\n"
            "Hard constraints:\n"
            "- Preserve meaning exactly; do not add new capabilities, steps, motivations, benefits, or context.\n"
            "- Do not delete information present in the original description.\n"
            "- Do not introduce new parameter names, IDs, field names, flags, or implementation details.\n"
            "- If parameter/field names/IDs/flags already appear in the original description, keep them (do not remove them).\n"
            "- Do not add examples, normative language, or assumptions.\n"
            "- Keep the same subject (the tool) and the same scope.\n"
            "- Output only the rewritten description text, nothing else.\n"
            " - Style: concise and controlled; 1 sentence preferred, 2 max.\n"
            " - Length constraint: aim to be shorter than the base description; if the base description is already short, do not exceed its length.\n"
            " - Compression rule: remove redundancy, filler, and hedging; keep all explicitly stated constraints/details.\n"
        ),
        "regen_diversity_instruction": (
            "You must produce a different paraphrase than the previous rewrite. "
            "Do not reuse the same sentence skeleton or distinctive phrases. "
            "Keep meaning exactly the same; only vary wording and structure."
        ),
        "max_prev_rewrite_chars": 600,
    },
    # Alias to tolerate misspellings.
    "style_coicnoso": {},   # filled after dict creation
    "style_coinceise": {},  # filled after dict creation
}
STYLE_SPECS["style_coicnoso"] = STYLE_SPECS["style_concise"]
STYLE_SPECS["style_coinceise"] = STYLE_SPECS["style_concise"]


def _resolve_style(mode_key: str) -> Tuple[str, Dict[str, Any]]:
    mk = (mode_key or "").strip()
    if not mk:
        mk = "style_verbose"
    if mk not in STYLE_SPECS:
        raise ValueError(f"Unknown MODE_KEY='{mk}'. Supported: {', '.join(sorted(STYLE_SPECS.keys()))}")
    return mk, STYLE_SPECS[mk]


# ========= Client =========
def make_gemini_client() -> OpenAI:
    token = os.environ.get("TOKEN_GEMINI")
    if not token:
        raise RuntimeError("TOKEN_GEMINI environment variable is not set.")
    return OpenAI(api_key=token, base_url=GEMINI_BASE_URL)


# ========= Small utils =========
def _json_safe(obj: Any) -> Any:
    if obj is None or isinstance(obj, (str, int, float, bool)):
        return obj
    if isinstance(obj, dict):
        return {str(k): _json_safe(v) for k, v in obj.items()}
    if isinstance(obj, (list, tuple)):
        return [_json_safe(x) for x in obj]
    if hasattr(obj, "model_dump") and callable(getattr(obj, "model_dump")):
        try:
            return _json_safe(obj.model_dump())
        except Exception:
            pass
    if hasattr(obj, "dict") and callable(getattr(obj, "dict")):
        try:
            return _json_safe(obj.dict())
        except Exception:
            pass
    if hasattr(obj, "__dict__"):
        try:
            return _json_safe(vars(obj))
        except Exception:
            pass
    try:
        return str(obj)
    except Exception:
        return None


def _sha256_text(s: str) -> str:
    return hashlib.sha256((s or "").encode("utf-8")).hexdigest()


def _canonical_json(obj: Any) -> str:
    return json.dumps(obj, ensure_ascii=False, sort_keys=True, separators=(",", ":"))


def _sha256_file(path: Path) -> str:
    h = hashlib.sha256()
    with path.open("rb") as f:
        for chunk in iter(lambda: f.read(1024 * 1024), b""):
            h.update(chunk)
    return h.hexdigest()


def _safe_int_env(name: str, default: int) -> int:
    v = os.environ.get(name)
    if v is None or not v.strip():
        return int(default)
    try:
        return int(v.strip())
    except Exception:
        return int(default)


def _safe_float_env(name: str, default: float) -> float:
    v = os.environ.get(name)
    if v is None or not v.strip():
        return float(default)
    try:
        return float(v.strip())
    except Exception:
        return float(default)


# ========= Concise soft target (policy) =========
def _make_length_policy(
    *,
    base_desc: str,
    mode_key: str,
    concise_ratio: float,
    concise_min_base_len: int,
    concise_min_chars: int,
) -> Dict[str, Any]:
    """
    Returns a policy object (always present) used for:
    - prompt guidance (soft target)
    - stats (len_ratio, within_target)
    - audit reporting

    Soft target is applied only if:
    - mode_key == style_concise
    - base_len >= concise_min_base_len
    - computed target is strictly shorter than base_len
    """
    base = (base_desc or "").strip()
    base_len = len(base)

    ratio = float(concise_ratio)
    min_base_len = int(concise_min_base_len)
    min_chars = int(concise_min_chars)

    reason = "not_concise_mode"
    applied = False
    target_chars: Optional[int] = None

    if mode_key == "style_concise":
        if base_len < min_base_len or base_len <= 0:
            reason = "base_too_short"
        else:
            raw_target = int(base_len * ratio)
            candidate_target = max(raw_target, min_chars)
            # If env is mis-set (min_chars > base), do not apply a target that would exceed base.
            if candidate_target >= base_len:
                reason = "target_not_shorter_than_base"
            else:
                applied = True
                reason = "ok"
                target_chars = candidate_target

    return {
        "policy_name": "concise_soft_target_v1",
        "mode_key": mode_key,
        "base_len_chars": base_len,
        "concise_soft_target": {
            "applied": bool(applied),
            "reason": str(reason),
            "target_ratio": float(ratio),
            "min_base_len": int(min_base_len),
            "min_chars": int(min_chars),
            "target_chars": int(target_chars) if isinstance(target_chars, int) else None,
        },
    }


# ========= Statistical / lexical indicators =========
_FLAG_RE = re.compile(r"(?<!\w)--[A-Za-z0-9][A-Za-z0-9_-]*")
_SNAKE_RE = re.compile(r"\b[A-Za-z][A-Za-z0-9]*_[A-Za-z0-9_]+\b")
_CAMEL_RE = re.compile(r"\b[a-z]+[A-Z][A-Za-z0-9]*\b")
_FIELD_COLON_RE = re.compile(r"\b[A-Za-z][A-Za-z0-9_]{2,}\b(?=\s*[:=])")
_NUMBER_RE = re.compile(r"\b\d+(?:\.\d+)?\b")
_NUMBER_UNIT_RE = re.compile(r"\b\d+(?:\.\d+)?\s*(?:kb|mb|gb|tb|ms|s|sec|secs|seconds|mins|minutes|hrs|hours|days)\b", re.IGNORECASE)

# Conservative action verbs indicating possible semantic drift if newly introduced.
_HIGH_RISK_VERBS = [
    "create", "delete", "remove", "destroy",
    "upload", "download", "send", "email",
    "execute", "run", "invoke", "call",
    "write", "read", "save", "store",
    "update", "modify", "edit", "change",
    "retrieve", "fetch", "search", "browse",
    "access", "open", "close",
]
_VERB_RE = re.compile(r"\b(" + "|".join(re.escape(v) for v in _HIGH_RISK_VERBS) + r")\b", re.IGNORECASE)


def _sentence_count(text: str) -> int:
    t = (text or "").strip()
    if not t:
        return 0
    parts = [p for p in re.split(r"[.!?]+", t) if p.strip()]
    return len(parts)


def _word_count(text: str) -> int:
    t = (text or "").strip()
    if not t:
        return 0
    return len([w for w in re.split(r"\s+", t) if w])


def _extract_indicator_tokens(text: str) -> Dict[str, List[str]]:
    t = text or ""
    return {
        "flags": sorted(set(_FLAG_RE.findall(t))),
        "snake": sorted(set(_SNAKE_RE.findall(t))),
        "camel": sorted(set(_CAMEL_RE.findall(t))),
        "field_like": sorted(set(_FIELD_COLON_RE.findall(t))),
        "numbers": sorted(set(_NUMBER_RE.findall(t))),
        "number_units": sorted(set(m.group(0) for m in _NUMBER_UNIT_RE.finditer(t))),
        "verbs": sorted(set(m.group(0).lower() for m in _VERB_RE.finditer(t))),
    }


def _format_token_preview(tokens: List[str], *, max_items: int, max_len: int) -> str:
    if not tokens:
        return "-"
    out: List[str] = []
    for t in tokens[: max(0, int(max_items))]:
        s = str(t)
        if len(s) > int(max_len):
            s = s[: int(max_len) - 1] + "…"
        out.append(s)
    if len(tokens) > int(max_items):
        out.append(f"+{len(tokens) - int(max_items)}")
    return ", ".join(out) if out else "-"


def _diff_token_sets(base: Dict[str, List[str]], cand: Dict[str, List[str]], key: str) -> Tuple[List[str], List[str]]:
    b = set(base.get(key, []) or [])
    c = set(cand.get(key, []) or [])
    new_items = sorted(c - b)
    missing_items = sorted(b - c)
    return new_items, missing_items


def _similarity_ratio(a: str, b: str) -> float:
    aa = (a or "").strip()
    bb = (b or "").strip()
    if not aa and not bb:
        return 1.0
    if not aa or not bb:
        return 0.0
    return float(difflib.SequenceMatcher(None, aa, bb).ratio())


def compute_candidate_stats(
    *,
    base_text: str,
    cand_text: str,
    mode_key: str,
    length_policy: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    base = (base_text or "").strip()
    cand = (cand_text or "").strip()

    base_tokens = _extract_indicator_tokens(base)
    cand_tokens = _extract_indicator_tokens(cand)

    diffs: Dict[str, Any] = {}
    for k in ("flags", "snake", "camel", "field_like", "numbers", "number_units", "verbs"):
        new_items, missing_items = _diff_token_sets(base_tokens, cand_tokens, k)
        diffs[k] = {"new": new_items, "missing": missing_items}

    base_len = len(base)
    cand_len = len(cand)
    base_words = _word_count(base)
    cand_words = _word_count(cand)
    base_sent = _sentence_count(base)
    cand_sent = _sentence_count(cand)

    sim = _similarity_ratio(base, cand)

    len_ratio = (float(cand_len) / float(base_len)) if base_len > 0 else None
    len_delta = int(cand_len) - int(base_len)
    len_delta_ratio = (float(len_delta) / float(base_len)) if base_len > 0 else None

    new_critical = (
        len(diffs["flags"]["new"]) +
        len(diffs["field_like"]["new"]) +
        len(diffs["numbers"]["new"]) +
        len(diffs["number_units"]["new"]) +
        len(diffs["verbs"]["new"]) +
        len(diffs["snake"]["new"]) +
        len(diffs["camel"]["new"])
    )
    missing_signal = (
        len(diffs["flags"]["missing"]) +
        len(diffs["field_like"]["missing"]) +
        len(diffs["numbers"]["missing"]) +
        len(diffs["number_units"]["missing"]) +
        len(diffs["verbs"]["missing"]) +
        len(diffs["snake"]["missing"]) +
        len(diffs["camel"]["missing"])
    )

    risk_label = "LOW"
    risk_reasons: List[str] = []

    if new_critical > 0:
        risk_label = "HIGH"
        risk_reasons.append("new_indicator_tokens_detected")
    if risk_label != "HIGH" and missing_signal >= 4:
        risk_label = "HIGH"
        risk_reasons.append("many_indicator_tokens_missing")
    if risk_label == "LOW" and missing_signal > 0:
        risk_label = "MED"
        risk_reasons.append("some_indicator_tokens_missing")
    if risk_label == "LOW" and sim < 0.55:
        risk_label = "MED"
        risk_reasons.append("low_text_similarity")

    soft_flags: List[Dict[str, Any]] = []
    if mode_key == "style_concise":
        if base_len > 0 and cand_len > base_len:
            soft_flags.append({"type": "concise_length_exceeds_base", "base_len": base_len, "cand_len": cand_len})
        if cand_sent > 2:
            soft_flags.append({"type": "concise_sentence_count_exceeds_2", "sentence_count": cand_sent})
    if mode_key == "style_verbose":
        if cand_sent > 2:
            soft_flags.append({"type": "verbose_sentence_count_exceeds_2", "sentence_count": cand_sent})

    # Concise soft target evaluation (flag only; never a hard reject).
    concise_target_chars = None
    concise_target_applied = False
    concise_target_reason = None
    within_soft_target = None

    if isinstance(length_policy, dict):
        ct = (length_policy.get("concise_soft_target") or {})
        if isinstance(ct, dict):
            concise_target_applied = bool(ct.get("applied", False))
            concise_target_reason = ct.get("reason")
            concise_target_chars = ct.get("target_chars") if isinstance(ct.get("target_chars"), int) else None

    if mode_key == "style_concise" and concise_target_applied and isinstance(concise_target_chars, int) and base_len > 0:
        within_soft_target = bool(cand_len <= concise_target_chars)
        if not within_soft_target:
            soft_flags.append(
                {
                    "type": "concise_exceeds_soft_target",
                    "target_chars": int(concise_target_chars),
                    "cand_len": int(cand_len),
                    "len_ratio": float(len_ratio) if isinstance(len_ratio, float) else None,
                }
            )

    return {
        "base_len_chars": base_len,
        "cand_len_chars": cand_len,
        "len_ratio": len_ratio,
        "len_delta_chars": len_delta,
        "len_delta_ratio": len_delta_ratio,
        "base_words": base_words,
        "cand_words": cand_words,
        "base_sentences": base_sent,
        "cand_sentences": cand_sent,
        "similarity_ratio": sim,
        "diffs": diffs,
        "risk_label": risk_label,
        "risk_reasons": risk_reasons,
        "soft_flags": soft_flags,
        "base_tokens": base_tokens,
        "cand_tokens": cand_tokens,
        "length_policy": length_policy,
        "concise_soft_target_applied": concise_target_applied,
        "concise_soft_target_reason": concise_target_reason,
        "concise_soft_target_chars": concise_target_chars,
        "within_soft_target": within_soft_target,
    }


def _print_base_stats(base_desc: str, *, max_preview: int, max_tok_len: int) -> None:
    base = (base_desc or "").strip()
    tokens = _extract_indicator_tokens(base)
    print("\nStatistics (base):")
    print(f"  chars={len(base)}; words={_word_count(base)}; sentences={_sentence_count(base)}")
    print(
        "  tokens:"
        f" flags={len(tokens['flags'])}, field_like={len(tokens['field_like'])}, "
        f"numbers={len(tokens['numbers'])}, number_units={len(tokens['number_units'])}, "
        f"verbs={len(tokens['verbs'])}, snake={len(tokens['snake'])}, camel={len(tokens['camel'])}"
    )
    print(
        "  previews:"
        f" flags=[{_format_token_preview(tokens['flags'], max_items=max_preview, max_len=max_tok_len)}];"
        f" field_like=[{_format_token_preview(tokens['field_like'], max_items=max_preview, max_len=max_tok_len)}];"
        f" numbers=[{_format_token_preview(tokens['numbers'], max_items=max_preview, max_len=max_tok_len)}];"
        f" verbs=[{_format_token_preview(tokens['verbs'], max_items=max_preview, max_len=max_tok_len)}]"
    )


def _print_candidate_summary_line(
    i: int,
    cand: Dict[str, Any],
    *,
    max_preview: int,
    max_tok_len: int,
    snippet_chars: int,
) -> None:
    txt = (cand.get("text") or "").strip()
    err = cand.get("error")
    dup = bool(cand.get("duplicate", False))
    stats = cand.get("stats") or {}

    status = "ok"
    if err:
        status = f"error:{str(err)[:60]}"
    elif not txt:
        status = "empty"
    elif dup:
        status = "duplicate"

    risk = stats.get("risk_label") or "-"
    sim = stats.get("similarity_ratio")
    sim_s = f"{float(sim):.2f}" if isinstance(sim, (int, float)) else "-"

    diffs = (stats.get("diffs") or {})
    new_flags = len(((diffs.get("flags") or {}).get("new") or []))
    new_nums = len(((diffs.get("numbers") or {}).get("new") or [])) + len(((diffs.get("number_units") or {}).get("new") or []))
    new_verbs = len(((diffs.get("verbs") or {}).get("new") or []))
    new_fields = len(((diffs.get("field_like") or {}).get("new") or []))
    missing_total = (
        len(((diffs.get("flags") or {}).get("missing") or [])) +
        len(((diffs.get("numbers") or {}).get("missing") or [])) +
        len(((diffs.get("number_units") or {}).get("missing") or [])) +
        len(((diffs.get("verbs") or {}).get("missing") or [])) +
        len(((diffs.get("field_like") or {}).get("missing") or [])) +
        len(((diffs.get("snake") or {}).get("missing") or [])) +
        len(((diffs.get("camel") or {}).get("missing") or []))
    )

    clen = stats.get("cand_len_chars")
    cwords = stats.get("cand_words")
    csent = stats.get("cand_sentences")

    clen_s = str(clen) if isinstance(clen, int) else "-"
    cwords_s = str(cwords) if isinstance(cwords, int) else "-"
    csent_s = str(csent) if isinstance(csent, int) else "-"

    # Length diagnostics
    lr = stats.get("len_ratio")
    lr_s = f"{float(lr):.2f}" if isinstance(lr, (int, float)) else "-"
    ld = stats.get("len_delta_chars")
    ld_s = f"{int(ld):+d}" if isinstance(ld, int) else "-"

    # Concise target diagnostics (if applicable)
    t_applied = bool(stats.get("concise_soft_target_applied", False))
    t_chars = stats.get("concise_soft_target_chars")
    within = stats.get("within_soft_target")
    target_s = "-"
    if t_applied and isinstance(t_chars, int):
        if within is True:
            target_s = f"target<={t_chars} ok"
        elif within is False:
            target_s = f"target<={t_chars} NO"
        else:
            target_s = f"target<={t_chars}"

    print(
        f"  [{i}] status={status}; risk={risk}; sim={sim_s}; "
        f"cand(chars={clen_s}, words={cwords_s}, sent={csent_s}); "
        f"len_ratio={lr_s}; Δchars={ld_s}; {target_s}; "
        f"new(flags={new_flags}, fields={new_fields}, nums={new_nums}, verbs={new_verbs}); missing_total={missing_total}"
    )

    # Compact preview of the most actionable deltas.
    if isinstance(diffs, dict) and txt and not err:
        nf = (diffs.get("flags") or {}).get("new") or []
        nfv = (diffs.get("field_like") or {}).get("new") or []
        nn = (diffs.get("numbers") or {}).get("new") or []
        nv = (diffs.get("verbs") or {}).get("new") or []
        if nf or nfv or nn or nv:
            print(
                "      new-previews:"
                f" flags=[{_format_token_preview(list(nf), max_items=max_preview, max_len=max_tok_len)}];"
                f" fields=[{_format_token_preview(list(nfv), max_items=max_preview, max_len=max_tok_len)}];"
                f" numbers=[{_format_token_preview(list(nn), max_items=max_preview, max_len=max_tok_len)}];"
                f" verbs=[{_format_token_preview(list(nv), max_items=max_preview, max_len=max_tok_len)}]"
            )

    # Show a compact snippet of the candidate text to enable selection without extra commands.
    if txt and not err and int(snippet_chars) > 0:
        sn = " ".join(txt.split())
        max_sn = int(snippet_chars)
        if len(sn) > max_sn:
            sn = sn[: max_sn - 1] + "…"
        print(f"      text: {sn}")


def _print_candidate_full(
    i: int,
    cand: Dict[str, Any],
    *,
    max_preview: int,
    max_tok_len: int,
) -> None:
    txt = (cand.get("text") or "").strip()
    err = cand.get("error")
    stats = cand.get("stats") or {}
    diffs = (stats.get("diffs") or {})

    print(f"\nCandidate [{i}]:")
    if err:
        print(f"  Generation error: {err}")
        if txt:
            print("  Partial text:")
            print(txt)
        return
    if not txt:
        print("  (empty)")
        return

    print(txt)

    risk = stats.get("risk_label") or "-"
    reasons = stats.get("risk_reasons") or []
    soft = stats.get("soft_flags") or []

    print("\n  Candidate statistics:")
    sim = stats.get("similarity_ratio")
    sim_s = f"{float(sim):.2f}" if isinstance(sim, (int, float)) else "-"
    lr = stats.get("len_ratio")
    lr_s = f"{float(lr):.2f}" if isinstance(lr, (int, float)) else "-"
    ld = stats.get("len_delta_chars")
    ld_s = f"{int(ld):+d}" if isinstance(ld, int) else "-"
    print(f"    risk={risk}; reasons={reasons if reasons else '[]'}; similarity={sim_s}; len_ratio={lr_s}; Δchars={ld_s}")

    if soft:
        print(f"    soft_flags={soft}")

    for key in ("flags", "field_like", "numbers", "number_units", "verbs", "snake", "camel"):
        d = diffs.get(key) or {}
        new_items = d.get("new") or []
        missing_items = d.get("missing") or []
        if not new_items and not missing_items:
            continue
        print(
            f"    {key}:"
            f" new({len(new_items)})=[{_format_token_preview(list(new_items), max_items=max_preview, max_len=max_tok_len)}];"
            f" missing({len(missing_items)})=[{_format_token_preview(list(missing_items), max_items=max_preview, max_len=max_tok_len)}]"
        )


# ========= Raw JSON-string patcher (for tools stored as JSON strings) =========
def _extract_json_string_value(raw_json: str, key: str) -> Optional[str]:
    token = f'"{key}"'
    i = raw_json.find(token)
    if i < 0:
        return None
    i = raw_json.find(":", i + len(token))
    if i < 0:
        return None
    i += 1
    n = len(raw_json)
    while i < n and raw_json[i] in " \t\r\n":
        i += 1
    if i >= n or raw_json[i] != '"':
        return None
    start = i
    i += 1
    esc = False
    while i < n:
        c = raw_json[i]
        if esc:
            esc = False
        else:
            if c == "\\":
                esc = True
            elif c == '"':
                return raw_json[start : i + 1]
        i += 1
    return None


def _decode_raw_json_string(raw_json_string_with_quotes: str) -> str:
    try:
        obj = json.loads('{"description":' + raw_json_string_with_quotes + "}")
        return obj.get("description") or ""
    except json.JSONDecodeError:
        return ""


def _get_description_for_print(entry: Any) -> Tuple[str, str]:
    if isinstance(entry, str):
        raw = _extract_json_string_value(entry, "description")
        if raw is not None:
            return raw, "raw_json"
        try:
            obj = json.loads(entry)
            return obj.get("description") or "", "rendered"
        except json.JSONDecodeError:
            return "", "rendered"
    if isinstance(entry, dict):
        return entry.get("description") or "", "rendered"
    return "", "rendered"


def _load_tool(entry: Any) -> Tuple[Optional[Dict[str, Any]], str]:
    if isinstance(entry, str):
        try:
            return json.loads(entry), "json_str"
        except json.JSONDecodeError:
            return None, "other"
    if isinstance(entry, dict):
        return entry, "dict"
    return None, "other"


def _skip_ws(s: str, i: int) -> int:
    n = len(s)
    while i < n and s[i] in " \t\r\n":
        i += 1
    return i


def _scan_string_span(s: str, i: int) -> Optional[Tuple[int, int]]:
    n = len(s)
    if i >= n or s[i] != '"':
        return None
    j = i + 1
    esc = False
    while j < n:
        c = s[j]
        if esc:
            esc = False
        else:
            if c == "\\":
                esc = True
            elif c == '"':
                return (i, j + 1)
        j += 1
    return None


def _scan_number_span(s: str, i: int) -> Optional[Tuple[int, int]]:
    n = len(s)
    j = i
    if j < n and s[j] == "-":
        j += 1
    if j >= n:
        return None
    if s[j] == "0":
        j += 1
    elif s[j].isdigit():
        while j < n and s[j].isdigit():
            j += 1
    else:
        return None
    if j < n and s[j] == ".":
        j += 1
        if j >= n or not s[j].isdigit():
            return None
        while j < n and s[j].isdigit():
            j += 1
    if j < n and s[j] in "eE":
        j += 1
        if j < n and s[j] in "+-":
            j += 1
        if j >= n or not s[j].isdigit():
            return None
        while j < n and s[j].isdigit():
            j += 1
    return (i, j)


def _scan_literal_span(s: str, i: int) -> Optional[Tuple[int, int]]:
    for lit in ("true", "false", "null"):
        if s.startswith(lit, i):
            return (i, i + len(lit))
    return None


def _scan_container_span(s: str, i: int) -> Optional[Tuple[int, int]]:
    n = len(s)
    if i >= n:
        return None

    opener = s[i]
    if opener not in "{[":
        return None

    stack: List[str] = ["}" if opener == "{" else "]"]
    j = i + 1
    in_str = False
    esc = False

    while j < n:
        c = s[j]

        if in_str:
            if esc:
                esc = False
            else:
                if c == "\\":
                    esc = True
                elif c == '"':
                    in_str = False
            j += 1
            continue

        if c == '"':
            in_str = True
            j += 1
            continue

        if c == "{":
            stack.append("}")
            j += 1
            continue
        if c == "[":
            stack.append("]")
            j += 1
            continue

        if c in "}]":
            if not stack:
                return None
            expected = stack[-1]
            if c != expected:
                return None
            stack.pop()
            j += 1
            if not stack:
                return (i, j)
            continue

        j += 1

    return None


def _is_value_delim(c: str) -> bool:
    return c in ",}]"


def _scan_value_span(s: str, i: int) -> Optional[Tuple[int, int]]:
    n = len(s)
    i = _skip_ws(s, i)
    if i >= n:
        return None

    c = s[i]
    if c == '"':
        return _scan_string_span(s, i)
    if c in "{[":
        return _scan_container_span(s, i)

    span: Optional[Tuple[int, int]]
    if c == "-" or c.isdigit():
        span = _scan_number_span(s, i)
    else:
        span = _scan_literal_span(s, i)

    if not span:
        return None

    _, end = span
    k = _skip_ws(s, end)
    if k >= n:
        return span
    if _is_value_delim(s[k]):
        return span
    return None


def _replace_top_level_string_field_in_raw_object(raw_json_obj: str, key: str, new_value: str) -> Tuple[str, bool, str]:
    s = raw_json_obj
    n = len(s)

    i = _skip_ws(s, 0)
    if i >= n or s[i] != "{":
        return raw_json_obj, False, "not_object"

    i += 1
    found_any_key = False
    expect_key = True

    while True:
        i = _skip_ws(s, i)
        if i >= n:
            return raw_json_obj, False, "cannot_scan_value"

        if expect_key:
            if s[i] == "}":
                return raw_json_obj, False, "key_not_found"
            if s[i] != '"':
                return raw_json_obj, False, "invalid_key_string"

            key_span = _scan_string_span(s, i)
            if not key_span:
                return raw_json_obj, False, "invalid_key_string"

            found_any_key = True
            k_start, k_end = key_span
            try:
                key_decoded = json.loads(s[k_start:k_end])
            except Exception:
                return raw_json_obj, False, "invalid_key_string"

            i = _skip_ws(s, k_end)
            if i >= n or s[i] != ":":
                return raw_json_obj, False, "missing_colon"

            v_span = _scan_value_span(s, i + 1)
            if not v_span:
                return raw_json_obj, False, "cannot_scan_value"

            v_start, v_end = v_span

            if key_decoded == key:
                if v_start >= n or s[v_start] != '"':
                    return raw_json_obj, False, "value_not_string"

                replacement_literal = json.dumps(new_value, ensure_ascii=False)
                patched = s[:v_start] + replacement_literal + s[v_end:]

                try:
                    obj = json.loads(patched)
                except Exception:
                    return raw_json_obj, False, "json_load_failed_after_patch"

                if isinstance(obj, dict) and obj.get(key) == new_value:
                    return patched, True, "ok"
                return raw_json_obj, False, "validation_failed_after_patch"

            i = v_end
            expect_key = False
            continue

        i = _skip_ws(s, i)
        if i >= n:
            return raw_json_obj, False, "cannot_scan_value"

        if s[i] == ",":
            i += 1
            expect_key = True
            continue
        if s[i] == "}":
            return raw_json_obj, False, ("key_not_found" if found_any_key else "key_not_found")
        return raw_json_obj, False, "cannot_scan_value"


# ========= IDs =========
def _tool_fingerprint_excluding_description(tool_obj: Dict[str, Any]) -> str:
    filtered = {k: v for k, v in tool_obj.items() if k != "description"}
    payload = _canonical_json(filtered)
    return hashlib.sha256(payload.encode("utf-8")).hexdigest()[:HASH_HEX_LEN]


def _record_id(record_obj: Dict[str, Any], tool_field: str) -> str:
    rec = dict(record_obj)
    tools = rec.get(tool_field)
    if isinstance(tools, list):
        canon_tools: List[Any] = []
        for entry in tools:
            tool_obj, kind = _load_tool(entry)
            if tool_obj is None:
                canon_tools.append({"_unparsed": entry, "_kind": kind})
            else:
                canon_tools.append({k: v for k, v in tool_obj.items() if k != "description"})
        rec[tool_field] = canon_tools
    payload = _canonical_json(rec)
    return hashlib.sha256(payload.encode("utf-8")).hexdigest()[:HASH_HEX_LEN]


def _tool_instance_key(record_id: str, tool_index: int, tool_obj: Dict[str, Any]) -> str:
    fp = _tool_fingerprint_excluding_description(tool_obj)
    return f"rec:{record_id}:t{tool_index}:{fp}"


# ========= Audit (single file, resumable) =========
def _audit_identity(dataset_path: Path, *, mode_key: str, model: str, tool_field: str, num_candidates: int) -> str:
    stable = f"{dataset_path.resolve()}|{mode_key}|{model}|{tool_field}|K={int(num_candidates)}"
    return hashlib.sha256(stable.encode("utf-8")).hexdigest()[:12]


def _audit_file_path(
    dataset_path: Path,
    *,
    audit_dir: Path,
    mode_key: str,
    model: str,
    tool_field: str,
    num_candidates: int,
) -> Path:
    audit_key = _audit_identity(dataset_path, mode_key=mode_key, model=model, tool_field=tool_field, num_candidates=num_candidates)
    safe_model = "".join(ch if ch.isalnum() or ch in ("-", "_", ".") else "_" for ch in model)
    out_dir = audit_dir / audit_key
    filename = f"{dataset_path.stem}.{audit_key}.{mode_key}.{safe_model}.K{int(num_candidates)}.audit.jsonl"
    return out_dir / filename


def _append_audit_event(audit_file: Path, event: Dict[str, Any]) -> None:
    audit_file.parent.mkdir(parents=True, exist_ok=True)
    safe_event = _json_safe(event)
    with audit_file.open("a", encoding="utf-8") as f:
        f.write(json.dumps(safe_event, ensure_ascii=False) + "\n")


def _load_resume_state(
    audit_file: Path,
) -> Tuple[
    Dict[str, Tuple[str, Optional[str], Optional[Dict[str, Any]]]],
    Dict[str, int],
    Dict[str, Optional[str]],
    Optional[Dict[str, Any]],
]:
    decisions: Dict[str, Tuple[str, Optional[str], Optional[Dict[str, Any]]]] = {}
    regen_counts: Dict[str, int] = {}
    last_rejected_text: Dict[str, Optional[str]] = {}
    prior_run_start: Optional[Dict[str, Any]] = None

    if not audit_file.exists():
        return decisions, regen_counts, last_rejected_text, None

    best_round: Dict[str, int] = {}

    with audit_file.open("r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                ev = json.loads(line)
            except Exception:
                continue
            if not isinstance(ev, dict):
                continue

            et = ev.get("event_type")
            if et == "run_start" and prior_run_start is None:
                prior_run_start = ev

            if et == "regenerate":
                ik = ev.get("instance_key")
                rr = ev.get("generation_round")
                txt = ev.get("last_generated_text")
                if isinstance(ik, str) and isinstance(rr, int) and rr >= 0:
                    prev = regen_counts.get(ik, 0)
                    if rr > prev:
                        regen_counts[ik] = rr
                    prev_best = best_round.get(ik, -1)
                    if rr >= prev_best:
                        best_round[ik] = rr
                        last_rejected_text[ik] = txt if isinstance(txt, str) else None

            if et == "decision":
                ik = ev.get("instance_key")
                status = ev.get("status")
                final_desc = ev.get("final_description")
                llm_bundle = ev.get("llm_bundle")
                if isinstance(ik, str) and isinstance(status, str):
                    decisions[ik] = (
                        status,
                        final_desc if isinstance(final_desc, str) else None,
                        llm_bundle if isinstance(llm_bundle, dict) else None,
                    )

    return decisions, regen_counts, last_rejected_text, prior_run_start


# ========= LLM helpers =========
def _sanitize_llm_output(text: str) -> str:
    t = (text or "").strip()
    if t.startswith("{") and "description" in t:
        try:
            obj = json.loads(t)
            if isinstance(obj, dict) and isinstance(obj.get("description"), str):
                t = obj["description"].strip()
        except Exception:
            pass
    if (t.startswith('"') and t.endswith('"')) or (t.startswith("'") and t.endswith("'")):
        t = t[1:-1].strip()
    return t


def _llm_chat_completion(
    *,
    client: OpenAI,
    model: str,
    messages: List[Dict[str, str]],
    temperature: float,
    max_tokens: int,
    seed: Optional[int],
) -> Tuple[str, Dict[str, Any]]:
    meta: Dict[str, Any] = {
        "seed_requested": seed,
        "seed_applied": False,
        "seed_error": None,
        "finish_reason": None,
        "usage": None,
        "max_tokens_requested": int(max_tokens),
        "max_param_used": None,
    }

    base_kwargs: Dict[str, Any] = dict(model=model, messages=messages, temperature=temperature)

    def attempt(max_param_used: str, include_seed: bool) -> Tuple[str, Dict[str, Any]]:
        req = dict(base_kwargs)
        if max_param_used == "max_completion_tokens":
            req["max_completion_tokens"] = int(max_tokens)
        else:
            req["max_tokens"] = int(max_tokens)
        if include_seed and seed is not None:
            req["seed"] = int(seed)

        resp = client.chat.completions.create(**req)
        text = (resp.choices[0].message.content or "").strip()

        meta_local = dict(meta)
        meta_local["max_param_used"] = max_param_used
        meta_local["finish_reason"] = getattr(resp.choices[0], "finish_reason", None)
        meta_local["usage"] = getattr(resp, "usage", None)
        meta_local["seed_applied"] = bool(include_seed and seed is not None)
        return text, meta_local

    def is_seed_error(e: Exception) -> bool:
        s = str(e).lower()
        return ("seed" in s) and ("unknown" in s or "unsupported" in s or "invalid" in s)

    try:
        return attempt("max_completion_tokens", include_seed=True)
    except Exception as e1:
        if seed is not None and is_seed_error(e1):
            meta["seed_error"] = str(e1)
            try:
                return attempt("max_completion_tokens", include_seed=False)
            except Exception:
                pass
        try:
            return attempt("max_tokens", include_seed=True)
        except Exception as e2:
            if seed is not None and is_seed_error(e2):
                meta["seed_error"] = str(e2)
                return attempt("max_tokens", include_seed=False)
            raise


def generate_description_via_llm(
    *,
    client: OpenAI,
    tool_name: str,
    base_description: str,
    model: str,
    seed: Optional[int],
    max_tokens: int,
    retry_on_length: bool,
    retry_max_tokens: int,
    mode_key: str,
    style_spec: Dict[str, Any],
    regen_index: int = 0,
    previous_rewrite: Optional[str] = None,
    length_policy: Optional[Dict[str, Any]] = None,
) -> Tuple[str, Dict[str, Any]]:
    system = str(style_spec["system"])
    regen_instr = str(style_spec.get("regen_diversity_instruction") or "")
    max_prev = int(style_spec.get("max_prev_rewrite_chars") or DEFAULT_MAX_PREV_REWRITE_CHARS)

    user_parts: List[str] = []
    user_parts.append(f"Tool name: {tool_name}")
    user_parts.append("Base description:")
    user_parts.append(base_description.strip() or "(empty)")
    user_parts.append("")
    user_parts.append(f"Rewrite in '{mode_key}' under the constraints.")

    # Soft length guidance (concise only): target is optional and non-binding.
    if mode_key == "style_concise" and isinstance(length_policy, dict):
        ct = length_policy.get("concise_soft_target") if isinstance(length_policy.get("concise_soft_target"), dict) else {}
        applied = bool(ct.get("applied", False))
        target_chars = ct.get("target_chars") if isinstance(ct.get("target_chars"), int) else None
        ratio = ct.get("target_ratio")
        if applied and isinstance(target_chars, int):
            pct = int(float(ratio) * 100) if isinstance(ratio, (int, float)) else 70
            user_parts.append("")
            user_parts.append(f"Length guidance (soft target): aim for <= {target_chars} characters (~{pct}% of base).")
            user_parts.append(
                "You may exceed the target if strictly necessary to preserve meaning; "
                "do not omit any explicitly stated details."
            )
        else:
            user_parts.append("")
            user_parts.append(
                "Length guidance: the base description is short or cannot be shortened safely; "
                "do not exceed the base length; keep it as brief as possible."
            )

    if regen_index > 0:
        user_parts.append("")
        user_parts.append(f"Regeneration request: {regen_index}")
        if regen_instr:
            user_parts.append(regen_instr)
        if previous_rewrite and previous_rewrite.strip():
            prev = previous_rewrite.strip()
            if len(prev) > max_prev:
                prev = prev[:max_prev].rstrip()
            user_parts.append("")
            user_parts.append("Previous rewrite (do not reuse wording):")
            user_parts.append(prev)

    user = "\n".join(user_parts)

    raw1, meta1 = _llm_chat_completion(
        client=client,
        model=model,
        messages=[{"role": "system", "content": system}, {"role": "user", "content": user}],
        temperature=0.0,
        max_tokens=max_tokens,
        seed=seed,
    )
    san1 = _sanitize_llm_output(raw1)
    finish1 = (meta1.get("finish_reason") or "").lower()
    looks_truncated_1 = (finish1 == "length")

    if not looks_truncated_1:
        return san1, {
            "proposal_origin": "primary",
            "proposal_sanitized_final": san1,
            "llm_text_raw_primary": raw1,
            "llm_text_raw_retry": None,
            "primary": meta1,
            "retry": None,
            "mode_key": mode_key,
            "length_policy": length_policy,
        }

    raw2 = None
    meta2 = None
    san2 = None
    best_san = san1
    origin = "primary"

    if retry_on_length and retry_max_tokens > max_tokens:
        raw2, meta2 = _llm_chat_completion(
            client=client,
            model=model,
            messages=[{"role": "system", "content": system}, {"role": "user", "content": user}],
            temperature=0.0,
            max_tokens=int(retry_max_tokens),
            seed=seed,
        )
        san2 = _sanitize_llm_output(raw2)
        if san2 and len(san2) >= len(best_san):
            best_san = san2
            origin = "retry"

    return best_san, {
        "proposal_origin": origin,
        "proposal_sanitized_final": best_san,
        "llm_text_raw_primary": raw1,
        "llm_text_raw_retry": raw2,
        "primary": meta1,
        "retry": meta2,
        "mode_key": mode_key,
        "length_policy": length_policy,
    }


def generate_k_candidates_with_stats(
    *,
    client: OpenAI,
    tool_name: str,
    base_description: str,
    model: str,
    seed: Optional[int],
    max_tokens: int,
    retry_on_length: bool,
    retry_max_tokens: int,
    mode_key: str,
    style_spec: Dict[str, Any],
    generation_round: int,
    k: int,
    previous_rewrite_hint: Optional[str],
    min_sleep_sec_between_calls: float,
    length_policy: Optional[Dict[str, Any]],
) -> Tuple[List[Dict[str, Any]], Optional[str]]:
    base_desc = (base_description or "").strip()
    k_eff = max(1, int(k))

    candidates: List[Dict[str, Any]] = []
    seen: set = set()

    prev = previous_rewrite_hint.strip() if isinstance(previous_rewrite_hint, str) and previous_rewrite_hint.strip() else None
    last_generated_text: Optional[str] = None

    for i in range(0, k_eff):
        regen_index = int(generation_round) * 1000 + i  # stable, monotonically increasing per round
        text = ""
        bundle: Optional[Dict[str, Any]] = None
        err: Optional[str] = None

        try:
            text, bundle = generate_description_via_llm(
                client=client,
                tool_name=tool_name,
                base_description=base_desc,
                model=model,
                seed=seed,
                max_tokens=max_tokens,
                retry_on_length=retry_on_length,
                retry_max_tokens=retry_max_tokens,
                mode_key=mode_key,
                style_spec=style_spec,
                regen_index=regen_index,
                previous_rewrite=prev,
                length_policy=length_policy,
            )
            text = (text or "").strip()
            last_generated_text = text if text else last_generated_text
        except Exception as e:
            err = str(e)
            text = ""

        duplicate = False
        if text:
            if text in seen:
                duplicate = True
            else:
                seen.add(text)

        stats = compute_candidate_stats(
            base_text=base_desc,
            cand_text=text,
            mode_key=mode_key,
            length_policy=length_policy,
        ) if text else None

        candidates.append(
            {
                "candidate_index": i + 1,  # 1-based for interactive selection
                "text": text,
                "error": err,
                "bundle": bundle,
                "duplicate": duplicate,
                "stats": stats,
            }
        )

        prev = text if text else prev

        if min_sleep_sec_between_calls > 0:
            time.sleep(float(min_sleep_sec_between_calls))

    return candidates, last_generated_text


# ========= IO =========
def make_working_copy(input_jsonl: str, output_jsonl: str, *, overwrite: bool = False) -> str:
    src = Path(input_jsonl)
    dst = Path(output_jsonl)

    if not src.exists():
        raise FileNotFoundError(f"File not found: {src}")

    if dst.exists() and not overwrite:
        return str(dst)

    dst.parent.mkdir(parents=True, exist_ok=True)
    shutil.copy2(src, dst)
    return str(dst)


def _normalize_cmd(raw: str) -> str:
    c = (raw or "").strip().lower()
    if c in ("", "y", "yes", "ok", "okay", "si", "sì"):
        return "y"
    if c in ("r", "retry", "again", "prova", "prova ancora", "rigenera"):
        return "r"
    if c in ("e", "edit", "modifica"):
        return "e"
    if c in ("m", "manual", "mine", "mio", "mia", "custom"):
        return "m"
    if c in ("s", "skip", "salta", "pass"):
        return "s"
    if c in ("q", "quit", "exit", "esci"):
        return "q"
    return c


def _parse_candidate_choice(cmd: str, *, k: int) -> Optional[int]:
    c = (cmd or "").strip()
    if not c:
        return None
    if c.isdigit():
        v = int(c)
        if 1 <= v <= int(k):
            return v
    return None


# ========= Main interactive =========
def interactive_llm_tools_in_jsonl(
    jsonl_path: str,
    *,
    tool_field: str,
    create_backup_of_target: bool,
    llm_model: str,
    seed: Optional[int],
    max_tokens: int,
    retry_on_length: bool,
    retry_max_tokens: int,
    allow_reserialize_fallback: bool,
    min_sleep_sec_between_calls: float,
    audit_dir: str,
    mode_key: str,
    num_candidates: int,
    max_token_preview: int,
    max_token_string_len: int,
    candidate_snippet_chars: int,
    concise_target_ratio: float,
    concise_target_min_base_len: int,
    concise_target_min_chars: int,
) -> None:
    mode_key, style_spec = _resolve_style(mode_key)

    path = Path(jsonl_path)
    if not path.exists():
        raise FileNotFoundError(f"File not found: {jsonl_path}")

    client = make_gemini_client()
    audit_file = _audit_file_path(
        path,
        audit_dir=Path(audit_dir),
        mode_key=mode_key,
        model=llm_model,
        tool_field=tool_field,
        num_candidates=int(num_candidates),
    )

    decisions_by_instance, regen_counts, last_rejected_text_by_instance, prior_run_start = _load_resume_state(audit_file)

    tool_order: List[Dict[str, Any]] = []
    with path.open("r", encoding="utf-8") as f:
        for raw_line in f:
            line = raw_line.rstrip("\n")
            if not line.strip():
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                continue
            if not isinstance(record, dict):
                continue

            rid = _record_id(record, tool_field=tool_field)
            tools = record.get(tool_field)
            if not isinstance(tools, list):
                continue

            for tool_index, entry in enumerate(tools):
                tool_obj, kind = _load_tool(entry)
                if not tool_obj:
                    continue
                name = (tool_obj.get("name") or "").strip()
                if not name:
                    continue

                desc_print, desc_mode = _get_description_for_print(entry)
                instance_key = _tool_instance_key(rid, tool_index, tool_obj)

                tool_order.append(
                    {
                        "record_id": rid,
                        "tool_index": tool_index,
                        "tool_name": name,
                        "desc_print": desc_print,
                        "desc_mode": desc_mode,
                        "instance_key": instance_key,
                        "entry_kind": kind,
                    }
                )

    n_total = len(tool_order)
    n_prev_reviewed = len(decisions_by_instance)

    start_pos = 0
    while start_pos < n_total and tool_order[start_pos]["instance_key"] in decisions_by_instance:
        start_pos += 1

    session_id = hashlib.sha256(f"{time.time_ns()}".encode("utf-8")).hexdigest()[:12]
    before_sha = _sha256_file(path)

    length_policy_config = {
        "concise_soft_target": {
            "ratio": float(concise_target_ratio),
            "min_base_len": int(concise_target_min_base_len),
            "min_chars": int(concise_target_min_chars),
            "policy_name": "concise_soft_target_v1",
        }
    }

    if prior_run_start is None:
        _append_audit_event(
            audit_file,
            {
                "event_type": "run_start",
                "ts": int(time.time()),
                "session_id": session_id,
                "mode": mode_key,
                "model": llm_model,
                "seed": seed,
                "dataset_path": str(path),
                "dataset_sha256_at_session_start": before_sha,
                "tool_field": tool_field,
                "n_total_occurrences": n_total,
                "max_tokens_requested": int(max_tokens),
                "retry_on_length": bool(retry_on_length),
                "retry_max_tokens": int(retry_max_tokens),
                "allow_reserialize_fallback": bool(allow_reserialize_fallback),
                "num_candidates": int(num_candidates),
                "min_sleep_sec_between_calls": float(min_sleep_sec_between_calls),
                "stats_max_token_preview": int(max_token_preview),
                "stats_max_token_string_len": int(max_token_string_len),
                "candidate_snippet_chars": int(candidate_snippet_chars),
                "length_policy_config": length_policy_config,
            },
        )
    else:
        _append_audit_event(
            audit_file,
            {
                "event_type": "run_resume",
                "ts": int(time.time()),
                "session_id": session_id,
                "mode": mode_key,
                "model": llm_model,
                "seed": seed,
                "dataset_path": str(path),
                "dataset_sha256_at_session_start": before_sha,
                "tool_field": tool_field,
                "n_total_occurrences": n_total,
                "n_previously_reviewed": n_prev_reviewed,
                "resume_from_index_1based": (start_pos + 1) if start_pos < n_total else (n_total + 1),
                "num_candidates": int(num_candidates),
                "candidate_snippet_chars": int(candidate_snippet_chars),
                "length_policy_config": length_policy_config,
            },
        )

    print(f"Target: {path}")
    print(f"Mode: {mode_key}")
    print(f"Audit file (RESUMABLE): {audit_file}")
    print(f"Tool occurrences total: {n_total}")
    if start_pos < n_total:
        print(f"Resume position: [{start_pos + 1}/{n_total}] (previously reviewed: {n_prev_reviewed})")
    else:
        print(f"Resume position: completed (previously reviewed: {n_prev_reviewed})")
    print(f"LLM: {llm_model} @ {GEMINI_BASE_URL}")
    print(f"Candidates per tool: {int(num_candidates)}")
    print(f"Candidate snippet chars: {int(candidate_snippet_chars)}")
    if mode_key == "style_concise":
        print(
            "Concise soft target: "
            f"ratio={float(concise_target_ratio):.2f}, "
            f"min_base_len={int(concise_target_min_base_len)}, "
            f"min_chars={int(concise_target_min_chars)}"
        )
    print(f"Max tokens: {int(max_tokens)}; retry_on_length={bool(retry_on_length)}; retry_max_tokens={int(retry_max_tokens)}")
    print(
        "Commands: ENTER/ok=accept #1, 1..K=accept candidate, r=regenerate K, "
        "e=edit candidate, m=manual, s=skip, q=quit, p<idx>=preview (e.g., p2)\n"
    )

    quit_requested = False
    resume_next_index_1based: Optional[int] = None

    max_prev = int(style_spec.get("max_prev_rewrite_chars") or DEFAULT_MAX_PREV_REWRITE_CHARS)

    # Session-level statistics (best-effort; heuristic).
    session_summary: Dict[str, Any] = {
        "accepted": 0,
        "edited": 0,
        "manual": 0,
        "skipped": 0,
        "accepted_risk_labels": {"LOW": 0, "MED": 0, "HIGH": 0, "NA": 0},
        "accepted_similarity_sum": 0.0,
        "accepted_similarity_n": 0,
        # Length reporting
        "accepted_base_chars_sum": 0,
        "accepted_cand_chars_sum": 0,
        "accepted_len_ratio_sum": 0.0,
        "accepted_len_ratio_n": 0,
        "accepted_len_delta_sum": 0,
        "accepted_soft_target_applicable_n": 0,
        "accepted_within_soft_target_n": 0,
    }

    for pos in range(start_pos, n_total):
        item = tool_order[pos]
        idx = pos + 1

        name = item["tool_name"]
        desc_mode = item["desc_mode"]
        old_desc_print = item["desc_print"]
        instance_key = item["instance_key"]
        rid = item["record_id"]
        tool_i = item["tool_index"]

        generation_round = int(regen_counts.get(instance_key, 0))
        previous_rewrite_hint: Optional[str] = last_rejected_text_by_instance.get(instance_key)

        print("=" * 80)
        print(f"[{idx}/{n_total}] {name}")
        print(f"instance_key: {instance_key} (record_id={rid}, tool_index={tool_i})")

        if desc_mode == "raw_json":
            print("Current description RAW (escaped):")
            print(old_desc_print if old_desc_print else "(empty)")
            base_desc = _decode_raw_json_string(old_desc_print) if old_desc_print else ""
            print("\nCurrent description DECODED:")
            print(base_desc if base_desc else "(empty)")
        else:
            base_desc = old_desc_print or ""
            print("Current description:")
            print(base_desc if base_desc else "(empty)")

        base_desc = (base_desc or "").strip()
        base_len_chars = len(base_desc)

        # Build policy once per instance (stable for candidate set).
        length_policy = _make_length_policy(
            base_desc=base_desc,
            mode_key=mode_key,
            concise_ratio=float(concise_target_ratio),
            concise_min_base_len=int(concise_target_min_base_len),
            concise_min_chars=int(concise_target_min_chars),
        )

        _print_base_stats(base_desc, max_preview=int(max_token_preview), max_tok_len=int(max_token_string_len))
        if mode_key == "style_concise":
            ct = length_policy.get("concise_soft_target", {}) if isinstance(length_policy.get("concise_soft_target"), dict) else {}
            if ct.get("applied") and isinstance(ct.get("target_chars"), int):
                print(f"Concise soft target (applied): target_chars={ct.get('target_chars')} (base_len={base_len_chars})")
            else:
                print(f"Concise soft target (not applied): reason={ct.get('reason')} (base_len={base_len_chars})")

        candidates: List[Dict[str, Any]] = []
        last_generated_text: Optional[str] = None

        while True:
            if not candidates:
                try:
                    candidates, last_generated_text = generate_k_candidates_with_stats(
                        client=client,
                        tool_name=name,
                        base_description=base_desc,
                        model=llm_model,
                        seed=seed,
                        max_tokens=max_tokens,
                        retry_on_length=retry_on_length,
                        retry_max_tokens=retry_max_tokens,
                        mode_key=mode_key,
                        style_spec=style_spec,
                        generation_round=generation_round,
                        k=int(num_candidates),
                        previous_rewrite_hint=previous_rewrite_hint,
                        min_sleep_sec_between_calls=float(min_sleep_sec_between_calls),
                        length_policy=length_policy,
                    )
                except Exception as e:
                    print(f"\nLLM ERROR (candidate set generation): {e}")
                    cmd = _normalize_cmd(input("Choice [m=manual, s=skip, q=quit] > "))
                    now = int(time.time())

                    if cmd == "q":
                        quit_requested = True
                        resume_next_index_1based = idx
                        break

                    if cmd == "s":
                        decisions_by_instance[instance_key] = ("skipped", None, None)
                        _append_audit_event(
                            audit_file,
                            {
                                "event_type": "decision",
                                "ts": now,
                                "session_id": session_id,
                                "status": "skipped",
                                "tool_name": name,
                                "instance_key": instance_key,
                                "record_id": rid,
                                "tool_index": tool_i,
                                "model": llm_model,
                                "seed": seed,
                                "mode": mode_key,
                                "base_description": base_desc,
                                "final_description": None,
                                "source": "user",
                                "note": "skip_after_llm_error",
                                "length_policy": length_policy,
                            },
                        )
                        session_summary["skipped"] += 1
                        break

                    if cmd == "m":
                        manual_final = input("Manual final description (empty cancels) > ").rstrip("\n").strip()
                        status = "manual" if manual_final else "skipped"
                        decisions_by_instance[instance_key] = (status, manual_final or None, None)
                        diff_stats = compute_candidate_stats(
                            base_text=base_desc,
                            cand_text=manual_final,
                            mode_key=mode_key,
                            length_policy=length_policy,
                        ) if manual_final else None

                        _append_audit_event(
                            audit_file,
                            {
                                "event_type": "decision",
                                "ts": now,
                                "session_id": session_id,
                                "status": status,
                                "tool_name": name,
                                "instance_key": instance_key,
                                "record_id": rid,
                                "tool_index": tool_i,
                                "model": llm_model,
                                "seed": seed,
                                "mode": mode_key,
                                "base_description": base_desc,
                                "final_description": manual_final or None,
                                "source": "user",
                                "note": "manual_after_llm_error",
                                "diff_stats": diff_stats,
                                "length_policy": length_policy,
                            },
                        )
                        if status == "manual":
                            session_summary["manual"] += 1
                        else:
                            session_summary["skipped"] += 1
                        break

                    candidates = []
                    continue

                # Candidate-set generation event is logged for auditability (include length metrics).
                _append_audit_event(
                    audit_file,
                    {
                        "event_type": "candidates_generated",
                        "ts": int(time.time()),
                        "session_id": session_id,
                        "tool_name": name,
                        "instance_key": instance_key,
                        "record_id": rid,
                        "tool_index": tool_i,
                        "mode": mode_key,
                        "model": llm_model,
                        "seed": seed,
                        "generation_round": int(generation_round),
                        "num_candidates_requested": int(num_candidates),
                        "base_len_chars": int(base_len_chars),
                        "length_policy": length_policy,
                        "candidates_summary": [
                            {
                                "candidate_index": c.get("candidate_index"),
                                "text_sha256": _sha256_text((c.get("text") or "").strip()),
                                "text_len": len((c.get("text") or "").strip()),
                                "error": c.get("error"),
                                "duplicate": bool(c.get("duplicate", False)),
                                "risk_label": ((c.get("stats") or {}).get("risk_label") if isinstance(c.get("stats"), dict) else None),
                                "similarity_ratio": ((c.get("stats") or {}).get("similarity_ratio") if isinstance(c.get("stats"), dict) else None),
                                "len_ratio": ((c.get("stats") or {}).get("len_ratio") if isinstance(c.get("stats"), dict) else None),
                                "len_delta_chars": ((c.get("stats") or {}).get("len_delta_chars") if isinstance(c.get("stats"), dict) else None),
                                "concise_soft_target_applied": ((c.get("stats") or {}).get("concise_soft_target_applied") if isinstance(c.get("stats"), dict) else None),
                                "concise_soft_target_chars": ((c.get("stats") or {}).get("concise_soft_target_chars") if isinstance(c.get("stats"), dict) else None),
                                "within_soft_target": ((c.get("stats") or {}).get("within_soft_target") if isinstance(c.get("stats"), dict) else None),
                            }
                            for c in candidates
                        ],
                    },
                )

            print("\nCandidates overview:")
            for c in candidates:
                _print_candidate_summary_line(
                    int(c.get("candidate_index") or 0),
                    c,
                    max_preview=int(max_token_preview),
                    max_tok_len=int(max_token_string_len),
                    snippet_chars=int(candidate_snippet_chars),
                )

            cmd = _normalize_cmd(
                input(
                    f"\nChoice [ENTER=accept #1, 1..{int(num_candidates)}=accept, r=regen, e=edit, m=manual, s=skip, q=quit, p<idx>=preview] > "
                )
            )
            now = int(time.time())

            if cmd == "q":
                quit_requested = True
                resume_next_index_1based = idx
                break

            if cmd == "s":
                decisions_by_instance[instance_key] = ("skipped", None, None)
                _append_audit_event(
                    audit_file,
                    {
                        "event_type": "decision",
                        "ts": now,
                        "session_id": session_id,
                        "status": "skipped",
                        "tool_name": name,
                        "instance_key": instance_key,
                        "record_id": rid,
                        "tool_index": tool_i,
                        "model": llm_model,
                        "seed": seed,
                        "mode": mode_key,
                        "base_description": base_desc,
                        "final_description": None,
                        "source": "user",
                        "note": "skip",
                        "length_policy": length_policy,
                    },
                )
                session_summary["skipped"] += 1
                break

            if cmd == "r":
                generation_round += 1
                regen_counts[instance_key] = int(generation_round)

                if last_generated_text and isinstance(last_generated_text, str) and last_generated_text.strip():
                    hint = last_generated_text.strip()
                    if len(hint) > max_prev:
                        hint = hint[:max_prev].rstrip()
                    previous_rewrite_hint = hint
                    last_rejected_text_by_instance[instance_key] = hint

                _append_audit_event(
                    audit_file,
                    {
                        "event_type": "regenerate",
                        "ts": now,
                        "session_id": session_id,
                        "tool_name": name,
                        "instance_key": instance_key,
                        "record_id": rid,
                        "tool_index": tool_i,
                        "mode": mode_key,
                        "generation_round": int(generation_round),
                        "last_generated_text": previous_rewrite_hint,
                        "last_generated_text_sha256": _sha256_text(previous_rewrite_hint or ""),
                        "length_policy": length_policy,
                    },
                )

                candidates = []
                last_generated_text = None
                if min_sleep_sec_between_calls > 0:
                    time.sleep(float(min_sleep_sec_between_calls))
                continue

            if cmd == "m":
                manual_final = input("Manual final description (empty cancels) > ").rstrip("\n").strip()
                status = "manual" if manual_final else "skipped"
                decisions_by_instance[instance_key] = (status, manual_final or None, None)

                diff_stats = compute_candidate_stats(
                    base_text=base_desc,
                    cand_text=manual_final,
                    mode_key=mode_key,
                    length_policy=length_policy,
                ) if manual_final else None

                _append_audit_event(
                    audit_file,
                    {
                        "event_type": "decision",
                        "ts": now,
                        "session_id": session_id,
                        "status": status,
                        "tool_name": name,
                        "instance_key": instance_key,
                        "record_id": rid,
                        "tool_index": tool_i,
                        "model": llm_model,
                        "seed": seed,
                        "mode": mode_key,
                        "base_description": base_desc,
                        "final_description": manual_final or None,
                        "source": "user",
                        "note": "manual_replace",
                        "diff_stats": diff_stats,
                        "length_policy": length_policy,
                    },
                )
                if status == "manual":
                    session_summary["manual"] += 1
                else:
                    session_summary["skipped"] += 1
                break

            if cmd == "e":
                raw_idx = input(f"Candidate index to edit [1..{int(num_candidates)}] (empty=1) > ").strip()
                chosen_i = 1
                if raw_idx and raw_idx.isdigit():
                    chosen_i = int(raw_idx)
                if not (1 <= chosen_i <= int(num_candidates)):
                    print("Invalid candidate index.")
                    continue

                cand = candidates[chosen_i - 1] if (chosen_i - 1) < len(candidates) else None
                base_text = (cand.get("text") or "").strip() if isinstance(cand, dict) else ""
                if base_text:
                    print("\nSelected candidate text:")
                    print(base_text)
                else:
                    print("\nSelected candidate is empty; editing starts from empty string.")
                    base_text = ""

                edited = input("Edit final description (empty cancels) > ").rstrip("\n").strip()
                status = "edited" if edited else "skipped"
                bundle = cand.get("bundle") if isinstance(cand, dict) else None
                stats = compute_candidate_stats(
                    base_text=base_desc,
                    cand_text=edited,
                    mode_key=mode_key,
                    length_policy=length_policy,
                ) if edited else None

                decisions_by_instance[instance_key] = (status, edited or None, bundle if isinstance(bundle, dict) else None)
                _append_audit_event(
                    audit_file,
                    {
                        "event_type": "decision",
                        "ts": now,
                        "session_id": session_id,
                        "status": status,
                        "tool_name": name,
                        "instance_key": instance_key,
                        "record_id": rid,
                        "tool_index": tool_i,
                        "model": llm_model,
                        "seed": seed,
                        "mode": mode_key,
                        "base_description": base_desc,
                        "final_description": edited or None,
                        "source": "user",
                        "note": "edit_candidate",
                        "chosen_candidate_index": int(chosen_i),
                        "llm_bundle": bundle if isinstance(bundle, dict) else None,
                        "diff_stats": stats,
                        "generation_round": int(generation_round),
                        "length_policy": length_policy,
                    },
                )
                if status == "edited":
                    session_summary["edited"] += 1
                else:
                    session_summary["skipped"] += 1
                break

            choice_i = 1 if cmd == "y" else _parse_candidate_choice(cmd, k=int(num_candidates))
            if choice_i is not None:
                if not (1 <= int(choice_i) <= int(num_candidates)):
                    print("Invalid candidate index.")
                    continue
                cand = candidates[int(choice_i) - 1] if (int(choice_i) - 1) < len(candidates) else None
                if not isinstance(cand, dict):
                    print("Candidate not available.")
                    continue
                if cand.get("error") or not (cand.get("text") or "").strip():
                    print("Selected candidate is not acceptable (empty or error).")
                    _print_candidate_full(int(choice_i), cand, max_preview=int(max_token_preview), max_tok_len=int(max_token_string_len))
                    continue

                final_desc = (cand.get("text") or "").strip()
                bundle = cand.get("bundle") if isinstance(cand.get("bundle"), dict) else None
                stats = cand.get("stats") if isinstance(cand.get("stats"), dict) else compute_candidate_stats(
                    base_text=base_desc,
                    cand_text=final_desc,
                    mode_key=mode_key,
                    length_policy=length_policy,
                )

                decisions_by_instance[instance_key] = ("accepted", final_desc, bundle)
                _append_audit_event(
                    audit_file,
                    {
                        "event_type": "decision",
                        "ts": now,
                        "session_id": session_id,
                        "status": "accepted",
                        "tool_name": name,
                        "instance_key": instance_key,
                        "record_id": rid,
                        "tool_index": tool_i,
                        "model": llm_model,
                        "seed": seed,
                        "mode": mode_key,
                        "base_description": base_desc,
                        "final_description": final_desc,
                        "source": "llm",
                        "chosen_candidate_index": int(choice_i),
                        "generation_round": int(generation_round),
                        "llm_bundle": bundle,
                        "diff_stats": stats,
                        "length_policy": length_policy,
                    },
                )

                # Session summary update.
                session_summary["accepted"] += 1
                rl = (stats.get("risk_label") if isinstance(stats, dict) else None) or "NA"
                if rl not in session_summary["accepted_risk_labels"]:
                    rl = "NA"
                session_summary["accepted_risk_labels"][rl] += 1

                sim = stats.get("similarity_ratio") if isinstance(stats, dict) else None
                if isinstance(sim, (int, float)):
                    session_summary["accepted_similarity_sum"] += float(sim)
                    session_summary["accepted_similarity_n"] += 1

                # Length summary
                bl = stats.get("base_len_chars") if isinstance(stats, dict) else None
                cl = stats.get("cand_len_chars") if isinstance(stats, dict) else None
                lr = stats.get("len_ratio") if isinstance(stats, dict) else None
                ld = stats.get("len_delta_chars") if isinstance(stats, dict) else None
                if isinstance(bl, int) and isinstance(cl, int):
                    session_summary["accepted_base_chars_sum"] += int(bl)
                    session_summary["accepted_cand_chars_sum"] += int(cl)
                if isinstance(lr, (int, float)):
                    session_summary["accepted_len_ratio_sum"] += float(lr)
                    session_summary["accepted_len_ratio_n"] += 1
                if isinstance(ld, int):
                    session_summary["accepted_len_delta_sum"] += int(ld)

                wst = stats.get("within_soft_target") if isinstance(stats, dict) else None
                st_applied = bool(stats.get("concise_soft_target_applied", False)) if isinstance(stats, dict) else False
                if st_applied:
                    session_summary["accepted_soft_target_applicable_n"] += 1
                    if wst is True:
                        session_summary["accepted_within_soft_target_n"] += 1

                break

            if cmd.startswith("p"):
                raw_idx = cmd[1:].strip()
                if raw_idx.isdigit():
                    vi = int(raw_idx)
                    if 1 <= vi <= int(num_candidates):
                        _print_candidate_full(vi, candidates[vi - 1], max_preview=int(max_token_preview), max_tok_len=int(max_token_string_len))
                        continue
                print("Preview command format: p<index>, for example: p2")
                continue

            print("Invalid command. Preview: p<index> (example: p2).")

        if quit_requested:
            break

    # ========= Apply decisions to file =========
    tmp_path = path.with_suffix(path.suffix + ".tmp")
    updated_count = 0
    patch_failures = 0

    with path.open("r", encoding="utf-8") as fin, tmp_path.open("w", encoding="utf-8") as fout:
        for raw_line in fin:
            line = raw_line.rstrip("\n")
            if not line.strip():
                fout.write(line + "\n")
                continue

            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                fout.write(line + "\n")
                continue

            if not isinstance(record, dict):
                fout.write(json.dumps(record, ensure_ascii=False) + "\n")
                continue

            rid = _record_id(record, tool_field=tool_field)
            tools = record.get(tool_field)

            if isinstance(tools, list):
                new_tools: List[Any] = []
                for tool_index, entry in enumerate(tools):
                    tool_obj, kind = _load_tool(entry)
                    if not tool_obj:
                        new_tools.append(entry)
                        continue

                    instance_key = _tool_instance_key(rid, tool_index, tool_obj)
                    decision = decisions_by_instance.get(instance_key)

                    if decision is None:
                        new_tools.append(entry)
                        continue

                    status, new_desc, llm_bundle = decision
                    if status in ("accepted", "edited", "manual") and new_desc:
                        if kind == "json_str" and isinstance(entry, str):
                            already_ok = False
                            try:
                                obj0 = json.loads(entry)
                                if isinstance(obj0, dict) and obj0.get("description") == new_desc:
                                    already_ok = True
                            except Exception:
                                already_ok = False

                            if already_ok:
                                new_tools.append(entry)
                                continue

                            patched, ok, reason = _replace_top_level_string_field_in_raw_object(entry, "description", new_desc)
                            if ok:
                                new_tools.append(patched)
                                updated_count += 1
                            else:
                                fallback_ok = False
                                fallback_patched = entry
                                if allow_reserialize_fallback:
                                    try:
                                        obj = json.loads(entry)
                                        if isinstance(obj, dict):
                                            obj["description"] = new_desc
                                            fallback_patched = json.dumps(obj, ensure_ascii=False)
                                            fallback_ok = True
                                    except Exception:
                                        fallback_ok = False

                                if fallback_ok:
                                    new_tools.append(fallback_patched)
                                    updated_count += 1
                                    _append_audit_event(
                                        audit_file,
                                        {
                                            "event_type": "patch_fallback_reserialize",
                                            "ts": int(time.time()),
                                            "session_id": session_id,
                                            "instance_key": instance_key,
                                            "record_id": rid,
                                            "tool_index": tool_index,
                                            "tool_name": tool_obj.get("name"),
                                            "mode": mode_key,
                                            "entry_sha256_before": _sha256_text(entry),
                                            "entry_sha256_after": _sha256_text(fallback_patched),
                                            "patch_reason": reason,
                                        },
                                    )
                                else:
                                    new_tools.append(entry)
                                    patch_failures += 1
                        else:
                            if tool_obj.get("description") == new_desc:
                                new_tools.append(tool_obj)
                                continue
                            tool_obj["description"] = new_desc
                            new_tools.append(tool_obj)
                            updated_count += 1
                    else:
                        new_tools.append(entry)

                record[tool_field] = new_tools

            fout.write(json.dumps(record, ensure_ascii=False) + "\n")

    if create_backup_of_target:
        bak_path = path.with_suffix(path.suffix + ".bak")
        if not bak_path.exists():
            shutil.copy2(path, bak_path)

    tmp_path.replace(path)
    after_sha = _sha256_file(path)

    n_reviewed = len(decisions_by_instance)
    n_skipped = sum(1 for st, _, _ in decisions_by_instance.values() if st == "skipped")
    completed = (n_reviewed >= n_total) and (not quit_requested)

    avg_sim = None
    if int(session_summary["accepted_similarity_n"]) > 0:
        avg_sim = float(session_summary["accepted_similarity_sum"]) / float(session_summary["accepted_similarity_n"])

    avg_len_ratio = None
    if int(session_summary["accepted_len_ratio_n"]) > 0:
        avg_len_ratio = float(session_summary["accepted_len_ratio_sum"]) / float(session_summary["accepted_len_ratio_n"])

    avg_len_delta = None
    if int(session_summary["accepted"]) > 0:
        avg_len_delta = float(session_summary["accepted_len_delta_sum"]) / float(session_summary["accepted"])

    avg_base_len = None
    avg_cand_len = None
    if int(session_summary["accepted"]) > 0:
        avg_base_len = float(session_summary["accepted_base_chars_sum"]) / float(session_summary["accepted"])
        avg_cand_len = float(session_summary["accepted_cand_chars_sum"]) / float(session_summary["accepted"])

    _append_audit_event(
        audit_file,
        {
            "event_type": "run_end",
            "ts": int(time.time()),
            "session_id": session_id,
            "mode": mode_key,
            "model": llm_model,
            "seed": seed,
            "dataset_path": str(path),
            "dataset_sha256_at_session_start": before_sha,
            "dataset_sha256_at_session_end": after_sha,
            "n_total_occurrences": n_total,
            "n_reviewed_total": n_reviewed,
            "n_updated_this_session": updated_count,
            "n_skipped_total": n_skipped,
            "completed": bool(completed),
            "quit_requested": bool(quit_requested),
            "raw_patch_failures_this_session": patch_failures,
            "resume_next_index_1based": resume_next_index_1based if quit_requested else (n_total + 1 if completed else None),
            "session_summary": {
                "accepted": int(session_summary["accepted"]),
                "edited": int(session_summary["edited"]),
                "manual": int(session_summary["manual"]),
                "skipped": int(session_summary["skipped"]),
                "accepted_risk_labels": session_summary["accepted_risk_labels"],
                "accepted_avg_similarity": avg_sim,
                "accepted_similarity_n": int(session_summary["accepted_similarity_n"]),
                "accepted_avg_len_ratio": avg_len_ratio,
                "accepted_len_ratio_n": int(session_summary["accepted_len_ratio_n"]),
                "accepted_avg_len_delta_chars": avg_len_delta,
                "accepted_avg_base_len_chars": avg_base_len,
                "accepted_avg_cand_len_chars": avg_cand_len,
                "accepted_soft_target_applicable_n": int(session_summary["accepted_soft_target_applicable_n"]),
                "accepted_within_soft_target_n": int(session_summary["accepted_within_soft_target_n"]),
            },
            "length_policy_config": length_policy_config,
        },
    )

    print("\nChanges applied.")
    print(f"Mode: {mode_key}")
    print(f"Candidates per tool: {int(num_candidates)}")
    print(f"Candidate snippet chars: {int(candidate_snippet_chars)}")
    print(f"Descriptions updated (this session): {updated_count}")
    if patch_failures:
        print(f"Raw JSON-string patch failures (left unchanged): {patch_failures}")
    print(f"Reviewed total (from audit): {n_reviewed} / {n_total}")
    print(f"Completed: {completed} (quit_requested={quit_requested})")
    if quit_requested and resume_next_index_1based is not None:
        print(f"Resume next time from: [{resume_next_index_1based}/{n_total}]")
    print(f"Updated file: {path}")
    print(f"Audit file (same on resume): {audit_file}")

    print("\nSession summary (heuristic):")
    print(
        f"  accepted={int(session_summary['accepted'])}, edited={int(session_summary['edited'])}, "
        f"manual={int(session_summary['manual'])}, skipped={int(session_summary['skipped'])}"
    )
    print(f"  accepted_risk_labels={session_summary['accepted_risk_labels']}")
    if avg_sim is not None:
        print(f"  accepted_avg_similarity={avg_sim:.2f} (n={int(session_summary['accepted_similarity_n'])})")
    if avg_len_ratio is not None:
        print(f"  accepted_avg_len_ratio={avg_len_ratio:.2f} (n={int(session_summary['accepted_len_ratio_n'])})")
    if avg_len_delta is not None:
        print(f"  accepted_avg_len_delta_chars={avg_len_delta:+.1f}")
    if avg_base_len is not None and avg_cand_len is not None:
        print(f"  accepted_avg_base_len_chars={avg_base_len:.1f}; accepted_avg_cand_len_chars={avg_cand_len:.1f}")
    if mode_key == "style_concise":
        print(
            "  accepted_soft_target: "
            f"applicable={int(session_summary['accepted_soft_target_applicable_n'])}, "
            f"within={int(session_summary['accepted_within_soft_target_n'])}"
        )


def _derive_working_copy_path(input_path: str, mode_key: str) -> str:
    p = Path(input_path)
    return str(p.with_name(f"{p.stem}.WORKING_COPY.{mode_key}{p.suffix}"))


if __name__ == "__main__":
    # ----- Inputs -----
    INPUT_JSONL = os.environ.get("INPUT_JSONL") or "When2Call/data/test/when2call_test_llm_judge.jsonl"
    MODE_KEY = os.environ.get("MODE_KEY") or "style_concise"   # style_verbose | style_concise (aliases: style_coicnoso, style_coinceise)
    LLM_MODEL = os.environ.get("LLM_MODEL") or LLM_MODEL_DEFAULT

    mode_key_resolved, _ = _resolve_style(MODE_KEY)

    # Per-style working copy (separate dataset per style, by default).
    OUTPUT_JSONL = os.environ.get("OUTPUT_JSONL") or _derive_working_copy_path(INPUT_JSONL, mode_key_resolved)

    working = make_working_copy(INPUT_JSONL, OUTPUT_JSONL, overwrite=False)
    print(f"Working copy: {working}")

    # ----- Runtime knobs -----
    seed_env = os.environ.get("GEMINI_SEED")
    seed_val: Optional[int] = int(seed_env.strip()) if (seed_env and seed_env.strip()) else None

    max_tokens_env = os.environ.get("GEMINI_MAX_TOKENS")
    max_tokens_val = int(max_tokens_env.strip()) if (max_tokens_env and max_tokens_env.strip()) else DEFAULT_MAX_TOKENS

    retry_max_tokens_env = os.environ.get("GEMINI_RETRY_MAX_TOKENS")
    retry_max_tokens_val = int(retry_max_tokens_env.strip()) if (retry_max_tokens_env and retry_max_tokens_env.strip()) else RETRY_MAX_TOKENS

    allow_reserialize_env = os.environ.get("ALLOW_RESERIALIZE_FALLBACK")
    allow_reserialize_val = (
        bool(int(allow_reserialize_env.strip()))
        if (allow_reserialize_env and allow_reserialize_env.strip())
        else DEFAULT_ALLOW_RESERIALIZE_FALLBACK
    )

    num_candidates_val = _safe_int_env("NUM_CANDIDATES", DEFAULT_NUM_CANDIDATES)
    min_sleep_val = _safe_float_env("MIN_SLEEP_SEC_BETWEEN_CALLS", 0.0)

    max_preview_val = _safe_int_env("STATS_MAX_TOKEN_PREVIEW", DEFAULT_MAX_TOKEN_PREVIEW)
    max_tok_len_val = _safe_int_env("STATS_MAX_TOKEN_STRING_LEN", DEFAULT_MAX_TOKEN_STRING_LEN)

    cand_snippet_val = _safe_int_env("CANDIDATE_SNIPPET_CHARS", DEFAULT_CANDIDATE_SNIPPET_CHARS)

    # Concise soft target (reviewer-proof): configurable via env.
    concise_ratio_val = _safe_float_env("CONCISE_TARGET_RATIO", DEFAULT_CONCISE_TARGET_RATIO)
    concise_min_base_len_val = _safe_int_env("CONCISE_TARGET_MIN_BASE_LEN", DEFAULT_CONCISE_TARGET_MIN_BASE_LEN)
    concise_min_chars_val = _safe_int_env("CONCISE_TARGET_MIN_CHARS", DEFAULT_CONCISE_TARGET_MIN_CHARS)

    interactive_llm_tools_in_jsonl(
        working,
        tool_field="tools",
        create_backup_of_target=False,
        llm_model=LLM_MODEL,
        seed=seed_val,
        max_tokens=max_tokens_val,
        retry_on_length=RETRY_ON_LENGTH,
        retry_max_tokens=retry_max_tokens_val,
        allow_reserialize_fallback=allow_reserialize_val,
        min_sleep_sec_between_calls=float(min_sleep_val),
        audit_dir=os.environ.get("AUDIT_DIR") or "audit",
        mode_key=mode_key_resolved,
        num_candidates=int(num_candidates_val),
        max_token_preview=int(max_preview_val),
        max_token_string_len=int(max_tok_len_val),
        candidate_snippet_chars=int(cand_snippet_val),
        concise_target_ratio=float(concise_ratio_val),
        concise_target_min_base_len=int(concise_min_base_len_val),
        concise_target_min_chars=int(concise_min_chars_val),
    )


Working copy: When2Call/data/test/when2call_test_llm_judge.WORKING_COPY.style_concise.jsonl
Target: When2Call/data/test/when2call_test_llm_judge.WORKING_COPY.style_concise.jsonl
Mode: style_concise
Audit file (RESUMABLE): audit/55b59d7780f3/when2call_test_llm_judge.WORKING_COPY.style_concise.55b59d7780f3.style_concise.gemini-2.5-flash.K2.audit.jsonl
Tool occurrences total: 978
Resume position: [4/978] (previously reviewed: 3)
LLM: gemini-2.5-flash @ https://generativelanguage.googleapis.com/v1beta/openai/
Candidates per tool: 2
Candidate snippet chars: 160
Concise soft target: ratio=0.70, min_base_len=160, min_chars=80
Max tokens: 512; retry_on_length=True; retry_max_tokens=1024
Commands: ENTER/ok=accept #1, 1..K=accept candidate, r=regenerate K, e=edit candidate, m=manual, s=skip, q=quit, p<idx>=preview (e.g., p2)

[4/978] Buses_3_BuyBusTicket
instance_key: rec:64cbc7e8819e45258b49e186164c9fad:t1:9a77a4750b12a8be645d3b59a745bc6f (record_id=64cbc7e8819e45258b49e186164c9fad, tool_index=

In [1]:
#!/usr/bin/env python3 december 28 
# two styles
import json
import shutil
import os
import time
import hashlib
from pathlib import Path
from typing import Dict, Any, Optional, Tuple, List

from openai import OpenAI


# ========= Config =========
GEMINI_BASE_URL = "https://generativelanguage.googleapis.com/v1beta/openai/"
LLM_MODEL_DEFAULT = "gemini-2.5-flash"

HASH_HEX_LEN = 32

DEFAULT_MAX_TOKENS = 512
RETRY_ON_LENGTH = True
RETRY_MAX_TOKENS = 1024

DEFAULT_ALLOW_RESERIALIZE_FALLBACK = False

# How much of the rejected previous rewrite to store in audit (for resume) and to feed back into prompt.
DEFAULT_MAX_PREV_REWRITE_CHARS = 800


# ========= Styles =========
STYLE_SPECS: Dict[str, Dict[str, Any]] = {
    "style_verbose": {
        "system": (
            "Rewrite tool descriptions.\n"
            "Hard constraints:\n"
            "- Preserve meaning exactly; do not add new capabilities, steps, motivations, benefits, or context.\n"
            "- Do not delete information present in the original description.\n"
            "- Do not introduce new parameter names, IDs, field names, flags, or implementation details.\n"
            "- If parameter/field names/IDs/flags already appear in the original description, keep them (do not remove them).\n"
            "- Do not add examples, normative language, or assumptions.\n"
            "- Keep the same subject (the tool) and the same scope.\n"
            "- Output only the rewritten description text, nothing else.\n"
            "- Style: verbose but controlled; keep it concise and complete (1–2 sentences), clear and direct.\n"
        ),
        "regen_diversity_instruction": (
            "Return a meaning-equivalent rewrite that is lexically different from your previous rewrite; "
            "avoid repeating the same sentence structure."
        ),
        "max_prev_rewrite_chars": 800,
    },
    "style_concise": {
        "system": (
            "Rewrite tool descriptions.\n"
            "Hard constraints:\n"
            "- Preserve meaning exactly; do not add new capabilities, steps, motivations, benefits, or context.\n"
            "- Do not delete information present in the original description.\n"
            "- Do not introduce new parameter names, IDs, field names, flags, or implementation details.\n"
            "- If parameter/field names/IDs/flags already appear in the original description, keep them (do not remove them).\n"
            "- Do not add examples, normative language, or assumptions.\n"
            "- Keep the same subject (the tool) and the same scope.\n"
            "- Output only the rewritten description text, nothing else.\n"
            " - Style: concise and controlled; 1 sentence preferred, 2 max.\n"
            " - Length constraint: aim to be shorter than the base description; if the base description is already short, do not exceed its length.\n"
            " - Compression rule: remove redundancy, filler, and hedging; keep all explicitly stated constraints/details.\n"

        ),
        "regen_diversity_instruction": (
            "You must produce a different paraphrase than the previous rewrite. "
            "Do not reuse the same sentence skeleton or distinctive phrases. "
            "Keep meaning exactly the same; only vary wording and structure."
        ),
        "max_prev_rewrite_chars": 600,
    },
    # Alias to tolerate the user's misspelling "coicnoso/coinceise"
    "style_coicnoso": {},   # filled after dict creation
    "style_coinceise": {},  # filled after dict creation
}
STYLE_SPECS["style_coicnoso"] = STYLE_SPECS["style_concise"]
STYLE_SPECS["style_coinceise"] = STYLE_SPECS["style_concise"]


def _resolve_style(mode_key: str) -> Tuple[str, Dict[str, Any]]:
    mk = (mode_key or "").strip()
    if not mk:
        mk = "style_verbose"
    if mk not in STYLE_SPECS:
        raise ValueError(f"Unknown MODE_KEY='{mk}'. Supported: {', '.join(sorted(STYLE_SPECS.keys()))}")
    return mk, STYLE_SPECS[mk]


# ========= Client =========
def make_gemini_client() -> OpenAI:
    token = os.environ.get("TOKEN_GEMINI")
    if not token:
        raise RuntimeError("TOKEN_GEMINI environment variable is not set.")
    return OpenAI(api_key=token, base_url=GEMINI_BASE_URL)


# ========= Small utils =========
def _json_safe(obj: Any) -> Any:
    if obj is None or isinstance(obj, (str, int, float, bool)):
        return obj
    if isinstance(obj, dict):
        return {str(k): _json_safe(v) for k, v in obj.items()}
    if isinstance(obj, (list, tuple)):
        return [_json_safe(x) for x in obj]
    if hasattr(obj, "model_dump") and callable(getattr(obj, "model_dump")):
        try:
            return _json_safe(obj.model_dump())
        except Exception:
            pass
    if hasattr(obj, "dict") and callable(getattr(obj, "dict")):
        try:
            return _json_safe(obj.dict())
        except Exception:
            pass
    if hasattr(obj, "__dict__"):
        try:
            return _json_safe(vars(obj))
        except Exception:
            pass
    try:
        return str(obj)
    except Exception:
        return None


def _sha256_text(s: str) -> str:
    return hashlib.sha256((s or "").encode("utf-8")).hexdigest()


def _canonical_json(obj: Any) -> str:
    return json.dumps(obj, ensure_ascii=False, sort_keys=True, separators=(",", ":"))


def _sha256_file(path: Path) -> str:
    h = hashlib.sha256()
    with path.open("rb") as f:
        for chunk in iter(lambda: f.read(1024 * 1024), b""):
            h.update(chunk)
    return h.hexdigest()


# ========= Raw JSON-string patcher (for tools stored as JSON strings) =========
def _extract_json_string_value(raw_json: str, key: str) -> Optional[str]:
    token = f'"{key}"'
    i = raw_json.find(token)
    if i < 0:
        return None
    i = raw_json.find(":", i + len(token))
    if i < 0:
        return None
    i += 1
    n = len(raw_json)
    while i < n and raw_json[i] in " \t\r\n":
        i += 1
    if i >= n or raw_json[i] != '"':
        return None
    start = i
    i += 1
    esc = False
    while i < n:
        c = raw_json[i]
        if esc:
            esc = False
        else:
            if c == "\\":
                esc = True
            elif c == '"':
                return raw_json[start : i + 1]
        i += 1
    return None


def _decode_raw_json_string(raw_json_string_with_quotes: str) -> str:
    try:
        obj = json.loads('{"description":' + raw_json_string_with_quotes + "}")
        return obj.get("description") or ""
    except json.JSONDecodeError:
        return ""


def _get_description_for_print(entry: Any) -> Tuple[str, str]:
    if isinstance(entry, str):
        raw = _extract_json_string_value(entry, "description")
        if raw is not None:
            return raw, "raw_json"
        try:
            obj = json.loads(entry)
            return obj.get("description") or "", "rendered"
        except json.JSONDecodeError:
            return "", "rendered"
    if isinstance(entry, dict):
        return entry.get("description") or "", "rendered"
    return "", "rendered"


def _load_tool(entry: Any) -> Tuple[Optional[Dict[str, Any]], str]:
    if isinstance(entry, str):
        try:
            return json.loads(entry), "json_str"
        except json.JSONDecodeError:
            return None, "other"
    if isinstance(entry, dict):
        return entry, "dict"
    return None, "other"


def _skip_ws(s: str, i: int) -> int:
    n = len(s)
    while i < n and s[i] in " \t\r\n":
        i += 1
    return i


def _scan_string_span(s: str, i: int) -> Optional[Tuple[int, int]]:
    n = len(s)
    if i >= n or s[i] != '"':
        return None
    j = i + 1
    esc = False
    while j < n:
        c = s[j]
        if esc:
            esc = False
        else:
            if c == "\\":
                esc = True
            elif c == '"':
                return (i, j + 1)
        j += 1
    return None


def _scan_number_span(s: str, i: int) -> Optional[Tuple[int, int]]:
    n = len(s)
    j = i
    if j < n and s[j] == "-":
        j += 1
    if j >= n:
        return None
    if s[j] == "0":
        j += 1
    elif s[j].isdigit():
        while j < n and s[j].isdigit():
            j += 1
    else:
        return None
    if j < n and s[j] == ".":
        j += 1
        if j >= n or not s[j].isdigit():
            return None
        while j < n and s[j].isdigit():
            j += 1
    if j < n and s[j] in "eE":
        j += 1
        if j < n and s[j] in "+-":
            j += 1
        if j >= n or not s[j].isdigit():
            return None
        while j < n and s[j].isdigit():
            j += 1
    return (i, j)


def _scan_literal_span(s: str, i: int) -> Optional[Tuple[int, int]]:
    for lit in ("true", "false", "null"):
        if s.startswith(lit, i):
            return (i, i + len(lit))
    return None


def _scan_container_span(s: str, i: int) -> Optional[Tuple[int, int]]:
    n = len(s)
    if i >= n:
        return None

    opener = s[i]
    if opener not in "{[":
        return None

    stack: List[str] = ["}" if opener == "{" else "]"]
    j = i + 1
    in_str = False
    esc = False

    while j < n:
        c = s[j]

        if in_str:
            if esc:
                esc = False
            else:
                if c == "\\":
                    esc = True
                elif c == '"':
                    in_str = False
            j += 1
            continue

        if c == '"':
            in_str = True
            j += 1
            continue

        if c == "{":
            stack.append("}")
            j += 1
            continue
        if c == "[":
            stack.append("]")
            j += 1
            continue

        if c in "}]":
            if not stack:
                return None
            expected = stack[-1]
            if c != expected:
                return None
            stack.pop()
            j += 1
            if not stack:
                return (i, j)
            continue

        j += 1

    return None


def _is_value_delim(c: str) -> bool:
    return c in ",}]"


def _scan_value_span(s: str, i: int) -> Optional[Tuple[int, int]]:
    n = len(s)
    i = _skip_ws(s, i)
    if i >= n:
        return None

    c = s[i]
    if c == '"':
        return _scan_string_span(s, i)
    if c in "{[":
        return _scan_container_span(s, i)

    span: Optional[Tuple[int, int]]
    if c == "-" or c.isdigit():
        span = _scan_number_span(s, i)
    else:
        span = _scan_literal_span(s, i)

    if not span:
        return None

    _, end = span
    k = _skip_ws(s, end)
    if k >= n:
        return span
    if _is_value_delim(s[k]):
        return span
    return None


def _replace_top_level_string_field_in_raw_object(raw_json_obj: str, key: str, new_value: str) -> Tuple[str, bool, str]:
    s = raw_json_obj
    n = len(s)

    i = _skip_ws(s, 0)
    if i >= n or s[i] != "{":
        return raw_json_obj, False, "not_object"

    i += 1
    found_any_key = False
    expect_key = True

    while True:
        i = _skip_ws(s, i)
        if i >= n:
            return raw_json_obj, False, "cannot_scan_value"

        if expect_key:
            if s[i] == "}":
                return raw_json_obj, False, "key_not_found"
            if s[i] != '"':
                return raw_json_obj, False, "invalid_key_string"

            key_span = _scan_string_span(s, i)
            if not key_span:
                return raw_json_obj, False, "invalid_key_string"

            found_any_key = True
            k_start, k_end = key_span
            try:
                key_decoded = json.loads(s[k_start:k_end])
            except Exception:
                return raw_json_obj, False, "invalid_key_string"

            i = _skip_ws(s, k_end)
            if i >= n or s[i] != ":":
                return raw_json_obj, False, "missing_colon"

            v_span = _scan_value_span(s, i + 1)
            if not v_span:
                return raw_json_obj, False, "cannot_scan_value"

            v_start, v_end = v_span

            if key_decoded == key:
                if v_start >= n or s[v_start] != '"':
                    return raw_json_obj, False, "value_not_string"

                replacement_literal = json.dumps(new_value, ensure_ascii=False)
                patched = s[:v_start] + replacement_literal + s[v_end:]

                try:
                    obj = json.loads(patched)
                except Exception:
                    return raw_json_obj, False, "json_load_failed_after_patch"

                if isinstance(obj, dict) and obj.get(key) == new_value:
                    return patched, True, "ok"
                return raw_json_obj, False, "validation_failed_after_patch"

            i = v_end
            expect_key = False
            continue

        i = _skip_ws(s, i)
        if i >= n:
            return raw_json_obj, False, "cannot_scan_value"

        if s[i] == ",":
            i += 1
            expect_key = True
            continue
        if s[i] == "}":
            return raw_json_obj, False, ("key_not_found" if found_any_key else "key_not_found")
        return raw_json_obj, False, "cannot_scan_value"


# ========= IDs =========
def _tool_fingerprint_excluding_description(tool_obj: Dict[str, Any]) -> str:
    filtered = {k: v for k, v in tool_obj.items() if k != "description"}
    payload = _canonical_json(filtered)
    return hashlib.sha256(payload.encode("utf-8")).hexdigest()[:HASH_HEX_LEN]


def _record_id(record_obj: Dict[str, Any], tool_field: str) -> str:
    rec = dict(record_obj)
    tools = rec.get(tool_field)
    if isinstance(tools, list):
        canon_tools: List[Any] = []
        for entry in tools:
            tool_obj, kind = _load_tool(entry)
            if tool_obj is None:
                canon_tools.append({"_unparsed": entry, "_kind": kind})
            else:
                canon_tools.append({k: v for k, v in tool_obj.items() if k != "description"})
        rec[tool_field] = canon_tools
    payload = _canonical_json(rec)
    return hashlib.sha256(payload.encode("utf-8")).hexdigest()[:HASH_HEX_LEN]


def _tool_instance_key(record_id: str, tool_index: int, tool_obj: Dict[str, Any]) -> str:
    fp = _tool_fingerprint_excluding_description(tool_obj)
    return f"rec:{record_id}:t{tool_index}:{fp}"


# ========= Audit (single file, resumable) =========
def _audit_identity(dataset_path: Path, *, mode_key: str, model: str, tool_field: str) -> str:
    stable = f"{dataset_path.resolve()}|{mode_key}|{model}|{tool_field}"
    return hashlib.sha256(stable.encode("utf-8")).hexdigest()[:12]


def _audit_file_path(
    dataset_path: Path,
    *,
    audit_dir: Path,
    mode_key: str,
    model: str,
    tool_field: str,
) -> Path:
    audit_key = _audit_identity(dataset_path, mode_key=mode_key, model=model, tool_field=tool_field)
    safe_model = "".join(ch if ch.isalnum() or ch in ("-", "_", ".") else "_" for ch in model)
    out_dir = audit_dir / audit_key
    filename = f"{dataset_path.stem}.{audit_key}.{mode_key}.{safe_model}.audit.jsonl"
    return out_dir / filename


def _append_audit_event(audit_file: Path, event: Dict[str, Any]) -> None:
    audit_file.parent.mkdir(parents=True, exist_ok=True)
    safe_event = _json_safe(event)
    with audit_file.open("a", encoding="utf-8") as f:
        f.write(json.dumps(safe_event, ensure_ascii=False) + "\n")


def _load_resume_state(
    audit_file: Path,
) -> Tuple[
    Dict[str, Tuple[str, Optional[str], Optional[Dict[str, Any]]]],
    Dict[str, int],
    Dict[str, Optional[str]],
    Optional[Dict[str, Any]],
]:
    """
    Returns:
      - decisions_by_instance: instance_key -> (status, final_description, llm_bundle)
      - regen_counts: instance_key -> max regen_index observed
      - last_rejected_text: instance_key -> last rejected proposal text (from regenerate events)
      - prior_run_start: first run_start event (if any)
    """
    decisions: Dict[str, Tuple[str, Optional[str], Optional[Dict[str, Any]]]] = {}
    regen_counts: Dict[str, int] = {}
    last_rejected_text: Dict[str, Optional[str]] = {}
    prior_run_start: Optional[Dict[str, Any]] = None

    if not audit_file.exists():
        return decisions, regen_counts, last_rejected_text, None

    best_ri: Dict[str, int] = {}

    with audit_file.open("r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                ev = json.loads(line)
            except Exception:
                continue
            if not isinstance(ev, dict):
                continue

            et = ev.get("event_type")
            if et == "run_start" and prior_run_start is None:
                prior_run_start = ev

            if et == "regenerate":
                ik = ev.get("instance_key")
                ri = ev.get("regen_index")
                txt = ev.get("last_proposal_text")
                if isinstance(ik, str) and isinstance(ri, int) and ri >= 0:
                    prev = regen_counts.get(ik, 0)
                    if ri > prev:
                        regen_counts[ik] = ri
                    prev_best = best_ri.get(ik, -1)
                    if ri >= prev_best:
                        best_ri[ik] = ri
                        last_rejected_text[ik] = txt if isinstance(txt, str) else None

            if et == "decision":
                ik = ev.get("instance_key")
                status = ev.get("status")
                final_desc = ev.get("final_description")
                llm_bundle = ev.get("llm_bundle")
                if isinstance(ik, str) and isinstance(status, str):
                    decisions[ik] = (
                        status,
                        final_desc if isinstance(final_desc, str) else None,
                        llm_bundle if isinstance(llm_bundle, dict) else None,
                    )

    return decisions, regen_counts, last_rejected_text, prior_run_start


# ========= LLM helpers =========
def _sanitize_llm_output(text: str) -> str:
    t = (text or "").strip()
    if t.startswith("{") and "description" in t:
        try:
            obj = json.loads(t)
            if isinstance(obj, dict) and isinstance(obj.get("description"), str):
                t = obj["description"].strip()
        except Exception:
            pass
    if (t.startswith('"') and t.endswith('"')) or (t.startswith("'") and t.endswith("'")):
        t = t[1:-1].strip()
    return t


def _llm_chat_completion(
    *,
    client: OpenAI,
    model: str,
    messages: List[Dict[str, str]],
    temperature: float,
    max_tokens: int,
    seed: Optional[int],
) -> Tuple[str, Dict[str, Any]]:
    meta: Dict[str, Any] = {
        "seed_requested": seed,
        "seed_applied": False,
        "seed_error": None,
        "finish_reason": None,
        "usage": None,
        "max_tokens_requested": int(max_tokens),
        "max_param_used": None,
    }

    base_kwargs: Dict[str, Any] = dict(model=model, messages=messages, temperature=temperature)

    def attempt(max_param_used: str, include_seed: bool) -> Tuple[str, Dict[str, Any]]:
        req = dict(base_kwargs)
        if max_param_used == "max_completion_tokens":
            req["max_completion_tokens"] = int(max_tokens)
        else:
            req["max_tokens"] = int(max_tokens)
        if include_seed and seed is not None:
            req["seed"] = int(seed)

        resp = client.chat.completions.create(**req)
        text = (resp.choices[0].message.content or "").strip()

        meta_local = dict(meta)
        meta_local["max_param_used"] = max_param_used
        meta_local["finish_reason"] = getattr(resp.choices[0], "finish_reason", None)
        meta_local["usage"] = getattr(resp, "usage", None)
        meta_local["seed_applied"] = bool(include_seed and seed is not None)
        return text, meta_local

    def is_seed_error(e: Exception) -> bool:
        s = str(e).lower()
        return ("seed" in s) and ("unknown" in s or "unsupported" in s or "invalid" in s)

    try:
        return attempt("max_completion_tokens", include_seed=True)
    except Exception as e1:
        if seed is not None and is_seed_error(e1):
            meta["seed_error"] = str(e1)
            try:
                return attempt("max_completion_tokens", include_seed=False)
            except Exception:
                pass
        try:
            return attempt("max_tokens", include_seed=True)
        except Exception as e2:
            if seed is not None and is_seed_error(e2):
                meta["seed_error"] = str(e2)
                return attempt("max_tokens", include_seed=False)
            raise


def generate_description_via_llm(
    *,
    client: OpenAI,
    tool_name: str,
    base_description: str,
    model: str,
    seed: Optional[int],
    max_tokens: int,
    retry_on_length: bool,
    retry_max_tokens: int,
    mode_key: str,
    style_spec: Dict[str, Any],
    regen_index: int = 0,
    previous_rewrite: Optional[str] = None,
) -> Tuple[str, Dict[str, Any]]:
    system = str(style_spec["system"])
    regen_instr = str(style_spec.get("regen_diversity_instruction") or "")
    max_prev = int(style_spec.get("max_prev_rewrite_chars") or DEFAULT_MAX_PREV_REWRITE_CHARS)

    user_parts: List[str] = []
    user_parts.append(f"Tool name: {tool_name}")
    user_parts.append("Base description:")
    user_parts.append(base_description.strip() or "(empty)")
    user_parts.append("")
    user_parts.append(f"Rewrite in '{mode_key}' under the constraints.")

    if regen_index > 0:
        user_parts.append("")
        user_parts.append(f"Regeneration request: {regen_index}")
        if regen_instr:
            user_parts.append(regen_instr)
        if previous_rewrite and previous_rewrite.strip():
            prev = previous_rewrite.strip()
            if len(prev) > max_prev:
                prev = prev[:max_prev].rstrip()
            user_parts.append("")
            user_parts.append("Previous rewrite (do not reuse wording):")
            user_parts.append(prev)

    user = "\n".join(user_parts)

    raw1, meta1 = _llm_chat_completion(
        client=client,
        model=model,
        messages=[{"role": "system", "content": system}, {"role": "user", "content": user}],
        temperature=0.0,
        max_tokens=max_tokens,
        seed=seed,
    )
    san1 = _sanitize_llm_output(raw1)
    finish1 = (meta1.get("finish_reason") or "").lower()
    looks_truncated_1 = (finish1 == "length")

    if not looks_truncated_1:
        return san1, {
            "proposal_origin": "primary",
            "proposal_sanitized_final": san1,
            "llm_text_raw_primary": raw1,
            "llm_text_raw_retry": None,
            "primary": meta1,
            "retry": None,
            "mode_key": mode_key,
        }

    raw2 = None
    meta2 = None
    san2 = None
    best_san = san1
    origin = "primary"

    if retry_on_length and retry_max_tokens > max_tokens:
        raw2, meta2 = _llm_chat_completion(
            client=client,
            model=model,
            messages=[{"role": "system", "content": system}, {"role": "user", "content": user}],
            temperature=0.0,
            max_tokens=int(retry_max_tokens),
            seed=seed,
        )
        san2 = _sanitize_llm_output(raw2)
        if san2 and len(san2) >= len(best_san):
            best_san = san2
            origin = "retry"

    return best_san, {
        "proposal_origin": origin,
        "proposal_sanitized_final": best_san,
        "llm_text_raw_primary": raw1,
        "llm_text_raw_retry": raw2,
        "primary": meta1,
        "retry": meta2,
        "mode_key": mode_key,
    }


# ========= IO =========
def make_working_copy(input_jsonl: str, output_jsonl: str, *, overwrite: bool = False) -> str:
    src = Path(input_jsonl)
    dst = Path(output_jsonl)

    if not src.exists():
        raise FileNotFoundError(f"File not found: {src}")

    if dst.exists() and not overwrite:
        return str(dst)

    dst.parent.mkdir(parents=True, exist_ok=True)
    shutil.copy2(src, dst)
    return str(dst)


def _normalize_cmd(raw: str) -> str:
    c = (raw or "").strip().lower()
    if c in ("", "y", "yes", "ok", "okay", "si", "sì"):
        return "y"
    if c in ("r", "retry", "again", "prova", "prova ancora", "rigenera"):
        return "r"
    if c in ("e", "edit", "modifica"):
        return "e"
    if c in ("m", "manual", "mine", "mio", "mia", "custom"):
        return "m"
    if c in ("s", "skip", "salta", "pass"):
        return "s"
    if c in ("q", "quit", "exit", "esci"):
        return "q"
    return c


# ========= Main interactive =========
def interactive_llm_tools_in_jsonl(
    jsonl_path: str,
    *,
    tool_field: str,
    create_backup_of_target: bool,
    llm_model: str,
    seed: Optional[int],
    max_tokens: int,
    retry_on_length: bool,
    retry_max_tokens: int,
    allow_reserialize_fallback: bool,
    min_sleep_sec_between_calls: float,
    audit_dir: str,
    mode_key: str,
) -> None:
    mode_key, style_spec = _resolve_style(mode_key)

    path = Path(jsonl_path)
    if not path.exists():
        raise FileNotFoundError(f"File not found: {jsonl_path}")

    client = make_gemini_client()
    audit_file = _audit_file_path(
        path,
        audit_dir=Path(audit_dir),
        mode_key=mode_key,
        model=llm_model,
        tool_field=tool_field,
    )

    decisions_by_instance, regen_counts, last_rejected_text_by_instance, prior_run_start = _load_resume_state(audit_file)

    tool_order: List[Dict[str, Any]] = []
    with path.open("r", encoding="utf-8") as f:
        for raw_line in f:
            line = raw_line.rstrip("\n")
            if not line.strip():
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                continue
            if not isinstance(record, dict):
                continue

            rid = _record_id(record, tool_field=tool_field)
            tools = record.get(tool_field)
            if not isinstance(tools, list):
                continue

            for tool_index, entry in enumerate(tools):
                tool_obj, kind = _load_tool(entry)
                if not tool_obj:
                    continue
                name = (tool_obj.get("name") or "").strip()
                if not name:
                    continue

                desc_print, desc_mode = _get_description_for_print(entry)
                instance_key = _tool_instance_key(rid, tool_index, tool_obj)

                tool_order.append(
                    {
                        "record_id": rid,
                        "tool_index": tool_index,
                        "tool_name": name,
                        "desc_print": desc_print,
                        "desc_mode": desc_mode,
                        "instance_key": instance_key,
                        "entry_kind": kind,
                    }
                )

    n_total = len(tool_order)
    n_prev_reviewed = len(decisions_by_instance)

    start_pos = 0
    while start_pos < n_total and tool_order[start_pos]["instance_key"] in decisions_by_instance:
        start_pos += 1

    session_id = hashlib.sha256(f"{time.time_ns()}".encode("utf-8")).hexdigest()[:12]
    before_sha = _sha256_file(path)

    if prior_run_start is None:
        _append_audit_event(
            audit_file,
            {
                "event_type": "run_start",
                "ts": int(time.time()),
                "session_id": session_id,
                "mode": mode_key,
                "model": llm_model,
                "seed": seed,
                "dataset_path": str(path),
                "dataset_sha256_at_session_start": before_sha,
                "tool_field": tool_field,
                "n_total_occurrences": n_total,
                "max_tokens_requested": int(max_tokens),
                "retry_on_length": bool(retry_on_length),
                "retry_max_tokens": int(retry_max_tokens),
                "allow_reserialize_fallback": bool(allow_reserialize_fallback),
            },
        )
    else:
        _append_audit_event(
            audit_file,
            {
                "event_type": "run_resume",
                "ts": int(time.time()),
                "session_id": session_id,
                "mode": mode_key,
                "model": llm_model,
                "seed": seed,
                "dataset_path": str(path),
                "dataset_sha256_at_session_start": before_sha,
                "tool_field": tool_field,
                "n_total_occurrences": n_total,
                "n_previously_reviewed": n_prev_reviewed,
                "resume_from_index_1based": (start_pos + 1) if start_pos < n_total else (n_total + 1),
            },
        )

    print(f"Target: {path}")
    print(f"Mode: {mode_key}")
    print(f"Audit file (RESUMABLE): {audit_file}")
    print(f"Tool occurrences total: {n_total}")
    if start_pos < n_total:
        print(f"Resume position: [{start_pos + 1}/{n_total}] (previously reviewed: {n_prev_reviewed})")
    else:
        print(f"Resume position: completed (previously reviewed: {n_prev_reviewed})")
    print(f"LLM: {llm_model} @ {GEMINI_BASE_URL}")
    print(f"Max tokens: {int(max_tokens)}; retry_on_length={bool(retry_on_length)}; retry_max_tokens={int(retry_max_tokens)}")
    print("Commands: ENTER/ok=accept, r=regenerate, e=edit, m=manual, s=skip, q=quit\n")

    quit_requested = False
    resume_next_index_1based: Optional[int] = None

    max_prev = int(style_spec.get("max_prev_rewrite_chars") or DEFAULT_MAX_PREV_REWRITE_CHARS)

    for pos in range(start_pos, n_total):
        item = tool_order[pos]
        idx = pos + 1

        name = item["tool_name"]
        desc_mode = item["desc_mode"]
        old_desc_print = item["desc_print"]
        instance_key = item["instance_key"]
        rid = item["record_id"]
        tool_i = item["tool_index"]

        regen_index_local = int(regen_counts.get(instance_key, 0))
        previous_rewrite_local: Optional[str] = last_rejected_text_by_instance.get(instance_key)

        print("=" * 80)
        print(f"[{idx}/{n_total}] {name}")
        print(f"instance_key: {instance_key} (record_id={rid}, tool_index={tool_i})")

        if desc_mode == "raw_json":
            print("Current description RAW (escaped):")
            print(old_desc_print if old_desc_print else "(empty)")
            base_desc = _decode_raw_json_string(old_desc_print) if old_desc_print else ""
            print("\nCurrent description DECODED:")
            print(base_desc if base_desc else "(empty)")
        else:
            base_desc = old_desc_print or ""
            print("Current description:")
            print(base_desc if base_desc else "(empty)")

        proposal = ""
        llm_bundle: Optional[Dict[str, Any]] = None

        while True:
            if not proposal:
                try:
                    proposal, llm_bundle = generate_description_via_llm(
                        client=client,
                        tool_name=name,
                        base_description=base_desc,
                        model=llm_model,
                        seed=seed,
                        max_tokens=max_tokens,
                        retry_on_length=retry_on_length,
                        retry_max_tokens=retry_max_tokens,
                        mode_key=mode_key,
                        style_spec=style_spec,
                        regen_index=regen_index_local,
                        previous_rewrite=previous_rewrite_local,
                    )
                except Exception as e:
                    print(f"\nLLM ERROR: {e}")
                    cmd = _normalize_cmd(input("Choice [m=manual, e=edit, s=skip, q=quit] > "))
                    now = int(time.time())

                    if cmd == "q":
                        quit_requested = True
                        resume_next_index_1based = idx
                        break

                    if cmd == "s":
                        decisions_by_instance[instance_key] = ("skipped", None, None)
                        _append_audit_event(
                            audit_file,
                            {
                                "event_type": "decision",
                                "ts": now,
                                "session_id": session_id,
                                "status": "skipped",
                                "tool_name": name,
                                "instance_key": instance_key,
                                "record_id": rid,
                                "tool_index": tool_i,
                                "model": llm_model,
                                "seed": seed,
                                "mode": mode_key,
                                "base_description": base_desc,
                                "final_description": None,
                                "source": "user",
                                "note": "skip_after_llm_error",
                            },
                        )
                        break

                    if cmd in ("m", "e"):
                        manual_final = input("Manual final description (empty cancels) > ").rstrip("\n").strip()
                        status = "manual" if (cmd == "m" and manual_final) else ("edited" if (cmd == "e" and manual_final) else "skipped")
                        decisions_by_instance[instance_key] = (status, manual_final or None, None)
                        _append_audit_event(
                            audit_file,
                            {
                                "event_type": "decision",
                                "ts": now,
                                "session_id": session_id,
                                "status": status,
                                "tool_name": name,
                                "instance_key": instance_key,
                                "record_id": rid,
                                "tool_index": tool_i,
                                "model": llm_model,
                                "seed": seed,
                                "mode": mode_key,
                                "base_description": base_desc,
                                "final_description": manual_final or None,
                                "source": "user",
                                "note": "manual_or_edit_after_llm_error",
                            },
                        )
                        break

                    proposal = ""
                    continue

                proposal = (proposal or "").strip()

            print("\nLLM proposal:")
            print(proposal if proposal else "(empty)")

            cmd = _normalize_cmd(input("\nChoice [ENTER=accept, r=regen, e=edit, m=manual, s=skip, q=quit] > "))
            now = int(time.time())

            if cmd == "y":
                if proposal.strip():
                    decisions_by_instance[instance_key] = ("accepted", proposal.strip(), llm_bundle)
                    _append_audit_event(
                        audit_file,
                        {
                            "event_type": "decision",
                            "ts": now,
                            "session_id": session_id,
                            "status": "accepted",
                            "tool_name": name,
                            "instance_key": instance_key,
                            "record_id": rid,
                            "tool_index": tool_i,
                            "model": llm_model,
                            "seed": seed,
                            "mode": mode_key,
                            "base_description": base_desc,
                            "final_description": proposal.strip(),
                            "source": "llm",
                            "llm_bundle": llm_bundle,
                        },
                    )
                else:
                    decisions_by_instance[instance_key] = ("skipped", None, llm_bundle)
                    _append_audit_event(
                        audit_file,
                        {
                            "event_type": "decision",
                            "ts": now,
                            "session_id": session_id,
                            "status": "skipped",
                            "tool_name": name,
                            "instance_key": instance_key,
                            "record_id": rid,
                            "tool_index": tool_i,
                            "model": llm_model,
                            "seed": seed,
                            "mode": mode_key,
                            "base_description": base_desc,
                            "final_description": None,
                            "source": "llm",
                            "note": "empty_proposal",
                            "llm_bundle": llm_bundle,
                        },
                    )
                break

            if cmd == "r":
                previous_rewrite_local = proposal.strip() if proposal else None
                if previous_rewrite_local and len(previous_rewrite_local) > max_prev:
                    previous_rewrite_local = previous_rewrite_local[:max_prev].rstrip()

                regen_counts[instance_key] = regen_counts.get(instance_key, 0) + 1
                regen_index_local = int(regen_counts[instance_key])
                last_rejected_text_by_instance[instance_key] = previous_rewrite_local

                _append_audit_event(
                    audit_file,
                    {
                        "event_type": "regenerate",
                        "ts": now,
                        "session_id": session_id,
                        "tool_name": name,
                        "instance_key": instance_key,
                        "record_id": rid,
                        "tool_index": tool_i,
                        "mode": mode_key,
                        "regen_index": regen_index_local,
                        "last_proposal_sha256": _sha256_text(proposal),
                        "last_proposal_text": previous_rewrite_local,
                        "last_proposal_origin": (llm_bundle or {}).get("proposal_origin") if llm_bundle else None,
                    },
                )

                proposal = ""
                llm_bundle = None
                if min_sleep_sec_between_calls > 0:
                    time.sleep(min_sleep_sec_between_calls)
                continue

            if cmd == "e":
                edited = input("Edit proposal (empty cancels) > ").rstrip("\n").strip()
                status = "edited" if edited else "skipped"
                decisions_by_instance[instance_key] = (status, edited or None, llm_bundle)
                _append_audit_event(
                    audit_file,
                    {
                        "event_type": "decision",
                        "ts": now,
                        "session_id": session_id,
                        "status": status,
                        "tool_name": name,
                        "instance_key": instance_key,
                        "record_id": rid,
                        "tool_index": tool_i,
                        "model": llm_model,
                        "seed": seed,
                        "mode": mode_key,
                        "base_description": base_desc,
                        "final_description": edited or None,
                        "source": "user",
                        "note": "edit_proposal",
                        "llm_bundle": llm_bundle,
                    },
                )
                break

            if cmd == "m":
                manual_final = input("Manual final description (empty cancels) > ").rstrip("\n").strip()
                status = "manual" if manual_final else "skipped"
                decisions_by_instance[instance_key] = (status, manual_final or None, llm_bundle)
                _append_audit_event(
                    audit_file,
                    {
                        "event_type": "decision",
                        "ts": now,
                        "session_id": session_id,
                        "status": status,
                        "tool_name": name,
                        "instance_key": instance_key,
                        "record_id": rid,
                        "tool_index": tool_i,
                        "model": llm_model,
                        "seed": seed,
                        "mode": mode_key,
                        "base_description": base_desc,
                        "final_description": manual_final or None,
                        "source": "user",
                        "note": "manual_replace",
                        "llm_bundle": llm_bundle,
                    },
                )
                break

            if cmd == "s":
                decisions_by_instance[instance_key] = ("skipped", None, llm_bundle)
                _append_audit_event(
                    audit_file,
                    {
                        "event_type": "decision",
                        "ts": now,
                        "session_id": session_id,
                        "status": "skipped",
                        "tool_name": name,
                        "instance_key": instance_key,
                        "record_id": rid,
                        "tool_index": tool_i,
                        "model": llm_model,
                        "seed": seed,
                        "mode": mode_key,
                        "base_description": base_desc,
                        "final_description": None,
                        "source": "user",
                        "note": "skip",
                        "llm_bundle": llm_bundle,
                    },
                )
                break

            if cmd == "q":
                quit_requested = True
                resume_next_index_1based = idx
                break

            print("Invalid command.")

        if quit_requested:
            break

    # ========= Apply decisions to file =========
    tmp_path = path.with_suffix(path.suffix + ".tmp")
    updated_count = 0
    patch_failures = 0

    with path.open("r", encoding="utf-8") as fin, tmp_path.open("w", encoding="utf-8") as fout:
        for raw_line in fin:
            line = raw_line.rstrip("\n")
            if not line.strip():
                fout.write(line + "\n")
                continue

            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                fout.write(line + "\n")
                continue

            if not isinstance(record, dict):
                fout.write(json.dumps(record, ensure_ascii=False) + "\n")
                continue

            rid = _record_id(record, tool_field=tool_field)
            tools = record.get(tool_field)

            if isinstance(tools, list):
                new_tools: List[Any] = []
                for tool_index, entry in enumerate(tools):
                    tool_obj, kind = _load_tool(entry)
                    if not tool_obj:
                        new_tools.append(entry)
                        continue

                    instance_key = _tool_instance_key(rid, tool_index, tool_obj)
                    decision = decisions_by_instance.get(instance_key)

                    if decision is None:
                        new_tools.append(entry)
                        continue

                    status, new_desc, llm_bundle = decision
                    if status in ("accepted", "edited", "manual") and new_desc:
                        if kind == "json_str" and isinstance(entry, str):
                            already_ok = False
                            try:
                                obj0 = json.loads(entry)
                                if isinstance(obj0, dict) and obj0.get("description") == new_desc:
                                    already_ok = True
                            except Exception:
                                already_ok = False

                            if already_ok:
                                new_tools.append(entry)
                                continue

                            patched, ok, reason = _replace_top_level_string_field_in_raw_object(entry, "description", new_desc)
                            if ok:
                                new_tools.append(patched)
                                updated_count += 1
                            else:
                                fallback_ok = False
                                fallback_patched = entry
                                if allow_reserialize_fallback:
                                    try:
                                        obj = json.loads(entry)
                                        if isinstance(obj, dict):
                                            obj["description"] = new_desc
                                            fallback_patched = json.dumps(obj, ensure_ascii=False)
                                            fallback_ok = True
                                    except Exception:
                                        fallback_ok = False

                                if fallback_ok:
                                    new_tools.append(fallback_patched)
                                    updated_count += 1
                                    _append_audit_event(
                                        audit_file,
                                        {
                                            "event_type": "patch_fallback_reserialize",
                                            "ts": int(time.time()),
                                            "session_id": session_id,
                                            "instance_key": instance_key,
                                            "record_id": rid,
                                            "tool_index": tool_index,
                                            "tool_name": tool_obj.get("name"),
                                            "mode": mode_key,
                                            "entry_sha256_before": _sha256_text(entry),
                                            "entry_sha256_after": _sha256_text(fallback_patched),
                                            "patch_reason": reason,
                                        },
                                    )
                                else:
                                    new_tools.append(entry)
                                    patch_failures += 1
                        else:
                            if tool_obj.get("description") == new_desc:
                                new_tools.append(tool_obj)
                                continue
                            tool_obj["description"] = new_desc
                            new_tools.append(tool_obj)
                            updated_count += 1
                    else:
                        new_tools.append(entry)

                record[tool_field] = new_tools

            fout.write(json.dumps(record, ensure_ascii=False) + "\n")

    if create_backup_of_target:
        bak_path = path.with_suffix(path.suffix + ".bak")
        if not bak_path.exists():
            shutil.copy2(path, bak_path)

    tmp_path.replace(path)
    after_sha = _sha256_file(path)

    n_reviewed = len(decisions_by_instance)
    n_skipped = sum(1 for st, _, _ in decisions_by_instance.values() if st == "skipped")
    completed = (n_reviewed >= n_total) and (not quit_requested)

    _append_audit_event(
        audit_file,
        {
            "event_type": "run_end",
            "ts": int(time.time()),
            "session_id": session_id,
            "mode": mode_key,
            "model": llm_model,
            "seed": seed,
            "dataset_path": str(path),
            "dataset_sha256_at_session_start": before_sha,
            "dataset_sha256_at_session_end": after_sha,
            "n_total_occurrences": n_total,
            "n_reviewed_total": n_reviewed,
            "n_updated_this_session": updated_count,
            "n_skipped_total": n_skipped,
            "completed": bool(completed),
            "quit_requested": bool(quit_requested),
            "raw_patch_failures_this_session": patch_failures,
            "resume_next_index_1based": resume_next_index_1based if quit_requested else (n_total + 1 if completed else None),
        },
    )

    print("\nChanges applied.")
    print(f"Mode: {mode_key}")
    print(f"Descriptions updated (this session): {updated_count}")
    if patch_failures:
        print(f"Raw JSON-string patch failures (left unchanged): {patch_failures}")
    print(f"Reviewed total (from audit): {n_reviewed} / {n_total}")
    print(f"Completed: {completed} (quit_requested={quit_requested})")
    if quit_requested and resume_next_index_1based is not None:
        print(f"Resume next time from: [{resume_next_index_1based}/{n_total}]")
    print(f"Updated file: {path}")
    print(f"Audit file (same on resume): {audit_file}")


def _derive_working_copy_path(input_path: str, mode_key: str) -> str:
    """
    Ensures per-style dataset separation (so you can interrupt/resume per style without collisions).
    Example:
      input:  foo.jsonl
      mode:   style_verbose
      output: foo.WORKING_COPY.style_verbose.jsonl
    """
    p = Path(input_path)
    return str(p.with_name(f"{p.stem}.WORKING_COPY.{mode_key}{p.suffix}"))


if __name__ == "__main__":
    # ----- Inputs -----
    INPUT_JSONL = os.environ.get("INPUT_JSONL") or "When2Call/data/test/when2call_test_llm_judge.jsonl"
    MODE_KEY = os.environ.get("MODE_KEY") or "style_concise"   # style_verbose | style_concise (aliases: style_coicnoso, style_coinceise)
    LLM_MODEL = os.environ.get("LLM_MODEL") or LLM_MODEL_DEFAULT

    mode_key_resolved, _ = _resolve_style(MODE_KEY)

    # Per-style working copy (separate dataset per style, by default)
    OUTPUT_JSONL = os.environ.get("OUTPUT_JSONL") or _derive_working_copy_path(INPUT_JSONL, mode_key_resolved)

    working = make_working_copy(INPUT_JSONL, OUTPUT_JSONL, overwrite=False)
    print(f"Working copy: {working}")

    # ----- Runtime knobs -----
    seed_env = os.environ.get("GEMINI_SEED")
    seed_val: Optional[int] = int(seed_env.strip()) if (seed_env and seed_env.strip()) else None

    max_tokens_env = os.environ.get("GEMINI_MAX_TOKENS")
    max_tokens_val = int(max_tokens_env.strip()) if (max_tokens_env and max_tokens_env.strip()) else DEFAULT_MAX_TOKENS

    retry_max_tokens_env = os.environ.get("GEMINI_RETRY_MAX_TOKENS")
    retry_max_tokens_val = int(retry_max_tokens_env.strip()) if (retry_max_tokens_env and retry_max_tokens_env.strip()) else RETRY_MAX_TOKENS

    allow_reserialize_env = os.environ.get("ALLOW_RESERIALIZE_FALLBACK")
    allow_reserialize_val = (
        bool(int(allow_reserialize_env.strip()))
        if (allow_reserialize_env and allow_reserialize_env.strip())
        else DEFAULT_ALLOW_RESERIALIZE_FALLBACK
    )

    interactive_llm_tools_in_jsonl(
        working,
        tool_field="tools",
        create_backup_of_target=False,
        llm_model=LLM_MODEL,
        seed=seed_val,
        max_tokens=max_tokens_val,
        retry_on_length=RETRY_ON_LENGTH,
        retry_max_tokens=retry_max_tokens_val,
        allow_reserialize_fallback=allow_reserialize_val,
        min_sleep_sec_between_calls=0.0,
        audit_dir="audit",
        mode_key=mode_key_resolved,
    )


Working copy: When2Call/data/test/when2call_test_llm_judge.WORKING_COPY.style_concise.jsonl
Target: When2Call/data/test/when2call_test_llm_judge.WORKING_COPY.style_concise.jsonl
Mode: style_concise
Audit file (RESUMABLE): audit/7805585800e6/when2call_test_llm_judge.WORKING_COPY.style_concise.7805585800e6.style_concise.gemini-2.5-flash.audit.jsonl
Tool occurrences total: 978
Resume position: [3/978] (previously reviewed: 2)
LLM: gemini-2.5-flash @ https://generativelanguage.googleapis.com/v1beta/openai/
Max tokens: 512; retry_on_length=True; retry_max_tokens=1024
Commands: ENTER/ok=accept, r=regenerate, e=edit, m=manual, s=skip, q=quit

[3/978] Buses_3_FindBus
instance_key: rec:64cbc7e8819e45258b49e186164c9fad:t0:9eb0bebb83fe4602fcb0390818dc2b64 (record_id=64cbc7e8819e45258b49e186164c9fad, tool_index=0)
Current description RAW (escaped):
"Search for a bus itinerary between two cities on a specific date."

Current description DECODED:
Search for a bus itinerary between two cities on a sp

In [None]:
#!/usr/bin/env python3  28 dicembre
import json
import shutil
import os
import time
import hashlib
from pathlib import Path
from typing import Dict, Any, Optional, Tuple, List

from openai import OpenAI


# ========= Config =========
GEMINI_BASE_URL = "https://generativelanguage.googleapis.com/v1beta/openai/"
LLM_MODEL = "gemini-2.5-flash"
MODE_KEY = "style_verbose"

HASH_HEX_LEN = 32

DEFAULT_MAX_TOKENS = 512
RETRY_ON_LENGTH = True
RETRY_MAX_TOKENS = 1024

DEFAULT_ALLOW_RESERIALIZE_FALLBACK = False

REGEN_DIVERSITY_INSTRUCTION = (
    "Return a meaning-equivalent rewrite that is lexically different from your previous rewrite; "
    "avoid repeating the same sentence structure."
)

# How much of the rejected previous rewrite to store in audit (for resume) and to feed back into prompt.
MAX_PREV_REWRITE_CHARS = 800


# ========= Client =========
def make_gemini_client() -> OpenAI:
    token = os.environ.get("TOKEN_GEMINI")
    if not token:
        raise RuntimeError("TOKEN_GEMINI environment variable is not set.")
    return OpenAI(api_key=token, base_url=GEMINI_BASE_URL)


# ========= Small utils =========
def _json_safe(obj: Any) -> Any:
    if obj is None or isinstance(obj, (str, int, float, bool)):
        return obj
    if isinstance(obj, dict):
        return {str(k): _json_safe(v) for k, v in obj.items()}
    if isinstance(obj, (list, tuple)):
        return [_json_safe(x) for x in obj]
    if hasattr(obj, "model_dump") and callable(getattr(obj, "model_dump")):
        try:
            return _json_safe(obj.model_dump())
        except Exception:
            pass
    if hasattr(obj, "dict") and callable(getattr(obj, "dict")):
        try:
            return _json_safe(obj.dict())
        except Exception:
            pass
    if hasattr(obj, "__dict__"):
        try:
            return _json_safe(vars(obj))
        except Exception:
            pass
    try:
        return str(obj)
    except Exception:
        return None


def _sha256_text(s: str) -> str:
    return hashlib.sha256((s or "").encode("utf-8")).hexdigest()


def _canonical_json(obj: Any) -> str:
    return json.dumps(obj, ensure_ascii=False, sort_keys=True, separators=(",", ":"))


def _sha256_file(path: Path) -> str:
    h = hashlib.sha256()
    with path.open("rb") as f:
        for chunk in iter(lambda: f.read(1024 * 1024), b""):
            h.update(chunk)
    return h.hexdigest()


# ========= Raw JSON-string patcher (for tools stored as JSON strings) =========
def _extract_json_string_value(raw_json: str, key: str) -> Optional[str]:
    token = f'"{key}"'
    i = raw_json.find(token)
    if i < 0:
        return None
    i = raw_json.find(":", i + len(token))
    if i < 0:
        return None
    i += 1
    n = len(raw_json)
    while i < n and raw_json[i] in " \t\r\n":
        i += 1
    if i >= n or raw_json[i] != '"':
        return None
    start = i
    i += 1
    esc = False
    while i < n:
        c = raw_json[i]
        if esc:
            esc = False
        else:
            if c == "\\":
                esc = True
            elif c == '"':
                return raw_json[start : i + 1]
        i += 1
    return None


def _decode_raw_json_string(raw_json_string_with_quotes: str) -> str:
    try:
        obj = json.loads('{"description":' + raw_json_string_with_quotes + "}")
        return obj.get("description") or ""
    except json.JSONDecodeError:
        return ""


def _get_description_for_print(entry: Any) -> Tuple[str, str]:
    if isinstance(entry, str):
        raw = _extract_json_string_value(entry, "description")
        if raw is not None:
            return raw, "raw_json"
        try:
            obj = json.loads(entry)
            return obj.get("description") or "", "rendered"
        except json.JSONDecodeError:
            return "", "rendered"
    if isinstance(entry, dict):
        return entry.get("description") or "", "rendered"
    return "", "rendered"


def _load_tool(entry: Any) -> Tuple[Optional[Dict[str, Any]], str]:
    if isinstance(entry, str):
        try:
            return json.loads(entry), "json_str"
        except json.JSONDecodeError:
            return None, "other"
    if isinstance(entry, dict):
        return entry, "dict"
    return None, "other"


def _skip_ws(s: str, i: int) -> int:
    n = len(s)
    while i < n and s[i] in " \t\r\n":
        i += 1
    return i


def _scan_string_span(s: str, i: int) -> Optional[Tuple[int, int]]:
    n = len(s)
    if i >= n or s[i] != '"':
        return None
    j = i + 1
    esc = False
    while j < n:
        c = s[j]
        if esc:
            esc = False
        else:
            if c == "\\":
                esc = True
            elif c == '"':
                return (i, j + 1)
        j += 1
    return None


def _scan_number_span(s: str, i: int) -> Optional[Tuple[int, int]]:
    n = len(s)
    j = i
    if j < n and s[j] == "-":
        j += 1
    if j >= n:
        return None
    if s[j] == "0":
        j += 1
    elif s[j].isdigit():
        while j < n and s[j].isdigit():
            j += 1
    else:
        return None
    if j < n and s[j] == ".":
        j += 1
        if j >= n or not s[j].isdigit():
            return None
        while j < n and s[j].isdigit():
            j += 1
    if j < n and s[j] in "eE":
        j += 1
        if j < n and s[j] in "+-":
            j += 1
        if j >= n or not s[j].isdigit():
            return None
        while j < n and s[j].isdigit():
            j += 1
    return (i, j)


def _scan_literal_span(s: str, i: int) -> Optional[Tuple[int, int]]:
    for lit in ("true", "false", "null"):
        if s.startswith(lit, i):
            return (i, i + len(lit))
    return None


def _scan_container_span(s: str, i: int) -> Optional[Tuple[int, int]]:
    n = len(s)
    if i >= n:
        return None

    opener = s[i]
    if opener not in "{[":
        return None

    stack: List[str] = ["}" if opener == "{" else "]"]
    j = i + 1
    in_str = False
    esc = False

    while j < n:
        c = s[j]

        if in_str:
            if esc:
                esc = False
            else:
                if c == "\\":
                    esc = True
                elif c == '"':
                    in_str = False
            j += 1
            continue

        if c == '"':
            in_str = True
            j += 1
            continue

        if c == "{":
            stack.append("}")
            j += 1
            continue
        if c == "[":
            stack.append("]")
            j += 1
            continue

        if c in "}]":
            if not stack:
                return None
            expected = stack[-1]
            if c != expected:
                return None
            stack.pop()
            j += 1
            if not stack:
                return (i, j)
            continue

        j += 1

    return None


def _is_value_delim(c: str) -> bool:
    return c in ",}]"


def _scan_value_span(s: str, i: int) -> Optional[Tuple[int, int]]:
    n = len(s)
    i = _skip_ws(s, i)
    if i >= n:
        return None

    c = s[i]
    if c == '"':
        return _scan_string_span(s, i)
    if c in "{[":
        return _scan_container_span(s, i)

    span: Optional[Tuple[int, int]]
    if c == "-" or c.isdigit():
        span = _scan_number_span(s, i)
    else:
        span = _scan_literal_span(s, i)

    if not span:
        return None

    _, end = span
    k = _skip_ws(s, end)
    if k >= n:
        return span
    if _is_value_delim(s[k]):
        return span
    return None


def _replace_top_level_string_field_in_raw_object(raw_json_obj: str, key: str, new_value: str) -> Tuple[str, bool, str]:
    s = raw_json_obj
    n = len(s)

    i = _skip_ws(s, 0)
    if i >= n or s[i] != "{":
        return raw_json_obj, False, "not_object"

    i += 1
    found_any_key = False
    expect_key = True

    while True:
        i = _skip_ws(s, i)
        if i >= n:
            return raw_json_obj, False, "cannot_scan_value"

        if expect_key:
            if s[i] == "}":
                return raw_json_obj, False, "key_not_found"
            if s[i] != '"':
                return raw_json_obj, False, "invalid_key_string"

            key_span = _scan_string_span(s, i)
            if not key_span:
                return raw_json_obj, False, "invalid_key_string"

            found_any_key = True
            k_start, k_end = key_span
            try:
                key_decoded = json.loads(s[k_start:k_end])
            except Exception:
                return raw_json_obj, False, "invalid_key_string"

            i = _skip_ws(s, k_end)
            if i >= n or s[i] != ":":
                return raw_json_obj, False, "missing_colon"

            v_span = _scan_value_span(s, i + 1)
            if not v_span:
                return raw_json_obj, False, "cannot_scan_value"

            v_start, v_end = v_span

            if key_decoded == key:
                if v_start >= n or s[v_start] != '"':
                    return raw_json_obj, False, "value_not_string"

                replacement_literal = json.dumps(new_value, ensure_ascii=False)
                patched = s[:v_start] + replacement_literal + s[v_end:]

                try:
                    obj = json.loads(patched)
                except Exception:
                    return raw_json_obj, False, "json_load_failed_after_patch"

                if isinstance(obj, dict) and obj.get(key) == new_value:
                    return patched, True, "ok"
                return raw_json_obj, False, "validation_failed_after_patch"

            i = v_end
            expect_key = False
            continue

        i = _skip_ws(s, i)
        if i >= n:
            return raw_json_obj, False, "cannot_scan_value"

        if s[i] == ",":
            i += 1
            expect_key = True
            continue
        if s[i] == "}":
            return raw_json_obj, False, ("key_not_found" if found_any_key else "key_not_found")
        return raw_json_obj, False, "cannot_scan_value"


# ========= IDs =========
def _tool_fingerprint_excluding_description(tool_obj: Dict[str, Any]) -> str:
    filtered = {k: v for k, v in tool_obj.items() if k != "description"}
    payload = _canonical_json(filtered)
    return hashlib.sha256(payload.encode("utf-8")).hexdigest()[:HASH_HEX_LEN]


def _record_id(record_obj: Dict[str, Any], tool_field: str) -> str:
    rec = dict(record_obj)
    tools = rec.get(tool_field)
    if isinstance(tools, list):
        canon_tools: List[Any] = []
        for entry in tools:
            tool_obj, kind = _load_tool(entry)
            if tool_obj is None:
                canon_tools.append({"_unparsed": entry, "_kind": kind})
            else:
                canon_tools.append({k: v for k, v in tool_obj.items() if k != "description"})
        rec[tool_field] = canon_tools
    payload = _canonical_json(rec)
    return hashlib.sha256(payload.encode("utf-8")).hexdigest()[:HASH_HEX_LEN]


def _tool_instance_key(record_id: str, tool_index: int, tool_obj: Dict[str, Any]) -> str:
    fp = _tool_fingerprint_excluding_description(tool_obj)
    return f"rec:{record_id}:t{tool_index}:{fp}"


# ========= Audit (single file, resumable) =========
def _audit_identity(dataset_path: Path, *, mode_key: str, model: str, tool_field: str) -> str:
    stable = f"{dataset_path.resolve()}|{mode_key}|{model}|{tool_field}"
    return hashlib.sha256(stable.encode("utf-8")).hexdigest()[:12]


def _audit_file_path(
    dataset_path: Path,
    *,
    audit_dir: Path,
    mode_key: str,
    model: str,
    tool_field: str,
) -> Path:
    audit_key = _audit_identity(dataset_path, mode_key=mode_key, model=model, tool_field=tool_field)
    safe_model = "".join(ch if ch.isalnum() or ch in ("-", "_", ".") else "_" for ch in model)
    out_dir = audit_dir / audit_key
    filename = f"{dataset_path.stem}.{audit_key}.{mode_key}.{safe_model}.audit.jsonl"
    return out_dir / filename


def _append_audit_event(audit_file: Path, event: Dict[str, Any]) -> None:
    audit_file.parent.mkdir(parents=True, exist_ok=True)
    safe_event = _json_safe(event)
    with audit_file.open("a", encoding="utf-8") as f:
        f.write(json.dumps(safe_event, ensure_ascii=False) + "\n")


def _load_resume_state(
    audit_file: Path,
) -> Tuple[
    Dict[str, Tuple[str, Optional[str], Optional[Dict[str, Any]]]],
    Dict[str, int],
    Dict[str, Optional[str]],
    Optional[Dict[str, Any]],
]:
    """
    Returns:
      - decisions_by_instance: instance_key -> (status, final_description, llm_bundle)
      - regen_counts: instance_key -> max regen_index observed
      - last_rejected_text: instance_key -> last rejected proposal text (from regenerate events)
      - prior_run_start: first run_start event (if any)
    """
    decisions: Dict[str, Tuple[str, Optional[str], Optional[Dict[str, Any]]]] = {}
    regen_counts: Dict[str, int] = {}
    last_rejected_text: Dict[str, Optional[str]] = {}
    prior_run_start: Optional[Dict[str, Any]] = None

    if not audit_file.exists():
        return decisions, regen_counts, last_rejected_text, None

    # Track per-instance best regen index so we keep the latest text
    best_ri: Dict[str, int] = {}

    with audit_file.open("r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                ev = json.loads(line)
            except Exception:
                continue
            if not isinstance(ev, dict):
                continue

            et = ev.get("event_type")
            if et == "run_start" and prior_run_start is None:
                prior_run_start = ev

            if et == "regenerate":
                ik = ev.get("instance_key")
                ri = ev.get("regen_index")
                txt = ev.get("last_proposal_text")
                if isinstance(ik, str) and isinstance(ri, int) and ri >= 0:
                    prev = regen_counts.get(ik, 0)
                    if ri > prev:
                        regen_counts[ik] = ri
                    prev_best = best_ri.get(ik, -1)
                    if ri >= prev_best:
                        best_ri[ik] = ri
                        last_rejected_text[ik] = txt if isinstance(txt, str) else None

            if et == "decision":
                ik = ev.get("instance_key")
                status = ev.get("status")
                final_desc = ev.get("final_description")
                llm_bundle = ev.get("llm_bundle")
                if isinstance(ik, str) and isinstance(status, str):
                    decisions[ik] = (
                        status,
                        final_desc if isinstance(final_desc, str) else None,
                        llm_bundle if isinstance(llm_bundle, dict) else None,
                    )

    return decisions, regen_counts, last_rejected_text, prior_run_start


# ========= LLM helpers =========
def _ends_like_complete_sentence(text: str) -> bool:
    t = (text or "").strip()
    return bool(t) and t.endswith((".", "!", "?", "”", '"', "’", "'"))


def _sanitize_llm_output(text: str) -> str:
    t = (text or "").strip()
    if t.startswith("{") and "description" in t:
        try:
            obj = json.loads(t)
            if isinstance(obj, dict) and isinstance(obj.get("description"), str):
                t = obj["description"].strip()
        except Exception:
            pass
    if (t.startswith('"') and t.endswith('"')) or (t.startswith("'") and t.endswith("'")):
        t = t[1:-1].strip()
    return t


def _llm_chat_completion(
    *,
    client: OpenAI,
    model: str,
    messages: List[Dict[str, str]],
    temperature: float,
    max_tokens: int,
    seed: Optional[int],
) -> Tuple[str, Dict[str, Any]]:
    meta: Dict[str, Any] = {
        "seed_requested": seed,
        "seed_applied": False,
        "seed_error": None,
        "finish_reason": None,
        "usage": None,
        "max_tokens_requested": int(max_tokens),
        "max_param_used": None,
    }

    base_kwargs: Dict[str, Any] = dict(model=model, messages=messages, temperature=temperature)

    def attempt(max_param_used: str, include_seed: bool) -> Tuple[str, Dict[str, Any]]:
        req = dict(base_kwargs)
        if max_param_used == "max_completion_tokens":
            req["max_completion_tokens"] = int(max_tokens)
        else:
            req["max_tokens"] = int(max_tokens)
        if include_seed and seed is not None:
            req["seed"] = int(seed)

        resp = client.chat.completions.create(**req)
        text = (resp.choices[0].message.content or "").strip()

        meta_local = dict(meta)
        meta_local["max_param_used"] = max_param_used
        meta_local["finish_reason"] = getattr(resp.choices[0], "finish_reason", None)
        meta_local["usage"] = getattr(resp, "usage", None)
        meta_local["seed_applied"] = bool(include_seed and seed is not None)
        return text, meta_local

    def is_seed_error(e: Exception) -> bool:
        s = str(e).lower()
        return ("seed" in s) and ("unknown" in s or "unsupported" in s or "invalid" in s)

    try:
        return attempt("max_completion_tokens", include_seed=True)
    except Exception as e1:
        if seed is not None and is_seed_error(e1):
            meta["seed_error"] = str(e1)
            try:
                return attempt("max_completion_tokens", include_seed=False)
            except Exception:
                pass
        try:
            return attempt("max_tokens", include_seed=True)
        except Exception as e2:
            if seed is not None and is_seed_error(e2):
                meta["seed_error"] = str(e2)
                return attempt("max_tokens", include_seed=False)
            raise


def generate_verbose_description_via_llm(
    *,
    client: OpenAI,
    tool_name: str,
    base_description: str,
    model: str,
    seed: Optional[int],
    max_tokens: int,
    retry_on_length: bool,
    retry_max_tokens: int,
    regen_index: int = 0,
    previous_rewrite: Optional[str] = None,
) -> Tuple[str, Dict[str, Any]]:
    system = (
        "Rewrite tool descriptions.\n"
        "Hard constraints:\n"
        "- Preserve meaning exactly; do not add new capabilities, steps, motivations, benefits, or context.\n"
        "- Do not delete information present in the original description.\n"
        "- Do not introduce new parameter names, IDs, field names, flags, or implementation details.\n"
        "- If parameter/field names/IDs/flags already appear in the original description, keep them (do not remove them).\n"
        "- Do not add examples, normative language, or assumptions.\n"
        "- Keep the same subject (the tool) and the same scope.\n"
        "- Output only the rewritten description text, nothing else.\n"
        "- Style: verbose but controlled; keep it concise and complete (1–2 sentences), clear and direct.\n"
    )

    user_parts: List[str] = []
    user_parts.append(f"Tool name: {tool_name}")
    user_parts.append("Base description:")
    user_parts.append(base_description.strip() or "(empty)")
    user_parts.append("")
    user_parts.append("Rewrite in 'style_verbose' under the constraints.")

    if regen_index > 0:
        user_parts.append("")
        user_parts.append(f"Regeneration request: {regen_index}")
        user_parts.append(REGEN_DIVERSITY_INSTRUCTION)
        if previous_rewrite and previous_rewrite.strip():
            prev = previous_rewrite.strip()
            if len(prev) > MAX_PREV_REWRITE_CHARS:
                prev = prev[:MAX_PREV_REWRITE_CHARS].rstrip()
            user_parts.append("")
            user_parts.append("Previous rewrite (do not reuse wording):")
            user_parts.append(prev)

    user = "\n".join(user_parts)

    raw1, meta1 = _llm_chat_completion(
        client=client,
        model=model,
        messages=[{"role": "system", "content": system}, {"role": "user", "content": user}],
        temperature=0.0,
        max_tokens=max_tokens,
        seed=seed,
    )
    san1 = _sanitize_llm_output(raw1)
    finish1 = (meta1.get("finish_reason") or "").lower()
    looks_truncated_1 = (finish1 == "length") or (san1 and not _ends_like_complete_sentence(san1))

    if not looks_truncated_1:
        return san1, {
            "proposal_origin": "primary",
            "proposal_sanitized_final": san1,
            "llm_text_raw_primary": raw1,
            "llm_text_raw_retry": None,
            "primary": meta1,
            "retry": None,
        }

    raw2 = None
    meta2 = None
    san2 = None
    best_san = san1
    origin = "primary"

    if retry_on_length and retry_max_tokens > max_tokens:
        raw2, meta2 = _llm_chat_completion(
            client=client,
            model=model,
            messages=[{"role": "system", "content": system}, {"role": "user", "content": user}],
            temperature=0.0,
            max_tokens=int(retry_max_tokens),
            seed=seed,
        )
        san2 = _sanitize_llm_output(raw2)
        finish2 = (meta2.get("finish_reason") or "").lower()
        looks_truncated_2 = (finish2 == "length") or (san2 and not _ends_like_complete_sentence(san2))

        if san2 and len(san2) >= len(best_san):
            best_san = san2
            origin = "retry"

        if not looks_truncated_2 and san2:
            return san2, {
                "proposal_origin": "retry",
                "proposal_sanitized_final": san2,
                "llm_text_raw_primary": raw1,
                "llm_text_raw_retry": raw2,
                "primary": meta1,
                "retry": meta2,
            }

    return best_san, {
        "proposal_origin": origin,
        "proposal_sanitized_final": best_san,
        "llm_text_raw_primary": raw1,
        "llm_text_raw_retry": raw2,
        "primary": meta1,
        "retry": meta2,
    }


# ========= IO =========
def make_working_copy(input_jsonl: str, output_jsonl: str, *, overwrite: bool = False) -> str:
    src = Path(input_jsonl)
    dst = Path(output_jsonl)

    if not src.exists():
        raise FileNotFoundError(f"File not found: {src}")

    if dst.exists() and not overwrite:
        return str(dst)

    dst.parent.mkdir(parents=True, exist_ok=True)
    shutil.copy2(src, dst)
    return str(dst)


def _normalize_cmd(raw: str) -> str:
    c = (raw or "").strip().lower()
    if c in ("", "y", "yes", "ok", "okay", "si", "sì"):
        return "y"
    if c in ("r", "retry", "again", "prova", "prova ancora", "rigenera"):
        return "r"
    if c in ("e", "edit", "modifica"):
        return "e"
    if c in ("m", "manual", "mine", "mio", "mia", "custom"):
        return "m"
    if c in ("s", "skip", "salta", "pass"):
        return "s"
    if c in ("q", "quit", "exit", "esci"):
        return "q"
    return c


# ========= Main interactive =========
def interactive_llm_verbose_tools_in_jsonl(
    jsonl_path: str,
    *,
    tool_field: str,
    create_backup_of_target: bool,
    llm_model: str,
    seed: Optional[int],
    max_tokens: int,
    retry_on_length: bool,
    retry_max_tokens: int,
    allow_reserialize_fallback: bool,
    min_sleep_sec_between_calls: float,
    audit_dir: str,
) -> None:
    path = Path(jsonl_path)
    if not path.exists():
        raise FileNotFoundError(f"File not found: {jsonl_path}")

    client = make_gemini_client()
    audit_file = _audit_file_path(
        path,
        audit_dir=Path(audit_dir),
        mode_key=MODE_KEY,
        model=llm_model,
        tool_field=tool_field,
    )

    decisions_by_instance, regen_counts, last_rejected_text_by_instance, prior_run_start = _load_resume_state(audit_file)

    tool_order: List[Dict[str, Any]] = []
    with path.open("r", encoding="utf-8") as f:
        for raw_line in f:
            line = raw_line.rstrip("\n")
            if not line.strip():
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                continue
            if not isinstance(record, dict):
                continue

            rid = _record_id(record, tool_field=tool_field)
            tools = record.get(tool_field)
            if not isinstance(tools, list):
                continue

            for tool_index, entry in enumerate(tools):
                tool_obj, kind = _load_tool(entry)
                if not tool_obj:
                    continue
                name = (tool_obj.get("name") or "").strip()
                if not name:
                    continue

                desc_print, desc_mode = _get_description_for_print(entry)
                instance_key = _tool_instance_key(rid, tool_index, tool_obj)

                tool_order.append(
                    {
                        "record_id": rid,
                        "tool_index": tool_index,
                        "tool_name": name,
                        "desc_print": desc_print,
                        "desc_mode": desc_mode,
                        "instance_key": instance_key,
                        "entry_kind": kind,
                    }
                )

    n_total = len(tool_order)
    n_prev_reviewed = len(decisions_by_instance)

    start_pos = 0
    while start_pos < n_total and tool_order[start_pos]["instance_key"] in decisions_by_instance:
        start_pos += 1

    session_id = hashlib.sha256(f"{time.time_ns()}".encode("utf-8")).hexdigest()[:12]
    before_sha = _sha256_file(path)

    if prior_run_start is None:
        _append_audit_event(
            audit_file,
            {
                "event_type": "run_start",
                "ts": int(time.time()),
                "session_id": session_id,
                "mode": MODE_KEY,
                "model": llm_model,
                "seed": seed,
                "dataset_path": str(path),
                "dataset_sha256_at_session_start": before_sha,
                "tool_field": tool_field,
                "n_total_occurrences": n_total,
                "max_tokens_requested": int(max_tokens),
                "retry_on_length": bool(retry_on_length),
                "retry_max_tokens": int(retry_max_tokens),
                "allow_reserialize_fallback": bool(allow_reserialize_fallback),
            },
        )
    else:
        _append_audit_event(
            audit_file,
            {
                "event_type": "run_resume",
                "ts": int(time.time()),
                "session_id": session_id,
                "mode": MODE_KEY,
                "model": llm_model,
                "seed": seed,
                "dataset_path": str(path),
                "dataset_sha256_at_session_start": before_sha,
                "tool_field": tool_field,
                "n_total_occurrences": n_total,
                "n_previously_reviewed": n_prev_reviewed,
                "resume_from_index_1based": (start_pos + 1) if start_pos < n_total else (n_total + 1),
            },
        )

    print(f"Target: {path}")
    print(f"Audit file (RESUMABLE): {audit_file}")
    print(f"Tool occurrences total: {n_total}")
    if start_pos < n_total:
        print(f"Resume position: [{start_pos + 1}/{n_total}] (previously reviewed: {n_prev_reviewed})")
    else:
        print(f"Resume position: completed (previously reviewed: {n_prev_reviewed})")
    print(f"LLM: {llm_model} @ {GEMINI_BASE_URL}")
    print(f"Max tokens: {int(max_tokens)}; retry_on_length={bool(retry_on_length)}; retry_max_tokens={int(retry_max_tokens)}")
    print("Commands: ENTER/ok=accept, r=regenerate, e=edit, m=manual, s=skip, q=quit\n")

    quit_requested = False
    resume_next_index_1based: Optional[int] = None

    for pos in range(start_pos, n_total):
        item = tool_order[pos]
        idx = pos + 1

        name = item["tool_name"]
        desc_mode = item["desc_mode"]
        old_desc_print = item["desc_print"]
        instance_key = item["instance_key"]
        rid = item["record_id"]
        tool_i = item["tool_index"]

        # Per-instance regen state (resumable for regen_index; previous rejected text is best-effort).
        regen_index_local = int(regen_counts.get(instance_key, 0))
        previous_rewrite_local: Optional[str] = last_rejected_text_by_instance.get(instance_key)

        print("=" * 80)
        print(f"[{idx}/{n_total}] {name}")
        print(f"instance_key: {instance_key} (record_id={rid}, tool_index={tool_i})")

        if desc_mode == "raw_json":
            print("Current description RAW (escaped):")
            print(old_desc_print if old_desc_print else "(empty)")
            base_desc = _decode_raw_json_string(old_desc_print) if old_desc_print else ""
            print("\nCurrent description DECODED:")
            print(base_desc if base_desc else "(empty)")
        else:
            base_desc = old_desc_print or ""
            print("Current description:")
            print(base_desc if base_desc else "(empty)")

        proposal = ""
        llm_bundle: Optional[Dict[str, Any]] = None

        while True:
            if not proposal:
                try:
                    proposal, llm_bundle = generate_verbose_description_via_llm(
                        client=client,
                        tool_name=name,
                        base_description=base_desc,
                        model=llm_model,
                        seed=seed,
                        max_tokens=max_tokens,
                        retry_on_length=retry_on_length,
                        retry_max_tokens=retry_max_tokens,
                        regen_index=regen_index_local,
                        previous_rewrite=previous_rewrite_local,
                    )
                except Exception as e:
                    print(f"\nLLM ERROR: {e}")
                    cmd = _normalize_cmd(input("Choice [m=manual, e=edit, s=skip, q=quit] > "))
                    now = int(time.time())

                    if cmd == "q":
                        quit_requested = True
                        resume_next_index_1based = idx
                        break

                    if cmd == "s":
                        decisions_by_instance[instance_key] = ("skipped", None, None)
                        _append_audit_event(
                            audit_file,
                            {
                                "event_type": "decision",
                                "ts": now,
                                "session_id": session_id,
                                "status": "skipped",
                                "tool_name": name,
                                "instance_key": instance_key,
                                "record_id": rid,
                                "tool_index": tool_i,
                                "model": llm_model,
                                "seed": seed,
                                "base_description": base_desc,
                                "final_description": None,
                                "source": "user",
                                "note": "skip_after_llm_error",
                            },
                        )
                        break

                    if cmd in ("m", "e"):
                        manual_final = input("Manual final description (empty cancels) > ").rstrip("\n").strip()
                        status = "manual" if (cmd == "m" and manual_final) else ("edited" if (cmd == "e" and manual_final) else "skipped")
                        decisions_by_instance[instance_key] = (status, manual_final or None, None)
                        _append_audit_event(
                            audit_file,
                            {
                                "event_type": "decision",
                                "ts": now,
                                "session_id": session_id,
                                "status": status,
                                "tool_name": name,
                                "instance_key": instance_key,
                                "record_id": rid,
                                "tool_index": tool_i,
                                "model": llm_model,
                                "seed": seed,
                                "base_description": base_desc,
                                "final_description": manual_final or None,
                                "source": "user",
                                "note": "manual_or_edit_after_llm_error",
                            },
                        )
                        break

                    proposal = ""
                    continue

                proposal = (proposal or "").strip()

            print("\nLLM proposal:")
            print(proposal if proposal else "(empty)")

            if llm_bundle:
                try:
                    origin = llm_bundle.get("proposal_origin")
                    p = llm_bundle.get("primary")
                    r = llm_bundle.get("retry")
                    print(f"\nproposal_origin={origin}")
                    if p:
                        print(f"meta: finish_reason={p.get('finish_reason')}, max_param_used={p.get('max_param_used')}, usage={p.get('usage')}")
                    if r:
                        print(f"meta(retry): finish_reason={r.get('finish_reason')}, max_param_used={r.get('max_param_used')}, usage={r.get('usage')}")
                except Exception:
                    pass

            cmd = _normalize_cmd(input("\nChoice [ENTER=accept, r=regen, e=edit, m=manual, s=skip, q=quit] > "))
            now = int(time.time())

            if cmd == "y":
                if proposal.strip():
                    decisions_by_instance[instance_key] = ("accepted", proposal.strip(), llm_bundle)
                    _append_audit_event(
                        audit_file,
                        {
                            "event_type": "decision",
                            "ts": now,
                            "session_id": session_id,
                            "status": "accepted",
                            "tool_name": name,
                            "instance_key": instance_key,
                            "record_id": rid,
                            "tool_index": tool_i,
                            "model": llm_model,
                            "seed": seed,
                            "base_description": base_desc,
                            "final_description": proposal.strip(),
                            "source": "llm",
                            "llm_bundle": llm_bundle,
                        },
                    )
                else:
                    decisions_by_instance[instance_key] = ("skipped", None, llm_bundle)
                    _append_audit_event(
                        audit_file,
                        {
                            "event_type": "decision",
                            "ts": now,
                            "session_id": session_id,
                            "status": "skipped",
                            "tool_name": name,
                            "instance_key": instance_key,
                            "record_id": rid,
                            "tool_index": tool_i,
                            "model": llm_model,
                            "seed": seed,
                            "base_description": base_desc,
                            "final_description": None,
                            "source": "llm",
                            "note": "empty_proposal",
                            "llm_bundle": llm_bundle,
                        },
                    )
                break

            if cmd == "r":
                # Store the rejected output and feed it back as "previous rewrite" for the next generation.
                previous_rewrite_local = proposal.strip() if proposal else None
                if previous_rewrite_local and len(previous_rewrite_local) > MAX_PREV_REWRITE_CHARS:
                    previous_rewrite_local = previous_rewrite_local[:MAX_PREV_REWRITE_CHARS].rstrip()

                regen_counts[instance_key] = regen_counts.get(instance_key, 0) + 1
                regen_index_local = int(regen_counts[instance_key])

                # Persist the rejected text for resume.
                last_rejected_text_by_instance[instance_key] = previous_rewrite_local

                _append_audit_event(
                    audit_file,
                    {
                        "event_type": "regenerate",
                        "ts": now,
                        "session_id": session_id,
                        "tool_name": name,
                        "instance_key": instance_key,
                        "record_id": rid,
                        "tool_index": tool_i,
                        "regen_index": regen_index_local,
                        "last_proposal_sha256": _sha256_text(proposal),
                        "last_proposal_text": previous_rewrite_local,
                        "last_proposal_origin": (llm_bundle or {}).get("proposal_origin") if llm_bundle else None,
                    },
                )

                proposal = ""
                llm_bundle = None
                if min_sleep_sec_between_calls > 0:
                    time.sleep(min_sleep_sec_between_calls)
                continue

            if cmd == "e":
                edited = input("Edit proposal (empty cancels) > ").rstrip("\n").strip()
                status = "edited" if edited else "skipped"
                decisions_by_instance[instance_key] = (status, edited or None, llm_bundle)
                _append_audit_event(
                    audit_file,
                    {
                        "event_type": "decision",
                        "ts": now,
                        "session_id": session_id,
                        "status": status,
                        "tool_name": name,
                        "instance_key": instance_key,
                        "record_id": rid,
                        "tool_index": tool_i,
                        "model": llm_model,
                        "seed": seed,
                        "base_description": base_desc,
                        "final_description": edited or None,
                        "source": "user",
                        "note": "edit_proposal",
                        "llm_bundle": llm_bundle,
                    },
                )
                break

            if cmd == "m":
                manual_final = input("Manual final description (empty cancels) > ").rstrip("\n").strip()
                status = "manual" if manual_final else "skipped"
                decisions_by_instance[instance_key] = (status, manual_final or None, llm_bundle)
                _append_audit_event(
                    audit_file,
                    {
                        "event_type": "decision",
                        "ts": now,
                        "session_id": session_id,
                        "status": status,
                        "tool_name": name,
                        "instance_key": instance_key,
                        "record_id": rid,
                        "tool_index": tool_i,
                        "model": llm_model,
                        "seed": seed,
                        "base_description": base_desc,
                        "final_description": manual_final or None,
                        "source": "user",
                        "note": "manual_replace",
                        "llm_bundle": llm_bundle,
                    },
                )
                break

            if cmd == "s":
                decisions_by_instance[instance_key] = ("skipped", None, llm_bundle)
                _append_audit_event(
                    audit_file,
                    {
                        "event_type": "decision",
                        "ts": now,
                        "session_id": session_id,
                        "status": "skipped",
                        "tool_name": name,
                        "instance_key": instance_key,
                        "record_id": rid,
                        "tool_index": tool_i,
                        "model": llm_model,
                        "seed": seed,
                        "base_description": base_desc,
                        "final_description": None,
                        "source": "user",
                        "note": "skip",
                        "llm_bundle": llm_bundle,
                    },
                )
                break

            if cmd == "q":
                quit_requested = True
                resume_next_index_1based = idx
                break

            print("Invalid command.")

        if quit_requested:
            break

    # ========= Apply decisions to file =========
    tmp_path = path.with_suffix(path.suffix + ".tmp")
    updated_count = 0
    patch_failures = 0

    with path.open("r", encoding="utf-8") as fin, tmp_path.open("w", encoding="utf-8") as fout:
        for raw_line in fin:
            line = raw_line.rstrip("\n")
            if not line.strip():
                fout.write(line + "\n")
                continue

            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                fout.write(line + "\n")
                continue

            if not isinstance(record, dict):
                fout.write(json.dumps(record, ensure_ascii=False) + "\n")
                continue

            rid = _record_id(record, tool_field=tool_field)
            tools = record.get(tool_field)

            if isinstance(tools, list):
                new_tools: List[Any] = []
                for tool_index, entry in enumerate(tools):
                    tool_obj, kind = _load_tool(entry)
                    if not tool_obj:
                        new_tools.append(entry)
                        continue

                    instance_key = _tool_instance_key(rid, tool_index, tool_obj)
                    decision = decisions_by_instance.get(instance_key)

                    if decision is None:
                        new_tools.append(entry)
                        continue

                    status, new_desc, llm_bundle = decision
                    if status in ("accepted", "edited", "manual") and new_desc:
                        if kind == "json_str" and isinstance(entry, str):
                            # Skip patch if already correct
                            already_ok = False
                            try:
                                obj0 = json.loads(entry)
                                if isinstance(obj0, dict) and obj0.get("description") == new_desc:
                                    already_ok = True
                            except Exception:
                                already_ok = False

                            if already_ok:
                                new_tools.append(entry)
                                continue

                            patched, ok, reason = _replace_top_level_string_field_in_raw_object(entry, "description", new_desc)
                            if ok:
                                new_tools.append(patched)
                                updated_count += 1
                            else:
                                fallback_ok = False
                                fallback_patched = entry
                                if allow_reserialize_fallback:
                                    try:
                                        obj = json.loads(entry)
                                        if isinstance(obj, dict):
                                            obj["description"] = new_desc
                                            fallback_patched = json.dumps(obj, ensure_ascii=False)
                                            fallback_ok = True
                                    except Exception:
                                        fallback_ok = False

                                if fallback_ok:
                                    new_tools.append(fallback_patched)
                                    updated_count += 1
                                    _append_audit_event(
                                        audit_file,
                                        {
                                            "event_type": "patch_fallback_reserialize",
                                            "ts": int(time.time()),
                                            "session_id": session_id,
                                            "instance_key": instance_key,
                                            "record_id": rid,
                                            "tool_index": tool_index,
                                            "tool_name": tool_obj.get("name"),
                                            "entry_sha256_before": _sha256_text(entry),
                                            "entry_sha256_after": _sha256_text(fallback_patched),
                                            "patch_reason": reason,
                                        },
                                    )
                                else:
                                    new_tools.append(entry)
                                    patch_failures += 1
                                    _append_audit_event(
                                        audit_file,
                                        {
                                            "event_type": "patch_failure",
                                            "ts": int(time.time()),
                                            "session_id": session_id,
                                            "instance_key": instance_key,
                                            "record_id": rid,
                                            "tool_index": tool_index,
                                            "tool_name": tool_obj.get("name"),
                                            "status": status,
                                            "patch_reason": reason,
                                            "final_description_sha256": _sha256_text(new_desc),
                                            "entry_sha256": _sha256_text(entry),
                                            "entry_excerpt": entry[:240],
                                            "llm_text_raw_primary_sha256": _sha256_text((llm_bundle or {}).get("llm_text_raw_primary") or ""),
                                            "llm_text_raw_retry_sha256": _sha256_text((llm_bundle or {}).get("llm_text_raw_retry") or ""),
                                            "proposal_origin": (llm_bundle or {}).get("proposal_origin"),
                                        },
                                    )
                        else:
                            if tool_obj.get("description") == new_desc:
                                new_tools.append(tool_obj)
                                continue
                            tool_obj["description"] = new_desc
                            new_tools.append(tool_obj)
                            updated_count += 1
                    else:
                        new_tools.append(entry)

                record[tool_field] = new_tools

            fout.write(json.dumps(record, ensure_ascii=False) + "\n")

    if create_backup_of_target:
        bak_path = path.with_suffix(path.suffix + ".bak")
        if not bak_path.exists():
            shutil.copy2(path, bak_path)

    tmp_path.replace(path)
    after_sha = _sha256_file(path)

    n_reviewed = len(decisions_by_instance)
    n_skipped = sum(1 for st, _, _ in decisions_by_instance.values() if st == "skipped")
    completed = (n_reviewed >= n_total) and (not quit_requested)

    _append_audit_event(
        audit_file,
        {
            "event_type": "run_end",
            "ts": int(time.time()),
            "session_id": session_id,
            "mode": MODE_KEY,
            "model": llm_model,
            "seed": seed,
            "dataset_path": str(path),
            "dataset_sha256_at_session_start": before_sha,
            "dataset_sha256_at_session_end": after_sha,
            "n_total_occurrences": n_total,
            "n_reviewed_total": n_reviewed,
            "n_updated_this_session": updated_count,
            "n_skipped_total": n_skipped,
            "completed": bool(completed),
            "quit_requested": bool(quit_requested),
            "raw_patch_failures_this_session": patch_failures,
            "resume_next_index_1based": resume_next_index_1based if quit_requested else (n_total + 1 if completed else None),
        },
    )

    print("\nChanges applied.")
    print(f"Descriptions updated (this session): {updated_count}")
    if patch_failures:
        print(f"Raw JSON-string patch failures (left unchanged): {patch_failures}")
    print(f"Reviewed total (from audit): {n_reviewed} / {n_total}")
    print(f"Completed: {completed} (quit_requested={quit_requested})")
    if quit_requested and resume_next_index_1based is not None:
        print(f"Resume next time from: [{resume_next_index_1based}/{n_total}]")
    print(f"Updated file: {path}")
    print(f"Audit file (same on resume): {audit_file}")


if __name__ == "__main__":
    INPUT_JSONL = "When2Call/data/test/when2call_test_llm_judge.jsonl"
    OUTPUT_JSONL = "When2Call/data/test/when2call_test_llm_judge.WORKING_COPY.jsonl"

    working = make_working_copy(INPUT_JSONL, OUTPUT_JSONL, overwrite=False)
    print(f"Working copy: {working}")

    seed_env = os.environ.get("GEMINI_SEED")
    seed_val: Optional[int] = int(seed_env.strip()) if (seed_env and seed_env.strip()) else None

    max_tokens_env = os.environ.get("GEMINI_MAX_TOKENS")
    max_tokens_val = int(max_tokens_env.strip()) if (max_tokens_env and max_tokens_env.strip()) else DEFAULT_MAX_TOKENS

    retry_max_tokens_env = os.environ.get("GEMINI_RETRY_MAX_TOKENS")
    retry_max_tokens_val = int(retry_max_tokens_env.strip()) if (retry_max_tokens_env and retry_max_tokens_env.strip()) else RETRY_MAX_TOKENS

    allow_reserialize_env = os.environ.get("ALLOW_RESERIALIZE_FALLBACK")
    allow_reserialize_val = (
        bool(int(allow_reserialize_env.strip()))
        if (allow_reserialize_env and allow_reserialize_env.strip())
        else DEFAULT_ALLOW_RESERIALIZE_FALLBACK
    )

    interactive_llm_verbose_tools_in_jsonl(
        working,
        tool_field="tools",
        create_backup_of_target=False,
        llm_model=LLM_MODEL,
        seed=seed_val,
        max_tokens=max_tokens_val,
        retry_on_length=RETRY_ON_LENGTH,
        retry_max_tokens=retry_max_tokens_val,
        allow_reserialize_fallback=allow_reserialize_val,
        min_sleep_sec_between_calls=0.0,
        audit_dir="audit",
    )


# Da qua in giu non usare. altre versioni automatiche di cerazione delle varianti ma non perfette

In [None]:
import os
import json
import time
from typing import Any, Dict, List

from openai import OpenAI

# ================== BASE CONFIG ==================

DEFAULT_INPUT_JSONL = "When2Call/data/test/when2call_test_llm_judge.jsonl"
DEFAULT_OUTPUT_DIR = "when2call_local_variants"

# Modalità di riscrittura delle descrizioni (semantica diversa)
MODES: List[str] = [
    "no_desc",            # 1) descrizione vuota
    "short_label",        # 2) etichetta di 2–3 parole
    "verbose_examples",   # 3) descrizione verbosa con esempi d'uso
    "verbose_normative",  # 4) descrizione verbosa + linee guida comportamentali
    "misleading",         # 5) descrizione ambigua/sottospecificata
]

# Timeout minimo tra una chiamata reale al modello e la successiva (in secondi)
RATE_LIMIT_SLEEP = 3

# Cache in-memory per non richiamare il modello sulle stesse specifiche
_DESCRIPTION_CACHE: Dict[str, str] = {}

# Timestamp dell'ultima chiamata reale al modello (per rate limiting globale)
_LAST_CALL_TIME: float | None = None


# ================== HELPER: MODEL CLIENT ==================

def make_jrc_client() -> OpenAI:
    """
    Create an OpenAI client for the JRC gateway.

    Requires the environment variable TOKEN_JRC to be set.
    """
    token = os.environ.get("TOKEN_JRC")
    if not token:
        raise RuntimeError("TOKEN_JRC environment variable is not set.")
    client = OpenAI(
        api_key=token,
        base_url="https://api-gpt.jrc.ec.europa.eu/v1",
    )
    return client


# ================== HELPER: UTILS ==================

def sanitize_model_name_for_path(model_name: str) -> str:
    """
    Sanitize a model name so it can safely be used as part of a filename.

    - Keeps letters, digits, '-', '_', and '.' as-is.
    - Replaces any other character with '-'.
    """
    allowed = set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_.")
    return "".join(ch if ch in allowed else "-" for ch in model_name)


def extract_parameters_for_prompt(spec: Dict[str, Any]) -> List[Dict[str, Any]]:
    """
    Extract parameter info from a JSON-Schema-style 'parameters' field
    without modifying the original spec.

    Returns a list of dicts like:
        {"name": ..., "type": ..., "description": ..., "required": bool}
    """
    params = spec.get("parameters", {}) or {}
    if not isinstance(params, dict):
        return []

    props = params.get("properties", {}) or {}
    if not isinstance(props, dict):
        props = {}

    required_list = params.get("required", []) or []
    if not isinstance(required_list, list):
        required_list = []

    param_info_list: List[Dict[str, Any]] = []
    for name, meta in props.items():
        if not isinstance(meta, dict):
            meta = {}
        p_type = meta.get("type", "string")
        p_desc = (meta.get("description") or "").strip()
        param_info_list.append(
            {
                "name": name,
                "type": p_type,
                "description": p_desc,
                "required": name in required_list,
            }
        )
    return param_info_list


def build_param_block_for_prompt(param_info: List[Dict[str, Any]]) -> str:
    """
    Build a textual summary of parameters to provide as context to the LLM
    (this text is NOT inserted into the JSON schema, only used in the prompt).
    """
    if not param_info:
        return "This tool has no explicit structured parameters in the schema."

    lines = ["Here are the schema parameters:"]
    for p in param_info:
        bits = [f"name={p['name']}"]
        if p.get("type"):
            bits.append(f"type={p['type']}")
        if p.get("required"):
            bits.append("required=true")
        if p.get("description"):
            bits.append(f"description={p['description']}")
        lines.append("  - " + "; ".join(bits))
    return "\n".join(lines)


def build_generation_prompt(
    mode: str,
    tool_name: str,
    orig_desc: str,
    param_info: List[Dict[str, Any]],
) -> str:
    """
    Build the instruction prompt for the LLM that rewrites the tool description
    according to the given 'mode'.
    """
    if mode not in MODES:
        raise ValueError(f"Unknown mode for generation prompt: {mode}")

    param_block = build_param_block_for_prompt(param_info)
    base_context = f"""You are helping to rewrite tool descriptions for a function-calling benchmark.

We have a tool with:
- Name: {tool_name}
- Original description (may be empty): {orig_desc if orig_desc else "[NONE]"}

{param_block}

You must write a NEW description string for this tool.
The description will be embedded directly into the tool JSON as the "description" field.

GENERAL RULES (always apply):
- Write in clear, concise English.
- The description must be a single continuous text block (you can use short sentences).
- Do NOT mention that you are an AI model.
- Do NOT include markdown, bullet markers like '-', or backticks.
- Do NOT refer to "users" as "you" if it becomes confusing; instead, describe what the tool does.
- Do NOT talk about "this schema" or "this prompt"; just describe the tool.
- Output ONLY the new description text, with no extra commentary.
"""

    if mode == "short_label":
        mode_instructions = """
MODE: short_label

- Write a VERY SHORT label (2–3 words) that summarizes what the tool does.
- Use only plain English words, no punctuation, no numbers unless essential.
- Do NOT include verbs like "use", "get", "call"; prefer noun phrases (e.g. "Weather lookup", "Stock prices").
- Do NOT mention parameters explicitly.
- The output MUST be at most 3 words.
"""
    elif mode == "verbose_examples":
        mode_instructions = """
MODE: verbose_examples

- Describe clearly what the tool does and what inputs it expects.
- Incorporate the parameter information into the description in natural language.
- Explain briefly what the tool returns.
- Include 1–2 short conceptual usage examples embedded in the text (e.g. "For example, ...").
- The examples must be consistent with the parameters and the tool's purpose.
- Do NOT give explicit behavioral policies such as "use this tool when...".
"""
    elif mode == "verbose_normative":
        mode_instructions = """
MODE: verbose_normative

- Describe the tool, its inputs, and what it returns.
- Incorporate the parameter information into the description in natural language.
- Include 1–2 short conceptual usage examples embedded in the text.
- Additionally, include explicit behavioral guidelines, such as:
  - When the tool SHOULD be used.
  - When a follow-up question is needed because required information is missing.
  - When the tool SHOULD NOT be used (e.g., out-of-scope queries).
- Phrase these guidelines in natural language, as part of the description.
"""
    elif mode == "misleading":
        mode_instructions = """
MODE: misleading

- Write a plausible but partially inaccurate description of the tool.
- Keep the general topic related to the tool name and parameters, but:
  - Over-emphasize a secondary or edge-case use, OR
  - Omit an important limitation, OR
  - Slightly misstate the typical purpose (for example, focus on historical data when the tool is mainly for real-time queries).
- Do NOT claim that the tool can handle any possible request or "everything".
- Do NOT change or contradict the parameter types, but you may downplay or fail to mention some of them.
- The result should sound like realistic but sloppy documentation that could mislead an LLM.
"""
    elif mode == "no_desc":
        # This mode does not use the LLM; handled separately.
        mode_instructions = ""
    else:
        # Should not happen because of the check at the top
        raise ValueError(f"Unexpected mode in build_generation_prompt: {mode}")

    return base_context + "\n" + mode_instructions.strip()


def llm_generate_description(
    client: OpenAI,
    model: str,
    mode: str,
    tool_name: str,
    orig_desc: str,
    param_info: List[Dict[str, Any]],
) -> str:
    """
    Call the LLM to generate a new description for a tool in a given mode.

    Uses an in-memory cache to avoid duplicate calls and enforces a minimum
    delay of RATE_LIMIT_SLEEP seconds between *real* LLM calls to respect
    rate limits.
    """
    global _LAST_CALL_TIME

    # Build a cache key based on mode, model, name, original description, and parameters.
    params_signature = json.dumps(param_info, sort_keys=True, ensure_ascii=False)
    cache_key = f"{model}||{mode}||{tool_name}||{orig_desc.strip()}||{params_signature}"

    # Cache hit: niente chiamata al modello, niente delay
    if cache_key in _DESCRIPTION_CACHE:
        return _DESCRIPTION_CACHE[cache_key]

    # Applica rate limiting PRIMA della chiamata reale al modello
    if _LAST_CALL_TIME is not None and RATE_LIMIT_SLEEP > 0:
        elapsed = time.monotonic() - _LAST_CALL_TIME
        if elapsed < RATE_LIMIT_SLEEP:
            sleep_for = RATE_LIMIT_SLEEP - elapsed
            # print(f"[RATE LIMIT] Sleeping for {sleep_for:.2f} seconds before LLM call...")
            time.sleep(sleep_for)

    # Build the prompt
    prompt = build_generation_prompt(mode, tool_name, orig_desc, param_info)

    # Call the LLM
    response = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "system",
                "content": "You rewrite tool descriptions for function-calling APIs."
            },
            {"role": "user", "content": prompt},
        ],
        temperature=0.2,
        max_tokens=256,
    )
    text = (response.choices[0].message.content or "").strip()

    # Cache the result
    _DESCRIPTION_CACHE[cache_key] = text

    # Aggiorna l'istante dell'ultima chiamata reale
    _LAST_CALL_TIME = time.monotonic()

    return text


# ================== CORE: TOOL REWRITING ==================

def rewrite_tools_for_mode_with_llm(
    client: OpenAI,
    generation_model: str,
    tools: List[Any],
    mode: str,
) -> List[str]:
    """
    Given the 'tools' field of a When2Call example (list of JSON strings or dicts),
    return a NEW list of JSON strings where ONLY the 'description' field of each tool
    is rewritten using the LLM according to 'mode'.

    Supported tool shapes:
      1) {"type": "function", "function": { "name": ..., "description": ..., "parameters": ... }}
      2) {"name": ..., "description": ..., "parameters": ...}
    """
    if mode not in MODES:
        raise ValueError(f"Unknown mode in rewrite_tools_for_mode_with_llm: {mode}")

    new_tools: List[str] = []

    for tool_entry in tools or []:
        # Normalize to dict
        if isinstance(tool_entry, str):
            tool_str = tool_entry
            if not tool_str.strip():
                new_tools.append(tool_str)
                continue
            try:
                spec = json.loads(tool_str)
            except json.JSONDecodeError:
                # Malformed JSON, keep as-is
                new_tools.append(tool_str)
                continue
        elif isinstance(tool_entry, dict):
            spec = tool_entry
        else:
            # Unexpected type, just dump it back
            new_tools.append(json.dumps(tool_entry, ensure_ascii=False))
            continue

        # Handle wrapper case: {"type": "function", "function": {...}}
        if "function" in spec and isinstance(spec["function"], dict):
            fn = spec["function"]
            orig_desc = fn.get("description", "") or ""
            tool_name = fn.get("name", "unnamed_tool")

            if mode == "no_desc":
                new_desc = ""
            else:
                param_info = extract_parameters_for_prompt(fn)
                new_desc = llm_generate_description(
                    client=client,
                    model=generation_model,
                    mode=mode,
                    tool_name=tool_name,
                    orig_desc=orig_desc,
                    param_info=param_info,
                )

            fn["description"] = new_desc
            spec["function"] = fn
            new_tools.append(json.dumps(spec, ensure_ascii=False))

        else:
            # Flat case: {"name": ..., "description": ..., "parameters": ...}
            fn = spec
            orig_desc = fn.get("description", "") or ""
            tool_name = fn.get("name", "unnamed_tool")

            if mode == "no_desc":
                new_desc = ""
            else:
                param_info = extract_parameters_for_prompt(fn)
                new_desc = llm_generate_description(
                    client=client,
                    model=generation_model,
                    mode=mode,
                    tool_name=tool_name,
                    orig_desc=orig_desc,
                    param_info=param_info,
                )

            fn["description"] = new_desc
            new_tools.append(json.dumps(fn, ensure_ascii=False))

    return new_tools


# ================== BUILD JSONL VARIANT ==================

def build_variant_jsonl_with_llm(
    client: OpenAI,
    generation_model: str,
    input_path: str,
    output_path: str,
    mode: str,
) -> None:
    """
    Read a When2Call-style JSONL file and write a new JSONL file where ONLY the
    tool descriptions are rewritten according to 'mode' using an LLM.

    All fields EXCEPT 'tools[*].description' are preserved exactly.
    """
    if mode not in MODES:
        raise ValueError(f"Unknown mode in build_variant_jsonl_with_llm: {mode}")

    num_lines = 0
    num_modified = 0

    with open(input_path, "r", encoding="utf-8") as fin, \
         open(output_path, "w", encoding="utf-8") as fout:

        for line in fin:
            if not line.strip():
                continue
            num_lines += 1

            obj = json.loads(line)

            tools = obj.get("tools", [])
            if tools:
                new_tools = rewrite_tools_for_mode_with_llm(
                    client=client,
                    generation_model=generation_model,
                    tools=tools,
                    mode=mode,
                )
                if new_tools != tools:
                    num_modified += 1
                obj["tools"] = new_tools

            # Everything else (uuid, question, answers, labels, etc.) is unchanged
            fout.write(json.dumps(obj, ensure_ascii=False) + "\n")

    print(
        f"[{mode}] Wrote {output_path} "
        f"(lines: {num_lines}, examples with tools modified: {num_modified})"
    )


# ================== ENTRYPOINT (ES. PER NOTEBOOK) ==================

def run_when2call_variants(
    client: OpenAI,
    generation_model: str,
    input_jsonl: str = DEFAULT_INPUT_JSONL,
    output_dir: str = DEFAULT_OUTPUT_DIR,
    modes: List[str] | None = None,
    overwrite: bool = False,
) -> None:
    """
    Generate multiple When2Call dataset variants where ONLY tool descriptions
    are changed via an LLM, under different 'modes'.

    Parameters
    ----------
    client : OpenAI
        OpenAI client already configured for the JRC gateway.
    generation_model : str
        Name of the LLM used to rewrite descriptions (e.g. "gpt-4o", "llama-3.3-70b-instruct").
        This name will also be embedded in the output filenames.
    input_jsonl : str
        Path to the original When2Call JSONL test file.
    output_dir : str
        Directory where variant JSONL files will be written.
    modes : List[str] or None
        List of modes to generate (subset of MODES). If None, all MODES are used.
    overwrite : bool
        If False (default), existing files are skipped. If True, files are regenerated.
    """
    if not os.path.exists(input_jsonl):
        raise FileNotFoundError(f"Input file not found: {input_jsonl}")

    os.makedirs(output_dir, exist_ok=True)

    base_name = os.path.splitext(os.path.basename(input_jsonl))[0]
    model_tag = sanitize_model_name_for_path(generation_model)

    active_modes = modes if modes is not None else MODES

    # Validate modes
    unknown = [m for m in active_modes if m not in MODES]
    if unknown:
        raise ValueError(f"Unknown modes requested: {unknown}. Allowed: {MODES}")

    print(f"Using input file: {input_jsonl}")
    print(f"Output directory: {output_dir}")
    print(f"Generation model tag for filenames: {model_tag}")
    print(
        "The original file acts as the 'original' condition (no modification). "
        f"This function can create up to {len(active_modes)} additional variants."
    )

    for mode in active_modes:
        out_name = f"{base_name}_{model_tag}_{mode}.jsonl"
        out_path = os.path.join(output_dir, out_name)

        if os.path.exists(out_path) and not overwrite:
            print(f"[{mode}] Skipping {out_path} (already exists, overwrite=False).")
            continue

        build_variant_jsonl_with_llm(
            client=client,
            generation_model=generation_model,
            input_path=input_jsonl,
            output_path=out_path,
            mode=mode,
        )


In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
When2Call Tool-Description Variant Generator (Reproducible, Auditable, Minimal-Edit)

This script generates controlled dataset variants by modifying ONLY the tool description
string(s) inside the `tools` field of each JSONL example, while preserving the original
JSONL line formatting as much as technically feasible (i.e., without re-serializing the
outer JSON object). The tool JSON strings inside `tools` are patched in-place at the
exact JSON string-span level, so parameter sub-descriptions and unrelated keys remain
bitwise-identical unless a fallback path is explicitly triggered and logged.

Design goals:
- Minimal-diff patching: only the tool-level "description" is modified, not the entire JSON.
- Deterministic outputs: temperature defaults to 0, strict post-processing, validation+repair.
- Persistent cache: SQLite-backed caching keyed by a cryptographic signature.
- Full audit trail: JSONL logs with prompt, raw outputs, repairs, validations, and fallbacks.
- Reviewer-grade perturbations: a small set (<=5) of widely accepted doc-perturbation axes.
- Hard validators: mode-specific constraints are enforced; invalid outputs are repaired or rejected.
- Schema-aware checks: required parameters coverage and unknown-parameter mention checks.

Expected input:
- JSONL where each line is a JSON object containing (optionally) "tools": a list.
- Each `tools[i]` is typically a JSON-encoded string of a tool schema, but may also be a dict.

Outputs:
- One JSONL per mode where only tool description fields are updated per mode.
- Sidecar files under output_dir: SQLite cache and audit logs.

Environment variables:
- TOKEN_JRC: required for JRC gateway access (OpenAI-compatible).
- Optional: HTTP(S)_PROXY as needed by environment.

Usage (CLI example):
  python when2call_variants_art_5modes.py \
      --input When2Call/data/test/when2call_test_llm_judge.jsonl \
      --output-dir when2call_local_variants \
      --generation-model gpt-oss-120b \
      --modes empty_desc style_concise style_verbose add_examples normative_injection \
      --overwrite
"""

from __future__ import annotations

import argparse
import dataclasses
import hashlib
import json
import os
import random
import sqlite3
import sys
import time
from typing import Any, Dict, List, Optional, Sequence, Tuple

from openai import OpenAI


# ================== CONFIGURATION ==================

DEFAULT_INPUT_JSONL = "When2Call/data/test/when2call_test_llm_judge.jsonl"
DEFAULT_OUTPUT_DIR = "when2call_local_variants"

# Reviewer-grade perturbations (MAX 5):
# 1) empty_desc: remove tool description entirely (description="").
# 2) style_concise: semantic-preserving rewrite, concise, no examples, no normative language.
# 3) style_verbose: semantic-preserving rewrite, verbose, no examples, no normative language.
# 4) add_examples: semantic-preserving rewrite + 1-2 conceptual examples (no normative language).
# 5) normative_injection: semantic-preserving rewrite + prescriptive usage guidance + 1-2 examples.
MODES: List[str] = [
    "empty_desc",
    "style_concise",
    "style_verbose",
    "add_examples",
    "normative_injection",
]

# LLM call discipline
DEFAULT_TEMPERATURE = 0.0
DEFAULT_MAX_TOKENS = 256
DEFAULT_RATE_LIMIT_SLEEP_SEC = 0.0
DEFAULT_MAX_RETRIES = 8

# Repair is the critical reliability lever (Option 2: guided repair).
# Slightly higher default than 2 to reduce "rejected -> unchanged" collapse.
DEFAULT_REPAIR_MAX_ROUNDS = 3

# Output discipline (hard constraints)
DESCRIPTION_MAX_CHARS = 1200

# Persistent cache / audit
CACHE_DB_NAME = "tool_desc_cache.sqlite3"
AUDIT_DIR_NAME = "audit"
AUDIT_CALLS_JSONL = "generation_calls.jsonl"
AUDIT_EVENTS_JSONL = "variant_events.jsonl"
AUDIT_SUMMARY_JSON = "variant_summary.json"
AUDIT_FALLBACKS_JSONL = "fallback_events.jsonl"

# Reproducibility
DEFAULT_RANDOM_SEED = 1337


# ================== CLIENT ==================

def make_jrc_client() -> OpenAI:
    """
    Create an OpenAI-compatible client for the JRC gateway.

    Requires:
    - TOKEN_JRC environment variable.
    """
    token = os.environ.get("TOKEN_JRC")
    if not token:
        raise RuntimeError("TOKEN_JRC environment variable is not set.")
    return OpenAI(api_key=token, base_url="https://api-gpt.jrc.ec.europa.eu/v1")


# ================== SMALL UTILITIES ==================

def sanitize_model_name_for_path(model_name: str) -> str:
    allowed = set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_.")
    return "".join(ch if ch in allowed else "-" for ch in model_name)


def stable_sha256(text: str) -> str:
    return hashlib.sha256(text.encode("utf-8")).hexdigest()


def json_escape_string(value: str) -> str:
    # json.dumps returns a quoted JSON string; slice off quotes.
    return json.dumps(value, ensure_ascii=False)[1:-1]


def normalize_single_line(text: str) -> str:
    return " ".join((text or "").split()).strip()


def now_unix() -> float:
    return time.time()


def ensure_dir(path: str) -> None:
    os.makedirs(path, exist_ok=True)


def _dedupe_preserve_order(items: Sequence[str]) -> List[str]:
    seen = set()
    out: List[str] = []
    for x in items:
        if x not in seen:
            seen.add(x)
            out.append(x)
    return out


# ================== JSON SPAN PARSER (MINIMAL-EDIT PATCHING) ==================

@dataclasses.dataclass
class JsonNode:
    kind: str  # "object" | "array" | "string" | "number" | "true" | "false" | "null"
    start: int
    end: int
    value: Any = None
    obj: Optional[Dict[str, "JsonNode"]] = None
    arr: Optional[List["JsonNode"]] = None


class JsonSpanParseError(Exception):
    pass


class JsonSpanParser:
    """
    A JSON parser that returns both decoded values and exact source spans.

    This parser is intentionally limited to valid JSON (RFC 8259).
    It is used to patch tool-description substrings in-place without re-serializing
    the entire JSON.
    """

    def __init__(self, s: str):
        self.s = s
        self.n = len(s)

    def parse(self) -> JsonNode:
        i = self._skip_ws(0)
        node, j = self._parse_value(i)
        j = self._skip_ws(j)
        if j != self.n:
            raise JsonSpanParseError(f"Trailing content at index {j}")
        return node

    def _skip_ws(self, i: int) -> int:
        s = self.s
        n = self.n
        while i < n and s[i] in " \t\r\n":
            i += 1
        return i

    def _parse_value(self, i: int) -> Tuple[JsonNode, int]:
        i = self._skip_ws(i)
        if i >= self.n:
            raise JsonSpanParseError("Unexpected end of input")

        ch = self.s[i]
        if ch == "{":
            return self._parse_object(i)
        if ch == "[":
            return self._parse_array(i)
        if ch == '"':
            return self._parse_string(i)
        if ch == "-" or ch.isdigit():
            return self._parse_number(i)
        if self.s.startswith("true", i):
            return JsonNode(kind="true", start=i, end=i + 4, value=True), i + 4
        if self.s.startswith("false", i):
            return JsonNode(kind="false", start=i, end=i + 5, value=False), i + 5
        if self.s.startswith("null", i):
            return JsonNode(kind="null", start=i, end=i + 4, value=None), i + 4

        raise JsonSpanParseError(f"Unexpected token at index {i}: {ch!r}")

    def _parse_object(self, i: int) -> Tuple[JsonNode, int]:
        s = self.s
        if s[i] != "{":
            raise JsonSpanParseError("Expected '{'")
        start = i
        i += 1
        i = self._skip_ws(i)

        obj: Dict[str, JsonNode] = {}

        if i < self.n and s[i] == "}":
            end = i + 1
            return JsonNode(kind="object", start=start, end=end, obj=obj, value={}), end

        while True:
            i = self._skip_ws(i)
            if i >= self.n or s[i] != '"':
                raise JsonSpanParseError(f"Expected object key string at index {i}")
            key_node, i = self._parse_string(i)
            key = key_node.value

            i = self._skip_ws(i)
            if i >= self.n or s[i] != ":":
                raise JsonSpanParseError(f"Expected ':' after key at index {i}")
            i += 1

            val_node, i = self._parse_value(i)
            obj[key] = val_node

            i = self._skip_ws(i)
            if i >= self.n:
                raise JsonSpanParseError("Unexpected end in object")
            if s[i] == "}":
                end = i + 1
                return JsonNode(kind="object", start=start, end=end, obj=obj, value=None), end
            if s[i] != ",":
                raise JsonSpanParseError(f"Expected ',' or '}}' at index {i}")
            i += 1

    def _parse_array(self, i: int) -> Tuple[JsonNode, int]:
        s = self.s
        if s[i] != "[":
            raise JsonSpanParseError("Expected '['")
        start = i
        i += 1
        i = self._skip_ws(i)

        arr: List[JsonNode] = []

        if i < self.n and s[i] == "]":
            end = i + 1
            return JsonNode(kind="array", start=start, end=end, arr=arr, value=[]), end

        while True:
            val_node, i = self._parse_value(i)
            arr.append(val_node)

            i = self._skip_ws(i)
            if i >= self.n:
                raise JsonSpanParseError("Unexpected end in array")
            if self.s[i] == "]":
                end = i + 1
                return JsonNode(kind="array", start=start, end=end, arr=arr, value=None), end
            if self.s[i] != ",":
                raise JsonSpanParseError(f"Expected ',' or ']' at index {i}")
            i += 1
            i = self._skip_ws(i)

    def _parse_string(self, i: int) -> Tuple[JsonNode, int]:
        s = self.s
        if s[i] != '"':
            raise JsonSpanParseError("Expected '\"'")
        start = i
        i += 1

        out_chars: List[str] = []
        spans: List[Tuple[int, int]] = []  # (raw_start, raw_end) per OGNI char decodificato

        while i < self.n:
            ch = s[i]

            # Fine stringa
            if ch == '"':
                end = i + 1
                return JsonNode(
                    kind="string",
                    start=start,
                    end=end,
                    value="".join(out_chars),
                    string_char_spans=spans,
                ), end

            # Escape
            if ch == "\\":
                bs = i  # backslash index
                i += 1
                if i >= self.n:
                    raise JsonSpanParseError("Invalid escape at end of string")
                esc = s[i]

                if esc in '"\\/':
                    out_chars.append(esc)
                    spans.append((bs, i + 1))
                elif esc == "b":
                    out_chars.append("\b")
                    spans.append((bs, i + 1))
                elif esc == "f":
                    out_chars.append("\f")
                    spans.append((bs, i + 1))
                elif esc == "n":
                    out_chars.append("\n")
                    spans.append((bs, i + 1))
                elif esc == "r":
                    out_chars.append("\r")
                    spans.append((bs, i + 1))
                elif esc == "t":
                    out_chars.append("\t")
                    spans.append((bs, i + 1))
                elif esc == "u":
                    # \uXXXX
                    if i + 4 >= self.n:
                        raise JsonSpanParseError("Invalid unicode escape (truncated)")
                    hex_part = s[i + 1 : i + 5]
                    try:
                        codepoint = int(hex_part, 16)
                    except ValueError as e:
                        raise JsonSpanParseError("Invalid unicode escape") from e

                    out_chars.append(chr(codepoint))
                    spans.append((bs, i + 5))  # include \uXXXX fully
                    i += 4  # salta le 4 hex digits
                else:
                    raise JsonSpanParseError(f"Invalid escape sequence: \\{esc}")

                i += 1
                continue

            # Carattere normale
            out_chars.append(ch)
            spans.append((i, i + 1))
            i += 1

        raise JsonSpanParseError("Unterminated string")


    def _parse_number(self, i: int) -> Tuple[JsonNode, int]:
        s = self.s
        start = i
        n = self.n

        if s[i] == "-":
            i += 1
            if i >= n:
                raise JsonSpanParseError("Invalid number '-'")

        if i < n and s[i] == "0":
            i += 1
        else:
            if i >= n or not s[i].isdigit():
                raise JsonSpanParseError("Invalid number")
            while i < n and s[i].isdigit():
                i += 1

        if i < n and s[i] == ".":
            i += 1
            if i >= n or not s[i].isdigit():
                raise JsonSpanParseError("Invalid fraction")
            while i < n and s[i].isdigit():
                i += 1

        if i < n and s[i] in "eE":
            i += 1
            if i < n and s[i] in "+-":
                i += 1
            if i >= n or not s[i].isdigit():
                raise JsonSpanParseError("Invalid exponent")
            while i < n and s[i].isdigit():
                i += 1

        end = i
        num_text = s[start:end]
        try:
            val: Any
            if "." in num_text or "e" in num_text or "E" in num_text:
                val = float(num_text)
            else:
                val = int(num_text)
        except ValueError:
            val = num_text

        return JsonNode(kind="number", start=start, end=end, value=val), end


def patch_json_string_field_in_place(
    src_json: str,
    path: Sequence[str],
    new_string_value: str,
) -> Tuple[str, bool, str]:
    """
    Patch a JSON string field at a given object-path, modifying only the value span.
    """
    try:
        root = JsonSpanParser(src_json).parse()
    except Exception as e:
        return src_json, False, f"parse_error:{type(e).__name__}"

    if root.kind != "object" or not root.obj:
        return src_json, False, "root_not_object"

    node = root
    for k in path:
        if node.kind != "object" or not node.obj or k not in node.obj:
            return src_json, False, "path_missing"
        node = node.obj[k]

    if node.kind != "string":
        return src_json, False, "target_not_string"

    escaped = json_escape_string(new_string_value)
    replacement = '"' + escaped + '"'
    patched = src_json[: node.start] + replacement + src_json[node.end :]
    return patched, True, "patched"


# ================== SCHEMA FLATTENING (FOR PROMPT CONTEXT) ==================

@dataclasses.dataclass(frozen=True)
class ParamSpec:
    path: str
    types: Tuple[str, ...]
    description: str
    required: bool
    enum: Tuple[str, ...]
    default: Optional[str]
    fmt: Optional[str]
    pattern: Optional[str]
    minimum: Optional[str]
    maximum: Optional[str]


def _as_tuple_str(x: Any) -> Tuple[str, ...]:
    if x is None:
        return tuple()
    if isinstance(x, (list, tuple)):
        return tuple(str(v) for v in x)
    return (str(x),)


def _schema_types(schema: Dict[str, Any]) -> Tuple[str, ...]:
    t = schema.get("type")
    if isinstance(t, str):
        return (t,)
    if isinstance(t, list):
        return tuple(str(v) for v in t)
    if "properties" in schema:
        return ("object",)
    if "items" in schema:
        return ("array",)
    return ("string",)


def flatten_json_schema_parameters(
    parameters: Any,
    *,
    prefix: str = "",
    required_paths: Optional[set] = None,
) -> List[ParamSpec]:
    if required_paths is None:
        required_paths = set()

    out: List[ParamSpec] = []
    if not isinstance(parameters, dict):
        return out

    req = parameters.get("required")
    if isinstance(req, list):
        for r in req:
            if isinstance(r, str):
                required_paths.add((prefix + "." + r).lstrip("."))

    for comb_key in ("oneOf", "anyOf", "allOf"):
        comb = parameters.get(comb_key)
        if isinstance(comb, list) and comb:
            for sub in comb:
                out.extend(flatten_json_schema_parameters(sub, prefix=prefix, required_paths=set(required_paths)))

    tps = _schema_types(parameters)
    desc = normalize_single_line(parameters.get("description") or "")
    enum = _as_tuple_str(parameters.get("enum"))
    default = parameters.get("default")
    default_s = None if default is None else normalize_single_line(str(default))
    fmt = parameters.get("format")
    fmt_s = None if fmt is None else normalize_single_line(str(fmt))
    pattern = parameters.get("pattern")
    pat_s = None if pattern is None else normalize_single_line(str(pattern))
    minimum = parameters.get("minimum")
    maximum = parameters.get("maximum")
    min_s = None if minimum is None else normalize_single_line(str(minimum))
    max_s = None if maximum is None else normalize_single_line(str(maximum))

    if prefix and ("properties" not in parameters) and ("items" not in parameters):
        out.append(
            ParamSpec(
                path=prefix,
                types=tps,
                description=desc,
                required=(prefix in required_paths),
                enum=enum,
                default=default_s,
                fmt=fmt_s,
                pattern=pat_s,
                minimum=min_s,
                maximum=max_s,
            )
        )

    props = parameters.get("properties")
    if isinstance(props, dict):
        for k, sub in props.items():
            if isinstance(k, str) and isinstance(sub, dict):
                sub_prefix = (prefix + "." + k).lstrip(".")
                out.extend(flatten_json_schema_parameters(sub, prefix=sub_prefix, required_paths=required_paths))

    items = parameters.get("items")
    if isinstance(items, dict):
        sub_prefix = prefix + "[]" if prefix else "[]"
        out.extend(flatten_json_schema_parameters(items, prefix=sub_prefix, required_paths=required_paths))

    if prefix and ("properties" in parameters):
        out.append(
            ParamSpec(
                path=prefix,
                types=tps,
                description=desc,
                required=(prefix in required_paths),
                enum=enum,
                default=default_s,
                fmt=fmt_s,
                pattern=pat_s,
                minimum=min_s,
                maximum=max_s,
            )
        )

    return out


def build_param_context_for_prompt(param_specs: List[ParamSpec]) -> str:
    if not param_specs:
        return "PARAMETERS: {\"note\": \"No structured parameters were found.\"}"

    items = sorted(param_specs, key=lambda p: p.path)
    parts: List[str] = []
    parts.append("PARAMETERS: [")
    for p in items:
        parts.append(
            "  {"
            f"\"path\": \"{p.path}\", "
            f"\"types\": {list(p.types)}, "
            f"\"required\": {str(bool(p.required)).lower()}, "
            f"\"description\": \"{json_escape_string(p.description)}\", "
            f"\"enum\": {list(p.enum)}, "
            f"\"default\": {json.dumps(p.default, ensure_ascii=False)}, "
            f"\"format\": {json.dumps(p.fmt, ensure_ascii=False)}, "
            f"\"pattern\": {json.dumps(p.pattern, ensure_ascii=False)}, "
            f"\"minimum\": {json.dumps(p.minimum, ensure_ascii=False)}, "
            f"\"maximum\": {json.dumps(p.maximum, ensure_ascii=False)}"
            "}"
        )
    parts.append("]")
    return "\n".join(parts)


# ================== MODE DEFINITIONS + VALIDATION ==================

# NOTE: For the modes where allow_normative=False, we keep the detection conservative.
# The repair step is guided to explicitly avoid these tokens.
NORMATIVE_KEYWORDS = (
    "should",
    "should not",
    "must",
    "must not",
    "always",
    "never",
    "required to",
    "do not",
    "avoid",
)

EXAMPLE_MARKERS = (
    "for example",
    "e.g.",
    "example:",
)


@dataclasses.dataclass(frozen=True)
class ModePolicy:
    mode: str
    allow_examples: bool
    allow_normative: bool
    target_length: str  # "short" | "concise" | "verbose"
    must_preserve_semantics: bool


MODE_POLICIES: Dict[str, ModePolicy] = {
    "empty_desc": ModePolicy("empty_desc", allow_examples=False, allow_normative=False, target_length="short", must_preserve_semantics=True),
    "style_concise": ModePolicy("style_concise", allow_examples=False, allow_normative=False, target_length="concise", must_preserve_semantics=True),
    "style_verbose": ModePolicy("style_verbose", allow_examples=False, allow_normative=False, target_length="verbose", must_preserve_semantics=True),
    "add_examples": ModePolicy("add_examples", allow_examples=True, allow_normative=False, target_length="verbose", must_preserve_semantics=True),
    "normative_injection": ModePolicy("normative_injection", allow_examples=True, allow_normative=True, target_length="verbose", must_preserve_semantics=True),
}


def _contains_any_case_insensitive(text: str, needles: Sequence[str]) -> bool:
    lt = text.lower()
    return any(n in lt for n in needles)


def validate_description(
    mode: str,
    desc: str,
    *,
    required_param_names: Sequence[str],
    all_param_names: Sequence[str],
) -> Tuple[bool, List[str]]:
    """
    Hard validation for reproducible perturbations.

    Key invariants:
    - Single line
    - Max length
    - For modes with allow_examples=False: no "For example"/"e.g." markers
    - For modes with allow_normative=False: no normative keywords
    - No mentions of unknown parameter tokens (conservative, token-based)
    - For must_preserve_semantics modes (all non-empty_desc modes here):
      every REQUIRED top-level parameter name must appear at least once (verbatim token match).

    The last constraint is strict by design; the guided repair prompt is engineered to
    make compliance easy and deterministic.
    """
    errors: List[str] = []
    policy = MODE_POLICIES.get(mode)
    if policy is None:
        return False, [f"unknown_mode:{mode}"]

    if "\n" in desc or "\r" in desc:
        errors.append("contains_newline")

    if len(desc) > DESCRIPTION_MAX_CHARS:
        errors.append("too_long_chars")

    if mode != "empty_desc" and not desc.strip():
        errors.append("empty_description_not_allowed")

    if mode == "empty_desc" and desc != "":
        errors.append("empty_desc_must_be_exact_empty_string")

    if not policy.allow_examples and _contains_any_case_insensitive(desc, EXAMPLE_MARKERS):
        errors.append("examples_not_allowed_in_mode")

    if not policy.allow_normative and _contains_any_case_insensitive(desc, NORMATIVE_KEYWORDS):
        errors.append("normative_language_not_allowed_in_mode")

    # Normalize for token checks (keep underscores/dots/brackets).
    lowered = " " + "".join((c.lower() if (c.isalnum() or c in "_.[]") else " ") for c in desc) + " "

    def has_token(token: str) -> bool:
        t = token.lower()
        return f" {t} " in lowered

    # Unknown parameter mention check (conservative):
    allowed = {p.lower() for p in all_param_names if p}
    tokens = [t for t in lowered.split() if t]
    for t in tokens:
        if all(c.isalnum() or c in "_.[]" for c in t) and ("_" in t or "." in t or "[" in t):
            if t in allowed:
                continue
            errors.append(f"mentions_unknown_param:{t}")

    # Strict: required parameters must be mentioned for semantic-preserving modes (all here except empty_desc).
    if policy.must_preserve_semantics and mode != "empty_desc":
        for rp in required_param_names:
            if rp and not has_token(rp):
                errors.append(f"missing_required_param_mention:{rp}")

    return (len(errors) == 0), errors


# ================== PERSISTENT CACHE (SQLITE) ==================

class DescCache:
    """
    SQLite-backed cache for generated tool descriptions.
    """

    def __init__(self, db_path: str):
        self.db_path = db_path
        self.conn = sqlite3.connect(db_path)
        self._init_db()

    def _init_db(self) -> None:
        cur = self.conn.cursor()
        cur.execute(
            """
            CREATE TABLE IF NOT EXISTS desc_cache (
                key_hash TEXT PRIMARY KEY,
                created_at REAL NOT NULL,
                generation_model TEXT NOT NULL,
                mode TEXT NOT NULL,
                tool_name TEXT NOT NULL,
                tool_signature TEXT NOT NULL,
                prompt_hash TEXT NOT NULL,
                prompt_text TEXT NOT NULL,
                raw_output TEXT NOT NULL,
                final_output TEXT NOT NULL,
                status TEXT NOT NULL,
                validation_errors TEXT NOT NULL
            )
            """
        )
        self.conn.commit()

    def get(self, key_hash: str) -> Optional[Dict[str, Any]]:
        cur = self.conn.cursor()
        cur.execute(
            "SELECT key_hash, created_at, generation_model, mode, tool_name, tool_signature, prompt_hash, prompt_text, raw_output, final_output, status, validation_errors FROM desc_cache WHERE key_hash=?",
            (key_hash,),
        )
        row = cur.fetchone()
        if not row:
            return None
        return {
            "key_hash": row[0],
            "created_at": row[1],
            "generation_model": row[2],
            "mode": row[3],
            "tool_name": row[4],
            "tool_signature": row[5],
            "prompt_hash": row[6],
            "prompt_text": row[7],
            "raw_output": row[8],
            "final_output": row[9],
            "status": row[10],
            "validation_errors": json.loads(row[11]),
        }

    def put(
        self,
        *,
        key_hash: str,
        generation_model: str,
        mode: str,
        tool_name: str,
        tool_signature: str,
        prompt_hash: str,
        prompt_text: str,
        raw_output: str,
        final_output: str,
        status: str,
        validation_errors: List[str],
    ) -> None:
        cur = self.conn.cursor()
        cur.execute(
            """
            INSERT OR REPLACE INTO desc_cache
            (key_hash, created_at, generation_model, mode, tool_name, tool_signature, prompt_hash, prompt_text, raw_output, final_output, status, validation_errors)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            """,
            (
                key_hash,
                now_unix(),
                generation_model,
                mode,
                tool_name,
                tool_signature,
                prompt_hash,
                prompt_text,
                raw_output,
                final_output,
                status,
                json.dumps(validation_errors, ensure_ascii=False),
            ),
        )
        self.conn.commit()

    def close(self) -> None:
        self.conn.close()


# ================== AUDIT LOGGING ==================

class AuditLogger:
    """
    Append-only JSONL logs for reproducibility.
    """

    def __init__(self, audit_dir: str):
        self.audit_dir = audit_dir
        ensure_dir(audit_dir)
        self.calls_path = os.path.join(audit_dir, AUDIT_CALLS_JSONL)
        self.events_path = os.path.join(audit_dir, AUDIT_EVENTS_JSONL)
        self.fallbacks_path = os.path.join(audit_dir, AUDIT_FALLBACKS_JSONL)
        self.summary_path = os.path.join(audit_dir, AUDIT_SUMMARY_JSON)
        self.summary: Dict[str, Any] = {
            "tools_seen": 0,
            "tools_patched": 0,
            "tools_unchanged": 0,
            "tools_fallback_reserialized": 0,
            "tools_parse_failed": 0,
            "llm_calls": 0,
            "llm_cache_hits": 0,
            "llm_repaired": 0,
            "llm_rejected": 0,
            "lines_seen": 0,
            "lines_written": 0,
        }

    def log_call(self, record: Dict[str, Any]) -> None:
        with open(self.calls_path, "a", encoding="utf-8") as f:
            f.write(json.dumps(record, ensure_ascii=False) + "\n")

    def log_event(self, record: Dict[str, Any]) -> None:
        with open(self.events_path, "a", encoding="utf-8") as f:
            f.write(json.dumps(record, ensure_ascii=False) + "\n")

    def log_fallback(self, record: Dict[str, Any]) -> None:
        with open(self.fallbacks_path, "a", encoding="utf-8") as f:
            f.write(json.dumps(record, ensure_ascii=False) + "\n")

    def inc(self, key: str, n: int = 1) -> None:
        self.summary[key] = int(self.summary.get(key, 0)) + n

    def flush_summary(self, extra: Optional[Dict[str, Any]] = None) -> None:
        payload = dict(self.summary)
        if extra:
            payload.update(extra)
        with open(self.summary_path, "w", encoding="utf-8") as f:
            f.write(json.dumps(payload, ensure_ascii=False, indent=2) + "\n")


# ================== PROMPT CONSTRUCTION ==================

def _format_required_params(required_param_names: Sequence[str]) -> str:
    req = [r for r in required_param_names if r]
    req = _dedupe_preserve_order(req)
    return ", ".join(req) if req else "[NONE]"


def build_generation_prompt(
    *,
    mode: str,
    tool_name: str,
    original_description: str,
    param_context: str,
    required_param_names: Sequence[str],
) -> str:
    policy = MODE_POLICIES.get(mode)
    if policy is None:
        raise ValueError(f"Unknown mode: {mode}")

    common = (
        "TASK: Produce exactly one English description string for a tool in a function-calling benchmark.\n"
        "OUTPUT FORMAT: Output only the description text, and nothing else.\n"
        "OUTPUT CONSTRAINTS:\n"
        "1) Output must be a single line (no newline characters).\n"
        "2) Do not output markdown, code fences, bullet characters, or backticks.\n"
        "3) Do not mention this task, prompts, schemas, or that an AI system is being used.\n"
        "4) Do not invent tool capabilities that are not supported by the provided parameters.\n"
    )

    mode_block = f"MODE: {mode}\n"

    if mode == "empty_desc":
        return common + mode_block + "INSTRUCTION: Output an empty string.\n"

    req_line = f"REQUIRED_PARAMETER_NAMES (must appear verbatim at least once): {_format_required_params(required_param_names)}\n"

    if mode in {"style_concise", "style_verbose"}:
        length_instr = (
            "Write a concise description (one to two short sentences)."
            if mode == "style_concise"
            else "Write a detailed description (three to six sentences) without examples."
        )
        return (
            common
            + mode_block
            + "INSTRUCTION: Rewrite the tool-level description while preserving the tool purpose and inputs.\n"
            + req_line
            + f"LENGTH TARGET: {length_instr}\n"
            + "RESTRICTIONS:\n"
            + "A) Do not include usage examples.\n"
            + "B) Do not include behavioral guidance (avoid 'should', 'must', 'always', 'never', etc.).\n"
            + f"TOOL_NAME: {tool_name}\n"
            + f"ORIGINAL_DESCRIPTION: {normalize_single_line(original_description) if original_description else '[NONE]'}\n"
            + param_context
            + "\n"
        )

    if mode == "add_examples":
        return (
            common
            + mode_block
            + "INSTRUCTION: Rewrite the tool-level description while preserving the tool purpose and inputs.\n"
            + req_line
            + "REQUIREMENTS:\n"
            + "A) Include one or two short conceptual examples inside the text using the exact phrase 'For example,'.\n"
            + "RESTRICTIONS:\n"
            + "A) Do not include behavioral guidance (avoid 'should', 'must', 'always', 'never', etc.).\n"
            + f"TOOL_NAME: {tool_name}\n"
            + f"ORIGINAL_DESCRIPTION: {normalize_single_line(original_description) if original_description else '[NONE]'}\n"
            + param_context
            + "\n"
        )

    if mode == "normative_injection":
        return (
            common
            + mode_block
            + "INSTRUCTION: Rewrite the tool-level description while preserving the tool purpose and inputs.\n"
            + req_line
            + "REQUIREMENTS:\n"
            + "A) Include one or two short conceptual examples using the exact phrase 'For example,'.\n"
            + "B) Include explicit behavioral guidance about when the tool should be used, when follow-up information is needed, and when it should not be used.\n"
            + f"TOOL_NAME: {tool_name}\n"
            + f"ORIGINAL_DESCRIPTION: {normalize_single_line(original_description) if original_description else '[NONE]'}\n"
            + param_context
            + "\n"
        )

    raise ValueError(f"Unhandled mode: {mode}")


def _extract_missing_required_from_errors(errors: Sequence[str]) -> List[str]:
    out: List[str] = []
    for e in errors:
        if e.startswith("missing_required_param_mention:"):
            out.append(e.split(":", 1)[1].strip())
    return _dedupe_preserve_order([x for x in out if x])


def _extract_unknown_params_from_errors(errors: Sequence[str]) -> List[str]:
    out: List[str] = []
    for e in errors:
        if e.startswith("mentions_unknown_param:"):
            out.append(e.split(":", 1)[1].strip())
    return _dedupe_preserve_order([x for x in out if x])


def build_repair_prompt(
    *,
    mode: str,
    tool_name: str,
    candidate_output: str,
    validation_errors: List[str],
    param_context: str,
    required_param_names: Sequence[str],
) -> str:
    """
    Guided repair (Option 2): make validator compliance easy and deterministic.

    Key idea:
    - If required params are missing, explicitly list the exact tokens that MUST appear.
    - If unknown param tokens were mentioned, explicitly list tokens that MUST be removed.
    - Restate mode-specific bans (examples/normative) in an operational way.
    """
    policy = MODE_POLICIES.get(mode)
    if policy is None:
        raise ValueError(f"Unknown mode: {mode}")

    missing = _extract_missing_required_from_errors(validation_errors)
    unknown = _extract_unknown_params_from_errors(validation_errors)

    bans: List[str] = []
    if not policy.allow_examples:
        bans.append("No examples: do not include 'For example', 'e.g.', or 'example:'.")
    if not policy.allow_normative:
        bans.append("No normative language: avoid 'should', 'must', 'always', 'never', 'do not', 'avoid', etc.")

    must_include_all = _dedupe_preserve_order([r for r in required_param_names if r])
    must_include_text = ", ".join(must_include_all) if must_include_all else "[NONE]"
    missing_text = ", ".join(missing) if missing else "[NONE]"
    unknown_text = ", ".join(unknown) if unknown else "[NONE]"
    bans_text = " ".join(bans) if bans else "[NONE]"

    # Strong operational constraints (single line, no markdown) remain.
    return (
        "TASK: Fix a tool description so that it satisfies hard constraints exactly.\n"
        "OUTPUT FORMAT: Output only the corrected description text, and nothing else.\n"
        "HARD CONSTRAINTS:\n"
        "1) Output must be a single line (no newline characters).\n"
        "2) Do not output markdown, code fences, bullets, or backticks.\n"
        "3) Do not mention prompts, schemas, validators, or that an AI system is being used.\n"
        f"4) Mode is {mode}; satisfy the mode-specific rules.\n"
        f"MODE-SPECIFIC BANS: {bans_text}\n"
        f"REQUIRED_PARAMETER_NAMES (must appear verbatim at least once): {must_include_text}\n"
        f"MISSING_REQUIRED_IN_CANDIDATE: {missing_text}\n"
        f"UNKNOWN_PARAMETER_TOKENS_TO_REMOVE: {unknown_text}\n"
        f"TOOL_NAME: {tool_name}\n"
        f"PARAMETERS CONTEXT:\n{param_context}\n"
        f"CANDIDATE OUTPUT:\n{normalize_single_line(candidate_output)}\n"
        "INSTRUCTION:\n"
        "Rewrite the candidate so it passes validation. Ensure all required parameter tokens appear verbatim.\n"
        "If any unknown parameter tokens are present, remove them.\n"
        "Keep content consistent with the parameters; do not invent capabilities.\n"
    )


# ================== LLM CALLS WITH RETRY + AUDIT ==================

def _sleep_with_jitter(seconds: float) -> None:
    if seconds <= 0:
        return
    jitter = random.random() * min(0.25, seconds * 0.1)
    time.sleep(seconds + jitter)


def llm_call_chat_completions(
    *,
    client: OpenAI,
    model: str,
    system_text: str,
    user_text: str,
    temperature: float,
    max_tokens: int,
    rate_limit_sleep_sec: float,
    max_retries: int,
    audit: AuditLogger,
    call_tag: str,
) -> str:
    if rate_limit_sleep_sec > 0:
        _sleep_with_jitter(rate_limit_sleep_sec)

    prompt_hash = stable_sha256(system_text + "\n\n" + user_text)

    last_exc: Optional[Exception] = None
    for attempt in range(1, max_retries + 1):
        try:
            audit.inc("llm_calls", 1)
            t0 = now_unix()
            resp = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": system_text},
                    {"role": "user", "content": user_text},
                ],
                temperature=temperature,
                max_tokens=max_tokens,
            )
            t1 = now_unix()
            text = (resp.choices[0].message.content or "")
            audit.log_call(
                {
                    "ts": t0,
                    "tag": call_tag,
                    "attempt": attempt,
                    "model": model,
                    "temperature": temperature,
                    "max_tokens": max_tokens,
                    "prompt_hash": prompt_hash,
                    "latency_sec": round(t1 - t0, 6),
                    "raw_output_preview": text[:2000],
                }
            )
            return text

        except Exception as e:
            last_exc = e
            backoff = min(30.0, 0.5 * (2 ** (attempt - 1)))
            audit.log_call(
                {
                    "ts": now_unix(),
                    "tag": call_tag,
                    "attempt": attempt,
                    "model": model,
                    "temperature": temperature,
                    "max_tokens": max_tokens,
                    "prompt_hash": prompt_hash,
                    "error_type": type(e).__name__,
                    "error_str": str(e)[:2000],
                    "backoff_sec": backoff,
                }
            )
            _sleep_with_jitter(backoff)

    raise RuntimeError(f"LLM call failed after {max_retries} attempts: {type(last_exc).__name__}: {last_exc}") from last_exc


# ================== TOOL DESCRIPTION GENERATION PIPELINE ==================

def tool_signature_for_cache(tool_obj: Dict[str, Any]) -> str:
    name = str(tool_obj.get("name") or tool_obj.get("function", {}).get("name") or "unnamed_tool")
    if "function" in tool_obj and isinstance(tool_obj["function"], dict):
        params = tool_obj["function"].get("parameters")
        orig_desc = tool_obj["function"].get("description") or ""
    else:
        params = tool_obj.get("parameters")
        orig_desc = tool_obj.get("description") or ""

    params_canon = json.dumps(params, ensure_ascii=False, sort_keys=True, separators=(",", ":"))
    base = f"name={name}\nparams={params_canon}\norig_desc={normalize_single_line(orig_desc)}"
    return stable_sha256(base)


def extract_tool_core(tool_obj: Dict[str, Any]) -> Tuple[str, str, Any, List[ParamSpec], List[str], List[str], Sequence[str]]:
    if "function" in tool_obj and isinstance(tool_obj["function"], dict):
        fn = tool_obj["function"]
        tool_name = str(fn.get("name") or "unnamed_tool")
        orig_desc = str(fn.get("description") or "")
        params = fn.get("parameters")
        desc_path = ("function", "description")
    else:
        tool_name = str(tool_obj.get("name") or "unnamed_tool")
        orig_desc = str(tool_obj.get("description") or "")
        params = tool_obj.get("parameters")
        desc_path = ("description",)

    param_specs = flatten_json_schema_parameters(params if isinstance(params, dict) else {})

    required_names: List[str] = []
    all_names: List[str] = []
    if isinstance(params, dict):
        props = params.get("properties")
        if isinstance(props, dict):
            all_names = [k for k in props.keys() if isinstance(k, str)]
        req = params.get("required")
        if isinstance(req, list):
            required_names = [r for r in req if isinstance(r, str)]

    return tool_name, orig_desc, params, param_specs, required_names, all_names, desc_path


def generate_description_for_tool(
    *,
    client: OpenAI,
    generation_model: str,
    mode: str,
    tool_json_obj: Dict[str, Any],
    cache: DescCache,
    audit: AuditLogger,
    temperature: float,
    max_tokens: int,
    rate_limit_sleep_sec: float,
    max_retries: int,
    repair_max_rounds: int,
) -> Tuple[str, str, List[str], str]:
    if mode not in MODES:
        raise ValueError(f"Unknown mode: {mode}")

    tool_name, orig_desc, _, param_specs, required_names, all_names, _ = extract_tool_core(tool_json_obj)
    param_context = build_param_context_for_prompt(param_specs)

    if mode == "empty_desc":
        return "", "ok", [], stable_sha256("empty_desc")

    prompt = build_generation_prompt(
        mode=mode,
        tool_name=tool_name,
        original_description=orig_desc,
        param_context=param_context,
        required_param_names=required_names,
    )
    prompt_hash = stable_sha256(prompt)

    tool_sig = tool_signature_for_cache(tool_json_obj)
    cache_key = stable_sha256(f"{generation_model}||{mode}||{tool_sig}||{prompt_hash}")

    cached = cache.get(cache_key)
    if cached is not None:
        audit.inc("llm_cache_hits", 1)
        return cached["final_output"], "cached", cached["validation_errors"], prompt_hash

    system_text = "System role: Generate tool documentation strings for function-calling APIs."

    raw = llm_call_chat_completions(
        client=client,
        model=generation_model,
        system_text=system_text,
        user_text=prompt,
        temperature=temperature,
        max_tokens=max_tokens,
        rate_limit_sleep_sec=rate_limit_sleep_sec,
        max_retries=max_retries,
        audit=audit,
        call_tag=f"generate::{mode}::{tool_name}",
    )

    candidate = normalize_single_line(raw)

    ok, errs = validate_description(
        mode,
        candidate,
        required_param_names=required_names,
        all_param_names=all_names,
    )

    status = "ok"
    final = candidate
    all_errs = list(errs)

    repaired_rounds = 0
    while not ok and repaired_rounds < repair_max_rounds:
        repaired_rounds += 1
        repair_prompt = build_repair_prompt(
            mode=mode,
            tool_name=tool_name,
            candidate_output=final,
            validation_errors=all_errs,
            param_context=param_context,
            required_param_names=required_names,
        )
        repair_raw = llm_call_chat_completions(
            client=client,
            model=generation_model,
            system_text=system_text,
            user_text=repair_prompt,
            temperature=0.0,
            max_tokens=max_tokens,
            rate_limit_sleep_sec=rate_limit_sleep_sec,
            max_retries=max_retries,
            audit=audit,
            call_tag=f"repair::{mode}::{tool_name}::round{repaired_rounds}",
        )
        final = normalize_single_line(repair_raw)
        ok, all_errs = validate_description(
            mode,
            final,
            required_param_names=required_names,
            all_param_names=all_names,
        )

    if ok and repaired_rounds > 0:
        status = "repaired"
        audit.inc("llm_repaired", 1)
    elif not ok:
        status = "rejected"
        audit.inc("llm_rejected", 1)
        # Publication-grade conservative fallback: keep original if we cannot enforce constraints.
        final = normalize_single_line(orig_desc)

    cache.put(
        key_hash=cache_key,
        generation_model=generation_model,
        mode=mode,
        tool_name=tool_name,
        tool_signature=tool_sig,
        prompt_hash=prompt_hash,
        prompt_text=prompt,
        raw_output=raw,
        final_output=final,
        status=status,
        validation_errors=all_errs,
    )

    return final, status, all_errs, prompt_hash


# ================== TOOL STRING PATCHING INSIDE OUTER JSONL LINE ==================

def parse_tool_json_string(tool_json_str: str) -> Optional[Dict[str, Any]]:
    try:
        obj = json.loads(tool_json_str)
    except Exception:
        return None
    return obj if isinstance(obj, dict) else None


def deterministic_tool_serialize(tool_obj: Dict[str, Any]) -> str:
    return json.dumps(tool_obj, ensure_ascii=False, sort_keys=True, separators=(",", ":"))


def patch_tool_description_in_tool_json_string(
    *,
    tool_json_str: str,
    new_desc: str,
    desc_path: Sequence[str],
) -> Tuple[str, bool, str]:
    return patch_json_string_field_in_place(tool_json_str, desc_path, new_desc)


def patch_outer_jsonl_line_tools(
    *,
    line: str,
    client: OpenAI,
    generation_model: str,
    mode: str,
    cache: DescCache,
    audit: AuditLogger,
    temperature: float,
    max_tokens: int,
    rate_limit_sleep_sec: float,
    max_retries: int,
    repair_max_rounds: int,
) -> Tuple[str, Dict[str, Any]]:
    stats = {
        "tools_in_line": 0,
        "tools_patched": 0,
        "tools_unchanged": 0,
        "tools_fallback_reserialized": 0,
        "tools_parse_failed": 0,
    }

    try:
        root = JsonSpanParser(line).parse()
    except Exception:
        stats["outer_parse_failed"] = True
        return line, stats

    if root.kind != "object" or not root.obj or "tools" not in root.obj:
        return line, stats

    tools_node = root.obj["tools"]
    if tools_node.kind != "array" or not tools_node.arr:
        return line, stats

    patches: List[Tuple[int, int, str]] = []

    for idx, el in enumerate(tools_node.arr):
        stats["tools_in_line"] += 1
        audit.inc("tools_seen", 1)

        if el.kind != "string":
            stats["tools_unchanged"] += 1
            audit.inc("tools_unchanged", 1)
            audit.log_event(
                {
                    "ts": now_unix(),
                    "event": "tool_entry_not_string_skipped",
                    "mode": mode,
                    "tool_index_in_line": idx,
                    "kind": el.kind,
                }
            )
            continue

        tool_json_str = el.value
        tool_obj = parse_tool_json_string(tool_json_str)
        if tool_obj is None:
            stats["tools_parse_failed"] += 1
            audit.inc("tools_parse_failed", 1)
            audit.log_event(
                {
                    "ts": now_unix(),
                    "event": "tool_string_parse_failed",
                    "mode": mode,
                    "tool_index_in_line": idx,
                }
            )
            stats["tools_unchanged"] += 1
            audit.inc("tools_unchanged", 1)
            continue

        tool_name, orig_desc, _, param_specs, required_names, all_names, desc_path = extract_tool_core(tool_obj)

        new_desc, status, val_errs, prompt_hash = generate_description_for_tool(
            client=client,
            generation_model=generation_model,
            mode=mode,
            tool_json_obj=tool_obj,
            cache=cache,
            audit=audit,
            temperature=temperature,
            max_tokens=max_tokens,
            rate_limit_sleep_sec=rate_limit_sleep_sec,
            max_retries=max_retries,
            repair_max_rounds=repair_max_rounds,
        )

        patched_inner, did_patch, reason = patch_tool_description_in_tool_json_string(
            tool_json_str=tool_json_str,
            new_desc=new_desc,
            desc_path=desc_path,
        )

        if not did_patch:
            stats["tools_fallback_reserialized"] += 1
            audit.inc("tools_fallback_reserialized", 1)

            if len(desc_path) == 2 and desc_path[0] == "function":
                if isinstance(tool_obj.get("function"), dict):
                    tool_obj["function"]["description"] = new_desc
            else:
                tool_obj["description"] = new_desc
            patched_inner = deterministic_tool_serialize(tool_obj)

            audit.log_fallback(
                {
                    "ts": now_unix(),
                    "event": "inplace_patch_failed_fallback_reserialize",
                    "mode": mode,
                    "tool_name": tool_name,
                    "reason": reason,
                    "generation_status": status,
                    "validation_errors": val_errs,
                    "prompt_hash": prompt_hash,
                }
            )

        if normalize_single_line(orig_desc) == normalize_single_line(new_desc) and did_patch:
            stats["tools_unchanged"] += 1
            audit.inc("tools_unchanged", 1)
        else:
            stats["tools_patched"] += 1
            audit.inc("tools_patched", 1)

        replacement_outer = '"' + json_escape_string(patched_inner) + '"'
        patches.append((el.start, el.end, replacement_outer))

        audit.log_event(
            {
                "ts": now_unix(),
                "event": "tool_processed",
                "mode": mode,
                "tool_index_in_line": idx,
                "tool_name": tool_name,
                "generation_model": generation_model,
                "generation_status": status,
                "inplace_patch": did_patch,
                "inplace_patch_reason": reason,
                "original_desc_preview": normalize_single_line(orig_desc)[:300],
                "new_desc_preview": normalize_single_line(new_desc)[:300],
                "validation_errors": val_errs[:50],  # keep bounded
            }
        )

    if not patches:
        return line, stats

    patches.sort(key=lambda x: x[0], reverse=True)
    out = line
    for start, end, repl in patches:
        out = out[:start] + repl + out[end:]

    return out, stats


# ================== JSONL VARIANT GENERATION ==================

def build_variant_jsonl_with_llm(
    *,
    client: OpenAI,
    generation_model: str,
    input_path: str,
    output_path: str,
    mode: str,
    output_dir: str,
    temperature: float,
    max_tokens: int,
    rate_limit_sleep_sec: float,
    max_retries: int,
    repair_max_rounds: int,
    seed: int,
    overwrite: bool,
) -> None:
    if mode not in MODES:
        raise ValueError(f"Unknown mode in build_variant_jsonl_with_llm: {mode}")

    random.seed(seed)

    ensure_dir(output_dir)
    audit_dir = os.path.join(output_dir, AUDIT_DIR_NAME, f"{sanitize_model_name_for_path(generation_model)}__{mode}")
    ensure_dir(audit_dir)

    cache_db = os.path.join(output_dir, CACHE_DB_NAME)
    cache = DescCache(cache_db)
    audit = AuditLogger(audit_dir)

    if os.path.exists(output_path) and not overwrite:
        raise FileExistsError(f"Output exists and overwrite=False: {output_path}")

    with open(input_path, "r", encoding="utf-8") as fin, open(output_path, "w", encoding="utf-8") as fout:
        for line in fin:
            if not line.strip():
                continue
            audit.inc("lines_seen", 1)

            patched_line, stats = patch_outer_jsonl_line_tools(
                line=line.rstrip("\n"),
                client=client,
                generation_model=generation_model,
                mode=mode,
                cache=cache,
                audit=audit,
                temperature=temperature,
                max_tokens=max_tokens,
                rate_limit_sleep_sec=rate_limit_sleep_sec,
                max_retries=max_retries,
                repair_max_rounds=repair_max_rounds,
            )

            fout.write(patched_line + "\n")
            audit.inc("lines_written", 1)

            audit.log_event(
                {
                    "ts": now_unix(),
                    "event": "line_processed",
                    "mode": mode,
                    "tools_in_line": stats.get("tools_in_line", 0),
                    "tools_patched_in_line": stats.get("tools_patched", 0),
                    "tools_fallback_reserialized_in_line": stats.get("tools_fallback_reserialized", 0),
                }
            )

    audit.flush_summary(
        extra={
            "input_path": input_path,
            "output_path": output_path,
            "mode": mode,
            "generation_model": generation_model,
            "temperature": temperature,
            "max_tokens": max_tokens,
            "rate_limit_sleep_sec": rate_limit_sleep_sec,
            "max_retries": max_retries,
            "repair_max_rounds": repair_max_rounds,
            "seed": seed,
        }
    )

    cache.close()
    print(f"[{mode}] Wrote {output_path}")
    print(f"[{mode}] Audit directory: {audit_dir}")
    print(f"[{mode}] Cache DB: {cache_db}")


# ================== MULTI-VARIANT ENTRYPOINT ==================

def run_when2call_variants(
    *,
    client: OpenAI,
    generation_model: str,
    input_jsonl: str = DEFAULT_INPUT_JSONL,
    output_dir: str = DEFAULT_OUTPUT_DIR,
    modes: Optional[List[str]] = None,
    overwrite: bool = False,
    temperature: float = DEFAULT_TEMPERATURE,
    max_tokens: int = DEFAULT_MAX_TOKENS,
    rate_limit_sleep_sec: float = DEFAULT_RATE_LIMIT_SLEEP_SEC,
    max_retries: int = DEFAULT_MAX_RETRIES,
    repair_max_rounds: int = DEFAULT_REPAIR_MAX_ROUNDS,
    seed: int = DEFAULT_RANDOM_SEED,
) -> None:
    if not os.path.exists(input_jsonl):
        raise FileNotFoundError(f"Input file not found: {input_jsonl}")

    ensure_dir(output_dir)

    active_modes = modes if modes is not None else list(MODES)

    unknown = [m for m in active_modes if m not in MODES]
    if unknown:
        raise ValueError(f"Unknown modes requested: {unknown}. Allowed: {MODES}")

    base_name = os.path.splitext(os.path.basename(input_jsonl))[0]
    model_tag = sanitize_model_name_for_path(generation_model)

    print(f"Input: {input_jsonl}")
    print(f"Output dir: {output_dir}")
    print(f"Generator model: {generation_model}")
    print(f"Modes: {active_modes}")
    print(f"Temperature: {temperature} | max_tokens: {max_tokens} | retries: {max_retries} | repair_rounds: {repair_max_rounds}")

    for mode in active_modes:
        out_name = f"{base_name}__gen-{model_tag}__mode-{mode}.jsonl"
        out_path = os.path.join(output_dir, out_name)

        if os.path.exists(out_path) and not overwrite:
            print(f"[{mode}] Skipped (exists, overwrite=False): {out_path}")
            continue

        build_variant_jsonl_with_llm(
            client=client,
            generation_model=generation_model,
            input_path=input_jsonl,
            output_path=out_path,
            mode=mode,
            output_dir=output_dir,
            temperature=temperature,
            max_tokens=max_tokens,
            rate_limit_sleep_sec=rate_limit_sleep_sec,
            max_retries=max_retries,
            repair_max_rounds=repair_max_rounds,
            seed=seed,
            overwrite=overwrite,
        )


# ================== CLI ==================

# def _parse_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace:
#     p = argparse.ArgumentParser(description="When2Call tool-description variant generator (5 reviewer-grade modes).")
#     p.add_argument("--input", default=DEFAULT_INPUT_JSONL, help="Input JSONL path.")
#     p.add_argument("--output-dir", default=DEFAULT_OUTPUT_DIR, help="Output directory.")
#     p.add_argument("--generation-model", required=True, help="Model name for the JRC gateway (OpenAI-compatible).")
#     p.add_argument(
#         "--modes",
#         nargs="*",
#         default=None,
#         help=f"Subset of modes to run. Allowed: {MODES}. If omitted, runs all.",
#     )
#     p.add_argument("--overwrite", action="store_true", help="Overwrite existing outputs.")
#     p.add_argument("--temperature", type=float, default=DEFAULT_TEMPERATURE)
#     p.add_argument("--max-tokens", type=int, default=DEFAULT_MAX_TOKENS)
#     p.add_argument("--rate-limit-sleep-sec", type=float, default=DEFAULT_RATE_LIMIT_SLEEP_SEC)
#     p.add_argument("--max-retries", type=int, default=DEFAULT_MAX_RETRIES)
#     p.add_argument("--repair-max-rounds", type=int, default=DEFAULT_REPAIR_MAX_ROUNDS)
#     p.add_argument("--seed", type=int, default=DEFAULT_RANDOM_SEED)
#     return p.parse_args(argv)


# def main(argv: Optional[Sequence[str]] = None) -> int:
#     args = _parse_args(argv)
#     client = make_jrc_client()
#     run_when2call_variants(
#         client=client,
#         generation_model=args.generation_model,
#         input_jsonl=args.input,
#         output_dir=args.output_dir,
#         modes=args.modes,
#         overwrite=args.overwrite,
#         temperature=args.temperature,
#         max_tokens=args.max_tokens,
#         rate_limit_sleep_sec=args.rate_limit_sleep_sec,
#         max_retries=args.max_retries,
#         repair_max_rounds=args.repair_max_rounds,
#         seed=args.seed,
#     )
#     return 0


# if __name__ == "__main__":
#     raise SystemExit(main())


# ================== NOTEBOOK SNIPPET (OPTIONAL) ==================

NB_INPUT_JSONL = DEFAULT_INPUT_JSONL
NB_OUTPUT_DIR = DEFAULT_OUTPUT_DIR
NB_GENERATION_MODEL = "gpt-oss-120b"
NB_MODES = ["style_concise", "style_verbose", "add_examples"]  # or None to run all MODES
NB_OVERWRITE = False

NB_TEMPERATURE = DEFAULT_TEMPERATURE
NB_MAX_TOKENS = DEFAULT_MAX_TOKENS
NB_RATE_LIMIT_SLEEP_SEC = DEFAULT_RATE_LIMIT_SLEEP_SEC
NB_MAX_RETRIES = DEFAULT_MAX_RETRIES
NB_REPAIR_MAX_ROUNDS = DEFAULT_REPAIR_MAX_ROUNDS
NB_SEED = DEFAULT_RANDOM_SEED

NB_CLIENT = make_jrc_client()
run_when2call_variants(
    client=NB_CLIENT,
    generation_model=NB_GENERATION_MODEL,
    input_jsonl=NB_INPUT_JSONL,
    output_dir=NB_OUTPUT_DIR,
    modes=NB_MODES,
    overwrite=NB_OVERWRITE,
    temperature=NB_TEMPERATURE,
    max_tokens=NB_MAX_TOKENS,
    rate_limit_sleep_sec=NB_RATE_LIMIT_SLEEP_SEC,
    max_retries=NB_MAX_RETRIES,
    repair_max_rounds=NB_REPAIR_MAX_ROUNDS,
    seed=NB_SEED,
)


Input: When2Call/data/test/when2call_test_llm_judge.jsonl
Output dir: when2call_local_variants
Generator model: gpt-oss-120b
Modes: ['style_concise', 'style_verbose', 'add_examples']
Temperature: 0.0 | max_tokens: 256 | retries: 8 | repair_rounds: 3
[style_concise] Wrote when2call_local_variants/when2call_test_llm_judge__gen-gpt-oss-120b__mode-style_concise.jsonl
[style_concise] Audit directory: when2call_local_variants/audit/gpt-oss-120b__style_concise
[style_concise] Cache DB: when2call_local_variants/tool_desc_cache.sqlite3
[style_verbose] Wrote when2call_local_variants/when2call_test_llm_judge__gen-gpt-oss-120b__mode-style_verbose.jsonl
[style_verbose] Audit directory: when2call_local_variants/audit/gpt-oss-120b__style_verbose
[style_verbose] Cache DB: when2call_local_variants/tool_desc_cache.sqlite3
[add_examples] Wrote when2call_local_variants/when2call_test_llm_judge__gen-gpt-oss-120b__mode-add_examples.jsonl
[add_examples] Audit directory: when2call_local_variants/audit/gpt-oss

In [2]:
import os, json, glob
import pandas as pd

# ====== CONFIG ======
OUTPUT_DIR = "when2call_local_variants"   # <-- cambia se serve
# Se vuoi filtrare per un solo modello, metti una stringa (es. "gpt-oss-120b"), altrimenti None
MODEL_FILTER = None

# ====== LOAD ALL variant_summary.json ======
summary_paths = glob.glob(os.path.join(OUTPUT_DIR, "audit", "*__*", "variant_summary.json"))

rows = []
for path in summary_paths:
    with open(path, "r", encoding="utf-8") as f:
        s = json.load(f)

    # audit/<model_tag>__<mode>/variant_summary.json
    audit_leaf = os.path.basename(os.path.dirname(path))
    if "__" in audit_leaf:
        model_tag, mode = audit_leaf.split("__", 1)
    else:
        model_tag, mode = audit_leaf, s.get("mode", "")

    if MODEL_FILTER and MODEL_FILTER not in model_tag:
        continue

    tools_seen = int(s.get("tools_seen", 0))
    tools_patched = int(s.get("tools_patched", 0))
    tools_unchanged = int(s.get("tools_unchanged", 0))
    tools_parse_failed = int(s.get("tools_parse_failed", 0))
    tools_fallback = int(s.get("tools_fallback_reserialized", 0))

    llm_calls = int(s.get("llm_calls", 0))
    llm_repaired = int(s.get("llm_repaired", 0))
    llm_rejected = int(s.get("llm_rejected", 0))
    llm_cache_hits = int(s.get("llm_cache_hits", 0))

    def pct(num, den):
        return 0.0 if den <= 0 else 100.0 * num / den

    rows.append({
        "model_tag": model_tag,
        "mode": mode,
        "tools_seen": tools_seen,
        "tools_patched": tools_patched,
        "tools_unchanged": tools_unchanged,
        "tools_parse_failed": tools_parse_failed,
        "tools_fallback_reserialized": tools_fallback,
        "llm_calls": llm_calls,
        "llm_cache_hits": llm_cache_hits,
        "llm_repaired": llm_repaired,
        "llm_rejected": llm_rejected,
        "%tools_patched": pct(tools_patched, tools_seen),
        "%tools_unchanged": pct(tools_unchanged, tools_seen),
        "%tools_parse_failed": pct(tools_parse_failed, tools_seen),
        "%tools_fallback_reserialized": pct(tools_fallback, tools_seen),
        "%rejected_over_tools_seen": pct(llm_rejected, tools_seen),
        "%rejected_over_llm_calls": pct(llm_rejected, llm_calls),
    })

df = pd.DataFrame(rows)
if df.empty:
    print("Nessun variant_summary.json trovato. Controlla OUTPUT_DIR e che il run abbia scritto i log.")
else:
    # Ordinamento utile: per modello e per mode
    df = df.sort_values(["model_tag", "mode"]).reset_index(drop=True)

    # Vista compatta (puoi rimuovere colonne se vuoi)
    display_cols = [
        "model_tag","mode",
        "tools_seen","%tools_patched","%tools_unchanged",
        "%tools_parse_failed","%tools_fallback_reserialized",
        "llm_calls","llm_cache_hits","llm_repaired","llm_rejected",
        "%rejected_over_llm_calls",
    ]
    display(df[display_cols])


Unnamed: 0,model_tag,mode,tools_seen,%tools_patched,%tools_unchanged,%tools_parse_failed,%tools_fallback_reserialized,llm_calls,llm_cache_hits,llm_repaired,llm_rejected,%rejected_over_llm_calls
0,gpt-oss-120b,add_examples,978,0.511247,99.488753,0.0,0.0,1903,501,3,474,24.90804
1,gpt-oss-120b,style_concise,978,6.339468,93.660532,0.0,0.0,1877,501,24,453,24.134257
2,gpt-oss-120b,style_verbose,978,3.374233,96.625767,0.0,0.0,1888,501,18,459,24.311441


In [3]:
import os, glob

# ====== CONFIG ======
ORIGINAL_JSONL = "When2Call/data/test/when2call_test_llm_judge.jsonl"
VARIANTS_DIR = "when2call_local_variants"   # dove scrivi i *.jsonl variant
# opzionale: filtra solo i variant di un certo model tag o mode con una substring
FILTER_SUBSTR = None   # es: "mode-style_concise" oppure "gen-gpt-oss-120b"

def count_identical_lines(original_path: str, variant_path: str):
    same = 0
    total = 0
    with open(original_path, "r", encoding="utf-8") as fo, open(variant_path, "r", encoding="utf-8") as fv:
        for lo, lv in zip(fo, fv):
            total += 1
            if lo.rstrip("\n") == lv.rstrip("\n"):
                same += 1

        # se i file hanno lunghezze diverse, contiamo anche quello (dovrebbero essere uguali)
        extra_o = sum(1 for _ in fo)
        extra_v = sum(1 for _ in fv)
        total += extra_o  # se original più lungo
        # per extra_v non possiamo confrontare, quindi non incrementiamo same

    return same, total

variant_paths = sorted(glob.glob(os.path.join(VARIANTS_DIR, "*.jsonl")))
if FILTER_SUBSTR:
    variant_paths = [p for p in variant_paths if FILTER_SUBSTR in os.path.basename(p)]

if not variant_paths:
    print("Nessun file .jsonl variant trovato. Controlla VARIANTS_DIR / FILTER_SUBSTR.")
else:
    print(f"Original: {ORIGINAL_JSONL}")
    for vp in variant_paths:
        same, total = count_identical_lines(ORIGINAL_JSONL, vp)
        pct = 0.0 if total == 0 else 100.0 * same / total
        print(f"{os.path.basename(vp)}  identical_examples={same}/{total} ({pct:.2f}%)")


Original: When2Call/data/test/when2call_test_llm_judge.jsonl
when2call_test_llm_judge__gen-gpt-oss-120b__mode-add_examples.jsonl  identical_examples=289/300 (96.33%)
when2call_test_llm_judge__gen-gpt-oss-120b__mode-style_concise.jsonl  identical_examples=245/300 (81.67%)
when2call_test_llm_judge__gen-gpt-oss-120b__mode-style_verbose.jsonl  identical_examples=272/300 (90.67%)
when2call_test_llm_judge_llama-3.3-70b-instruct_no_desc.jsonl  identical_examples=15/300 (5.00%)
when2call_test_llm_judge_llama-3.3-70b-instruct_short_label.jsonl  identical_examples=15/300 (5.00%)
when2call_test_llm_judge_llama-3.3-70b-instruct_verbose_examples.jsonl  identical_examples=15/300 (5.00%)


In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
When2Call Tool-Description Variant Generator
(Reproducible, Auditable, Minimal-Edit, Judge-Assisted, Dataset-Perturbation Oriented)

This script generates controlled dataset variants by modifying ONLY the tool description
string(s) inside the `tools` field of each JSONL example, while preserving the original
JSONL line formatting as much as technically feasible (i.e., without re-serializing the
outer JSON object). The tool JSON strings inside `tools` are patched in-place at the
exact JSON string-span level, so unrelated keys remain unchanged unless a fallback
path is triggered and logged.

Key properties:
- Minimal-diff patching on JSON spans (outer JSONL not re-serialized).
- Deterministic generation defaults (temperature=0, normalized output).
- Hard validators + guided repair + optional judge pass.
- Persistent SQLite cache keyed by pipeline + models + mode + tool signature.
- Full audit trail (calls, events, fallbacks) + manipulation checks outputs.

Important change vs "robustness" confound-controls:
- For dataset perturbation, we DO NOT ban parameter-name tokens in robustness modes.
  (Banning top-level param tokens like date/city/name/time causes excessive false failures.)
- We DO keep decision-boundary / label-leakage phrase bans for robustness modes.
- We shift prompts to "meaning-preserving paraphrase of ORIGINAL_DESCRIPTION" (minimal edit),
  to avoid template-y boilerplate like "The <tool> tool ...".

Modes (MAX 5):
1) empty_desc: description="".
2) style_concise: meaning-preserving paraphrase of ORIGINAL_DESCRIPTION, concise, no examples, no normative.
3) style_verbose: meaning-preserving paraphrase of ORIGINAL_DESCRIPTION, verbose, no examples, no normative.
4) add_examples: meaning-preserving paraphrase + 1-2 conceptual examples (must include 'For example,'), no normative.
5) normative_injection: meaning-preserving paraphrase + guidance + 1-2 examples (may include leakage by design).

Environment:
- TOKEN_JRC must be set.
"""

from __future__ import annotations

import dataclasses
import difflib
import hashlib
import json
import math
import os
import random
import re
import sqlite3
import time
from typing import Any, Dict, List, Optional, Sequence, Tuple

from openai import OpenAI
import copy


# ================== CONFIGURATION ==================

PIPELINE_VERSION = "5modes_judge_v3_dataset_perturbation_minimal_edit_with_checks"

DEFAULT_TEMPERATURE = 0.0
DEFAULT_MAX_TOKENS = 256
DEFAULT_RATE_LIMIT_SLEEP_SEC = 0.0
DEFAULT_MAX_RETRIES = 8
DEFAULT_REPAIR_MAX_ROUNDS = 3
DEFAULT_JUDGE_MAX_ROUNDS = 1  # bounded for auditability

DESCRIPTION_MAX_CHARS = 1200

# Persistent cache / audit
CACHE_DB_NAME = "tool_desc_cache.sqlite3"
AUDIT_DIR_NAME = "audit"
AUDIT_CALLS_JSONL = "generation_calls.jsonl"
AUDIT_EVENTS_JSONL = "variant_events.jsonl"
AUDIT_SUMMARY_JSON = "variant_summary.json"
AUDIT_FALLBACKS_JSONL = "fallback_events.jsonl"

# Manipulation checks outputs
AUDIT_MANIPULATION_JSONL = "manipulation_checks.jsonl"
AUDIT_MANIPULATION_SUMMARY_JSON = "manipulation_checks_summary.json"
AUDIT_MANIPULATION_TABLE_MD = "manipulation_checks_table.md"

DEFAULT_RANDOM_SEED = 1337


# ================== SIMILARITY GATE (REVISED: BAND-PASS) ==================

ENABLE_SIMILARITY_GATE = True
ENABLE_SIMILARITY_UPPER_BOUND = True  # NEW: block near-identical outputs

# Composite similarity must be within [min, max] (0..1)
SIMILARITY_BAND_BY_MODE = {
    "empty_desc": (0.0, 1.0),            # not applicable
    "style_concise": (0.80, 0.999),      # upper bound ignored in robustness modes (see gate)
    "style_verbose": (0.72, 0.999),
    "add_examples": (0.78, 0.999),       # computed on base (pre "For example,")
    "normative_injection": (0.74, 0.999) # computed on base (pre "For example,")
}


ENABLE_RULE_BASED_RECOVERY = True



# ================== MODES ==================

ROBUSTNESS_MODES: List[str] = [
    "empty_desc",
    "style_concise",
    "style_verbose",
    "add_examples",
]
INTERVENTION_MODES: List[str] = [
    "normative_injection",
]
MODES: List[str] = ROBUSTNESS_MODES + INTERVENTION_MODES


# ================== CLIENT ==================

def make_jrc_client() -> OpenAI:
    token = os.environ.get("TOKEN_JRC")
    if not token:
        raise RuntimeError("TOKEN_JRC environment variable is not set.")
    return OpenAI(api_key=token, base_url="https://api-gpt.jrc.ec.europa.eu/v1")


# ================== SMALL UTILITIES ==================

def sanitize_model_name_for_path(model_name: str) -> str:
    allowed = set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_.")
    return "".join(ch if ch in allowed else "-" for ch in model_name)


def stable_sha256(text: str) -> str:
    return hashlib.sha256(text.encode("utf-8")).hexdigest()


def json_escape_string(value: str) -> str:
    return json.dumps(value, ensure_ascii=False)[1:-1]


def normalize_single_line(text: str) -> str:
    return " ".join((text or "").split()).strip()


def now_unix() -> float:
    return time.time()


def ensure_dir(path: str) -> None:
    os.makedirs(path, exist_ok=True)


def _dedupe_preserve_order(items: Sequence[str]) -> List[str]:
    seen = set()
    out: List[str] = []
    for x in items:
        if x not in seen:
            seen.add(x)
            out.append(x)
    return out


def _mean(xs: Sequence[float]) -> float:
    return float(sum(xs) / len(xs)) if xs else 0.0


def _std(xs: Sequence[float]) -> float:
    if len(xs) < 2:
        return 0.0
    mu = _mean(xs)
    var = sum((x - mu) ** 2 for x in xs) / (len(xs) - 1)
    return float(math.sqrt(var))


# ================== JSON SPAN PARSER (MINIMAL-EDIT PATCHING) ==================

@dataclasses.dataclass
class JsonNode:
    kind: str  # "object" | "array" | "string" | "number" | "true" | "false" | "null"
    start: int
    end: int
    value: Any = None
    obj: Optional[Dict[str, "JsonNode"]] = None
    arr: Optional[List["JsonNode"]] = None
    string_char_spans: Optional[List[Tuple[int, int]]] = None  # NEW



class JsonSpanParseError(Exception):
    pass


class JsonSpanParser:
    """
    JSON parser that returns decoded values AND exact source spans.
    Used to patch JSON string fields in-place without re-serializing the outer JSON line.
    """

    def __init__(self, s: str):
        self.s = s
        self.n = len(s)

    def parse(self) -> JsonNode:
        i = self._skip_ws(0)
        node, j = self._parse_value(i)
        j = self._skip_ws(j)
        if j != self.n:
            raise JsonSpanParseError(f"Trailing content at index {j}")
        return node

    def _skip_ws(self, i: int) -> int:
        s = self.s
        n = self.n
        while i < n and s[i] in " \t\r\n":
            i += 1
        return i

    def _parse_value(self, i: int) -> Tuple[JsonNode, int]:
        i = self._skip_ws(i)
        if i >= self.n:
            raise JsonSpanParseError("Unexpected end of input")

        ch = self.s[i]
        if ch == "{":
            return self._parse_object(i)
        if ch == "[":
            return self._parse_array(i)
        if ch == '"':
            return self._parse_string(i)
        if ch == "-" or ch.isdigit():
            return self._parse_number(i)
        if self.s.startswith("true", i):
            return JsonNode(kind="true", start=i, end=i + 4, value=True), i + 4
        if self.s.startswith("false", i):
            return JsonNode(kind="false", start=i, end=i + 5, value=False), i + 5
        if self.s.startswith("null", i):
            return JsonNode(kind="null", start=i, end=i + 4, value=None), i + 4

        raise JsonSpanParseError(f"Unexpected token at index {i}: {ch!r}")

    def _parse_object(self, i: int) -> Tuple[JsonNode, int]:
        s = self.s
        if s[i] != "{":
            raise JsonSpanParseError("Expected '{'")
        start = i
        i += 1
        i = self._skip_ws(i)

        obj: Dict[str, JsonNode] = {}

        if i < self.n and s[i] == "}":
            end = i + 1
            return JsonNode(kind="object", start=start, end=end, obj=obj, value={}), end

        while True:
            i = self._skip_ws(i)
            if i >= self.n or s[i] != '"':
                raise JsonSpanParseError(f"Expected object key string at index {i}")
            key_node, i = self._parse_string(i)
            key = key_node.value

            i = self._skip_ws(i)
            if i >= self.n or s[i] != ":":
                raise JsonSpanParseError(f"Expected ':' after key at index {i}")
            i += 1

            val_node, i = self._parse_value(i)
            obj[key] = val_node

            i = self._skip_ws(i)
            if i >= self.n:
                raise JsonSpanParseError("Unexpected end in object")
            if s[i] == "}":
                end = i + 1
                return JsonNode(kind="object", start=start, end=end, obj=obj, value=None), end
            if s[i] != ",":
                raise JsonSpanParseError(f"Expected ',' or '}}' at index {i}")
            i += 1

    def _parse_array(self, i: int) -> Tuple[JsonNode, int]:
        s = self.s
        if s[i] != "[":
            raise JsonSpanParseError("Expected '['")
        start = i
        i += 1
        i = self._skip_ws(i)

        arr: List[JsonNode] = []

        if i < self.n and s[i] == "]":
            end = i + 1
            return JsonNode(kind="array", start=start, end=end, arr=arr, value=[]), end

        while True:
            val_node, i = self._parse_value(i)
            arr.append(val_node)

            i = self._skip_ws(i)
            if i >= self.n:
                raise JsonSpanParseError("Unexpected end in array")
            if s[i] == "]":
                end = i + 1
                return JsonNode(kind="array", start=start, end=end, arr=arr, value=None), end
            if s[i] != ",":
                raise JsonSpanParseError(f"Expected ',' or ']' at index {i}")
            i += 1
            i = self._skip_ws(i)

    def _parse_string(self, i: int) -> Tuple[JsonNode, int]:
        s = self.s
        if s[i] != '"':
            raise JsonSpanParseError('Expected \'"\'')
        start = i
        i += 1
        out_chars: List[str] = []

        while i < self.n:
            ch = s[i]

            if ch == '"':
                end = i + 1
                # string_char_spans not used elsewhere; keep it None for safety.
                return JsonNode(kind="string", start=start, end=end, value="".join(out_chars), string_char_spans=None), end

            if ch == "\\":
                i += 1
                if i >= self.n:
                    raise JsonSpanParseError("Invalid escape at end of string")
                esc = s[i]

                if esc in '"\\/':
                    out_chars.append(esc)
                elif esc == "b":
                    out_chars.append("\b")
                elif esc == "f":
                    out_chars.append("\f")
                elif esc == "n":
                    out_chars.append("\n")
                elif esc == "r":
                    out_chars.append("\r")
                elif esc == "t":
                    out_chars.append("\t")
                elif esc == "u":
                    if i + 4 >= self.n:
                        raise JsonSpanParseError("Invalid unicode escape (truncated)")
                    hex_part = s[i + 1 : i + 5]
                    try:
                        codepoint = int(hex_part, 16)
                    except ValueError as e:
                        raise JsonSpanParseError("Invalid unicode escape") from e
                    out_chars.append(chr(codepoint))
                    i += 4
                else:
                    raise JsonSpanParseError(f"Invalid escape sequence: \\{esc}")

            else:
                # JSON forbids raw control chars 0x00-0x1F inside strings
                if ord(ch) < 0x20:
                    raise JsonSpanParseError(f"Unescaped control character in string at index {i}")
                out_chars.append(ch)

            i += 1

        raise JsonSpanParseError("Unterminated string")

    def _parse_number(self, i: int) -> Tuple[JsonNode, int]:
        s = self.s
        start = i
        n = self.n

        if s[i] == "-":
            i += 1
            if i >= n:
                raise JsonSpanParseError("Invalid number '-'")

        if i < n and s[i] == "0":
            i += 1
        else:
            if i >= n or not s[i].isdigit():
                raise JsonSpanParseError("Invalid number")
            while i < n and s[i].isdigit():
                i += 1

        if i < n and s[i] == ".":
            i += 1
            if i >= n or not s[i].isdigit():
                raise JsonSpanParseError("Invalid fraction")
            while i < n and s[i].isdigit():
                i += 1

        if i < n and s[i] in "eE":
            i += 1
            if i < n and s[i] in "+-":
                i += 1
            if i >= n or not s[i].isdigit():
                raise JsonSpanParseError("Invalid exponent")
            while i < n and s[i].isdigit():
                i += 1

        end = i
        num_text = s[start:end]
        try:
            val: Any
            if "." in num_text or "e" in num_text or "E" in num_text:
                val = float(num_text)
            else:
                val = int(num_text)
        except ValueError:
            val = num_text

        return JsonNode(kind="number", start=start, end=end, value=val), end

def escape_for_json_string_literal_minimal(s: str) -> str:
    out = []
    for ch in s:
        o = ord(ch)
        if ch == '"':
            out.append(r'\"')
        elif ch == '\\':
            out.append(r'\\')
        elif ch == '\b':
            out.append(r'\b')
        elif ch == '\f':
            out.append(r'\f')
        elif ch == '\n':
            out.append(r'\n')
        elif ch == '\r':
            out.append(r'\r')
        elif ch == '\t':
            out.append(r'\t')
        elif o < 0x20:
            out.append(f'\\u{o:04x}')
        else:
            out.append(ch)
    return "".join(out)

def patch_json_field_value_as_string_literal_in_place(
    src_json: str,
    path: Sequence[str],
    new_string_value: str,
) -> Tuple[str, bool, str]:
    """
    Like patch_json_string_field_in_place, but replaces the target node span
    with a JSON string literal even if the existing node is not a string (e.g., null).
    """
    try:
        root = JsonSpanParser(src_json).parse()
    except Exception as e:
        return src_json, False, f"parse_error:{type(e).__name__}"

    if root.kind != "object" or not root.obj:
        return src_json, False, "root_not_object"

    node = root
    for k in path:
        if node.kind != "object" or not node.obj or k not in node.obj:
            return src_json, False, "path_missing"
        node = node.obj[k]

    replacement = '"' + json_escape_string(new_string_value) + '"'
    patched = src_json[: node.start] + replacement + src_json[node.end :]
    return patched, True, "patched_value_as_string_literal"


def insert_json_string_field_into_object_text_in_place(
    src_json: str,
    obj_node: JsonNode,
    key: str,
    value: str,
) -> Tuple[str, bool, str]:
    """
    Insert a `"key":"value"` pair into an object node span, minimal-format.
    Does not re-serialize the object; inserts just before the closing `}`.
    """
    if obj_node.kind != "object":
        return src_json, False, "target_not_object"

    if obj_node.start < 0 or obj_node.end > len(src_json) or obj_node.end <= obj_node.start:
        return src_json, False, "object_span_out_of_bounds"

    # Determine if object is empty by checking last non-ws before closing brace.
    close_brace = obj_node.end - 1
    j = close_brace - 1
    while j > obj_node.start and src_json[j] in " \t\r\n":
        j -= 1

    # Build insertion text (minimal, compact)
    pair = '"' + json_escape_string(key) + '":"' + json_escape_string(value) + '"'

    if src_json[j] == "{":
        # Empty object: {<pair>}
        insert_text = pair
        insert_pos = close_brace  # before '}'
        patched = src_json[:insert_pos] + insert_text + src_json[insert_pos:]
        return patched, True, "inserted_field_into_empty_object"

    # Non-empty object: {...,<pair>}
    insert_text = "," + pair
    insert_pos = close_brace
    patched = src_json[:insert_pos] + insert_text + src_json[insert_pos:]
    return patched, True, "inserted_field_into_nonempty_object"

def patch_or_insert_tool_description_in_inner_json_text(
    *,
    tool_json_str: str,
    new_desc: str,
    desc_path: Sequence[str],
) -> Tuple[str, bool, str]:
    """
    Inner-JSON micro-fallback:
    - If desc exists: replace its value span with a JSON string literal (even if non-string).
    - If missing: insert `"description":"..."` into the target object (root or function object).
    """
    try:
        root = JsonSpanParser(tool_json_str).parse()
    except Exception as e:
        return tool_json_str, False, f"inner_parse_error:{type(e).__name__}"

    if root.kind != "object" or not root.obj:
        return tool_json_str, False, "inner_root_not_object"

    # If path points to function.description, target object is function (if exists), else root.
    # If path is just description, target object is root.
    if len(desc_path) == 2 and desc_path[0] == "function" and desc_path[1] == "description":
        fn_node = root.obj.get("function")
        if fn_node is None:
            # No function object: cannot insert under function; fail (caller may reserialize)
            return tool_json_str, False, "inner_missing_function_object"
        if fn_node.kind != "object":
            return tool_json_str, False, "inner_function_not_object"

        # If description exists under function: replace value span (any kind -> string literal)
        if fn_node.obj and "description" in fn_node.obj:
            return patch_json_field_value_as_string_literal_in_place(
                tool_json_str, ("function", "description"), new_desc
            )

        # Otherwise insert into function object span
        return insert_json_string_field_into_object_text_in_place(
            tool_json_str, fn_node, "description", new_desc
        )

    # Root-level description
    if len(desc_path) == 1 and desc_path[0] == "description":
        if root.obj and "description" in root.obj:
            return patch_json_field_value_as_string_literal_in_place(
                tool_json_str, ("description",), new_desc
            )
        return insert_json_string_field_into_object_text_in_place(
            tool_json_str, root, "description", new_desc
        )

    return tool_json_str, False, "inner_unsupported_desc_path"


def patch_json_string_field_in_place(
    src_json: str,
    path: Sequence[str],
    new_string_value: str,
) -> Tuple[str, bool, str]:
    """
    Patch a JSON string field at a given object-path, modifying only the value span.
    """
    try:
        root = JsonSpanParser(src_json).parse()
    except Exception as e:
        return src_json, False, f"parse_error:{type(e).__name__}"

    if root.kind != "object" or not root.obj:
        return src_json, False, "root_not_object"

    node = root
    for k in path:
        if node.kind != "object" or not node.obj or k not in node.obj:
            return src_json, False, "path_missing"
        node = node.obj[k]

    if node.kind != "string":
        return src_json, False, "target_not_string"

    escaped = json_escape_string(new_string_value)
    replacement = '"' + escaped + '"'
    patched = src_json[: node.start] + replacement + src_json[node.end :]
    return patched, True, "patched"


def find_node_by_key_path(root: JsonNode, path: Sequence[str]) -> Optional[JsonNode]:
    node = root
    for k in path:
        if node.kind != "object" or not node.obj or k not in node.obj:
            return None
        node = node.obj[k]
    return node



# ================== SCHEMA HELPERS ==================

_WORD_RE = re.compile(r"[A-Za-z_][A-Za-z0-9_]*")


@dataclasses.dataclass(frozen=True)
class ParamSpec:
    path: str
    types: Tuple[str, ...]
    description: str
    required: bool
    enum: Tuple[str, ...]
    default: Optional[str]
    fmt: Optional[str]


def _as_tuple_str(x: Any) -> Tuple[str, ...]:
    if x is None:
        return tuple()
    if isinstance(x, (list, tuple)):
        return tuple(str(v) for v in x)
    return (str(x),)


def _schema_types(schema: Dict[str, Any]) -> Tuple[str, ...]:
    t = schema.get("type")
    if isinstance(t, str):
        return (t,)
    if isinstance(t, list):
        return tuple(str(v) for v in t)
    if "properties" in schema:
        return ("object",)
    if "items" in schema:
        return ("array",)
    return ("string",)


def flatten_json_schema_parameters(
    parameters: Any,
    *,
    prefix: str = "",
    required_paths: Optional[set] = None,
) -> List[ParamSpec]:
    if required_paths is None:
        required_paths = set()

    out: List[ParamSpec] = []
    if not isinstance(parameters, dict):
        return out

    req = parameters.get("required")
    if isinstance(req, list):
        for r in req:
            if isinstance(r, str):
                required_paths.add((prefix + "." + r).lstrip("."))

    for comb_key in ("oneOf", "anyOf", "allOf"):
        comb = parameters.get(comb_key)
        if isinstance(comb, list) and comb:
            for sub in comb:
                out.extend(flatten_json_schema_parameters(sub, prefix=prefix, required_paths=set(required_paths)))

    tps = _schema_types(parameters)
    desc = normalize_single_line(parameters.get("description") or "")
    enum = _as_tuple_str(parameters.get("enum"))
    default = parameters.get("default")
    default_s = None if default is None else normalize_single_line(str(default))
    fmt = parameters.get("format")
    fmt_s = None if fmt is None else normalize_single_line(str(fmt))

    if prefix and ("properties" not in parameters) and ("items" not in parameters):
        out.append(
            ParamSpec(
                path=prefix,
                types=tps,
                description=desc,
                required=(prefix in required_paths),
                enum=enum,
                default=default_s,
                fmt=fmt_s,
            )
        )

    props = parameters.get("properties")
    if isinstance(props, dict):
        for k, sub in props.items():
            if isinstance(k, str) and isinstance(sub, dict):
                sub_prefix = (prefix + "." + k).lstrip(".")
                out.extend(flatten_json_schema_parameters(sub, prefix=sub_prefix, required_paths=required_paths))

    items = parameters.get("items")
    if isinstance(items, dict):
        sub_prefix = prefix + "[]" if prefix else "[]"
        out.extend(flatten_json_schema_parameters(items, prefix=sub_prefix, required_paths=required_paths))

    if prefix and ("properties" in parameters):
        out.append(
            ParamSpec(
                path=prefix,
                types=tps,
                description=desc,
                required=(prefix in required_paths),
                enum=enum,
                default=default_s,
                fmt=fmt_s,
            )
        )

    return out


def extract_top_level_param_names(params: Any) -> List[str]:
    """
    Extract top-level parameter names from a JSON schema object, if available.
    Deterministic order by insertion in the schema 'properties' dict.
    """
    if not isinstance(params, dict):
        return []
    props = params.get("properties")
    if not isinstance(props, dict):
        return []
    return [k for k in props.keys() if isinstance(k, str)]


def build_param_context_for_prompt(param_specs: List[ParamSpec], *, max_items: int = 40) -> str:
    """
    Option 1 (publication-friendly): provide a NEUTRAL, non-normative parameter context.

    Rationale:
    - The semantic gate is strict: any 'new constraint' not present in ORIGINAL_DESCRIPTION triggers FAIL.
    - If we show the model required/optional, formats, defaults, enums, bounds, it tends to echo them,
      which appears as adding constraints even if they exist in the schema.
    - For dataset-perturbation minimal-edit, we only need light grounding on field NAMES (and coarse types),
      not normative metadata.

    Therefore:
    - Provide field paths + coarse types only.
    - Do NOT mention: required/optional, format, default, enum, min/max, pagination, etc.
    - Keep it short and stable.
    """
    if not param_specs:
        return "PARAMETERS_SUMMARY: No structured parameters were found."

    items = sorted(param_specs, key=lambda p: p.path)

    # Keep only unique paths; prefer the first occurrence deterministically.
    seen = set()
    uniq: List[ParamSpec] = []
    for p in items:
        if p.path in seen:
            continue
        seen.add(p.path)
        uniq.append(p)

    if len(uniq) > max_items:
        uniq = uniq[:max_items]

    lines: List[str] = []
    lines.append(f"PARAMETERS_SUMMARY (neutral grounding): {len(uniq)} field paths (possibly truncated).")
    for p in uniq:
        tps = "/".join(p.types) if p.types else "unspecified"
        lines.append(f"- path={p.path}; type={tps}")

    lines.append(
        "NOTE: Do not mention required/optional, formats, defaults, enums, limits, pagination, or other constraints in the description "
        "unless they already appear in ORIGINAL_DESCRIPTION."
    )
    return "\n".join(lines)


# ================== MODE POLICIES + VALIDATION ==================

NORMATIVE_KEYWORDS = (
    "should",
    "should not",
    "must",
    "must not",
    "always",
    "never",
    "required to",
    "do not",
    "avoid",
)

EXAMPLE_MARKERS = (
    "for example",
    "e.g.",
    "example:",
    "for instance",
)

LEAKAGE_PHRASES = (
    "follow-up",
    "follow up",
    "ask the user",
    "ask user",
    "request more information",
    "need more information",
    "insufficient information",
    "not enough information",
    "cannot answer",
    "can't answer",
    "unable to answer",
    "missing information",
)

REQUIRED_EXAMPLE_PHRASE = "for example,"  # enforced in add_examples / normative_injection (case-insensitive match)



@dataclasses.dataclass(frozen=True)
class ModePolicy:
    mode: str
    allow_examples: bool
    require_examples: bool
    allow_normative: bool
    require_normative: bool
    allow_leakage_phrases: bool
    ban_param_name_tokens: bool
    target_length: str
    must_preserve_semantics: bool
    is_intervention: bool


MODE_POLICIES: Dict[str, ModePolicy] = {
    "empty_desc": ModePolicy(
        "empty_desc",
        allow_examples=False,
        require_examples=False,
        allow_normative=False,
        require_normative=False,
        allow_leakage_phrases=False,
        ban_param_name_tokens=False,
        target_length="short",
        must_preserve_semantics=True,
        is_intervention=False,
    ),
    # Dataset-perturbation: DO NOT ban param-name tokens for these modes.
    "style_concise": ModePolicy(
        "style_concise",
        allow_examples=False,
        require_examples=False,
        allow_normative=False,
        require_normative=False,
        allow_leakage_phrases=False,
        ban_param_name_tokens=False,  # <-- changed
        target_length="concise",
        must_preserve_semantics=True,
        is_intervention=False,
    ),
    "style_verbose": ModePolicy(
        "style_verbose",
        allow_examples=False,
        require_examples=False,
        allow_normative=False,
        require_normative=False,
        allow_leakage_phrases=False,
        ban_param_name_tokens=False,  # <-- changed
        target_length="verbose",
        must_preserve_semantics=True,
        is_intervention=False,
    ),
    "add_examples": ModePolicy(
        "add_examples",
        allow_examples=True,
        require_examples=True,
        allow_normative=False,
        require_normative=False,
        allow_leakage_phrases=False,
        ban_param_name_tokens=False,  # <-- changed
        target_length="verbose",
        must_preserve_semantics=True,
        is_intervention=False,
    ),
    "normative_injection": ModePolicy(
        "normative_injection",
        allow_examples=True,
        require_examples=True,
        allow_normative=True,
        require_normative=True,
        allow_leakage_phrases=True,   # allowed by design
        ban_param_name_tokens=False,  # allowed by design
        target_length="verbose",
        must_preserve_semantics=True,
        is_intervention=True,
    ),
}



def _contains_any_token_case_insensitive(text: str, tokens: Sequence[str]) -> bool:
    words = {w.lower() for w in _WORD_RE.findall(text or "")}
    return any((t or "").lower() in words for t in tokens if t)

def original_has_normative(original_description: Optional[str]) -> bool:
    """
    Returns True if ORIGINAL_DESCRIPTION contains normative language (must/should/always/never/etc.).
    Uses the same detector as validation to avoid mismatches.
    """
    return _contains_any_case_insensitive(original_description or "", NORMATIVE_KEYWORDS)

# ---- NEW: conditional marker handling (meaning-preserving) ----

LISTING_MARKERS = (
    "including",
    "such as",
    "by specifying",
    "by providing",
    "with options to",
    "with the option to",
)

def _marker_set_case_insensitive(text: Optional[str], markers: Sequence[str]) -> set:
    """
    Return the set of marker strings (lowercased) that appear in text (case-insensitive substring check).
    Conservative and deterministic.
    """
    lt = (text or "").lower()
    found = set()
    for m in markers or ():
        ml = (m or "").lower()
        if ml and (ml in lt):
            found.add(ml)
    return found

def original_has_any_example_marker(original_description: Optional[str]) -> bool:
    return bool(_marker_set_case_insensitive(original_description, EXAMPLE_MARKERS))

def original_has_any_listing_marker(original_description: Optional[str]) -> bool:
    return bool(_marker_set_case_insensitive(original_description, LISTING_MARKERS))


def validate_description(
    mode: str,
    desc: str,
    *,
    raw_desc: Optional[str] = None,
    top_level_param_names: Sequence[str],
    original_description: Optional[str] = None,
) -> Tuple[bool, List[str]]:
    errors: List[str] = []
    policy = MODE_POLICIES.get(mode)
    if policy is None:
        return False, [f"unknown_mode:{mode}"]

    raw = raw_desc if raw_desc is not None else desc

    # ---- Raw checks (hard) ----
    if "\n" in (raw or "") or "\r" in (raw or ""):
        errors.append("contains_newline_raw")
    if "`" in (raw or ""):
        errors.append("contains_backticks_raw")
    if re.search(r"(^|\n|\r)\s*([-*•]|\d+\.)\s+", raw or ""):
        errors.append("contains_bullets_raw")

    # ---- Normalized checks ----
    if len(desc or "") > DESCRIPTION_MAX_CHARS:
        errors.append("too_long_chars_global")

    if mode == "empty_desc":
        if desc != "":
            errors.append("empty_desc_must_be_exact_empty_string")
        return (len(errors) == 0), errors

    if not (desc or "").strip():
        errors.append("empty_description_not_allowed")

    # ---- Examples (conditional) ----
    cand_example_markers = _marker_set_case_insensitive(desc, EXAMPLE_MARKERS)
    orig_example_markers = _marker_set_case_insensitive(original_description, EXAMPLE_MARKERS)

    if not policy.allow_examples:
        new_markers = cand_example_markers - orig_example_markers
        if new_markers:
            errors.append("examples_not_allowed_in_mode")

    if policy.require_examples and not _contains_required_example_phrase(desc):
        errors.append("missing_required_example_phrase:For example,")

    # ---- LISTING markers (conditional) - NEW ----
    cand_listing = _marker_set_case_insensitive(desc, LISTING_MARKERS)
    orig_listing = _marker_set_case_insensitive(original_description, LISTING_MARKERS)

    # In robustness modes, do not allow introducing NEW listing markers (they often imply options/fields).
    # If original has them, they are allowed (meaning-preserving), but do not expand into lists (handled elsewhere).
    if mode in ROBUSTNESS_MODES:
        new_listing = cand_listing - orig_listing
        if new_listing:
            errors.append("new_listing_marker_not_allowed")

    # ---- Normative (conditional vs original) ----
    orig_has_norm = original_has_normative(original_description or "")
    if not policy.allow_normative and not orig_has_norm:
        if _contains_any_case_insensitive(desc, NORMATIVE_KEYWORDS):
            errors.append("normative_language_not_allowed_in_mode")

    if policy.require_normative and not _contains_any_case_insensitive(desc, ("should", "must")):
        errors.append("missing_required_normative_keyword")

    # ---- Leakage ----
    if not policy.allow_leakage_phrases and _contains_any_case_insensitive(desc, LEAKAGE_PHRASES):
        errors.append("label_leakage_phrase_not_allowed")

    # ---- Param tokens ban (off in your dataset-perturbation modes) ----
    if policy.ban_param_name_tokens and _contains_any_token_case_insensitive(desc, top_level_param_names):
        errors.append("mentions_param_name_token")

    # ---- Dynamic length budget ----
    budget = compute_length_budget(mode=mode, original_description=original_description or "")
    sent = _count_sentences_rough(desc)
    words = _count_words(desc)
    chars = len(desc or "")

    if sent < budget.min_sentences or sent > budget.max_sentences:
        errors.append(f"sentence_count_out_of_budget:{sent}:expected_{budget.min_sentences}_to_{budget.max_sentences}")

    if words < budget.min_words:
        errors.append(f"word_count_too_small:{words}:min_{budget.min_words}")
    if words > budget.max_words:
        errors.append(f"word_count_too_large:{words}:max_{budget.max_words}")

    if chars > budget.max_chars:
        errors.append(f"too_long_for_mode_budget_chars:{chars}:max_{budget.max_chars}")

    return (len(errors) == 0), errors





# ================== PERSISTENT CACHE (SQLITE) ==================

class DescCache:
    """
    SQLite-backed cache for generated tool descriptions.
    """

    def __init__(self, db_path: str):
        self.db_path = db_path
        self.conn = sqlite3.connect(db_path)
        self._init_db()

    def _init_db(self) -> None:
        cur = self.conn.cursor()
        cur.execute(
            """
            CREATE TABLE IF NOT EXISTS desc_cache (
                key_hash TEXT PRIMARY KEY,
                created_at REAL NOT NULL,
                pipeline_version TEXT NOT NULL,
                generation_model TEXT NOT NULL,
                judge_model TEXT NOT NULL,
                mode TEXT NOT NULL,
                tool_name TEXT NOT NULL,
                tool_signature TEXT NOT NULL,
                prompt_hash TEXT NOT NULL,
                prompt_text TEXT NOT NULL,
                raw_output TEXT NOT NULL,
                final_output TEXT NOT NULL,
                status TEXT NOT NULL,
                validation_errors TEXT NOT NULL
            )
            """
        )
        self.conn.commit()

    def get(self, key_hash: str) -> Optional[Dict[str, Any]]:
        cur = self.conn.cursor()
        cur.execute(
            "SELECT key_hash, created_at, pipeline_version, generation_model, judge_model, mode, tool_name, tool_signature, prompt_hash, prompt_text, raw_output, final_output, status, validation_errors FROM desc_cache WHERE key_hash=?",
            (key_hash,),
        )
        row = cur.fetchone()
        if not row:
            return None
        return {
            "key_hash": row[0],
            "created_at": row[1],
            "pipeline_version": row[2],
            "generation_model": row[3],
            "judge_model": row[4],
            "mode": row[5],
            "tool_name": row[6],
            "tool_signature": row[7],
            "prompt_hash": row[8],
            "prompt_text": row[9],
            "raw_output": row[10],
            "final_output": row[11],
            "status": row[12],
            "validation_errors": json.loads(row[13]),
        }

    def put(
        self,
        *,
        key_hash: str,
        pipeline_version: str,
        generation_model: str,
        judge_model: str,
        mode: str,
        tool_name: str,
        tool_signature: str,
        prompt_hash: str,
        prompt_text: str,
        raw_output: str,
        final_output: str,
        status: str,
        validation_errors: List[str],
    ) -> None:
        cur = self.conn.cursor()
        cur.execute(
            """
            INSERT OR REPLACE INTO desc_cache
            (key_hash, created_at, pipeline_version, generation_model, judge_model, mode, tool_name, tool_signature, prompt_hash, prompt_text, raw_output, final_output, status, validation_errors)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            """,
            (
                key_hash,
                now_unix(),
                pipeline_version,
                generation_model,
                judge_model,
                mode,
                tool_name,
                tool_signature,
                prompt_hash,
                prompt_text,
                raw_output,
                final_output,
                status,
                json.dumps(validation_errors, ensure_ascii=False),
            ),
        )
        self.conn.commit()

    def close(self) -> None:
        self.conn.close()


# ================== AUDIT LOGGING + MANIPULATION CHECKS ==================


class AuditLogger:
    """
    Append-only JSONL logs for reproducibility.
    """

    def __init__(self, audit_dir: str):
        self.audit_dir = audit_dir
        ensure_dir(audit_dir)

        self.calls_path = os.path.join(audit_dir, AUDIT_CALLS_JSONL)
        self.events_path = os.path.join(audit_dir, AUDIT_EVENTS_JSONL)
        self.fallbacks_path = os.path.join(audit_dir, AUDIT_FALLBACKS_JSONL)
        self.summary_path = os.path.join(audit_dir, AUDIT_SUMMARY_JSON)

        self.manip_jsonl_path = os.path.join(audit_dir, AUDIT_MANIPULATION_JSONL)
        self.manip_summary_path = os.path.join(audit_dir, AUDIT_MANIPULATION_SUMMARY_JSON)
        self.manip_table_md_path = os.path.join(audit_dir, AUDIT_MANIPULATION_TABLE_MD)

        self.summary: Dict[str, Any] = {
            "pipeline_version": PIPELINE_VERSION,
            "tools_seen": 0,
            "tools_patched": 0,
            "tools_unchanged": 0,
            "tools_fallback_reserialized": 0,
            "tools_parse_failed": 0,
            "llm_calls": 0,
            "llm_cache_hits": 0,
            "llm_repaired": 0,
            "llm_rejected": 0,
            "llm_judge_calls": 0,
            "llm_judge_fixed": 0,
            "llm_judge_failed": 0,
            "lines_seen": 0,
            "lines_written": 0,
            "outer_tools_string_literals_reescaped_canonically": 0,
            "tools_entry_kind_string": 0,
            "tools_entry_kind_object": 0,

            # Semantic gate accounting (first pass only)
            "semantic_gate_first_pass_total": 0,
            "semantic_gate_first_pass_fail": 0,

            # NEW: similarity gate accounting (checked only when semantic_ok and ok)
            "similarity_gate_total": 0,
            "similarity_gate_pass_total": 0,
            "similarity_gate_fail_total": 0,
            "similarity_gate_recovered_rule_based": 0,
            "similarity_gate_fallback_original": 0,
        }

        self._manip_rows: List[Dict[str, Any]] = []


      
    def log_call(self, record: Dict[str, Any]) -> None:
        with open(self.calls_path, "a", encoding="utf-8") as f:
            f.write(json.dumps(record, ensure_ascii=False) + "\n")

    def log_event(self, record: Dict[str, Any]) -> None:
        with open(self.events_path, "a", encoding="utf-8") as f:
            f.write(json.dumps(record, ensure_ascii=False) + "\n")

    def log_fallback(self, record: Dict[str, Any]) -> None:
        with open(self.fallbacks_path, "a", encoding="utf-8") as f:
            f.write(json.dumps(record, ensure_ascii=False) + "\n")

    def inc(self, key: str, n: int = 1) -> None:
        self.summary[key] = int(self.summary.get(key, 0)) + n

    def flush_summary(self, extra: Optional[Dict[str, Any]] = None) -> None:
        payload = dict(self.summary)
        if extra:
            payload.update(extra)

        # Derived coverage metrics (paper-facing)
        n_string = int(payload.get("tools_entry_kind_string", 0))
        n_object = int(payload.get("tools_entry_kind_object", 0))
        n_total = n_string + n_object

        payload["tools_entry_kind_total_counted"] = n_total
        payload["tools_entry_kind_object_rate"] = (float(n_object) / float(n_total)) if n_total > 0 else 0.0

        # final_unchanged_rate (tools with final desc identical to original, as tracked by tools_unchanged)
        tools_seen = int(payload.get("tools_seen", 0))
        tools_unchanged = int(payload.get("tools_unchanged", 0))
        payload["final_unchanged_rate"] = (float(tools_unchanged) / float(tools_seen)) if tools_seen > 0 else 0.0

        # semantic_fail_rate (first semantic gate pass)
        sem_total = int(payload.get("semantic_gate_first_pass_total", 0))
        sem_fail = int(payload.get("semantic_gate_first_pass_fail", 0))
        payload["semantic_fail_rate"] = (float(sem_fail) / float(sem_total)) if sem_total > 0 else 0.0

        # NEW: similarity gate rates
        sim_total = int(payload.get("similarity_gate_total", 0))
        sim_fail = int(payload.get("similarity_gate_fail_total", 0))
        sim_rb = int(payload.get("similarity_gate_recovered_rule_based", 0))
        sim_fb = int(payload.get("similarity_gate_fallback_original", 0))
        payload["similarity_fail_rate"] = (float(sim_fail) / float(sim_total)) if sim_total > 0 else 0.0
        payload["similarity_recovery_rule_based_rate"] = (float(sim_rb) / float(sim_total)) if sim_total > 0 else 0.0
        payload["similarity_fallback_original_rate"] = (float(sim_fb) / float(sim_total)) if sim_total > 0 else 0.0

        with open(self.summary_path, "w", encoding="utf-8") as f:
            f.write(json.dumps(payload, ensure_ascii=False, indent=2) + "\n")


            




    def log_manipulation_row(self, row: Dict[str, Any]) -> None:
        self._manip_rows.append(row)
        with open(self.manip_jsonl_path, "a", encoding="utf-8") as f:
            f.write(json.dumps(row, ensure_ascii=False) + "\n")

    def flush_manipulation_summaries(self) -> None:
        by_mode: Dict[str, List[Dict[str, Any]]] = {}
        for r in self._manip_rows:
            by_mode.setdefault(str(r.get("mode")), []).append(r)

        summary: Dict[str, Any] = {"pipeline_version": PIPELINE_VERSION, "modes": {}}
        table_rows: List[Dict[str, Any]] = []

        for mode, rows in sorted(by_mode.items(), key=lambda x: x[0]):
            lens_new = [float(r["new_chars"]) for r in rows]
            lens_old = [float(r["orig_chars"]) for r in rows]
            sims = [float(r["char_similarity"]) for r in rows]
            deltas = [float(r["delta_chars"]) for r in rows]

            def rate(flag: str) -> float:
                return float(sum(1 for r in rows if r.get(flag)) / len(rows)) if rows else 0.0

            mode_summary = {
                "n_tools": len(rows),
                "orig_chars_mean": _mean(lens_old),
                "orig_chars_std": _std(lens_old),
                "new_chars_mean": _mean(lens_new),
                "new_chars_std": _std(lens_new),
                "delta_chars_mean": _mean(deltas),
                "delta_chars_std": _std(deltas),
                "char_similarity_mean": _mean(sims),
                "char_similarity_std": _std(sims),
                "rate_unchanged": rate("is_unchanged"),
                "rate_repaired": rate("status_is_repaired"),
                "rate_rejected": rate("status_is_rejected"),
                "rate_judged": rate("status_is_judged"),
                "rate_contains_example": rate("contains_example"),
                "rate_contains_normative": rate("contains_normative"),
                "rate_contains_leakage": rate("contains_leakage"),
                "rate_mentions_param_token": rate("mentions_param_token"),
                "rate_failed_validation_final": rate("final_failed_validation"),
            }
            summary["modes"][mode] = mode_summary

            table_rows.append(
                {
                    "mode": mode,
                    "n": len(rows),
                    "new_chars_mean": round(mode_summary["new_chars_mean"], 2),
                    "delta_chars_mean": round(mode_summary["delta_chars_mean"], 2),
                    "sim_mean": round(mode_summary["char_similarity_mean"], 3),
                    "unchanged": round(mode_summary["rate_unchanged"], 3),
                    "repaired": round(mode_summary["rate_repaired"], 3),
                    "judged": round(mode_summary["rate_judged"], 3),
                    "rejected": round(mode_summary["rate_rejected"], 3),
                    "leak": round(mode_summary["rate_contains_leakage"], 3),
                    "paramtok": round(mode_summary["rate_mentions_param_token"], 3),
                }
            )

        with open(self.manip_summary_path, "w", encoding="utf-8") as f:
            f.write(json.dumps(summary, ensure_ascii=False, indent=2) + "\n")

        md_lines: List[str] = []
        md_lines.append("| mode | n | new_chars_mean | delta_chars_mean | sim_mean | unchanged | repaired | judged | rejected | leak | paramtok |")
        md_lines.append("|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|")
        for r in table_rows:
            md_lines.append(
                f"| {r['mode']} | {r['n']} | {r['new_chars_mean']:.2f} | {r['delta_chars_mean']:.2f} | {r['sim_mean']:.3f} | "
                f"{r['unchanged']:.3f} | {r['repaired']:.3f} | {r['judged']:.3f} | {r['rejected']:.3f} | {r['leak']:.3f} | {r['paramtok']:.3f} |"
            )

        with open(self.manip_table_md_path, "w", encoding="utf-8") as f:
            f.write("\n".join(md_lines) + "\n")


# ================== PROMPT CONSTRUCTION ==================

def is_schema_aware_verbose_mode(mode: str) -> bool:
    """
    Dataset-perturbation policy:
    - Disable schema-derived "extra clarity" for robustness modes.
    Rationale:
    - It systematically introduces confounds (parameter mentions, implied filters/options)
      that inflate diffs and trigger semantic-gate failures.
    - If you want schema-derived perturbations, create a dedicated separate mode.
    """
    return False

def select_schema_hint_param_names(
    *,
    top_level_param_names: Sequence[str],
    param_specs: Sequence[ParamSpec],
    max_names: int = 4,
) -> List[str]:
    """
    Deterministically select up to `max_names` top-level parameter names that are most
    likely to be informative to mention in a verbose description, without enumerating.

    Heuristics:
    - Prefer canonical "query/identifier" inputs (id, token, name, query, text, from/to, date/city/location, ticker).
    - Then consider common filter/option inputs (status, type, category, sort, order, limit, offset, page, class).
    - Keep original schema insertion order when possible for reproducibility.
    """
    names = [n for n in (top_level_param_names or []) if isinstance(n, str) and n.strip()]
    if not names:
        return []

    lower = [n.lower() for n in names]

    primary_keywords = (
        "id", "token", "name", "query", "text",
        "from", "to", "start", "end", "date", "time",
        "city", "location", "origin", "destination",
        "ticker", "symbol", "airport",
    )
    option_keywords = (
        "status", "type", "category", "sort", "order",
        "limit", "offset", "page", "per_page",
        "class", "rating", "rooms", "seats",
    )

    primary_idx: List[int] = []
    option_idx: List[int] = []
    other_idx: List[int] = []

    for i, nl in enumerate(lower):
        if any(k in nl for k in primary_keywords):
            primary_idx.append(i)
        elif any(k in nl for k in option_keywords):
            option_idx.append(i)
        else:
            other_idx.append(i)

    chosen: List[str] = []
    for i in primary_idx + option_idx + other_idx:
        chosen.append(names[i])
        if len(chosen) >= max_names:
            break

    return _dedupe_preserve_order(chosen)


def build_schema_hints_block_for_prompt(
    *,
    mode: str,
    top_level_param_names: Sequence[str],
    param_specs: Sequence[ParamSpec],
) -> str:
    """
    Compact, non-normative hint block for verbose descriptions.

    Updated policy:
    - Prefer not to mention parameter names at all.
    - If mentioning is useful, mention at most ONE name, and never as a list.
    """
    if not is_schema_aware_verbose_mode(mode):
        return ""

    hint_names = select_schema_hint_param_names(
        top_level_param_names=top_level_param_names,
        param_specs=list(param_specs or []),
        max_names=1,  # <-- changed: at most one
    )

    if not hint_names:
        return (
            "SCHEMA-AWARE VERBOSE GUIDANCE:\n"
            "- You may add a small amount of operational clarity (1–2 extra clauses) without adding constraints.\n"
            "- Prefer not to mention parameter names; use neutral phrasing (can/may) and avoid enumerations.\n"
        )

    names_text = ", ".join(hint_names)
    return (
        "SCHEMA-AWARE VERBOSE GUIDANCE:\n"
        "- You may add limited operational clarity derived from the parameter schema.\n"
        f"- Prefer not to mention parameter names; if you do, mention at most one as an example: {names_text}.\n"
        "- Use epistemic phrasing (can/may) and avoid restrictive language (only, must, requires).\n"
        "- Do not turn this into a field list.\n"
    )


def canonical_outer_string_literal_from_decoded(s: str) -> str:
    """
    Canonical re-escaping for embedding a decoded Python string as a JSON string literal.

    NOTE (Fix A): This intentionally normalizes escaping (e.g., may change \\uXXXX vs raw Unicode,
    slash escaping, etc.). Outer JSON object is not reserialized, but the tools[] string literal is.
    """
    return '"' + json_escape_string(s) + '"'


def build_generation_prompt(
    *,
    mode: str,
    tool_name: str,
    original_description: str,
    param_context: str,
    top_level_param_names: Sequence[str],
) -> str:
    policy = MODE_POLICIES.get(mode)
    if policy is None:
        raise ValueError(f"Unknown mode: {mode}")

    odesc = normalize_single_line(original_description or "")
    short_orig = _orig_desc_is_short(odesc)
    budget = compute_length_budget(mode=mode, original_description=odesc)

    orig_has_norm = original_has_normative(odesc)
    orig_has_examples = original_has_any_example_marker(odesc)
    orig_has_listing = original_has_any_listing_marker(odesc)

    listing_clause = (
        "10) Avoid enumeration/listing constructions (including 'including ...', 'such as ...', 'by specifying ...', 'with options to ...') "
        "UNLESS those constructions already appear in ORIGINAL_DESCRIPTION; if they do, you may preserve them but do not expand into lists.\n"
    )

    example_clause = ""
    if orig_has_examples:
        example_clause = (
            "NOTE: ORIGINAL_DESCRIPTION contains example markers (e.g., 'for example'/'e.g.'); preserving them is allowed if needed to keep meaning.\n"
        )

    safe_ops = (
        "SAFE PARAPHRASE OPERATORS (allowed; do not add meaning):\n"
        "- Reorder clauses while preserving scope.\n"
        "- Replace one or two words with close synonyms already implied by the original.\n"
        "- Make small punctuation/segmentation changes (comma/period) without adding content.\n"
        "Do NOT introduce any new concepts (options/filters/criteria/parameters/pagination/etc.) unless present in ORIGINAL_DESCRIPTION.\n"
    )

    anti_copy = (
        "ANTI-COPY REQUIREMENT:\n"
        "- Output must NOT be identical to ORIGINAL_DESCRIPTION.\n"
        "- Make at least TWO minimal surface edits while preserving meaning (e.g., one synonym swap + one punctuation/word-order tweak).\n"
        "- Do NOT add any new information to satisfy this requirement.\n"
    )

    base_common = (
        "TASK: Produce exactly one English tool description string for a function-calling benchmark.\n"
        "OUTPUT FORMAT: Output only the description text, and nothing else.\n"
        "OUTPUT CONSTRAINTS:\n"
        "1) Output must be a single line (no newline characters).\n"
        "2) Do not output markdown, code fences, bullets, or backticks.\n"
        "3) Do not mention prompts, validators, schemas, or that an AI system is being used.\n"
        "4) Do not include the tool name itself in the description.\n"
        "5) Do NOT invent capabilities.\n"
        "6) Do NOT add constraints/requirements/decision rules; avoid introducing new 'only/must/requires' unless present in ORIGINAL_DESCRIPTION.\n"
        "7) Do NOT add any new features that are not explicitly stated in ORIGINAL_DESCRIPTION.\n"
        "8) Avoid purpose/benefit fluff unless it appears in ORIGINAL_DESCRIPTION.\n"
        "9) Do NOT list or name parameters, fields, filters, options, pagination, expiry/expired, or user/account concepts\n"
        "   unless those exact concepts appear in ORIGINAL_DESCRIPTION.\n"
        + listing_clause
        + "11) Do not drop named entities or specific referents present in ORIGINAL_DESCRIPTION (e.g., person names like 'Adriel').\n"
        + example_clause
        + safe_ops
        + anti_copy
    )

    mode_block = f"MODE: {mode}\n"

    confound_block = ""
    if mode in ROBUSTNESS_MODES:
        confound_block = (
            "ROBUSTNESS CONSTRAINTS:\n"
            "A) Do NOT include decision-boundary language (follow-up requests, insufficient information, cannot answer).\n"
        )

    if mode == "empty_desc":
        return base_common + mode_block + "INSTRUCTION: Output an empty string.\n"

    # Numeric TARGETS (explicit), in addition to the band budget.
    target_sent = max(budget.min_sentences, min(budget.max_sentences, int(round((budget.min_sentences + budget.max_sentences) / 2))))
    target_words = max(budget.min_words, min(budget.max_words, int(round((budget.min_words + budget.max_words) / 2))))

    length_instr = (
        "LENGTH BUDGET (must satisfy): "
        f"sentences {budget.min_sentences}-{budget.max_sentences}; "
        f"words {budget.min_words}-{budget.max_words}; "
        f"chars <= {budget.max_chars}.\n"
        f"TARGET (aim for): about {target_words} words and {target_sent} sentence(s).\n"
        "IMPORTANT: Stay within the budget by paraphrasing, not by adding new details.\n"
    )

    requirements = ""
    restrictions = ""

    if mode in {"style_concise", "style_verbose"}:
        if not orig_has_examples:
            restrictions += "RESTRICTIONS: Do not include examples.\n"
        else:
            restrictions += "RESTRICTIONS: Do not introduce NEW examples; you may preserve example markers already present in ORIGINAL_DESCRIPTION if needed.\n"
        if not orig_has_norm:
            restrictions += "RESTRICTIONS: Do not include normative language (avoid should/must/always/never).\n"
    elif mode == "add_examples":
        requirements += "REQUIREMENTS: Include one or two conceptual examples using the exact phrase 'For example,'.\n"
        if not orig_has_norm:
            restrictions += "RESTRICTIONS: Do not include normative language (avoid should/must/always/never).\n"
    elif mode == "normative_injection":
        requirements += "REQUIREMENTS: Include one or two conceptual examples using the exact phrase 'For example,'.\n"
        requirements += "REQUIREMENTS: Include explicit usage guidance with at least one normative keyword (should or must).\n"

    short_orig_block = ""
    if short_orig:
        short_orig_block = (
            "SHORT-ORIGINAL GUARDRAIL:\n"
            "- Keep the paraphrase extremely close to ORIGINAL_DESCRIPTION.\n"
            "- Do NOT add generic concepts like criteria, options, filters, pagination, parameters, results, control, management.\n"
        )
        if not orig_has_listing:
            short_orig_block += "- Do NOT add 'by providing/by specifying/using parameters like/such as ...' constructions.\n"
        else:
            short_orig_block += "- If ORIGINAL_DESCRIPTION already uses listing constructions (e.g., 'such as'), you may preserve them but do not expand into lists.\n"

    return (
        base_common
        + mode_block
        + confound_block
        + short_orig_block
        + "INSTRUCTION: Produce a meaning-preserving paraphrase of ORIGINAL_DESCRIPTION.\n"
        + "INSTRUCTION: Paraphrase only; do not enrich.\n"
        + length_instr
        + requirements
        + restrictions
        + f"TOOL_NAME (do not copy): {tool_name}\n"
        + f"ORIGINAL_DESCRIPTION: {odesc if odesc else '[NONE]'}\n"
        + "PARAMETERS CONTEXT (do not copy; grounding only):\n"
        + param_context
        + "\n"
    )




def build_repair_prompt(
    *,
    mode: str,
    tool_name: str,
    params_context: str,
    candidate_output: str,
    validation_errors: List[str],
    top_level_param_names: Sequence[str],
    original_description: Optional[str] = None,
) -> str:
    policy = MODE_POLICIES.get(mode)
    if policy is None:
        raise ValueError(f"Unknown mode: {mode}")

    orig_desc = normalize_single_line(original_description or "")
    orig_has_norm = original_has_normative(orig_desc)
    orig_has_examples = original_has_any_example_marker(orig_desc)
    orig_has_listing = original_has_any_listing_marker(orig_desc)

    budget = compute_length_budget(mode=mode, original_description=orig_desc)
    target_sent = max(budget.min_sentences, min(budget.max_sentences, int(round((budget.min_sentences + budget.max_sentences) / 2))))
    target_words = max(budget.min_words, min(budget.max_words, int(round((budget.min_words + budget.max_words) / 2))))

    bans: List[str] = []

    # Examples (conditional)
    if not policy.allow_examples:
        if not orig_has_examples:
            bans.append("No examples.")
        else:
            bans.append("Do not introduce NEW examples; you may preserve example markers already present in ORIGINAL_DESCRIPTION if needed.")
    if policy.require_examples:
        bans.append("Must include the exact phrase 'For example,' at least once.")

    # Normative (conditional)
    if not policy.allow_normative and not orig_has_norm:
        bans.append("No normative language (avoid should/must/always/never/do not/avoid).")
    if policy.require_normative:
        bans.append("Must include at least one of: should, must.")

    if not policy.allow_leakage_phrases:
        bans.append("No decision-boundary leakage phrases (follow-up/need more information/cannot answer/etc.).")

    listing_ban = (
        "Avoid listing constructions: 'including ...', 'such as ...', 'by specifying ...', 'with options to ...'"
    )
    if orig_has_listing:
        listing_ban += " unless they already appear in ORIGINAL_DESCRIPTION; if present, you may preserve but do not expand into lists."
    else:
        listing_ban += "."

    bans.extend(
        [
            "Do not include the tool name itself in the description.",
            "Do not invent capabilities.",
            "Do not add constraints/requirements/decision rules; avoid introducing new 'only/must/requires' unless present in ORIGINAL_DESCRIPTION (or required by mode).",
            "Do not add purpose/benefit fluff unless present in ORIGINAL_DESCRIPTION.",
            "Do not add or enumerate parameters/fields/filters/options/pagination/expiry unless present in ORIGINAL_DESCRIPTION.",
            listing_ban,
            "Do not add generic filler concepts like various details, relevant details, specific context, configurations, preferences, or other relevant details unless present in ORIGINAL_DESCRIPTION.",
            "Do not introduce proper nouns or identifiers not present in ORIGINAL_DESCRIPTION (e.g., names like 'Adriel').",
        ]
    )

    anti_copy = (
        "ANTI-COPY REQUIREMENT: The output must NOT be identical to ORIGINAL_DESCRIPTION, and must include at least TWO minimal surface edits "
        "(synonym/punctuation/word-order) without adding any new meaning."
    )

    ban_text = " ".join(bans) if bans else "[NONE]"

    return (
        "TASK: Fix a tool description so that it satisfies hard constraints exactly.\n"
        "OUTPUT FORMAT: Output only the corrected description text, and nothing else.\n"
        "HARD CONSTRAINTS:\n"
        "1) Output must be a single line (no newline characters).\n"
        "2) Do not output markdown, code fences, bullets, or backticks.\n"
        "3) Do not mention prompts, validators, schemas, or that an AI system is being used.\n"
        "4) Prefer deleting or replacing problematic fragments; do NOT add new content.\n"
        f"5) Mode is {mode}; satisfy the mode-specific rules including sentence/word budget expectations.\n"
        f"6) LENGTH BUDGET: sentences {budget.min_sentences}-{budget.max_sentences}; words {budget.min_words}-{budget.max_words}; chars <= {budget.max_chars}. "
        f"TARGET: about {target_words} words and {target_sent} sentence(s).\n"
        f"7) {anti_copy}\n"
        f"MODE-SPECIFIC BANS/RESTRICTIONS: {ban_text}\n"
        f"VALIDATION_ERRORS: {validation_errors}\n"
        f"TOOL_NAME (do not copy): {tool_name}\n"
        f"ORIGINAL_DESCRIPTION: {orig_desc if orig_desc else '[NONE]'}\n"
        f"PARAMETERS CONTEXT (grounding only):\n{params_context}\n"
        f"CANDIDATE OUTPUT: {normalize_single_line(candidate_output)}\n"
        "INSTRUCTION: Rewrite the candidate so it passes validation while staying faithful to the original intent.\n"
    )




def build_semantic_repair_prompt(
    *,
    tool_name: str,
    mode: str,
    original_description: str,
    candidate_description: str,
    semantic_judge_feedback: str,
) -> str:
    odesc = normalize_single_line(original_description or "")
    cdesc = normalize_single_line(candidate_description or "")
    short_orig = _orig_desc_is_short(odesc)

    strict_block = (
        "CRITICAL:\n"
        "- Preserve meaning and scope of ORIGINAL_DESCRIPTION.\n"
        "- Do NOT add constraints/requirements/decision rules.\n"
        "- Do NOT add new purpose/benefit framing.\n"
        "- Do NOT add new concepts (criteria/options/filters/pagination/parameters/expiry/expired) unless present in ORIGINAL_DESCRIPTION.\n"
        "- Do NOT introduce proper nouns or identifiers not present in ORIGINAL_DESCRIPTION.\n"
        "- Prefer deleting or replacing the offending phrase; do not add new details.\n"
    )
    if short_orig:
        strict_block += (
            "- SHORT ORIGINAL: stay extremely close; prefer a near-literal paraphrase.\n"
            "- Avoid 'including/such as/by specifying/with options to' constructions.\n"
        )

    return (
        "TASK: Rewrite CANDIDATE_DESCRIPTION so it passes the semantic judge.\n"
        "OUTPUT FORMAT: Output only the rewritten description as a single line.\n"
        f"MODE: {mode}\n"
        + strict_block +
        f"SEMANTIC_JUDGE_FEEDBACK: {normalize_single_line(semantic_judge_feedback)}\n"
        f"ORIGINAL_DESCRIPTION: {odesc}\n"
        f"CANDIDATE_DESCRIPTION: {cdesc}\n"
    )




def build_judge_prompt(
    *,
    mode: str,
    tool_name: str,
    original_description: str,
    last_candidate: str,
    validation_errors: List[str],
    params_context: str,
    top_level_param_names: Sequence[str],
) -> str:
    policy = MODE_POLICIES.get(mode)
    if policy is None:
        raise ValueError(f"Unknown mode: {mode}")

    odesc = normalize_single_line(original_description or "")
    orig_has_norm = original_has_normative(odesc)

    budget = compute_length_budget(mode=mode, original_description=odesc)
    target_sent = max(budget.min_sentences, min(budget.max_sentences, int(round((budget.min_sentences + budget.max_sentences) / 2))))
    target_words = max(budget.min_words, min(budget.max_words, int(round((budget.min_words + budget.max_words) / 2))))

    bans: List[str] = []
    if not policy.allow_examples:
        bans.append("No examples.")
    if policy.require_examples:
        bans.append("Include the exact phrase 'For example,' at least once.")
    if not policy.allow_normative and not orig_has_norm:
        bans.append("No normative language (avoid should/must/always/never/do not/avoid).")
    if policy.require_normative:
        bans.append("Include at least one of: should, must.")
    if not policy.allow_leakage_phrases:
        bans.append("No decision-boundary leakage phrases (follow-up/need more information/cannot answer/etc.).")
    bans.append("Do not include the tool name itself in the description.")

    anti_copy = (
        "ANTI-COPY REQUIREMENT: Output must NOT be identical to ORIGINAL_DESCRIPTION, and must include at least TWO minimal surface edits "
        "(synonym/punctuation/word-order) without adding any new meaning."
    )

    ban_text = " ".join(bans) if bans else "[NONE]"
    err_text = ", ".join(validation_errors) if validation_errors else "[NONE]"

    return (
        "ROLE: You are a strict compliance editor for tool documentation strings.\n"
        "TASK: Produce a single-line English tool description that passes a fixed validator.\n"
        "OUTPUT FORMAT: Output only the description text, and nothing else.\n"
        "ABSOLUTE CONSTRAINTS:\n"
        "1) Single line only. No markdown. No bullets. No backticks.\n"
        "2) Do not mention prompts, validators, schemas, or that an AI system is being used.\n"
        "3) Do not invent capabilities.\n"
        "4) Do not include the tool name itself in the description.\n"
        f"5) LENGTH BUDGET: sentences {budget.min_sentences}-{budget.max_sentences}; words {budget.min_words}-{budget.max_words}; chars <= {budget.max_chars}. "
        f"TARGET: about {target_words} words and {target_sent} sentence(s).\n"
        f"6) {anti_copy}\n"
        f"MODE: {mode}\n"
        f"MODE-SPECIFIC BANS/RESTRICTIONS: {ban_text}\n"
        f"VALIDATION_ERRORS_TO_FIX: {err_text}\n"
        f"TOOL_NAME (do not copy): {tool_name}\n"
        f"ORIGINAL_DESCRIPTION (context only): {odesc if odesc else '[NONE]'}\n"
        f"LAST_FAILED_CANDIDATE (context only): {normalize_single_line(last_candidate) if last_candidate else '[NONE]'}\n"
        f"PARAMETERS CONTEXT (grounding only):\n{params_context}\n"
        "INSTRUCTION: Write a fresh description that satisfies all constraints and stays faithful to the original meaning.\n"
    )



# ================== LLM CALLS WITH RETRY + AUDIT ==================

def _sleep_with_jitter(seconds: float) -> None:
    if seconds <= 0:
        return
    jitter = random.random() * min(0.25, seconds * 0.1)
    time.sleep(seconds + jitter)

def _orig_desc_is_short(orig_desc: str, *, max_chars: int = 140, max_words: int = 22) -> bool:
    o = normalize_single_line(orig_desc or "")
    if not o:
        return False
    if len(o) <= max_chars:
        return True
    if len(re.findall(r"\S+", o)) <= max_words:
        return True
    return False



def _looks_like_param_enumeration(text: str, *, original_description: Optional[str] = None) -> bool:
    """
    Heuristic: candidate is likely listing parameters/options rather than paraphrasing intent.
    NEW: allow listing markers that already appear in the original (meaning-preserving).
    """
    t = (text or "").lower()
    o = (original_description or "").lower()

    patterns = [
        "by providing",
        "by specifying",
        "by entering",
        "details such as",
        "such as the",
        "including the",
        "including details",
        "allowing users to",
        "allow users to",
        "with options to",
        "with the option to",
        "fields such as",
    ]

    # If candidate uses these patterns but original has none of the listing markers,
    # it's very likely template-y enumeration.
    cand_listing = _marker_set_case_insensitive(t, LISTING_MARKERS)
    orig_listing = _marker_set_case_insensitive(o, LISTING_MARKERS)

    # If candidate introduces listing markers that weren't in the original -> suspicious.
    if (cand_listing - orig_listing):
        return True

    # If it contains strong listing patterns and original had no listing markers at all -> suspicious.
    if any(p in t for p in patterns) and not orig_listing:
        return True

    # Too many commas often indicates a field list
    if t.count(",") >= 4 and o.count(",") < 4:
        return True

    # Colon lists like "details: name, date, city"
    if (":" in t and t.count(",") >= 2) and not (":" in o and o.count(",") >= 2):
        return True

    return False



# Phrases that frequently create "new information" under strict semantic judging.
# We trigger a guardrail rewrite when these appear in the candidate but not in ORIGINAL_DESCRIPTION.

SEMANTIC_FLUFF_PHRASES = (
    "comprehensive",
    "comprehensive understanding",
    "overall",
    "insight",
    "provides insight",
    "providing insight",
    "overview",
    "provide an overview",
    "providing an overview",
    "help users",
    "helping users",
    "helpful",
    "allowing users",
    "enabling users",
    "straightforward way",
    "up-to-date",
    "up to date",
    "relevant details",
    "relevant information",
    "various factors",
    "needs and preferences",
    "purposes",
    "interpretation",
    "to get a",
    "to gain a",
    "to understand",
)

def _contains_fluff_not_in_original(original_description: str, candidate_description: str) -> bool:
    o = (original_description or "").lower()
    c = (candidate_description or "").lower()
    if not o or not c:
        return False

    for p in SEMANTIC_FLUFF_PHRASES:
        if p in c and p not in o:
            return True
    return False


def should_trigger_pre_semantic_guardrail(
    *,
    original_description: str,
    candidate_description: str,
    top_level_param_names: Sequence[str],
    mode: Optional[str] = None,
) -> bool:
    """
    Pre-semantic guardrail: detect common enrichment/template drift BEFORE semantic gate.

    Update:
    - In modes that require examples/normative, evaluate guardrail triggers on BASE only,
      so the mode-allowed suffix does not cause spurious rewrites.
    """
    cand_full = normalize_single_line(candidate_description or "")
    if not cand_full:
        return False

    o_full = normalize_single_line(original_description or "")

    # For example/normative modes: analyze BASE only (avoid punishing the required suffix).
    if mode in {"add_examples", "normative_injection"}:
        cand = semantic_base_text(mode, cand_full)
        odesc = semantic_base_text(mode, o_full)
    else:
        cand = cand_full
        odesc = o_full

    if not cand:
        return False

    # Purpose/benefit fluff not in original -> rewrite
    if _contains_fluff_not_in_original(odesc, cand):
        return True

    # Parameter-enumeration/template patterns -> rewrite
    if _looks_like_param_enumeration(cand, original_description=odesc):
        return True

    # repetition heuristics (BASE-only when applicable)
    o = (odesc or "").lower()
    c = (cand or "").lower()
    if c.count("returns") >= 2 and o.count("returns") < 2:
        return True
    if c.count("based on") >= 2 and o.count("based on") < 2:
        return True

    # Short originals: extra strict (BASE-only)
    if _orig_desc_is_short(odesc):
        bad_new_concepts = ("criteria", "options", "filters", "pagination", "parameters", "paginated", "filtered")
        if any(w in c and w not in o for w in bad_new_concepts):
            return True

    return False



def build_guardrail_rewrite_prompt(
    *,
    mode: str,
    tool_name: str,
    original_description: str,
    candidate_description: str,
    params_context: str,
) -> str:
    odesc = normalize_single_line(original_description or "")
    budget = compute_length_budget(mode=mode, original_description=odesc)

    orig_has_listing = original_has_any_listing_marker(odesc)
    orig_has_examples = original_has_any_example_marker(odesc)

    trigger_block = (
        "TRIGGER PHRASES POLICY:\n"
        "- If ANY of the following concepts/phrases appear in CANDIDATE_DESCRIPTION but do NOT appear in ORIGINAL_DESCRIPTION,\n"
        "  you MUST delete them (do not paraphrase them):\n"
        "  identifier, id, contact_id, default, based on provided details, provided details, filter, filtering,\n"
        "  options, preferences, criteria, parameters, fields, pagination, authenticate, authorization, authorize,\n"
        "  account, user account, user’s account.\n"
    )

    if mode == "style_concise":
        req_block = "EXAMPLES/NORMATIVE: No examples unless they already appear in ORIGINAL_DESCRIPTION. No normative language unless present in ORIGINAL_DESCRIPTION."
    elif mode == "style_verbose":
        req_block = "EXAMPLES/NORMATIVE: No examples unless they already appear in ORIGINAL_DESCRIPTION. No normative language unless present in ORIGINAL_DESCRIPTION."
    elif mode == "add_examples":
        req_block = "EXAMPLES/NORMATIVE: Include 1–2 conceptual examples using the exact phrase 'For example,' at least once. No normative language unless present in ORIGINAL_DESCRIPTION."
    elif mode == "normative_injection":
        req_block = "EXAMPLES/NORMATIVE: Include 1–2 conceptual examples using the exact phrase 'For example,' at least once. Include usage guidance with 'should' or 'must'."
    else:
        req_block = "EXAMPLES/NORMATIVE: Follow the mode policy."

    length_block = (
        "LENGTH BUDGET (must satisfy): "
        f"sentences {budget.min_sentences}-{budget.max_sentences}; "
        f"words {budget.min_words}-{budget.max_words}; "
        f"chars <= {budget.max_chars}.\n"
    )

    listing_rule = "5) Avoid enumeration constructions: 'including', 'such as', 'by providing', 'by specifying', 'with options to'."
    if orig_has_listing:
        listing_rule += " If ORIGINAL_DESCRIPTION already uses them, you may preserve them but do not expand into lists."
    else:
        listing_rule += "."

    examples_rule = ""
    if orig_has_examples:
        examples_rule = "NOTE: ORIGINAL_DESCRIPTION contains example markers; preserving them is allowed if needed to keep meaning, but do not introduce new examples.\n"

    return (
        "TASK: Rewrite CANDIDATE_DESCRIPTION into a meaning-preserving paraphrase of ORIGINAL_DESCRIPTION.\n"
        "OUTPUT FORMAT: Output only the rewritten description text, single line.\n"
        "HARD RULES:\n"
        "1) Keep it a minimal-edit paraphrase; do not write a generic template.\n"
        "2) Do NOT add new concepts unless present in ORIGINAL_DESCRIPTION (e.g., criteria, options, filters, pagination, parameters, expiry/expired).\n"
        "3) Do NOT add purpose/benefit framing unless present in ORIGINAL_DESCRIPTION (overview/insight/comprehensive/relevant).\n"
        "4) Do NOT add new constraints/requirements/decision rules; avoid introducing new only/must/requires unless present in ORIGINAL_DESCRIPTION (or required by mode).\n"
        + listing_rule + "\n"
        "6) Do not introduce proper nouns or identifiers not present in ORIGINAL_DESCRIPTION.\n"
        "7) Do not include the tool name.\n"
        + examples_rule +
        trigger_block +
        f"MODE: {mode}\n"
        + length_block +
        f"{req_block}\n"
        f"TOOL_NAME (do not copy): {tool_name}\n"
        f"ORIGINAL_DESCRIPTION: {odesc if odesc else '[NONE]'}\n"
        f"CANDIDATE_DESCRIPTION: {normalize_single_line(candidate_description) if candidate_description else '[NONE]'}\n"
    )



def _mentions_multiple_param_tokens(text: str, top_level_param_names: Sequence[str], *, min_hits: int = 2) -> bool:
    """
    Uses token-based match (word boundaries via _WORD_RE). This is cheap and consistent with your validator.
    """
    words = {w.lower() for w in _WORD_RE.findall(text or "")}
    hits = 0
    for p in top_level_param_names or []:
        pl = (p or "").lower()
        if pl and pl in words:
            hits += 1
            if hits >= min_hits:
                return True
    return False



def llm_call_chat_completions(
    *,
    client: OpenAI,
    model: str,
    system_text: str,
    user_text: str,
    temperature: float,
    max_tokens: int,
    rate_limit_sleep_sec: float,
    max_retries: int,
    audit: "AuditLogger",
    call_tag: str,
) -> str:
    if rate_limit_sleep_sec > 0:
        _sleep_with_jitter(rate_limit_sleep_sec)

    prompt_hash = stable_sha256(system_text + "\n\n" + user_text)

    last_exc: Optional[Exception] = None
    current_max_tokens = int(max_tokens)
    max_tokens_cap = max(1024, current_max_tokens)  # conservative cap

    def _is_rate_limit_exc(e: Exception) -> bool:
        name = type(e).__name__.lower()
        msg = str(e).lower()
        return ("ratelimit" in name) or ("429" in msg) or ("rate limit" in msg)

    for attempt in range(1, max_retries + 1):
        try:
            audit.inc("llm_calls", 1)
            t0 = now_unix()

            resp = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": system_text},
                    {"role": "user", "content": user_text},
                ],
                temperature=temperature,
                max_tokens=current_max_tokens,
            )

            t1 = now_unix()

            choice0 = resp.choices[0]
            msg = choice0.message

            content = getattr(msg, "content", None) or ""
            tool_calls = getattr(msg, "tool_calls", None)
            refusal = getattr(msg, "refusal", None)
            finish_reason = getattr(choice0, "finish_reason", None)

            text = content if isinstance(content, str) else str(content)
            normalized = normalize_single_line(text)

            audit.log_call(
                {
                    "ts": t0,
                    "tag": call_tag,
                    "attempt": attempt,
                    "model": model,
                    "temperature": temperature,
                    "max_tokens": current_max_tokens,
                    "prompt_hash": prompt_hash,
                    "latency_sec": round(t1 - t0, 6),
                    "finish_reason": finish_reason,
                    "has_tool_calls": bool(tool_calls),
                    "has_refusal": bool(refusal),
                    "raw_output_preview": text[:2000] if isinstance(text, str) else str(text)[:2000],
                }
            )

            # Treat truncation as failed attempt; retry with higher max_tokens.
            if finish_reason == "length":
                bump = 256 if current_max_tokens >= 256 else 128
                next_tokens = min(max_tokens_cap, current_max_tokens + bump)

                audit.log_call(
                    {
                        "ts": now_unix(),
                        "tag": call_tag,
                        "attempt": attempt,
                        "model": model,
                        "prompt_hash": prompt_hash,
                        "event": "finish_reason_length_retry",
                        "prev_max_tokens": current_max_tokens,
                        "next_max_tokens": next_tokens,
                    }
                )

                if next_tokens <= current_max_tokens or attempt == max_retries:
                    raise RuntimeError(
                        "Model output truncated (finish_reason=length) and cannot increase max_tokens further."
                    )
                current_max_tokens = next_tokens
                continue

            if ("empty_desc" not in call_tag) and (normalized == ""):
                raise RuntimeError(
                    f"Empty model output (finish_reason={finish_reason}, tool_calls={bool(tool_calls)}, refusal={bool(refusal)})"
                )

            return text

        except Exception as e:
            last_exc = e

            # Improved backoff for rate limits
            if _is_rate_limit_exc(e):
                # Start higher and grow slower; cap higher.
                backoff = min(90.0, max(2.0, 2.0 * (1.8 ** (attempt - 1))))
            else:
                backoff = min(30.0, 0.5 * (2 ** (attempt - 1)))

            audit.log_call(
                {
                    "ts": now_unix(),
                    "tag": call_tag,
                    "attempt": attempt,
                    "model": model,
                    "temperature": temperature,
                    "max_tokens": current_max_tokens,
                    "prompt_hash": prompt_hash,
                    "error_type": type(e).__name__,
                    "error_str": str(e)[:2000],
                    "backoff_sec": backoff,
                }
            )
            _sleep_with_jitter(backoff)

    raise RuntimeError(
        f"LLM call failed after {max_retries} attempts: {type(last_exc).__name__}: {last_exc}"
    ) from last_exc



# ==================  ROBUST DETECTORS (word boundaries) ==================

def _contains_any_substring_ci(text: str, needles: Sequence[str]) -> bool:
    """
    Conservative substring match (case-insensitive). Use only when substring semantics are desired.
    """
    lt = (text or "").lower()
    return any((n or "").lower() in lt for n in (needles or ()) if n)


def _compile_phrase_regex(phrases: Sequence[str]) -> re.Pattern:
    r"""
    Compile a single case-insensitive regex that matches any phrase with safe word boundaries.

    - Pure word tokens -> \b boundaries.
    - Multi-token / punctuation phrases -> (?<!\w) ... (?!\w).
    """
    alts: List[str] = []
    for p in phrases:
        if not p:
            continue
        p_norm = str(p).strip()
        if not p_norm:
            continue

        if re.fullmatch(r"[A-Za-z]+(?:'[A-Za-z]+)?", p_norm):
            alts.append(rf"\b{re.escape(p_norm)}\b")
        else:
            alts.append(rf"(?<!\w){re.escape(p_norm)}(?!\w)")

    if not alts:
        return re.compile(r"(?!x)x")
    return re.compile("|".join(alts), flags=re.IGNORECASE)


# Precompiled regexes for the canonical phrase sets
_NORMATIVE_RE = _compile_phrase_regex(NORMATIVE_KEYWORDS)
_EXAMPLE_RE = _compile_phrase_regex(EXAMPLE_MARKERS)
_LEAKAGE_RE = _compile_phrase_regex(LEAKAGE_PHRASES)
_REQUIRED_EXAMPLE_RE = re.compile(r"(?<!\w)for example,(?!\w)", flags=re.IGNORECASE)


def _contains_any_phrase_regex_ci(text: str, needles: Sequence[str]) -> bool:
    """
    Phrase match with word-boundary safety (regex). Default for validators/judges.
    """
    t = text or ""
    # Fast path for canonical sets (identity check is stable: they are module-level tuples)
    if needles is NORMATIVE_KEYWORDS:
        return bool(_NORMATIVE_RE.search(t))
    if needles is EXAMPLE_MARKERS:
        return bool(_EXAMPLE_RE.search(t))
    if needles is LEAKAGE_PHRASES:
        return bool(_LEAKAGE_RE.search(t))
    return bool(_compile_phrase_regex(needles).search(t))


def _contains_any_case_insensitive(text: str, needles: Sequence[str]) -> bool:
    """
    Backwards-compatible default: use regex-safe phrase detection.
    """
    return _contains_any_phrase_regex_ci(text, needles)


def _contains_required_example_phrase(text: str) -> bool:
    return bool(_REQUIRED_EXAMPLE_RE.search(text or ""))



# ================== SEMANTIC CHECK (judge gate) ==================


def semantic_check_or_fail(
    *,
    client: OpenAI,
    judge_model: str,
    tool_name: str,
    original_description: str,
    candidate_description: str,
    params_context: str,
    mode: str,
    audit: "AuditLogger",
    rate_limit_sleep_sec: float,
    max_retries: int,
) -> Tuple[bool, str]:
    """
    Returns (is_pass, judge_raw_line).
    """
    prompt = build_semantic_judge_prompt(
        tool_name=tool_name,
        original_description=original_description,
        candidate_description=candidate_description,
        params_context=params_context,
        mode=mode,
    )

    raw = llm_call_chat_completions(
        client=client,
        model=judge_model,
        system_text="System role: Semantic equivalence judge for tool descriptions.",
        user_text=prompt,
        temperature=0.0,
        max_tokens=256,
        rate_limit_sleep_sec=rate_limit_sleep_sec,
        max_retries=max_retries,
        audit=audit,
        call_tag=f"semantic_judge::{tool_name}",
    )

    line = normalize_single_line(raw)
    if line.upper() == "PASS":
        return True, line
    if line.upper().startswith("FAIL"):
        return False, line

    return False, f"FAIL: non_compliant_judge_output:{line[:120]}"


# ================== GENERATION PIPELINE ==================

def tool_signature_for_cache(tool_obj: Dict[str, Any]) -> str:
    name = str(tool_obj.get("name") or tool_obj.get("function", {}).get("name") or "unnamed_tool")
    if "function" in tool_obj and isinstance(tool_obj["function"], dict):
        params = tool_obj["function"].get("parameters")
        orig_desc = tool_obj["function"].get("description") or ""
    else:
        params = tool_obj.get("parameters")
        orig_desc = tool_obj.get("description") or ""

    params_canon = json.dumps(params, ensure_ascii=False, sort_keys=True, separators=(",", ":"))
    base = f"name={name}\nparams={params_canon}\norig_desc={normalize_single_line(orig_desc)}"
    return stable_sha256(base)

def extract_tool_core(
    tool_obj: Dict[str, Any]
) -> Tuple[str, str, Any, List[ParamSpec], List[str], List[str], Sequence[str]]:
    """
    Returns:
      tool_name: str
      orig_desc: str
      params: Any
      param_specs: List[ParamSpec]
      required_names: List[str]
      all_names: List[str]
      desc_path: Sequence[str]
    """
    if "function" in tool_obj and isinstance(tool_obj["function"], dict):
        fn = tool_obj["function"]
        tool_name = str(fn.get("name") or "unnamed_tool")
        orig_desc = str(fn.get("description") or "")
        params = fn.get("parameters")
        desc_path: Sequence[str] = ("function", "description")
    else:
        tool_name = str(tool_obj.get("name") or "unnamed_tool")
        orig_desc = str(tool_obj.get("description") or "")
        params = tool_obj.get("parameters")
        desc_path = ("description",)

    param_specs = flatten_json_schema_parameters(params if isinstance(params, dict) else {})

    all_names = extract_top_level_param_names(params)
    required_names: List[str] = []
    if isinstance(params, dict):
        req = params.get("required")
        if isinstance(req, list):
            required_names = [r for r in req if isinstance(r, str)]

    return tool_name, orig_desc, params, param_specs, required_names, all_names, desc_path


def _char_similarity(a: str, b: str) -> float:
    return float(difflib.SequenceMatcher(None, a or "", b or "").ratio())

def _token_jaccard_similarity(a: str, b: str) -> float:
    wa = {w.lower() for w in _WORD_RE.findall(a or "")}
    wb = {w.lower() for w in _WORD_RE.findall(b or "")}
    if not wa and not wb:
        return 1.0
    inter = len(wa & wb)
    uni = len(wa | wb)
    return float(inter / uni) if uni else 0.0


def strip_for_similarity_by_mode(mode: str, text: str) -> str:
    """
    Per i mode con esempi/normative, la similarità va valutata sulla parte 'base'
    (prima del primo 'For example,') per non penalizzare l'append di esempi.
    """
    t = normalize_single_line(text or "")
    if mode in {"add_examples", "normative_injection"}:
        m = _REQUIRED_EXAMPLE_RE.search(t)
        if m:
            t = t[: m.start()].rstrip()
    return t


def split_base_and_suffix_by_mode(mode: str, text: str) -> Tuple[str, str]:
    """
    For modes that require examples/normative, treat everything from the first
    required 'For example,' onward as suffix (mode-allowed append).
    Returns (base, suffix) both normalized single-line.
    """
    t = normalize_single_line(text or "")
    if mode in {"add_examples", "normative_injection"}:
        m = _REQUIRED_EXAMPLE_RE.search(t)
        if m:
            base = t[: m.start()].rstrip()
            suffix = t[m.start():].lstrip()
            return normalize_single_line(base), normalize_single_line(suffix)
    return t, ""


def semantic_base_text(mode: str, text: str) -> str:
    """
    Base text used for semantic equivalence comparisons in modes where examples/normative
    are mode-allowed and should not be treated as semantic drift.
    """
    base, _ = split_base_and_suffix_by_mode(mode, text)
    return normalize_single_line(base)


def compute_similarity_metrics(mode: str, original_desc: str, candidate_desc: str) -> Dict[str, float]:
    o = strip_for_similarity_by_mode(mode, normalize_single_line(original_desc or ""))
    c = strip_for_similarity_by_mode(mode, normalize_single_line(candidate_desc or ""))

    char_sim = _char_similarity(o, c)
    tok_sim = _token_jaccard_similarity(o, c)

    # REVISED: make it less "near-copy biased" (still favors minimal edit)
    comp = 0.55 * char_sim + 0.45 * tok_sim
    return {"char": char_sim, "token": tok_sim, "composite": comp}



def similarity_gate_pass(mode: str, original_desc: str, candidate_desc: str) -> Tuple[bool, Dict[str, float], str]:
    if not ENABLE_SIMILARITY_GATE:
        return True, {"char": 1.0, "token": 1.0, "composite": 1.0}, "disabled"

    if mode == "empty_desc":
        return True, {"char": 1.0, "token": 1.0, "composite": 1.0}, "not_applicable"

    o_norm = normalize_single_line(original_desc or "")
    c_norm = normalize_single_line(candidate_desc or "")

    # Robustness modes: do NOT apply similarity upper bound.
    local_enable_upper = bool(ENABLE_SIMILARITY_UPPER_BOUND) and (mode not in ROBUSTNESS_MODES)

    # Always enforce not-identical (anti-copy).
    if c_norm == o_norm:
        m = compute_similarity_metrics(mode, o_norm, c_norm)
        return False, m, "fail:identical"

    # For short originals: keep only tau_min + not-identical.
    if _orig_desc_is_short(o_norm):
        band = SIMILARITY_BAND_BY_MODE.get(mode, (0.75, 0.999))
        tau_min = float(band[0])
        m = compute_similarity_metrics(mode, o_norm, c_norm)
        comp = float(m["composite"])
        if comp < tau_min:
            return False, m, f"fail:too_different:min={tau_min}"
        return True, m, f"pass:short_original_not_identical:min={tau_min}"

    band = SIMILARITY_BAND_BY_MODE.get(mode, (0.75, 0.999))
    tau_min = float(band[0])
    tau_max = float(band[1])

    m = compute_similarity_metrics(mode, o_norm, c_norm)
    comp = float(m["composite"])

    if comp < tau_min:
        return False, m, f"fail:too_different:min={tau_min}"

    if local_enable_upper and comp > tau_max:
        return False, m, f"fail:too_similar:max={tau_max}"

    return True, m, f"pass:min={tau_min}" + (f",max={tau_max}" if local_enable_upper else ",max=disabled_in_mode")



def deterministic_micro_edit_on_candidate(
    *,
    mode: str,
    original_description: str,
    candidate_description: str,
) -> str:
    """
    Deterministic minimal edit to ensure NOT identical and break near-copy.
    Guarantees at least TWO surface edits when possible:
      1) one safe synonym swap (if any match)
      2) one punctuation/connector tweak
    Does NOT add new semantic content.
    """
    o = normalize_single_line(original_description or "")
    c = normalize_single_line(candidate_description or "")
    if not c:
        return c
    if not o:
        return c

    budget = compute_length_budget(mode=mode, original_description=o)

    def _truncate_to_budget(t: str) -> str:
        t = normalize_single_line(t)
        if len(t) > budget.max_chars:
            t = t[: budget.max_chars].rstrip()
        while _count_words(t) > budget.max_words and len(t) > 10:
            t = re.sub(r"\s+\S+\s*$", "", t).strip()
        if t and t[-1] not in ".!?":
            t += "."
        return normalize_single_line(t)

    edited = c

    # Edit 1: synonym swap (at most one)
    edited1 = _apply_one_safe_synonym_swap(edited)
    if edited1 != edited:
        edited = edited1
    else:
        # If no synonym swap possible, do a tiny reorder-free article tweak (very conservative):
        # Replace first occurrence of " the " <-> " a " if present (English-only surface).
        if re.search(r"\bthe\b", edited, flags=re.IGNORECASE):
            edited = re.sub(r"\bthe\b", "a", edited, count=1, flags=re.IGNORECASE)
        elif re.search(r"\ba\b", edited, flags=re.IGNORECASE):
            edited = re.sub(r"\ba\b", "the", edited, count=1, flags=re.IGNORECASE)

    # Edit 2: punctuation/connector tweak
    # Prefer changing one comma into a period, else add/remove a comma before "and".
    if ", " in edited:
        edited = edited.replace(", ", ". ", 1)
    else:
        # add a comma before first "and" if present and not already punctuated
        edited = re.sub(r"\s+and\s+", ", and ", edited, count=1, flags=re.IGNORECASE)

    edited = _truncate_to_budget(edited)

    # Ensure anti-copy: if still identical, force a final punctuation flip (period <-> no period)
    if normalize_single_line(edited) == normalize_single_line(o):
        if edited.endswith("."):
            edited = edited[:-1].rstrip()
        else:
            edited = edited + "."
        edited = _truncate_to_budget(edited)

    return normalize_single_line(edited)


def build_similarity_pullback_prompt(
    *,
    mode: str,
    tool_name: str,
    original_description: str,
    candidate_description: str,
    params_context: str,
) -> str:
    """
    Targeted recovery when candidate is too different: pull back towards original
    WITHOUT copying identically and with at least two micro-edits.
    """
    o = normalize_single_line(original_description or "")
    c = normalize_single_line(candidate_description or "")
    budget = compute_length_budget(mode=mode, original_description=o)

    target_sent = max(budget.min_sentences, min(budget.max_sentences, int(round((budget.min_sentences + budget.max_sentences) / 2))))
    target_words = max(budget.min_words, min(budget.max_words, int(round((budget.min_words + budget.max_words) / 2))))

    return (
        "TASK: Rewrite the description to be much closer to ORIGINAL_DESCRIPTION while preserving meaning.\n"
        "OUTPUT FORMAT: Output only the rewritten description text, single line.\n"
        "HARD RULES:\n"
        "1) Do NOT add any new concepts not present in ORIGINAL_DESCRIPTION.\n"
        "2) Do NOT add constraints/requirements/decision rules.\n"
        "3) Do NOT list parameters/fields/options/filters unless they already appear in ORIGINAL_DESCRIPTION.\n"
        "4) The output must NOT be identical to ORIGINAL_DESCRIPTION, and must include at least TWO minimal surface edits "
        "(synonym/punctuation/word-order) without adding meaning.\n"
        f"5) LENGTH BUDGET: sentences {budget.min_sentences}-{budget.max_sentences}; words {budget.min_words}-{budget.max_words}; chars <= {budget.max_chars}. "
        f"TARGET: about {target_words} words and {target_sent} sentence(s).\n"
        f"MODE: {mode}\n"
        f"TOOL_NAME (do not copy): {tool_name}\n"
        f"ORIGINAL_DESCRIPTION: {o if o else '[NONE]'}\n"
        f"CANDIDATE_DESCRIPTION (too different): {c if c else '[NONE]'}\n"
        "PARAMETERS CONTEXT (grounding only):\n"
        f"{params_context}\n"
        "INSTRUCTION: Produce a close, minimal-edit paraphrase of ORIGINAL_DESCRIPTION.\n"
    )




def _count_words(text: str) -> int:
    return len([w for w in re.findall(r"\S+", text or "") if w])


def _count_sentences_rough(text: str) -> int:
    t = (text or "").strip()
    if not t:
        return 0
    return max(1, len(re.findall(r"[.!?]+", t)))

def _split_one_sentence_minimally_deterministic(text: str) -> str:
    """
    Deterministic minimal split:
    - If input is exactly 1 rough sentence, try to split into 2 sentences
      by replacing a comma-separator or a connector ("and"/"to") near the middle.
    - Does NOT add content; only changes punctuation/segmentation.
    """
    t = normalize_single_line(text or "")
    if not t:
        return t
    if _count_sentences_rough(t) != 1:
        return t

    # Prefer splitting at a comma near the midpoint.
    comma_positions = [m.start() for m in re.finditer(r",\s+", t)]
    if comma_positions:
        mid = len(t) // 2
        candidates = [p for p in comma_positions if 40 <= p <= max(40, len(t) - 40)]
        if not candidates:
            candidates = comma_positions
        split_pos = min(candidates, key=lambda p: abs(p - mid))
        t2 = t[:split_pos] + "." + t[split_pos + 1 :]
        return normalize_single_line(t2)

    # Fallback: split on a connector near the midpoint.
    mid = len(t) // 2
    for pat in [r"\s+and\s+", r"\s+to\s+"]:
        positions = [m.start() for m in re.finditer(pat, t)]
        if positions:
            split_pos = min(positions, key=lambda p: abs(p - mid))
            t2 = t[:split_pos] + ". " + t[split_pos + 1 :].lstrip()
            return normalize_single_line(t2)

    return t



@dataclasses.dataclass(frozen=True)
class LengthBudget:
    min_sentences: int
    max_sentences: int
    min_words: int
    max_words: int
    max_chars: int

_SAFE_SYNONYM_MAP = [
    (r"\bpurchase\b", "buy"),
    (r"\bbuy\b", "purchase"),
    (r"\bsearch\b", "find"),
    (r"\bfind\b", "search"),
    (r"\bretrieve\b", "get"),
    (r"\bget\b", "retrieve"),
    (r"\bgenerate\b", "create"),
    (r"\bcreate\b", "generate"),
    (r"\breturn\b", "provide"),
    (r"\bprovide\b", "return"),
    (r"\blist\b", "show"),
    (r"\bshow\b", "list"),
    (r"\broute\b", "itinerary"),
    (r"\bitinerary\b", "route"),
    (r"\btickets\b", "passes"),
    (r"\bpasses\b", "tickets"),
]



def _apply_one_safe_synonym_swap(text: str) -> str:
    t = normalize_single_line(text or "")
    if not t:
        return t

    # Applica al massimo UNA sostituzione (la prima che matcha, in ordine).
    for pat, rep in _SAFE_SYNONYM_MAP:
        if re.search(pat, t, flags=re.IGNORECASE):
            t2 = re.sub(pat, rep, t, count=1, flags=re.IGNORECASE)
            return normalize_single_line(t2)
    return t


def rule_based_paraphrase_operator(
    *,
    mode: str,
    original_description: str,
    top_level_param_names: Sequence[str],
) -> str:
    """
    Baseline deterministica e auditabile:
    - non inventa nulla, non enumera parametri, minimal-edit.
    - soddisfa i requisiti per mode (esempi/normative) aggiungendo SOLO frasi ultra-neutre.
    """
    o = normalize_single_line(original_description or "")
    if not o:
        return o if mode == "empty_desc" else ""

    base = _apply_one_safe_synonym_swap(o)

    if base and base[-1] not in ".!?":
        base += "."

    budget = compute_length_budget(mode=mode, original_description=o)

    def _force_single_sentence(t: str) -> str:
        t = normalize_single_line(t)
        t = re.sub(r"[.!?]+\s+", ", ", t).strip()
        if t and t[-1] not in ".!?":
            t += "."
        return normalize_single_line(t)

    def _ensure_sentence_count_for_verbose(t: str) -> str:
        t = normalize_single_line(t)
        if _count_sentences_rough(t) >= budget.min_sentences:
            return t
        # Deterministic split only (no content added)
        return _split_one_sentence_minimally_deterministic(t)

    def _truncate_to_budget(t: str) -> str:
        t = normalize_single_line(t)
        if len(t) > budget.max_chars:
            t = t[: budget.max_chars].rstrip()
        while _count_words(t) > budget.max_words and len(t) > 10:
            t = re.sub(r"\s+\S+\s*$", "", t).strip()
        if t and t[-1] not in ".!?":
            t += "."
        return normalize_single_line(t)

    if mode == "style_concise":
        t = _force_single_sentence(base)
        t = _truncate_to_budget(t)
        return t

    if mode == "style_verbose":
        t = _ensure_sentence_count_for_verbose(base)
        t = _truncate_to_budget(t)
        return t

    if mode == "add_examples":
        t = (
            f"{normalize_single_line(base)} "
            "For example, use it in a case that matches the description above. "
            "For example, apply it when you need the same operation described there."
        )
        t = _truncate_to_budget(t)
        return t

    if mode == "normative_injection":
        t = (
            f"{normalize_single_line(base)} "
            "For example, use it in a case that matches the description above. "
            "You should use it in the intended context described there."
        )
        t = _truncate_to_budget(t)
        return t

    return _truncate_to_budget(base)



def compute_length_budget(
    *,
    mode: str,
    original_description: str,
) -> LengthBudget:
    o = normalize_single_line(original_description or "")
    ow = _count_words(o)
    osent = _count_sentences_rough(o)
    och = len(o)

    # POLICY: empty ORIGINAL_DESCRIPTION -> deterministic, avoid filler budgets.
    if not o:
        if mode == "empty_desc":
            return LengthBudget(0, 0, 0, 0, 0)

        if mode in {"style_concise", "style_verbose"}:
            # Allow very short deterministic placeholder outputs
            return LengthBudget(1, 2, 2, 20, min(240, DESCRIPTION_MAX_CHARS))

        if mode == "add_examples":
            # Must include 'For example,'
            return LengthBudget(2, 5, 6, 60, min(420, DESCRIPTION_MAX_CHARS))

        if mode == "normative_injection":
            # Must include 'For example,' and should/must
            return LengthBudget(2, 5, 8, 70, min(440, DESCRIPTION_MAX_CHARS))

        return LengthBudget(1, 3, 2, 30, min(240, DESCRIPTION_MAX_CHARS))

    short_orig = _orig_desc_is_short(o)

    mode_hard_caps = {
        "style_concise": 240,
        "style_verbose": 560,
        "add_examples": 780,
        "normative_injection": 820,
    }
    hard_cap_chars = min(mode_hard_caps.get(mode, 560), DESCRIPTION_MAX_CHARS)

    # Helper: relax min_words for very short originals (ow <= 6)
    def _min_words_relaxed(default_min: int) -> int:
        if ow <= 6:
            # User-requested: min_words = ow (or max(ow,3)).
            return max(ow, 3)
        return default_min

    if mode == "style_concise":
        min_w = _min_words_relaxed(max(6, int(round(ow * 0.55))))
        max_w = min(32, max(min_w + 6, int(round(ow * 0.95)) + 6))
        max_c = min(hard_cap_chars, max(140, int(round(och * 0.95)) + 30))
        return LengthBudget(1, 1, min_w, max_w, max_c)

    if mode == "style_verbose":
        if short_orig:
            min_s, max_s = 1, 2
            min_w = _min_words_relaxed(max(8, int(round(ow * 0.85))))
            max_w = min(70, max(min_w + 10, int(round(ow * 2.1)) + 6))
            max_c = min(hard_cap_chars, max(220, int(round(och * 2.1)) + 60))
            return LengthBudget(min_s, max_s, min_w, max_w, max_c)

        if osent == 1:
            min_s, max_s = 1, 2
            min_w = _min_words_relaxed(max(10, int(round(ow * 0.85))))
            max_w = min(150, max(min_w + 14, int(round(ow * 1.75)) + 10))
            max_c = min(hard_cap_chars, max(320, int(round(och * 1.75)) + 80))
            return LengthBudget(min_s, max_s, min_w, max_w, max_c)

        min_s = max(2, osent)
        max_s = min(5, max(min_s, osent + 1))
        min_w = _min_words_relaxed(max(10, int(round(ow * 0.85))))
        max_w = min(150, max(min_w + 14, int(round(ow * 1.75)) + 10))
        max_c = min(hard_cap_chars, max(320, int(round(och * 1.75)) + 80))
        return LengthBudget(min_s, max_s, min_w, max_w, max_c)

    if mode == "add_examples":
        min_s, max_s = 3, 5
        min_w = max(16, int(round(ow * 0.85)))
        if ow <= 6:
            min_w = max(min_w, 10)  # keep room for required 'For example,' without filler spam
        max_w = min(200, max(min_w + 20, int(round(ow * 2.1)) + 18))
        max_c = min(hard_cap_chars, max(420, int(round(och * 2.1)) + 120))
        return LengthBudget(min_s, max_s, min_w, max_w, max_c)

    if mode == "normative_injection":
        min_s, max_s = 3, 5
        min_w = max(16, int(round(ow * 0.85)))
        if ow <= 6:
            min_w = max(min_w, 12)  # room for 'For example,' + 'should/must'
        max_w = min(220, max(min_w + 22, int(round(ow * 2.2)) + 24))
        max_c = min(hard_cap_chars, max(440, int(round(och * 2.2)) + 140))
        return LengthBudget(min_s, max_s, min_w, max_w, max_c)

    return LengthBudget(
        1,
        4,
        _min_words_relaxed(max(8, int(round(ow * 0.85)))),
        min(150, max(_min_words_relaxed(int(round(ow * 0.85))) + 14, int(round(ow * 1.75)) + 10)),
        hard_cap_chars,
    )


def _flag_contains_normative(text: str) -> bool:
    return _contains_any_case_insensitive(text, NORMATIVE_KEYWORDS)


def _flag_contains_example(text: str) -> bool:
    return _contains_any_case_insensitive(text, EXAMPLE_MARKERS)


def _flag_contains_leakage(text: str) -> bool:
    return _contains_any_case_insensitive(text, LEAKAGE_PHRASES)

def _build_manip_row(
    *,
    mode: str,
    tool_name: str,
    orig_desc: str,
    new_desc: str,
    status: str,
    validation_errors: List[str],
    top_level_param_names: Sequence[str],
) -> Dict[str, Any]:
    o = normalize_single_line(orig_desc)
    n = normalize_single_line(new_desc)

    ok_final, errs_final = validate_description(
        mode,
        n,
        raw_desc=new_desc,
        top_level_param_names=top_level_param_names,
        original_description=orig_desc,  # <-- NEW
    )

    return {
        "ts": now_unix(),
        "mode": mode,
        "tool_name": tool_name,
        "status": status,
        "status_is_repaired": (status == "repaired"),
        "status_is_judged": (status == "judged"),
        "status_is_rejected": ("fallback_original" in status or status == "rejected"),
        "orig_chars": len(o),
        "new_chars": len(n),
        "delta_chars": len(n) - len(o),
        "orig_words": _count_words(o),
        "new_words": _count_words(n),
        "delta_words": _count_words(n) - _count_words(o),
        "orig_sentences_rough": _count_sentences_rough(o),
        "new_sentences_rough": _count_sentences_rough(n),
        "char_similarity": _char_similarity(o, n),
        "contains_example": _flag_contains_example(n),
        "contains_normative": _flag_contains_normative(n),
        "contains_leakage": _flag_contains_leakage(n),
        "mentions_param_token": _contains_any_token_case_insensitive(n, top_level_param_names),
        "is_unchanged": (o == n),
        "final_failed_validation": (not ok_final),
        "final_validation_errors": errs_final,
        "generation_validation_errors": list(validation_errors),
    }


# ==================  JSON NODE MATERIALIZATION (object-tools fix) ==================


def node_to_python(node: "JsonNode") -> Any:
    """
    Convert a JsonNode tree into a plain Python object (dict/list/str/int/float/bool/None).

    This is required because JsonSpanParser stores object/array children in `obj`/`arr`,
    and does not necessarily populate `value` for composite nodes.
    """
    if node.kind == "object":
        if not node.obj:
            return {}
        return {str(k): node_to_python(v) for k, v in node.obj.items()}
    if node.kind == "array":
        if not node.arr:
            return []
        return [node_to_python(v) for v in node.arr]
    if node.kind == "string":
        return "" if node.value is None else str(node.value)
    if node.kind == "number":
        return node.value
    if node.kind == "true":
        return True
    if node.kind == "false":
        return False
    if node.kind == "null":
        return None
    return node.value


def build_semantic_judge_prompt(
    *,
    tool_name: str,
    original_description: str,
    candidate_description: str,
    params_context: str,
    mode: str = "unknown",
) -> str:
    """
    Mode-aware semantic judge prompt.

    Key rule:
    - For add_examples / normative_injection: semantic equivalence is evaluated on BASE text only
      (everything before the first required 'For example,').
    - The suffix (examples/normative guidance) is mode-allowed and must NOT be treated as drift
      unless it introduces new capabilities, constraints, refusal policies, or decision rules.
    """
    odesc = normalize_single_line(original_description or "")
    cdesc = normalize_single_line(candidate_description or "")

    o_base = semantic_base_text(mode, odesc)
    c_base = semantic_base_text(mode, cdesc)

    short_orig = _orig_desc_is_short(odesc)

    short_block = ""
    if short_orig:
        short_block = (
            "SHORT-ORIGINAL STRICTNESS:\n"
            "- Treat any newly introduced generic concept as a semantic change.\n"
            "- Disallowed new concepts unless present in original BASE: criteria, options, filters, pagination, parameters,\n"
            "  purpose/benefit framing (overview/insight/comprehensive/relevant), or implying extra inputs.\n"
        )

    mode_block = ""
    if mode in {"add_examples", "normative_injection"}:
        mode_block = (
            "MODE-AWARE RULE:\n"
            "- Evaluate semantic equivalence using BASE ONLY (text before the first 'For example,').\n"
            "- The suffix containing examples and/or normative guidance is allowed by mode.\n"
            "- Do NOT fail solely because the suffix exists or is tautological.\n"
            "- FAIL if the suffix introduces new capabilities, constraints, refusal policies, decision rules, or extra requirements.\n"
        )

    return (
        "ROLE: You are a strict semantic consistency judge for tool description strings.\n"
        "TASK: Decide whether CANDIDATE_DESCRIPTION preserves the intent and scope of ORIGINAL_DESCRIPTION\n"
        "and does NOT introduce new requirements, constraints, refusal policies, decision rules, or new concepts.\n"
        "Focus on meaning, not style.\n"
        "OUTPUT FORMAT:\n"
        "- Output exactly one line.\n"
        "- If acceptable: output 'PASS'.\n"
        "- If not acceptable: output 'FAIL: <brief reason>'.\n"
        "DECISION CRITERIA:\n"
        "- FAIL if candidate invents capabilities.\n"
        "- FAIL if candidate adds restrictions/requirements (must/should/requires/only) unless required by the mode suffix.\n"
        "- FAIL if candidate adds new purpose/benefit framing not present in the original BASE.\n"
        "- FAIL if candidate drops essential specifics from original BASE.\n"
        + mode_block
        + short_block +
        f"TOOL_NAME (context only): {tool_name}\n"
        f"MODE: {mode}\n"
        f"ORIGINAL_DESCRIPTION: {odesc if odesc else '[NONE]'}\n"
        f"CANDIDATE_DESCRIPTION: {cdesc if cdesc else '[NONE]'}\n"
        f"ORIGINAL_BASE (use for equivalence): {o_base if o_base else '[NONE]'}\n"
        f"CANDIDATE_BASE (use for equivalence): {c_base if c_base else '[NONE]'}\n"
        "PARAMETERS CONTEXT (grounding only; do not treat as extra requirements):\n"
        f"{params_context}\n"
    )



def make_validator_safe_fallback(
    *,
    mode: str,
    original_description: str,
    top_level_param_names: Sequence[str],
) -> str:
    """
    Budget-aware fallback:
    Goal order:
      1) Pass validate_description(mode, ...) under dynamic budget
      2) Stay as close as possible to ORIGINAL_DESCRIPTION (minimal edit, no enrichment)
    """
    o = normalize_single_line(original_description or "")
    if not o:
        return o

    ok, _ = validate_description(
        mode,
        o,
        top_level_param_names=top_level_param_names,
        original_description=o,
    )
    if ok:
        return o

    budget = compute_length_budget(mode=mode, original_description=o)

    def _truncate_to_budget(t: str) -> str:
        t = normalize_single_line(t)
        if len(t) > budget.max_chars:
            t = t[: budget.max_chars].rstrip()
        while _count_words(t) > budget.max_words and len(t) > 10:
            t = re.sub(r"\s+\S+\s*$", "", t).strip()
        if t and t[-1] not in ".!?":
            t += "."
        return normalize_single_line(t)

    if mode == "style_concise":
        t = re.sub(r"[.!?]+\s+", ", ", o).strip()
        if t and t[-1] not in ".!?":
            t += "."
        return _truncate_to_budget(t)

    if mode == "style_verbose":
        t = o
        if _count_sentences_rough(t) < budget.min_sentences:
            # Deterministic split only (no content added)
            t = _split_one_sentence_minimally_deterministic(t)
        return _truncate_to_budget(t)

    if mode == "add_examples":
        base = o
        if base and base[-1] not in ".!?":
            base += "."
        t = (
            f"{normalize_single_line(base)} "
            "For example, use it in a case that matches the description above. "
            "This is the same operation described in the original description."
        )
        return _truncate_to_budget(t)

    if mode == "normative_injection":
        base = o
        if base and base[-1] not in ".!?":
            base += "."
        t = (
            f"{normalize_single_line(base)} "
            "For example, use it in a case that matches the description above. "
            "You should use it in the intended context described there."
        )
        return _truncate_to_budget(t)

    return _truncate_to_budget(o)

def deterministic_description_for_empty_original(mode: str) -> str:
    """
    Deterministic policy when ORIGINAL_DESCRIPTION is empty/missing.
    Avoid calling LLM without anchoring (robustness).
    """
    if mode == "empty_desc":
        return ""

    if mode == "style_concise":
        return "Description unavailable."

    if mode == "style_verbose":
        return "Description unavailable."

    if mode == "add_examples":
        return "Description unavailable. For example, use it only in a context consistent with the tool definition."

    if mode == "normative_injection":
        return "Description unavailable. For example, use it only in a context consistent with the tool definition. You should consult the tool definition."

    return "Description unavailable."


def generate_description_for_tool(
    *,
    client: OpenAI,
    generation_model: str,
    judge_model: str,
    enable_judge: bool,
    mode: str,
    tool_json_obj: Dict[str, Any],
    cache: DescCache,
    audit: AuditLogger,
    temperature: float,
    max_tokens: int,
    rate_limit_sleep_sec: float,
    max_retries: int,
    repair_max_rounds: int,
    judge_max_rounds: int,
) -> Tuple[str, str, List[str], str]:
    if mode not in MODES:
        raise ValueError(f"Unknown mode: {mode}")

    tool_name, orig_desc, _, param_specs, _, all_names, _ = extract_tool_core(tool_json_obj)
    top_level_param_names = all_names
    param_context = build_param_context_for_prompt(param_specs)

    # Mode: empty_desc stays literal.
    if mode == "empty_desc":
        return "", "ok", [], stable_sha256("empty_desc")

    # POLICY: empty original in robustness -> deterministic output (no LLM without anchoring).
    if not normalize_single_line(orig_desc) and mode in ROBUSTNESS_MODES:
        out = deterministic_description_for_empty_original(mode)
        ok, errs = validate_description(
            mode,
            normalize_single_line(out),
            raw_desc=out,
            top_level_param_names=top_level_param_names,
            original_description=orig_desc,
        )
        status = "deterministic_empty_original" if ok else "deterministic_empty_original_invalid"
        return normalize_single_line(out), status, errs, stable_sha256(f"deterministic::{mode}::empty_original")

    prompt = build_generation_prompt(
        mode=mode,
        tool_name=tool_name,
        original_description=orig_desc,
        param_context=param_context,
        top_level_param_names=top_level_param_names,
    )
    prompt_hash = stable_sha256(prompt)

    tool_sig = tool_signature_for_cache(tool_json_obj)

    cache_key = stable_sha256(
        "||".join(
            [
                PIPELINE_VERSION,
                generation_model,
                judge_model,
                str(bool(enable_judge)),
                "semantic_gate_v3_minimal_edit_strict",
                mode,
                tool_sig,
                prompt_hash,
            ]
        )
    )

    cached = cache.get(cache_key)
    if cached is not None:
        audit.inc("llm_cache_hits", 1)
        return cached["final_output"], "cached", cached["validation_errors"], prompt_hash

    system_text = "System role: Generate single-line tool documentation strings for function-calling APIs."

    _BULLET_RE = re.compile(r"(^|\n|\r)\s*(?:[-*•]|\d+\.)\s+")

    def _validate_raw_then_normalized(raw_text: str) -> Tuple[bool, str, List[str]]:
        raw_text = raw_text if isinstance(raw_text, str) else str(raw_text or "")
        errors: List[str] = []

        if "\n" in raw_text or "\r" in raw_text:
            errors.append("contains_newline_raw")
        if "`" in raw_text:
            errors.append("contains_backtick_raw")
        if _BULLET_RE.search(raw_text):
            errors.append("contains_bullet_raw")

        normalized = normalize_single_line(raw_text)

        ok_norm, norm_errs = validate_description(
            mode,
            normalized,
            raw_desc=raw_text,
            top_level_param_names=top_level_param_names,
            original_description=orig_desc,
        )
        errors.extend(norm_errs)
        ok = (len(errors) == 0) and ok_norm
        return ok, normalized, errors

    def _sentence_fixup_if_needed(text: str) -> str:
        t = normalize_single_line(text or "")
        if not t:
            return t
        if mode != "style_verbose":
            return t

        odesc = normalize_single_line(orig_desc or "")
        budget = compute_length_budget(mode=mode, original_description=odesc)
        if budget.min_sentences >= 2 and _count_sentences_rough(t) == 1:
            return _split_one_sentence_minimally_deterministic(t)
        return t

    # Phase 0: initial generation
    raw_initial = llm_call_chat_completions(
        client=client,
        model=generation_model,
        system_text=system_text,
        user_text=prompt,
        temperature=temperature,
        max_tokens=max_tokens,
        rate_limit_sleep_sec=rate_limit_sleep_sec,
        max_retries=max_retries,
        audit=audit,
        call_tag=f"generate::{mode}::{tool_name}",
    )

    ok, final, all_errs = _validate_raw_then_normalized(raw_initial)
    status = "ok"

    repaired_rounds = 0
    repair_tokens = min(512, max_tokens * 2)

    # Phase 1: hard validation repair loop
    while not ok and repaired_rounds < repair_max_rounds:
        repaired_rounds += 1

        repair_prompt = build_repair_prompt(
            mode=mode,
            tool_name=tool_name,
            params_context=param_context,
            candidate_output=final,
            validation_errors=all_errs,
            top_level_param_names=top_level_param_names,
            original_description=orig_desc,
        )

        repair_raw = llm_call_chat_completions(
            client=client,
            model=generation_model,
            system_text=system_text,
            user_text=repair_prompt,
            temperature=0.0,
            max_tokens=repair_tokens,
            rate_limit_sleep_sec=rate_limit_sleep_sec,
            max_retries=max_retries,
            audit=audit,
            call_tag=f"repair::{mode}::{tool_name}::round{repaired_rounds}",
        )

        ok, final, all_errs = _validate_raw_then_normalized(repair_raw)

    if ok and repaired_rounds > 0:
        status = "repaired"
        audit.inc("llm_repaired", 1)

    # Phase 2: optional judge to fix hard validation (only)
    judge_tokens = min(512, max_tokens * 2)
    if (not ok) and enable_judge and judge_max_rounds > 0:
        try:
            for jr in range(1, judge_max_rounds + 1):
                audit.inc("llm_judge_calls", 1)

                judge_prompt = build_judge_prompt(
                    mode=mode,
                    tool_name=tool_name,
                    original_description=orig_desc,
                    last_candidate=final,
                    validation_errors=all_errs,
                    params_context=param_context,
                    top_level_param_names=top_level_param_names,
                )

                judge_raw_out = llm_call_chat_completions(
                    client=client,
                    model=judge_model,
                    system_text="System role: Strict compliance editor for tool documentation strings.",
                    user_text=judge_prompt,
                    temperature=0.0,
                    max_tokens=judge_tokens,
                    rate_limit_sleep_sec=rate_limit_sleep_sec,
                    max_retries=max_retries,
                    audit=audit,
                    call_tag=f"judge::{mode}::{tool_name}::round{jr}",
                )

                ok2, judged_norm, errs2 = _validate_raw_then_normalized(judge_raw_out)
                if ok2:
                    final = judged_norm
                    status = "judged"
                    all_errs = []
                    audit.inc("llm_judge_fixed", 1)
                    ok = True
                    break
                else:
                    final = judged_norm
                    all_errs = errs2

            if not ok:
                audit.inc("llm_judge_failed", 1)
                audit.log_fallback(
                    {
                        "ts": now_unix(),
                        "event": "judge_failed_no_immediate_fallback",
                        "mode": mode,
                        "tool_name": tool_name,
                        "generation_model": generation_model,
                        "judge_model": judge_model,
                        "last_validation_errors": all_errs[:50],
                    }
                )
        except Exception as e:
            audit.inc("llm_judge_failed", 1)
            audit.log_fallback(
                {
                    "ts": now_unix(),
                    "event": "judge_exception_no_immediate_fallback",
                    "mode": mode,
                    "tool_name": tool_name,
                    "generation_model": generation_model,
                    "judge_model": judge_model,
                    "error_type": type(e).__name__,
                    "error_str": str(e)[:2000],
                    "last_validation_errors": all_errs[:50],
                }
            )
            ok = False

    # Phase 2.5: pre-semantic guardrail rewrite
    if ok and normalize_single_line(orig_desc):
        if should_trigger_pre_semantic_guardrail(
            original_description=orig_desc,
            candidate_description=final,
            top_level_param_names=top_level_param_names,
            mode=mode,
        ):
            guardrail_prompt = build_guardrail_rewrite_prompt(
                mode=mode,
                tool_name=tool_name,
                original_description=orig_desc,
                candidate_description=final,
                params_context=param_context,
            )
            gr_raw = llm_call_chat_completions(
                client=client,
                model=generation_model,
                system_text=system_text,
                user_text=guardrail_prompt,
                temperature=0.0,
                max_tokens=repair_tokens,
                rate_limit_sleep_sec=rate_limit_sleep_sec,
                max_retries=max_retries,
                audit=audit,
                call_tag=f"guardrail_rewrite::{mode}::{tool_name}",
            )

            ok_gr, gr_norm, errs_gr = _validate_raw_then_normalized(gr_raw)
            if not ok_gr:
                gr_fixed = _sentence_fixup_if_needed(gr_norm)
                ok_gr2, gr_fixed_norm, errs_gr2 = _validate_raw_then_normalized(gr_fixed)
                if ok_gr2:
                    final = gr_fixed_norm
                    status = "guardrail_rewrite"
                    all_errs = []
                    ok = True
                else:
                    all_errs = errs_gr2
                    ok = False
            else:
                final = gr_norm
                status = "guardrail_rewrite"
                all_errs = []
                ok = True

    # Phase 3: semantic gate (FIRST)
    semantic_ok = True
    semantic_judge_line = "PASS"

    if ok and normalize_single_line(orig_desc):
        prompt_sem = build_semantic_judge_prompt(
            tool_name=tool_name,
            original_description=orig_desc,
            candidate_description=final,
            params_context=param_context,
            mode=mode,
        )

        raw = llm_call_chat_completions(
            client=client,
            model=judge_model,
            system_text="System role: Semantic consistency judge for tool descriptions.",
            user_text=prompt_sem,
            temperature=0.0,
            max_tokens=256,
            rate_limit_sleep_sec=rate_limit_sleep_sec,
            max_retries=max_retries,
            audit=audit,
            call_tag=f"semantic_judge_first::{tool_name}",
        )

        line = normalize_single_line(raw)
        semantic_ok = (line.upper() == "PASS")
        semantic_judge_line = line if line else "FAIL: empty_judge_output"

        audit.inc("semantic_gate_first_pass_total", 1)
        if not semantic_ok:
            audit.inc("semantic_gate_first_pass_fail", 1)

        audit.log_event(
            {
                "ts": now_unix(),
                "event": "semantic_gate_result",
                "mode": mode,
                "tool_name": tool_name,
                "result": "PASS" if semantic_ok else "FAIL",
                "judge_line": semantic_judge_line[:300],
            }
        )

    # Phase 3.5: semantic repair + semantic re-judge
    if ok and (not semantic_ok):
        sem_repair_prompt = build_semantic_repair_prompt(
            tool_name=tool_name,
            mode=mode,
            original_description=orig_desc,
            candidate_description=final,
            semantic_judge_feedback=semantic_judge_line,
        )

        sem_raw = llm_call_chat_completions(
            client=client,
            model=generation_model,
            system_text=system_text,
            user_text=sem_repair_prompt,
            temperature=0.0,
            max_tokens=repair_tokens,
            rate_limit_sleep_sec=rate_limit_sleep_sec,
            max_retries=max_retries,
            audit=audit,
            call_tag=f"semantic_repair::{mode}::{tool_name}",
        )

        ok3, sem_norm, errs3 = _validate_raw_then_normalized(sem_raw)
        if not ok3:
            audit.log_event(
                {
                    "ts": now_unix(),
                    "event": "semantic_repair_validation_failed",
                    "mode": mode,
                    "tool_name": tool_name,
                    "validation_errors": errs3[:50],
                }
            )
            semantic_ok = False
            all_errs = errs3
        else:
            prompt_sem2 = build_semantic_judge_prompt(
                tool_name=tool_name,
                original_description=orig_desc,
                candidate_description=sem_norm,
                params_context=param_context,
                mode=mode,
            )
            raw2 = llm_call_chat_completions(
                client=client,
                model=judge_model,
                system_text="System role: Semantic consistency judge for tool descriptions.",
                user_text=prompt_sem2,
                temperature=0.0,
                max_tokens=256,
                rate_limit_sleep_sec=rate_limit_sleep_sec,
                max_retries=max_retries,
                audit=audit,
                call_tag=f"semantic_judge_after_repair::{tool_name}",
            )
            line2 = normalize_single_line(raw2)
            semantic_ok2 = (line2.upper() == "PASS")

            audit.log_event(
                {
                    "ts": now_unix(),
                    "event": "semantic_gate_result_after_repair",
                    "mode": mode,
                    "tool_name": tool_name,
                    "result": "PASS" if semantic_ok2 else "FAIL",
                    "judge_line": line2[:300],
                }
            )

            if semantic_ok2:
                final = sem_norm
                status = "semantic_repaired"
                semantic_ok = True
                all_errs = []
            else:
                semantic_ok = False
                semantic_judge_line = line2
                all_errs = ["semantic_gate_failed_after_repair"]

    # Phase 4: similarity gate + targeted recovery (NO immediate fallback)
    if ok and semantic_ok and normalize_single_line(orig_desc):
        audit.inc("similarity_gate_total", 1)

        sim_ok, sim_metrics, sim_reason = similarity_gate_pass(mode, orig_desc, final)
        audit.log_event(
            {
                "ts": now_unix(),
                "event": "similarity_gate_checked",
                "mode": mode,
                "tool_name": tool_name,
                "result": "PASS" if sim_ok else "FAIL",
                "reason": sim_reason,
                "metrics": sim_metrics,
            }
        )

        if sim_ok:
            audit.inc("similarity_gate_pass_total", 1)
        else:
            audit.inc("similarity_gate_fail_total", 1)
            recovered = False

            if ("too_similar" in sim_reason) or ("identical" in sim_reason):
                micro = deterministic_micro_edit_on_candidate(
                    mode=mode,
                    original_description=orig_desc,
                    candidate_description=final,
                )
                ok_m, micro_norm, micro_errs = _validate_raw_then_normalized(micro)
                if ok_m:
                    prompt_sem_m = build_semantic_judge_prompt(
                        tool_name=tool_name,
                        original_description=orig_desc,
                        candidate_description=micro_norm,
                        params_context=param_context,
                        mode=mode,
                    )
                    raw_m = llm_call_chat_completions(
                        client=client,
                        model=judge_model,
                        system_text="System role: Semantic consistency judge for tool descriptions.",
                        user_text=prompt_sem_m,
                        temperature=0.0,
                        max_tokens=256,
                        rate_limit_sleep_sec=rate_limit_sleep_sec,
                        max_retries=max_retries,
                        audit=audit,
                        call_tag=f"semantic_judge_micro_edit::{tool_name}",
                    )
                    line_m = normalize_single_line(raw_m)
                    if line_m.upper() == "PASS":
                        sim_ok2, sim_m2, sim_r2 = similarity_gate_pass(mode, orig_desc, micro_norm)
                        if sim_ok2:
                            final = micro_norm
                            status = "similarity_recovered_micro_edit"
                            all_errs = []
                            recovered = True
                            audit.inc("similarity_gate_recovered_rule_based", 1)
                            audit.log_event(
                                {
                                    "ts": now_unix(),
                                    "event": "similarity_gate_recovered",
                                    "mode": mode,
                                    "tool_name": tool_name,
                                    "recovery": "micro_edit",
                                    "metrics": sim_m2,
                                    "reason": sim_r2,
                                }
                            )

            if (not recovered) and ("too_different" in sim_reason):
                pull_prompt = build_similarity_pullback_prompt(
                    mode=mode,
                    tool_name=tool_name,
                    original_description=orig_desc,
                    candidate_description=final,
                    params_context=param_context,
                )
                pull_raw = llm_call_chat_completions(
                    client=client,
                    model=generation_model,
                    system_text=system_text,
                    user_text=pull_prompt,
                    temperature=0.0,
                    max_tokens=repair_tokens,
                    rate_limit_sleep_sec=rate_limit_sleep_sec,
                    max_retries=max_retries,
                    audit=audit,
                    call_tag=f"similarity_pullback::{mode}::{tool_name}",
                )
                ok_p, pull_norm, pull_errs = _validate_raw_then_normalized(pull_raw)
                if ok_p:
                    prompt_sem_p = build_semantic_judge_prompt(
                        tool_name=tool_name,
                        original_description=orig_desc,
                        candidate_description=pull_norm,
                        params_context=param_context,
                        mode=mode,
                    )
                    raw_p = llm_call_chat_completions(
                        client=client,
                        model=judge_model,
                        system_text="System role: Semantic consistency judge for tool descriptions.",
                        user_text=prompt_sem_p,
                        temperature=0.0,
                        max_tokens=256,
                        rate_limit_sleep_sec=rate_limit_sleep_sec,
                        max_retries=max_retries,
                        audit=audit,
                        call_tag=f"semantic_judge_pullback::{tool_name}",
                    )
                    line_p = normalize_single_line(raw_p)
                    if line_p.upper() == "PASS":
                        sim_ok3, sim_m3, sim_r3 = similarity_gate_pass(mode, orig_desc, pull_norm)
                        if sim_ok3:
                            final = pull_norm
                            status = "similarity_recovered_pullback"
                            all_errs = []
                            recovered = True
                            audit.inc("similarity_gate_recovered_rule_based", 1)
                            audit.log_event(
                                {
                                    "ts": now_unix(),
                                    "event": "similarity_gate_recovered",
                                    "mode": mode,
                                    "tool_name": tool_name,
                                    "recovery": "pullback",
                                    "metrics": sim_m3,
                                    "reason": sim_r3,
                                }
                            )

            if not recovered and ENABLE_RULE_BASED_RECOVERY:
                rb = rule_based_paraphrase_operator(
                    mode=mode,
                    original_description=orig_desc,
                    top_level_param_names=top_level_param_names,
                )
                ok_rb, rb_norm, rb_errs = _validate_raw_then_normalized(rb)
                if ok_rb:
                    prompt_sem_rb = build_semantic_judge_prompt(
                        tool_name=tool_name,
                        original_description=orig_desc,
                        candidate_description=rb_norm,
                        params_context=param_context,
                        mode=mode,
                    )
                    raw_rb = llm_call_chat_completions(
                        client=client,
                        model=judge_model,
                        system_text="System role: Semantic consistency judge for tool descriptions.",
                        user_text=prompt_sem_rb,
                        temperature=0.0,
                        max_tokens=256,
                        rate_limit_sleep_sec=rate_limit_sleep_sec,
                        max_retries=max_retries,
                        audit=audit,
                        call_tag=f"semantic_judge_rule_based::{tool_name}",
                    )
                    line_rb = normalize_single_line(raw_rb)
                    if line_rb.upper() == "PASS":
                        sim_ok4, sim_m4, sim_r4 = similarity_gate_pass(mode, orig_desc, rb_norm)
                        if sim_ok4:
                            final = rb_norm
                            status = "similarity_recovered_rule_based"
                            all_errs = []
                            recovered = True
                            audit.inc("similarity_gate_recovered_rule_based", 1)
                            audit.log_event(
                                {
                                    "ts": now_unix(),
                                    "event": "similarity_gate_recovered",
                                    "mode": mode,
                                    "tool_name": tool_name,
                                    "recovery": "rule_based",
                                    "metrics": sim_m4,
                                    "reason": sim_r4,
                                }
                            )

            if not recovered:
                final = make_validator_safe_fallback(
                    mode=mode,
                    original_description=orig_desc,
                    top_level_param_names=top_level_param_names,
                )
                status = "similarity_failed_fallback_original"
                all_errs = ["similarity_gate_failed"]
                audit.inc("similarity_gate_fallback_original", 1)

    # FINAL FALLBACK (hard fail or semantic fail)
    if (not ok) or (not semantic_ok):
        audit.inc("llm_rejected", 1)

        fallback = make_validator_safe_fallback(
            mode=mode,
            original_description=orig_desc,
            top_level_param_names=top_level_param_names,
        )
        final = fallback

        ok_fb, fb_errs = validate_description(
            mode,
            final,
            top_level_param_names=top_level_param_names,
            original_description=orig_desc,
        )
        if not ok_fb:
            all_errs = (all_errs or []) + [f"fallback_still_invalid:{e}" for e in fb_errs[:10]]

        status = "semantic_failed_fallback_original" if ok and (not semantic_ok) else (
            "judged_failed_fallback_original" if enable_judge else "rejected_fallback_original"
        )

    cache.put(
        key_hash=cache_key,
        pipeline_version=PIPELINE_VERSION,
        generation_model=generation_model,
        judge_model=judge_model,
        mode=mode,
        tool_name=tool_name,
        tool_signature=tool_sig,
        prompt_hash=prompt_hash,
        prompt_text=prompt,
        raw_output=raw_initial if raw_initial is not None else "",
        final_output=final,
        status=status,
        validation_errors=all_errs,
    )

    return final, status, all_errs, prompt_hash




# ================== TOOL STRING PATCHING (INNER TOOL JSON STRING) ==================


def infer_tool_name_path(tool_obj: Dict[str, Any]) -> Sequence[str]:
    if "function" in tool_obj and isinstance(tool_obj["function"], dict):
        return ("function", "name")
    return ("name",)


def patch_tool_name_in_tool_object(
    *,
    tool_obj: Dict[str, Any],
    new_name: str,
) -> Tuple[Dict[str, Any], bool, str]:
    try:
        name_path = infer_tool_name_path(tool_obj)
        if len(name_path) == 2:
            fn = tool_obj.get("function")
            if isinstance(fn, dict):
                fn["name"] = new_name
                return tool_obj, True, "patched_obj:function.name"
            return tool_obj, False, "missing_function_dict"
        else:
            tool_obj["name"] = new_name
            return tool_obj, True, "patched_obj:name"
    except Exception as e:
        return tool_obj, False, f"patch_name_obj_exception:{type(e).__name__}"


def patch_tool_name_in_tool_json_string(
    *,
    tool_json_str: str,
    tool_obj: Dict[str, Any],
    new_name: str,
) -> Tuple[str, bool, str]:
    """
    Minimal-edit name patch: modifica solo lo span della stringa name dentro il JSON tool.
    """
    name_path = infer_tool_name_path(tool_obj)
    return patch_json_string_field_in_place(tool_json_str, name_path, new_name)


def patch_tool_description_in_tool_object(
    *,
    tool_obj: Dict[str, Any],
    new_desc: str,
    desc_path: Sequence[str],
) -> Tuple[Dict[str, Any], bool, str]:
    """
    Patch the tool description directly on a tool object (dict), supporting both:
      - {"function": {"description": "..."}}
      - {"description": "..."}
    Returns (patched_obj, did_patch, reason).
    """
    try:
        if len(desc_path) == 2 and desc_path[0] == "function" and desc_path[1] == "description":
            fn = tool_obj.get("function")
            if isinstance(fn, dict):
                old = fn.get("description", None)
                fn["description"] = new_desc
                return tool_obj, True, "patched_obj:function.description"
            return tool_obj, False, "missing_function_dict"
        elif len(desc_path) == 1 and desc_path[0] == "description":
            old = tool_obj.get("description", None)
            tool_obj["description"] = new_desc
            return tool_obj, True, "patched_obj:description"
        else:
            return tool_obj, False, "unsupported_desc_path"
    except Exception as e:
        return tool_obj, False, f"patch_obj_exception:{type(e).__name__}"


def parse_tool_json_string(tool_json_str: str) -> Optional[Dict[str, Any]]:
    try:
        obj = json.loads(tool_json_str)
    except Exception:
        return None
    return obj if isinstance(obj, dict) else None


def patch_tool_description_in_object_node_text_preserving_format(
    *,
    outer_line: str,
    tool_node: "JsonNode",
    new_desc: str,
    desc_path: Sequence[str],
) -> Tuple[str, bool, str]:
    """
    In-place patch for tools entry of kind OBJECT, preserving formatting (no json.dumps fallback).

    Extended behavior:
    - If description exists but is non-string (null/number/etc): replace its span with a JSON string literal.
    - If description is missing: insert `"description":"..."` into the target object span:
        - function object for ("function","description") if available and is an object
        - otherwise root tool object
    """

    def _replace_span(obj_text: str, rel_start: int, rel_end: int, replacement: str) -> Tuple[str, bool, str]:
        if rel_start < 0 or rel_end < 0 or rel_start > len(obj_text) or rel_end > len(obj_text) or rel_start >= rel_end:
            return "", False, "span_out_of_bounds"
        return obj_text[:rel_start] + replacement + obj_text[rel_end:], True, "replaced_span"

    def _insert_field_into_object_text(obj_text: str, obj_rel_start: int, obj_rel_end: int, key: str, value: str) -> Tuple[str, bool, str]:
        if obj_rel_start < 0 or obj_rel_end > len(obj_text) or obj_rel_end <= obj_rel_start:
            return "", False, "object_span_out_of_bounds"
        if obj_text[obj_rel_start] != "{" or obj_text[obj_rel_end - 1] != "}":
            return "", False, "target_not_object_text"

        close_brace = obj_rel_end - 1
        j = close_brace - 1
        while j > obj_rel_start and obj_text[j] in " \t\r\n":
            j -= 1

        pair = '"' + json_escape_string(key) + '":"' + json_escape_string(value) + '"'
        if obj_text[j] == "{":
            insert_text = pair
        else:
            insert_text = "," + pair

        patched = obj_text[:close_brace] + insert_text + obj_text[close_brace:]
        return patched, True, "inserted_field"

    try:
        if tool_node.kind != "object":
            return "", False, "tool_node_not_object"
        if not tool_node.obj:
            return "", False, "tool_object_no_children"

        obj_text = outer_line[tool_node.start : tool_node.end]
        replacement_value = '"' + json_escape_string(new_desc) + '"'

        # Case A: function.description
        if len(desc_path) == 2 and desc_path[0] == "function" and desc_path[1] == "description":
            fn_node = tool_node.obj.get("function")
            if fn_node is not None and fn_node.kind == "object":
                # If description exists under function: replace span regardless of kind
                if fn_node.obj and "description" in fn_node.obj:
                    target = fn_node.obj["description"]
                    rel_start = target.start - tool_node.start
                    rel_end = target.end - tool_node.start
                    patched_obj_text, ok, reason = _replace_span(obj_text, rel_start, rel_end, replacement_value)
                    if ok:
                        return patched_obj_text, True, "patched_function_description_any_kind"
                    return "", False, reason

                # Insert description into function object span
                fn_rel_start = fn_node.start - tool_node.start
                fn_rel_end = fn_node.end - tool_node.start
                patched_obj_text, ok, reason = _insert_field_into_object_text(obj_text, fn_rel_start, fn_rel_end, "description", new_desc)
                if ok:
                    return patched_obj_text, True, "inserted_function_description_missing"
                # If insertion fails, fall through to root insertion
            else:
                # function missing or not object -> we cannot insert under function reliably; fall back to root insertion
                pass

            # Root insertion fallback (still in-place)
            root_rel_start = 0
            root_rel_end = len(obj_text)
            patched_obj_text, ok, reason = _insert_field_into_object_text(obj_text, root_rel_start, root_rel_end, "description", new_desc)
            if ok:
                return patched_obj_text, True, "inserted_root_description_fallback_from_function_path"
            return "", False, f"function_path_insert_failed:{reason}"

        # Case B: root description
        if len(desc_path) == 1 and desc_path[0] == "description":
            target = find_node_by_key_path(tool_node, desc_path)
            if target is not None:
                # Replace span regardless of kind (string/null/number/etc)
                rel_start = target.start - tool_node.start
                rel_end = target.end - tool_node.start
                patched_obj_text, ok, reason = _replace_span(obj_text, rel_start, rel_end, replacement_value)
                if ok:
                    return patched_obj_text, True, "patched_root_description_any_kind"
                return "", False, reason

            # Missing -> insert into root object span
            patched_obj_text, ok, reason = _insert_field_into_object_text(obj_text, 0, len(obj_text), "description", new_desc)
            if ok:
                return patched_obj_text, True, "inserted_root_description_missing"
            return "", False, reason

        return "", False, "unsupported_desc_path"

    except Exception as e:
        return "", False, f"patch_object_text_exception:{type(e).__name__}"




def deterministic_tool_serialize(tool_obj: Dict[str, Any]) -> str:
    """
    Fallback serializer: keep it compact but DO NOT reorder keys.
    Reordering (sort_keys=True) explodes diffs even when semantics are unchanged.
    """
    return json.dumps(tool_obj, ensure_ascii=False, sort_keys=False, separators=(",", ":"))


def patch_tool_description_in_tool_json_string(
    *,
    tool_json_str: str,
    new_desc: str,
    desc_path: Sequence[str],
) -> Tuple[str, bool, str]:
    return patch_json_string_field_in_place(tool_json_str, desc_path, new_desc)


def patch_outer_jsonl_line_tools(
    *,
    line: str,
    client: OpenAI,
    generation_model: str,
    judge_model: str,
    enable_judge: bool,
    mode: str,
    cache: DescCache,
    audit: AuditLogger,
    temperature: float,
    max_tokens: int,
    rate_limit_sleep_sec: float,
    max_retries: int,
    repair_max_rounds: int,
    judge_max_rounds: int,
) -> Tuple[str, Dict[str, Any]]:
    stats: Dict[str, Any] = {
        "tools_in_line": 0,
        "tools_patched": 0,
        "tools_unchanged": 0,
        "tools_fallback_reserialized": 0,
        "tools_parse_failed": 0,
        "tools_object_in_line": 0,
        "outer_parse_failed": False,
    }

    try:
        root = JsonSpanParser(line).parse()
    except Exception as e:
        stats["outer_parse_failed"] = True
        audit.log_event(
            {
                "ts": now_unix(),
                "event": "outer_json_parse_failed",
                "mode": mode,
                "error_type": type(e).__name__,
                "error_str": str(e)[:300],
            }
        )
        return line, stats

    if root.kind != "object" or not root.obj or "tools" not in root.obj:
        return line, stats

    tools_node = root.obj["tools"]
    if tools_node.kind != "array" or not tools_node.arr:
        return line, stats

    patches: List[Tuple[int, int, str]] = []

    for idx, el in enumerate(tools_node.arr):
        stats["tools_in_line"] += 1
        audit.inc("tools_seen", 1)

        # -------------------------
        # Case 1: tools entry is a JSON string containing tool JSON
        # -------------------------
        if el.kind == "string":
            audit.inc("tools_entry_kind_string", 1)

            tool_json_str = el.value
            tool_obj = parse_tool_json_string(tool_json_str)
            if tool_obj is None:
                stats["tools_parse_failed"] += 1
                audit.inc("tools_parse_failed", 1)
                audit.log_event(
                    {
                        "ts": now_unix(),
                        "event": "tool_string_parse_failed",
                        "mode": mode,
                        "tool_index_in_line": idx,
                    }
                )
                stats["tools_unchanged"] += 1
                audit.inc("tools_unchanged", 1)
                continue

            tool_name, orig_desc, _, _, _, all_names, desc_path = extract_tool_core(tool_obj)
            top_level_param_names = all_names

            new_desc, status, val_errs, prompt_hash = generate_description_for_tool(
                client=client,
                generation_model=generation_model,
                judge_model=judge_model,
                enable_judge=enable_judge,
                mode=mode,
                tool_json_obj=tool_obj,
                cache=cache,
                audit=audit,
                temperature=temperature,
                max_tokens=max_tokens,
                rate_limit_sleep_sec=rate_limit_sleep_sec,
                max_retries=max_retries,
                repair_max_rounds=repair_max_rounds,
                judge_max_rounds=judge_max_rounds,
            )

            audit.log_manipulation_row(
                _build_manip_row(
                    mode=mode,
                    tool_name=tool_name,
                    orig_desc=orig_desc,
                    new_desc=new_desc,
                    status=status,
                    validation_errors=val_errs,
                    top_level_param_names=top_level_param_names,
                )
            )

            patched_inner, did_patch, reason = patch_tool_description_in_tool_json_string(
                tool_json_str=tool_json_str,
                new_desc=new_desc,
                desc_path=desc_path,
            )

            if not did_patch:
                patched_inner2, did_patch2, reason2 = patch_or_insert_tool_description_in_inner_json_text(
                    tool_json_str=tool_json_str,
                    new_desc=new_desc,
                    desc_path=desc_path,
                )

                if did_patch2:
                    patched_inner = patched_inner2
                    audit.log_fallback(
                        {
                            "ts": now_unix(),
                            "event": "inplace_patch_failed_fallback_inner_text_patch",
                            "mode": mode,
                            "tool_name": tool_name,
                            "reason": f"{reason} -> {reason2}",
                            "generation_status": status,
                            "validation_errors": val_errs[:50],
                            "prompt_hash": prompt_hash,
                        }
                    )
                else:
                    stats["tools_fallback_reserialized"] += 1
                    audit.inc("tools_fallback_reserialized", 1)

                    if len(desc_path) == 2 and desc_path[0] == "function":
                        if isinstance(tool_obj.get("function"), dict):
                            tool_obj["function"]["description"] = new_desc
                    else:
                        tool_obj["description"] = new_desc

                    patched_inner = deterministic_tool_serialize(tool_obj)

                    audit.log_fallback(
                        {
                            "ts": now_unix(),
                            "event": "inplace_patch_failed_fallback_reserialize",
                            "mode": mode,
                            "tool_name": tool_name,
                            "reason": f"{reason} -> inner_text_patch_failed:{reason2}",
                            "generation_status": status,
                            "validation_errors": val_errs[:50],
                            "prompt_hash": prompt_hash,
                        }
                    )

            if normalize_single_line(orig_desc) == normalize_single_line(new_desc) and did_patch:
                stats["tools_unchanged"] += 1
                audit.inc("tools_unchanged", 1)
            else:
                stats["tools_patched"] += 1
                audit.inc("tools_patched", 1)

            replacement_outer = canonical_outer_string_literal_from_decoded(patched_inner)
            audit.inc("outer_tools_string_literals_reescaped_canonically", 1)
            patches.append((el.start, el.end, replacement_outer))

            audit.log_event(
                {
                    "ts": now_unix(),
                    "event": "tool_processed",
                    "mode": mode,
                    "tool_index_in_line": idx,
                    "tool_name": tool_name,
                    "tool_entry_kind": "string",
                    "generation_model": generation_model,
                    "judge_model": judge_model,
                    "enable_judge": bool(enable_judge),
                    "generation_status": status,
                    "inplace_patch": did_patch,
                    "inplace_patch_reason": reason,
                    "outer_tools_replacement_kind": "string_literal",
                    "original_desc_preview": normalize_single_line(orig_desc)[:300],
                    "new_desc_preview": normalize_single_line(new_desc)[:300],
                    "validation_errors": val_errs[:50],
                }
            )
            continue

        # -------------------------
        # Case 2: tools entry is an object (dict-style tool schema)
        # -------------------------
        if el.kind == "object":
            stats["tools_object_in_line"] += 1
            audit.inc("tools_entry_kind_object", 1)

            tool_obj = node_to_python(el)
            if not isinstance(tool_obj, dict):
                stats["tools_unchanged"] += 1
                audit.inc("tools_unchanged", 1)
                audit.log_event(
                    {
                        "ts": now_unix(),
                        "event": "tool_object_not_dict_skipped",
                        "mode": mode,
                        "tool_index_in_line": idx,
                    }
                )
                continue

            tool_name, orig_desc, _, _, _, all_names, desc_path = extract_tool_core(tool_obj)
            top_level_param_names = all_names

            new_desc, status, val_errs, prompt_hash = generate_description_for_tool(
                client=client,
                generation_model=generation_model,
                judge_model=judge_model,
                enable_judge=enable_judge,
                mode=mode,
                tool_json_obj=tool_obj,
                cache=cache,
                audit=audit,
                temperature=temperature,
                max_tokens=max_tokens,
                rate_limit_sleep_sec=rate_limit_sleep_sec,
                max_retries=max_retries,
                repair_max_rounds=repair_max_rounds,
                judge_max_rounds=judge_max_rounds,
            )

            audit.log_manipulation_row(
                _build_manip_row(
                    mode=mode,
                    tool_name=tool_name,
                    orig_desc=orig_desc,
                    new_desc=new_desc,
                    status=status,
                    validation_errors=val_errs,
                    top_level_param_names=top_level_param_names,
                )
            )

            patched_obj_text, did_text_patch, text_patch_reason = patch_tool_description_in_object_node_text_preserving_format(
                outer_line=line,
                tool_node=el,
                new_desc=new_desc,
                desc_path=desc_path,
            )

            if did_text_patch:
                # IMPORTANT: replacement is JSON object text, not a JSON string literal.
                inner = patched_obj_text
            else:
                stats["tools_fallback_reserialized"] += 1
                audit.inc("tools_fallback_reserialized", 1)

                patched_obj, did_patch, reason = patch_tool_description_in_tool_object(
                    tool_obj=tool_obj,
                    new_desc=new_desc,
                    desc_path=desc_path,
                )

                inner = json.dumps(patched_obj, ensure_ascii=False, sort_keys=False, separators=(",", ":"))

                audit.log_fallback(
                    {
                        "ts": now_unix(),
                        "event": "object_text_inplace_patch_failed_fallback_reserialize",
                        "mode": mode,
                        "tool_name": tool_name,
                        "reason": text_patch_reason,
                        "generation_status": status,
                        "validation_errors": val_errs[:50],
                        "prompt_hash": prompt_hash,
                    }
                )

            if normalize_single_line(orig_desc) == normalize_single_line(new_desc):
                stats["tools_unchanged"] += 1
                audit.inc("tools_unchanged", 1)
            else:
                stats["tools_patched"] += 1
                audit.inc("tools_patched", 1)

            # IMPORTANT: Keep object as object in outer JSON (no string conversion).
            replacement_outer = inner
            patches.append((el.start, el.end, replacement_outer))

            audit.log_event(
                {
                    "ts": now_unix(),
                    "event": "tool_processed",
                    "mode": mode,
                    "tool_index_in_line": idx,
                    "tool_name": tool_name,
                    "tool_entry_kind": "object",
                    "generation_model": generation_model,
                    "judge_model": judge_model,
                    "enable_judge": bool(enable_judge),
                    "generation_status": status,
                    "inplace_patch": bool(did_text_patch),
                    "inplace_patch_reason": text_patch_reason,
                    "outer_tools_replacement_kind": "object",
                    "original_desc_preview": normalize_single_line(orig_desc)[:300],
                    "new_desc_preview": normalize_single_line(new_desc)[:300],
                    "validation_errors": val_errs[:50],
                }
            )
            continue

        # -------------------------
        # Case 3: unsupported tools entry kind
        # -------------------------
        stats["tools_unchanged"] += 1
        audit.inc("tools_unchanged", 1)
        audit.log_event(
            {
                "ts": now_unix(),
                "event": "tool_entry_kind_unsupported_skipped",
                "mode": mode,
                "tool_index_in_line": idx,
                "kind": el.kind,
            }
        )

    if not patches:
        return line, stats

    patches.sort(key=lambda x: x[0], reverse=True)
    out = line
    for start, end, repl in patches:
        out = out[:start] + repl + out[end:]

    return out, stats


# ================== JSONL VARIANT GENERATION ==================

def build_variant_jsonl_with_llm(
    *,
    client: OpenAI,
    generation_model: str,
    judge_model: str,
    enable_judge: bool,
    input_path: str,
    output_path: str,
    mode: str,
    output_dir: str,
    temperature: float,
    max_tokens: int,
    rate_limit_sleep_sec: float,
    max_retries: int,
    repair_max_rounds: int,
    judge_max_rounds: int,
    seed: int,
    overwrite: bool,
) -> None:
    if mode not in MODES:
        raise ValueError(f"Unknown mode in build_variant_jsonl_with_llm: {mode}")

    random.seed(seed)

    ensure_dir(output_dir)
    audit_dir = os.path.join(
        output_dir,
        AUDIT_DIR_NAME,
        f"{sanitize_model_name_for_path(generation_model)}__{sanitize_model_name_for_path(judge_model)}__judge-{int(bool(enable_judge))}__{mode}",
    )
    ensure_dir(audit_dir)

    cache_db = os.path.join(output_dir, CACHE_DB_NAME)
    cache = DescCache(cache_db)
    audit = AuditLogger(audit_dir)

    if os.path.exists(output_path) and not overwrite:
        raise FileExistsError("Output exists and overwrite=False.")

    with open(input_path, "r", encoding="utf-8") as fin, open(output_path, "w", encoding="utf-8") as fout:
        for line in fin:
            if not line.strip():
                continue
            audit.inc("lines_seen", 1)

            line_clean = line.rstrip("\r\n")
            patched_line, stats = patch_outer_jsonl_line_tools(
                line=line_clean,
                client=client,
                generation_model=generation_model,
                judge_model=judge_model,
                enable_judge=enable_judge,
                mode=mode,
                cache=cache,
                audit=audit,
                temperature=temperature,
                max_tokens=max_tokens,
                rate_limit_sleep_sec=rate_limit_sleep_sec,
                max_retries=max_retries,
                repair_max_rounds=repair_max_rounds,
                judge_max_rounds=judge_max_rounds,
            )


            fout.write(patched_line + "\n")
            audit.inc("lines_written", 1)

            audit.log_event(
                {
                    "ts": now_unix(),
                    "event": "line_processed",
                    "mode": mode,
                    "outer_parse_failed": bool(stats.get("outer_parse_failed", False)),
                    "tools_in_line": stats.get("tools_in_line", 0),
                    "tools_patched_in_line": stats.get("tools_patched", 0),
                    "tools_fallback_reserialized_in_line": stats.get("tools_fallback_reserialized", 0),
                }
            )

    audit.flush_summary(
        extra={
            "mode": mode,
            "generation_model": generation_model,
            "judge_model": judge_model,
            "enable_judge": bool(enable_judge),
            "temperature": temperature,
            "max_tokens": max_tokens,
            "rate_limit_sleep_sec": rate_limit_sleep_sec,
            "max_retries": max_retries,
            "repair_max_rounds": repair_max_rounds,
            "judge_max_rounds": judge_max_rounds,
            "seed": seed,
        }
    )
    audit.flush_manipulation_summaries()

    cache.close()
    print(f"[{mode}] wrote output JSONL")
    print(f"[{mode}] audit dir: {audit_dir}")


# ================== MULTI-VARIANT ENTRYPOINT ==================

def run_when2call_variants(
    *,
    client: OpenAI,
    generation_model: str,
    judge_model: Optional[str] = None,
    enable_judge: bool = False,
    input_jsonl: str,
    output_dir: str,
    modes: Optional[List[str]] = None,
    overwrite: bool = False,
    temperature: float = DEFAULT_TEMPERATURE,
    max_tokens: int = DEFAULT_MAX_TOKENS,
    rate_limit_sleep_sec: float = DEFAULT_RATE_LIMIT_SLEEP_SEC,
    max_retries: int = DEFAULT_MAX_RETRIES,
    repair_max_rounds: int = DEFAULT_REPAIR_MAX_ROUNDS,
    judge_max_rounds: int = DEFAULT_JUDGE_MAX_ROUNDS,
    seed: int = DEFAULT_RANDOM_SEED,
) -> None:
    if not os.path.exists(input_jsonl):
        raise FileNotFoundError("Input file not found.")

    ensure_dir(output_dir)

    active_modes = modes if modes is not None else list(MODES)
    unknown = [m for m in active_modes if m not in MODES]
    if unknown:
        raise ValueError(f"Unknown modes requested: {unknown}. Allowed: {MODES}")

    jm = judge_model if judge_model is not None else generation_model

    print(f"Pipeline: {PIPELINE_VERSION}")
    print(f"Generator model: {generation_model}")
    print(f"Judge model: {jm} | enable_judge={bool(enable_judge)}")
    print(f"Modes: {active_modes}")
    print(
        f"Temperature: {temperature} | max_tokens: {max_tokens} | retries: {max_retries} | "
        f"repair_rounds: {repair_max_rounds} | judge_rounds: {judge_max_rounds}"
    )

    base_name = os.path.splitext(os.path.basename(input_jsonl))[0]
    model_tag = sanitize_model_name_for_path(generation_model)
    judge_tag = sanitize_model_name_for_path(jm)

    for mode in active_modes:
        out_name = f"{base_name}__gen-{model_tag}__judge-{judge_tag}__judgeon-{int(bool(enable_judge))}__mode-{mode}.jsonl"
        out_path = os.path.join(output_dir, out_name)

        if os.path.exists(out_path) and not overwrite:
            print(f"[{mode}] skipped (exists, overwrite=False)")
            continue

        build_variant_jsonl_with_llm(
            client=client,
            generation_model=generation_model,
            judge_model=jm,
            enable_judge=enable_judge,
            input_path=input_jsonl,
            output_path=out_path,
            mode=mode,
            output_dir=output_dir,
            temperature=temperature,
            max_tokens=max_tokens,
            rate_limit_sleep_sec=rate_limit_sleep_sec,
            max_retries=max_retries,
            repair_max_rounds=repair_max_rounds,
            judge_max_rounds=judge_max_rounds,
            seed=seed,
            overwrite=overwrite,
        )


In [7]:
#!/usr/bin/env python3
import json
from pathlib import Path
from typing import Dict, Iterator, Optional, Set, Tuple

INPUT_JSONL = "When2Call/data/test/when2call_test_llm_judge.jsonl"


def iter_tools_from_when2call(
    jsonl_path: str,
    *,
    unique_by_name: bool = True,
) -> Iterator[Tuple[str, str, Dict]]:
    """
    Itera sui tool presenti nel campo 'orig_tools' di un file JSONL When2Call.

    Yields:
        (name, description, tool_dict)

    Note:
        - 'orig_tools' nel dataset è tipicamente una lista di stringhe JSON.
        - Se unique_by_name=True, deduplica per 'name' (stampa ogni tool una sola volta).
    """
    seen: Set[str] = set()
    path = Path(jsonl_path)

    with path.open("r", encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue

            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                # Riga corrotta/non JSON: la saltiamo
                continue

            orig_tools = record.get("tools") or []
            if not isinstance(orig_tools, list):
                continue

            for tool_entry in orig_tools:
                if isinstance(tool_entry, str):
                    try:
                        tool = json.loads(tool_entry)
                    except json.JSONDecodeError:
                        continue
                elif isinstance(tool_entry, dict):
                    tool = tool_entry
                else:
                    continue

                name = (tool.get("name") or "").strip()
                desc = (tool.get("description") or "").strip()

                if not name:
                    continue

                if unique_by_name:
                    if name in seen:
                        continue
                    seen.add(name)

                yield name, desc, tool


def interactive_pager(it: Iterator[Tuple[str, str, Dict]]) -> None:
    """
    Consuma l'iteratore in modo interattivo: Invio = prossimo, 'q' = stop.
    """
    for idx, (name, desc, _tool) in enumerate(it, start=1):
        print(f"\n[{idx}] {name}\n    {desc}")
        cmd = input("Invio=prossimo, q=esci > ").strip().lower()
        if cmd == "q":
            break


if __name__ == "__main__":
    tools_iter = iter_tools_from_when2call(INPUT_JSONL, unique_by_name=True)

    # Modalità 1 (consigliata): paginazione interattiva (ti fermi con 'q')
    interactive_pager(tools_iter)

    # ----
    # Modalità 2: uso “manuale” dell’iteratore con next() e stop quando vuoi
    #
    # tools_iter = iter_tools_from_when2call(INPUT_JSONL, unique_by_name=True)
    # while True:
    #     try:
    #         name, desc, tool = next(tools_iter)
    #     except StopIteration:
    #         print("Fine tools.")
    #         break
    #     print(f"{name}: {desc}")
    #     # qui puoi decidere tu quando fermarti
    #     if some_condition:
    #         break



[1] api_token_api.APITokenApi.get_api_tokens
    Retrieve a list of API tokens associated with the user's account.

[2] api_token_api.APITokenApi.post_api_token
    Generate a new API token to authenticate and authorize subsequent API calls.

[3] Buses_3_FindBus
    Search for a bus itinerary between two cities on a specific date.

[4] Buses_3_BuyBusTicket
    Purchase bus tickets for a specified route, date, and time. Options for the number of passengers and additional luggage are available.

[5] Events_3_BuyEventTickets
    Facilitates the purchase of tickets for a cultural event on a specific date in a designated city.

[6] detail_adriel_project
    Retrieve the detailed information of the project that Adriel was working on, including the project's current status and expected completion date.

[7] adriel_detail_experience_and_education
    Retrieve the detailed information regarding Adriel's professional experiences and educational background.

[8] adriel_experiences_and_education


In [None]:
#!/usr/bin/env python3  28 dicembre
import json
import shutil
import os
import time
import hashlib
from pathlib import Path
from typing import Dict, Any, Optional, Tuple, List

from openai import OpenAI


# ========= Config =========
GEMINI_BASE_URL = "https://generativelanguage.googleapis.com/v1beta/openai/"
LLM_MODEL = "gemini-2.5-flash"
MODE_KEY = "style_verbose"

HASH_HEX_LEN = 32

DEFAULT_MAX_TOKENS = 512
RETRY_ON_LENGTH = True
RETRY_MAX_TOKENS = 1024

DEFAULT_ALLOW_RESERIALIZE_FALLBACK = False

REGEN_DIVERSITY_INSTRUCTION = (
    "Return a meaning-equivalent rewrite that is lexically different from your previous rewrite; "
    "avoid repeating the same sentence structure."
)

# How much of the rejected previous rewrite to store in audit (for resume) and to feed back into prompt.
MAX_PREV_REWRITE_CHARS = 800


# ========= Client =========
def make_gemini_client() -> OpenAI:
    token = os.environ.get("TOKEN_GEMINI")
    if not token:
        raise RuntimeError("TOKEN_GEMINI environment variable is not set.")
    return OpenAI(api_key=token, base_url=GEMINI_BASE_URL)


# ========= Small utils =========
def _json_safe(obj: Any) -> Any:
    if obj is None or isinstance(obj, (str, int, float, bool)):
        return obj
    if isinstance(obj, dict):
        return {str(k): _json_safe(v) for k, v in obj.items()}
    if isinstance(obj, (list, tuple)):
        return [_json_safe(x) for x in obj]
    if hasattr(obj, "model_dump") and callable(getattr(obj, "model_dump")):
        try:
            return _json_safe(obj.model_dump())
        except Exception:
            pass
    if hasattr(obj, "dict") and callable(getattr(obj, "dict")):
        try:
            return _json_safe(obj.dict())
        except Exception:
            pass
    if hasattr(obj, "__dict__"):
        try:
            return _json_safe(vars(obj))
        except Exception:
            pass
    try:
        return str(obj)
    except Exception:
        return None


def _sha256_text(s: str) -> str:
    return hashlib.sha256((s or "").encode("utf-8")).hexdigest()


def _canonical_json(obj: Any) -> str:
    return json.dumps(obj, ensure_ascii=False, sort_keys=True, separators=(",", ":"))


def _sha256_file(path: Path) -> str:
    h = hashlib.sha256()
    with path.open("rb") as f:
        for chunk in iter(lambda: f.read(1024 * 1024), b""):
            h.update(chunk)
    return h.hexdigest()


# ========= Raw JSON-string patcher (for tools stored as JSON strings) =========
def _extract_json_string_value(raw_json: str, key: str) -> Optional[str]:
    token = f'"{key}"'
    i = raw_json.find(token)
    if i < 0:
        return None
    i = raw_json.find(":", i + len(token))
    if i < 0:
        return None
    i += 1
    n = len(raw_json)
    while i < n and raw_json[i] in " \t\r\n":
        i += 1
    if i >= n or raw_json[i] != '"':
        return None
    start = i
    i += 1
    esc = False
    while i < n:
        c = raw_json[i]
        if esc:
            esc = False
        else:
            if c == "\\":
                esc = True
            elif c == '"':
                return raw_json[start : i + 1]
        i += 1
    return None


def _decode_raw_json_string(raw_json_string_with_quotes: str) -> str:
    try:
        obj = json.loads('{"description":' + raw_json_string_with_quotes + "}")
        return obj.get("description") or ""
    except json.JSONDecodeError:
        return ""


def _get_description_for_print(entry: Any) -> Tuple[str, str]:
    if isinstance(entry, str):
        raw = _extract_json_string_value(entry, "description")
        if raw is not None:
            return raw, "raw_json"
        try:
            obj = json.loads(entry)
            return obj.get("description") or "", "rendered"
        except json.JSONDecodeError:
            return "", "rendered"
    if isinstance(entry, dict):
        return entry.get("description") or "", "rendered"
    return "", "rendered"


def _load_tool(entry: Any) -> Tuple[Optional[Dict[str, Any]], str]:
    if isinstance(entry, str):
        try:
            return json.loads(entry), "json_str"
        except json.JSONDecodeError:
            return None, "other"
    if isinstance(entry, dict):
        return entry, "dict"
    return None, "other"


def _skip_ws(s: str, i: int) -> int:
    n = len(s)
    while i < n and s[i] in " \t\r\n":
        i += 1
    return i


def _scan_string_span(s: str, i: int) -> Optional[Tuple[int, int]]:
    n = len(s)
    if i >= n or s[i] != '"':
        return None
    j = i + 1
    esc = False
    while j < n:
        c = s[j]
        if esc:
            esc = False
        else:
            if c == "\\":
                esc = True
            elif c == '"':
                return (i, j + 1)
        j += 1
    return None


def _scan_number_span(s: str, i: int) -> Optional[Tuple[int, int]]:
    n = len(s)
    j = i
    if j < n and s[j] == "-":
        j += 1
    if j >= n:
        return None
    if s[j] == "0":
        j += 1
    elif s[j].isdigit():
        while j < n and s[j].isdigit():
            j += 1
    else:
        return None
    if j < n and s[j] == ".":
        j += 1
        if j >= n or not s[j].isdigit():
            return None
        while j < n and s[j].isdigit():
            j += 1
    if j < n and s[j] in "eE":
        j += 1
        if j < n and s[j] in "+-":
            j += 1
        if j >= n or not s[j].isdigit():
            return None
        while j < n and s[j].isdigit():
            j += 1
    return (i, j)


def _scan_literal_span(s: str, i: int) -> Optional[Tuple[int, int]]:
    for lit in ("true", "false", "null"):
        if s.startswith(lit, i):
            return (i, i + len(lit))
    return None


def _scan_container_span(s: str, i: int) -> Optional[Tuple[int, int]]:
    n = len(s)
    if i >= n:
        return None

    opener = s[i]
    if opener not in "{[":
        return None

    stack: List[str] = ["}" if opener == "{" else "]"]
    j = i + 1
    in_str = False
    esc = False

    while j < n:
        c = s[j]

        if in_str:
            if esc:
                esc = False
            else:
                if c == "\\":
                    esc = True
                elif c == '"':
                    in_str = False
            j += 1
            continue

        if c == '"':
            in_str = True
            j += 1
            continue

        if c == "{":
            stack.append("}")
            j += 1
            continue
        if c == "[":
            stack.append("]")
            j += 1
            continue

        if c in "}]":
            if not stack:
                return None
            expected = stack[-1]
            if c != expected:
                return None
            stack.pop()
            j += 1
            if not stack:
                return (i, j)
            continue

        j += 1

    return None


def _is_value_delim(c: str) -> bool:
    return c in ",}]"


def _scan_value_span(s: str, i: int) -> Optional[Tuple[int, int]]:
    n = len(s)
    i = _skip_ws(s, i)
    if i >= n:
        return None

    c = s[i]
    if c == '"':
        return _scan_string_span(s, i)
    if c in "{[":
        return _scan_container_span(s, i)

    span: Optional[Tuple[int, int]]
    if c == "-" or c.isdigit():
        span = _scan_number_span(s, i)
    else:
        span = _scan_literal_span(s, i)

    if not span:
        return None

    _, end = span
    k = _skip_ws(s, end)
    if k >= n:
        return span
    if _is_value_delim(s[k]):
        return span
    return None


def _replace_top_level_string_field_in_raw_object(raw_json_obj: str, key: str, new_value: str) -> Tuple[str, bool, str]:
    s = raw_json_obj
    n = len(s)

    i = _skip_ws(s, 0)
    if i >= n or s[i] != "{":
        return raw_json_obj, False, "not_object"

    i += 1
    found_any_key = False
    expect_key = True

    while True:
        i = _skip_ws(s, i)
        if i >= n:
            return raw_json_obj, False, "cannot_scan_value"

        if expect_key:
            if s[i] == "}":
                return raw_json_obj, False, "key_not_found"
            if s[i] != '"':
                return raw_json_obj, False, "invalid_key_string"

            key_span = _scan_string_span(s, i)
            if not key_span:
                return raw_json_obj, False, "invalid_key_string"

            found_any_key = True
            k_start, k_end = key_span
            try:
                key_decoded = json.loads(s[k_start:k_end])
            except Exception:
                return raw_json_obj, False, "invalid_key_string"

            i = _skip_ws(s, k_end)
            if i >= n or s[i] != ":":
                return raw_json_obj, False, "missing_colon"

            v_span = _scan_value_span(s, i + 1)
            if not v_span:
                return raw_json_obj, False, "cannot_scan_value"

            v_start, v_end = v_span

            if key_decoded == key:
                if v_start >= n or s[v_start] != '"':
                    return raw_json_obj, False, "value_not_string"

                replacement_literal = json.dumps(new_value, ensure_ascii=False)
                patched = s[:v_start] + replacement_literal + s[v_end:]

                try:
                    obj = json.loads(patched)
                except Exception:
                    return raw_json_obj, False, "json_load_failed_after_patch"

                if isinstance(obj, dict) and obj.get(key) == new_value:
                    return patched, True, "ok"
                return raw_json_obj, False, "validation_failed_after_patch"

            i = v_end
            expect_key = False
            continue

        i = _skip_ws(s, i)
        if i >= n:
            return raw_json_obj, False, "cannot_scan_value"

        if s[i] == ",":
            i += 1
            expect_key = True
            continue
        if s[i] == "}":
            return raw_json_obj, False, ("key_not_found" if found_any_key else "key_not_found")
        return raw_json_obj, False, "cannot_scan_value"


# ========= IDs =========
def _tool_fingerprint_excluding_description(tool_obj: Dict[str, Any]) -> str:
    filtered = {k: v for k, v in tool_obj.items() if k != "description"}
    payload = _canonical_json(filtered)
    return hashlib.sha256(payload.encode("utf-8")).hexdigest()[:HASH_HEX_LEN]


def _record_id(record_obj: Dict[str, Any], tool_field: str) -> str:
    rec = dict(record_obj)
    tools = rec.get(tool_field)
    if isinstance(tools, list):
        canon_tools: List[Any] = []
        for entry in tools:
            tool_obj, kind = _load_tool(entry)
            if tool_obj is None:
                canon_tools.append({"_unparsed": entry, "_kind": kind})
            else:
                canon_tools.append({k: v for k, v in tool_obj.items() if k != "description"})
        rec[tool_field] = canon_tools
    payload = _canonical_json(rec)
    return hashlib.sha256(payload.encode("utf-8")).hexdigest()[:HASH_HEX_LEN]


def _tool_instance_key(record_id: str, tool_index: int, tool_obj: Dict[str, Any]) -> str:
    fp = _tool_fingerprint_excluding_description(tool_obj)
    return f"rec:{record_id}:t{tool_index}:{fp}"


# ========= Audit (single file, resumable) =========
def _audit_identity(dataset_path: Path, *, mode_key: str, model: str, tool_field: str) -> str:
    stable = f"{dataset_path.resolve()}|{mode_key}|{model}|{tool_field}"
    return hashlib.sha256(stable.encode("utf-8")).hexdigest()[:12]


def _audit_file_path(
    dataset_path: Path,
    *,
    audit_dir: Path,
    mode_key: str,
    model: str,
    tool_field: str,
) -> Path:
    audit_key = _audit_identity(dataset_path, mode_key=mode_key, model=model, tool_field=tool_field)
    safe_model = "".join(ch if ch.isalnum() or ch in ("-", "_", ".") else "_" for ch in model)
    out_dir = audit_dir / audit_key
    filename = f"{dataset_path.stem}.{audit_key}.{mode_key}.{safe_model}.audit.jsonl"
    return out_dir / filename


def _append_audit_event(audit_file: Path, event: Dict[str, Any]) -> None:
    audit_file.parent.mkdir(parents=True, exist_ok=True)
    safe_event = _json_safe(event)
    with audit_file.open("a", encoding="utf-8") as f:
        f.write(json.dumps(safe_event, ensure_ascii=False) + "\n")


def _load_resume_state(
    audit_file: Path,
) -> Tuple[
    Dict[str, Tuple[str, Optional[str], Optional[Dict[str, Any]]]],
    Dict[str, int],
    Dict[str, Optional[str]],
    Optional[Dict[str, Any]],
]:
    """
    Returns:
      - decisions_by_instance: instance_key -> (status, final_description, llm_bundle)
      - regen_counts: instance_key -> max regen_index observed
      - last_rejected_text: instance_key -> last rejected proposal text (from regenerate events)
      - prior_run_start: first run_start event (if any)
    """
    decisions: Dict[str, Tuple[str, Optional[str], Optional[Dict[str, Any]]]] = {}
    regen_counts: Dict[str, int] = {}
    last_rejected_text: Dict[str, Optional[str]] = {}
    prior_run_start: Optional[Dict[str, Any]] = None

    if not audit_file.exists():
        return decisions, regen_counts, last_rejected_text, None

    # Track per-instance best regen index so we keep the latest text
    best_ri: Dict[str, int] = {}

    with audit_file.open("r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                ev = json.loads(line)
            except Exception:
                continue
            if not isinstance(ev, dict):
                continue

            et = ev.get("event_type")
            if et == "run_start" and prior_run_start is None:
                prior_run_start = ev

            if et == "regenerate":
                ik = ev.get("instance_key")
                ri = ev.get("regen_index")
                txt = ev.get("last_proposal_text")
                if isinstance(ik, str) and isinstance(ri, int) and ri >= 0:
                    prev = regen_counts.get(ik, 0)
                    if ri > prev:
                        regen_counts[ik] = ri
                    prev_best = best_ri.get(ik, -1)
                    if ri >= prev_best:
                        best_ri[ik] = ri
                        last_rejected_text[ik] = txt if isinstance(txt, str) else None

            if et == "decision":
                ik = ev.get("instance_key")
                status = ev.get("status")
                final_desc = ev.get("final_description")
                llm_bundle = ev.get("llm_bundle")
                if isinstance(ik, str) and isinstance(status, str):
                    decisions[ik] = (
                        status,
                        final_desc if isinstance(final_desc, str) else None,
                        llm_bundle if isinstance(llm_bundle, dict) else None,
                    )

    return decisions, regen_counts, last_rejected_text, prior_run_start


# ========= LLM helpers =========
def _ends_like_complete_sentence(text: str) -> bool:
    t = (text or "").strip()
    return bool(t) and t.endswith((".", "!", "?", "”", '"', "’", "'"))


def _sanitize_llm_output(text: str) -> str:
    t = (text or "").strip()
    if t.startswith("{") and "description" in t:
        try:
            obj = json.loads(t)
            if isinstance(obj, dict) and isinstance(obj.get("description"), str):
                t = obj["description"].strip()
        except Exception:
            pass
    if (t.startswith('"') and t.endswith('"')) or (t.startswith("'") and t.endswith("'")):
        t = t[1:-1].strip()
    return t


def _llm_chat_completion(
    *,
    client: OpenAI,
    model: str,
    messages: List[Dict[str, str]],
    temperature: float,
    max_tokens: int,
    seed: Optional[int],
) -> Tuple[str, Dict[str, Any]]:
    meta: Dict[str, Any] = {
        "seed_requested": seed,
        "seed_applied": False,
        "seed_error": None,
        "finish_reason": None,
        "usage": None,
        "max_tokens_requested": int(max_tokens),
        "max_param_used": None,
    }

    base_kwargs: Dict[str, Any] = dict(model=model, messages=messages, temperature=temperature)

    def attempt(max_param_used: str, include_seed: bool) -> Tuple[str, Dict[str, Any]]:
        req = dict(base_kwargs)
        if max_param_used == "max_completion_tokens":
            req["max_completion_tokens"] = int(max_tokens)
        else:
            req["max_tokens"] = int(max_tokens)
        if include_seed and seed is not None:
            req["seed"] = int(seed)

        resp = client.chat.completions.create(**req)
        text = (resp.choices[0].message.content or "").strip()

        meta_local = dict(meta)
        meta_local["max_param_used"] = max_param_used
        meta_local["finish_reason"] = getattr(resp.choices[0], "finish_reason", None)
        meta_local["usage"] = getattr(resp, "usage", None)
        meta_local["seed_applied"] = bool(include_seed and seed is not None)
        return text, meta_local

    def is_seed_error(e: Exception) -> bool:
        s = str(e).lower()
        return ("seed" in s) and ("unknown" in s or "unsupported" in s or "invalid" in s)

    try:
        return attempt("max_completion_tokens", include_seed=True)
    except Exception as e1:
        if seed is not None and is_seed_error(e1):
            meta["seed_error"] = str(e1)
            try:
                return attempt("max_completion_tokens", include_seed=False)
            except Exception:
                pass
        try:
            return attempt("max_tokens", include_seed=True)
        except Exception as e2:
            if seed is not None and is_seed_error(e2):
                meta["seed_error"] = str(e2)
                return attempt("max_tokens", include_seed=False)
            raise


def generate_verbose_description_via_llm(
    *,
    client: OpenAI,
    tool_name: str,
    base_description: str,
    model: str,
    seed: Optional[int],
    max_tokens: int,
    retry_on_length: bool,
    retry_max_tokens: int,
    regen_index: int = 0,
    previous_rewrite: Optional[str] = None,
) -> Tuple[str, Dict[str, Any]]:
    system = (
        "Rewrite tool descriptions.\n"
        "Hard constraints:\n"
        "- Preserve meaning exactly; do not add new capabilities, steps, motivations, benefits, or context.\n"
        "- Do not delete information present in the original description.\n"
        "- Do not introduce new parameter names, IDs, field names, flags, or implementation details.\n"
        "- If parameter/field names/IDs/flags already appear in the original description, keep them (do not remove them).\n"
        "- Do not add examples, normative language, or assumptions.\n"
        "- Keep the same subject (the tool) and the same scope.\n"
        "- Output only the rewritten description text, nothing else.\n"
        "- Style: verbose but controlled; keep it concise and complete (1–2 sentences), clear and direct.\n"
    )

    user_parts: List[str] = []
    user_parts.append(f"Tool name: {tool_name}")
    user_parts.append("Base description:")
    user_parts.append(base_description.strip() or "(empty)")
    user_parts.append("")
    user_parts.append("Rewrite in 'style_verbose' under the constraints.")

    if regen_index > 0:
        user_parts.append("")
        user_parts.append(f"Regeneration request: {regen_index}")
        user_parts.append(REGEN_DIVERSITY_INSTRUCTION)
        if previous_rewrite and previous_rewrite.strip():
            prev = previous_rewrite.strip()
            if len(prev) > MAX_PREV_REWRITE_CHARS:
                prev = prev[:MAX_PREV_REWRITE_CHARS].rstrip()
            user_parts.append("")
            user_parts.append("Previous rewrite (do not reuse wording):")
            user_parts.append(prev)

    user = "\n".join(user_parts)

    raw1, meta1 = _llm_chat_completion(
        client=client,
        model=model,
        messages=[{"role": "system", "content": system}, {"role": "user", "content": user}],
        temperature=0.0,
        max_tokens=max_tokens,
        seed=seed,
    )
    san1 = _sanitize_llm_output(raw1)
    finish1 = (meta1.get("finish_reason") or "").lower()
    looks_truncated_1 = (finish1 == "length") or (san1 and not _ends_like_complete_sentence(san1))

    if not looks_truncated_1:
        return san1, {
            "proposal_origin": "primary",
            "proposal_sanitized_final": san1,
            "llm_text_raw_primary": raw1,
            "llm_text_raw_retry": None,
            "primary": meta1,
            "retry": None,
        }

    raw2 = None
    meta2 = None
    san2 = None
    best_san = san1
    origin = "primary"

    if retry_on_length and retry_max_tokens > max_tokens:
        raw2, meta2 = _llm_chat_completion(
            client=client,
            model=model,
            messages=[{"role": "system", "content": system}, {"role": "user", "content": user}],
            temperature=0.0,
            max_tokens=int(retry_max_tokens),
            seed=seed,
        )
        san2 = _sanitize_llm_output(raw2)
        finish2 = (meta2.get("finish_reason") or "").lower()
        looks_truncated_2 = (finish2 == "length") or (san2 and not _ends_like_complete_sentence(san2))

        if san2 and len(san2) >= len(best_san):
            best_san = san2
            origin = "retry"

        if not looks_truncated_2 and san2:
            return san2, {
                "proposal_origin": "retry",
                "proposal_sanitized_final": san2,
                "llm_text_raw_primary": raw1,
                "llm_text_raw_retry": raw2,
                "primary": meta1,
                "retry": meta2,
            }

    return best_san, {
        "proposal_origin": origin,
        "proposal_sanitized_final": best_san,
        "llm_text_raw_primary": raw1,
        "llm_text_raw_retry": raw2,
        "primary": meta1,
        "retry": meta2,
    }


# ========= IO =========
def make_working_copy(input_jsonl: str, output_jsonl: str, *, overwrite: bool = False) -> str:
    src = Path(input_jsonl)
    dst = Path(output_jsonl)

    if not src.exists():
        raise FileNotFoundError(f"File not found: {src}")

    if dst.exists() and not overwrite:
        return str(dst)

    dst.parent.mkdir(parents=True, exist_ok=True)
    shutil.copy2(src, dst)
    return str(dst)


def _normalize_cmd(raw: str) -> str:
    c = (raw or "").strip().lower()
    if c in ("", "y", "yes", "ok", "okay", "si", "sì"):
        return "y"
    if c in ("r", "retry", "again", "prova", "prova ancora", "rigenera"):
        return "r"
    if c in ("e", "edit", "modifica"):
        return "e"
    if c in ("m", "manual", "mine", "mio", "mia", "custom"):
        return "m"
    if c in ("s", "skip", "salta", "pass"):
        return "s"
    if c in ("q", "quit", "exit", "esci"):
        return "q"
    return c


# ========= Main interactive =========
def interactive_llm_verbose_tools_in_jsonl(
    jsonl_path: str,
    *,
    tool_field: str,
    create_backup_of_target: bool,
    llm_model: str,
    seed: Optional[int],
    max_tokens: int,
    retry_on_length: bool,
    retry_max_tokens: int,
    allow_reserialize_fallback: bool,
    min_sleep_sec_between_calls: float,
    audit_dir: str,
) -> None:
    path = Path(jsonl_path)
    if not path.exists():
        raise FileNotFoundError(f"File not found: {jsonl_path}")

    client = make_gemini_client()
    audit_file = _audit_file_path(
        path,
        audit_dir=Path(audit_dir),
        mode_key=MODE_KEY,
        model=llm_model,
        tool_field=tool_field,
    )

    decisions_by_instance, regen_counts, last_rejected_text_by_instance, prior_run_start = _load_resume_state(audit_file)

    tool_order: List[Dict[str, Any]] = []
    with path.open("r", encoding="utf-8") as f:
        for raw_line in f:
            line = raw_line.rstrip("\n")
            if not line.strip():
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                continue
            if not isinstance(record, dict):
                continue

            rid = _record_id(record, tool_field=tool_field)
            tools = record.get(tool_field)
            if not isinstance(tools, list):
                continue

            for tool_index, entry in enumerate(tools):
                tool_obj, kind = _load_tool(entry)
                if not tool_obj:
                    continue
                name = (tool_obj.get("name") or "").strip()
                if not name:
                    continue

                desc_print, desc_mode = _get_description_for_print(entry)
                instance_key = _tool_instance_key(rid, tool_index, tool_obj)

                tool_order.append(
                    {
                        "record_id": rid,
                        "tool_index": tool_index,
                        "tool_name": name,
                        "desc_print": desc_print,
                        "desc_mode": desc_mode,
                        "instance_key": instance_key,
                        "entry_kind": kind,
                    }
                )

    n_total = len(tool_order)
    n_prev_reviewed = len(decisions_by_instance)

    start_pos = 0
    while start_pos < n_total and tool_order[start_pos]["instance_key"] in decisions_by_instance:
        start_pos += 1

    session_id = hashlib.sha256(f"{time.time_ns()}".encode("utf-8")).hexdigest()[:12]
    before_sha = _sha256_file(path)

    if prior_run_start is None:
        _append_audit_event(
            audit_file,
            {
                "event_type": "run_start",
                "ts": int(time.time()),
                "session_id": session_id,
                "mode": MODE_KEY,
                "model": llm_model,
                "seed": seed,
                "dataset_path": str(path),
                "dataset_sha256_at_session_start": before_sha,
                "tool_field": tool_field,
                "n_total_occurrences": n_total,
                "max_tokens_requested": int(max_tokens),
                "retry_on_length": bool(retry_on_length),
                "retry_max_tokens": int(retry_max_tokens),
                "allow_reserialize_fallback": bool(allow_reserialize_fallback),
            },
        )
    else:
        _append_audit_event(
            audit_file,
            {
                "event_type": "run_resume",
                "ts": int(time.time()),
                "session_id": session_id,
                "mode": MODE_KEY,
                "model": llm_model,
                "seed": seed,
                "dataset_path": str(path),
                "dataset_sha256_at_session_start": before_sha,
                "tool_field": tool_field,
                "n_total_occurrences": n_total,
                "n_previously_reviewed": n_prev_reviewed,
                "resume_from_index_1based": (start_pos + 1) if start_pos < n_total else (n_total + 1),
            },
        )

    print(f"Target: {path}")
    print(f"Audit file (RESUMABLE): {audit_file}")
    print(f"Tool occurrences total: {n_total}")
    if start_pos < n_total:
        print(f"Resume position: [{start_pos + 1}/{n_total}] (previously reviewed: {n_prev_reviewed})")
    else:
        print(f"Resume position: completed (previously reviewed: {n_prev_reviewed})")
    print(f"LLM: {llm_model} @ {GEMINI_BASE_URL}")
    print(f"Max tokens: {int(max_tokens)}; retry_on_length={bool(retry_on_length)}; retry_max_tokens={int(retry_max_tokens)}")
    print("Commands: ENTER/ok=accept, r=regenerate, e=edit, m=manual, s=skip, q=quit\n")

    quit_requested = False
    resume_next_index_1based: Optional[int] = None

    for pos in range(start_pos, n_total):
        item = tool_order[pos]
        idx = pos + 1

        name = item["tool_name"]
        desc_mode = item["desc_mode"]
        old_desc_print = item["desc_print"]
        instance_key = item["instance_key"]
        rid = item["record_id"]
        tool_i = item["tool_index"]

        # Per-instance regen state (resumable for regen_index; previous rejected text is best-effort).
        regen_index_local = int(regen_counts.get(instance_key, 0))
        previous_rewrite_local: Optional[str] = last_rejected_text_by_instance.get(instance_key)

        print("=" * 80)
        print(f"[{idx}/{n_total}] {name}")
        print(f"instance_key: {instance_key} (record_id={rid}, tool_index={tool_i})")

        if desc_mode == "raw_json":
            print("Current description RAW (escaped):")
            print(old_desc_print if old_desc_print else "(empty)")
            base_desc = _decode_raw_json_string(old_desc_print) if old_desc_print else ""
            print("\nCurrent description DECODED:")
            print(base_desc if base_desc else "(empty)")
        else:
            base_desc = old_desc_print or ""
            print("Current description:")
            print(base_desc if base_desc else "(empty)")

        proposal = ""
        llm_bundle: Optional[Dict[str, Any]] = None

        while True:
            if not proposal:
                try:
                    proposal, llm_bundle = generate_verbose_description_via_llm(
                        client=client,
                        tool_name=name,
                        base_description=base_desc,
                        model=llm_model,
                        seed=seed,
                        max_tokens=max_tokens,
                        retry_on_length=retry_on_length,
                        retry_max_tokens=retry_max_tokens,
                        regen_index=regen_index_local,
                        previous_rewrite=previous_rewrite_local,
                    )
                except Exception as e:
                    print(f"\nLLM ERROR: {e}")
                    cmd = _normalize_cmd(input("Choice [m=manual, e=edit, s=skip, q=quit] > "))
                    now = int(time.time())

                    if cmd == "q":
                        quit_requested = True
                        resume_next_index_1based = idx
                        break

                    if cmd == "s":
                        decisions_by_instance[instance_key] = ("skipped", None, None)
                        _append_audit_event(
                            audit_file,
                            {
                                "event_type": "decision",
                                "ts": now,
                                "session_id": session_id,
                                "status": "skipped",
                                "tool_name": name,
                                "instance_key": instance_key,
                                "record_id": rid,
                                "tool_index": tool_i,
                                "model": llm_model,
                                "seed": seed,
                                "base_description": base_desc,
                                "final_description": None,
                                "source": "user",
                                "note": "skip_after_llm_error",
                            },
                        )
                        break

                    if cmd in ("m", "e"):
                        manual_final = input("Manual final description (empty cancels) > ").rstrip("\n").strip()
                        status = "manual" if (cmd == "m" and manual_final) else ("edited" if (cmd == "e" and manual_final) else "skipped")
                        decisions_by_instance[instance_key] = (status, manual_final or None, None)
                        _append_audit_event(
                            audit_file,
                            {
                                "event_type": "decision",
                                "ts": now,
                                "session_id": session_id,
                                "status": status,
                                "tool_name": name,
                                "instance_key": instance_key,
                                "record_id": rid,
                                "tool_index": tool_i,
                                "model": llm_model,
                                "seed": seed,
                                "base_description": base_desc,
                                "final_description": manual_final or None,
                                "source": "user",
                                "note": "manual_or_edit_after_llm_error",
                            },
                        )
                        break

                    proposal = ""
                    continue

                proposal = (proposal or "").strip()

            print("\nLLM proposal:")
            print(proposal if proposal else "(empty)")

            if llm_bundle:
                try:
                    origin = llm_bundle.get("proposal_origin")
                    p = llm_bundle.get("primary")
                    r = llm_bundle.get("retry")
                    print(f"\nproposal_origin={origin}")
                    if p:
                        print(f"meta: finish_reason={p.get('finish_reason')}, max_param_used={p.get('max_param_used')}, usage={p.get('usage')}")
                    if r:
                        print(f"meta(retry): finish_reason={r.get('finish_reason')}, max_param_used={r.get('max_param_used')}, usage={r.get('usage')}")
                except Exception:
                    pass

            cmd = _normalize_cmd(input("\nChoice [ENTER=accept, r=regen, e=edit, m=manual, s=skip, q=quit] > "))
            now = int(time.time())

            if cmd == "y":
                if proposal.strip():
                    decisions_by_instance[instance_key] = ("accepted", proposal.strip(), llm_bundle)
                    _append_audit_event(
                        audit_file,
                        {
                            "event_type": "decision",
                            "ts": now,
                            "session_id": session_id,
                            "status": "accepted",
                            "tool_name": name,
                            "instance_key": instance_key,
                            "record_id": rid,
                            "tool_index": tool_i,
                            "model": llm_model,
                            "seed": seed,
                            "base_description": base_desc,
                            "final_description": proposal.strip(),
                            "source": "llm",
                            "llm_bundle": llm_bundle,
                        },
                    )
                else:
                    decisions_by_instance[instance_key] = ("skipped", None, llm_bundle)
                    _append_audit_event(
                        audit_file,
                        {
                            "event_type": "decision",
                            "ts": now,
                            "session_id": session_id,
                            "status": "skipped",
                            "tool_name": name,
                            "instance_key": instance_key,
                            "record_id": rid,
                            "tool_index": tool_i,
                            "model": llm_model,
                            "seed": seed,
                            "base_description": base_desc,
                            "final_description": None,
                            "source": "llm",
                            "note": "empty_proposal",
                            "llm_bundle": llm_bundle,
                        },
                    )
                break

            if cmd == "r":
                # Store the rejected output and feed it back as "previous rewrite" for the next generation.
                previous_rewrite_local = proposal.strip() if proposal else None
                if previous_rewrite_local and len(previous_rewrite_local) > MAX_PREV_REWRITE_CHARS:
                    previous_rewrite_local = previous_rewrite_local[:MAX_PREV_REWRITE_CHARS].rstrip()

                regen_counts[instance_key] = regen_counts.get(instance_key, 0) + 1
                regen_index_local = int(regen_counts[instance_key])

                # Persist the rejected text for resume.
                last_rejected_text_by_instance[instance_key] = previous_rewrite_local

                _append_audit_event(
                    audit_file,
                    {
                        "event_type": "regenerate",
                        "ts": now,
                        "session_id": session_id,
                        "tool_name": name,
                        "instance_key": instance_key,
                        "record_id": rid,
                        "tool_index": tool_i,
                        "regen_index": regen_index_local,
                        "last_proposal_sha256": _sha256_text(proposal),
                        "last_proposal_text": previous_rewrite_local,
                        "last_proposal_origin": (llm_bundle or {}).get("proposal_origin") if llm_bundle else None,
                    },
                )

                proposal = ""
                llm_bundle = None
                if min_sleep_sec_between_calls > 0:
                    time.sleep(min_sleep_sec_between_calls)
                continue

            if cmd == "e":
                edited = input("Edit proposal (empty cancels) > ").rstrip("\n").strip()
                status = "edited" if edited else "skipped"
                decisions_by_instance[instance_key] = (status, edited or None, llm_bundle)
                _append_audit_event(
                    audit_file,
                    {
                        "event_type": "decision",
                        "ts": now,
                        "session_id": session_id,
                        "status": status,
                        "tool_name": name,
                        "instance_key": instance_key,
                        "record_id": rid,
                        "tool_index": tool_i,
                        "model": llm_model,
                        "seed": seed,
                        "base_description": base_desc,
                        "final_description": edited or None,
                        "source": "user",
                        "note": "edit_proposal",
                        "llm_bundle": llm_bundle,
                    },
                )
                break

            if cmd == "m":
                manual_final = input("Manual final description (empty cancels) > ").rstrip("\n").strip()
                status = "manual" if manual_final else "skipped"
                decisions_by_instance[instance_key] = (status, manual_final or None, llm_bundle)
                _append_audit_event(
                    audit_file,
                    {
                        "event_type": "decision",
                        "ts": now,
                        "session_id": session_id,
                        "status": status,
                        "tool_name": name,
                        "instance_key": instance_key,
                        "record_id": rid,
                        "tool_index": tool_i,
                        "model": llm_model,
                        "seed": seed,
                        "base_description": base_desc,
                        "final_description": manual_final or None,
                        "source": "user",
                        "note": "manual_replace",
                        "llm_bundle": llm_bundle,
                    },
                )
                break

            if cmd == "s":
                decisions_by_instance[instance_key] = ("skipped", None, llm_bundle)
                _append_audit_event(
                    audit_file,
                    {
                        "event_type": "decision",
                        "ts": now,
                        "session_id": session_id,
                        "status": "skipped",
                        "tool_name": name,
                        "instance_key": instance_key,
                        "record_id": rid,
                        "tool_index": tool_i,
                        "model": llm_model,
                        "seed": seed,
                        "base_description": base_desc,
                        "final_description": None,
                        "source": "user",
                        "note": "skip",
                        "llm_bundle": llm_bundle,
                    },
                )
                break

            if cmd == "q":
                quit_requested = True
                resume_next_index_1based = idx
                break

            print("Invalid command.")

        if quit_requested:
            break

    # ========= Apply decisions to file =========
    tmp_path = path.with_suffix(path.suffix + ".tmp")
    updated_count = 0
    patch_failures = 0

    with path.open("r", encoding="utf-8") as fin, tmp_path.open("w", encoding="utf-8") as fout:
        for raw_line in fin:
            line = raw_line.rstrip("\n")
            if not line.strip():
                fout.write(line + "\n")
                continue

            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                fout.write(line + "\n")
                continue

            if not isinstance(record, dict):
                fout.write(json.dumps(record, ensure_ascii=False) + "\n")
                continue

            rid = _record_id(record, tool_field=tool_field)
            tools = record.get(tool_field)

            if isinstance(tools, list):
                new_tools: List[Any] = []
                for tool_index, entry in enumerate(tools):
                    tool_obj, kind = _load_tool(entry)
                    if not tool_obj:
                        new_tools.append(entry)
                        continue

                    instance_key = _tool_instance_key(rid, tool_index, tool_obj)
                    decision = decisions_by_instance.get(instance_key)

                    if decision is None:
                        new_tools.append(entry)
                        continue

                    status, new_desc, llm_bundle = decision
                    if status in ("accepted", "edited", "manual") and new_desc:
                        if kind == "json_str" and isinstance(entry, str):
                            # Skip patch if already correct
                            already_ok = False
                            try:
                                obj0 = json.loads(entry)
                                if isinstance(obj0, dict) and obj0.get("description") == new_desc:
                                    already_ok = True
                            except Exception:
                                already_ok = False

                            if already_ok:
                                new_tools.append(entry)
                                continue

                            patched, ok, reason = _replace_top_level_string_field_in_raw_object(entry, "description", new_desc)
                            if ok:
                                new_tools.append(patched)
                                updated_count += 1
                            else:
                                fallback_ok = False
                                fallback_patched = entry
                                if allow_reserialize_fallback:
                                    try:
                                        obj = json.loads(entry)
                                        if isinstance(obj, dict):
                                            obj["description"] = new_desc
                                            fallback_patched = json.dumps(obj, ensure_ascii=False)
                                            fallback_ok = True
                                    except Exception:
                                        fallback_ok = False

                                if fallback_ok:
                                    new_tools.append(fallback_patched)
                                    updated_count += 1
                                    _append_audit_event(
                                        audit_file,
                                        {
                                            "event_type": "patch_fallback_reserialize",
                                            "ts": int(time.time()),
                                            "session_id": session_id,
                                            "instance_key": instance_key,
                                            "record_id": rid,
                                            "tool_index": tool_index,
                                            "tool_name": tool_obj.get("name"),
                                            "entry_sha256_before": _sha256_text(entry),
                                            "entry_sha256_after": _sha256_text(fallback_patched),
                                            "patch_reason": reason,
                                        },
                                    )
                                else:
                                    new_tools.append(entry)
                                    patch_failures += 1
                                    _append_audit_event(
                                        audit_file,
                                        {
                                            "event_type": "patch_failure",
                                            "ts": int(time.time()),
                                            "session_id": session_id,
                                            "instance_key": instance_key,
                                            "record_id": rid,
                                            "tool_index": tool_index,
                                            "tool_name": tool_obj.get("name"),
                                            "status": status,
                                            "patch_reason": reason,
                                            "final_description_sha256": _sha256_text(new_desc),
                                            "entry_sha256": _sha256_text(entry),
                                            "entry_excerpt": entry[:240],
                                            "llm_text_raw_primary_sha256": _sha256_text((llm_bundle or {}).get("llm_text_raw_primary") or ""),
                                            "llm_text_raw_retry_sha256": _sha256_text((llm_bundle or {}).get("llm_text_raw_retry") or ""),
                                            "proposal_origin": (llm_bundle or {}).get("proposal_origin"),
                                        },
                                    )
                        else:
                            if tool_obj.get("description") == new_desc:
                                new_tools.append(tool_obj)
                                continue
                            tool_obj["description"] = new_desc
                            new_tools.append(tool_obj)
                            updated_count += 1
                    else:
                        new_tools.append(entry)

                record[tool_field] = new_tools

            fout.write(json.dumps(record, ensure_ascii=False) + "\n")

    if create_backup_of_target:
        bak_path = path.with_suffix(path.suffix + ".bak")
        if not bak_path.exists():
            shutil.copy2(path, bak_path)

    tmp_path.replace(path)
    after_sha = _sha256_file(path)

    n_reviewed = len(decisions_by_instance)
    n_skipped = sum(1 for st, _, _ in decisions_by_instance.values() if st == "skipped")
    completed = (n_reviewed >= n_total) and (not quit_requested)

    _append_audit_event(
        audit_file,
        {
            "event_type": "run_end",
            "ts": int(time.time()),
            "session_id": session_id,
            "mode": MODE_KEY,
            "model": llm_model,
            "seed": seed,
            "dataset_path": str(path),
            "dataset_sha256_at_session_start": before_sha,
            "dataset_sha256_at_session_end": after_sha,
            "n_total_occurrences": n_total,
            "n_reviewed_total": n_reviewed,
            "n_updated_this_session": updated_count,
            "n_skipped_total": n_skipped,
            "completed": bool(completed),
            "quit_requested": bool(quit_requested),
            "raw_patch_failures_this_session": patch_failures,
            "resume_next_index_1based": resume_next_index_1based if quit_requested else (n_total + 1 if completed else None),
        },
    )

    print("\nChanges applied.")
    print(f"Descriptions updated (this session): {updated_count}")
    if patch_failures:
        print(f"Raw JSON-string patch failures (left unchanged): {patch_failures}")
    print(f"Reviewed total (from audit): {n_reviewed} / {n_total}")
    print(f"Completed: {completed} (quit_requested={quit_requested})")
    if quit_requested and resume_next_index_1based is not None:
        print(f"Resume next time from: [{resume_next_index_1based}/{n_total}]")
    print(f"Updated file: {path}")
    print(f"Audit file (same on resume): {audit_file}")


if __name__ == "__main__":
    INPUT_JSONL = "When2Call/data/test/when2call_test_llm_judge.jsonl"
    OUTPUT_JSONL = "When2Call/data/test/when2call_test_llm_judge.WORKING_COPY.jsonl"

    working = make_working_copy(INPUT_JSONL, OUTPUT_JSONL, overwrite=False)
    print(f"Working copy: {working}")

    seed_env = os.environ.get("GEMINI_SEED")
    seed_val: Optional[int] = int(seed_env.strip()) if (seed_env and seed_env.strip()) else None

    max_tokens_env = os.environ.get("GEMINI_MAX_TOKENS")
    max_tokens_val = int(max_tokens_env.strip()) if (max_tokens_env and max_tokens_env.strip()) else DEFAULT_MAX_TOKENS

    retry_max_tokens_env = os.environ.get("GEMINI_RETRY_MAX_TOKENS")
    retry_max_tokens_val = int(retry_max_tokens_env.strip()) if (retry_max_tokens_env and retry_max_tokens_env.strip()) else RETRY_MAX_TOKENS

    allow_reserialize_env = os.environ.get("ALLOW_RESERIALIZE_FALLBACK")
    allow_reserialize_val = (
        bool(int(allow_reserialize_env.strip()))
        if (allow_reserialize_env and allow_reserialize_env.strip())
        else DEFAULT_ALLOW_RESERIALIZE_FALLBACK
    )

    interactive_llm_verbose_tools_in_jsonl(
        working,
        tool_field="tools",
        create_backup_of_target=False,
        llm_model=LLM_MODEL,
        seed=seed_val,
        max_tokens=max_tokens_val,
        retry_on_length=RETRY_ON_LENGTH,
        retry_max_tokens=retry_max_tokens_val,
        allow_reserialize_fallback=allow_reserialize_val,
        min_sleep_sec_between_calls=0.0,
        audit_dir="audit",
    )


Working copy: When2Call/data/test/when2call_test_llm_judge.WORKING_COPY.jsonl
Target: When2Call/data/test/when2call_test_llm_judge.WORKING_COPY.jsonl
Audit file (RESUMABLE): audit/e21d3604eba2/when2call_test_llm_judge.WORKING_COPY.e21d3604eba2.style_verbose.gemini-2.5-flash.audit.jsonl
Tool occurrences total: 978
Resume position: [4/978] (previously reviewed: 3)
LLM: gemini-2.5-flash @ https://generativelanguage.googleapis.com/v1beta/openai/
Max tokens: 512; retry_on_length=True; retry_max_tokens=1024
Commands: ENTER/ok=accept, r=regenerate, e=edit, m=manual, s=skip, q=quit

[4/978] Buses_3_BuyBusTicket
instance_key: rec:64cbc7e8819e45258b49e186164c9fad:t1:9a77a4750b12a8be645d3b59a745bc6f (record_id=64cbc7e8819e45258b49e186164c9fad, tool_index=1)
Current description RAW (escaped):
"Purchase bus tickets for a specified route, date, and time. Options for the number of passengers and additional luggage are available."

Current description DECODED:
Purchase bus tickets for a specified rout