In [24]:
!cat requirements.txt

aiohttp==3.12.13
boto3==1.38.39
botocore==1.38.39
sagemaker==2.247.0
litellm==1.72.2
strands-agents==0.1.6
strands-agents-builder==0.1.2
strands-agents-tools==0.1.4
matplotlib==3.10.3
pandas==2.3.0
seaborn==0.13.2
joblib==1.5.1
requests==2.32.4
uv==0.7.13

In [25]:
import warnings
# Blanket filter: hides every Python warning (all categories, all modules) from here on.
warnings.filterwarnings("ignore")

In [26]:
# Remove the AutoGluon packages, then install/upgrade everything pinned in requirements.txt.
# NOTE(review): the recorded output shows pip dependency-resolver conflicts (aiobotocore vs.
# botocore, sparkmagic) rather than plain warnings — confirm they are acceptable for this kernel.
%pip uninstall -q -y autogluon-multimodal autogluon-timeseries autogluon-features autogluon-common autogluon-core
%pip install -r requirements.txt -qU

[0mNote: you may need to restart the kernel to use updated packages.
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
aiobotocore 2.21.1 requires botocore<1.37.2,>=1.37.0, but you have botocore 1.38.39 which is incompatible.
sagemaker-studio-analytics-extension 0.2.0 requires sparkmagic==0.22.0, but you have sparkmagic 0.21.0 which is incompatible.[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.


In [27]:
# Extra packages imported by the Analyzer cell (boto3 for S3, python-docx and PyPDF2 for text extraction).
# NOTE(review): installed without version pins, unlike the entries in requirements.txt.
%pip install -q boto3 python-docx PyPDF2

Note: you may need to restart the kernel to use updated packages.


In [28]:
# Ask the running kernel to shut down and restart (do_shutdown(True) requests a restart),
# following pip's "you may need to restart the kernel" note above.
from IPython import get_ipython
get_ipython().kernel.do_shutdown(True)

{'status': 'ok', 'restart': True}

In [1]:
!pip install --upgrade boto3

Collecting boto3
  Using cached boto3-1.40.11-py3-none-any.whl.metadata (6.7 kB)
Collecting botocore<1.41.0,>=1.40.11 (from boto3)
  Using cached botocore-1.40.11-py3-none-any.whl.metadata (5.7 kB)
Using cached boto3-1.40.11-py3-none-any.whl (140 kB)
Using cached botocore-1.40.11-py3-none-any.whl (14.0 MB)
Installing collected packages: botocore, boto3
[2K  Attempting uninstall: botocore
[2K    Found existing installation: botocore 1.38.39
[2K    Uninstalling botocore-1.38.39:
[2K      Successfully uninstalled botocore-1.38.39
[2K  Attempting uninstall: boto3━━━━━━━━━━━━━━━━━━━[0m [32m0/2[0m [botocore]
[2K    Found existing installation: boto3 1.38.39m━━━━━━━━━━━━━━━━━━━[0m [32m1/2[0m [boto3]
[2K    Uninstalling boto3-1.38.39:0m╺[0m[90m━━━━━━━━━━━━━━━━━━━[0m [32m1/2[0m [boto3]
[2K      Successfully uninstalled boto3-1.38.39━━━━━━━━━━━━━━━━━━[0m [32m1/2[0m [boto3]
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [boto3]/2[0m [boto3]
[1A[2K[

In [17]:
#from utils.strands_sagemaker import SageMakerAIModel
from strands.models.bedrock import BedrockModel

In [18]:
# Import Required Libraries
import os
from strands import Agent, tool
from strands_tools import http_request 
import json, time, uuid, re, requests
import io
import mimetypes
from typing import Tuple, List, Optional

try:
    import boto3
    from botocore.exceptions import BotoCoreError, ClientError
except Exception:
    boto3 = None

try:
    from PyPDF2 import PdfReader
except Exception:
    PdfReader = None

try:
    import docx  # python-docx
except Exception:
    docx = None


In [29]:
# Bedrock runtime client created in the region of the default boto3 session.
bedrock = boto3.client("bedrock-runtime", region_name=boto3.Session().region_name)

def invoke_model(model_id, model_input):
    """
    Call a Bedrock model through the module-level ``bedrock`` client.

    Args:
        model_id: Value passed as ``modelId`` to ``bedrock.invoke_model``.
        model_input: JSON-serializable request payload; it is encoded with ``json.dumps``.

    Returns:
        The response body decoded from JSON.
    """
    reply = bedrock.invoke_model(modelId=model_id, body=json.dumps(model_input))
    return json.loads(reply["body"].read())

In [30]:
provider = "BEDROCK_Mistral"  # Change this to SAGEMAKER to use a deployed endpoint instead of Bedrock
provider_model_id = ""

match provider:
    case "BEDROCK_Mistral":
        # Using Mistral 7B Instruct from Bedrock
        full_model = BedrockModel(
            model_id="mistral.mistral-7b-instruct-v0:2",
            max_tokens=1024,
            temperature=0.8,
            top_k=50,
            top_p=0.95,
        )
        provider_model_id = "mistral.mistral-7b-instruct-v0:2"

    case "BEDROCK_Anthropic":
        # Using Claude 3.5 Sonnet from Bedrock
        full_model = BedrockModel(
            model_id="us.anthropic.claude-3-5-sonnet-20241022-v2:0",
            max_tokens=1024,
            temperature=0.8,
            top_k=50,
            top_p=0.95,
        )
        provider_model_id = "us.anthropic.claude-3-5-sonnet-20241022-v2:0"

    case "SAGEMAKER":
        model = SageMakerAIModel({
            "endpoint_name": SAGEMAKER_ENDPOINT_NAME,
            "max_tokens": 16*1024,
            "temperature": 0.1,
            "stream": False
		})

# Analyzer Agent

In [31]:
# ================== Analyzer (multi-source, multi-file, model-swappable) ==================
from strands import Agent, tool
import json, os, time, uuid, re, requests, io, mimetypes
from typing import List, Optional, Tuple

# --- optional deps for S3/PDF/DOCX ---
import boto3
from botocore.exceptions import BotoCoreError, ClientError
from PyPDF2 import PdfReader
from docx import Document

# ---- environment time ----
# Set the process time zone; the timestamps written by this cell come from time.localtime().
# time.tzset() is guarded with hasattr because it is not available on every platform.
os.environ["TZ"] = "America/New_York"
if hasattr(time, "tzset"):
    time.tzset()

# ---- defaults / paths ----
DEFAULT_SOURCE  = None                       # used if you call fetch_data() without data_source
ROW_DELIM       = "@"                        # row separator recognised by _format_rows_as_lines

DATA_LOG_FILE   = "analyzer_raw_s3_data_log.txt"   # fetch_data appends formatted S3/URL text here
OUTPUT_JSONL    = "analyzer_s3_outputs.jsonl"      # one JSON record per analyzer run
# NOTE(review): the Evaluator cell later reassigns OUTPUT_DIR_INDIVIDUAL to
# "analyzer_individual_outputs"; run_analyzer_batch reads this global when it is called, so
# re-running the analyzer after the Evaluator cell writes to that other directory.
OUTPUT_DIR_INDIVIDUAL = "analyzer_individual_mistral_outputs"  # per-patient per-model JSON files

# ===== helpers =====
def _format_rows_as_lines(raw_text: str) -> str:
    """
    Turn ROW_DELIM-separated rows into one row per line.

    The input is stripped first (None is treated as ""). When it contains
    ROW_DELIM, it is split on that delimiter, each piece is stripped, empty
    pieces are dropped and the rest are joined with newlines. Text without
    the delimiter (e.g., clinician notes) is returned stripped but otherwise
    unchanged.
    """
    cleaned = (raw_text or "").strip()
    if ROW_DELIM not in cleaned:
        return cleaned
    rows = (piece.strip() for piece in cleaned.split(ROW_DELIM))
    return "\n".join(row for row in rows if row)

def _save_formatted_to_file(formatted_text: str, log_path: str):
    os.makedirs(os.path.dirname(log_path) or ".", exist_ok=True)
    timestamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    with open(log_path, "a", encoding="utf-8") as f:
        f.write(f"\n=== Run at {timestamp} ===\n")
        f.write(formatted_text + "\n")

def _coerce_json(text: str):
    """
    Extract the first JSON object from an LLM response and parse it.
    """
    s = str(text).strip()
    if s.startswith("{") and s.endswith("}"):
        return json.loads(s)
    m = re.search(r"\{.*\}", s, flags=re.DOTALL)
    if not m:
        raise ValueError("No JSON object found in agent output.")
    return json.loads(m.group(0))

def _append_jsonl(path: str, obj: dict):
    os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
    with open(path, "a", encoding="utf-8") as f:
        f.write(json.dumps(obj, ensure_ascii=False) + "\n")

# ===== S3 helpers =====
def _parse_s3_uri(uri: str) -> Tuple[str, str]:
    # s3://bucket/key -> (bucket, key)
    assert uri.lower().startswith("s3://"), "Not an s3:// URI"
    without = uri[5:]
    parts = without.split("/", 1)
    bucket = parts[0]
    key = parts[1] if len(parts) > 1 else ""
    return bucket, key

def _read_s3_object(uri: str) -> bytes:
    """
    Download the object addressed by an ``s3://bucket/key`` URI and return its bytes.

    Raises:
        RuntimeError: when boto3 reports a BotoCoreError or ClientError for the read.
    """
    bucket_name, object_key = _parse_s3_uri(uri)
    client = boto3.client("s3")
    try:
        response = client.get_object(Bucket=bucket_name, Key=object_key)
        payload = response["Body"].read()
    except (BotoCoreError, ClientError) as exc:
        raise RuntimeError(f"S3 read failed for {uri}: {exc}")
    return payload

def _list_s3_uris(s3_prefix: str, extensions: Optional[List[str]] = None) -> List[str]:
    """
    Expand an s3 prefix (ending with '/'): s3://bucket/prefix/ -> [s3://bucket/prefix/file1, ...]
    Optionally filter by extensions ['.docx', '.pdf', '.txt'] (case-insensitive).

    Keys that end with "/" are skipped. All result pages are followed through
    the ``list_objects_v2`` paginator. A missing or empty ``extensions`` list
    disables filtering.
    """
    bucket, prefix = _parse_s3_uri(s3_prefix)
    # Normalise the filter once instead of rebuilding the lower-cased list for every object.
    allowed = {ext.lower() for ext in extensions} if extensions else None

    s3 = boto3.client("s3")
    paginator = s3.get_paginator("list_objects_v2")
    uris: List[str] = []
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        for obj in page.get("Contents", []):
            key = obj["Key"]
            if key.endswith("/"):
                continue
            if allowed is not None and os.path.splitext(key)[1].lower() not in allowed:
                continue
            uris.append(f"s3://{bucket}/{key}")
    return uris

def _ext_or_mime(uri: str, content_bytes: bytes) -> str:
    mime, _ = mimetypes.guess_type(uri)
    return mime or "application/octet-stream"

def _extract_text_from_bytes(uri: str, content: bytes) -> str:
    """
    Convert downloaded bytes to text, choosing the parser from the name/MIME type of ``uri``.

    - PDF: text of every page that extracts without error, joined by newlines
      (pages whose extraction raises are skipped).
    - DOCX: text of every non-empty paragraph, joined by newlines.
    - Anything else: decoded as UTF-8, falling back to latin-1.
    """
    mime = _ext_or_mime(uri, content)
    lowered = uri.lower()

    if lowered.endswith(".pdf") or mime == "application/pdf":
        page_texts = []
        for page in PdfReader(io.BytesIO(content)).pages:
            try:
                extracted = page.extract_text()
            except Exception:
                # Best effort: a page that cannot be extracted is left out.
                continue
            if extracted:
                page_texts.append(extracted.strip())
        return "\n".join(page_texts)

    docx_mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    if lowered.endswith(".docx") or mime == docx_mime:
        document = Document(io.BytesIO(content))
        return "\n".join([para.text for para in document.paragraphs if para.text])

    # Fallback: treat as UTF-8 text
    try:
        return content.decode("utf-8")
    except UnicodeDecodeError:
        return content.decode("latin-1", errors="ignore")

# ===== unified fetch tool (accepts data_source param) =====
@tool
def fetch_data(data_source: str | None = None) -> dict:
    """
    Fetch data from data_source (http/https URL, local file, or s3://bucket/key PDF/DOCX/TXT).
    If data_source is None, uses DEFAULT_SOURCE.
    Returns { raw_text, formatted_text, meta }.
    """
    # The docstring above is kept verbatim: the @tool decorator exposes it to the agent.
    source = data_source or DEFAULT_SOURCE

    def _failure(message: str, kind: str, shown_source) -> dict:
        # Error payload: same keys as a success plus "error", with empty text fields.
        return {"error": message, "raw_text": "", "formatted_text": "", "meta": {"source_type": kind, "data_source": shown_source}}

    def _success(raw: str, formatted: str, kind: str) -> dict:
        return {"raw_text": raw, "formatted_text": formatted, "meta": {"source_type": kind, "data_source": source}}

    if not source:
        return _failure("No data_source provided.", "unknown", str(source))

    is_text = isinstance(source, str)
    lowered = source.lower() if is_text else ""

    # S3
    if is_text and lowered.startswith("s3://"):
        try:
            blob = _read_s3_object(source)
            raw_text = _extract_text_from_bytes(source, blob)
        except Exception as exc:
            return _failure(f"S3 error: {exc}", "s3", source)
        formatted = _format_rows_as_lines(raw_text)
        _save_formatted_to_file(formatted, DATA_LOG_FILE)
        return _success(raw_text, formatted, "s3")

    # URL
    if is_text and lowered.startswith(("http://", "https://")):
        try:
            # NOTE(review): the URL is requested with an empty-body POST, not a GET —
            # confirm the target endpoint expects that.
            response = requests.post(source, data={}, timeout=60)
            response.raise_for_status()
            raw_text = response.text
        except Exception as exc:
            return _failure(f"HTTP error: {exc}", "url", source)
        formatted = _format_rows_as_lines(raw_text)
        _save_formatted_to_file(formatted, DATA_LOG_FILE)
        return _success(raw_text, formatted, "url")

    # Local file
    if is_text and os.path.exists(source):
        try:
            if lowered.endswith((".pdf", ".docx")):
                with open(source, "rb") as handle:
                    content = handle.read()
                raw_text = _extract_text_from_bytes(source, content)
            else:
                with open(source, "r", encoding="utf-8") as handle:
                    raw_text = handle.read()
        except Exception as exc:
            return _failure(f"File read error: {exc}", "file", source)
        # Unlike the S3 and URL branches, local files are not appended to DATA_LOG_FILE.
        return _success(raw_text, _format_rows_as_lines(raw_text), "file")

    # Unknown
    return _failure(f"Unsupported data_source: {source}", "unknown", str(source))

# ===== analyzer prompt (says to call fetch_data with the provided data_source) =====
# System prompt for the Analyzer agent. The worked example names the lab test as "HbA1c";
# the earlier spelling "h1ac" could be copied by the model into the goals it writes.
ANALYZER_PROMPT = """
You are an Analyzer Agent.

Tool available:
- fetch_data(data_source) -> {raw_text, formatted_text, meta}

You will receive an input JSON with a key "data_source".
INSTRUCTIONS:
1) Call fetch_data EXACTLY ONCE with the provided data_source.
2) Use "formatted_text" as your working input. It is newline-separated if the source used '@' row delimiters; otherwise it may be free text/paragraphs.
3) Perform the analysis according to the TASK below.
4) Produce output that matches the OUTPUT CONTRACT below EXACTLY (keys and structure). Output ONLY that JSON object and nothing else.
5) Do not call any other tools. Do not print anything except the final JSON. Do not retry fetch_data.

DEFAULT TASK:
- Derive from the provided content SMART goals that are specific, measurable, actionable, relevant and time-bound.
- Example: The SMART goal is to keep HbA1c below 6.0 in the next 3 months. Monitor fasting glucose daily for 90 days and calculate the average of the daily fasting glucose readings. This is a SMART goal because it is specific to HbA1c evaluation, it measures the daily glucose reading and it requires the action of monitoring the reading daily. The glucose reading is relevant to HbA1c because HbA1c is the 90 day average of the glucose reading. And it is time-bound because the measurement is carried out for 90 days.

DEFAULT OUTPUT CONTRACT:
{
  "smart_goals": [
    {
      "goal_number": "integer (starts at 1 and increments for each goal)",
      "description": "string (time-bound, measurable details)"
    }
  ]
}
"""

# ---------- helpers for filenames ----------
def _basename_no_ext(path_or_uri: str) -> str:
    """
    's3://bucket/path/patient1_summary.docx' -> 'patient1_summary'
    'patient2.pdf' -> 'patient2'
    'https://.../file.txt?x=y' -> 'file' (best effort)
    """
    s = path_or_uri.split("?", 1)[0]
    if s.lower().startswith("s3://"):
        _, key = s[5:].split("/", 1)
        base = os.path.basename(key)
    else:
        base = os.path.basename(s)
    name, _ext = os.path.splitext(base)
    return name or "unknown_source"

def _safe_fragment(s: str) -> str:
    """
    Make a safe filename fragment: replace non [A-Za-z0-9_-] with '_'.
    Also replace ':', '.', '/' commonly found in model ids.
    """
    s = s.replace(":", "_").replace("/", "_").replace(".", "_")
    return "".join(c if c.isalnum() or c in ("-", "_") else "_" for c in s)


# ---------- batch runner ----------
def _resolve_analyzer_model(model_name):
    """Return ``(agent_model, model_id)``: what to hand to Agent and a plain-string label for it."""
    if isinstance(model_name, str) and model_name:
        return model_name, model_name
    if model_name is None:
        # No model supplied: fall back to the id chosen in the provider cell.
        return provider_model_id, provider_model_id
    # Model object: read its configured id when it exposes get_config(), otherwise
    # fall back to the provider cell's id, then to the class name.
    get_config = getattr(model_name, "get_config", None)
    config = get_config() if callable(get_config) else None
    configured_id = config.get("model_id") if isinstance(config, dict) else None
    label = configured_id or provider_model_id or type(model_name).__name__
    return model_name, str(label)


def run_analyzer_batch(
    data_sources: list[str],
    model_name,                              # string alias/id OR model object
    output_jsonl: str = "analyzer_s3_outputs.jsonl",
    save_each: bool = True
):
    """
    Run the Analyzer agent once per data source and persist the results.

    For each source in data_sources:
      - Build an Analyzer agent for ``model_name`` (a model id string or a model object)
      - Send the agent {"data_source": source}; the prompt tells it to call fetch_data
      - Parse the agent's JSON reply
      - If save_each, append one line to output_jsonl with
        {run_id, timestamp, data_source, model_id, analyzer_output}
      - Write a per-patient, per-model JSON file in OUTPUT_DIR_INDIVIDUAL
    Any S3 prefix that ends with '/' is first expanded into the .docx/.pdf/.txt
    objects under that prefix.

    Returns:
        {"count": <number of runs>, "runs": [<record>, ...]}

    Raises:
        Whatever the agent call or JSON parsing raises; a failure on one source
        stops the batch.
    """
    # Use the caller's model for the agent; keep a plain string id for records and
    # file names (prevents JSON serialization errors).
    agent_model, model_id = _resolve_analyzer_model(model_name)

    # Expand any s3 prefixes into object URIs
    expanded_sources: list[str] = []
    for src in data_sources:
        if isinstance(src, str) and src.lower().startswith("s3://") and src.endswith("/"):
            # expand directory-like prefix
            expanded_sources.extend(_list_s3_uris(src, extensions=[".docx", ".pdf", ".txt"]))
        else:
            expanded_sources.append(src)

    # Prepare output dir for per-patient files
    os.makedirs(OUTPUT_DIR_INDIVIDUAL, exist_ok=True)
    safe_model = _safe_fragment(model_id)

    results: list[dict] = []
    for src in expanded_sources:
        # A fresh agent per source, so no conversation state carries over between patients.
        analyzer = Agent(
            model=agent_model,
            system_prompt=ANALYZER_PROMPT,
            tools=[fetch_data],
        )

        # Agent input: tell it which data_source to fetch
        payload = {"data_source": src}
        raw = analyzer(json.dumps(payload))

        # Parse the agent's strict-JSON output (smart_goals)
        parsed = _coerce_json(raw)

        # Append a JSONL record
        record = {
            "run_id": str(uuid.uuid4()),
            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
            "data_source": src,
            "model_id": model_id,
            "analyzer_output": parsed,
        }
        if save_each:
            _append_jsonl(output_jsonl, record)

        # Write per-patient, per-model JSON file (overwrites the previous run for this pair)
        base = _basename_no_ext(src)
        out_path = os.path.join(
            OUTPUT_DIR_INDIVIDUAL,
            f"{base}_{safe_model}_output.json"
        )
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(
                {
                    "model_id": model_id,
                    "data_source": src,
                    "timestamp": record["timestamp"],
                    "smart_goals": parsed.get("smart_goals", []),
                },
                f,
                ensure_ascii=False,
                indent=2,
            )

        results.append(record)

    return {"count": len(results), "runs": results}


In [32]:
# Patient summary documents to analyze: patient1 … patient5 in the same bucket.
sources = [
    f"s3://patient-summary-bucket/patient{patient_no}_summary.docx"
    for patient_no in range(1, 6)
]

# Run output behavior (JSONL + per-patient files)
summary = run_analyzer_batch(
    sources,
    model_name=full_model,
    output_jsonl=OUTPUT_JSONL,
    save_each=True,
)


ValidationException: An error occurred (ValidationException) when calling the ConverseStream operation: This model doesn't support system messages. Try again without a system message or use a model that supports system messages.

# Evaluator Agent

In [None]:
# ================== Evaluator (llm as judge) ==================

from strands import Agent, tool
import json, os, time, uuid, re
from statistics import mean

# ---------- Paths ----------
ANALYZER_JSONL = "analyzer_s3_outputs.jsonl"     # produced by the Analyzer
CLINICIAN_JSON = "clinician_evaluation.json"  # optional: only used for engagement eval
EVAL_JSONL     = "evaluator_runs.jsonl"       # run_evaluator appends one record per evaluation here

# NOTE(review): this name is not referenced by the evaluator code in this cell, and it
# replaces the Analyzer cell's value ("analyzer_individual_mistral_outputs") in the shared
# kernel namespace — confirm which directory is intended.
OUTPUT_DIR_INDIVIDUAL = "analyzer_individual_outputs"  # per-patient per-model JSON files

# ---------- File helpers ----------
def _read_jsonl(path: str):
    items = []
    if not os.path.exists(path):
        return items
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                items.append(json.loads(line))
            except json.JSONDecodeError:
                continue
    return items

def _read_json(path: str):
    if not os.path.exists(path):
        return []
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

def _append_jsonl(path: str, obj: dict):
    os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
    with open(path, "a", encoding="utf-8") as f:
        f.write(json.dumps(obj, ensure_ascii=False) + "\n")

def _coerce_json(text: str):
    s = str(text).strip()
    if s.startswith("{") and s.endswith("}"):
        return json.loads(s)
    m = re.search(r"\{.*\}", s, flags=re.DOTALL)
    if not m:
        raise ValueError("No JSON object found in evaluator output.")
    return json.loads(m.group(0))

# ---------- Low-level loaders as tools ----------
@tool
def load_analyzer_runs(limit: int | None = None) -> dict:
    """
    Load analyzer outputs (JSONL), sorted by timestamp ASC. Optionally keep only latest 'limit'.
    """
    # Docstring kept verbatim: the @tool decorator exposes it to the agent.
    ordered = sorted(_read_jsonl(ANALYZER_JSONL), key=lambda run: run.get("timestamp", ""))
    # A falsy limit (None or 0) keeps every run.
    return {"runs": ordered[-limit:] if limit else ordered}

@tool
def load_clinician_eval() -> dict:
    """
    Load clinician_evaluation.json (array), sorted by timestamp ASC (optional for SMART).
    """
    # Docstring kept verbatim: the @tool decorator exposes it to the agent.
    def _by_timestamp(entry):
        return entry.get("timestamp", "")

    clinician_rows = _read_json(CLINICIAN_JSON)
    clinician_rows.sort(key=_by_timestamp)
    return {"items": clinician_rows}

# ---------- Build Engagement pairs: analyzer vs clinician ----------
def _build_engagement_cases(runs, clinician_items):
    # index clinician by timestamp -> {device_id -> rec}
    clin_index = {}
    for row in clinician_items:
        ts = row.get("timestamp", "")
        for rec in row.get("clinician_output", {}).get("recommendations", []):
            clin_index.setdefault(ts, {})[rec.get("device_id")] = {
                "category_recommended": rec.get("category_recommended"),
                "rationale": rec.get("rationale"),
            }

    cases = []
    for run in runs:
        ts = run.get("timestamp", "")
        recs = (run.get("analyzer_output") or {}).get("recommendations", [])
        for rec in recs:
            dev = rec.get("device_id")
            analyzer_rec = {
                "category_recommended": rec.get("category_recommended"),
                "rationale": rec.get("rationale"),
            }
            clinician_rec = (clin_index.get(ts, {}) or {}).get(dev)
            if clinician_rec:
                cases.append({
                    "case_id": f"{ts}::{dev}",
                    "timestamp": ts,
                    "device_id": dev,
                    "analyzer": analyzer_rec,
                    "clinician": clinician_rec,
                })
    return cases

# ---------- Build SMART-goal cases (rubric-driven) ----------
def _build_smart_goal_cases(runs):
    """
    Flatten SMART goals from analyzer_output.smart_goals.
    Each case contains goal text plus a default SMART rubric the judge can use.
    """
    cases = []
    for run in runs:
        ts = run.get("timestamp", "")
        ao = run.get("analyzer_output") or {}
        goals = ao.get("smart_goals") or []
        for g in goals:
            num = g.get("goal_number")
            desc = g.get("description", "")
            cases.append({
                "case_id": f"{ts}::goal_{num}",
                "timestamp": ts,
                "goal_number": num,
                "goal_text": desc,
            })
    return cases

# ---------- Planning tool that abstracts use cases ----------
@tool
def build_eval_plan(limit: int | None = None) -> dict:
    """
    Decide which evaluation to run based on analyzer_outputs.jsonl contents.
    Returns a plan with:
      {
        "evaluation_type": "engagement_vs_clinician" | "smart_goals_rubric",
        "metrics": ["..."],
        "rubric": { ... optional ... },
        "cases": [ ... normalized cases ... ]
      }
    """
    # Docstring kept verbatim: the @tool decorator exposes it to the agent.
    runs = load_analyzer_runs(limit=limit)["runs"]
    clinicians = load_clinician_eval()["items"]

    def _any_run_has(field: str) -> bool:
        # True when at least one run's analyzer_output has a non-empty value under `field`.
        return any((run.get("analyzer_output") or {}).get(field) for run in runs)

    # Engagement mode wins when recommendations exist AND clinician data is available.
    if _any_run_has("recommendations") and clinicians:
        return {
            "evaluation_type": "engagement_vs_clinician",
            "metrics": ["correctness", "completeness", "helpfulness", "coherence", "relevance"],
            "rubric": {
                "notes": "Compare analyzer category & rationale to clinician's.",
                "agreement_rules": {
                    "match": "same category (case-insensitive)",
                    "partial": "different category but rationale overlaps clinician intent",
                    "mismatch": "different with little/no overlap",
                },
            },
            "cases": _build_engagement_cases(runs, clinicians),
        }

    # Otherwise fall back to rubric-only scoring of SMART goals when any are present.
    if _any_run_has("smart_goals"):
        return {
            "evaluation_type": "smart_goals_rubric",
            "metrics": ["specific", "measurable", "achievable", "relevant", "time_bound", "clarity"],
            "rubric": {
                "specific": "Clearly states the behavior/target (who/what/when/where).",
                "measurable": "Includes a quantifiable criterion (count, frequency, value).",
                "achievable": "Feasible for the patient (resources/constraints).",
                "relevant": "Aligned to diabetes/health needs in the notes.",
                "time_bound": "Contains a concrete timeframe or deadline.",
                "clarity": "Readable, unambiguous, free of contradictions.",
            },
            "cases": _build_smart_goal_cases(runs),
        }

    # Fallback: nothing to evaluate
    return {"evaluation_type": "none", "metrics": [], "rubric": {}, "cases": []}

# ---------- Evaluator Agent (general, plan-driven) ----------
# System prompt for the evaluator agent: call build_eval_plan once, score every case in
# the returned plan, and answer with a single JSON object.
# NOTE(review): build_eval_plan can also return evaluation_type "none" with no cases; the
# prompt only describes the two scoring modes — confirm how the judge should answer then.
EVALUATOR_PROMPT = """
You are an Evaluator (LLM-as-Judge) that supports multiple evaluation modes via a plan.

You will be given a plan from the tool build_eval_plan(limit) with:
- evaluation_type: "engagement_vs_clinician" or "smart_goals_rubric"
- metrics: list of metric names to score in [0.0, 1.0]
- rubric: guidance for scoring
- cases: a list of cases to evaluate

CALLS:
1) Call build_eval_plan(limit) EXACTLY ONCE (use the user-provided {"limit":N} if present; otherwise none).

SCORING:
- For "engagement_vs_clinician":
  Each case has:
    { case_id, timestamp, device_id, analyzer{category_recommended, rationale}, clinician{category_recommended, rationale} }
  Score metrics: correctness, completeness, helpfulness, coherence, relevance.
  Also produce:
    agreement = "match" | "partial" | "mismatch"
  Rules:
    - match if categories are the same (case-insensitive).
    - partial if different but analyzer rationale substantially overlaps clinician intent.
    - mismatch otherwise.

- For "smart_goals_rubric":
  Each case has:
    { case_id, timestamp, goal_number, goal_text }
  Score metrics: specific, measurable, achievable, relevant, time_bound, clarity.
  Focus only on the goal_text vs rubric. If unsafe, note it briefly.

OUTPUT: STRICT JSON ONLY:
{
  "evaluation_type": "string",
  "cases_scored": 0,
  "scores": [
    {
      "case_id": "string",
      "metric_scores": { "<metric>": 0.0 },
      "agreement": "match|partial|mismatch|n/a",
      "notes": "short justification (<=40 words)"
    }
  ]
}

PROCESS:
- Produce one score object per case with values in [0.0, 1.0].
- Use "agreement":"n/a" for smart_goals_rubric (no clinician).
- Keep notes concise and specific.
"""

# LLM-as-judge agent. The model id is passed as the model_id keyword, the same way the
# provider cell configures BedrockModel, rather than as a positional argument.
evaluator_agent = Agent(
    model=BedrockModel(model_id="us.anthropic.claude-3-5-sonnet-20241022-v2:0"),
    system_prompt=EVALUATOR_PROMPT,
    tools=[build_eval_plan],  # single entry tool that returns everything needed
)

# ---------- Runner (single call) ----------
def run_evaluator(limit: int | None = None, print_json: bool = True):
    """
    Run the evaluator agent once, add summary statistics, and log the result.

    Args:
        limit: When truthy, sent to the agent as {"limit": limit}; otherwise an
            empty JSON object is sent.
        print_json: When True, pretty-print the final evaluation as JSON.

    Returns:
        The evaluator's parsed JSON output, with "cases_scored" set to the number
        of score objects and "overall" set to the per-metric mean (rounded to 4
        places; 0.0 for a metric with no numeric values).

    Side effects:
        Appends {run_id, timestamp, evaluator_output} to EVAL_JSONL.
    """
    payload = {"limit": limit} if limit else {}
    raw = evaluator_agent(json.dumps(payload))
    out = _coerce_json(raw)

    # A null or missing "scores" is treated as no scores.
    scores = out.get("scores") or []

    # Collect numeric values per metric across whatever metric set came back.
    buckets: dict = {}
    for score in scores:
        for metric, value in (score.get("metric_scores") or {}).items():
            values = buckets.setdefault(metric, [])
            if isinstance(value, (int, float)):
                values.append(float(value))

    overall = {
        metric: (round(mean(values), 4) if values else 0.0)
        for metric, values in sorted(buckets.items())
    }
    out["cases_scored"] = len(scores)
    out["overall"] = overall

    _append_jsonl(EVAL_JSONL, {
        "run_id": str(uuid.uuid4()),
        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
        "evaluator_output": out
    })

    if print_json:
        print(json.dumps(out, ensure_ascii=False, indent=2))
    return out

# ---- run ----
# Assigned to a name so the notebook does not also echo the returned dict after printing it.
evaluation = run_evaluator()           # evaluate all available cases
