## LLM-as-Judge Evaluator agent prototype

### Overview
This LLM-as-Judge Evaluator agent runs in its own kernel session, allowing it to service requests from multiple sessions.


In [None]:
%%writefile ./lab_helpers/evaluator_agent_runtime.py
# evaluator_agent_runtime.py
import json
import uuid
import time
from typing import Optional, Dict, Any

import re

from strands import Agent
from strands.models import BedrockModel

from bedrock_agentcore.runtime import BedrockAgentCoreApp

from lab_helpers.smartgoalgenerator_mcp_tools import build_eval_plan_v2, load_analyzer_runs_v2

# =========================================
# ===== Module-level constants ============
# =========================================
EVALUATOR_MODEL_ID = "us.anthropic.claude-3-7-sonnet-20250219-v1:0"

# ==================================
# ===== LLM-as-Judge essential =====
# ==================================
def evaluator_system_prompt() -> str:
    """Return the system prompt that configures the LLM-as-Judge evaluator.

    The prompt tells the model to obtain a plan through ``build_eval_plan_v2``,
    to score every case according to the plan's evaluation type, and to reply
    with strict JSON only.
    """
    prompt = """You are an Evaluator (LLM-as-Judge) that supports multiple evaluation modes via a plan.

CRITICAL: You MUST evaluate ALL cases provided in the plan. Do not stop early or skip any cases.

You will be given a plan from the tool build_eval_plan_v2(analyzer_json_src, limit) with:
- evaluation_type: "engagement_vs_clinician" or "smart_goals_rubric"
- metrics: list of metric names to score in [0.0, 1.0]
- rubric: guidance for scoring
- cases: a list of cases to evaluate

CALLS:
1) Call build_eval_plan_v2(analyzer_json_src, limit) EXACTLY ONCE (use the user-provided {"analyzer_json_src":analyzer_json_src} if present; otherwise none).

SCORING:
- For "engagement_vs_clinician":
  Each case has:
    { case_id, timestamp, device_id, analyzer{category_recommended, rationale}, clinician{category_recommended, rationale} }
  Score metrics: correctness, completeness, helpfulness, coherence, relevance.
  Also produce:
    agreement = "match" | "partial" | "mismatch"
  Rules:
    - match if categories are the same (case-insensitive).
    - partial if different but analyzer rationale substantially overlaps clinician intent.
    - mismatch otherwise.

- For "smart_goals_rubric":
  Each case has:
    { case_id, timestamp, goal_number, goal_text }
  Score metrics: specific, measurable, achievable, relevant, time_bound, clarity.
  Focus only on the goal_text vs rubric. If unsafe, note it briefly.

OUTPUT: STRICT JSON ONLY:
{
  "evaluation_type": "string",
  "cases_scored": 0,
  "scores": [
    {
      "case_id": "string",
      "metric_scores": { "<metric>": 0.0 },
      "agreement": "match|partial|mismatch|n/a",
      "notes": "short justification (<=40 words)"
    }
  ]
}

PROCESS:
- Produce one score object per case with values in [0.0, 1.0].
- Use "agreement":"n/a" for smart_goals_rubric (no clinician).
- Keep notes concise and specific.
"""
    return prompt

# ==============================
# ===== Json/Jsonl helpers =====
# ==============================
def clean_json_str(s: str) -> str:
    """Repair common defects in LLM-produced JSON text before parsing.

    Two clean-ups are applied:

    1. A comma that is followed (after optional whitespace) by ``}`` or ``]``
       is dropped together with that whitespace. Text inside double-quoted
       JSON strings is left untouched, so values such as ``"x, }"`` survive.
    2. Everything after the last ``}`` or ``]`` is discarded.

    Args:
        s: Candidate JSON text.

    Returns:
        The cleaned text; it is not guaranteed to be valid JSON.
    """
    pieces = []
    in_string = False
    escaped = False
    i = 0
    n = len(s)
    while i < n:
        ch = s[i]
        if in_string:
            # Copy string contents verbatim, tracking backslash escapes so an
            # escaped quote does not end the string early.
            pieces.append(ch)
            if escaped:
                escaped = False
            elif ch == "\\":
                escaped = True
            elif ch == '"':
                in_string = False
            i += 1
            continue
        if ch == '"':
            in_string = True
        elif ch == ",":
            # Look past whitespace: a closing bracket means a trailing comma.
            j = i + 1
            while j < n and s[j].isspace():
                j += 1
            if j < n and s[j] in "}]":
                i = j  # skip the comma and the whitespace after it
                continue
        pieces.append(ch)
        i += 1

    cleaned = "".join(pieces)
    # strip any junk after final closing brace/bracket
    last_brace = max(cleaned.rfind("}"), cleaned.rfind("]"))
    if last_brace != -1:
        cleaned = cleaned[:last_brace + 1]
    return cleaned

def _coerce_json(s):
    """Extract and parse the JSON object contained in an agent response.

    Args:
        s: A string, or an object whose ``output``, ``content`` or ``text``
            attribute (checked in that order) holds the text. Any other
            object is converted with ``str()``.

    Returns:
        The value produced by ``json.loads`` for the extracted JSON text.

    Raises:
        ValueError: If no ``{...}`` span can be found in the text.
        json.JSONDecodeError: If the extracted text is still invalid after
            ``clean_json_str`` has been applied (context is printed first).
    """
    import json
    import re

    text = s
    if not isinstance(text, str):
        # Take the first matching attribute; fall back to str() if none exists.
        for attr in ("output", "content", "text"):
            if hasattr(text, attr):
                text = getattr(text, attr)
                break
        else:
            text = str(text)

    text = text.strip()

    is_bare_object = text.startswith("{") and text.endswith("}")
    if is_bare_object:
        candidate = text
    else:
        # Greedy match: from the first "{" to the last "}" in the text.
        found = re.search(r"\{.*\}", text, flags=re.DOTALL)
        if found is None:
            raise ValueError("No JSON object found in agent output.")
        candidate = found.group(0)

    candidate = clean_json_str(candidate)

    try:
        parsed = json.loads(candidate)
    except json.JSONDecodeError as e:
        # Show roughly 80 characters on each side of the failure position.
        start = max(0, e.pos-80)
        snippet = candidate[start:e.pos+80]
        print(f"\n--- JSON parse error ---\n{e}\nContext:\n...{snippet}...\n")
        raise
    return parsed

# =========================================
# ===== Module-level evaluator agent ======
# =========================================

# Bedrock model for the evaluator, created once at module load and shared by
# every agent built from ``evaluator_agent_kwargs``.
# NOTE(review): temperature=0.8 makes judge scores vary between runs; consider
# lowering it if reproducible evaluations are required.
evaluator_model = BedrockModel(
    model_id=EVALUATOR_MODEL_ID,
    max_tokens=8192,
    temperature=0.8,
    top_k=50,
    top_p=0.95,
)

# Agent configuration: model, plan/loader tools, and system prompt.
# Both tools are imported unconditionally at the top of this module, so there
# is nothing to guard here; a failed import would already have stopped the
# module from loading.
evaluator_agent_kwargs = {
    "model": evaluator_model,
    "tools": [build_eval_plan_v2, load_analyzer_runs_v2],
    "system_prompt": evaluator_system_prompt(),
}

# Initialize the evaluator agent once with the configuration above.
evaluator_agent = Agent(**evaluator_agent_kwargs)

# =========================================
# ===== Bedrock AgentCore Entrypoint --- Initialize the agentcore runtime ======
# =========================================
app = BedrockAgentCoreApp()

@app.entrypoint
def invoke(payload: Dict[str, Any]):
    """AgentCore Runtime entrypoint: evaluate an analyzer payload.

    Args:
        payload: Request body; must carry a non-empty ``"analyzer_payload"``
            entry, which is serialized into the evaluator prompt.

    Returns:
        An HTTP-style dict with ``statusCode`` and a JSON-encoded ``body``:

        * 400 - ``analyzer_payload`` is missing or empty.
        * 200 - body holds ``run_id``, ``timestamp`` (local time),
          ``evaluator_output`` (parsed agent JSON) and ``analyzer_input``.
        * 500 - any exception; body holds the error message.
    """
    try:
        analyzer_payload = payload.get("analyzer_payload")
        if not analyzer_payload:
            return {
                "statusCode": 400,
                "body": json.dumps({"error": "No analyzer_payload provided."})
            }

        # Step 1: Run the evaluator agent.
        # A Strands Agent keeps its conversation history between calls, so the
        # shared module-level agent would carry cases from earlier requests
        # into this evaluation. Build a fresh agent from the same
        # configuration for every request instead.
        text = "Please analyze this analyzer output and provide evaluation metrics: " + json.dumps(analyzer_payload)
        request_agent = Agent(**evaluator_agent_kwargs)
        response = request_agent(text)

        # Step 2: Parse agent output using the same helper function as smart goal generator
        parsed = _coerce_json(response)

        # Step 3: Structure the evaluation output
        output_obj = {
            "run_id": str(uuid.uuid4()),
            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
            "evaluator_output": parsed,
            "analyzer_input": analyzer_payload
        }

        # Step 4: Return HTTP-style response
        return {
            "statusCode": 200,
            "headers": {
                "Content-Type": "application/json"
            },
            "body": json.dumps(output_obj, ensure_ascii=False)
        }

    except Exception as e:
        # Top-level boundary: report the failure to the caller instead of
        # letting the runtime request crash.
        print(f"Error: {str(e)}")
        return {
            "statusCode": 500,
            "body": json.dumps({"error": str(e)})
        }

if __name__ == "__main__":
    app.run()

In [2]:
# Install required packages
%pip install -U -r requirements.txt -q

Note: you may need to restart the kernel to use updated packages.


In [3]:
# Install required packages
%pip install strands-agents "boto3>=1.39.15" strands-agents-tools bedrock_agentcore ddgs -q

Note: you may need to restart the kernel to use updated packages.


In [4]:
# Import required libraries
import os
import json
import boto3
from boto3.session import Session
from strands import Agent
from strands.models import BedrockModel


In [5]:
# Get boto session
boto_session = Session()
region = boto_session.region_name

In [6]:
pip install bedrock-agentcore-starter-toolkit

Note: you may need to restart the kernel to use updated packages.


In [7]:
!pip install boto3==1.34.162 botocore==1.34.162 --force-reinstall

Collecting boto3==1.34.162
  Using cached boto3-1.34.162-py3-none-any.whl.metadata (6.6 kB)
Collecting botocore==1.34.162
  Using cached botocore-1.34.162-py3-none-any.whl.metadata (5.7 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from boto3==1.34.162)
  Using cached jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)
Collecting s3transfer<0.11.0,>=0.10.0 (from boto3==1.34.162)
  Using cached s3transfer-0.10.4-py3-none-any.whl.metadata (1.7 kB)
Collecting python-dateutil<3.0.0,>=2.1 (from botocore==1.34.162)
  Using cached python_dateutil-2.9.0.post0-py2.py3-none-any.whl.metadata (8.4 kB)
Collecting urllib3!=2.2.0,<3,>=1.25.4 (from botocore==1.34.162)
  Using cached urllib3-2.5.0-py3-none-any.whl.metadata (6.5 kB)
Collecting six>=1.5 (from python-dateutil<3.0.0,>=2.1->botocore==1.34.162)
  Using cached six-1.17.0-py2.py3-none-any.whl.metadata (1.7 kB)
Using cached boto3-1.34.162-py3-none-any.whl (139 kB)
Using cached botocore-1.34.162-py3-none-any.whl (12.5 MB)
Using cached jmespath-1.0.1-py3-

In [8]:
pip install --upgrade bedrock-agentcore

Collecting boto3>=1.40.52 (from bedrock-agentcore)
  Using cached boto3-1.40.55-py3-none-any.whl.metadata (6.6 kB)
Collecting botocore>=1.40.52 (from bedrock-agentcore)
  Using cached botocore-1.40.55-py3-none-any.whl.metadata (5.7 kB)
Collecting s3transfer<0.15.0,>=0.14.0 (from boto3>=1.40.52->bedrock-agentcore)
  Using cached s3transfer-0.14.0-py3-none-any.whl.metadata (1.7 kB)
Using cached boto3-1.40.55-py3-none-any.whl (139 kB)
Using cached botocore-1.40.55-py3-none-any.whl (14.1 MB)
Using cached s3transfer-0.14.0-py3-none-any.whl (85 kB)
Installing collected packages: botocore, s3transfer, boto3
[2K  Attempting uninstall: botocore
[2K    Found existing installation: botocore 1.34.162
[2K    Uninstalling botocore-1.34.162:
[2K      Successfully uninstalled botocore-1.34.162
[2K  Attempting uninstall: s3transfer━━━━━━━━━━━━━━[0m [32m0/3[0m [botocore]
[2K    Found existing installation: s3transfer 0.10.4[32m0/3[0m [botocore]
[2K    Uninstalling s3transfer-0.10.4:━━━━━━━━━

In [9]:
import boto3
client = boto3.client('bedrock', region_name='us-east-1')
print(client.meta.service_model.service_name)


bedrock


In [10]:
from bedrock_agentcore_starter_toolkit import Runtime
from lab_helpers.utils_evaluator import create_agentcore_runtime_execution_role

# Resolve the region from the default boto3 session.
boto_session = boto3.session.Session()
region = boto_session.region_name

# Role ARN handed to the runtime as its execution role.
execution_role_arn = create_agentcore_runtime_execution_role()

agentcore_runtime = Runtime()

# Deployment settings for the evaluator entrypoint written earlier.
configure_options = {
    "entrypoint": "lab_helpers/evaluator_agent_runtime.py",
    "execution_role": execution_role_arn,
    "auto_create_ecr": True,
    "requirements_file": "requirements.txt",
    "region": region,
    "agent_name": "llm_evaluator_agent",
}
response = agentcore_runtime.configure(**configure_options)

print("Configuration completed:", response)

Entrypoint parsed: file=/mnt/custom-file-systems/efs/fs-09f36259b5e98907e_fsap-09cbf9f8e29ef1a0c/SIPPA-llm-evaluator-final/lab_helpers/evaluator_agent_runtime.py, bedrock_agentcore_name=evaluator_agent_runtime
Memory configured with STM only
Configuring BedrockAgentCore agent: llm_evaluator_agent


ℹ️ Role EvaluatorBedrockAgentCoreRole-us-east-1 already exists
Role ARN: arn:aws:iam::711246752798:role/EvaluatorBedrockAgentCoreRole-us-east-1


Will create new memory with mode: STM_ONLY
Memory configuration: Short-term memory only
Found existing memory ID from previous launch: llm_evaluator_agent_mem-Oq55uu3ClK


Generated Dockerfile: Dockerfile
Generated .dockerignore: /mnt/custom-file-systems/efs/fs-09f36259b5e98907e_fsap-09cbf9f8e29ef1a0c/SIPPA-llm-evaluator-final/.dockerignore
Keeping 'llm_evaluator_agent' as default agent
Bedrock AgentCore configured: /mnt/custom-file-systems/efs/fs-09f36259b5e98907e_fsap-09cbf9f8e29ef1a0c/SIPPA-llm-evaluator-final/.bedrock_agentcore.yaml


Configuration completed: config_path=PosixPath('/mnt/custom-file-systems/efs/fs-09f36259b5e98907e_fsap-09cbf9f8e29ef1a0c/SIPPA-llm-evaluator-final/.bedrock_agentcore.yaml') dockerfile_path=PosixPath('/mnt/custom-file-systems/efs/fs-09f36259b5e98907e_fsap-09cbf9f8e29ef1a0c/SIPPA-llm-evaluator-final/Dockerfile') dockerignore_path=PosixPath('/mnt/custom-file-systems/efs/fs-09f36259b5e98907e_fsap-09cbf9f8e29ef1a0c/SIPPA-llm-evaluator-final/.dockerignore') runtime='None' region='us-east-1' account_id='711246752798' execution_role='arn:aws:iam::711246752798:role/EvaluatorBedrockAgentCoreRole-us-east-1' ecr_repository=None auto_create_ecr=True memory_id=None


In [11]:
# Launch the agent (this will build and deploy the container)
from lab_helpers.utils_evaluator import put_ssm_parameter

launch_result = agentcore_runtime.launch()
runtime_arn = launch_result.agent_arn
print("Launch completed:", runtime_arn)

# Store the runtime ARN in SSM under a fixed parameter name.
agent_arn = put_ssm_parameter("/app/llmevaluator/agentcore/runtime_arn", runtime_arn)

🚀 CodeBuild mode: building in cloud (RECOMMENDED - DEFAULT)
   • Build ARM64 containers in the cloud with CodeBuild
   • No local Docker required
💡 Available deployment modes:
   • runtime.launch()                           → CodeBuild (current)
   • runtime.launch(local=True)                 → Local development
   • runtime.launch(local_build=True)           → Local build + cloud deploy (NEW)
Creating memory resource for agent: llm_evaluator_agent
✅ MemoryManager initialized for region: us-east-1
🔎 Retrieving memory resource with ID: llm_evaluator_agent_mem-Oq55uu3ClK...
  Found memory: llm_evaluator_agent_mem-Oq55uu3ClK
Found existing memory in cloud: llm_evaluator_agent_mem-Oq55uu3ClK
Existing memory has 0 strategies
✅ Using existing STM-only memory
Starting CodeBuild ARM64 deployment for agent 'llm_evaluator_agent' to account 711246752798 (us-east-1)
Setting up AWS resources (ECR repository, execution roles)...
Getting or creating ECR repository for agent: llm_evaluator_agent
✅ ECR

✅ Reusing existing ECR repository: 711246752798.dkr.ecr.us-east-1.amazonaws.com/bedrock-agentcore-llm_evaluator_agent


Getting or creating CodeBuild execution role for agent: llm_evaluator_agent
Role name: AmazonBedrockAgentCoreSDKCodeBuild-us-east-1-a9aedc1888
Reusing existing CodeBuild execution role: arn:aws:iam::711246752798:role/AmazonBedrockAgentCoreSDKCodeBuild-us-east-1-a9aedc1888
Using dockerignore.template with 45 patterns for zip filtering
Uploaded source to S3: llm_evaluator_agent/source.zip
Updated CodeBuild project: bedrock-agentcore-llm_evaluator_agent-builder
Starting CodeBuild build (this may take several minutes)...
Starting CodeBuild monitoring...
🔄 QUEUED started (total: 0s)
✅ QUEUED completed in 1.0s
🔄 PROVISIONING started (total: 1s)
✅ PROVISIONING completed in 8.3s
🔄 DOWNLOAD_SOURCE started (total: 9s)
✅ DOWNLOAD_SOURCE completed in 2.1s
🔄 BUILD started (total: 11s)
✅ BUILD completed in 15.5s
🔄 POST_BUILD started (total: 27s)
✅ POST_BUILD completed in 11.3s
🔄 FINALIZING started (total: 38s)
✅ FINALIZING completed in 1.0s
🔄 COMPLETED started (total: 39s)
✅ COMPLETED completed in 1

Launch completed: arn:aws:bedrock-agentcore:us-east-1:711246752798:runtime/llm_evaluator_agent-M3IWgT3T7l


In [12]:
import time

# Poll the endpoint status every 10 seconds until it reaches a terminal state.
end_status = ["READY", "CREATE_FAILED", "DELETE_FAILED", "UPDATE_FAILED"]
while True:
    status_response = agentcore_runtime.status()
    status = status_response.endpoint["status"]
    if status in end_status:
        break
    print(f"Waiting for deployment... Current status: {status}")
    time.sleep(10)

print(f"Final status: {status}")


✅ MemoryManager initialized for region: us-east-1
🔎 Retrieving memory resource with ID: llm_evaluator_agent_mem-Oq55uu3ClK...
  Found memory: llm_evaluator_agent_mem-Oq55uu3ClK
Retrieved Bedrock AgentCore status for: llm_evaluator_agent


Final status: READY


## Staging Data for Testing and Validation


In [13]:
output_obj=[{'case_id': 1,
  'timestamp': '2025-10-13 00:55:59',
  'goal_number': 1,
  'goal_text': 'Reduce daily carbohydrate intake to 45g per meal and aim for a total of 135g per day. Monitor progress weekly by recording carbohydrate counts in food diary.'},
 {'case_id': 2,
  'timestamp': '2025-10-13 00:55:59',
  'goal_number': 2,
  'goal_text': 'Engage in at least 30 minutes of moderate-intensity aerobic activity 5 days per week. Track activity using a fitness tracker or smartphone app.'},
 {'case_id': 3,
  'timestamp': '2025-10-13 00:55:59',
  'goal_number': 3,
  'goal_text': 'Take metformin 500mg twice daily with meals. Refill prescription every 90 days and report any side effects to healthcare provider.'},
 {'case_id': 4,
  'timestamp': '2025-10-13 00:55:59',
  'goal_number': 4,
  'goal_text': 'Check blood glucose levels before meals and at bedtime. Target pre-meal blood glucose levels below 130mg/dL and bedtime levels below 110mg/dL. Record results in a log.'},
 {'case_id': 5,
  'timestamp': '2025-10-13 00:55:59',
  'goal_number': 5,
  'goal_text': 'Maintain adequate water intake by drinking at least 8 glasses of water per day. Monitor urine color and report any unusual changes to healthcare provider.'},
 {'case_id': 6,
  'timestamp': '2025-10-13 00:55:59',
  'goal_number': 6,
  'goal_text': 'Attend all scheduled appointments with healthcare provider and bring a copy of the most recent blood glucose log. Update medication list and share any medication changes.'}]

## Prepare Payload from Staging Data

In [14]:
payload={"analyzer_payload": output_obj}
payload

{'analyzer_payload': [{'case_id': 1,
   'timestamp': '2025-10-13 00:55:59',
   'goal_number': 1,
   'goal_text': 'Reduce daily carbohydrate intake to 45g per meal and aim for a total of 135g per day. Monitor progress weekly by recording carbohydrate counts in food diary.'},
  {'case_id': 2,
   'timestamp': '2025-10-13 00:55:59',
   'goal_number': 2,
   'goal_text': 'Engage in at least 30 minutes of moderate-intensity aerobic activity 5 days per week. Track activity using a fitness tracker or smartphone app.'},
  {'case_id': 3,
   'timestamp': '2025-10-13 00:55:59',
   'goal_number': 3,
   'goal_text': 'Take metformin 500mg twice daily with meals. Refill prescription every 90 days and report any side effects to healthcare provider.'},
  {'case_id': 4,
   'timestamp': '2025-10-13 00:55:59',
   'goal_number': 4,
   'goal_text': 'Check blood glucose levels before meals and at bedtime. Target pre-meal blood glucose levels below 130mg/dL and bedtime levels below 110mg/dL. Record results in a

## Verifying Agentcore Agent is reachable within the same kernel session.

In [15]:
import uuid

# Fresh session ID, passed to the runtime as a string.
session_id = uuid.uuid4()

# Wrap the staged cases under the key the entrypoint reads.
payload = {"analyzer_payload": output_obj}

# Invoke through the starter-toolkit runtime object; no bearer_token is passed.
invoke_options = {"session_id": str(session_id)}
response = agentcore_runtime.invoke(payload, **invoke_options)

response

✅ MemoryManager initialized for region: us-east-1
Memory is active, proceeding with invoke


{'ResponseMetadata': {'RequestId': '1ae8cdb7-86e8-4046-b835-9454e702e8bb',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'date': 'Sun, 19 Oct 2025 08:21:20 GMT',
   'content-type': 'application/json',
   'transfer-encoding': 'chunked',
   'connection': 'keep-alive',
   'x-amzn-requestid': '1ae8cdb7-86e8-4046-b835-9454e702e8bb',
   'baggage': 'Self=1-68f49f67-7be5de4e533931b201332ada,session.id=fa0cf5fe-d735-41c8-b028-c72afeedd5b1',
   'x-amzn-bedrock-agentcore-runtime-session-id': 'fa0cf5fe-d735-41c8-b028-c72afeedd5b1',
   'x-amzn-trace-id': 'Root=1-68f49f67-57f572fb66a9996c48b04837;Parent=60c3b4822ec34424;Sampled=1;Self=1-68f49f67-7be5de4e533931b201332ada'},
  'RetryAttempts': 0},
 'runtimeSessionId': 'fa0cf5fe-d735-41c8-b028-c72afeedd5b1',
 'traceId': 'Root=1-68f49f67-57f572fb66a9996c48b04837;Parent=60c3b4822ec34424;Sampled=1;Self=1-68f49f67-7be5de4e533931b201332ada',
 'baggage': 'Self=1-68f49f67-7be5de4e533931b201332ada,session.id=fa0cf5fe-d735-41c8-b028-c72afeedd5b1',
 'contentType': 

## Add Support Function to Prepare Necessary Data for Validating Remote Access of Agentcore Agent

In [16]:
# ==== Testing ... ====
# @tool  ---------- Low-level loaders as tools ----------
def load_analyzer_runs_v2(analyzer_json_src: str, limit: int | None = None) -> dict:
    # Need to define input source
    """
    Load analyzer outputs (JSONL), sorted by timestamp ASC. Optionally keep only latest 'limit'.
    """
    #runs = _read_jsonl(analyzer_json_src)
    runs = analyzer_json_src
    runs.sort(key=lambda r: r.get("timestamp", ""))
    if limit:
        runs = runs[-limit:]
    return {"runs": runs}


# @tool  ---------- Planning tool that abstracts use cases ----------
def build_eval_plan_v2(analyzer_json_src: list[dict], limit=50) -> dict:
    """Build the evaluation plan for the given analyzer records.

    This test version always plans a SMART-goals rubric evaluation and uses
    the loaded records as the cases without further normalization.

    Args:
        analyzer_json_src: Analyzer records, forwarded to
            ``load_analyzer_runs_v2``.
        limit: Maximum number of latest records to keep (default 50).

    Returns:
        A plan of the form::

            {
              "evaluation_type": "smart_goals_rubric" | "none",
              "metrics": ["..."],
              "rubric": { ... },
              "cases": [ ... ]
            }

        ``evaluation_type`` is ``"none"`` (with empty metrics, rubric and
        cases) when there is nothing to evaluate.
    """
    runs = load_analyzer_runs_v2(analyzer_json_src=analyzer_json_src, limit=limit)["runs"]
    cases = runs
#    cases = _build_smart_goal_cases_v2(runs)

    # Fallback: nothing to evaluate. This branch used to sit after an
    # unconditional return and could never run.
    if not cases:
        return {
            "evaluation_type": "none",
            "metrics": [],
            "rubric": {},
            "cases": []
        }

    return {
            "evaluation_type": "smart_goals_rubric",
            "metrics": ["specific", "measurable", "achievable", "relevant", "time_bound", "clarity"],
            "rubric": {
                "specific":   "Clearly states the behavior/target (who/what/when/where).",
                "measurable": "Includes a quantifiable criterion (count, frequency, value).",
                "achievable": "Feasible for the patient (resources/constraints).",
                "relevant":   "Aligned to diabetes/health needs in the notes.",
                "time_bound": "Contains a concrete timeframe or deadline.",
                "clarity":    "Readable, unambiguous, free of contradictions."
            },
            "cases": cases
    }


In [17]:
# ==== Testing ... ====
import json

output = {
    'response': '{"statusCode": 200, "headers": {"Content-Type": "application/json"}, "body": "{\\"model_output\\": {\\"model_id\\": \\"mistral.mistral-7b-instruct-v0:2\\", \\"data_source\\": \\"s3://sippa/app_data_repo/SIPPA_AI-Extraction-Treatment-Plan/clinician_summary_data_source/AM-09121152.docx\\", \\"timestamp\\": \\"2025-10-13 00:55:59\\", \\"smart_goals\\": [{\\"goal_number\\": 1, \\"description\\": \\"Reduce daily carbohydrate intake to 45g per meal and aim for a total of 135g per day. Monitor progress weekly by recording carbohydrate counts in food diary.\\"}, {\\"goal_number\\": 2, \\"description\\": \\"Engage in at least 30 minutes of moderate-intensity aerobic activity 5 days per week. Track activity using a fitness tracker or smartphone app.\\"}, {\\"goal_number\\": 3, \\"description\\": \\"Take metformin 500mg twice daily with meals. Refill prescription every 90 days and report any side effects to healthcare provider.\\"}, {\\"goal_number\\": 4, \\"description\\": \\"Check blood glucose levels before meals and at bedtime. Target pre-meal blood glucose levels below 130mg/dL and bedtime levels below 110mg/dL. Record results in a log.\\"}, {\\"goal_number\\": 5, \\"description\\": \\"Maintain adequate water intake by drinking at least 8 glasses of water per day. Monitor urine color and report any unusual changes to healthcare provider.\\"}, {\\"goal_number\\": 6, \\"description\\": \\"Attend all scheduled appointments with healthcare provider and bring a copy of the most recent blood glucose log. Update medication list and share any medication changes.\\"}]}, \\"evaluator_result\\": {\\"error\\": \\"\'BedrockAgentCoreDataPlaneFrontingLayer\' object has no attribute \'invoke_runtime\'\\"}}"}'
}

# Unwrap the two JSON layers: the 'response' string, then its 'body' string.
response_data = json.loads(output['response'])
body_data = json.loads(response_data['body'])

# Pull out the pieces needed to build the evaluation cases.
model_output = body_data['model_output']
timestamp = model_output['timestamp']
smart_goals = model_output['smart_goals']

# One case per goal; case_id is a simple 1-based running count.
final_output = [
    {
        'case_id': case_id,
        'timestamp': timestamp,
        'goal_number': goal['goal_number'],
        'goal_text': goal['description'],
    }
    for case_id, goal in enumerate(smart_goals, start=1)
]

output_obj = final_output
output_obj

[{'case_id': 1,
  'timestamp': '2025-10-13 00:55:59',
  'goal_number': 1,
  'goal_text': 'Reduce daily carbohydrate intake to 45g per meal and aim for a total of 135g per day. Monitor progress weekly by recording carbohydrate counts in food diary.'},
 {'case_id': 2,
  'timestamp': '2025-10-13 00:55:59',
  'goal_number': 2,
  'goal_text': 'Engage in at least 30 minutes of moderate-intensity aerobic activity 5 days per week. Track activity using a fitness tracker or smartphone app.'},
 {'case_id': 3,
  'timestamp': '2025-10-13 00:55:59',
  'goal_number': 3,
  'goal_text': 'Take metformin 500mg twice daily with meals. Refill prescription every 90 days and report any side effects to healthcare provider.'},
 {'case_id': 4,
  'timestamp': '2025-10-13 00:55:59',
  'goal_number': 4,
  'goal_text': 'Check blood glucose levels before meals and at bedtime. Target pre-meal blood glucose levels below 130mg/dL and bedtime levels below 110mg/dL. Record results in a log.'},
 {'case_id': 5,
  'timestam

In [18]:
# ==== Testing ... ====
runs = load_analyzer_runs_v2(analyzer_json_src=output_obj)["runs"]
runs


[{'case_id': 1,
  'timestamp': '2025-10-13 00:55:59',
  'goal_number': 1,
  'goal_text': 'Reduce daily carbohydrate intake to 45g per meal and aim for a total of 135g per day. Monitor progress weekly by recording carbohydrate counts in food diary.'},
 {'case_id': 2,
  'timestamp': '2025-10-13 00:55:59',
  'goal_number': 2,
  'goal_text': 'Engage in at least 30 minutes of moderate-intensity aerobic activity 5 days per week. Track activity using a fitness tracker or smartphone app.'},
 {'case_id': 3,
  'timestamp': '2025-10-13 00:55:59',
  'goal_number': 3,
  'goal_text': 'Take metformin 500mg twice daily with meals. Refill prescription every 90 days and report any side effects to healthcare provider.'},
 {'case_id': 4,
  'timestamp': '2025-10-13 00:55:59',
  'goal_number': 4,
  'goal_text': 'Check blood glucose levels before meals and at bedtime. Target pre-meal blood glucose levels below 130mg/dL and bedtime levels below 110mg/dL. Record results in a log.'},
 {'case_id': 5,
  'timestam

In [19]:
# ==== Testing ... ====
build_eval_plan_v2(analyzer_json_src=output_obj)

{'evaluation_type': 'smart_goals_rubric',
 'metrics': ['specific',
  'measurable',
  'achievable',
  'relevant',
  'time_bound',
  'clarity'],
 'rubric': {'specific': 'Clearly states the behavior/target (who/what/when/where).',
  'measurable': 'Includes a quantifiable criterion (count, frequency, value).',
  'achievable': 'Feasible for the patient (resources/constraints).',
  'relevant': 'Aligned to diabetes/health needs in the notes.',
  'time_bound': 'Contains a concrete timeframe or deadline.',
  'clarity': 'Readable, unambiguous, free of contradictions.'},
 'cases': [{'case_id': 1,
   'timestamp': '2025-10-13 00:55:59',
   'goal_number': 1,
   'goal_text': 'Reduce daily carbohydrate intake to 45g per meal and aim for a total of 135g per day. Monitor progress weekly by recording carbohydrate counts in food diary.'},
  {'case_id': 2,
   'timestamp': '2025-10-13 00:55:59',
   'goal_number': 2,
   'goal_text': 'Engage in at least 30 minutes of moderate-intensity aerobic activity 5 days

In [20]:
# ==== Testing ... ====
output_obj

[{'case_id': 1,
  'timestamp': '2025-10-13 00:55:59',
  'goal_number': 1,
  'goal_text': 'Reduce daily carbohydrate intake to 45g per meal and aim for a total of 135g per day. Monitor progress weekly by recording carbohydrate counts in food diary.'},
 {'case_id': 2,
  'timestamp': '2025-10-13 00:55:59',
  'goal_number': 2,
  'goal_text': 'Engage in at least 30 minutes of moderate-intensity aerobic activity 5 days per week. Track activity using a fitness tracker or smartphone app.'},
 {'case_id': 3,
  'timestamp': '2025-10-13 00:55:59',
  'goal_number': 3,
  'goal_text': 'Take metformin 500mg twice daily with meals. Refill prescription every 90 days and report any side effects to healthcare provider.'},
 {'case_id': 4,
  'timestamp': '2025-10-13 00:55:59',
  'goal_number': 4,
  'goal_text': 'Check blood glucose levels before meals and at bedtime. Target pre-meal blood glucose levels below 130mg/dL and bedtime levels below 110mg/dL. Record results in a log.'},
 {'case_id': 5,
  'timestam

## Staging Access to the AgentCore Agent from a Separate Kernel Session

In [21]:
import boto3
import json

EVALUATOR_RUNTIME_ARN = "arn:aws:bedrock-agentcore:us-east-1:711246752798:runtime/llm_evaluator_agent-M3IWgT3T7l"

def call_evaluator_runtime(payload: dict) -> dict | str:
    """Invoke the deployed evaluator runtime and return its decoded response.

    Args:
        payload: JSON-serializable request body, e.g.
            ``{"analyzer_payload": [...]}``.

    Returns:
        * For a ``text/event-stream`` response: the ``data: `` lines (prefix
          removed) joined with newlines, as a ``str``. Each line is also
          printed as it arrives.
        * For an ``application/json`` response: the parsed JSON value.
        * For any other content type: the raw boto3 response.
    """
    # The region is the fourth ARN field; using it keeps the call working when
    # the kernel's default region differs from the runtime's region.
    runtime_region = EVALUATOR_RUNTIME_ARN.split(":")[3]
    agent_core_client = boto3.client('bedrock-agentcore', region_name=runtime_region)

    # The runtime expects the payload as encoded JSON bytes.
    prompt = json.dumps(payload).encode()

    # Invoke the agent
    response = agent_core_client.invoke_agent_runtime(
        agentRuntimeArn=EVALUATOR_RUNTIME_ARN,
        payload=prompt,
    )

    content_type = response.get("contentType") or ""

    if "text/event-stream" in content_type:
        # Handle streaming response
        content = []
        for line in response["response"].iter_lines(chunk_size=10):
            if line:
                line = line.decode("utf-8")
                if line.startswith("data: "):
                    line = line[6:]
                    print(line)
                    content.append(line)
        return "\n".join(content)

    # Compare only the media type so "application/json; charset=utf-8" matches too.
    media_type = content_type.split(";")[0].strip().lower()
    if media_type == "application/json":
        # Join the raw bytes before decoding: a multi-byte UTF-8 character may
        # be split across chunks, which breaks chunk-by-chunk decoding.
        raw_body = b"".join(response.get("response", []))
        return json.loads(raw_body.decode("utf-8"))

    # Unknown content type: hand back the raw response.
    return response


## Validating Reachability of the AgentCore Agent from a Separate Kernel Session

In [22]:
payload={'analyzer_payload':output_obj}

raw_output = call_evaluator_runtime(payload)

raw_output

{'statusCode': 200,
 'headers': {'Content-Type': 'application/json'},
 'body': '{"run_id": "59a486c0-2f48-4d8e-b733-b5614929cf74", "timestamp": "2025-10-19 08:21:56", "evaluator_output": {"evaluation_type": "smart_goals_rubric", "cases_scored": 6, "scores": [{"case_id": 1, "metric_scores": {"specific": 1.0, "measurable": 1.0, "achievable": 0.9, "relevant": 1.0, "time_bound": 0.9, "clarity": 1.0}, "agreement": "n/a", "notes": "Clear carb targets (45g/meal, 135g/day) with weekly monitoring method specified. Achievable but challenging."}, {"case_id": 2, "metric_scores": {"specific": 1.0, "measurable": 1.0, "achievable": 1.0, "relevant": 1.0, "time_bound": 1.0, "clarity": 1.0}, "agreement": "n/a", "notes": "Perfectly SMART goal with specific duration (30min), frequency (5 days/week), intensity level, and tracking method."}, {"case_id": 3, "metric_scores": {"specific": 1.0, "measurable": 0.9, "achievable": 1.0, "relevant": 1.0, "time_bound": 0.9, "clarity": 1.0}, "agreement": "n/a", "notes"

In [23]:
response=json.loads(raw_output['body'])['evaluator_output']
response

{'evaluation_type': 'smart_goals_rubric',
 'cases_scored': 6,
 'scores': [{'case_id': 1,
   'metric_scores': {'specific': 1.0,
    'measurable': 1.0,
    'achievable': 0.9,
    'relevant': 1.0,
    'time_bound': 0.9,
    'clarity': 1.0},
   'agreement': 'n/a',
   'notes': 'Clear carb targets (45g/meal, 135g/day) with weekly monitoring method specified. Achievable but challenging.'},
  {'case_id': 2,
   'metric_scores': {'specific': 1.0,
    'measurable': 1.0,
    'achievable': 1.0,
    'relevant': 1.0,
    'time_bound': 1.0,
    'clarity': 1.0},
   'agreement': 'n/a',
   'notes': 'Perfectly SMART goal with specific duration (30min), frequency (5 days/week), intensity level, and tracking method.'},
  {'case_id': 3,
   'metric_scores': {'specific': 1.0,
    'measurable': 0.9,
    'achievable': 1.0,
    'relevant': 1.0,
    'time_bound': 0.9,
    'clarity': 1.0},
   'agreement': 'n/a',
   'notes': 'Clear medication instructions with dosage and timing. Includes refill schedule and side eff

In [24]:
dict_output = {"eval_out": response}

json_str = json.dumps(dict_output)
json_str

'{"eval_out": {"evaluation_type": "smart_goals_rubric", "cases_scored": 6, "scores": [{"case_id": 1, "metric_scores": {"specific": 1.0, "measurable": 1.0, "achievable": 0.9, "relevant": 1.0, "time_bound": 0.9, "clarity": 1.0}, "agreement": "n/a", "notes": "Clear carb targets (45g/meal, 135g/day) with weekly monitoring method specified. Achievable but challenging."}, {"case_id": 2, "metric_scores": {"specific": 1.0, "measurable": 1.0, "achievable": 1.0, "relevant": 1.0, "time_bound": 1.0, "clarity": 1.0}, "agreement": "n/a", "notes": "Perfectly SMART goal with specific duration (30min), frequency (5 days/week), intensity level, and tracking method."}, {"case_id": 3, "metric_scores": {"specific": 1.0, "measurable": 0.9, "achievable": 1.0, "relevant": 1.0, "time_bound": 0.9, "clarity": 1.0}, "agreement": "n/a", "notes": "Clear medication instructions with dosage and timing. Includes refill schedule and side effect reporting protocol."}, {"case_id": 4, "metric_scores": {"specific": 1.0, "m