From ed832ca3e0b503ab55d2ca26bc809be8c93d0f81 Mon Sep 17 00:00:00 2001 From: TomMonks Date: Fri, 1 Aug 2025 16:02:13 +0100 Subject: [PATCH 01/23] feat(pkg): start of MCP sim package --- mcpsim/__init__.py | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 mcpsim/__init__.py diff --git a/mcpsim/__init__.py b/mcpsim/__init__.py new file mode 100644 index 0000000..66e77de --- /dev/null +++ b/mcpsim/__init__.py @@ -0,0 +1,5 @@ +"""Package for managing Model Context Protocol servers for simulation models """ + +__version__ = "0.1.0" +__author__ = "Tom Monks, Alison Harper" \ No newline at end of file From a3abce3288e66ec4fc1bffdea56b80d5bcdd57a3 Mon Sep 17 00:00:00 2001 From: TomMonks Date: Fri, 1 Aug 2025 16:09:32 +0100 Subject: [PATCH 02/23] feat(pkg): +Protocol to adapt sim model interface --- mcpsim/protocol.py | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 mcpsim/protocol.py diff --git a/mcpsim/protocol.py b/mcpsim/protocol.py new file mode 100644 index 0000000..4b3be64 --- /dev/null +++ b/mcpsim/protocol.py @@ -0,0 +1,36 @@ +"""Defines a simple protocol that each simulation model should +use in order to be compatible with the agent. """ + +from typing import Protocol, runtime_checkable, Dict, Any +from abc import abstractmethod + +@runtime_checkable +class SimulationModelAdapter(Protocol): + """Protocol defining the interface for MCP-compatible simulation models.""" + + @abstractmethod + def run_simulation(self, parameters: Dict[str, Any]) -> Dict[str, Any]: + """Execute the simulation with given parameters and return results.""" + ... + + @abstractmethod + def get_parameter_schema(self) -> Dict[str, Any]: + """Return JSON schema for valid simulation parameters.""" + ... + + @abstractmethod + def get_model_description(self) -> str: + """Return human-readable description of the simulation model.""" + ... + + @abstractmethod + def validate_parameters(self, parameters: Dict[str, Any]) -> Dict[str, Any]: + """Validate parameters and return validation results.""" + ... + + @property + @abstractmethod + def model_name(self) -> str: + """Return the name/identifier for this simulation model.""" + ... 
\ No newline at end of file From b6db7c585e5d448f98fd1c7c930b6e311882fe85 Mon Sep 17 00:00:00 2001 From: TomMonks Date: Fri, 1 Aug 2025 16:30:12 +0100 Subject: [PATCH 03/23] feat(examples): call_centre_model --- mcpsim/example_models/__init__.py | 0 mcpsim/example_models/callcentre/__init__.py | 0 .../callcentre/call_centre_mcp.py | 73 ++++ .../callcentre/call_centre_model.py | 374 ++++++++++++++++++ .../callcentre/resources/parameter_prompt.txt | 45 +++ .../callcentre/resources/schema.json | 79 ++++ 6 files changed, 571 insertions(+) create mode 100644 mcpsim/example_models/__init__.py create mode 100644 mcpsim/example_models/callcentre/__init__.py create mode 100644 mcpsim/example_models/callcentre/call_centre_mcp.py create mode 100644 mcpsim/example_models/callcentre/call_centre_model.py create mode 100644 mcpsim/example_models/callcentre/resources/parameter_prompt.txt create mode 100644 mcpsim/example_models/callcentre/resources/schema.json diff --git a/mcpsim/example_models/__init__.py b/mcpsim/example_models/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/mcpsim/example_models/callcentre/__init__.py b/mcpsim/example_models/callcentre/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/mcpsim/example_models/callcentre/call_centre_mcp.py b/mcpsim/example_models/callcentre/call_centre_mcp.py new file mode 100644 index 0000000..25c0fd8 --- /dev/null +++ b/mcpsim/example_models/callcentre/call_centre_mcp.py @@ -0,0 +1,73 @@ +import json +from model import run_simulation_from_dict +from typing import Dict, Any + + +class CallCentreSimulation: + """Call centre simulation implementation following + the SimulationModelAdapter protocol.""" + + def __init__(self, schema_path: str = "resources/schema.json"): + self.schema_path = schema_path + self._schema = None + + @property + def model_name(self) -> str: + return "urgent_care_call_centre" + + def run_simulation(self, parameters: Dict[str, Any]) -> Dict[str, Any]: + """Run the discrete-event healthcare call centre simulation.""" + return run_simulation_from_dict(parameters) + + def get_parameter_schema(self) -> Dict[str, Any]: + """Load and return the parameter schema.""" + if self._schema is None: + with open(self.schema_path) as f: + self._schema = json.load(f) + return self._schema + + def get_model_description(self) -> str: + """Return human-readable description of the call centre model.""" + return ( + "This is a discrete-event simulation of a healthcare call centre. " + "Patients call in, interact with operators, and a subset may require a nurse callback. " + "Simulation components: SimPy queues and resources. Tracks wait times, utilization, and callback rates. " + "Configurable parameters: number of operators and nurses, call durations and rates, etc. 
" + "Sample: 'Run with 14 operators and 5% higher demand.'" + ) + + def validate_parameters(self, parameters: Dict[str, Any]) -> Dict[str, Any]: + """Validate simulation parameters against the experiment schema.""" + schema = self.get_parameter_schema() + errors = [] + + for key, value in parameters.items(): + # Check for unknown parameters + if key not in schema: + errors.append(f"Unknown parameter: {key}") + continue + + spec = schema[key] + expected_type = int if spec["type"] == "int" else float + + # Type validation + if not isinstance(value, expected_type): + errors.append(f"{key} must be {spec['type']}") + continue + + # Range validation + if "minimum" in spec and value < spec["minimum"]: + errors.append(f"{key} below minimum {spec['minimum']}") + if "maximum" in spec and value > spec["maximum"]: + errors.append(f"{key} above maximum {spec['maximum']}") + + # Cross-parameter validation + if all(x in parameters for x in ("call_low", "call_mode", "call_high")): + if not (parameters["call_low"] <= parameters["call_mode"] <= parameters["call_high"]): + errors.append("call_low ≤ call_mode ≤ call_high violated") + + if all(x in parameters for x in ("nurse_consult_low", "nurse_consult_high")): + if not (parameters["nurse_consult_low"] <= parameters["nurse_consult_high"]): + errors.append("nurse_consult_low ≤ nurse_consult_high violated") + + return {"is_valid": len(errors) == 0, "errors": errors} \ No newline at end of file diff --git a/mcpsim/example_models/callcentre/call_centre_model.py b/mcpsim/example_models/callcentre/call_centre_model.py new file mode 100644 index 0000000..ede7143 --- /dev/null +++ b/mcpsim/example_models/callcentre/call_centre_model.py @@ -0,0 +1,374 @@ + +""" +Enhanced Call Centre Simulation Model with Nurse Callbacks + +A discrete event simulation model of a call centre using SimPy. +Extended to include nurse callbacks for 40% of patients. + +Author: Enhanced from Tom Monks' original model +""" + +import numpy as np +import pandas as pd +import simpy +import itertools +import matplotlib.pyplot as plt +import seaborn as sns +import inspect + +# ============================================================================= +# CONSTANTS AND DEFAULT VALUES +# ============================================================================= + +# Default resources +N_OPERATORS = 13 +N_NURSES = 10 + +# Default mean inter-arrival time (exp) +MEAN_IAT = 60 / 100 + +# Default service time parameters (triangular) +CALL_LOW = 5.0 +CALL_MODE = 7.0 +CALL_HIGH = 10.0 + +# Nurse callback parameters +CALLBACK_PROBABILITY = 0.4 # 40% of patients need nurse callback +NURSE_CONSULT_LOW = 10.0 # Uniform distribution 10-20 minutes +NURSE_CONSULT_HIGH = 20.0 + +# Sampling settings +N_STREAMS = 4 # Increased for additional distributions +DEFAULT_RND_SET = 0 + +# Boolean switch to display simulation results as the model runs +TRACE = False + +# Run variables +RESULTS_COLLECTION_PERIOD = 1000 + +# ============================================================================= +# DISTRIBUTION CLASSES +# ============================================================================= + +class Triangular: + """ + Convenience class for the triangular distribution. + Packages up distribution parameters, seed and random generator. 
+ """ + def __init__(self, low, mode, high, random_seed=None): + self.rand = np.random.default_rng(seed=random_seed) + self.low = low + self.high = high + self.mode = mode + + def sample(self, size=None): + return self.rand.triangular(self.low, self.mode, self.high, size=size) + +class Exponential: + """ + Convenience class for the exponential distribution. + Packages up distribution parameters, seed and random generator. + """ + def __init__(self, mean, random_seed=None): + self.rand = np.random.default_rng(seed=random_seed) + self.mean = mean + + def sample(self, size=None): + return self.rand.exponential(self.mean, size=size) + +class Uniform: + """ + Convenience class for the uniform distribution. + Packages up distribution parameters, seed and random generator. + """ + def __init__(self, low, high, random_seed=None): + self.rand = np.random.default_rng(seed=random_seed) + self.low = low + self.high = high + + def sample(self, size=None): + return self.rand.uniform(self.low, self.high, size=size) + +class Bernoulli: + """ + Convenience class for the Bernoulli distribution. + Used for callback decision (40% probability). + """ + def __init__(self, p, random_seed=None): + self.rand = np.random.default_rng(seed=random_seed) + self.p = p + + def sample(self, size=None): + return self.rand.binomial(1, self.p, size=size) + +# ============================================================================= +# EXPERIMENT CLASS +# ============================================================================= + +class Experiment: + """ + Enhanced experiment class with nurse callback functionality. + """ + def __init__( + self, + random_number_set=DEFAULT_RND_SET, + n_operators=N_OPERATORS, + n_nurses=N_NURSES, + mean_iat=MEAN_IAT, + call_low=CALL_LOW, + call_mode=CALL_MODE, + call_high=CALL_HIGH, + callback_prob=CALLBACK_PROBABILITY, + nurse_consult_low=NURSE_CONSULT_LOW, + nurse_consult_high=NURSE_CONSULT_HIGH, + n_streams=N_STREAMS, + ): + # sampling + self.random_number_set = random_number_set + self.n_streams = n_streams + + # store parameters for the run of the model + self.n_operators = n_operators + self.n_nurses = n_nurses + self.mean_iat = mean_iat + self.call_low = call_low + self.call_mode = call_mode + self.call_high = call_high + self.callback_prob = callback_prob + self.nurse_consult_low = nurse_consult_low + self.nurse_consult_high = nurse_consult_high + + # resources: initialized after Environment is created + self.operators = None + self.nurses = None + + # initialise results and sampling + self.init_results_variables() + self.init_sampling() + + def set_random_no_set(self, random_number_set): + self.random_number_set = random_number_set + self.init_sampling() + + def init_sampling(self): + """Create the distributions used by the model""" + # produce n non-overlapping streams + seed_sequence = np.random.SeedSequence(self.random_number_set) + self.seeds = seed_sequence.spawn(self.n_streams) + + # create distributions + self.arrival_dist = Exponential(self.mean_iat, random_seed=self.seeds[0]) + self.call_dist = Triangular( + self.call_low, self.call_mode, self.call_high, random_seed=self.seeds[1] + ) + self.callback_dist = Bernoulli(self.callback_prob, random_seed=self.seeds[2]) + self.nurse_dist = Uniform( + self.nurse_consult_low, self.nurse_consult_high, random_seed=self.seeds[3] + ) + + def init_results_variables(self): + """Initialize all experiment variables used in results collection""" + self.results = {} + + # Original results + self.results["waiting_times"] = [] + 
self.results["total_call_duration"] = 0.0 + + # New nurse results + self.results["nurse_waiting_times"] = [] + self.results["total_nurse_duration"] = 0.0 + self.results["callbacks_requested"] = 0 + self.results["total_patients"] = 0 + +# ============================================================================= +# UTILITY FUNCTIONS +# ============================================================================= + +def trace(msg): + """Turning printing of events on and off.""" + if TRACE: + print(msg) + +# ============================================================================= +# MODEL LOGIC +# ============================================================================= + +def nurse_service(identifier, env, args): + """ + Simulates the nurse callback process + 1. request and wait for a nurse + 2. nurse consultation (uniform) + 3. exit system + """ + # record the time that patient entered nurse queue + start_wait = env.now + + # request a nurse + with args.nurses.request() as req: + yield req + + # record the waiting time for nurse callback + waiting_time = env.now - start_wait + args.results["nurse_waiting_times"].append(waiting_time) + + trace(f"nurse started callback for patient {identifier} at {env.now:.3f}") + + # nurse consultation time + consult_duration = args.nurse_dist.sample() + yield env.timeout(consult_duration) + + # update total nurse duration + args.results["total_nurse_duration"] += consult_duration + + trace(f"nurse callback for patient {identifier} ended at {env.now:.3f}; " + f"waiting time was {waiting_time:.3f}") + +def service(identifier, env, args): + """ + Enhanced service process with nurse callback + 1. request and wait for a call operator + 2. phone triage (triangular) + 3. determine if callback needed (40% probability) + 4. if needed, initiate nurse callback process + 5. 
exit system + """ + # record the time that call entered the queue + start_wait = env.now + + # request an operator + with args.operators.request() as req: + yield req + + # record the waiting time for call to be answered + waiting_time = env.now - start_wait + args.results["waiting_times"].append(waiting_time) + + trace(f"operator answered call {identifier} at {env.now:.3f}") + + # call duration + call_duration = args.call_dist.sample() + yield env.timeout(call_duration) + + # update the total call duration + args.results["total_call_duration"] += call_duration + + trace(f"call {identifier} ended {env.now:.3f}; " + f"waiting time was {waiting_time:.3f}") + + # After operator call, determine if nurse callback is needed + args.results["total_patients"] += 1 + callback_needed = args.callback_dist.sample() + + if callback_needed: + args.results["callbacks_requested"] += 1 + trace(f"patient {identifier} requires nurse callback") + + # Start nurse callback process + env.process(nurse_service(identifier, env, args)) + +def arrivals_generator(env, args): + """Generate patient arrivals with exponential inter-arrival times""" + for caller_count in itertools.count(start=1): + # sample inter-arrival time + inter_arrival_time = args.arrival_dist.sample() + yield env.timeout(inter_arrival_time) + + trace(f"call arrives at: {env.now:.3f}") + + # start service process + env.process(service(caller_count, env, args)) + +# ============================================================================= +# EXPERIMENT EXECUTION FUNCTIONS +# ============================================================================= + +def single_run(experiment, rep=0, rc_period=RESULTS_COLLECTION_PERIOD): + """Perform a single run of the enhanced model""" + run_results = {} + + # reset all result collection variables + experiment.init_results_variables() + + # set random number set + experiment.set_random_no_set(rep) + + # create environment + env = simpy.Environment() + + # create resources + experiment.operators = simpy.Resource(env, capacity=experiment.n_operators) + experiment.nurses = simpy.Resource(env, capacity=experiment.n_nurses) + + # start arrivals process + env.process(arrivals_generator(env, experiment)) + + # run simulation + env.run(until=rc_period) + + # calculate results + run_results["01_mean_waiting_time"] = np.mean(experiment.results["waiting_times"]) + run_results["02_operator_util"] = ( + experiment.results["total_call_duration"] / + (rc_period * experiment.n_operators) + ) * 100.0 + + # Calculate nurse results + if experiment.results["nurse_waiting_times"]: + run_results["03_mean_nurse_waiting_time"] = np.mean(experiment.results["nurse_waiting_times"]) + else: + run_results["03_mean_nurse_waiting_time"] = 0.0 + + run_results["04_nurse_util"] = ( + experiment.results["total_nurse_duration"] / + (rc_period * experiment.n_nurses) + ) * 100.0 + + # Additional metrics + run_results["05_callback_rate"] = ( + experiment.results["callbacks_requested"] / + experiment.results["total_patients"] + ) * 100.0 if experiment.results["total_patients"] > 0 else 0.0 + + return run_results + +def multiple_replications(experiment, rc_period=RESULTS_COLLECTION_PERIOD, n_reps=5): + """Perform multiple replications of the enhanced model""" + results = [single_run(experiment, rep, rc_period) for rep in range(n_reps)] + df_results = pd.DataFrame(results) + df_results.index = np.arange(1, len(df_results) + 1) + df_results.index.name = "rep" + return df_results + +def set_trace(trace_on=True): + """Turn tracing on/off globally""" + 
global TRACE + TRACE = trace_on + + +def run_simulation_from_dict(params: dict): + """ + Wrapper to run the simulation using params supplied in a dictionary. + Only the parameters necessary for the Experiment's constructor are passed; + defaults are used for those not provided. + """ + # Extract constructor signature of the Experiment class + signature = inspect.signature(Experiment.__init__) + + # Build kwargs dict: match keys in 'params' to Experiment.__init__ params + experiment_kwargs = { + key: params.get(key, param.default) + for key, param in signature.parameters.items() + if key != 'self' # exclude 'self' from constructor + } + + # Create Experiment instance with dynamic/default parameters + exp = Experiment(**experiment_kwargs) + + # Separate non-constructor parameters + run_length = params.get("run_length", RESULTS_COLLECTION_PERIOD) + rep_seed = params.get("random_seed", DEFAULT_RND_SET) + + # Run simulation + return single_run(exp, rep=rep_seed, rc_period=run_length) + diff --git a/mcpsim/example_models/callcentre/resources/parameter_prompt.txt b/mcpsim/example_models/callcentre/resources/parameter_prompt.txt new file mode 100644 index 0000000..bcfe6d5 --- /dev/null +++ b/mcpsim/example_models/callcentre/resources/parameter_prompt.txt @@ -0,0 +1,45 @@ +You are a discrete-event simulation assistant. +You help turn natural language into structured simulation parameters. + +Here is the simulation configuration schema: +{schema} + +User request: {user_input} + +{validation_feedback} + +IMPORTANT: +- Return ONLY the parameter VALUES, not the schema structure +- Use the exact parameter names from the schema +- Output a simple JSON object with key-value pairs +- Do NOT include type information, descriptions, or constraints +- CALCULATE all mathematical expressions and provide ONLY the final numerical result +- Do NOT show calculations like "0.6 * 0.95" - provide the computed value like 0.57 +- All values must be final computed numbers, not mathematical expressions + +Example output format: +{{ + "n_operators": 15, + "n_nurses": 8, + "mean_iat": 0.57, + "random_seed": 42, + "run_length": 1000 +}} + +WRONG format (do not do this): +{{ + "mean_iat": "0.6 * 0.95" +}} + +CORRECT format: +{{ + "mean_iat": 0.57 +}} + + +Before providing the JSON: +1. Mentally calculate any mathematical expressions +2. Convert all calculations to final numerical values +3. Then output only the JSON with computed numbers + +JSON Response: diff --git a/mcpsim/example_models/callcentre/resources/schema.json b/mcpsim/example_models/callcentre/resources/schema.json new file mode 100644 index 0000000..66d699d --- /dev/null +++ b/mcpsim/example_models/callcentre/resources/schema.json @@ -0,0 +1,79 @@ +{ + "n_operators": { + "type": "int", + "default": 13, + "minimum": 1, + "maximum": 100, + "description": "Number of call operators (1-100)" + }, + "n_nurses": { + "type": "int", + "default": 10, + "minimum": 1, + "maximum": 50, + "description": "Number of nurses handling callbacks (1-50)" + }, + "mean_iat": { + "type": "float", + "default": 0.6, + "minimum": 0.1, + "maximum": 10.0, + "description": "Mean time between arrivals (minutes) or inter-arrival time. 0.1 = 10 calls/min, 10.0 = 1 call/10min. An increase in demand = a reduction in the inter-arrival time. 
A reduction in demand = an increase in inter-arrival time" + }, + "call_low": { + "type": "float", + "default": 5.0, + "minimum": 1.0, + "maximum": 30.0, + "description": "Low bound for call duration (minutes)" + }, + "call_mode": { + "type": "float", + "default": 7.0, + "minimum": 1.0, + "maximum": 30.0, + "description": "Mode for triangular call distribution (minutes)" + }, + "call_high": { + "type": "float", + "default": 10.0, + "minimum": 1.0, + "maximum": 30.0, + "description": "High bound for call duration (minutes)" + }, + "callback_prob": { + "type": "float", + "default": 0.4, + "minimum": 0.0, + "maximum": 1.0, + "description": "Nurse callback probability (0-1)" + }, + "nurse_consult_low": { + "type": "float", + "default": 10.0, + "minimum": 5.0, + "maximum": 60.0, + "description": "Low bound for nurse consult duration (minutes)" + }, + "nurse_consult_high": { + "type": "float", + "default": 20.0, + "minimum": 5.0, + "maximum": 60.0, + "description": "High bound for nurse consult duration (minutes)" + }, + "random_seed": { + "type": "int", + "default": 0, + "minimum": 0, + "maximum": 999999, + "description": "Random number seed for reproducibility" + }, + "run_length": { + "type": "int", + "default": 1000, + "minimum": 100, + "maximum": 86400, + "description": "Simulation runtime (minutes)" + } +} From 0932d4210183cf9513eefdb371acf1b5c2c81a3c Mon Sep 17 00:00:00 2001 From: TomMonks Date: Fri, 1 Aug 2025 16:30:30 +0100 Subject: [PATCH 04/23] feat(server): added generic sim mcp server code --- mcpsim/server.py | 145 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 145 insertions(+) create mode 100644 mcpsim/server.py diff --git a/mcpsim/server.py b/mcpsim/server.py new file mode 100644 index 0000000..7984f62 --- /dev/null +++ b/mcpsim/server.py @@ -0,0 +1,145 @@ +from fastmcp import FastMCP +from langchain_core.prompts import PromptTemplate +from fastmcp.prompts.prompt import PromptMessage, TextContent + +from mcpsim.protocol import SimulationModelAdapter + +class SimulationMCPServer: + """MCP server that can work with any SimulationModelAdapter implementation.""" + + def __init__(self, simulation_model: SimulationModelAdapter, server_name: str = None): + # validate interface + if not isinstance(simulation_model, SimulationModelAdapter): + missing_methods = [] + for method in ['run_simulation', 'get_parameter_schema', 'get_model_description', + 'validate_parameters']: + if not hasattr(simulation_model, method): + missing_methods.append(method) + if not hasattr(simulation_model, 'model_name'): + missing_methods.append('model_name (property)') + + raise TypeError(f"Object {type(simulation_model)} missing required methods: {missing_methods}") + + self.model = simulation_model + server_name = server_name or f"{simulation_model.model_name} Simulation MCP Server" + self.mcp = FastMCP(server_name) + self._register_tools() + self._register_resources() + self._register_prompts() + + def _register_tools(self): + """Register MCP tools that delegate to the simulation model.""" + + # note investigate how to clean up description + @self.mcp.tool( + name=f"run_{self.model.model_name}_simulation", + description=f""" + Runs a discrete-event {self.model.model_name} simulation with specified parameters, + returning performance metrics. + + Inputs: parameters (dict) — JSON object matching the experiment schema. + Returns: dict with simulation metrics, such as mean wait times and resource utilizations. 
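+
+ Example input (illustrative; omitted keys fall back to the model defaults):
+ {{"n_operators": 14, "mean_iat": 0.57, "run_length": 1000}}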
+ + Tags: ["simulation", "{self.model.model_name}", "experiment"] + """ + ) + def run_simulation(parameters: dict) -> dict: + return self.model.run_simulation(parameters) + + @self.mcp.tool( + name="validate_simulation_parameters", + description=""" + Validate a proposed set of simulation parameters (JSON object) + against the experiment schema. + + Inputs: parameters (dict) + Returns: {"is_valid": bool, "errors": [str, ...]} — status and explanation. + + Tags: ["validation", "parameter_check", "pre_run_check", "schema"] + """ + ) + def validate_parameters(parameters: dict) -> dict: + return self.model.validate_parameters(parameters) + + def _register_resources(self): + """Register MCP resources that delegate to the simulation model.""" + + @self.mcp.resource( + uri="resource://schema/experiment_parameters", + description=""" + Returns the JSON schema defining all allowed input parameters, parameter types, + and value constraints. + + Outputs: dict (JSON schema), sent as a JSON object. + + Tags: ["schema", "parameters", "template"] + """ + ) + def get_schema() -> dict: + return self.model.get_parameter_schema() + + @self.mcp.resource( + uri="resource://model/description", + description=f""" + Provides a natural language description of + the {self.model.model_name} simulation model. + + Outputs: str (text description). + + Tags: ["model", "description", "documentation"] + """ + ) + def get_description() -> str: + return self.model.get_model_description() + + def _register_prompts(self): + """Register MCP prompts for parameter conversion.""" + + @self.mcp.prompt( + name="parameter_jsonification_prompt", + description=""" + INSTRUCTION TO LLM: Convert a user's freeform simulation request + into a JSON object matching a given schema. + + Inputs: + - schema (str): JSON Schema as a string + - user_input (str): User's natural language request + + Returns: PromptMessage (LLM input) guiding the agent to produce valid JSON parameters. + + Tags: ["jsonification", "schema_mapping", "prompt", "parameters"] + """ + ) + def parameter_jsonification_prompt( + schema: str, + user_input: str, + validation_errors: str = "" + ) -> PromptMessage: + with open("resources/parameter_prompt.txt", encoding="utf-8") as f: + prompt_template_text = f.read() + prompt = PromptTemplate.from_template(prompt_template_text) + + # Handle validation error feedback + if validation_errors and validation_errors.strip(): + validation_feedback = ( + "**Validation Feedback:**\n" + "Your last attempt did not pass validation for these reasons:\n" + f"{validation_errors}\n\n" + "Please address the issues above and try again." 
+ ) + else: + validation_feedback = "" + + filled_prompt = prompt.format( + schema=schema, + user_input=user_input, + validation_feedback=validation_feedback + ) + return PromptMessage( + role="user", + content=TextContent(type="text", text=filled_prompt) + ) + + def run(self, **kwargs): + """Start the MCP server.""" + self.mcp.run(**kwargs) \ No newline at end of file From 8039f93e6238f9d043b062f9a53d976407153c1e Mon Sep 17 00:00:00 2001 From: TomMonks Date: Fri, 1 Aug 2025 16:31:35 +0100 Subject: [PATCH 05/23] clean(self_ref): removed commented out models --- agent_self_reflection.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/agent_self_reflection.py b/agent_self_reflection.py index bcb7b50..11c869f 100644 --- a/agent_self_reflection.py +++ b/agent_self_reflection.py @@ -445,20 +445,6 @@ def parse_arguments(): if __name__ == "__main__": - # TM notes: mistral:7b can fail and successed with reflection (or 2nd go) - # qwen2:math:7b is not suitable - # gemma3:27b both variants work well - # deepseek-r1:32b is too slow on 4080 - - #model_name = "gemma3n:e4b" - #model_name = "deepseek-r1:32b" - #model_name = "llama3:latest" - #model_name = "llama3.1:8b" - #model_name = "gemma3:27b" - # model_name = "gemma3:27b-it-qat" - #model_name = "qwen2-math:7b" - #model_name = "mistral:7b" - # Parse command line arguments args = parse_arguments() From 8b69a76336d224a4c2a85412aafa7b321e9a8b10 Mon Sep 17 00:00:00 2001 From: TomMonks Date: Fri, 1 Aug 2025 16:41:24 +0100 Subject: [PATCH 06/23] feat(call_centre): renamed adapter class --- mcpsim/example_models/callcentre/call_centre_mcp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mcpsim/example_models/callcentre/call_centre_mcp.py b/mcpsim/example_models/callcentre/call_centre_mcp.py index 25c0fd8..cbbb9ec 100644 --- a/mcpsim/example_models/callcentre/call_centre_mcp.py +++ b/mcpsim/example_models/callcentre/call_centre_mcp.py @@ -3,7 +3,7 @@ from typing import Dict, Any -class CallCentreSimulation: +class CallCentreSimulationAdapter: """Call centre simulation implementation following the SimulationModelAdapter protocol.""" From 783e5161d77e84170df07a7da2eb4c20727aef34 Mon Sep 17 00:00:00 2001 From: TomMonks Date: Fri, 1 Aug 2025 17:08:39 +0100 Subject: [PATCH 07/23] feat(mcp): setup script to run new framework --- mcpsim/__init__.py | 4 +++- mcpsim/example_models/__init__.py | 3 +++ mcpsim/example_models/callcentre/__init__.py | 2 ++ .../example_models/callcentre/call_centre_mcp.py | 2 +- mcpsim/server.py | 2 +- run_call_centre_server.py | 15 +++++++++++++++ 6 files changed, 25 insertions(+), 3 deletions(-) create mode 100644 run_call_centre_server.py diff --git a/mcpsim/__init__.py b/mcpsim/__init__.py index 66e77de..462e805 100644 --- a/mcpsim/__init__.py +++ b/mcpsim/__init__.py @@ -2,4 +2,6 @@ """ __version__ = "0.1.0" -__author__ = "Tom Monks, Alison Harper" \ No newline at end of file +__author__ = "Tom Monks, Alison Harper" + +from .example_models import CallCentreSimulationAdapter diff --git a/mcpsim/example_models/__init__.py b/mcpsim/example_models/__init__.py index e69de29..f09bbcf 100644 --- a/mcpsim/example_models/__init__.py +++ b/mcpsim/example_models/__init__.py @@ -0,0 +1,3 @@ +# example_models.__init__ +from .callcentre import CallCentreSimulationAdapter + diff --git a/mcpsim/example_models/callcentre/__init__.py b/mcpsim/example_models/callcentre/__init__.py index e69de29..5529771 100644 --- a/mcpsim/example_models/callcentre/__init__.py +++ b/mcpsim/example_models/callcentre/__init__.py @@ 
-0,0 +1,2 @@ +# callcentre.__init__ +from .call_centre_mcp import CallCentreSimulationAdapter \ No newline at end of file diff --git a/mcpsim/example_models/callcentre/call_centre_mcp.py b/mcpsim/example_models/callcentre/call_centre_mcp.py index cbbb9ec..07761c0 100644 --- a/mcpsim/example_models/callcentre/call_centre_mcp.py +++ b/mcpsim/example_models/callcentre/call_centre_mcp.py @@ -13,7 +13,7 @@ def __init__(self, schema_path: str = "resources/schema.json"): @property def model_name(self) -> str: - return "urgent_care_call_centre" + return "call_centre" def run_simulation(self, parameters: Dict[str, Any]) -> Dict[str, Any]: """Run the discrete-event healthcare call centre simulation.""" diff --git a/mcpsim/server.py b/mcpsim/server.py index 7984f62..8f731ce 100644 --- a/mcpsim/server.py +++ b/mcpsim/server.py @@ -43,7 +43,7 @@ def _register_tools(self): Tags: ["simulation", "{self.model.model_name}", "experiment"] """ ) - def run_simulation(parameters: dict) -> dict: + def run_call_centre_simulation(parameters: dict) -> dict: return self.model.run_simulation(parameters) @self.mcp.tool( diff --git a/run_call_centre_server.py b/run_call_centre_server.py new file mode 100644 index 0000000..7330373 --- /dev/null +++ b/run_call_centre_server.py @@ -0,0 +1,15 @@ +from mcpsim.example_models import CallCentreSimulationAdapter +from mcpsim.server import SimulationMCPServer + +def main(): + # Create the simulation model + call_centre_sim = CallCentreSimulationAdapter() + + # Create the MCP server + server = SimulationMCPServer(call_centre_sim) + + # Run the server + server.run(transport="http", host="127.0.0.1", port=8001, path="/mcp") + +if __name__ == "__main__": + main() \ No newline at end of file From 87a17d07c768cc66cb22761d9296cffdf963cc67 Mon Sep 17 00:00:00 2001 From: TomMonks Date: Thu, 7 Aug 2025 15:47:49 +0100 Subject: [PATCH 08/23] fix(planner): memory stores schema under get_schema --- agent_planning_workflow.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/agent_planning_workflow.py b/agent_planning_workflow.py index 05d2e27..2ae1941 100644 --- a/agent_planning_workflow.py +++ b/agent_planning_workflow.py @@ -438,11 +438,16 @@ async def run_plan( prompt = prompt_map[step_name] # Most prompts need schema, user input - schema = memory.get("get_experiment_parameter_schema") + schema = memory.get("get_schema") result = await client.get_prompt( prompt.name, {"schema": schema, "user_input": user_input} ) llm_prompt_text = result.messages[0].content.text + + if debug_mode: + print(f"🐛 {llm_prompt_text}") + + + # Actually run LLM (synchronously!) llm_result = llm.invoke(llm_prompt_text) parameters = json.loads(clean_llm_response(llm_result)) @@ -539,7 +544,7 @@ async def main( logger.debug(f"Response length: {len(response)} characters") logger.debug("Response preview:") logger.debug( - response[:500] + "..." if len(response) > 500 else response + response[:1000] + "..." if len(response) > 1000 else response ) debug_print_plan(plan_steps, logger) @@ -551,7 +556,7 @@ async def main( transient=True, # Removes progress bar after completion ) as progress: task = progress.add_task("executing", total=None) - memory = await run_plan(plan_steps, features, llm, user_input) + memory = await run_plan(plan_steps, features, llm, user_input, debug_mode) progress.remove_task(task) # 4. 
Results From 1f119cca889ff827246a5fb6176d1664b9786197 Mon Sep 17 00:00:00 2001 From: TomMonks Date: Thu, 7 Aug 2025 16:00:05 +0100 Subject: [PATCH 09/23] feat(planner): shorten plan preview to 500 charac --- agent_planning_workflow.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/agent_planning_workflow.py b/agent_planning_workflow.py index 2ae1941..f6ca4b6 100644 --- a/agent_planning_workflow.py +++ b/agent_planning_workflow.py @@ -444,11 +444,7 @@ async def run_plan( ) llm_prompt_text = result.messages[0].content.text - if debug_mode: - print(f"🐛 {llm_prompt_text}") - - - # Actually run LLM (synchronously!) + # run the prompt llm_result = llm.invoke(llm_prompt_text) parameters = json.loads(clean_llm_response(llm_result)) memory["parameters"] = parameters @@ -544,7 +540,7 @@ async def main( logger.debug(f"Response length: {len(response)} characters") logger.debug("Response preview:") logger.debug( - response[:1000] + "..." if len(response) > 1000 else response + response[:500] + "..." if len(response) > 500 else response ) debug_print_plan(plan_steps, logger) From 66bfbd47e8eb3dba005e13eacdccba415288aa47 Mon Sep 17 00:00:00 2001 From: TomMonks Date: Thu, 7 Aug 2025 16:01:54 +0100 Subject: [PATCH 10/23] chore(clean): removed original mcp and model modules --- mcp_server.py | 229 ------------------------------- model.py | 374 -------------------------------------------------- 2 files changed, 603 deletions(-) delete mode 100644 mcp_server.py delete mode 100644 model.py diff --git a/mcp_server.py b/mcp_server.py deleted file mode 100644 index 50d47ce..0000000 --- a/mcp_server.py +++ /dev/null @@ -1,229 +0,0 @@ -""" -Call Centre Simulation MCP Server -================================= - -This module implements the Model Content Protocol (MCP) server for a discrete-event -simulation (DES) model of a healthcare call centre. The server exposes executable -simulation tools, parameter validation utilities, schema/documentation resources, -and prompting interfaces designed for integration with language model (LLM) agents. - -Main Features -------------- -- Run call centre simulations with configurable staffing and demand parameters. -- Validate proposed simulation parameters against a machine-readable JSON schema. -- Provide self-describing resources (schema, natural language model documentation). -- Generate LLM prompts to map user requests onto structured simulation parameter sets. - -Accessible Functionalities --------------------------- -- `run_call_centre_simulation`: Simulate the healthcare call centre and return metrics. -- `validate_simulation_parameters`: Pre-validate parameter sets to prevent runtime errors. -- `get_experiment_parameter_schema`: Obtain the schema for all simulation parameters. -- `get_model_description`: Retrieve a human-oriented description of the simulation model. -- `parameter_jsonification_prompt`: Produce a prompt for LLMs to convert text requests into structured parameters. - -Intended Usage --------------- -This server is intended to be used with agentic or LLM-driven client interfaces, -providing a robust, schema-driven, and discoverable API for advanced simulation reasoning, -configuration generation, and automated experimentation. 
- -""" - -from fastmcp import FastMCP -from langchain_core.prompts import PromptTemplate -from fastmcp.prompts.prompt import PromptMessage, TextContent -from model import run_simulation_from_dict -import json -import re - -mcp = FastMCP("Call Centre Simulation MCP Server") - -@mcp.tool( - name="run_call_centre_simulation", - description=""" -Runs a discrete-event healthcare call centre simulation with specified parameters, returning performance metrics. - -Inputs: parameters (dict) — JSON object matching the experiment schema. -Example: {"operators": 12, "nurses": 5, "call_arrival_rate": 120} - -Returns: dict with simulation metrics, such as mean wait times and resource utilizations. - -Tags: ["simulation", "call_centre", "experiment"] -""") -def run_call_centre_simulation(parameters: dict) -> dict: - """ - Run a discrete-event healthcare call centre simulation. - - Parameters - ---------- - parameters : dict - Simulation configuration as a JSON object matching the experiment parameter schema. - Example: - {"operators": 14, "nurses": 4, "call_arrival_rate": 120} - - Returns - ------- - dict - Simulation results, such as: - { - "mean_wait_time": float, - "operator_utilization": float, - "callback_rate": float, - ... - } - """ - return run_simulation_from_dict(parameters) - -@mcp.resource( - uri="resource://schema/experiment_parameters", - description=""" -Returns the JSON schema defining all allowed input parameters, parameter types, and value constraints. - -Outputs: dict (JSON schema), sent as a JSON object. - -Tags: ["schema", "parameters", "template"] -""") -def get_experiment_parameter_schema() -> dict: - """ - Retrieve the experiment parameter JSON schema. - - Returns - ------- - dict - JSON schema describing all allowable simulation parameters, accepted types, and value constraints. - Example: - { - "operators": {"type": "int", "minimum": 1, "maximum": 50}, - ... - } - """ - with open("resources/schema.json") as f: - return json.load(f) - -@mcp.resource( - uri="resource://model/description", - description=""" -Provides a natural language description of the healthcare call centre simulation model. - -Outputs: str (text description). - -Tags: ["model", "description", "documentation"] -""") -def get_model_description() -> str: - """ - Get a natural language description of the call centre simulation model. - - Returns - ------- - str - Human-readable description of the model's structure, main features, and configurable aspects. - """ - return ( - "This is a discrete-event simulation of a healthcare call centre. " - "Patients call in, interact with operators, and a subset may require a nurse callback. " - "Simulation components: SimPy queues and resources. Tracks wait times, utilization, and callback rates. " - "Configurable parameters: number of operators and nurses, call durations and rates, etc. " - "Sample: 'Run with 14 operators and 5% higher demand.'" - ) - -@mcp.prompt( - name="parameter_jsonification_prompt", - description=""" -INSTRUCTION TO LLM: Convert a user's freeform simulation request into a JSON object matching a given schema. - -Inputs: -- schema (str): JSON Schema as a string -- user_input (str): User's natural language request - -Returns: PromptMessage (LLM input) guiding the agent to produce valid JSON parameters. 
- -Tags: ["jsonification", "schema_mapping", "prompt", "parameters"] -""") -def parameter_jsonification_prompt( - schema: str, - user_input: str, - validation_errors: str = "" -) -> PromptMessage: - with open("resources/parameter_prompt.txt", encoding="utf-8") as f: - prompt_template_text = f.read() - prompt = PromptTemplate.from_template(prompt_template_text) - - # manage validation error str if not "" - if validation_errors and validation_errors.strip(): - validation_feedback = ( - "**Validation Feedback:**\n" - "Your last attempt did not pass validation for these reasons:\n" - f"{validation_errors}\n\n" - "Please address the issues above and try again." - ) - else: - validation_feedback = "" - - filled_prompt = prompt.format( - schema=schema, - user_input=user_input, - validation_feedback=validation_feedback - ) - return PromptMessage( - role="user", - content=TextContent(type="text", text=filled_prompt) - ) - - -@mcp.tool( - name="validate_simulation_parameters", - description=""" -Validate a proposed set of simulation parameters (JSON object) against the experiment schema. - -Inputs: parameters (dict) -Returns: {"is_valid": bool, "errors": [str, ...]} — status and explanation. - -Tags: ["validation", "parameter_check", "pre_run_check", "schema"] -""") -def validate_simulation_parameters(parameters: dict) -> dict: - """ - Validate simulation parameters against the experiment schema. - - Parameters - ---------- - parameters : dict - Proposed parameter set for simulation, as a JSON-compatible dictionary. - - Returns - ------- - dict - { - "is_valid": bool, - "errors": list of str - } - is_valid: True if all parameters are recognized and within allowed ranges. - errors: Descriptive messages for each invalid parameter or failed interparameter constraint. - """ - with open("resources/schema.json") as f: - schema = json.load(f) - errors = [] - for key, value in parameters.items(): - # catch hallucinated parameters - if key not in schema: - errors.append(f"Unknown parameter: {key}") - continue - spec = schema[key] - expected_type = int if spec["type"] == "int" else float - if not isinstance(value, expected_type): - errors.append(f"{key} must be {spec['type']}") - continue - if "minimum" in spec and value < spec["minimum"]: - errors.append(f"{key} below minimum {spec['minimum']}") - if "maximum" in spec and value > spec["maximum"]: - errors.append(f"{key} above maximum {spec['maximum']}") - if all(x in parameters for x in ("call_low", "call_mode", "call_high")): - if not (parameters["call_low"] <= parameters["call_mode"] <= parameters["call_high"]): - errors.append("call_low ≤ call_mode ≤ call_high violated") - if all(x in parameters for x in ("nurse_consult_low", "nurse_consult_high")): - if not (parameters["nurse_consult_low"] <= parameters["nurse_consult_high"]): - errors.append("nurse_consult_low ≤ nurse_consult_high violated") - return {"is_valid": len(errors) == 0, "errors": errors} - -if __name__ == "__main__": - mcp.run(transport="http", host="127.0.0.1", port=8001, path="/mcp") diff --git a/model.py b/model.py deleted file mode 100644 index ede7143..0000000 --- a/model.py +++ /dev/null @@ -1,374 +0,0 @@ - -""" -Enhanced Call Centre Simulation Model with Nurse Callbacks - -A discrete event simulation model of a call centre using SimPy. -Extended to include nurse callbacks for 40% of patients. 
- -Author: Enhanced from Tom Monks' original model -""" - -import numpy as np -import pandas as pd -import simpy -import itertools -import matplotlib.pyplot as plt -import seaborn as sns -import inspect - -# ============================================================================= -# CONSTANTS AND DEFAULT VALUES -# ============================================================================= - -# Default resources -N_OPERATORS = 13 -N_NURSES = 10 - -# Default mean inter-arrival time (exp) -MEAN_IAT = 60 / 100 - -# Default service time parameters (triangular) -CALL_LOW = 5.0 -CALL_MODE = 7.0 -CALL_HIGH = 10.0 - -# Nurse callback parameters -CALLBACK_PROBABILITY = 0.4 # 40% of patients need nurse callback -NURSE_CONSULT_LOW = 10.0 # Uniform distribution 10-20 minutes -NURSE_CONSULT_HIGH = 20.0 - -# Sampling settings -N_STREAMS = 4 # Increased for additional distributions -DEFAULT_RND_SET = 0 - -# Boolean switch to display simulation results as the model runs -TRACE = False - -# Run variables -RESULTS_COLLECTION_PERIOD = 1000 - -# ============================================================================= -# DISTRIBUTION CLASSES -# ============================================================================= - -class Triangular: - """ - Convenience class for the triangular distribution. - Packages up distribution parameters, seed and random generator. - """ - def __init__(self, low, mode, high, random_seed=None): - self.rand = np.random.default_rng(seed=random_seed) - self.low = low - self.high = high - self.mode = mode - - def sample(self, size=None): - return self.rand.triangular(self.low, self.mode, self.high, size=size) - -class Exponential: - """ - Convenience class for the exponential distribution. - Packages up distribution parameters, seed and random generator. - """ - def __init__(self, mean, random_seed=None): - self.rand = np.random.default_rng(seed=random_seed) - self.mean = mean - - def sample(self, size=None): - return self.rand.exponential(self.mean, size=size) - -class Uniform: - """ - Convenience class for the uniform distribution. - Packages up distribution parameters, seed and random generator. - """ - def __init__(self, low, high, random_seed=None): - self.rand = np.random.default_rng(seed=random_seed) - self.low = low - self.high = high - - def sample(self, size=None): - return self.rand.uniform(self.low, self.high, size=size) - -class Bernoulli: - """ - Convenience class for the Bernoulli distribution. - Used for callback decision (40% probability). - """ - def __init__(self, p, random_seed=None): - self.rand = np.random.default_rng(seed=random_seed) - self.p = p - - def sample(self, size=None): - return self.rand.binomial(1, self.p, size=size) - -# ============================================================================= -# EXPERIMENT CLASS -# ============================================================================= - -class Experiment: - """ - Enhanced experiment class with nurse callback functionality. 
- """ - def __init__( - self, - random_number_set=DEFAULT_RND_SET, - n_operators=N_OPERATORS, - n_nurses=N_NURSES, - mean_iat=MEAN_IAT, - call_low=CALL_LOW, - call_mode=CALL_MODE, - call_high=CALL_HIGH, - callback_prob=CALLBACK_PROBABILITY, - nurse_consult_low=NURSE_CONSULT_LOW, - nurse_consult_high=NURSE_CONSULT_HIGH, - n_streams=N_STREAMS, - ): - # sampling - self.random_number_set = random_number_set - self.n_streams = n_streams - - # store parameters for the run of the model - self.n_operators = n_operators - self.n_nurses = n_nurses - self.mean_iat = mean_iat - self.call_low = call_low - self.call_mode = call_mode - self.call_high = call_high - self.callback_prob = callback_prob - self.nurse_consult_low = nurse_consult_low - self.nurse_consult_high = nurse_consult_high - - # resources: initialized after Environment is created - self.operators = None - self.nurses = None - - # initialise results and sampling - self.init_results_variables() - self.init_sampling() - - def set_random_no_set(self, random_number_set): - self.random_number_set = random_number_set - self.init_sampling() - - def init_sampling(self): - """Create the distributions used by the model""" - # produce n non-overlapping streams - seed_sequence = np.random.SeedSequence(self.random_number_set) - self.seeds = seed_sequence.spawn(self.n_streams) - - # create distributions - self.arrival_dist = Exponential(self.mean_iat, random_seed=self.seeds[0]) - self.call_dist = Triangular( - self.call_low, self.call_mode, self.call_high, random_seed=self.seeds[1] - ) - self.callback_dist = Bernoulli(self.callback_prob, random_seed=self.seeds[2]) - self.nurse_dist = Uniform( - self.nurse_consult_low, self.nurse_consult_high, random_seed=self.seeds[3] - ) - - def init_results_variables(self): - """Initialize all experiment variables used in results collection""" - self.results = {} - - # Original results - self.results["waiting_times"] = [] - self.results["total_call_duration"] = 0.0 - - # New nurse results - self.results["nurse_waiting_times"] = [] - self.results["total_nurse_duration"] = 0.0 - self.results["callbacks_requested"] = 0 - self.results["total_patients"] = 0 - -# ============================================================================= -# UTILITY FUNCTIONS -# ============================================================================= - -def trace(msg): - """Turning printing of events on and off.""" - if TRACE: - print(msg) - -# ============================================================================= -# MODEL LOGIC -# ============================================================================= - -def nurse_service(identifier, env, args): - """ - Simulates the nurse callback process - 1. request and wait for a nurse - 2. nurse consultation (uniform) - 3. 
exit system - """ - # record the time that patient entered nurse queue - start_wait = env.now - - # request a nurse - with args.nurses.request() as req: - yield req - - # record the waiting time for nurse callback - waiting_time = env.now - start_wait - args.results["nurse_waiting_times"].append(waiting_time) - - trace(f"nurse started callback for patient {identifier} at {env.now:.3f}") - - # nurse consultation time - consult_duration = args.nurse_dist.sample() - yield env.timeout(consult_duration) - - # update total nurse duration - args.results["total_nurse_duration"] += consult_duration - - trace(f"nurse callback for patient {identifier} ended at {env.now:.3f}; " - f"waiting time was {waiting_time:.3f}") - -def service(identifier, env, args): - """ - Enhanced service process with nurse callback - 1. request and wait for a call operator - 2. phone triage (triangular) - 3. determine if callback needed (40% probability) - 4. if needed, initiate nurse callback process - 5. exit system - """ - # record the time that call entered the queue - start_wait = env.now - - # request an operator - with args.operators.request() as req: - yield req - - # record the waiting time for call to be answered - waiting_time = env.now - start_wait - args.results["waiting_times"].append(waiting_time) - - trace(f"operator answered call {identifier} at {env.now:.3f}") - - # call duration - call_duration = args.call_dist.sample() - yield env.timeout(call_duration) - - # update the total call duration - args.results["total_call_duration"] += call_duration - - trace(f"call {identifier} ended {env.now:.3f}; " - f"waiting time was {waiting_time:.3f}") - - # After operator call, determine if nurse callback is needed - args.results["total_patients"] += 1 - callback_needed = args.callback_dist.sample() - - if callback_needed: - args.results["callbacks_requested"] += 1 - trace(f"patient {identifier} requires nurse callback") - - # Start nurse callback process - env.process(nurse_service(identifier, env, args)) - -def arrivals_generator(env, args): - """Generate patient arrivals with exponential inter-arrival times""" - for caller_count in itertools.count(start=1): - # sample inter-arrival time - inter_arrival_time = args.arrival_dist.sample() - yield env.timeout(inter_arrival_time) - - trace(f"call arrives at: {env.now:.3f}") - - # start service process - env.process(service(caller_count, env, args)) - -# ============================================================================= -# EXPERIMENT EXECUTION FUNCTIONS -# ============================================================================= - -def single_run(experiment, rep=0, rc_period=RESULTS_COLLECTION_PERIOD): - """Perform a single run of the enhanced model""" - run_results = {} - - # reset all result collection variables - experiment.init_results_variables() - - # set random number set - experiment.set_random_no_set(rep) - - # create environment - env = simpy.Environment() - - # create resources - experiment.operators = simpy.Resource(env, capacity=experiment.n_operators) - experiment.nurses = simpy.Resource(env, capacity=experiment.n_nurses) - - # start arrivals process - env.process(arrivals_generator(env, experiment)) - - # run simulation - env.run(until=rc_period) - - # calculate results - run_results["01_mean_waiting_time"] = np.mean(experiment.results["waiting_times"]) - run_results["02_operator_util"] = ( - experiment.results["total_call_duration"] / - (rc_period * experiment.n_operators) - ) * 100.0 - - # Calculate nurse results - if 
experiment.results["nurse_waiting_times"]: - run_results["03_mean_nurse_waiting_time"] = np.mean(experiment.results["nurse_waiting_times"]) - else: - run_results["03_mean_nurse_waiting_time"] = 0.0 - - run_results["04_nurse_util"] = ( - experiment.results["total_nurse_duration"] / - (rc_period * experiment.n_nurses) - ) * 100.0 - - # Additional metrics - run_results["05_callback_rate"] = ( - experiment.results["callbacks_requested"] / - experiment.results["total_patients"] - ) * 100.0 if experiment.results["total_patients"] > 0 else 0.0 - - return run_results - -def multiple_replications(experiment, rc_period=RESULTS_COLLECTION_PERIOD, n_reps=5): - """Perform multiple replications of the enhanced model""" - results = [single_run(experiment, rep, rc_period) for rep in range(n_reps)] - df_results = pd.DataFrame(results) - df_results.index = np.arange(1, len(df_results) + 1) - df_results.index.name = "rep" - return df_results - -def set_trace(trace_on=True): - """Turn tracing on/off globally""" - global TRACE - TRACE = trace_on - - -def run_simulation_from_dict(params: dict): - """ - Wrapper to run the simulation using params supplied in a dictionary. - Only the parameters necessary for the Experiment's constructor are passed; - defaults are used for those not provided. - """ - # Extract constructor signature of the Experiment class - signature = inspect.signature(Experiment.__init__) - - # Build kwargs dict: match keys in 'params' to Experiment.__init__ params - experiment_kwargs = { - key: params.get(key, param.default) - for key, param in signature.parameters.items() - if key != 'self' # exclude 'self' from constructor - } - - # Create Experiment instance with dynamic/default parameters - exp = Experiment(**experiment_kwargs) - - # Separate non-constructor parameters - run_length = params.get("run_length", RESULTS_COLLECTION_PERIOD) - rep_seed = params.get("random_seed", DEFAULT_RND_SET) - - # Run simulation - return single_run(exp, rep=rep_seed, rc_period=run_length) - From 68491f834c7fc068332b33e507ba6ab00848523e Mon Sep 17 00:00:00 2001 From: TomMonks Date: Thu, 7 Aug 2025 16:03:57 +0100 Subject: [PATCH 11/23] chore(clean): removed redun top level resource dir --- resources/parameter_prompt.txt | 45 ------------------- resources/schema.json | 79 ---------------------------------- 2 files changed, 124 deletions(-) delete mode 100644 resources/parameter_prompt.txt delete mode 100644 resources/schema.json diff --git a/resources/parameter_prompt.txt b/resources/parameter_prompt.txt deleted file mode 100644 index bcfe6d5..0000000 --- a/resources/parameter_prompt.txt +++ /dev/null @@ -1,45 +0,0 @@ -You are a discrete-event simulation assistant. -You help turn natural language into structured simulation parameters. 
- -Here is the simulation configuration schema: -{schema} - -User request: {user_input} - -{validation_feedback} - -IMPORTANT: -- Return ONLY the parameter VALUES, not the schema structure -- Use the exact parameter names from the schema -- Output a simple JSON object with key-value pairs -- Do NOT include type information, descriptions, or constraints -- CALCULATE all mathematical expressions and provide ONLY the final numerical result -- Do NOT show calculations like "0.6 * 0.95" - provide the computed value like 0.57 -- All values must be final computed numbers, not mathematical expressions - -Example output format: -{{ - "n_operators": 15, - "n_nurses": 8, - "mean_iat": 0.57, - "random_seed": 42, - "run_length": 1000 -}} - -WRONG format (do not do this): -{{ - "mean_iat": "0.6 * 0.95" -}} - -CORRECT format: -{{ - "mean_iat": 0.57 -}} - - -Before providing the JSON: -1. Mentally calculate any mathematical expressions -2. Convert all calculations to final numerical values -3. Then output only the JSON with computed numbers - -JSON Response: diff --git a/resources/schema.json b/resources/schema.json deleted file mode 100644 index 66d699d..0000000 --- a/resources/schema.json +++ /dev/null @@ -1,79 +0,0 @@ -{ - "n_operators": { - "type": "int", - "default": 13, - "minimum": 1, - "maximum": 100, - "description": "Number of call operators (1-100)" - }, - "n_nurses": { - "type": "int", - "default": 10, - "minimum": 1, - "maximum": 50, - "description": "Number of nurse callbacks (1-50)" - }, - "mean_iat": { - "type": "float", - "default": 0.6, - "minimum": 0.1, - "maximum": 10.0, - "description": "Mean time between arrivals (minutes) or inter-arrival time. 0.1 = 6 calls/min, 10.0 = 1 call/10min. An increase in demand = a reduction in the inter-arrival time. 
-        "description": "Mean time between arrivals (minutes) or inter-arrival time. 0.1 = 10 calls/min, 10.0 = 1 call/10min. An increase in demand = a reduction in the inter-arrival time. A reduction in demand = an increase in inter-arrival time"
-    },
-    "call_low": {
-        "type": "float",
-        "default": 5.0,
-        "minimum": 1.0,
-        "maximum": 30.0,
-        "description": "Low bound for call duration (minutes)"
-    },
-    "call_mode": {
-        "type": "float",
-        "default": 7.0,
-        "minimum": 1.0,
-        "maximum": 30.0,
-        "description": "Mode for triangular call distribution (minutes)"
-    },
-    "call_high": {
-        "type": "float",
-        "default": 10.0,
-        "minimum": 1.0,
-        "maximum": 30.0,
-        "description": "High bound for call duration (minutes)"
-    },
-    "callback_prob": {
-        "type": "float",
-        "default": 0.4,
-        "minimum": 0.0,
-        "maximum": 1.0,
-        "description": "Nurse callback probability (0-1)"
-    },
-    "nurse_consult_low": {
-        "type": "float",
-        "default": 10.0,
-        "minimum": 5.0,
-        "maximum": 60.0,
-        "description": "Low bound for nurse consult duration (minutes)"
-    },
-    "nurse_consult_high": {
-        "type": "float",
-        "default": 20.0,
-        "minimum": 5.0,
-        "maximum": 60.0,
-        "description": "High bound for nurse consult duration (minutes)"
-    },
-    "random_seed": {
-        "type": "int",
-        "default": 0,
-        "minimum": 0,
-        "maximum": 999999,
-        "description": "Random number seed for reproducibility"
-    },
-    "run_length": {
-        "type": "int",
-        "default": 1000,
-        "minimum": 100,
-        "maximum": 86400,
-        "description": "Simulation runtime (minutes)"
-    }
-}

From eeb0dc4f872e9879251b29240b141ca6c2a1062a Mon Sep 17 00:00:00 2001
From: TomMonks
Date: Thu, 7 Aug 2025 16:06:49 +0100
Subject: [PATCH 12/23] docs(readme): updated run mcp instructions

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index f28e731..543cfb0 100644
--- a/README.md
+++ b/README.md
@@ -127,7 +127,7 @@ This project was developed and tested on the following system configuration:
 1. **Start the MCP server**
 
    ```bash
-   python mcp_server.py
+   python run_call_centre_server.py
    ```
 
    Server will be available at `http://localhost:8001/mcp`

From ef412b049bbfbedc7558a342ea5550108952effd Mon Sep 17 00:00:00 2001
From: TomMonks
Date: Mon, 11 Aug 2025 15:38:24 +0100
Subject: [PATCH 13/23] docs(env): +arize-phoenix

---
 environment.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/environment.yml b/environment.yml
index 74973e4..bb0c330 100644
--- a/environment.yml
+++ b/environment.yml
@@ -2,6 +2,7 @@ name: des-agent
 channels:
   - conda-forge
 dependencies:
+  - arize-phoenix
   - fastmcp
   - pandas
  - python=3.11

From 6c43aaea0de5e53bc3b0e29ca2fd3d04e131ead4 Mon Sep 17 00:00:00 2001
From: TomMonks
Date: Mon, 11 Aug 2025 17:01:32 +0100
Subject: [PATCH 14/23] feat(evals): added basic langgraph tracing with arize-phoenix

---
 agent_self_reflection_evals.py        | 471 ++++++++++++++++++
 environment.yml                       |   3 +-
 .../callcentre/call_centre_mcp.py     |   8 +-
 .../resources/parameter_prompt.txt    |   0
 mcpsim/server.py                      |  14 +-
 mcpsim/tracing.py                     |  25 +
 6 files changed, 518 insertions(+), 3 deletions(-)
 create mode 100644 agent_self_reflection_evals.py
 rename mcpsim/{example_models/callcentre => }/resources/parameter_prompt.txt (100%)
 create mode 100644 mcpsim/tracing.py

diff --git a/agent_self_reflection_evals.py b/agent_self_reflection_evals.py
new file mode 100644
index 0000000..5d783f8
--- /dev/null
+++ b/agent_self_reflection_evals.py
@@ -0,0 +1,471 @@
+"""
+Agent Self-Reflection Simulation Parameter Generator
+
+*****************
+WITH EVALS in arize-phoenix
+*****************
+
+This module implements a simple agent that generates and validates simulation
+parameters using natural language input and self-reflection capabilities. The agent
+employs a graph-based workflow to iteratively refine parameters until they meet
+validation requirements.
+
+Overview
+--------
+The agent uses a state machine architecture built with LangGraph to orchestrate
+the parameter generation process. When validation fails, the agent reflects on
+the errors and attempts to correct them through multiple retry cycles.
+
+Core Workflow
+-------------
+1. Schema Retrieval: Fetch parameter schema from MCP server
+2. Parameter Generation: Use LLM to convert natural language to JSON parameters
+3. Validation: Validate parameters against schema constraints
+4. Self-Reflection: On validation failure, analyze errors and retry
+5. Simulation Execution: Run simulation with validated parameters
+6. Results Display: Present formatted results and parameter summaries
+
+Usage
+-----
+Run from command line with optional model specification:
+
+    $ python agent_self_reflection_evals.py --llm gemma3:27b
+    $ python agent_self_reflection_evals.py --llm mistral:7b
+
+The agent will prompt for a natural language simulation description and
+automatically handle parameter generation, validation, and execution.
+
+Hard constraints
+----------------
+MAX_RETRIES : int
+    Maximum number of parameter generation attempts before bailout (default: 4)
+
+Examples
+--------
+Natural language inputs the agent can process:
+- "Simulate 14 operators, 12 nurses and 5% extra demand"
+- "Run scenario with high staffing and normal call volume"
+- "Test configuration with minimal staff"
+
+Notes
+-----
+Requires a running MCP server on localhost:8001 and an Ollama server on localhost:11434.
+Different LLM models show varying performance - gemma3:27b and mistral:7b are
+recommended for reliable parameter generation.
+
+"""
+
+import asyncio, json, re
+from typing import Dict, Any, Optional, TypedDict
+
+from fastmcp import Client
+from langchain_ollama import OllamaLLM
+from langchain_core.prompts import PromptTemplate
+from langgraph.graph import StateGraph, END
+from rich.prompt import Prompt
+from rich.progress import Progress, SpinnerColumn, TextColumn
+from rich.console import Console
+from rich.markdown import Markdown
+
+from functools import partial
+
+import pandas as pd
+
+import argparse
+
+import phoenix as px
+from phoenix.otel import register
+
+import os
+
+from mcpsim.tracing import init_tracing
+
+console = Console()
+
+# maximum tries at jsonifying the parameter list...
+MAX_RETRIES = 4
+
+PARAMETER_TABLE_TEMPLATE = """
+Given the following JSON object representing parameters updated in a simulation model:
+
+{json_object}
+
+Task:
+Convert this JSON object into a Markdown table with two columns: **Parameter** and **Value**.
+Each key should map to its value on a separate row.
+Display only the table in clean Markdown format.
+Do NOT include any text before the table such as 'Here is the output:'
+Please add a graph emoji to the parameter column header.
+
+Output
+
+**Parameters used in simulation**
+
+[markdown table]
+"""
+
+
+class AgentState(TypedDict):
+    user_input: str
+    schema: dict
+    parameters: dict
+    formatted_parameters: str
+    validation: Optional[dict]
+    validation_history: list
+    simulation_result: Optional[dict]
+    retry_count: int
+
+
+# --------------------------- Helper funcs ---------------------------------------- #
+def clean_llm_response(response: Optional[str]) -> str:
+    """
+    Clean LLM response to extract JSON from markdown code blocks.
+
+    Large Language Models often wrap JSON responses in markdown code blocks.
+    This function removes markdown formatting and extracts the JSON content.
+
+    Parameters
+    ----------
+    response : Optional[str]
+        Raw LLM response that may contain markdown formatting
+
+    Returns
+    -------
+    str
+        Cleaned JSON string ready for parsing
+
+    Examples
+    --------
+    >>> clean_llm_response("```{'key':'value'}```")
+    "{'key':'value'}"
+
+    >>> clean_llm_response("```json{'key':'value'}```")
+    "{'key':'value'}"
+
+    >>> clean_llm_response("Some text {'key':'value'} more text")
+    "{'key':'value'}"
+
+    >>> clean_llm_response("")
+    '{}'
+    """
+    if not response:
+        return "{}"
+
+    # Remove markdown code block markers
+    response = response.strip()
+
+    # Remove ```json and ```
+    if response.startswith("```json"):
+        response = response[7:]  # Remove ```json
+    if response.startswith("```"):
+        response = response[3:]  # Remove ```
+    if response.endswith("```"):
+        response = response[:-3]  # Remove closing ```
+
+    # Find JSON object boundaries
+    response = response.strip()
+    start_idx = response.find("{")
+    end_idx = response.rfind("}")
+
+    if start_idx != -1 and end_idx != -1:
+        return response[start_idx : end_idx + 1]
+
+    return response
+
+
+
+
+# --------------------------- graph nodes ------------------------------------ #
+async def fetch_schema(state: Dict[str, Any]) -> Dict[str, Any]:
+    async with Client("http://localhost:8001/mcp") as cl:
+        # Resource URI exactly as exposed by the server
+        res = await cl.read_resource("resource://schema/experiment_parameters")
+        state["schema"] = res[0].text if hasattr(res[0], "text") else res[0]
+    return state
+
+
+async def generate_parameters(state: Dict[str, Any], llm: OllamaLLM) -> Dict[str, Any]:
+    async with Client("http://localhost:8001/mcp") as cl:
+
+        # prompt parameters
+        prompt_vars = {
+            "schema": state["schema"],
+            "user_input": state["user_input"]
+        }
+        # include validation errors if present in state memory
+        validation_errors = state.get("validation", {}).get("errors")
+        if validation_errors:
+            prompt_vars["validation_errors"] = "\n".join(f"- {e}" for e in validation_errors)
+        else:
+            prompt_vars["validation_errors"] = ""  # always supply a string
+
+        # Ask MCP for the ready-made prompt that tells an LLM how to jsonify
+        prompt_resp = await cl.get_prompt("parameter_jsonification_prompt", prompt_vars)
+
+        prompt_text = prompt_resp.messages[0].content.text
+
+        progress_text = "[bold green]🧠 Reasoning about simulation parameters."
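+        # NOTE: on a retry, the previous attempt's validation errors are already
+        # embedded in the prompt via prompt_vars["validation_errors"], giving the
+        # LLM the feedback it needs to self-correct.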
+        if state["retry_count"] > 0:
+            progress_text += f" [RETRY {state['retry_count']}]"
+        with Progress(
+            SpinnerColumn(),
+            TextColumn(progress_text),
+            transient=True,
+        ) as progress:
+            task = progress.add_task("summarising", total=None)
+            llm_out = llm.invoke(prompt_text)
+            progress.remove_task(task)
+
+        cleaned_response = clean_llm_response(llm_out)
+
+        try:
+            state["parameters"] = json.loads(cleaned_response)
+        except json.JSONDecodeError as e:
+            # Force validation failure by setting invalid parameters
+            # and create a mock validation response for self-reflection
+            state["parameters"] = {"__json_parse_error__": True}
+            console.print(f"[yellow]⚠️ JSON parsing failed ({e}) - will retry with feedback")
+        except Exception as e:
+            # Handle other unexpected errors similarly
+            state["parameters"] = {"__unexpected_error__": True}
+            console.print(f"[yellow]⚠️ Unexpected error ({e}) - will retry")
+
+    return state
+
+
+async def validate_parameters(state: Dict[str, Any]) -> Dict[str, Any]:
+    async with Client("http://localhost:8001/mcp") as cl:
+        resp = await cl.call_tool(
+            "validate_simulation_parameters",
+            {"parameters": state["parameters"]},
+        )
+
+        # Add current validation to history before overwriting
+        if "validation_history" not in state:
+            state["validation_history"] = []
+
+        # Store the validation attempt with context
+        state["validation_history"].append({
+            "attempt": state.get("retry_count", 0) + 1,
+            "parameters": state["parameters"].copy(),
+            "validation_result": resp.data.copy()
+        })
+
+        state["validation"] = resp.data
+    return state
+
+def validation_branch(state: Dict[str, Any]) -> str:
+    return "valid" if state["validation"]["is_valid"] else "invalid"
+
+def retry_branch(state: AgentState) -> str:
+    """Routing node: bail out if max retries exceeded, otherwise jsonify."""
+    if state.get("retry_count", 0) >= MAX_RETRIES:
+        return "bail_out"
+    return "jsonify"
+
+def increment_retry(state: AgentState) -> AgentState:
+    """Increment the number of attempts at jsonifying parameters"""
+    state["retry_count"] = state.get("retry_count", 0) + 1
+    report_parameter_reflection_failure(state, MAX_RETRIES)
+    return state
+
+def bail_out_node(state: AgentState) -> AgentState:
+    """Bail-out node used when the max attempts at jsonifying parameters is exceeded"""
+    state["error"] = "Maximum retries exceeded during parameter reflection."
+    return state
+
+async def run_simulation(state: Dict[str, Any]) -> Dict[str, Any]:
+    async with Client("http://localhost:8001/mcp") as cl:
+        resp = await cl.call_tool(
+            "run_call_centre_simulation",
+            {"parameters": state["parameters"]},
+        )
+        state["simulation_result"] = resp.data
+    return state
+
+
+async def summarise_parameters(state: Dict[str, Any], llm: OllamaLLM) -> Dict[str, Any]:
+    """Generates a formatted markdown table of parameters from JSON.
+    Could do this programmatically, but just for fun we will use an LLM"""
+    progress_text = "[bold blue]✏️ Summarising parameters used..."
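+    # Show a transient spinner while the LLM converts the parameter dict into
+    # a Markdown table using PARAMETER_TABLE_TEMPLATE defined above.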
+    with Progress(
+        SpinnerColumn(),
+        TextColumn(progress_text),
+        transient=True,
+    ) as progress:
+        task = progress.add_task("summarising_params", total=None)
+        prompt = PromptTemplate.from_template(PARAMETER_TABLE_TEMPLATE)
+        param_display_prompt = prompt.format(json_object=state["parameters"])
+        llm_result = llm.invoke(param_display_prompt)
+        progress.remove_task(task)
+
+    state["formatted_parameters"] = llm_result
+    return state
+
+# --------------------------- build the graph -------------------------------- #
+def build_graph(llm: OllamaLLM) -> StateGraph:
+    graph = StateGraph(AgentState)  # Using the TypedDict from above
+
+    # 1. create nodes
+    # NB: partial functions are used here to pass extra parameters into a coroutine
+    graph.add_node("get_schema", fetch_schema)
+    graph.add_node("jsonify", partial(generate_parameters, llm=llm))
+    graph.add_node("validate", validate_parameters)
+    graph.add_node("run_sim", run_simulation)
+    graph.add_node("format_params", partial(summarise_parameters, llm=llm))
+    graph.add_node("increment_retry", increment_retry)
+    graph.add_node("bail_out", bail_out_node)
+
+    # 2. create entry point and edges
+    graph.set_entry_point("get_schema")
+    graph.add_edge("get_schema", "jsonify")
+    graph.add_edge("jsonify", "validate")
+    graph.add_conditional_edges(
+        "validate",
+        validation_branch,
+        {"valid": "format_params", "invalid": "increment_retry"}
+    )
+
+    # handles a limited number of retries. links to bail_out (and END)
+    # if retries exceed a hard limit.
+    graph.add_conditional_edges(
+        "increment_retry",
+        retry_branch,
+        {"jsonify": "jsonify", "bail_out": "bail_out"}
+    )
+
+    graph.add_edge("format_params", "run_sim")
+    graph.add_edge("run_sim", END)
+    graph.add_edge("bail_out", END)
+
+    return graph.compile()
+
+def display_results_table(state: AgentState):
+    """
+    Display simulation results to the user
+    """
+    console.print(Markdown("✅ **Simulation complete.**"))
+    df = pd.DataFrame(
+        list(state["simulation_result"].items()), columns=["KPIs", "Values"]
+    )
+    console.print(Markdown(df.round(2).to_markdown(index=False)))
+
+
+def display_param_summary_table(state: AgentState):
+    """
+    Display a summary table of parameters for human-in-loop validation
+    """
+    console.print(Markdown(state["formatted_parameters"]))
+
+def report_parameter_reflection_failure(state: dict, max_retries: int):
+    """
+    Display a clear error message for parameter reflection failure
+    """
+    retry_count = state.get("retry_count", 0)
+    console.print(
+        f"[bold red]❌ Parameter validation failed after {retry_count} attempt{'s' if retry_count!=1 else ''}."
+    )
+    if "validation" in state and "errors" in state["validation"]:
+        errors = state["validation"]["errors"]
+        if errors:
+            console.print("[red]Last validation errors were:")
+            for err in errors:
+                console.print(f"[red]- {err}")
+        else:
+            console.print("[red]No specific validation errors were provided by the server.")
+    else:
+        console.print("[red]No validation error details are available.")
+    console.print(f"[yellow]The agent was unable to generate valid simulation parameters in {retry_count} tries (limit: {max_retries}).")
+    console.print("[yellow]Try rephrasing your request or ensure parameter values are within allowed ranges. Refer to the simulation parameter schema for guidance.")
+
+
+def display_validation_history(state: AgentState):
+    """Display the history of validation attempts"""
+    history = state.get("validation_history", [])
+
+    if not history:
+        return
+
+    console.print(Markdown("🔍 **Parameter Generation Issues**"))
+
+    for entry in history:
+        attempt_num = entry["attempt"]
+        is_valid = entry["validation_result"]["is_valid"]
+        errors = entry["validation_result"]["errors"]
+
+        status = "✅ Success" if is_valid else "❌ Failed"
+        console.print(f"\n**Attempt {attempt_num}:** {status}")
+
+        if errors:
+            console.print("Errors encountered:")
+            for error in errors:
+                console.print(f"{error}")
+
+
+
+async def main(model_name: str) -> None:
+
+    # 0. setup eval server
+    tracer = init_tracing()
+
+    # 1. Setup the graph and LLM
+    llm = OllamaLLM(model=model_name, base_url="http://localhost:11434")
+    compiled_graph = build_graph(llm)
+
+    # 2. Prompt user
+    user_request = Prompt.ask(
+        "Simulation request",
+        default="Simulate 14 operators, 12 nurses and 5% extra demand"
+    )
+
+    # 3. Invoke graph
+    final_state = await compiled_graph.ainvoke({
+        "user_input": user_request,
+        "retry_count": 0,
+        "validation_history": []
+    })
+
+    # 4. Report results
+    console.rule("[bold green]RESULTS")
+    if "simulation_result" in final_state:
+        display_param_summary_table(final_state)
+        display_results_table(final_state)
+
+    retry_count = final_state.get("retry_count", 0)
+    if retry_count > 0:
+        display_validation_history(final_state)
+
+def parse_arguments():
+    """Parse command line arguments for model selection."""
+    parser = argparse.ArgumentParser(
+        description="Simulation Agent Workflow - Experiment with a simulation model using natural language",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+        Examples:
+            python agent_self_reflection_evals.py -l gemma3:27b
+            python agent_self_reflection_evals.py -l mistral:7b
+        """,
+    )
+
+    parser.add_argument(
+        "-l",
+        "--llm",
+        type=str,
+        default="gemma3:27b",
+        help="Model to use for generating parameters (default: gemma3:27b)",
+    )
+
+    return parser.parse_args()
+
+
+
+
+if __name__ == "__main__":
+
+    # Parse command line arguments
+    args = parse_arguments()
+
+    asyncio.run(main(model_name=args.llm))
+
+
+
diff --git a/environment.yml b/environment.yml
index bb0c330..e974ae9 100644
--- a/environment.yml
+++ b/environment.yml
@@ -2,7 +2,6 @@ name: des-agent
 channels:
   - conda-forge
 dependencies:
-  - arize-phoenix
   - fastmcp
   - pandas
   - python=3.11
@@ -15,7 +14,9 @@ dependencies:
   - tabulate
   - langgraph
   - pip:
+    - arize-phoenix-otel
     - langchain
     - langchain-community
     - langchain-core
     - langchain-ollama
+    - openinference-instrumentation-langchain
diff --git a/mcpsim/example_models/callcentre/call_centre_mcp.py b/mcpsim/example_models/callcentre/call_centre_mcp.py
index 07761c0..158f10f 100644
--- a/mcpsim/example_models/callcentre/call_centre_mcp.py
+++ b/mcpsim/example_models/callcentre/call_centre_mcp.py
@@ -1,13 +1,19 @@
 import json
-from model import run_simulation_from_dict
+from .call_centre_model import run_simulation_from_dict
 from typing import Dict, Any
+import os
 
 
 class CallCentreSimulationAdapter:
     """Call centre simulation implementation following
     the SimulationModelAdapter protocol."""
 
     def __init__(self, schema_path: str = "resources/schema.json"):
+
+        # handle path to schema file
+        dir_path = os.path.dirname(os.path.realpath(__file__))
+        schema_path = os.path.join(dir_path, schema_path)
+
         self.schema_path = schema_path
         self._schema = None
 
diff --git a/mcpsim/example_models/callcentre/resources/parameter_prompt.txt b/mcpsim/resources/parameter_prompt.txt
similarity index 100%
rename from mcpsim/example_models/callcentre/resources/parameter_prompt.txt
rename to mcpsim/resources/parameter_prompt.txt
diff --git a/mcpsim/server.py b/mcpsim/server.py
index 8f731ce..88133ea 100644
--- a/mcpsim/server.py
+++ b/mcpsim/server.py
@@ -1,9 +1,16 @@
+import os
+
 from fastmcp import FastMCP
 from langchain_core.prompts import PromptTemplate
 from fastmcp.prompts.prompt import PromptMessage, TextContent
 
 from mcpsim.protocol import SimulationModelAdapter
 
+PARAMETER_PROMPT_PATH = "resources/parameter_prompt.txt"
+
+from mcpsim.tracing import init_tracing  # tracing helper module
+tracer_provider = init_tracing(project_name="sim-agent-mcp-server")
+
 class SimulationMCPServer:
     """MCP server that can work with any SimulationModelAdapter implementation."""
@@ -115,7 +122,12 @@ def parameter_jsonification_prompt(
         user_input: str,
         validation_errors: str = ""
     ) -> PromptMessage:
-        with open("resources/parameter_prompt.txt", encoding="utf-8") as f:
+
+        # handle path to prompt file
+        dir_path = os.path.dirname(os.path.realpath(__file__))
+        parameter_prompt_path = os.path.join(dir_path, PARAMETER_PROMPT_PATH)
+
+        with open(parameter_prompt_path, encoding="utf-8") as f:
             prompt_template_text = f.read()
 
         prompt = PromptTemplate.from_template(prompt_template_text)
diff --git a/mcpsim/tracing.py b/mcpsim/tracing.py
new file mode 100644
index 0000000..940b82f
--- /dev/null
+++ b/mcpsim/tracing.py
@@ -0,0 +1,25 @@
+"""
+Utility module to support tracing of the agent and tools using arize-phoenix
+"""
+
+import os
+from phoenix.otel import register
+
+def init_tracing(project_name: str = "sim-agent", endpoint: str = "http://localhost:6006"):
+    """
+    Initialise tracing
+
+    Parameters
+    ----------
+    project_name: str, optional (default="sim-agent")
+        Name of project
+
+    endpoint: str, optional (default = "http://localhost:6006")
+        URL of the Phoenix collector endpoint.
+
+    Returns
+    -------
+    TracerProvider
+        Configured tracer provider registered with Phoenix.
+
+    """
+    os.environ["PHOENIX_COLLECTOR_ENDPOINT"] = endpoint
+    return register(project_name=project_name, auto_instrument=True)

From 01762187e9929ac15310c361be3bfe8fca819d6c Mon Sep 17 00:00:00 2001
From: TomMonks
Date: Tue, 12 Aug 2025 11:05:19 +0100
Subject: [PATCH 15/23] feat(evals): draft tracing

---
 agent_self_reflection_evals.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/agent_self_reflection_evals.py b/agent_self_reflection_evals.py
index 5d783f8..480f848 100644
--- a/agent_self_reflection_evals.py
+++ b/agent_self_reflection_evals.py
@@ -76,6 +76,10 @@
 import phoenix as px
 from phoenix.otel import register
 
+# used to annotate functions when tracing
+from opentelemetry import trace
+from opentelemetry.trace import Status, StatusCode
+
 import os
 
 from mcpsim.tracing import init_tracing
@@ -234,6 +238,9 @@ async def generate_parameters(state: Dict[str, Any], llm: OllamaLLM) -> Dict[str
 async def validate_parameters(state: Dict[str, Any]) -> Dict[str, Any]:
+
+
+
     async with Client("http://localhost:8001/mcp") as cl:
         resp = await cl.call_tool(
             "validate_simulation_parameters",
             {"parameters": state["parameters"]},
         )
@@ -251,7 +258,9 @@ async def validate_parameters(state: Dict[str, Any]) -> Dict[str, Any]:
             "validation_result": resp.data.copy()
         })
 
-        state["validation"] = resp.data
+        result = resp.data
+        state["validation"] = result
+
     return state
 
 def validation_branch(state: Dict[str, Any]) -> str:
@@ -292,7 +295,6 @@ async def run_simulation(state: Dict[str, Any]) -> Dict[str, Any]:
         state["simulation_result"] = resp.data
         return state
 
-
 async def summarise_parameters(state: Dict[str, Any], llm: OllamaLLM) -> Dict[str, Any]:
     """Generates a formatted markdown table of parameters from JSON.
Could do this programatically, but just for fun we will use a LLM""" @@ -415,7 +417,8 @@ def display_validation_history(state: AgentState): async def main(model_name: str) -> None: # 0. setup eval server - tracer = init_tracing() + # tracer_provider = init_tracing(project_name="sim-agent-evaluation") + # tracer = tracer_provider.get_tracer("langgraph-agent-tracer") # 1. Setup the graph and LLM llm = OllamaLLM(model=model_name, base_url="http://localhost:11434") diff --git a/create_evals.py b/create_evals.py new file mode 100644 index 0000000..e469a4e --- /dev/null +++ b/create_evals.py @@ -0,0 +1,22 @@ +""" +Script to add expected simulation results to the evals dataset +single rep at this stage. +""" +import json +from mcpsim.example_models.callcentre.call_centre_model import run_simulation_from_dict + +# Load test cases from evals_input.json +with open("evals/evals_input.json", "r") as f: + scenarios = json.load(f) + +# Run each scenario once and attach results +for key, obj in scenarios.items(): + params = obj["parameters"] + results = run_simulation_from_dict(params) + obj["expected_results"] = results + +# Save updated scenarios with results to evals.json +with open("evals/evals.json", "w") as f: + json.dump(scenarios, f, indent=2) + +print("Simulation results saved to evals.json") diff --git a/evals/evals.json b/evals/evals.json new file mode 100644 index 0000000..35995bb --- /dev/null +++ b/evals/evals.json @@ -0,0 +1,232 @@ +{ + "example_1": { + "user_input": "Simulate 14 operators, 12 nurses and 5% extra demand", + "parameters": { + "n_operators": 14, + "n_nurses": 12, + "mean_iat": 0.57, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.4, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, + "random_seed": 0, + "run_length": 1000 + }, + "expected_results": { + "01_mean_waiting_time": 2.3504694345055652, + "02_operator_util": 90.86219986414451, + "03_mean_nurse_waiting_time": 4.1106430489118715, + "04_nurse_util": 90.36357093206236, + "05_callback_rate": 42.5531914893617 + } + }, + "example_2": { + "user_input": "Run scenario with double staffing and normal call volume", + "parameters": { + "n_operators": 26, + "n_nurses": 20, + "mean_iat": 0.6, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.4, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, + "random_seed": 0, + "run_length": 1000 + }, + "expected_results": { + "01_mean_waiting_time": 0.0, + "02_operator_util": 46.452122409679156, + "03_mean_nurse_waiting_time": 0.0041045663539504606, + "04_nurse_util": 51.20346053052576, + "05_callback_rate": 42.12121212121212 + } + }, + "example_3": { + "user_input": "Test configuration with 8 operators, 6 nurses, and 20% increased demand", + "parameters": { + "n_operators": 8, + "n_nurses": 6, + "mean_iat": 0.48, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.4, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, + "random_seed": 0, + "run_length": 1000 + }, + "expected_results": { + "01_mean_waiting_time": 242.69198606467467, + "02_operator_util": 99.35481108027054, + "03_mean_nurse_waiting_time": 78.94431797274639, + "04_nurse_util": 97.8390040780754, + "05_callback_rate": 42.48847926267281 + } + }, + "example_4": { + "user_input": "Simulate with 20 operators, 15 nurses, and half the usual call volume", + "parameters": { + "n_operators": 20, + "n_nurses": 15, + "mean_iat": 1.2, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.4, + 
"nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, + "random_seed": 0, + "run_length": 1000 + }, + "expected_results": { + "01_mean_waiting_time": 0.0, + "02_operator_util": 31.21492694742085, + "03_mean_nurse_waiting_time": 0.0, + "04_nurse_util": 35.17900361818962, + "05_callback_rate": 42.17289719626168 + } + }, + "example_5": { + "user_input": "Run with 10 operators, 8 nurses, 30% callback probability, and 10% reduced demand", + "parameters": { + "n_operators": 10, + "n_nurses": 8, + "mean_iat": 0.66, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.3, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, + "random_seed": 0, + "run_length": 1000 + }, + "expected_results": { + "01_mean_waiting_time": 62.26336943615235, + "02_operator_util": 98.91827940297605, + "03_mean_nurse_waiting_time": 2.3551925550777337, + "04_nurse_util": 82.46774067208518, + "05_callback_rate": 33.012583271650634 + } + }, + "example_6": { + "user_input": "Test scenario: 25 operators, 25 nurses, triple demand", + "parameters": { + "n_operators": 25, + "n_nurses": 25, + "mean_iat": 0.2, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.4, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, + "random_seed": 0, + "run_length": 1000 + }, + "expected_results": { + "01_mean_waiting_time": 161.26826119772568, + "02_operator_util": 99.27336565709264, + "03_mean_nurse_waiting_time": 0.4505143843086236, + "04_nurse_util": 83.59706413413956, + "05_callback_rate": 41.69621749408983 + } + }, + "example_7": { + "user_input": "Simulate 15 operators, 12 nurses, call durations 3-6-9 minutes, 2000 minute runtime", + "parameters": { + "n_operators": 15, + "n_nurses": 12, + "mean_iat": 0.6, + "call_low": 3.0, + "call_mode": 6.0, + "call_high": 9.0, + "callback_prob": 0.4, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, + "random_seed": 0, + "run_length": 2000 + }, + "expected_results": { + "01_mean_waiting_time": 0.07192106569784922, + "02_operator_util": 66.74240386374329, + "03_mean_nurse_waiting_time": 2.663802537251699, + "04_nurse_util": 86.43094276820304, + "05_callback_rate": 41.684147437818396 + } + }, + "example_8": { + "user_input": "Run with 6 operators, 4 nurses, 60% callback rate, and 25% higher demand", + "parameters": { + "n_operators": 6, + "n_nurses": 4, + "mean_iat": 0.45, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.6, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, + "random_seed": 0, + "run_length": 1000 + }, + "expected_results": { + "01_mean_waiting_time": 318.98668257451686, + "02_operator_util": 99.56949574531609, + "03_mean_nurse_waiting_time": 196.98753442211358, + "04_nurse_util": 97.69465771729217, + "05_callback_rate": 57.87545787545788 + } + }, + "example_9": { + "user_input": "Test configuration: 18 operators, 14 nurses, nurse consultations 15-25 minutes, normal demand", + "parameters": { + "n_operators": 18, + "n_nurses": 14, + "mean_iat": 0.6, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.4, + "nurse_consult_low": 15.0, + "nurse_consult_high": 25.0, + "random_seed": 0, + "run_length": 1000 + }, + "expected_results": { + "01_mean_waiting_time": 0.08980920202587155, + "02_operator_util": 67.09751014731434, + "03_mean_nurse_waiting_time": 19.85524137214805, + "04_nurse_util": 97.15170780690659, + "05_callback_rate": 42.12121212121212 + } + }, + "example_10": { + "user_input": "Simulate 12 operators, 10 nurses, 50% callback 
probability, and 40% demand reduction", + "parameters": { + "n_operators": 12, + "n_nurses": 10, + "mean_iat": 0.84, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.5, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, + "random_seed": 0, + "run_length": 1000 + }, + "expected_results": { + "01_mean_waiting_time": 0.38280107383630124, + "02_operator_util": 73.44752978233386, + "03_mean_nurse_waiting_time": 4.147154100012408, + "04_nurse_util": 90.70926116406883, + "05_callback_rate": 51.4546965918537 + } + } +} \ No newline at end of file diff --git a/evals/evals_input.json b/evals/evals_input.json new file mode 100644 index 0000000..51fe6c5 --- /dev/null +++ b/evals/evals_input.json @@ -0,0 +1,171 @@ +{ + "example_1": { + "user_input": "Simulate 14 operators, 12 nurses and 5% extra demand", + "parameters": { + "n_operators": 14, + "n_nurses": 12, + "mean_iat": 0.57, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.4, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, + "random_seed": 0, + "run_length": 1000 + } + }, + + "example_2": { + "user_input": "Run scenario with double staffing and normal call volume", + "parameters": { + "n_operators": 26, + "n_nurses": 20, + "mean_iat": 0.6, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.4, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, + "random_seed": 0, + "run_length": 1000 + } + }, + + "example_3": { + "user_input": "Test configuration with 8 operators, 6 nurses, and 20% increased demand", + "parameters": { + "n_operators": 8, + "n_nurses": 6, + "mean_iat": 0.48, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.4, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, + "random_seed": 0, + "run_length": 1000 + } + }, + + "example_4": { + "user_input": "Simulate with 20 operators, 15 nurses, and half the usual call volume", + "parameters": { + "n_operators": 20, + "n_nurses": 15, + "mean_iat": 1.2, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.4, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, + "random_seed": 0, + "run_length": 1000 + } + }, + + "example_5": { + "user_input": "Run with 10 operators, 8 nurses, 30% callback probability, and 10% reduced demand", + "parameters": { + "n_operators": 10, + "n_nurses": 8, + "mean_iat": 0.66, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.3, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, + "random_seed": 0, + "run_length": 1000 + } + }, + + "example_6": { + "user_input": "Test scenario: 25 operators, 25 nurses, triple demand", + "parameters": { + "n_operators": 25, + "n_nurses": 25, + "mean_iat": 0.2, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.4, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, + "random_seed": 0, + "run_length": 1000 + } + }, + + "example_7": { + "user_input": "Simulate 15 operators, 12 nurses, call durations 3-6-9 minutes, 2000 minute runtime", + "parameters": { + "n_operators": 15, + "n_nurses": 12, + "mean_iat": 0.6, + "call_low": 3.0, + "call_mode": 6.0, + "call_high": 9.0, + "callback_prob": 0.4, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, + "random_seed": 0, + "run_length": 2000 + } + }, + + "example_8": { + "user_input": "Run with 6 operators, 4 nurses, 60% callback rate, and 25% higher demand", + "parameters": { + "n_operators": 6, + "n_nurses": 4, + "mean_iat": 
0.45, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.6, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, + "random_seed": 0, + "run_length": 1000 + } + }, + + "example_9": { + "user_input": "Test configuration: 18 operators, 14 nurses, nurse consultations 15-25 minutes, normal demand", + "parameters": { + "n_operators": 18, + "n_nurses": 14, + "mean_iat": 0.6, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.4, + "nurse_consult_low": 15.0, + "nurse_consult_high": 25.0, + "random_seed": 0, + "run_length": 1000 + } + }, + + "example_10": { + "user_input": "Simulate 12 operators, 10 nurses, 50% callback probability, and 40% demand reduction", + "parameters": { + "n_operators": 12, + "n_nurses": 10, + "mean_iat": 0.84, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.5, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, + "random_seed": 0, + "run_length": 1000 + } + } +} diff --git a/mcpsim/server.py b/mcpsim/server.py index 88133ea..ee0f1f9 100644 --- a/mcpsim/server.py +++ b/mcpsim/server.py @@ -9,7 +9,9 @@ PARAMETER_PROMPT_PATH = "resources/parameter_prompt.txt" from mcpsim.tracing import init_tracing # your helper module from previous discussion -tracer_provider = init_tracing(project_name="sim-agent-mcp-server") + +tracer_provider = init_tracing(project_name="sim-agent-evaluation") +tracer = tracer_provider.get_tracer("mcp-server-tracer") class SimulationMCPServer: """MCP server that can work with any SimulationModelAdapter implementation.""" @@ -38,6 +40,7 @@ def _register_tools(self): """Register MCP tools that delegate to the simulation model.""" # note investigate how to clean up description + @tracer.tool(name="MCP.run_call_centre_simulation") @self.mcp.tool( name=f"run_{self.model.model_name}_simulation", description=f""" @@ -53,6 +56,7 @@ def _register_tools(self): def run_call_centre_simulation(parameters: dict) -> dict: return self.model.run_simulation(parameters) + @tracer.tool(name="MCP.validate_simulation_parameters") @self.mcp.tool( name="validate_simulation_parameters", description=""" From aa70d8699145cfe7b1939df2451bf41c0801fb0f Mon Sep 17 00:00:00 2001 From: TomMonks Date: Tue, 12 Aug 2025 12:07:00 +0100 Subject: [PATCH 17/23] feat(evals): added basic eval script --- evals/evals_output.json | 1121 +++++++++++++++++++++++++++++++++++++++ run_evals.py | 157 ++++++ 2 files changed, 1278 insertions(+) create mode 100644 evals/evals_output.json create mode 100644 run_evals.py diff --git a/evals/evals_output.json b/evals/evals_output.json new file mode 100644 index 0000000..9453f85 --- /dev/null +++ b/evals/evals_output.json @@ -0,0 +1,1121 @@ +{ + "example_1": { + "user_input": "Simulate 14 operators, 12 nurses and 5% extra demand", + "parameters": { + "n_operators": 14, + "n_nurses": 12, + "mean_iat": 0.57, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.4, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, + "random_seed": 0, + "run_length": 1000 + }, + "expected_results": { + "01_mean_waiting_time": 2.3504694345055652, + "02_operator_util": 90.86219986414451, + "03_mean_nurse_waiting_time": 4.1106430489118715, + "04_nurse_util": 90.36357093206236, + "05_callback_rate": 42.5531914893617 + }, + "agent_run": { + "simulation_result": { + "01_mean_waiting_time": 2.9621655675237495, + "02_operator_util": 91.36685915350135, + "03_mean_nurse_waiting_time": 2.958020460174451, + "04_nurse_util": 84.3596921204659, + 
"05_callback_rate": 38.860103626943 + }, + "meta": { + "retry_count": 0, + "had_bailout": false, + "validation_history": [ + { + "attempt": 1, + "parameters": { + "n_operators": 14, + "n_nurses": 12, + "mean_iat": 0.57, + "random_seed": 42, + "run_length": 1000 + }, + "validation_result": { + "is_valid": true, + "errors": [] + } + } + ], + "parameters": { + "n_operators": 14, + "n_nurses": 12, + "mean_iat": 0.57, + "random_seed": 42, + "run_length": 1000 + }, + "formatted_parameters": " ```\n**Parameters used in simulation**\n\n| Parameter | Value |\n|-----------|-------|\n| \ud83d\udcca n_operators | 14 |\n| \ud83d\udcca n_nurses | 12 |\n| \ud83d\udcca mean_iat | 0.57 |\n| \ud83d\udcca random_seed| 42 |\n| \ud83d\udcca run_length | 1000 |\n```" + } + }, + "comparison": { + "pass": false, + "details": { + "01_mean_waiting_time": { + "expected": 2.3504694345055652, + "got": 2.9621655675237495, + "match": false + }, + "02_operator_util": { + "expected": 90.86219986414451, + "got": 91.36685915350135, + "match": false + }, + "03_mean_nurse_waiting_time": { + "expected": 4.1106430489118715, + "got": 2.958020460174451, + "match": false + }, + "04_nurse_util": { + "expected": 90.36357093206236, + "got": 84.3596921204659, + "match": false + }, + "05_callback_rate": { + "expected": 42.5531914893617, + "got": 38.860103626943, + "match": false + } + }, + "rtol": 0.001, + "atol": 1e-06 + }, + "passed": false + }, + "example_2": { + "user_input": "Run scenario with double staffing and normal call volume", + "parameters": { + "n_operators": 26, + "n_nurses": 20, + "mean_iat": 0.6, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.4, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, + "random_seed": 0, + "run_length": 1000 + }, + "expected_results": { + "01_mean_waiting_time": 0.0, + "02_operator_util": 46.452122409679156, + "03_mean_nurse_waiting_time": 0.0041045663539504606, + "04_nurse_util": 51.20346053052576, + "05_callback_rate": 42.12121212121212 + }, + "agent_run": { + "simulation_result": { + "01_mean_waiting_time": 0.24486927533243114, + "02_operator_util": 83.1290576750028, + "03_mean_nurse_waiting_time": 212.0429575856493, + "04_nurse_util": 97.94215299366186, + "05_callback_rate": 39.5769361992494 + }, + "meta": { + "retry_count": 0, + "had_bailout": false, + "validation_history": [ + { + "attempt": 1, + "parameters": { + "n_operators": 26, + "n_nurses": 10, + "mean_iat": 0.335, + "random_seed": 42, + "run_length": 1000 + }, + "validation_result": { + "is_valid": true, + "errors": [] + } + } + ], + "parameters": { + "n_operators": 26, + "n_nurses": 10, + "mean_iat": 0.335, + "random_seed": 42, + "run_length": 1000 + }, + "formatted_parameters": " ```markdown\n\ud83d\udcca **Parameters** | **Value**\n-----------------|--------\nn_operators | 26\nn_nurses | 10\nmean_iat | 0.335\nrandom_seed | 42\nrun_length | 1000\n```" + } + }, + "comparison": { + "pass": false, + "details": { + "01_mean_waiting_time": { + "expected": 0.0, + "got": 0.24486927533243114, + "match": false + }, + "02_operator_util": { + "expected": 46.452122409679156, + "got": 83.1290576750028, + "match": false + }, + "03_mean_nurse_waiting_time": { + "expected": 0.0041045663539504606, + "got": 212.0429575856493, + "match": false + }, + "04_nurse_util": { + "expected": 51.20346053052576, + "got": 97.94215299366186, + "match": false + }, + "05_callback_rate": { + "expected": 42.12121212121212, + "got": 39.5769361992494, + "match": false + } + }, + "rtol": 0.001, + "atol": 1e-06 + }, + 
"passed": false + }, + "example_3": { + "user_input": "Test configuration with 8 operators, 6 nurses, and 20% increased demand", + "parameters": { + "n_operators": 8, + "n_nurses": 6, + "mean_iat": 0.48, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.4, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, + "random_seed": 0, + "run_length": 1000 + }, + "expected_results": { + "01_mean_waiting_time": 242.69198606467467, + "02_operator_util": 99.35481108027054, + "03_mean_nurse_waiting_time": 78.94431797274639, + "04_nurse_util": 97.8390040780754, + "05_callback_rate": 42.48847926267281 + }, + "agent_run": { + "simulation_result": { + "01_mean_waiting_time": 258.690360654302, + "02_operator_util": 99.44580381509923, + "03_mean_nurse_waiting_time": 78.94106695388635, + "04_nurse_util": 97.8390040780754, + "05_callback_rate": 42.5414364640884 + }, + "meta": { + "retry_count": 3, + "had_bailout": false, + "validation_history": [ + { + "attempt": 1, + "parameters": { + "__json_parse_error__": true + }, + "validation_result": { + "is_valid": false, + "errors": [ + "Unknown parameter: __json_parse_error__" + ] + } + }, + { + "attempt": 2, + "parameters": { + "__json_parse_error__": true + }, + "validation_result": { + "is_valid": false, + "errors": [ + "Unknown parameter: __json_parse_error__" + ] + } + }, + { + "attempt": 3, + "parameters": { + "__json_parse_error__": true + }, + "validation_result": { + "is_valid": false, + "errors": [ + "Unknown parameter: __json_parse_error__" + ] + } + }, + { + "attempt": 4, + "parameters": { + "n_operators": 8, + "n_nurses": 6, + "mean_iat": 0.45, + "random_seed": 0, + "run_length": 1000 + }, + "validation_result": { + "is_valid": true, + "errors": [] + } + } + ], + "parameters": { + "n_operators": 8, + "n_nurses": 6, + "mean_iat": 0.45, + "random_seed": 0, + "run_length": 1000 + }, + "formatted_parameters": " ```\n**Parameters used in simulation**\n\n| Parameter | Value |\n|-----------|-------|\n| :bar: n_operators | 8 |\n| :bar: n_nurses | 6 |\n| :bar: mean_iat | 0.45 |\n| :bar: random_seed | 0 |\n| :bar: run_length | 1000 |\n```" + } + }, + "comparison": { + "pass": false, + "details": { + "01_mean_waiting_time": { + "expected": 242.69198606467467, + "got": 258.690360654302, + "match": false + }, + "02_operator_util": { + "expected": 99.35481108027054, + "got": 99.44580381509923, + "match": true + }, + "03_mean_nurse_waiting_time": { + "expected": 78.94431797274639, + "got": 78.94106695388635, + "match": true + }, + "04_nurse_util": { + "expected": 97.8390040780754, + "got": 97.8390040780754, + "match": true + }, + "05_callback_rate": { + "expected": 42.48847926267281, + "got": 42.5414364640884, + "match": false + } + }, + "rtol": 0.001, + "atol": 1e-06 + }, + "passed": false + }, + "example_4": { + "user_input": "Simulate with 20 operators, 15 nurses, and half the usual call volume", + "parameters": { + "n_operators": 20, + "n_nurses": 15, + "mean_iat": 1.2, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.4, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, + "random_seed": 0, + "run_length": 1000 + }, + "expected_results": { + "01_mean_waiting_time": 0.0, + "02_operator_util": 31.21492694742085, + "03_mean_nurse_waiting_time": 0.0, + "04_nurse_util": 35.17900361818962, + "05_callback_rate": 42.17289719626168 + }, + "agent_run": { + "simulation_result": { + "01_mean_waiting_time": 0.0, + "02_operator_util": 27.417403035068194, + "03_mean_nurse_waiting_time": 0.0, + 
"04_nurse_util": 28.0192637065108, + "05_callback_rate": 38.22341857335128 + }, + "meta": { + "retry_count": 0, + "had_bailout": false, + "validation_history": [ + { + "attempt": 1, + "parameters": { + "n_operators": 20, + "n_nurses": 15, + "mean_iat": 1.3, + "random_seed": 42, + "run_length": 1000 + }, + "validation_result": { + "is_valid": true, + "errors": [] + } + } + ], + "parameters": { + "n_operators": 20, + "n_nurses": 15, + "mean_iat": 1.3, + "random_seed": 42, + "run_length": 1000 + }, + "formatted_parameters": " ```\n\ud83d\udcca **Parameters used in simulation**\n| Parameter | Value |\n|-----------|-------|\n| n_operators | 20 |\n| n_nurses | 15 |\n| mean_iat | 1.3 |\n| random_seed | 42 |\n| run_length | 1000 |\n```" + } + }, + "comparison": { + "pass": false, + "details": { + "01_mean_waiting_time": { + "expected": 0.0, + "got": 0.0, + "match": true + }, + "02_operator_util": { + "expected": 31.21492694742085, + "got": 27.417403035068194, + "match": false + }, + "03_mean_nurse_waiting_time": { + "expected": 0.0, + "got": 0.0, + "match": true + }, + "04_nurse_util": { + "expected": 35.17900361818962, + "got": 28.0192637065108, + "match": false + }, + "05_callback_rate": { + "expected": 42.17289719626168, + "got": 38.22341857335128, + "match": false + } + }, + "rtol": 0.001, + "atol": 1e-06 + }, + "passed": false + }, + "example_5": { + "user_input": "Run with 10 operators, 8 nurses, 30% callback probability, and 10% reduced demand", + "parameters": { + "n_operators": 10, + "n_nurses": 8, + "mean_iat": 0.66, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.3, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, + "random_seed": 0, + "run_length": 1000 + }, + "expected_results": { + "01_mean_waiting_time": 62.26336943615235, + "02_operator_util": 98.91827940297605, + "03_mean_nurse_waiting_time": 2.3551925550777337, + "04_nurse_util": 82.46774067208518, + "05_callback_rate": 33.012583271650634 + }, + "agent_run": { + "simulation_result": { + "01_mean_waiting_time": 0.6478945342567207, + "02_operator_util": 76.87777817159127, + "03_mean_nurse_waiting_time": 0.30988264770159224, + "04_nurse_util": 65.36044859488209, + "05_callback_rate": 29.18287937743191 + }, + "meta": { + "retry_count": 3, + "had_bailout": false, + "validation_history": [ + { + "attempt": 1, + "parameters": { + "__json_parse_error__": true + }, + "validation_result": { + "is_valid": false, + "errors": [ + "Unknown parameter: __json_parse_error__" + ] + } + }, + { + "attempt": 2, + "parameters": { + "__json_parse_error__": true + }, + "validation_result": { + "is_valid": false, + "errors": [ + "Unknown parameter: __json_parse_error__" + ] + } + }, + { + "attempt": 3, + "parameters": { + "n_operators": 10, + "n_nurses": 8, + "mean_iat": 0.74, + "callback_prob": 0.3, + "call_mode": 4.2, + "call_high": 9.0, + "call_low": 5.1, + "nurse_consult_low": 10.0, + "nurse_consult_high": 18.0, + "random_seed": 42, + "run_length": 600 + }, + "validation_result": { + "is_valid": false, + "errors": [ + "call_low \u2264 call_mode \u2264 call_high violated" + ] + } + }, + { + "attempt": 4, + "parameters": { + "n_operators": 9, + "n_nurses": 8, + "mean_iat": 0.78, + "call_low": 4.2, + "call_mode": 5.64, + "call_high": 6.3, + "callback_prob": 0.3, + "nurse_consult_low": 10.0, + "nurse_consult_high": 18.0, + "random_seed": 42, + "run_length": 1000 + }, + "validation_result": { + "is_valid": true, + "errors": [] + } + } + ], + "parameters": { + "n_operators": 9, + "n_nurses": 8, + "mean_iat": 0.78, 
+ "call_low": 4.2, + "call_mode": 5.64, + "call_high": 6.3, + "callback_prob": 0.3, + "nurse_consult_low": 10.0, + "nurse_consult_high": 18.0, + "random_seed": 42, + "run_length": 1000 + }, + "formatted_parameters": " **Parameters used in simulation**\n\n| \ud83d\udcca Parameter | Value |\n|-------------|---|\n| n_operators | 9 |\n| n_nurses | 8 |\n| mean_iat | 0.78 |\n| call_low | 4.2 |\n| call_mode | 5.64 |\n| call_high | 6.3 |\n| callback_prob | 0.3 |\n| nurse_consult_low | 10.0 |\n| nurse_consult_high | 18.0 |\n| random_seed | 42 |\n| run_length | 1000 |" + } + }, + "comparison": { + "pass": false, + "details": { + "01_mean_waiting_time": { + "expected": 62.26336943615235, + "got": 0.6478945342567207, + "match": false + }, + "02_operator_util": { + "expected": 98.91827940297605, + "got": 76.87777817159127, + "match": false + }, + "03_mean_nurse_waiting_time": { + "expected": 2.3551925550777337, + "got": 0.30988264770159224, + "match": false + }, + "04_nurse_util": { + "expected": 82.46774067208518, + "got": 65.36044859488209, + "match": false + }, + "05_callback_rate": { + "expected": 33.012583271650634, + "got": 29.18287937743191, + "match": false + } + }, + "rtol": 0.001, + "atol": 1e-06 + }, + "passed": false + }, + "example_6": { + "user_input": "Test scenario: 25 operators, 25 nurses, triple demand", + "parameters": { + "n_operators": 25, + "n_nurses": 25, + "mean_iat": 0.2, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.4, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, + "random_seed": 0, + "run_length": 1000 + }, + "expected_results": { + "01_mean_waiting_time": 161.26826119772568, + "02_operator_util": 99.27336565709264, + "03_mean_nurse_waiting_time": 0.4505143843086236, + "04_nurse_util": 83.59706413413956, + "05_callback_rate": 41.69621749408983 + }, + "agent_run": { + "simulation_result": { + "01_mean_waiting_time": 0.5518432002497017, + "02_operator_util": 89.32672670938162, + "03_mean_nurse_waiting_time": 0.12567485568024855, + "04_nurse_util": 74.88123258139942, + "05_callback_rate": 41.477832512315274 + }, + "meta": { + "retry_count": 0, + "had_bailout": false, + "validation_history": [ + { + "attempt": 1, + "parameters": { + "n_operators": 25, + "n_nurses": 25, + "mean_iat": 0.33, + "random_seed": 0, + "run_length": 1000 + }, + "validation_result": { + "is_valid": true, + "errors": [] + } + } + ], + "parameters": { + "n_operators": 25, + "n_nurses": 25, + "mean_iat": 0.33, + "random_seed": 0, + "run_length": 1000 + }, + "formatted_parameters": " ```\n\ud83d\udcca Parameters used in simulation\n| Parameter | Value |\n|-----------|-------|\n| n_operators | 25 |\n| n_nurses | 25 |\n| mean_iat | 0.33 |\n| random_seed| 0 |\n| run_length | 1000 |\n```" + } + }, + "comparison": { + "pass": false, + "details": { + "01_mean_waiting_time": { + "expected": 161.26826119772568, + "got": 0.5518432002497017, + "match": false + }, + "02_operator_util": { + "expected": 99.27336565709264, + "got": 89.32672670938162, + "match": false + }, + "03_mean_nurse_waiting_time": { + "expected": 0.4505143843086236, + "got": 0.12567485568024855, + "match": false + }, + "04_nurse_util": { + "expected": 83.59706413413956, + "got": 74.88123258139942, + "match": false + }, + "05_callback_rate": { + "expected": 41.69621749408983, + "got": 41.477832512315274, + "match": false + } + }, + "rtol": 0.001, + "atol": 1e-06 + }, + "passed": false + }, + "example_7": { + "user_input": "Simulate 15 operators, 12 nurses, call durations 3-6-9 minutes, 2000 minute 
runtime", + "parameters": { + "n_operators": 15, + "n_nurses": 12, + "mean_iat": 0.6, + "call_low": 3.0, + "call_mode": 6.0, + "call_high": 9.0, + "callback_prob": 0.4, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, + "random_seed": 0, + "run_length": 2000 + }, + "expected_results": { + "01_mean_waiting_time": 0.07192106569784922, + "02_operator_util": 66.74240386374329, + "03_mean_nurse_waiting_time": 2.663802537251699, + "04_nurse_util": 86.43094276820304, + "05_callback_rate": 41.684147437818396 + }, + "agent_run": { + "simulation_result": null, + "meta": { + "retry_count": 4, + "had_bailout": false, + "validation_history": [ + { + "attempt": 1, + "parameters": { + "__json_parse_error__": true + }, + "validation_result": { + "is_valid": false, + "errors": [ + "Unknown parameter: __json_parse_error__" + ] + } + }, + { + "attempt": 2, + "parameters": { + "n_operators": 15, + "n_nurses": 12, + "mean_iat": 0.67, + "call_low": 3, + "call_mode": 6, + "call_high": 9, + "callback_prob": 0.4, + "nurse_consult_low": 10, + "nurse_consult_high": 20, + "random_seed": 0, + "run_length": 2000 + }, + "validation_result": { + "is_valid": false, + "errors": [ + "call_low must be float", + "call_mode must be float", + "call_high must be float", + "nurse_consult_low must be float", + "nurse_consult_high must be float" + ] + } + }, + { + "attempt": 3, + "parameters": { + "__json_parse_error__": true + }, + "validation_result": { + "is_valid": false, + "errors": [ + "Unknown parameter: __json_parse_error__" + ] + } + }, + { + "attempt": 4, + "parameters": { + "__json_parse_error__": true + }, + "validation_result": { + "is_valid": false, + "errors": [ + "Unknown parameter: __json_parse_error__" + ] + } + } + ], + "parameters": { + "__json_parse_error__": true + }, + "formatted_parameters": null + } + }, + "comparison": { + "note": "Either expected_results or agent simulation_result missing; comparison skipped." 
+ }, + "passed": false + }, + "example_8": { + "user_input": "Run with 6 operators, 4 nurses, 60% callback rate, and 25% higher demand", + "parameters": { + "n_operators": 6, + "n_nurses": 4, + "mean_iat": 0.45, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.6, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, + "random_seed": 0, + "run_length": 1000 + }, + "expected_results": { + "01_mean_waiting_time": 318.98668257451686, + "02_operator_util": 99.56949574531609, + "03_mean_nurse_waiting_time": 196.98753442211358, + "04_nurse_util": 97.69465771729217, + "05_callback_rate": 57.87545787545788 + }, + "agent_run": { + "simulation_result": { + "01_mean_waiting_time": 330.661658162098, + "02_operator_util": 99.47452901988686, + "03_mean_nurse_waiting_time": 223.8173355488663, + "04_nurse_util": 98.39302830500621, + "05_callback_rate": 61.76470588235294 + }, + "meta": { + "retry_count": 0, + "had_bailout": false, + "validation_history": [ + { + "attempt": 1, + "parameters": { + "n_operators": 6, + "n_nurses": 4, + "mean_iat": 0.42, + "call_mode": 7.8, + "callback_prob": 0.6, + "random_seed": 42, + "run_length": 1000 + }, + "validation_result": { + "is_valid": true, + "errors": [] + } + } + ], + "parameters": { + "n_operators": 6, + "n_nurses": 4, + "mean_iat": 0.42, + "call_mode": 7.8, + "callback_prob": 0.6, + "random_seed": 42, + "run_length": 1000 + }, + "formatted_parameters": " ```\n\ud83d\udcca Parameters used in simulation\n------------------------------\n| Parameter | Value |\n| --------------- | ------- |\n| n_operators | 6 |\n| n_nurses | 4 |\n| mean_iat | 0.42 |\n| call_mode | 7.8 |\n| callback_prob | 0.6 |\n| random_seed | 42 |\n| run_length | 1000 |\n```" + } + }, + "comparison": { + "pass": false, + "details": { + "01_mean_waiting_time": { + "expected": 318.98668257451686, + "got": 330.661658162098, + "match": false + }, + "02_operator_util": { + "expected": 99.56949574531609, + "got": 99.47452901988686, + "match": true + }, + "03_mean_nurse_waiting_time": { + "expected": 196.98753442211358, + "got": 223.8173355488663, + "match": false + }, + "04_nurse_util": { + "expected": 97.69465771729217, + "got": 98.39302830500621, + "match": false + }, + "05_callback_rate": { + "expected": 57.87545787545788, + "got": 61.76470588235294, + "match": false + } + }, + "rtol": 0.001, + "atol": 1e-06 + }, + "passed": false + }, + "example_9": { + "user_input": "Test configuration: 18 operators, 14 nurses, nurse consultations 15-25 minutes, normal demand", + "parameters": { + "n_operators": 18, + "n_nurses": 14, + "mean_iat": 0.6, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.4, + "nurse_consult_low": 15.0, + "nurse_consult_high": 25.0, + "random_seed": 0, + "run_length": 1000 + }, + "expected_results": { + "01_mean_waiting_time": 0.08980920202587155, + "02_operator_util": 67.09751014731434, + "03_mean_nurse_waiting_time": 19.85524137214805, + "04_nurse_util": 97.15170780690659, + "05_callback_rate": 42.12121212121212 + }, + "agent_run": { + "simulation_result": { + "01_mean_waiting_time": 0.08980920202587155, + "02_operator_util": 67.09751014731434, + "03_mean_nurse_waiting_time": 19.85524137214805, + "04_nurse_util": 97.15170780690659, + "05_callback_rate": 42.12121212121212 + }, + "meta": { + "retry_count": 1, + "had_bailout": false, + "validation_history": [ + { + "attempt": 1, + "parameters": { + "__json_parse_error__": true + }, + "validation_result": { + "is_valid": false, + "errors": [ + "Unknown parameter: 
__json_parse_error__" + ] + } + }, + { + "attempt": 2, + "parameters": { + "n_operators": 18, + "n_nurses": 14, + "mean_iat": 0.6, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.4, + "nurse_consult_low": 15.0, + "nurse_consult_high": 25.0, + "random_seed": 0, + "run_length": 1000 + }, + "validation_result": { + "is_valid": true, + "errors": [] + } + } + ], + "parameters": { + "n_operators": 18, + "n_nurses": 14, + "mean_iat": 0.6, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.4, + "nurse_consult_low": 15.0, + "nurse_consult_high": 25.0, + "random_seed": 0, + "run_length": 1000 + }, + "formatted_parameters": " ```md\n\ud83d\udcca **Parameters used in simulation**\n| Parameter | Value |\n| --- | --- |\n| n_operators | 18 |\n| n_nurses | 14 |\n| mean_iat | 0.6 |\n| call_low | 5.0 |\n| call_mode | 7.0 |\n| call_high | 10.0 |\n| callback_prob | 0.4 |\n| nurse_consult_low | 15.0 |\n| nurse_consult_high | 25.0 |\n| random_seed | 0 |\n| run_length | 1000 |\n```" + } + }, + "comparison": { + "pass": true, + "details": { + "01_mean_waiting_time": { + "expected": 0.08980920202587155, + "got": 0.08980920202587155, + "match": true + }, + "02_operator_util": { + "expected": 67.09751014731434, + "got": 67.09751014731434, + "match": true + }, + "03_mean_nurse_waiting_time": { + "expected": 19.85524137214805, + "got": 19.85524137214805, + "match": true + }, + "04_nurse_util": { + "expected": 97.15170780690659, + "got": 97.15170780690659, + "match": true + }, + "05_callback_rate": { + "expected": 42.12121212121212, + "got": 42.12121212121212, + "match": true + } + }, + "rtol": 0.001, + "atol": 1e-06 + }, + "passed": true + }, + "example_10": { + "user_input": "Simulate 12 operators, 10 nurses, 50% callback probability, and 40% demand reduction", + "parameters": { + "n_operators": 12, + "n_nurses": 10, + "mean_iat": 0.84, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.5, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, + "random_seed": 0, + "run_length": 1000 + }, + "expected_results": { + "01_mean_waiting_time": 0.38280107383630124, + "02_operator_util": 73.44752978233386, + "03_mean_nurse_waiting_time": 4.147154100012408, + "04_nurse_util": 90.70926116406883, + "05_callback_rate": 51.4546965918537 + }, + "agent_run": { + "simulation_result": { + "01_mean_waiting_time": 0.0, + "02_operator_util": 23.33484123800376, + "03_mean_nurse_waiting_time": 8.70119617931026, + "04_nurse_util": 87.09750711871568, + "05_callback_rate": 47.963800904977376 + }, + "meta": { + "retry_count": 2, + "had_bailout": false, + "validation_history": [ + { + "attempt": 1, + "parameters": { + "n_operators": 12, + "n_nurses": 10, + "callback_prob": 0.5, + "mean_iat": 0.9, + "call_mode": 3.6, + "call_low": 4.8, + "call_high": 7.2, + "nurse_consult_low": 16, + "nurse_consult_high": 24, + "random_seed": 0, + "run_length": 1000 + }, + "validation_result": { + "is_valid": false, + "errors": [ + "nurse_consult_low must be float", + "nurse_consult_high must be float", + "call_low \u2264 call_mode \u2264 call_high violated" + ] + } + }, + { + "attempt": 2, + "parameters": { + "n_operators": 12, + "n_nurses": 10, + "mean_iat": 0.48, + "callback_prob": 0.5, + "call_low": 6.0, + "call_mode": 7.2, + "call_high": 9.6, + "nurse_consult_low": 10.0, + "nurse_consult_high": 8.0, + "random_seed": 42, + "run_length": 1000 + }, + "validation_result": { + "is_valid": false, + "errors": [ + "nurse_consult_low \u2264 nurse_consult_high violated" 
+ ] + } + }, + { + "attempt": 3, + "parameters": { + "n_operators": 12, + "n_nurses": 10, + "mean_iat": 1.48, + "callback_prob": 0.5, + "call_mode": 3.6, + "call_low": 3.0, + "call_high": 6.0, + "nurse_consult_low": 24.0, + "nurse_consult_high": 32.0, + "random_seed": 42, + "run_length": 1000 + }, + "validation_result": { + "is_valid": true, + "errors": [] + } + } + ], + "parameters": { + "n_operators": 12, + "n_nurses": 10, + "mean_iat": 1.48, + "callback_prob": 0.5, + "call_mode": 3.6, + "call_low": 3.0, + "call_high": 6.0, + "nurse_consult_low": 24.0, + "nurse_consult_high": 32.0, + "random_seed": 42, + "run_length": 1000 + }, + "formatted_parameters": " **Parameters used in simulation**\n\n| :heavy_check_mark: Parameter | Value |\n|-----------------------------|-------|\n| n\\_operators | 12 |\n| n\\_nurses | 10 |\n| mean\\_iat | 1.48 |\n| callback\\_prob | 0.5 |\n| call\\_mode | 3.6 |\n| call\\_low | 3.0 |\n| call\\_high | 6.0 |\n| nurse\\_consult\\_low | 24.0 |\n| nurse\\_consult\\_high | 32.0 |\n| random\\_seed | 42 |\n| run\\_length | 1000 |" + } + }, + "comparison": { + "pass": false, + "details": { + "01_mean_waiting_time": { + "expected": 0.38280107383630124, + "got": 0.0, + "match": false + }, + "02_operator_util": { + "expected": 73.44752978233386, + "got": 23.33484123800376, + "match": false + }, + "03_mean_nurse_waiting_time": { + "expected": 4.147154100012408, + "got": 8.70119617931026, + "match": false + }, + "04_nurse_util": { + "expected": 90.70926116406883, + "got": 87.09750711871568, + "match": false + }, + "05_callback_rate": { + "expected": 51.4546965918537, + "got": 47.963800904977376, + "match": false + } + }, + "rtol": 0.001, + "atol": 1e-06 + }, + "passed": false + } +} \ No newline at end of file diff --git a/run_evals.py b/run_evals.py new file mode 100644 index 0000000..361c137 --- /dev/null +++ b/run_evals.py @@ -0,0 +1,157 @@ +import asyncio +import json +import math +from typing import Dict, Any, Tuple, Optional + +# Import the agent builder and its dependencies from your agent module +# Assumes agent_self_reflection.py exposes `build_graph`, `AgentState`, and a factory for the LLM. +from agent_self_reflection import build_graph, AgentState +from langchain_ollama import OllamaLLM + + +def floats_close(a: Any, b: Any, rtol: float = 1e-3, atol: float = 1e-6) -> bool: + """ + Compare numbers with tolerance, pass-through for non-numeric equality. + Returns True if both are NaN, or within tolerances for floats, or exactly equal otherwise. + """ + # Handle None + if a is None or b is None: + return a is None and b is None + + # Try numeric compare + try: + fa = float(a) + fb = float(b) + # Handle NaNs + if math.isnan(fa) and math.isnan(fb): + return True + return math.isclose(fa, fb, rel_tol=rtol, abs_tol=atol) + except (TypeError, ValueError): + # Non-numeric: exact equality + return a == b + + +def compare_results( + got: Dict[str, Any], + expected: Dict[str, Any], + rtol: float = 1e-3, + atol: float = 1e-6, +) -> Dict[str, Any]: + """ + Compare simulation outputs to expected_results with numeric tolerance. + Returns a dict with per-key comparison, diffs, and overall pass flag. 
+    """
+    keys = sorted(set(got.keys()) | set(expected.keys()))
+    per_key = {}
+    all_pass = True
+
+    for k in keys:
+        g = got.get(k, None)
+        e = expected.get(k, None)
+        ok = floats_close(g, e, rtol=rtol, atol=atol)
+        if not ok:
+            all_pass = False
+        per_key[k] = {
+            "expected": e,
+            "got": g,
+            "match": ok,
+        }
+
+    return {
+        "pass": all_pass,
+        "details": per_key,
+        "rtol": rtol,
+        "atol": atol,
+    }
+
+
+async def run_agent_once(
+    compiled_graph,
+    user_input: str,
+    llm: OllamaLLM,
+    max_retries: int = 4,
+) -> Dict[str, Any]:
+    """
+    Runs the agent end-to-end for a single natural-language user_input.
+    Returns the final state, including simulation_result or error.
+    """
+    state_in: AgentState = {
+        "user_input": user_input,
+        "retry_count": 0,
+        "validation_history": [],
+    }
+    final_state: AgentState = await compiled_graph.ainvoke(state_in)
+    return final_state
+
+
+def extract_agent_simulation_result(final_state: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]], Dict[str, Any]]:
+    """
+    Extract the simulation_result if present and return a compact agent_run summary
+    useful for evaluation logging.
+    """
+    sim_res = final_state.get("simulation_result")
+    agent_run = {
+        "retry_count": final_state.get("retry_count", 0),
+        "had_bailout": final_state.get("error") == "Maximum retries exceeded during parameter reflection.",
+        "validation_history": final_state.get("validation_history", []),
+        "parameters": final_state.get("parameters", {}),
+        "formatted_parameters": final_state.get("formatted_parameters", None),
+    }
+    return sim_res, agent_run
+
+
+async def main(
+    input_json_path: str = "evals/evals.json",
+    output_json_path: str = "evals/evals_output.json",
+    model_name: str = "gemma3:27b",
+    llm_base_url: str = "http://localhost:11434",
+    rtol: float = 1e-3,
+    atol: float = 1e-6,
+) -> None:
+    # 1) Load evals
+    with open(input_json_path, "r") as f:
+        evals = json.load(f)
+
+    # 2) Build LLM and graph once
+    llm = OllamaLLM(model=model_name, base_url=llm_base_url)
+    compiled_graph = build_graph(llm)
+
+    # 3) Run each eval in a loop via the agent
+    for key, case in evals.items():
+        user_input = case.get("user_input", "")
+        expected = case.get("parameters", {}).get("expected_results") or case.get("expected_results")
+        # expected_results sits at the top level of each example after enrichment.
+        # If it is not present yet, expected remains None and the comparison is skipped.
+
+        final_state = await run_agent_once(compiled_graph, user_input, llm)
+        sim_res, agent_run = extract_agent_simulation_result(final_state)
+
+        # Record agent run outputs
+        case["agent_run"] = {
+            "simulation_result": sim_res,
+            "meta": agent_run,
+        }
+
+        # 4) Compare to expected_results if available
+        if expected is not None and sim_res is not None:
+            cmp = compare_results(sim_res, expected, rtol=rtol, atol=atol)
+            case["comparison"] = cmp
+            case["passed"] = bool(cmp["pass"])
+        else:
+            case["comparison"] = {
+                "note": "Either expected_results or agent simulation_result missing; comparison skipped."
+            }
+            case["passed"] = False if expected is not None else None
+
+    # 5) Save enriched evals to the output path
+    with open(output_json_path, "w") as f:
+        json.dump(evals, f, indent=2)
+
+    print(f"Wrote evaluation results to {output_json_path}")
+
+
+if __name__ == "__main__":
+    # For CLI usage:
+    # python run_evals.py
+    # Optional: parameterize via env vars or argparse if desired.
+ asyncio.run(main()) From 1e1502078b6d85ef7868a2ace244c2a65951da22 Mon Sep 17 00:00:00 2001 From: TomMonks Date: Tue, 12 Aug 2025 13:26:29 +0100 Subject: [PATCH 18/23] feat(evals): trial run with gemma3:27b - no phoenix --- create_evals.py => evals/create_evals.py | 0 evals/evals_output.json | 633 ++++++++++------------- 2 files changed, 277 insertions(+), 356 deletions(-) rename create_evals.py => evals/create_evals.py (100%) diff --git a/create_evals.py b/evals/create_evals.py similarity index 100% rename from create_evals.py rename to evals/create_evals.py diff --git a/evals/evals_output.json b/evals/evals_output.json index 9453f85..a5ef66d 100644 --- a/evals/evals_output.json +++ b/evals/evals_output.json @@ -23,11 +23,11 @@ }, "agent_run": { "simulation_result": { - "01_mean_waiting_time": 2.9621655675237495, - "02_operator_util": 91.36685915350135, - "03_mean_nurse_waiting_time": 2.958020460174451, - "04_nurse_util": 84.3596921204659, - "05_callback_rate": 38.860103626943 + "01_mean_waiting_time": 2.3504694345055652, + "02_operator_util": 90.86219986414451, + "03_mean_nurse_waiting_time": 4.1106430489118715, + "04_nurse_util": 90.36357093206236, + "05_callback_rate": 42.5531914893617 }, "meta": { "retry_count": 0, @@ -39,7 +39,13 @@ "n_operators": 14, "n_nurses": 12, "mean_iat": 0.57, - "random_seed": 42, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.4, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, + "random_seed": 0, "run_length": 1000 }, "validation_result": { @@ -52,45 +58,51 @@ "n_operators": 14, "n_nurses": 12, "mean_iat": 0.57, - "random_seed": 42, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.4, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, + "random_seed": 0, "run_length": 1000 }, - "formatted_parameters": " ```\n**Parameters used in simulation**\n\n| Parameter | Value |\n|-----------|-------|\n| \ud83d\udcca n_operators | 14 |\n| \ud83d\udcca n_nurses | 12 |\n| \ud83d\udcca mean_iat | 0.57 |\n| \ud83d\udcca random_seed| 42 |\n| \ud83d\udcca run_length | 1000 |\n```" + "formatted_parameters": "**Parameters used in simulation**\n\n| \ud83d\udcc8 Parameter | Value |\n|---|---|\n| n_operators | 14 |\n| n_nurses | 12 |\n| mean_iat | 0.57 |\n| call_low | 5.0 |\n| call_mode | 7.0 |\n| call_high | 10.0 |\n| callback_prob | 0.4 |\n| nurse_consult_low | 10.0 |\n| nurse_consult_high | 20.0 |\n| random_seed | 0 |\n| run_length | 1000 |\n" } }, "comparison": { - "pass": false, + "pass": true, "details": { "01_mean_waiting_time": { "expected": 2.3504694345055652, - "got": 2.9621655675237495, - "match": false + "got": 2.3504694345055652, + "match": true }, "02_operator_util": { "expected": 90.86219986414451, - "got": 91.36685915350135, - "match": false + "got": 90.86219986414451, + "match": true }, "03_mean_nurse_waiting_time": { "expected": 4.1106430489118715, - "got": 2.958020460174451, - "match": false + "got": 4.1106430489118715, + "match": true }, "04_nurse_util": { "expected": 90.36357093206236, - "got": 84.3596921204659, - "match": false + "got": 90.36357093206236, + "match": true }, "05_callback_rate": { "expected": 42.5531914893617, - "got": 38.860103626943, - "match": false + "got": 42.5531914893617, + "match": true } }, "rtol": 0.001, "atol": 1e-06 }, - "passed": false + "passed": true }, "example_2": { "user_input": "Run scenario with double staffing and normal call volume", @@ -116,11 +128,11 @@ }, "agent_run": { "simulation_result": { - "01_mean_waiting_time": 0.24486927533243114, 
- "02_operator_util": 83.1290576750028, - "03_mean_nurse_waiting_time": 212.0429575856493, - "04_nurse_util": 97.94215299366186, - "05_callback_rate": 39.5769361992494 + "01_mean_waiting_time": 0.0, + "02_operator_util": 46.452122409679156, + "03_mean_nurse_waiting_time": 43.94073145744734, + "04_nurse_util": 97.41291238872327, + "05_callback_rate": 42.12121212121212 }, "meta": { "retry_count": 0, @@ -131,8 +143,14 @@ "parameters": { "n_operators": 26, "n_nurses": 10, - "mean_iat": 0.335, - "random_seed": 42, + "mean_iat": 0.6, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.4, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, + "random_seed": 0, "run_length": 1000 }, "validation_result": { @@ -144,11 +162,17 @@ "parameters": { "n_operators": 26, "n_nurses": 10, - "mean_iat": 0.335, - "random_seed": 42, + "mean_iat": 0.6, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.4, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, + "random_seed": 0, "run_length": 1000 }, - "formatted_parameters": " ```markdown\n\ud83d\udcca **Parameters** | **Value**\n-----------------|--------\nn_operators | 26\nn_nurses | 10\nmean_iat | 0.335\nrandom_seed | 42\nrun_length | 1000\n```" + "formatted_parameters": "**Parameters used in simulation**\n\n| \ud83d\udcc8 Parameter | Value |\n|---|---|\n| n_operators | 26 |\n| n_nurses | 10 |\n| mean_iat | 0.6 |\n| call_low | 5.0 |\n| call_mode | 7.0 |\n| call_high | 10.0 |\n| callback_prob | 0.4 |\n| nurse_consult_low | 10.0 |\n| nurse_consult_high | 20.0 |\n| random_seed | 0 |\n| run_length | 1000 |\n" } }, "comparison": { @@ -156,28 +180,28 @@ "details": { "01_mean_waiting_time": { "expected": 0.0, - "got": 0.24486927533243114, - "match": false + "got": 0.0, + "match": true }, "02_operator_util": { "expected": 46.452122409679156, - "got": 83.1290576750028, - "match": false + "got": 46.452122409679156, + "match": true }, "03_mean_nurse_waiting_time": { "expected": 0.0041045663539504606, - "got": 212.0429575856493, + "got": 43.94073145744734, "match": false }, "04_nurse_util": { "expected": 51.20346053052576, - "got": 97.94215299366186, + "got": 97.41291238872327, "match": false }, "05_callback_rate": { "expected": 42.12121212121212, - "got": 39.5769361992494, - "match": false + "got": 42.12121212121212, + "match": true } }, "rtol": 0.001, @@ -209,58 +233,28 @@ }, "agent_run": { "simulation_result": { - "01_mean_waiting_time": 258.690360654302, - "02_operator_util": 99.44580381509923, - "03_mean_nurse_waiting_time": 78.94106695388635, + "01_mean_waiting_time": 230.08648598465186, + "02_operator_util": 99.35481108027054, + "03_mean_nurse_waiting_time": 78.94871019130441, "04_nurse_util": 97.8390040780754, - "05_callback_rate": 42.5414364640884 + "05_callback_rate": 42.48847926267281 }, "meta": { - "retry_count": 3, + "retry_count": 0, "had_bailout": false, "validation_history": [ { "attempt": 1, - "parameters": { - "__json_parse_error__": true - }, - "validation_result": { - "is_valid": false, - "errors": [ - "Unknown parameter: __json_parse_error__" - ] - } - }, - { - "attempt": 2, - "parameters": { - "__json_parse_error__": true - }, - "validation_result": { - "is_valid": false, - "errors": [ - "Unknown parameter: __json_parse_error__" - ] - } - }, - { - "attempt": 3, - "parameters": { - "__json_parse_error__": true - }, - "validation_result": { - "is_valid": false, - "errors": [ - "Unknown parameter: __json_parse_error__" - ] - } - }, - { - "attempt": 4, "parameters": { "n_operators": 8, 
"n_nurses": 6, - "mean_iat": 0.45, + "mean_iat": 0.504, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.4, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, "random_seed": 0, "run_length": 1000 }, @@ -273,11 +267,17 @@ "parameters": { "n_operators": 8, "n_nurses": 6, - "mean_iat": 0.45, + "mean_iat": 0.504, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.4, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, "random_seed": 0, "run_length": 1000 }, - "formatted_parameters": " ```\n**Parameters used in simulation**\n\n| Parameter | Value |\n|-----------|-------|\n| :bar: n_operators | 8 |\n| :bar: n_nurses | 6 |\n| :bar: mean_iat | 0.45 |\n| :bar: random_seed | 0 |\n| :bar: run_length | 1000 |\n```" + "formatted_parameters": "**Parameters used in simulation**\n\n| \ud83d\udcc8 Parameter | Value |\n|---|---|\n| n_operators | 8 |\n| n_nurses | 6 |\n| mean_iat | 0.504 |\n| call_low | 5.0 |\n| call_mode | 7.0 |\n| call_high | 10.0 |\n| callback_prob | 0.4 |\n| nurse_consult_low | 10.0 |\n| nurse_consult_high | 20.0 |\n| random_seed | 0 |\n| run_length | 1000 |\n" } }, "comparison": { @@ -285,17 +285,17 @@ "details": { "01_mean_waiting_time": { "expected": 242.69198606467467, - "got": 258.690360654302, + "got": 230.08648598465186, "match": false }, "02_operator_util": { "expected": 99.35481108027054, - "got": 99.44580381509923, + "got": 99.35481108027054, "match": true }, "03_mean_nurse_waiting_time": { "expected": 78.94431797274639, - "got": 78.94106695388635, + "got": 78.94871019130441, "match": true }, "04_nurse_util": { @@ -305,8 +305,8 @@ }, "05_callback_rate": { "expected": 42.48847926267281, - "got": 42.5414364640884, - "match": false + "got": 42.48847926267281, + "match": true } }, "rtol": 0.001, @@ -339,10 +339,10 @@ "agent_run": { "simulation_result": { "01_mean_waiting_time": 0.0, - "02_operator_util": 27.417403035068194, + "02_operator_util": 31.21492694742085, "03_mean_nurse_waiting_time": 0.0, - "04_nurse_util": 28.0192637065108, - "05_callback_rate": 38.22341857335128 + "04_nurse_util": 35.17900361818962, + "05_callback_rate": 42.17289719626168 }, "meta": { "retry_count": 0, @@ -353,8 +353,14 @@ "parameters": { "n_operators": 20, "n_nurses": 15, - "mean_iat": 1.3, - "random_seed": 42, + "mean_iat": 1.2, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.4, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, + "random_seed": 0, "run_length": 1000 }, "validation_result": { @@ -366,15 +372,21 @@ "parameters": { "n_operators": 20, "n_nurses": 15, - "mean_iat": 1.3, - "random_seed": 42, + "mean_iat": 1.2, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.4, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, + "random_seed": 0, "run_length": 1000 }, - "formatted_parameters": " ```\n\ud83d\udcca **Parameters used in simulation**\n| Parameter | Value |\n|-----------|-------|\n| n_operators | 20 |\n| n_nurses | 15 |\n| mean_iat | 1.3 |\n| random_seed | 42 |\n| run_length | 1000 |\n```" + "formatted_parameters": "**Parameters used in simulation**\n\n| \ud83d\udcc8 Parameter | Value |\n|---|---|\n| n_operators | 20 |\n| n_nurses | 15 |\n| mean_iat | 1.2 |\n| call_low | 5.0 |\n| call_mode | 7.0 |\n| call_high | 10.0 |\n| callback_prob | 0.4 |\n| nurse_consult_low | 10.0 |\n| nurse_consult_high | 20.0 |\n| random_seed | 0 |\n| run_length | 1000 |\n" } }, "comparison": { - "pass": false, + "pass": true, "details": { "01_mean_waiting_time": 
{ "expected": 0.0, @@ -383,8 +395,8 @@ }, "02_operator_util": { "expected": 31.21492694742085, - "got": 27.417403035068194, - "match": false + "got": 31.21492694742085, + "match": true }, "03_mean_nurse_waiting_time": { "expected": 0.0, @@ -393,19 +405,19 @@ }, "04_nurse_util": { "expected": 35.17900361818962, - "got": 28.0192637065108, - "match": false + "got": 35.17900361818962, + "match": true }, "05_callback_rate": { "expected": 42.17289719626168, - "got": 38.22341857335128, - "match": false + "got": 42.17289719626168, + "match": true } }, "rtol": 0.001, "atol": 1e-06 }, - "passed": false + "passed": true }, "example_5": { "user_input": "Run with 10 operators, 8 nurses, 30% callback probability, and 10% reduced demand", @@ -431,75 +443,29 @@ }, "agent_run": { "simulation_result": { - "01_mean_waiting_time": 0.6478945342567207, - "02_operator_util": 76.87777817159127, - "03_mean_nurse_waiting_time": 0.30988264770159224, - "04_nurse_util": 65.36044859488209, - "05_callback_rate": 29.18287937743191 + "01_mean_waiting_time": 46.3970260873325, + "02_operator_util": 98.6111946659619, + "03_mean_nurse_waiting_time": 2.290083053642536, + "04_nurse_util": 82.07127437115734, + "05_callback_rate": 33.11061618411284 }, "meta": { - "retry_count": 3, + "retry_count": 0, "had_bailout": false, "validation_history": [ { "attempt": 1, - "parameters": { - "__json_parse_error__": true - }, - "validation_result": { - "is_valid": false, - "errors": [ - "Unknown parameter: __json_parse_error__" - ] - } - }, - { - "attempt": 2, - "parameters": { - "__json_parse_error__": true - }, - "validation_result": { - "is_valid": false, - "errors": [ - "Unknown parameter: __json_parse_error__" - ] - } - }, - { - "attempt": 3, "parameters": { "n_operators": 10, "n_nurses": 8, - "mean_iat": 0.74, - "callback_prob": 0.3, - "call_mode": 4.2, - "call_high": 9.0, - "call_low": 5.1, - "nurse_consult_low": 10.0, - "nurse_consult_high": 18.0, - "random_seed": 42, - "run_length": 600 - }, - "validation_result": { - "is_valid": false, - "errors": [ - "call_low \u2264 call_mode \u2264 call_high violated" - ] - } - }, - { - "attempt": 4, - "parameters": { - "n_operators": 9, - "n_nurses": 8, - "mean_iat": 0.78, - "call_low": 4.2, - "call_mode": 5.64, - "call_high": 6.3, + "mean_iat": 0.69, "callback_prob": 0.3, "nurse_consult_low": 10.0, - "nurse_consult_high": 18.0, - "random_seed": 42, + "nurse_consult_high": 20.0, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "random_seed": 0, "run_length": 1000 }, "validation_result": { @@ -509,19 +475,19 @@ } ], "parameters": { - "n_operators": 9, + "n_operators": 10, "n_nurses": 8, - "mean_iat": 0.78, - "call_low": 4.2, - "call_mode": 5.64, - "call_high": 6.3, + "mean_iat": 0.69, "callback_prob": 0.3, "nurse_consult_low": 10.0, - "nurse_consult_high": 18.0, - "random_seed": 42, + "nurse_consult_high": 20.0, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "random_seed": 0, "run_length": 1000 }, - "formatted_parameters": " **Parameters used in simulation**\n\n| \ud83d\udcca Parameter | Value |\n|-------------|---|\n| n_operators | 9 |\n| n_nurses | 8 |\n| mean_iat | 0.78 |\n| call_low | 4.2 |\n| call_mode | 5.64 |\n| call_high | 6.3 |\n| callback_prob | 0.3 |\n| nurse_consult_low | 10.0 |\n| nurse_consult_high | 18.0 |\n| random_seed | 42 |\n| run_length | 1000 |" + "formatted_parameters": "**Parameters used in simulation**\n\n| \ud83d\udcc8 Parameter | Value |\n|---|---|\n| n_operators | 10 |\n| n_nurses | 8 |\n| mean_iat | 0.69 |\n| callback_prob | 0.3 |\n| 
nurse_consult_low | 10.0 |\n| nurse_consult_high | 20.0 |\n| call_low | 5.0 |\n| call_mode | 7.0 |\n| call_high | 10.0 |\n| random_seed | 0 |\n| run_length | 1000 |\n" } }, "comparison": { @@ -529,27 +495,27 @@ "details": { "01_mean_waiting_time": { "expected": 62.26336943615235, - "got": 0.6478945342567207, + "got": 46.3970260873325, "match": false }, "02_operator_util": { "expected": 98.91827940297605, - "got": 76.87777817159127, + "got": 98.6111946659619, "match": false }, "03_mean_nurse_waiting_time": { "expected": 2.3551925550777337, - "got": 0.30988264770159224, + "got": 2.290083053642536, "match": false }, "04_nurse_util": { "expected": 82.46774067208518, - "got": 65.36044859488209, + "got": 82.07127437115734, "match": false }, "05_callback_rate": { "expected": 33.012583271650634, - "got": 29.18287937743191, + "got": 33.11061618411284, "match": false } }, @@ -582,11 +548,11 @@ }, "agent_run": { "simulation_result": { - "01_mean_waiting_time": 0.5518432002497017, - "02_operator_util": 89.32672670938162, - "03_mean_nurse_waiting_time": 0.12567485568024855, - "04_nurse_util": 74.88123258139942, - "05_callback_rate": 41.477832512315274 + "01_mean_waiting_time": 161.26826119772568, + "02_operator_util": 99.27336565709264, + "03_mean_nurse_waiting_time": 0.4505143843086236, + "04_nurse_util": 83.59706413413956, + "05_callback_rate": 41.69621749408983 }, "meta": { "retry_count": 0, @@ -597,7 +563,13 @@ "parameters": { "n_operators": 25, "n_nurses": 25, - "mean_iat": 0.33, + "mean_iat": 0.2, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.4, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, "random_seed": 0, "run_length": 1000 }, @@ -610,46 +582,52 @@ "parameters": { "n_operators": 25, "n_nurses": 25, - "mean_iat": 0.33, + "mean_iat": 0.2, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.4, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, "random_seed": 0, "run_length": 1000 }, - "formatted_parameters": " ```\n\ud83d\udcca Parameters used in simulation\n| Parameter | Value |\n|-----------|-------|\n| n_operators | 25 |\n| n_nurses | 25 |\n| mean_iat | 0.33 |\n| random_seed| 0 |\n| run_length | 1000 |\n```" + "formatted_parameters": "**Parameters used in simulation**\n\n| \ud83d\udcc8 Parameter | Value |\n|---|---|\n| n_operators | 25 |\n| n_nurses | 25 |\n| mean_iat | 0.2 |\n| call_low | 5.0 |\n| call_mode | 7.0 |\n| call_high | 10.0 |\n| callback_prob | 0.4 |\n| nurse_consult_low | 10.0 |\n| nurse_consult_high | 20.0 |\n| random_seed | 0 |\n| run_length | 1000 |\n" } }, "comparison": { - "pass": false, + "pass": true, "details": { "01_mean_waiting_time": { "expected": 161.26826119772568, - "got": 0.5518432002497017, - "match": false + "got": 161.26826119772568, + "match": true }, "02_operator_util": { "expected": 99.27336565709264, - "got": 89.32672670938162, - "match": false + "got": 99.27336565709264, + "match": true }, "03_mean_nurse_waiting_time": { "expected": 0.4505143843086236, - "got": 0.12567485568024855, - "match": false + "got": 0.4505143843086236, + "match": true }, "04_nurse_util": { "expected": 83.59706413413956, - "got": 74.88123258139942, - "match": false + "got": 83.59706413413956, + "match": true }, "05_callback_rate": { "expected": 41.69621749408983, - "got": 41.477832512315274, - "match": false + "got": 41.69621749408983, + "match": true } }, "rtol": 0.001, "atol": 1e-06 }, - "passed": false + "passed": true }, "example_7": { "user_input": "Simulate 15 operators, 12 nurses, call 
durations 3-6-9 minutes, 2000 minute runtime", @@ -674,84 +652,87 @@ "05_callback_rate": 41.684147437818396 }, "agent_run": { - "simulation_result": null, + "simulation_result": { + "01_mean_waiting_time": 0.07192106569784922, + "02_operator_util": 66.74240386374329, + "03_mean_nurse_waiting_time": 2.663802537251699, + "04_nurse_util": 86.43094276820304, + "05_callback_rate": 41.684147437818396 + }, "meta": { - "retry_count": 4, + "retry_count": 0, "had_bailout": false, "validation_history": [ { "attempt": 1, - "parameters": { - "__json_parse_error__": true - }, - "validation_result": { - "is_valid": false, - "errors": [ - "Unknown parameter: __json_parse_error__" - ] - } - }, - { - "attempt": 2, "parameters": { "n_operators": 15, "n_nurses": 12, - "mean_iat": 0.67, - "call_low": 3, - "call_mode": 6, - "call_high": 9, + "mean_iat": 0.6, + "call_low": 3.0, + "call_mode": 6.0, + "call_high": 9.0, "callback_prob": 0.4, - "nurse_consult_low": 10, - "nurse_consult_high": 20, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, "random_seed": 0, "run_length": 2000 }, "validation_result": { - "is_valid": false, - "errors": [ - "call_low must be float", - "call_mode must be float", - "call_high must be float", - "nurse_consult_low must be float", - "nurse_consult_high must be float" - ] - } - }, - { - "attempt": 3, - "parameters": { - "__json_parse_error__": true - }, - "validation_result": { - "is_valid": false, - "errors": [ - "Unknown parameter: __json_parse_error__" - ] - } - }, - { - "attempt": 4, - "parameters": { - "__json_parse_error__": true - }, - "validation_result": { - "is_valid": false, - "errors": [ - "Unknown parameter: __json_parse_error__" - ] + "is_valid": true, + "errors": [] } } ], "parameters": { - "__json_parse_error__": true + "n_operators": 15, + "n_nurses": 12, + "mean_iat": 0.6, + "call_low": 3.0, + "call_mode": 6.0, + "call_high": 9.0, + "callback_prob": 0.4, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, + "random_seed": 0, + "run_length": 2000 }, - "formatted_parameters": null + "formatted_parameters": "**Parameters used in simulation**\n\n| \ud83d\udcca Parameter | Value |\n|---|---|\n| n_operators | 15 |\n| n_nurses | 12 |\n| mean_iat | 0.6 |\n| call_low | 3.0 |\n| call_mode | 6.0 |\n| call_high | 9.0 |\n| callback_prob | 0.4 |\n| nurse_consult_low | 10.0 |\n| nurse_consult_high | 20.0 |\n| random_seed | 0 |\n| run_length | 2000 |\n" } }, "comparison": { - "note": "Either expected_results or agent simulation_result missing; comparison skipped." 
+ "pass": true, + "details": { + "01_mean_waiting_time": { + "expected": 0.07192106569784922, + "got": 0.07192106569784922, + "match": true + }, + "02_operator_util": { + "expected": 66.74240386374329, + "got": 66.74240386374329, + "match": true + }, + "03_mean_nurse_waiting_time": { + "expected": 2.663802537251699, + "got": 2.663802537251699, + "match": true + }, + "04_nurse_util": { + "expected": 86.43094276820304, + "got": 86.43094276820304, + "match": true + }, + "05_callback_rate": { + "expected": 41.684147437818396, + "got": 41.684147437818396, + "match": true + } + }, + "rtol": 0.001, + "atol": 1e-06 }, - "passed": false + "passed": true }, "example_8": { "user_input": "Run with 6 operators, 4 nurses, 60% callback rate, and 25% higher demand", @@ -777,11 +758,11 @@ }, "agent_run": { "simulation_result": { - "01_mean_waiting_time": 330.661658162098, - "02_operator_util": 99.47452901988686, - "03_mean_nurse_waiting_time": 223.8173355488663, - "04_nurse_util": 98.39302830500621, - "05_callback_rate": 61.76470588235294 + "01_mean_waiting_time": 306.74172321599656, + "02_operator_util": 99.45155480236895, + "03_mean_nurse_waiting_time": 196.98238666596234, + "04_nurse_util": 97.69465771729217, + "05_callback_rate": 57.8239608801956 }, "meta": { "retry_count": 0, @@ -792,10 +773,9 @@ "parameters": { "n_operators": 6, "n_nurses": 4, - "mean_iat": 0.42, - "call_mode": 7.8, + "mean_iat": 0.48, "callback_prob": 0.6, - "random_seed": 42, + "random_seed": 0, "run_length": 1000 }, "validation_result": { @@ -807,13 +787,12 @@ "parameters": { "n_operators": 6, "n_nurses": 4, - "mean_iat": 0.42, - "call_mode": 7.8, + "mean_iat": 0.48, "callback_prob": 0.6, - "random_seed": 42, + "random_seed": 0, "run_length": 1000 }, - "formatted_parameters": " ```\n\ud83d\udcca Parameters used in simulation\n------------------------------\n| Parameter | Value |\n| --------------- | ------- |\n| n_operators | 6 |\n| n_nurses | 4 |\n| mean_iat | 0.42 |\n| call_mode | 7.8 |\n| callback_prob | 0.6 |\n| random_seed | 42 |\n| run_length | 1000 |\n```" + "formatted_parameters": "**Parameters used in simulation**\n\n| \ud83d\udcca Parameter | Value |\n|---|---|\n| n_operators | 6 |\n| n_nurses | 4 |\n| mean_iat | 0.48 |\n| callback_prob | 0.6 |\n| random_seed | 0 |\n| run_length | 1000 |\n" } }, "comparison": { @@ -821,28 +800,28 @@ "details": { "01_mean_waiting_time": { "expected": 318.98668257451686, - "got": 330.661658162098, + "got": 306.74172321599656, "match": false }, "02_operator_util": { "expected": 99.56949574531609, - "got": 99.47452901988686, - "match": true + "got": 99.45155480236895, + "match": false }, "03_mean_nurse_waiting_time": { "expected": 196.98753442211358, - "got": 223.8173355488663, - "match": false + "got": 196.98238666596234, + "match": true }, "04_nurse_util": { "expected": 97.69465771729217, - "got": 98.39302830500621, - "match": false + "got": 97.69465771729217, + "match": true }, "05_callback_rate": { "expected": 57.87545787545788, - "got": 61.76470588235294, - "match": false + "got": 57.8239608801956, + "match": true } }, "rtol": 0.001, @@ -881,23 +860,11 @@ "05_callback_rate": 42.12121212121212 }, "meta": { - "retry_count": 1, + "retry_count": 0, "had_bailout": false, "validation_history": [ { "attempt": 1, - "parameters": { - "__json_parse_error__": true - }, - "validation_result": { - "is_valid": false, - "errors": [ - "Unknown parameter: __json_parse_error__" - ] - } - }, - { - "attempt": 2, "parameters": { "n_operators": 18, "n_nurses": 14, @@ -930,7 +897,7 @@ "random_seed": 0, 
"run_length": 1000 }, - "formatted_parameters": " ```md\n\ud83d\udcca **Parameters used in simulation**\n| Parameter | Value |\n| --- | --- |\n| n_operators | 18 |\n| n_nurses | 14 |\n| mean_iat | 0.6 |\n| call_low | 5.0 |\n| call_mode | 7.0 |\n| call_high | 10.0 |\n| callback_prob | 0.4 |\n| nurse_consult_low | 15.0 |\n| nurse_consult_high | 25.0 |\n| random_seed | 0 |\n| run_length | 1000 |\n```" + "formatted_parameters": "**Parameters used in simulation**\n\n| \ud83d\udcc8 Parameter | Value |\n|---|---|\n| n_operators | 18 |\n| n_nurses | 14 |\n| mean_iat | 0.6 |\n| call_low | 5.0 |\n| call_mode | 7.0 |\n| call_high | 10.0 |\n| callback_prob | 0.4 |\n| nurse_consult_low | 15.0 |\n| nurse_consult_high | 25.0 |\n| random_seed | 0 |\n| run_length | 1000 |\n" } }, "comparison": { @@ -991,14 +958,14 @@ }, "agent_run": { "simulation_result": { - "01_mean_waiting_time": 0.0, - "02_operator_util": 23.33484123800376, - "03_mean_nurse_waiting_time": 8.70119617931026, - "04_nurse_util": 87.09750711871568, - "05_callback_rate": 47.963800904977376 + "01_mean_waiting_time": 1.2583822886248166, + "02_operator_util": 84.9856571728051, + "03_mean_nurse_waiting_time": 46.17149830527762, + "04_nurse_util": 97.12003253429981, + "05_callback_rate": 50.43103448275862 }, "meta": { - "retry_count": 2, + "retry_count": 0, "had_bailout": false, "validation_history": [ { @@ -1006,60 +973,14 @@ "parameters": { "n_operators": 12, "n_nurses": 10, + "mean_iat": 0.72, "callback_prob": 0.5, - "mean_iat": 0.9, - "call_mode": 3.6, - "call_low": 4.8, - "call_high": 7.2, - "nurse_consult_low": 16, - "nurse_consult_high": 24, - "random_seed": 0, - "run_length": 1000 - }, - "validation_result": { - "is_valid": false, - "errors": [ - "nurse_consult_low must be float", - "nurse_consult_high must be float", - "call_low \u2264 call_mode \u2264 call_high violated" - ] - } - }, - { - "attempt": 2, - "parameters": { - "n_operators": 12, - "n_nurses": 10, - "mean_iat": 0.48, - "callback_prob": 0.5, - "call_low": 6.0, - "call_mode": 7.2, - "call_high": 9.6, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, "nurse_consult_low": 10.0, - "nurse_consult_high": 8.0, - "random_seed": 42, - "run_length": 1000 - }, - "validation_result": { - "is_valid": false, - "errors": [ - "nurse_consult_low \u2264 nurse_consult_high violated" - ] - } - }, - { - "attempt": 3, - "parameters": { - "n_operators": 12, - "n_nurses": 10, - "mean_iat": 1.48, - "callback_prob": 0.5, - "call_mode": 3.6, - "call_low": 3.0, - "call_high": 6.0, - "nurse_consult_low": 24.0, - "nurse_consult_high": 32.0, - "random_seed": 42, + "nurse_consult_high": 20.0, + "random_seed": 0, "run_length": 1000 }, "validation_result": { @@ -1071,17 +992,17 @@ "parameters": { "n_operators": 12, "n_nurses": 10, - "mean_iat": 1.48, + "mean_iat": 0.72, "callback_prob": 0.5, - "call_mode": 3.6, - "call_low": 3.0, - "call_high": 6.0, - "nurse_consult_low": 24.0, - "nurse_consult_high": 32.0, - "random_seed": 42, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, + "random_seed": 0, "run_length": 1000 }, - "formatted_parameters": " **Parameters used in simulation**\n\n| :heavy_check_mark: Parameter | Value |\n|-----------------------------|-------|\n| n\\_operators | 12 |\n| n\\_nurses | 10 |\n| mean\\_iat | 1.48 |\n| callback\\_prob | 0.5 |\n| call\\_mode | 3.6 |\n| call\\_low | 3.0 |\n| call\\_high | 6.0 |\n| nurse\\_consult\\_low | 24.0 |\n| nurse\\_consult\\_high | 32.0 |\n| random\\_seed | 42 |\n| run\\_length | 
1000 |"
+        "formatted_parameters": "**Parameters used in simulation**\n\n| \ud83d\udcca Parameter | Value |\n|---|---|\n| n_operators | 12 |\n| n_nurses | 10 |\n| mean_iat | 0.72 |\n| callback_prob | 0.5 |\n| call_low | 5.0 |\n| call_mode | 7.0 |\n| call_high | 10.0 |\n| nurse_consult_low | 10.0 |\n| nurse_consult_high | 20.0 |\n| random_seed | 0 |\n| run_length | 1000 |\n"
       }
     },
     "comparison": {
@@ -1089,27 +1010,27 @@
       "details": {
         "01_mean_waiting_time": {
           "expected": 0.38280107383630124,
-          "got": 0.0,
+          "got": 1.2583822886248166,
           "match": false
         },
         "02_operator_util": {
           "expected": 73.44752978233386,
-          "got": 23.33484123800376,
+          "got": 84.9856571728051,
           "match": false
         },
         "03_mean_nurse_waiting_time": {
           "expected": 4.147154100012408,
-          "got": 8.70119617931026,
+          "got": 46.17149830527762,
           "match": false
         },
         "04_nurse_util": {
           "expected": 90.70926116406883,
-          "got": 87.09750711871568,
+          "got": 97.12003253429981,
           "match": false
         },
         "05_callback_rate": {
           "expected": 51.4546965918537,
-          "got": 47.963800904977376,
+          "got": 50.43103448275862,
           "match": false
         }
       },

From 5f896a64522494308bb4a4236df5bcdfe038068a Mon Sep 17 00:00:00 2001
From: TomMonks
Date: Tue, 12 Aug 2025 14:12:35 +0100
Subject: [PATCH 19/23] docs(env) + main arize-phoenix

---
 environment.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/environment.yml b/environment.yml
index e974ae9..1946eb8 100644
--- a/environment.yml
+++ b/environment.yml
@@ -1,4 +1,4 @@
-name: des-agent
+name: sim-agent
 channels:
   - conda-forge
 dependencies:
@@ -14,6 +14,7 @@ dependencies:
   - tabulate
   - langgraph
   - pip:
+    - arize-phoenix
    - arize-phoenix-otel
    - langchain
    - langchain-community

From 42c7a8b2cdd9104e09218cceadac374a3480fba3 Mon Sep 17 00:00:00 2001
From: TomMonks
Date: Tue, 12 Aug 2025 16:15:14 +0100
Subject: [PATCH 20/23] feat(evals): run all evals in one go or limit to n of set

---
 agent_self_reflection.py |   2 +-
 evals/evals_output.json  | 914 +++++----------------------------------
 run_evals.py             | 329 ++++++++------
 3 files changed, 315 insertions(+), 930 deletions(-)

diff --git a/agent_self_reflection.py b/agent_self_reflection.py
index 11c869f..ae6d18f 100644
--- a/agent_self_reflection.py
+++ b/agent_self_reflection.py
@@ -34,7 +34,7 @@ Hard constraints
 -------------
 MAX_RETRIES : int
-    Maximum number of parameter generation attempts before bailout (default: 4)
+    Maximum number of parameter generation attempts before bailout (default: 4)
 
 Examples
 --------
diff --git a/evals/evals_output.json b/evals/evals_output.json
index a5ef66d..063aeb5 100644
--- a/evals/evals_output.json
+++ b/evals/evals_output.json
@@ -21,88 +21,18 @@
       "04_nurse_util": 90.36357093206236,
       "05_callback_rate": 42.5531914893617
     },
-    "agent_run": {
-      "simulation_result": {
-        "01_mean_waiting_time": 2.3504694345055652,
-        "02_operator_util": 90.86219986414451,
-        "03_mean_nurse_waiting_time": 4.1106430489118715,
-        "04_nurse_util": 90.36357093206236,
-        "05_callback_rate": 42.5531914893617
-      },
-      "meta": {
-        "retry_count": 0,
-        "had_bailout": false,
-        "validation_history": [
-          {
-            "attempt": 1,
-            "parameters": {
-              "n_operators": 14,
-              "n_nurses": 12,
-              "mean_iat": 0.57,
-              "call_low": 5.0,
-              "call_mode": 7.0,
-              "call_high": 10.0,
-              "callback_prob": 0.4,
-              "nurse_consult_low": 10.0,
-              "nurse_consult_high": 20.0,
-              "random_seed": 0,
-              "run_length": 1000
-            },
-            "validation_result": {
-              "is_valid": true,
-              "errors": []
-            }
-          }
-        ],
-        "parameters": {
-          "n_operators": 14,
-          "n_nurses": 12,
-          "mean_iat": 0.57,
-          "call_low": 5.0,
-          "call_mode": 7.0,
-          "call_high": 10.0,
- "callback_prob": 0.4, - "nurse_consult_low": 10.0, - "nurse_consult_high": 20.0, - "random_seed": 0, - "run_length": 1000 - }, - "formatted_parameters": "**Parameters used in simulation**\n\n| \ud83d\udcc8 Parameter | Value |\n|---|---|\n| n_operators | 14 |\n| n_nurses | 12 |\n| mean_iat | 0.57 |\n| call_low | 5.0 |\n| call_mode | 7.0 |\n| call_high | 10.0 |\n| callback_prob | 0.4 |\n| nurse_consult_low | 10.0 |\n| nurse_consult_high | 20.0 |\n| random_seed | 0 |\n| run_length | 1000 |\n" - } - }, - "comparison": { - "pass": true, - "details": { - "01_mean_waiting_time": { - "expected": 2.3504694345055652, - "got": 2.3504694345055652, - "match": true - }, - "02_operator_util": { - "expected": 90.86219986414451, - "got": 90.86219986414451, - "match": true - }, - "03_mean_nurse_waiting_time": { - "expected": 4.1106430489118715, - "got": 4.1106430489118715, - "match": true - }, - "04_nurse_util": { - "expected": 90.36357093206236, - "got": 90.36357093206236, - "match": true - }, - "05_callback_rate": { - "expected": 42.5531914893617, - "got": 42.5531914893617, - "match": true - } - }, - "rtol": 0.001, - "atol": 1e-06 + "agent_result": { + "01_mean_waiting_time": 2.3504694345055652, + "02_operator_util": 90.86219986414451, + "03_mean_nurse_waiting_time": 4.1106430489118715, + "04_nurse_util": 90.36357093206236, + "05_callback_rate": 42.5531914893617 }, - "passed": true + "passed": true, + "context": { + "trace_id": "f5261ab566c4e5bebfbea7cc7f28b71c", + "span_id": "fd13f4f202d0d7a5" + } }, "example_2": { "user_input": "Run scenario with double staffing and normal call volume", @@ -126,88 +56,18 @@ "04_nurse_util": 51.20346053052576, "05_callback_rate": 42.12121212121212 }, - "agent_run": { - "simulation_result": { - "01_mean_waiting_time": 0.0, - "02_operator_util": 46.452122409679156, - "03_mean_nurse_waiting_time": 43.94073145744734, - "04_nurse_util": 97.41291238872327, - "05_callback_rate": 42.12121212121212 - }, - "meta": { - "retry_count": 0, - "had_bailout": false, - "validation_history": [ - { - "attempt": 1, - "parameters": { - "n_operators": 26, - "n_nurses": 10, - "mean_iat": 0.6, - "call_low": 5.0, - "call_mode": 7.0, - "call_high": 10.0, - "callback_prob": 0.4, - "nurse_consult_low": 10.0, - "nurse_consult_high": 20.0, - "random_seed": 0, - "run_length": 1000 - }, - "validation_result": { - "is_valid": true, - "errors": [] - } - } - ], - "parameters": { - "n_operators": 26, - "n_nurses": 10, - "mean_iat": 0.6, - "call_low": 5.0, - "call_mode": 7.0, - "call_high": 10.0, - "callback_prob": 0.4, - "nurse_consult_low": 10.0, - "nurse_consult_high": 20.0, - "random_seed": 0, - "run_length": 1000 - }, - "formatted_parameters": "**Parameters used in simulation**\n\n| \ud83d\udcc8 Parameter | Value |\n|---|---|\n| n_operators | 26 |\n| n_nurses | 10 |\n| mean_iat | 0.6 |\n| call_low | 5.0 |\n| call_mode | 7.0 |\n| call_high | 10.0 |\n| callback_prob | 0.4 |\n| nurse_consult_low | 10.0 |\n| nurse_consult_high | 20.0 |\n| random_seed | 0 |\n| run_length | 1000 |\n" - } - }, - "comparison": { - "pass": false, - "details": { - "01_mean_waiting_time": { - "expected": 0.0, - "got": 0.0, - "match": true - }, - "02_operator_util": { - "expected": 46.452122409679156, - "got": 46.452122409679156, - "match": true - }, - "03_mean_nurse_waiting_time": { - "expected": 0.0041045663539504606, - "got": 43.94073145744734, - "match": false - }, - "04_nurse_util": { - "expected": 51.20346053052576, - "got": 97.41291238872327, - "match": false - }, - "05_callback_rate": { - "expected": 42.12121212121212, - 
"got": 42.12121212121212, - "match": true - } - }, - "rtol": 0.001, - "atol": 1e-06 + "agent_result": { + "01_mean_waiting_time": 0.0, + "02_operator_util": 46.452122409679156, + "03_mean_nurse_waiting_time": 43.94073145744734, + "04_nurse_util": 97.41291238872327, + "05_callback_rate": 42.12121212121212 }, - "passed": false + "passed": false, + "context": { + "trace_id": "c93069f691d1a1edbca3886db55bc2cc", + "span_id": "1c4b8ed595ac9a41" + } }, "example_3": { "user_input": "Test configuration with 8 operators, 6 nurses, and 20% increased demand", @@ -231,88 +91,18 @@ "04_nurse_util": 97.8390040780754, "05_callback_rate": 42.48847926267281 }, - "agent_run": { - "simulation_result": { - "01_mean_waiting_time": 230.08648598465186, - "02_operator_util": 99.35481108027054, - "03_mean_nurse_waiting_time": 78.94871019130441, - "04_nurse_util": 97.8390040780754, - "05_callback_rate": 42.48847926267281 - }, - "meta": { - "retry_count": 0, - "had_bailout": false, - "validation_history": [ - { - "attempt": 1, - "parameters": { - "n_operators": 8, - "n_nurses": 6, - "mean_iat": 0.504, - "call_low": 5.0, - "call_mode": 7.0, - "call_high": 10.0, - "callback_prob": 0.4, - "nurse_consult_low": 10.0, - "nurse_consult_high": 20.0, - "random_seed": 0, - "run_length": 1000 - }, - "validation_result": { - "is_valid": true, - "errors": [] - } - } - ], - "parameters": { - "n_operators": 8, - "n_nurses": 6, - "mean_iat": 0.504, - "call_low": 5.0, - "call_mode": 7.0, - "call_high": 10.0, - "callback_prob": 0.4, - "nurse_consult_low": 10.0, - "nurse_consult_high": 20.0, - "random_seed": 0, - "run_length": 1000 - }, - "formatted_parameters": "**Parameters used in simulation**\n\n| \ud83d\udcc8 Parameter | Value |\n|---|---|\n| n_operators | 8 |\n| n_nurses | 6 |\n| mean_iat | 0.504 |\n| call_low | 5.0 |\n| call_mode | 7.0 |\n| call_high | 10.0 |\n| callback_prob | 0.4 |\n| nurse_consult_low | 10.0 |\n| nurse_consult_high | 20.0 |\n| random_seed | 0 |\n| run_length | 1000 |\n" - } - }, - "comparison": { - "pass": false, - "details": { - "01_mean_waiting_time": { - "expected": 242.69198606467467, - "got": 230.08648598465186, - "match": false - }, - "02_operator_util": { - "expected": 99.35481108027054, - "got": 99.35481108027054, - "match": true - }, - "03_mean_nurse_waiting_time": { - "expected": 78.94431797274639, - "got": 78.94871019130441, - "match": true - }, - "04_nurse_util": { - "expected": 97.8390040780754, - "got": 97.8390040780754, - "match": true - }, - "05_callback_rate": { - "expected": 42.48847926267281, - "got": 42.48847926267281, - "match": true - } - }, - "rtol": 0.001, - "atol": 1e-06 + "agent_result": { + "01_mean_waiting_time": 230.08648598465186, + "02_operator_util": 99.35481108027054, + "03_mean_nurse_waiting_time": 78.94871019130441, + "04_nurse_util": 97.8390040780754, + "05_callback_rate": 42.48847926267281 }, - "passed": false + "passed": false, + "context": { + "trace_id": "0f08ea84447ef6e9bdb05a17920807db", + "span_id": "b87894c014f8cf28" + } }, "example_4": { "user_input": "Simulate with 20 operators, 15 nurses, and half the usual call volume", @@ -336,88 +126,18 @@ "04_nurse_util": 35.17900361818962, "05_callback_rate": 42.17289719626168 }, - "agent_run": { - "simulation_result": { - "01_mean_waiting_time": 0.0, - "02_operator_util": 31.21492694742085, - "03_mean_nurse_waiting_time": 0.0, - "04_nurse_util": 35.17900361818962, - "05_callback_rate": 42.17289719626168 - }, - "meta": { - "retry_count": 0, - "had_bailout": false, - "validation_history": [ - { - "attempt": 1, - "parameters": 
{ - "n_operators": 20, - "n_nurses": 15, - "mean_iat": 1.2, - "call_low": 5.0, - "call_mode": 7.0, - "call_high": 10.0, - "callback_prob": 0.4, - "nurse_consult_low": 10.0, - "nurse_consult_high": 20.0, - "random_seed": 0, - "run_length": 1000 - }, - "validation_result": { - "is_valid": true, - "errors": [] - } - } - ], - "parameters": { - "n_operators": 20, - "n_nurses": 15, - "mean_iat": 1.2, - "call_low": 5.0, - "call_mode": 7.0, - "call_high": 10.0, - "callback_prob": 0.4, - "nurse_consult_low": 10.0, - "nurse_consult_high": 20.0, - "random_seed": 0, - "run_length": 1000 - }, - "formatted_parameters": "**Parameters used in simulation**\n\n| \ud83d\udcc8 Parameter | Value |\n|---|---|\n| n_operators | 20 |\n| n_nurses | 15 |\n| mean_iat | 1.2 |\n| call_low | 5.0 |\n| call_mode | 7.0 |\n| call_high | 10.0 |\n| callback_prob | 0.4 |\n| nurse_consult_low | 10.0 |\n| nurse_consult_high | 20.0 |\n| random_seed | 0 |\n| run_length | 1000 |\n" - } - }, - "comparison": { - "pass": true, - "details": { - "01_mean_waiting_time": { - "expected": 0.0, - "got": 0.0, - "match": true - }, - "02_operator_util": { - "expected": 31.21492694742085, - "got": 31.21492694742085, - "match": true - }, - "03_mean_nurse_waiting_time": { - "expected": 0.0, - "got": 0.0, - "match": true - }, - "04_nurse_util": { - "expected": 35.17900361818962, - "got": 35.17900361818962, - "match": true - }, - "05_callback_rate": { - "expected": 42.17289719626168, - "got": 42.17289719626168, - "match": true - } - }, - "rtol": 0.001, - "atol": 1e-06 + "agent_result": { + "01_mean_waiting_time": 0.0, + "02_operator_util": 31.21492694742085, + "03_mean_nurse_waiting_time": 0.0, + "04_nurse_util": 35.17900361818962, + "05_callback_rate": 42.17289719626168 }, - "passed": true + "passed": true, + "context": { + "trace_id": "3a7d4f511089509f19d615d5b04a725d", + "span_id": "926e778adc644f4c" + } }, "example_5": { "user_input": "Run with 10 operators, 8 nurses, 30% callback probability, and 10% reduced demand", @@ -441,88 +161,18 @@ "04_nurse_util": 82.46774067208518, "05_callback_rate": 33.012583271650634 }, - "agent_run": { - "simulation_result": { - "01_mean_waiting_time": 46.3970260873325, - "02_operator_util": 98.6111946659619, - "03_mean_nurse_waiting_time": 2.290083053642536, - "04_nurse_util": 82.07127437115734, - "05_callback_rate": 33.11061618411284 - }, - "meta": { - "retry_count": 0, - "had_bailout": false, - "validation_history": [ - { - "attempt": 1, - "parameters": { - "n_operators": 10, - "n_nurses": 8, - "mean_iat": 0.69, - "callback_prob": 0.3, - "nurse_consult_low": 10.0, - "nurse_consult_high": 20.0, - "call_low": 5.0, - "call_mode": 7.0, - "call_high": 10.0, - "random_seed": 0, - "run_length": 1000 - }, - "validation_result": { - "is_valid": true, - "errors": [] - } - } - ], - "parameters": { - "n_operators": 10, - "n_nurses": 8, - "mean_iat": 0.69, - "callback_prob": 0.3, - "nurse_consult_low": 10.0, - "nurse_consult_high": 20.0, - "call_low": 5.0, - "call_mode": 7.0, - "call_high": 10.0, - "random_seed": 0, - "run_length": 1000 - }, - "formatted_parameters": "**Parameters used in simulation**\n\n| \ud83d\udcc8 Parameter | Value |\n|---|---|\n| n_operators | 10 |\n| n_nurses | 8 |\n| mean_iat | 0.69 |\n| callback_prob | 0.3 |\n| nurse_consult_low | 10.0 |\n| nurse_consult_high | 20.0 |\n| call_low | 5.0 |\n| call_mode | 7.0 |\n| call_high | 10.0 |\n| random_seed | 0 |\n| run_length | 1000 |\n" - } - }, - "comparison": { - "pass": false, - "details": { - "01_mean_waiting_time": { - "expected": 62.26336943615235, - 
"got": 46.3970260873325, - "match": false - }, - "02_operator_util": { - "expected": 98.91827940297605, - "got": 98.6111946659619, - "match": false - }, - "03_mean_nurse_waiting_time": { - "expected": 2.3551925550777337, - "got": 2.290083053642536, - "match": false - }, - "04_nurse_util": { - "expected": 82.46774067208518, - "got": 82.07127437115734, - "match": false - }, - "05_callback_rate": { - "expected": 33.012583271650634, - "got": 33.11061618411284, - "match": false - } - }, - "rtol": 0.001, - "atol": 1e-06 - }, - "passed": false + "agent_result": { + "01_mean_waiting_time": 46.3970260873325, + "02_operator_util": 98.6111946659619, + "03_mean_nurse_waiting_time": 2.290083053642536, + "04_nurse_util": 82.07127437115734, + "05_callback_rate": 33.11061618411284 + }, + "passed": false, + "context": { + "trace_id": "3dd0f874abcf87f832d31fb352632636", + "span_id": "ad80c628897ebbbe" + } }, "example_6": { "user_input": "Test scenario: 25 operators, 25 nurses, triple demand", @@ -546,88 +196,18 @@ "04_nurse_util": 83.59706413413956, "05_callback_rate": 41.69621749408983 }, - "agent_run": { - "simulation_result": { - "01_mean_waiting_time": 161.26826119772568, - "02_operator_util": 99.27336565709264, - "03_mean_nurse_waiting_time": 0.4505143843086236, - "04_nurse_util": 83.59706413413956, - "05_callback_rate": 41.69621749408983 - }, - "meta": { - "retry_count": 0, - "had_bailout": false, - "validation_history": [ - { - "attempt": 1, - "parameters": { - "n_operators": 25, - "n_nurses": 25, - "mean_iat": 0.2, - "call_low": 5.0, - "call_mode": 7.0, - "call_high": 10.0, - "callback_prob": 0.4, - "nurse_consult_low": 10.0, - "nurse_consult_high": 20.0, - "random_seed": 0, - "run_length": 1000 - }, - "validation_result": { - "is_valid": true, - "errors": [] - } - } - ], - "parameters": { - "n_operators": 25, - "n_nurses": 25, - "mean_iat": 0.2, - "call_low": 5.0, - "call_mode": 7.0, - "call_high": 10.0, - "callback_prob": 0.4, - "nurse_consult_low": 10.0, - "nurse_consult_high": 20.0, - "random_seed": 0, - "run_length": 1000 - }, - "formatted_parameters": "**Parameters used in simulation**\n\n| \ud83d\udcc8 Parameter | Value |\n|---|---|\n| n_operators | 25 |\n| n_nurses | 25 |\n| mean_iat | 0.2 |\n| call_low | 5.0 |\n| call_mode | 7.0 |\n| call_high | 10.0 |\n| callback_prob | 0.4 |\n| nurse_consult_low | 10.0 |\n| nurse_consult_high | 20.0 |\n| random_seed | 0 |\n| run_length | 1000 |\n" - } - }, - "comparison": { - "pass": true, - "details": { - "01_mean_waiting_time": { - "expected": 161.26826119772568, - "got": 161.26826119772568, - "match": true - }, - "02_operator_util": { - "expected": 99.27336565709264, - "got": 99.27336565709264, - "match": true - }, - "03_mean_nurse_waiting_time": { - "expected": 0.4505143843086236, - "got": 0.4505143843086236, - "match": true - }, - "04_nurse_util": { - "expected": 83.59706413413956, - "got": 83.59706413413956, - "match": true - }, - "05_callback_rate": { - "expected": 41.69621749408983, - "got": 41.69621749408983, - "match": true - } - }, - "rtol": 0.001, - "atol": 1e-06 + "agent_result": { + "01_mean_waiting_time": 161.26826119772568, + "02_operator_util": 99.27336565709264, + "03_mean_nurse_waiting_time": 0.4505143843086236, + "04_nurse_util": 83.59706413413956, + "05_callback_rate": 41.69621749408983 }, - "passed": true + "passed": true, + "context": { + "trace_id": "7665b2b29408259e148fccd619ac80e6", + "span_id": "06181825b8538a83" + } }, "example_7": { "user_input": "Simulate 15 operators, 12 nurses, call durations 3-6-9 minutes, 2000 minute 
runtime", @@ -651,88 +231,18 @@ "04_nurse_util": 86.43094276820304, "05_callback_rate": 41.684147437818396 }, - "agent_run": { - "simulation_result": { - "01_mean_waiting_time": 0.07192106569784922, - "02_operator_util": 66.74240386374329, - "03_mean_nurse_waiting_time": 2.663802537251699, - "04_nurse_util": 86.43094276820304, - "05_callback_rate": 41.684147437818396 - }, - "meta": { - "retry_count": 0, - "had_bailout": false, - "validation_history": [ - { - "attempt": 1, - "parameters": { - "n_operators": 15, - "n_nurses": 12, - "mean_iat": 0.6, - "call_low": 3.0, - "call_mode": 6.0, - "call_high": 9.0, - "callback_prob": 0.4, - "nurse_consult_low": 10.0, - "nurse_consult_high": 20.0, - "random_seed": 0, - "run_length": 2000 - }, - "validation_result": { - "is_valid": true, - "errors": [] - } - } - ], - "parameters": { - "n_operators": 15, - "n_nurses": 12, - "mean_iat": 0.6, - "call_low": 3.0, - "call_mode": 6.0, - "call_high": 9.0, - "callback_prob": 0.4, - "nurse_consult_low": 10.0, - "nurse_consult_high": 20.0, - "random_seed": 0, - "run_length": 2000 - }, - "formatted_parameters": "**Parameters used in simulation**\n\n| \ud83d\udcca Parameter | Value |\n|---|---|\n| n_operators | 15 |\n| n_nurses | 12 |\n| mean_iat | 0.6 |\n| call_low | 3.0 |\n| call_mode | 6.0 |\n| call_high | 9.0 |\n| callback_prob | 0.4 |\n| nurse_consult_low | 10.0 |\n| nurse_consult_high | 20.0 |\n| random_seed | 0 |\n| run_length | 2000 |\n" - } - }, - "comparison": { - "pass": true, - "details": { - "01_mean_waiting_time": { - "expected": 0.07192106569784922, - "got": 0.07192106569784922, - "match": true - }, - "02_operator_util": { - "expected": 66.74240386374329, - "got": 66.74240386374329, - "match": true - }, - "03_mean_nurse_waiting_time": { - "expected": 2.663802537251699, - "got": 2.663802537251699, - "match": true - }, - "04_nurse_util": { - "expected": 86.43094276820304, - "got": 86.43094276820304, - "match": true - }, - "05_callback_rate": { - "expected": 41.684147437818396, - "got": 41.684147437818396, - "match": true - } - }, - "rtol": 0.001, - "atol": 1e-06 + "agent_result": { + "01_mean_waiting_time": 0.07192106569784922, + "02_operator_util": 66.74240386374329, + "03_mean_nurse_waiting_time": 2.663802537251699, + "04_nurse_util": 86.43094276820304, + "05_callback_rate": 41.684147437818396 }, - "passed": true + "passed": true, + "context": { + "trace_id": "351c928786d9364f8bf28983dd16c0a8", + "span_id": "6bffbbff22841ac8" + } }, "example_8": { "user_input": "Run with 6 operators, 4 nurses, 60% callback rate, and 25% higher demand", @@ -756,78 +266,18 @@ "04_nurse_util": 97.69465771729217, "05_callback_rate": 57.87545787545788 }, - "agent_run": { - "simulation_result": { - "01_mean_waiting_time": 306.74172321599656, - "02_operator_util": 99.45155480236895, - "03_mean_nurse_waiting_time": 196.98238666596234, - "04_nurse_util": 97.69465771729217, - "05_callback_rate": 57.8239608801956 - }, - "meta": { - "retry_count": 0, - "had_bailout": false, - "validation_history": [ - { - "attempt": 1, - "parameters": { - "n_operators": 6, - "n_nurses": 4, - "mean_iat": 0.48, - "callback_prob": 0.6, - "random_seed": 0, - "run_length": 1000 - }, - "validation_result": { - "is_valid": true, - "errors": [] - } - } - ], - "parameters": { - "n_operators": 6, - "n_nurses": 4, - "mean_iat": 0.48, - "callback_prob": 0.6, - "random_seed": 0, - "run_length": 1000 - }, - "formatted_parameters": "**Parameters used in simulation**\n\n| \ud83d\udcca Parameter | Value |\n|---|---|\n| n_operators | 6 |\n| n_nurses | 4 |\n| 
mean_iat | 0.48 |\n| callback_prob | 0.6 |\n| random_seed | 0 |\n| run_length | 1000 |\n" - } - }, - "comparison": { - "pass": false, - "details": { - "01_mean_waiting_time": { - "expected": 318.98668257451686, - "got": 306.74172321599656, - "match": false - }, - "02_operator_util": { - "expected": 99.56949574531609, - "got": 99.45155480236895, - "match": false - }, - "03_mean_nurse_waiting_time": { - "expected": 196.98753442211358, - "got": 196.98238666596234, - "match": true - }, - "04_nurse_util": { - "expected": 97.69465771729217, - "got": 97.69465771729217, - "match": true - }, - "05_callback_rate": { - "expected": 57.87545787545788, - "got": 57.8239608801956, - "match": true - } - }, - "rtol": 0.001, - "atol": 1e-06 + "agent_result": { + "01_mean_waiting_time": 306.74172321599656, + "02_operator_util": 99.45155480236895, + "03_mean_nurse_waiting_time": 196.98238666596234, + "04_nurse_util": 97.69465771729217, + "05_callback_rate": 57.8239608801956 }, - "passed": false + "passed": false, + "context": { + "trace_id": "774aa223e001e166a7d7c906779f1807", + "span_id": "784b26c1ca10d345" + } }, "example_9": { "user_input": "Test configuration: 18 operators, 14 nurses, nurse consultations 15-25 minutes, normal demand", @@ -851,88 +301,18 @@ "04_nurse_util": 97.15170780690659, "05_callback_rate": 42.12121212121212 }, - "agent_run": { - "simulation_result": { - "01_mean_waiting_time": 0.08980920202587155, - "02_operator_util": 67.09751014731434, - "03_mean_nurse_waiting_time": 19.85524137214805, - "04_nurse_util": 97.15170780690659, - "05_callback_rate": 42.12121212121212 - }, - "meta": { - "retry_count": 0, - "had_bailout": false, - "validation_history": [ - { - "attempt": 1, - "parameters": { - "n_operators": 18, - "n_nurses": 14, - "mean_iat": 0.6, - "call_low": 5.0, - "call_mode": 7.0, - "call_high": 10.0, - "callback_prob": 0.4, - "nurse_consult_low": 15.0, - "nurse_consult_high": 25.0, - "random_seed": 0, - "run_length": 1000 - }, - "validation_result": { - "is_valid": true, - "errors": [] - } - } - ], - "parameters": { - "n_operators": 18, - "n_nurses": 14, - "mean_iat": 0.6, - "call_low": 5.0, - "call_mode": 7.0, - "call_high": 10.0, - "callback_prob": 0.4, - "nurse_consult_low": 15.0, - "nurse_consult_high": 25.0, - "random_seed": 0, - "run_length": 1000 - }, - "formatted_parameters": "**Parameters used in simulation**\n\n| \ud83d\udcc8 Parameter | Value |\n|---|---|\n| n_operators | 18 |\n| n_nurses | 14 |\n| mean_iat | 0.6 |\n| call_low | 5.0 |\n| call_mode | 7.0 |\n| call_high | 10.0 |\n| callback_prob | 0.4 |\n| nurse_consult_low | 15.0 |\n| nurse_consult_high | 25.0 |\n| random_seed | 0 |\n| run_length | 1000 |\n" - } - }, - "comparison": { - "pass": true, - "details": { - "01_mean_waiting_time": { - "expected": 0.08980920202587155, - "got": 0.08980920202587155, - "match": true - }, - "02_operator_util": { - "expected": 67.09751014731434, - "got": 67.09751014731434, - "match": true - }, - "03_mean_nurse_waiting_time": { - "expected": 19.85524137214805, - "got": 19.85524137214805, - "match": true - }, - "04_nurse_util": { - "expected": 97.15170780690659, - "got": 97.15170780690659, - "match": true - }, - "05_callback_rate": { - "expected": 42.12121212121212, - "got": 42.12121212121212, - "match": true - } - }, - "rtol": 0.001, - "atol": 1e-06 + "agent_result": { + "01_mean_waiting_time": 0.08980920202587155, + "02_operator_util": 67.09751014731434, + "03_mean_nurse_waiting_time": 19.85524137214805, + "04_nurse_util": 97.15170780690659, + "05_callback_rate": 42.12121212121212 }, - 
"passed": true + "passed": true, + "context": { + "trace_id": "d8b2aedeb917ec4191b1c6a7fc467721", + "span_id": "14f5bf6c2a7008c2" + } }, "example_10": { "user_input": "Simulate 12 operators, 10 nurses, 50% callback probability, and 40% demand reduction", @@ -956,87 +336,17 @@ "04_nurse_util": 90.70926116406883, "05_callback_rate": 51.4546965918537 }, - "agent_run": { - "simulation_result": { - "01_mean_waiting_time": 1.2583822886248166, - "02_operator_util": 84.9856571728051, - "03_mean_nurse_waiting_time": 46.17149830527762, - "04_nurse_util": 97.12003253429981, - "05_callback_rate": 50.43103448275862 - }, - "meta": { - "retry_count": 0, - "had_bailout": false, - "validation_history": [ - { - "attempt": 1, - "parameters": { - "n_operators": 12, - "n_nurses": 10, - "mean_iat": 0.72, - "callback_prob": 0.5, - "call_low": 5.0, - "call_mode": 7.0, - "call_high": 10.0, - "nurse_consult_low": 10.0, - "nurse_consult_high": 20.0, - "random_seed": 0, - "run_length": 1000 - }, - "validation_result": { - "is_valid": true, - "errors": [] - } - } - ], - "parameters": { - "n_operators": 12, - "n_nurses": 10, - "mean_iat": 0.72, - "callback_prob": 0.5, - "call_low": 5.0, - "call_mode": 7.0, - "call_high": 10.0, - "nurse_consult_low": 10.0, - "nurse_consult_high": 20.0, - "random_seed": 0, - "run_length": 1000 - }, - "formatted_parameters": "**Parameters used in simulation**\n\n| \ud83d\udcca Parameter | Value |\n|---|---|\n| n_operators | 12 |\n| n_nurses | 10 |\n| mean_iat | 0.72 |\n| callback_prob | 0.5 |\n| call_low | 5.0 |\n| call_mode | 7.0 |\n| call_high | 10.0 |\n| nurse_consult_low | 10.0 |\n| nurse_consult_high | 20.0 |\n| random_seed | 0 |\n| run_length | 1000 |\n" - } - }, - "comparison": { - "pass": false, - "details": { - "01_mean_waiting_time": { - "expected": 0.38280107383630124, - "got": 1.2583822886248166, - "match": false - }, - "02_operator_util": { - "expected": 73.44752978233386, - "got": 84.9856571728051, - "match": false - }, - "03_mean_nurse_waiting_time": { - "expected": 4.147154100012408, - "got": 46.17149830527762, - "match": false - }, - "04_nurse_util": { - "expected": 90.70926116406883, - "got": 97.12003253429981, - "match": false - }, - "05_callback_rate": { - "expected": 51.4546965918537, - "got": 50.43103448275862, - "match": false - } - }, - "rtol": 0.001, - "atol": 1e-06 - }, - "passed": false + "agent_result": { + "01_mean_waiting_time": 1.2583822886248166, + "02_operator_util": 84.9856571728051, + "03_mean_nurse_waiting_time": 46.17149830527762, + "04_nurse_util": 97.12003253429981, + "05_callback_rate": 50.43103448275862 + }, + "passed": false, + "context": { + "trace_id": "38fc463cffe7950265a08d9e4f57c852", + "span_id": "910e28879b5d85b8" + } } } \ No newline at end of file diff --git a/run_evals.py b/run_evals.py index 361c137..a262ebf 100644 --- a/run_evals.py +++ b/run_evals.py @@ -1,157 +1,232 @@ import asyncio import json import math -from typing import Dict, Any, Tuple, Optional +import pandas as pd +import numpy as np +from typing import Dict, Any, Optional -# Import the agent builder and its dependencies from your agent module -# Assumes agent_self_reflection.py exposes `build_graph`, `AgentState`, and a factory for the LLM. 
from agent_self_reflection import build_graph, AgentState from langchain_ollama import OllamaLLM +from mcpsim.tracing import init_tracing +import phoenix as px +from phoenix.trace import SpanEvaluations +import argparse +import itertools -def floats_close(a: Any, b: Any, rtol: float = 1e-3, atol: float = 1e-6) -> bool: - """ - Compare numbers with tolerance, pass-through for non-numeric equality. - Returns True if both are NaN, or within tolerances for floats, or exactly equal otherwise. - """ - # Handle None - if a is None or b is None: - return a is None and b is None +# Import OpenTelemetry trace API to capture span context +from opentelemetry import trace + +# Initialize a global tracer from your helper +tracer_provider = init_tracing(project_name="mcp-agent-evaluation", endpoint="http://localhost:6006") +tracer = tracer_provider.get_tracer("eval-runner-tracer") - # Try numeric compare + +# ----------------- Comparison helpers (unchanged) ----------------- +def floats_close(a: Any, b: Any, rtol: float = 1e-3, atol: float = 1e-6) -> bool: + if a is None or b is None: return a is None and b is None try: - fa = float(a) - fb = float(b) - # Handle NaNs - if math.isnan(fa) and math.isnan(fb): - return True + fa, fb = float(a), float(b) + if math.isnan(fa) and math.isnan(fb): return True return math.isclose(fa, fb, rel_tol=rtol, abs_tol=atol) - except (TypeError, ValueError): - # Non-numeric: exact equality - return a == b - + except (TypeError, ValueError): return a == b def compare_results( got: Dict[str, Any], expected: Dict[str, Any], rtol: float = 1e-3, - atol: float = 1e-6, -) -> Dict[str, Any]: + atol: float = 1e-6 +) -> bool: + """Compare two dictionaries of numerical simulation results. + + This function provides a robust way to check if two dictionaries, + representing simulation outputs, are numerically equivalent within a + specified tolerance. It enforces that all metrics must be numeric + and that both dictionaries must have the exact same set of keys. + + Parameters + ---------- + got : dict + The dictionary of actual results obtained from a simulation run. + All values are expected to be numeric (int or float). + expected : dict + The dictionary of expected results to compare against. + All values are expected to be numeric (int or float). + rtol : float, optional + The relative tolerance parameter for `numpy.allclose`. + Default is 1e-3. + atol : float, optional + The absolute tolerance parameter for `numpy.allclose`. + Default is 1e-6. + + Returns + ------- + bool + True if the dictionaries are a match, False otherwise. A match + requires that: + 1. Both dictionaries have the identical set of keys. + 2. All values in both dictionaries are numeric. + 3. All corresponding numeric values are close, as determined by + `numpy.allclose` with the given tolerances. + + Examples + -------- + >>> got = {'metric_a': 1.0001, 'metric_b': 200.0} + >>> expected = {'metric_a': 1.0, 'metric_b': 200.5} + >>> compare_results(got, expected, rtol=1e-2) + True + + >>> got = {'metric_a': 1.01, 'metric_b': 200.0} + >>> expected = {'metric_a': 1.0, 'metric_b': 200.0} + >>> compare_results(got, expected, rtol=1e-3) + False + + >>> got = {'metric_a': 1.0, 'metric_b': 'fail'} + >>> expected = {'metric_a': 1.0, 'metric_b': 2.0} + >>> compare_results(got, expected) + False + + >>> got = {'metric_a': 1.0} + >>> expected = {'metric_a': 1.0, 'metric_b': 2.0} + >>> compare_results(got, expected) + False """ - Compare simulation outputs to expected_results with numeric tolerance. 
- Returns a dict with per-key comparison, diffs, and overall pass flag. + if expected is None or got is None: + return False + + s_got = pd.Series(got) + s_expected = pd.Series(expected) + + # 1. Check for structural differences (different keys). + if set(s_got.index) != set(s_expected.index): + return False + + # If both are empty but have same (no) keys, they match. + # if s_got.empty: + # return True + + # 2. Verify that ALL values in BOTH series are numeric. + # pd.api.types.is_number is a robust way to check for int/float. + if not (s_got.apply(pd.api.types.is_number).all() and + s_expected.apply(pd.api.types.is_number).all()): + return False + + # 3. Align and compare using NumPy's tolerance-based function. + # We already checked for key equality, so we can align `expected` to `got`. + s_expected_aligned = s_expected.loc[s_got.index] + + # `np.allclose` is the gold standard for comparing arrays of floats. + return np.allclose( + s_got.values, + s_expected_aligned.values, + rtol=rtol, + atol=atol, + equal_nan=True # Considers two NaN values to be equal. + ) + +# ----------------- Agent run helpers (unchanged) ----------------- +async def run_agent_once(compiled_graph, user_input: str, llm: OllamaLLM) -> AgentState: + state_in: AgentState = {"user_input": user_input, "retry_count": 0, "validation_history": []} + return await compiled_graph.ainvoke(state_in) + +def extract_sim_result(state: AgentState) -> Optional[Dict[str, Any]]: + return state.get("simulation_result") + +# ---------------- UPDATED Bulk Ingest Function ---------------- +def bulk_ingest_to_phoenix(json_path: str, eval_name: str = "Simulation Agent Eval"): """ - keys = sorted(set(got.keys()) | set(expected.keys())) - per_key = {} - all_pass = True - - for k in keys: - g = got.get(k, None) - e = expected.get(k, None) - ok = floats_close(g, e, rtol=rtol, atol=atol) - if not ok: - all_pass = False - per_key[k] = { - "expected": e, - "got": g, - "match": ok, - } - - return { - "pass": all_pass, - "details": per_key, - "rtol": rtol, - "atol": atol, - } - - -async def run_agent_once( - compiled_graph, - user_input: str, - llm: OllamaLLM, - max_retries: int = 4, -) -> Dict[str, Any]: + Loads an enriched evals.json file and bulk-ingests into Phoenix, + now including score, label, and explanation columns. """ - Runs the agent end-to-end for a single natural-language user_input. - Returns the final state, including simulation_result or error. - """ - state_in: AgentState = { - "user_input": user_input, - "retry_count": 0, - "validation_history": [], - } - final_state: AgentState = await compiled_graph.ainvoke(state_in) - return final_state - + with open(json_path, "r") as f: + evals = json.load(f) -def extract_agent_simulation_result(final_state: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]], Dict[str, Any]]: + eval_records = [] + for ex_name, case in evals.items(): + context = case.get("context") + if not context or "span_id" not in context or "trace_id" not in context: + print(f"⚠️ Skipping '{ex_name}': missing trace/span context in {json_path}") + continue + + is_passed = bool(case.get("passed")) + + # **FIX:** Add all three required columns: score, label, and explanation. + eval_records.append({ + "context.trace_id": context["trace_id"], + "context.span_id": context["span_id"], + "example_id": ex_name, + "score": 1 if is_passed else 0, + "label": "Pass" if is_passed else "Fail", + "explanation": "Agent result matched expected values within tolerance." 
if is_passed + else "Agent result did not match expected values.", + }) + + if not eval_records: + print("No valid records found to ingest. Did you run the agent first to generate evals.json?") + return + + eval_df = pd.DataFrame(eval_records) + eval_df = eval_df.set_index("context.span_id") + + client = px.Client() + client.log_evaluations(SpanEvaluations(eval_name=eval_name, dataframe=eval_df)) + print(f"[✓] Pushed {len(eval_df)} eval rows to Phoenix under '{eval_name}'") + +# ---------------- UPDATED Main eval runner ---------------- +async def run_all_and_save(model_name: str = "gemma3:27b", limit: int = None): """ - Extract the simulation_result if present and return a compact agent_run summary - useful for evaluation logging. + Runs the full evaluation pipeline and saves an enriched evals.json + that now includes the necessary trace/span context for Phoenix. """ - sim_res = final_state.get("simulation_result") - agent_run = { - "retry_count": final_state.get("retry_count", 0), - "had_bailout": final_state.get("error") == "Maximum retries exceeded during parameter reflection.", - "validation_history": final_state.get("validation_history", []), - "parameters": final_state.get("parameters", {}), - "formatted_parameters": final_state.get("formatted_parameters", None), - } - return sim_res, agent_run - - -async def main( - input_json_path: str = "evals/evals.json", - output_json_path: str = "evals/evals_output.json", - model_name: str = "gemma3:27b", - llm_base_url: str = "http://localhost:11434", - rtol: float = 1e-3, - atol: float = 1e-6, -) -> None: - # 1) Load evals - with open(input_json_path, "r") as f: + with open("evals/evals.json", "r") as f: evals = json.load(f) - # 2) Build LLM and graph once - llm = OllamaLLM(model=model_name, base_url=llm_base_url) + llm = OllamaLLM(model=model_name, base_url="http://localhost:11434") compiled_graph = build_graph(llm) - # 3) Run each eval in a loop via the agent - for key, case in evals.items(): - user_input = case.get("user_input", "") - expected = case.get("parameters", {}).get("expected_results") or case.get("expected_results") - # In your earlier structure, expected_results is at the top level of each example after enrichment. - # If not present yet, this will remain None and comparison will be skipped. - - final_state = await run_agent_once(compiled_graph, user_input, llm) - sim_res, agent_run = extract_agent_simulation_result(final_state) - - # Record agent run outputs - case["agent_run"] = { - "simulation_result": sim_res, - "meta": agent_run, - } - - # 4) Compare to expected_results if available - if expected is not None and sim_res is not None: - cmp = compare_results(sim_res, expected, rtol=rtol, atol=atol) - case["comparison"] = cmp - case["passed"] = bool(cmp["pass"]) - else: - case["comparison"] = { - "note": "Either expected_results or agent simulation_result missing; comparison skipped." 
- } - case["passed"] = False if expected is not None else None - - # 5) Save enriched evals to evals.json - with open(output_json_path, "w") as f: + # Use islice to limit the loop if a limit is provided + items_to_process = itertools.islice(evals.items(), limit) if limit else evals.items() + + for ex_name, case in items_to_process: + # **FIX:** Create a parent span for each eval run to capture its context + with tracer.start_as_current_span(f"eval_run: {ex_name}") as span: + # Capture the context from the currently active span + span_context = span.get_span_context() + trace_id = f"{span_context.trace_id:032x}" + span_id = f"{span_context.span_id:016x}" + + # Run the agent pipeline + final_state = await run_agent_once(compiled_graph, case["user_input"], llm) + + # Process results + got = extract_sim_result(final_state) + passed = compare_results(got, case.get("expected_results")) + + # Store results and the new context back into the dictionary + case["agent_result"] = got + case["passed"] = passed + case["context"] = {"trace_id": trace_id, "span_id": span_id} + + # Optionally add attributes to the span + span.set_attribute("eval.passed", passed) + span.set_attribute("eval.example_id", ex_name) + + + with open("evals/evals_output.json", "w") as f: json.dump(evals, f, indent=2) - print(f"Wrote evaluation results to {output_json_path}") - + print("[✓] Saved enriched evals.json with trace/span context.") if __name__ == "__main__": - # For CLI usage: - # python eval_runner.py - # Optional: parameterize via env vars or argparse if desired. - asyncio.run(main()) + + parser = argparse.ArgumentParser(description="Run evals and/or bulk-ingest into Phoenix") + parser.add_argument("--skip-run", action="store_true", help="Skip agent runs and just bulk-ingest existing evals.json") + parser.add_argument("--eval-name", default="Simulation Agent Eval") + parser.add_argument("--limit", type=int, default=None, help="Limit the number of evaluations to run for debugging.") + args = parser.parse_args() + + if args.skip_run: + bulk_ingest_to_phoenix("evals/evals_output.json", eval_name=args.eval_name) + else: + asyncio.run(run_all_and_save(model_name="gemma3:27b", limit=args.limit)) + bulk_ingest_to_phoenix("evals/evals_output.json", eval_name=args.eval_name) From 56a08f8ca1fc2a3b36ff1dd6c66d42a3d038856b Mon Sep 17 00:00:00 2001 From: TomMonks Date: Tue, 12 Aug 2025 18:03:35 +0100 Subject: [PATCH 21/23] feat(callcentre): improved schema desc + improved json prompt --- evals/evals_output.json | 86 +++++++++---------- .../callcentre/resources/schema.json | 6 +- mcpsim/resources/parameter_prompt2.txt | 35 ++++++++ mcpsim/resources/parameter_prompt3.txt | 33 +++++++ mcpsim/server.py | 3 +- run_evals.py | 16 ++-- 6 files changed, 122 insertions(+), 57 deletions(-) create mode 100644 mcpsim/resources/parameter_prompt2.txt create mode 100644 mcpsim/resources/parameter_prompt3.txt diff --git a/evals/evals_output.json b/evals/evals_output.json index 063aeb5..ed2acdb 100644 --- a/evals/evals_output.json +++ b/evals/evals_output.json @@ -30,8 +30,8 @@ }, "passed": true, "context": { - "trace_id": "f5261ab566c4e5bebfbea7cc7f28b71c", - "span_id": "fd13f4f202d0d7a5" + "trace_id": "0934a40e20fdfe070060c5388c844116", + "span_id": "31ed66eb214272b6" } }, "example_2": { @@ -59,14 +59,14 @@ "agent_result": { "01_mean_waiting_time": 0.0, "02_operator_util": 46.452122409679156, - "03_mean_nurse_waiting_time": 43.94073145744734, - "04_nurse_util": 97.41291238872327, + "03_mean_nurse_waiting_time": 0.0041045663539504606, + 
"04_nurse_util": 51.20346053052576, "05_callback_rate": 42.12121212121212 }, - "passed": false, + "passed": true, "context": { - "trace_id": "c93069f691d1a1edbca3886db55bc2cc", - "span_id": "1c4b8ed595ac9a41" + "trace_id": "77ca44b81bf36c073755141a05677ead", + "span_id": "caec062bc22d0352" } }, "example_3": { @@ -92,16 +92,16 @@ "05_callback_rate": 42.48847926267281 }, "agent_result": { - "01_mean_waiting_time": 230.08648598465186, + "01_mean_waiting_time": 242.69198606467467, "02_operator_util": 99.35481108027054, - "03_mean_nurse_waiting_time": 78.94871019130441, + "03_mean_nurse_waiting_time": 78.94431797274639, "04_nurse_util": 97.8390040780754, "05_callback_rate": 42.48847926267281 }, - "passed": false, + "passed": true, "context": { - "trace_id": "0f08ea84447ef6e9bdb05a17920807db", - "span_id": "b87894c014f8cf28" + "trace_id": "5f9634ef46fc071a31c5cf1110340b65", + "span_id": "e753ea4ba67d52a2" } }, "example_4": { @@ -135,8 +135,8 @@ }, "passed": true, "context": { - "trace_id": "3a7d4f511089509f19d615d5b04a725d", - "span_id": "926e778adc644f4c" + "trace_id": "00c51788956b474f796f4f43e50f83fe", + "span_id": "53ded666630ded84" } }, "example_5": { @@ -162,16 +162,16 @@ "05_callback_rate": 33.012583271650634 }, "agent_result": { - "01_mean_waiting_time": 46.3970260873325, - "02_operator_util": 98.6111946659619, - "03_mean_nurse_waiting_time": 2.290083053642536, - "04_nurse_util": 82.07127437115734, - "05_callback_rate": 33.11061618411284 + "01_mean_waiting_time": 62.26336943615235, + "02_operator_util": 98.91827940297605, + "03_mean_nurse_waiting_time": 2.3551925550777337, + "04_nurse_util": 82.46774067208518, + "05_callback_rate": 33.012583271650634 }, - "passed": false, + "passed": true, "context": { - "trace_id": "3dd0f874abcf87f832d31fb352632636", - "span_id": "ad80c628897ebbbe" + "trace_id": "bf05c8a6a1ec39d65c9968fb83601548", + "span_id": "948c6aad19a4740b" } }, "example_6": { @@ -205,8 +205,8 @@ }, "passed": true, "context": { - "trace_id": "7665b2b29408259e148fccd619ac80e6", - "span_id": "06181825b8538a83" + "trace_id": "0ab69e3e17a7e78309b5fe6167ea5ca7", + "span_id": "ea7b6b1569d943e2" } }, "example_7": { @@ -240,8 +240,8 @@ }, "passed": true, "context": { - "trace_id": "351c928786d9364f8bf28983dd16c0a8", - "span_id": "6bffbbff22841ac8" + "trace_id": "0f483bd78e24ae73065112e5d7a12b5a", + "span_id": "e1e58275b3473a9c" } }, "example_8": { @@ -267,16 +267,16 @@ "05_callback_rate": 57.87545787545788 }, "agent_result": { - "01_mean_waiting_time": 306.74172321599656, - "02_operator_util": 99.45155480236895, - "03_mean_nurse_waiting_time": 196.98238666596234, + "01_mean_waiting_time": 318.98668257451686, + "02_operator_util": 99.56949574531609, + "03_mean_nurse_waiting_time": 196.98753442211358, "04_nurse_util": 97.69465771729217, - "05_callback_rate": 57.8239608801956 + "05_callback_rate": 57.87545787545788 }, - "passed": false, + "passed": true, "context": { - "trace_id": "774aa223e001e166a7d7c906779f1807", - "span_id": "784b26c1ca10d345" + "trace_id": "df871b1e2f654ad8de10bbd51fbdaefe", + "span_id": "52f47e95d1fb0985" } }, "example_9": { @@ -310,8 +310,8 @@ }, "passed": true, "context": { - "trace_id": "d8b2aedeb917ec4191b1c6a7fc467721", - "span_id": "14f5bf6c2a7008c2" + "trace_id": "6d1203e63e2c1ffde945f0e20d33307f", + "span_id": "5b1a97dca34eaa49" } }, "example_10": { @@ -337,16 +337,16 @@ "05_callback_rate": 51.4546965918537 }, "agent_result": { - "01_mean_waiting_time": 1.2583822886248166, - "02_operator_util": 84.9856571728051, - "03_mean_nurse_waiting_time": 46.17149830527762, 
- "04_nurse_util": 97.12003253429981, - "05_callback_rate": 50.43103448275862 + "01_mean_waiting_time": 0.38280107383630124, + "02_operator_util": 73.44752978233386, + "03_mean_nurse_waiting_time": 4.147154100012408, + "04_nurse_util": 90.70926116406883, + "05_callback_rate": 51.4546965918537 }, - "passed": false, + "passed": true, "context": { - "trace_id": "38fc463cffe7950265a08d9e4f57c852", - "span_id": "910e28879b5d85b8" + "trace_id": "4bce453daf28c8f5912d2c76c14d173d", + "span_id": "d1ad3e87babd3758" } } } \ No newline at end of file diff --git a/mcpsim/example_models/callcentre/resources/schema.json b/mcpsim/example_models/callcentre/resources/schema.json index 66d699d..6f8cba6 100644 --- a/mcpsim/example_models/callcentre/resources/schema.json +++ b/mcpsim/example_models/callcentre/resources/schema.json @@ -4,21 +4,21 @@ "default": 13, "minimum": 1, "maximum": 100, - "description": "Number of call operators (1-100)" + "description": "Number of call operator staff on duty (1-100)" }, "n_nurses": { "type": "int", "default": 10, "minimum": 1, "maximum": 50, - "description": "Number of nurse callbacks (1-50)" + "description": "Number of nurse staff on duty (1-50)" }, "mean_iat": { "type": "float", "default": 0.6, "minimum": 0.1, "maximum": 10.0, - "description": "Mean time between arrivals (minutes) or inter-arrival time. 0.1 = 6 calls/min, 10.0 = 1 call/10min. An increase in demand = a reduction in the inter-arrival time. A reduction in demand = an increase in inter-arrival time" + "description": "Mean inter-arrival time in minutes (average time between arrivals). Baseline = 0.6 min.\n\nDemand changes:\n1. Percentage change form: If demand increases by fraction X, set mean_iat = baseline × (1 - X). If demand decreases by fraction X, set mean_iat = baseline × (1 + X).\n2. Multiplicative change form: If arrivals are multiplied by factor M (e.g., 'half the calls' = M=0.5, 'double the calls' = M=2), set mean_iat = baseline × (1 / M)." }, "call_low": { "type": "float", diff --git a/mcpsim/resources/parameter_prompt2.txt b/mcpsim/resources/parameter_prompt2.txt new file mode 100644 index 0000000..4ae6a20 --- /dev/null +++ b/mcpsim/resources/parameter_prompt2.txt @@ -0,0 +1,35 @@ +You are a discrete-event simulation assistant. +Your job is to convert a plain-English request into computed JSON parameter values for a simulation. + +Here is the simulation configuration schema: +{schema} + +User request: {user_input} + +{validation_feedback} + +INSTRUCTIONS: +1. Use the schema default for any parameter not supplied in the request. +2. Apply any percentage/fractional demand change directly to `mean_iat` using: + - Increase in demand (fraction X): `mean_iat = mean_iat × (1 - X)` + - Decrease in demand (fraction X): `mean_iat = mean_iat × (1 + X)` +3. Perform all calculations mentally — never output an expression, only the Final computed number. +4. Return all parameters present in the schema, filled with calculated or default values. +5. 
Final output must be a valid JSON object with: + - Keys exactly as in the schema + - Numeric values only (no strings, no units) + +Example output format +{{ + "n_operators": 12, + "n_nurses": 10, + "mean_iat": 0.81, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.4, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, + "random_seed": 0, + "run_length": 1000 +}} \ No newline at end of file diff --git a/mcpsim/resources/parameter_prompt3.txt b/mcpsim/resources/parameter_prompt3.txt new file mode 100644 index 0000000..4ce8404 --- /dev/null +++ b/mcpsim/resources/parameter_prompt3.txt @@ -0,0 +1,33 @@ +You are a discrete-event simulation assistant. +Your job is to convert a plain-English request into computed JSON parameter values for a simulation. + +Here is the simulation configuration schema: +{schema} + +User request: {user_input} + +{validation_feedback} + +INSTRUCTIONS: +1. Use the schema default for any parameter not supplied in the request. +2. Adjust `mean_iat` according to the rules in its description: + - If change is given as a percentage/fraction of demand, scale baseline directly by (1 − X) for increases or (1 + X) for decreases. + - If change is given as a multiple of arrivals, first set arrival multiplier M, then mean_iat = baseline × (1 / M). +3. Perform all calculations mentally — never output an expression, only the Final computed number. +4. Return all parameters present in the schema, filled with calculated or default values. +5. Keys must be exactly as in the schema, numbers only, no strings or units. + +Example output format +{{ + "n_operators": 12, + "n_nurses": 10, + "mean_iat": 0.81, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.4, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, + "random_seed": 0, + "run_length": 1000 +}} \ No newline at end of file diff --git a/mcpsim/server.py b/mcpsim/server.py index ee0f1f9..27ffffd 100644 --- a/mcpsim/server.py +++ b/mcpsim/server.py @@ -6,7 +6,8 @@ from mcpsim.protocol import SimulationModelAdapter -PARAMETER_PROMPT_PATH = "resources/parameter_prompt.txt" +# modified for experimentation... +PARAMETER_PROMPT_PATH = "resources/parameter_prompt2.txt" from mcpsim.tracing import init_tracing # your helper module from previous discussion diff --git a/run_evals.py b/run_evals.py index a262ebf..6470e01 100644 --- a/run_evals.py +++ b/run_evals.py @@ -17,12 +17,12 @@ # Import OpenTelemetry trace API to capture span context from opentelemetry import trace -# Initialize a global tracer from your helper +# Initialize a global tracer tracer_provider = init_tracing(project_name="mcp-agent-evaluation", endpoint="http://localhost:6006") tracer = tracer_provider.get_tracer("eval-runner-tracer") -# ----------------- Comparison helpers (unchanged) ----------------- +# ----------------- Comparison helpers ----------------- def floats_close(a: Any, b: Any, rtol: float = 1e-3, atol: float = 1e-6) -> bool: if a is None or b is None: return a is None and b is None try: @@ -100,10 +100,6 @@ def compare_results( # 1. Check for structural differences (different keys). if set(s_got.index) != set(s_expected.index): return False - - # If both are empty but have same (no) keys, they match. - # if s_got.empty: - # return True # 2. Verify that ALL values in BOTH series are numeric. # pd.api.types.is_number is a robust way to check for int/float. @@ -124,7 +120,7 @@ def compare_results( equal_nan=True # Considers two NaN values to be equal. 
) -# ----------------- Agent run helpers (unchanged) ----------------- +# ----------------- Agent run helpers ---------------- async def run_agent_once(compiled_graph, user_input: str, llm: OllamaLLM) -> AgentState: state_in: AgentState = {"user_input": user_input, "retry_count": 0, "validation_history": []} return await compiled_graph.ainvoke(state_in) @@ -132,7 +128,7 @@ async def run_agent_once(compiled_graph, user_input: str, llm: OllamaLLM) -> Age def extract_sim_result(state: AgentState) -> Optional[Dict[str, Any]]: return state.get("simulation_result") -# ---------------- UPDATED Bulk Ingest Function ---------------- +# ---------------- Bulk Ingest Function for Phoenix ---------------- def bulk_ingest_to_phoenix(json_path: str, eval_name: str = "Simulation Agent Eval"): """ Loads an enriched evals.json file and bulk-ingests into Phoenix, @@ -172,7 +168,7 @@ def bulk_ingest_to_phoenix(json_path: str, eval_name: str = "Simulation Agent Ev client.log_evaluations(SpanEvaluations(eval_name=eval_name, dataframe=eval_df)) print(f"[✓] Pushed {len(eval_df)} eval rows to Phoenix under '{eval_name}'") -# ---------------- UPDATED Main eval runner ---------------- +# ---------------- Main eval runner ---------------- async def run_all_and_save(model_name: str = "gemma3:27b", limit: int = None): """ Runs the full evaluation pipeline and saves an enriched evals.json @@ -228,5 +224,5 @@ async def run_all_and_save(model_name: str = "gemma3:27b", limit: int = None): if args.skip_run: bulk_ingest_to_phoenix("evals/evals_output.json", eval_name=args.eval_name) else: - asyncio.run(run_all_and_save(model_name="gemma3:27b", limit=args.limit)) + asyncio.run(run_all_and_save(model_name="gpt-oss:20b", limit=args.limit)) bulk_ingest_to_phoenix("evals/evals_output.json", eval_name=args.eval_name) From b832d9da214e85b38808ba59ebe5d41440134993 Mon Sep 17 00:00:00 2001 From: TomMonks Date: Wed, 13 Aug 2025 16:14:59 +0100 Subject: [PATCH 22/23] feat(evals): create script now uploads to phoenix as dataset --- create_evals.py | 46 ++++++++++++++++++ evals/create_evals.py | 22 --------- evals/evals_output.json | 104 ++-------------------------------------- 3 files changed, 50 insertions(+), 122 deletions(-) create mode 100644 create_evals.py delete mode 100644 evals/create_evals.py diff --git a/create_evals.py b/create_evals.py new file mode 100644 index 0000000..07a445f --- /dev/null +++ b/create_evals.py @@ -0,0 +1,46 @@ +""" +Script to add expected simulation results to the evals dataset +single rep at this stage. 
+""" +import pandas as pd +import phoenix as px + +import json +from datetime import datetime + +# sim-agent imports +from mcpsim.example_models.callcentre.call_centre_model import run_simulation_from_dict +from mcpsim.tracing import init_tracing + +# Load test cases from evals_input.json +with open("evals/evals_input.json", "r") as f: + scenarios = json.load(f) + +# Run each scenario once and attach results +for key, obj in scenarios.items(): + params = obj["parameters"] + results = run_simulation_from_dict(params) + obj["expected_results"] = results + +# Save updated scenarios with results to evals.json +with open("evals/evals.json", "w") as f: + json.dump(scenarios, f, indent=2) + +# ------------- upload evals to phoenix -------------------------- +# Initialize a global tracer +tracer_provider = init_tracing(project_name="mcp-agent-evaluation", endpoint="http://localhost:6006") + +# convert to dataframe and transpose +df_evals = pd.DataFrame(scenarios).T + +# create a dataset consisting of input questions and expected outputs +now = datetime.now().strftime("%Y-%m-%d %H:%M:%S") +phoenix_client = px.Client() +dataset = phoenix_client.upload_dataset( + dataframe=df_evals, + dataset_name=f"initial_evals-{now}", + input_keys=["user_input"], + output_keys=["parameters", "expected_results"] +) + +print("Simulation results saved to evals.json") diff --git a/evals/create_evals.py b/evals/create_evals.py deleted file mode 100644 index e469a4e..0000000 --- a/evals/create_evals.py +++ /dev/null @@ -1,22 +0,0 @@ -""" -Script to add expected simulation results to the evals dataset -single rep at this stage. -""" -import json -from mcpsim.example_models.callcentre.call_centre_model import run_simulation_from_dict - -# Load test cases from evals_input.json -with open("evals/evals_input.json", "r") as f: - scenarios = json.load(f) - -# Run each scenario once and attach results -for key, obj in scenarios.items(): - params = obj["parameters"] - results = run_simulation_from_dict(params) - obj["expected_results"] = results - -# Save updated scenarios with results to evals.json -with open("evals/evals.json", "w") as f: - json.dump(scenarios, f, indent=2) - -print("Simulation results saved to evals.json") diff --git a/evals/evals_output.json b/evals/evals_output.json index ed2acdb..71caedf 100644 --- a/evals/evals_output.json +++ b/evals/evals_output.json @@ -30,8 +30,8 @@ }, "passed": true, "context": { - "trace_id": "0934a40e20fdfe070060c5388c844116", - "span_id": "31ed66eb214272b6" + "trace_id": "05e9185a30490fa463f00ea5b0e65108", + "span_id": "39cfc1324c7aee19" } }, "example_2": { @@ -65,8 +65,8 @@ }, "passed": true, "context": { - "trace_id": "77ca44b81bf36c073755141a05677ead", - "span_id": "caec062bc22d0352" + "trace_id": "95fc97f63b485e37d20ada5d01b2acdb", + "span_id": "f66b8c36689fbdb2" } }, "example_3": { @@ -90,18 +90,6 @@ "03_mean_nurse_waiting_time": 78.94431797274639, "04_nurse_util": 97.8390040780754, "05_callback_rate": 42.48847926267281 - }, - "agent_result": { - "01_mean_waiting_time": 242.69198606467467, - "02_operator_util": 99.35481108027054, - "03_mean_nurse_waiting_time": 78.94431797274639, - "04_nurse_util": 97.8390040780754, - "05_callback_rate": 42.48847926267281 - }, - "passed": true, - "context": { - "trace_id": "5f9634ef46fc071a31c5cf1110340b65", - "span_id": "e753ea4ba67d52a2" } }, "example_4": { @@ -125,18 +113,6 @@ "03_mean_nurse_waiting_time": 0.0, "04_nurse_util": 35.17900361818962, "05_callback_rate": 42.17289719626168 - }, - "agent_result": { - "01_mean_waiting_time": 0.0, 
- "02_operator_util": 31.21492694742085, - "03_mean_nurse_waiting_time": 0.0, - "04_nurse_util": 35.17900361818962, - "05_callback_rate": 42.17289719626168 - }, - "passed": true, - "context": { - "trace_id": "00c51788956b474f796f4f43e50f83fe", - "span_id": "53ded666630ded84" } }, "example_5": { @@ -160,18 +136,6 @@ "03_mean_nurse_waiting_time": 2.3551925550777337, "04_nurse_util": 82.46774067208518, "05_callback_rate": 33.012583271650634 - }, - "agent_result": { - "01_mean_waiting_time": 62.26336943615235, - "02_operator_util": 98.91827940297605, - "03_mean_nurse_waiting_time": 2.3551925550777337, - "04_nurse_util": 82.46774067208518, - "05_callback_rate": 33.012583271650634 - }, - "passed": true, - "context": { - "trace_id": "bf05c8a6a1ec39d65c9968fb83601548", - "span_id": "948c6aad19a4740b" } }, "example_6": { @@ -195,18 +159,6 @@ "03_mean_nurse_waiting_time": 0.4505143843086236, "04_nurse_util": 83.59706413413956, "05_callback_rate": 41.69621749408983 - }, - "agent_result": { - "01_mean_waiting_time": 161.26826119772568, - "02_operator_util": 99.27336565709264, - "03_mean_nurse_waiting_time": 0.4505143843086236, - "04_nurse_util": 83.59706413413956, - "05_callback_rate": 41.69621749408983 - }, - "passed": true, - "context": { - "trace_id": "0ab69e3e17a7e78309b5fe6167ea5ca7", - "span_id": "ea7b6b1569d943e2" } }, "example_7": { @@ -230,18 +182,6 @@ "03_mean_nurse_waiting_time": 2.663802537251699, "04_nurse_util": 86.43094276820304, "05_callback_rate": 41.684147437818396 - }, - "agent_result": { - "01_mean_waiting_time": 0.07192106569784922, - "02_operator_util": 66.74240386374329, - "03_mean_nurse_waiting_time": 2.663802537251699, - "04_nurse_util": 86.43094276820304, - "05_callback_rate": 41.684147437818396 - }, - "passed": true, - "context": { - "trace_id": "0f483bd78e24ae73065112e5d7a12b5a", - "span_id": "e1e58275b3473a9c" } }, "example_8": { @@ -265,18 +205,6 @@ "03_mean_nurse_waiting_time": 196.98753442211358, "04_nurse_util": 97.69465771729217, "05_callback_rate": 57.87545787545788 - }, - "agent_result": { - "01_mean_waiting_time": 318.98668257451686, - "02_operator_util": 99.56949574531609, - "03_mean_nurse_waiting_time": 196.98753442211358, - "04_nurse_util": 97.69465771729217, - "05_callback_rate": 57.87545787545788 - }, - "passed": true, - "context": { - "trace_id": "df871b1e2f654ad8de10bbd51fbdaefe", - "span_id": "52f47e95d1fb0985" } }, "example_9": { @@ -300,18 +228,6 @@ "03_mean_nurse_waiting_time": 19.85524137214805, "04_nurse_util": 97.15170780690659, "05_callback_rate": 42.12121212121212 - }, - "agent_result": { - "01_mean_waiting_time": 0.08980920202587155, - "02_operator_util": 67.09751014731434, - "03_mean_nurse_waiting_time": 19.85524137214805, - "04_nurse_util": 97.15170780690659, - "05_callback_rate": 42.12121212121212 - }, - "passed": true, - "context": { - "trace_id": "6d1203e63e2c1ffde945f0e20d33307f", - "span_id": "5b1a97dca34eaa49" } }, "example_10": { @@ -335,18 +251,6 @@ "03_mean_nurse_waiting_time": 4.147154100012408, "04_nurse_util": 90.70926116406883, "05_callback_rate": 51.4546965918537 - }, - "agent_result": { - "01_mean_waiting_time": 0.38280107383630124, - "02_operator_util": 73.44752978233386, - "03_mean_nurse_waiting_time": 4.147154100012408, - "04_nurse_util": 90.70926116406883, - "05_callback_rate": 51.4546965918537 - }, - "passed": true, - "context": { - "trace_id": "4bce453daf28c8f5912d2c76c14d173d", - "span_id": "d1ad3e87babd3758" } } } \ No newline at end of file From 6476f765b5c71c9fea4a5cdb8ad120d16d2d38ca Mon Sep 17 00:00:00 2001 From: 
TomMonks Date: Tue, 30 Sep 2025 16:59:31 +0100 Subject: [PATCH 23/23] feat(app): draft of a basic streamlit app --- app.py | 501 ++++++++++++++++++++++++++++++++++++++++ app2.py | 486 ++++++++++++++++++++++++++++++++++++++ evals/evals_output.json | 8 +- 3 files changed, 991 insertions(+), 4 deletions(-) create mode 100644 app.py create mode 100644 app2.py diff --git a/app.py b/app.py new file mode 100644 index 0000000..c44cdb4 --- /dev/null +++ b/app.py @@ -0,0 +1,501 @@ + +import streamlit as st +import asyncio +import json +import subprocess +import time +import threading +import queue +from io import StringIO +import sys +from contextlib import redirect_stdout, redirect_stderr +import os +from typing import Optional, Dict, Any +import pandas as pd + +# Import the agents (assuming they're in the same directory or on path) +try: + # These would be your actual imports + from langchain_ollama import OllamaLLM + from fastmcp import Client + from agent_planning_workflow import main as planning_main + from agent_self_reflection import main as reflection_main + import agent_planning_workflow + import agent_self_reflection +except ImportError as e: + st.error(f"Import error: {e}. Please ensure all dependencies are installed and agents are accessible.") + +# Page configuration +st.set_page_config( + page_title="DES Agent Interface", + page_icon="🤖", + layout="wide", + initial_sidebar_state="expanded" +) + +# Custom CSS for better styling +st.markdown(""" + +""", unsafe_allow_html=True) + +# Initialize session state +if 'execution_logs' not in st.session_state: + st.session_state.execution_logs = [] +if 'simulation_results' not in st.session_state: + st.session_state.simulation_results = None +if 'mcp_server_status' not in st.session_state: + st.session_state.mcp_server_status = "Unknown" + + +import requests + +def get_ollama_models(): + """Get list of models from Ollama server""" + try: + response = requests.get("http://localhost:11434/api/tags") + if response.status_code == 200: + data = response.json() + models = [model["name"] for model in data["models"]] + return models + return [] + except Exception as e: + print(f"Error: {e}") + return [] + + + +# Header +st.markdown('
<h1>🤖 DES Agent Interface</h1>
', unsafe_allow_html=True)
+
+# Sidebar configuration
+st.sidebar.header("🔧 Configuration")
+
+# Agent selection
+agent_type = st.sidebar.selectbox(
+    "Select Agent Type",
+    ["Dynamic Planner", "Self-Reflective"],
+    help="Choose between the two agent architectures"
+)
+
+# Model configuration based on agent type
+st.sidebar.subheader("🧠 LLM Configuration")
+
+# get the model list from Ollama and look up the default indices
+available_models = get_ollama_models()
+try:
+    gemma_index = available_models.index("gemma3:27b")
+except ValueError:
+    gemma_index = 0
+
+try:
+    llama_index = available_models.index("llama3:latest")
+except ValueError:
+    llama_index = 0
+
+if agent_type == "Dynamic Planner":
+    planning_model = st.sidebar.selectbox(
+        "Planning Model",
+        available_models,
+        index=gemma_index,  # Default to gemma3:27b
+        help="Model used for reasoning and task planning"
+    )
+
+    summary_model = st.sidebar.selectbox(
+        "Summary Model",
+        available_models,
+        index=llama_index,  # Default to llama3:latest
+        help="Model used for summarizing parameters and results"
+    )
+
+    planning_debug = st.sidebar.checkbox(
+        "Enable Planning Debug Mode",
+        help="Show detailed workflow and planning information"
+    )
+
+else:  # Self-Reflective Agent
+    reflection_model = st.sidebar.selectbox(
+        "Reflection Model",
+        available_models,
+        index=gemma_index,  # Default to gemma3:27b
+        help="Model used for parameter generation and self-reflection"
+    )
+
+    reflection_debug = st.sidebar.checkbox(
+        "Enable Reflection Debug Mode",
+        help="Show detailed reflection and retry information"
+    )
+
+# Server status check
+st.sidebar.subheader("🌐 Server Status")
+
+def check_server_status(url: str, server_type: str = "generic", timeout: int = 3) -> str:
+    """Check if a server is running with proper protocol support"""
+    try:
+        if server_type == "mcp":
+            # MCP uses JSON-RPC - a plain TCP connect tells us if the port is listening
+            import socket
+            from urllib.parse import urlparse
+
+            parsed = urlparse(url)
+            host = parsed.hostname or 'localhost'
+            port = parsed.port or 8001
+
+            sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+            sock.settimeout(timeout)
+            result = sock.connect_ex((host, port))
+            sock.close()
+
+            return "Running" if result == 0 else "Not Running"
+
+        elif server_type == "ollama":
+            response = requests.get(url, timeout=timeout)
+            return "Running" if response.status_code == 200 else "Error"
+
+        # Unknown server types fall through here rather than returning None
+        return "Unknown"
+
+    except Exception:
+        return "Not Running"
+
+# Server status indicators
+mcp_status = check_server_status("http://localhost:8001/mcp", server_type="mcp")
+ollama_status = check_server_status("http://localhost:11434/api/tags", server_type="ollama")
+
+st.sidebar.markdown(f"**MCP Server:** {'🟢' if mcp_status == 'Running' else '🔴'} {mcp_status}")
+st.sidebar.markdown(f"**Ollama Server:** {'🟢' if ollama_status == 'Running' else '🔴'} {ollama_status}")
+
+if mcp_status != "Running":
+    st.sidebar.warning("⚠️ MCP Server not detected. Start with: `python mcp_server.py`")
+if ollama_status != "Running":
+    st.sidebar.warning("⚠️ Ollama Server not detected. 
Start with: `ollama serve`") + +# Main interface +col1, col2 = st.columns([2, 1]) + +with col1: + st.subheader("📝 Simulation Request") + + # Pre-defined examples + example_requests = [ + "Simulate 14 operators, 12 nurses and 5% extra demand", + "Run scenario with high staffing and normal call volume", + "Test configuration with minimal staff (5 operators, 3 nurses)", + "Simulate peak hours with 20 operators, 15 nurses and 20% increased demand", + "Run baseline scenario with default parameters" + ] + + selected_example = st.selectbox( + "Quick Examples:", + ["Custom Input"] + example_requests, + help="Select a pre-defined example or choose 'Custom Input'" + ) + + if selected_example == "Custom Input": + user_input = st.text_area( + "Enter your simulation request:", + placeholder="e.g., Simulate 14 operators, 12 nurses and 5% extra demand", + height=100 + ) + else: + user_input = st.text_area( + "Simulation request:", + value=selected_example, + height=100 + ) + +with col2: + st.subheader("🎯 Agent Information") + + if agent_type == "Dynamic Planner": + st.markdown(""" +
+        <b>🧠 Dynamic Planner Agent</b><br>
+        <b>Features:</b><br>
+          • Dual LLM architecture<br>
+          • Dynamic task planning<br>
+          • Memory-driven execution<br>
+          • Step-by-step workflow<br>
+        <b>Models:</b><br>
+          • Planning: """ + planning_model + """<br>
+          • Summary: """ + summary_model + """
+        """, unsafe_allow_html=True)
+    else:
+        st.markdown("""
+        <b>🔄 Self-Reflective Agent</b><br>
+        <b>Features:</b><br>
+          • LangGraph state machine<br>
+          • Validation-driven learning<br>
+          • Bounded retry logic<br>
+          • Error analysis & reflection<br>
+        <b>Model:</b> """ + reflection_model + """
+ """, unsafe_allow_html=True) + +# Execution section +st.subheader("🚀 Execute Simulation") + +# Create execution button +if st.button("Run Simulation", type="primary", disabled=(not user_input.strip())): + if mcp_status != "Running" or ollama_status != "Running": + #if ollama_status != "Running": + st.error("❌ Please ensure both MCP and Ollama servers are running before executing.") + else: + with st.spinner("🤖 Agent is working..."): + # Create placeholders for live updates + status_placeholder = st.empty() + debug_placeholder = st.empty() + results_placeholder = st.empty() + + # Capture agent execution + class StreamCapture: + def __init__(self): + self.logs = [] + self.current_status = "Initializing..." + + def write(self, text): + if text.strip(): + self.logs.append(text.strip()) + # Update status based on content + if "Planning modelling task" in text: + self.current_status = "🧠 Planning simulation task..." + elif "Executing the plan" in text: + self.current_status = "⚙️ Executing plan steps..." + elif "Reasoning about simulation parameters" in text: + self.current_status = "🤔 Generating parameters..." + elif "Summarising parameters" in text: + self.current_status = "📊 Summarizing results..." + elif "Simulation complete" in text: + self.current_status = "✅ Simulation complete!" + + def flush(self): + pass + + capture = StreamCapture() + + try: + # Redirect stdout and stderr to capture agent output + original_stdout = sys.stdout + original_stderr = sys.stderr + sys.stdout = capture + sys.stderr = capture + + # Execute the selected agent + if agent_type == "Dynamic Planner": + # Simulate the agent execution (in real implementation, you'd call the actual functions) + status_placeholder.info("🧠 Starting Dynamic Planner Agent...") + + # In real implementation, you would do: + # result = asyncio.run(planning_main(planning_model, summary_model, planning_debug)) + + # For demonstration, simulate the process + time.sleep(1) + status_placeholder.info("📋 Generating execution plan...") + + if planning_debug: + debug_placeholder.markdown(""" +
+ 🐛 Debug mode enabled - showing workflow details<br>
+ === MCP SERVER CAPABILITIES ===<br>
+ 🔧 Available tools (3): run_call_centre_simulation, validate_simulation_parameters<br>
+ 📚 Available resources (2): schema/experiment_parameters, model/description<br>
+ 📝 Available prompts (1): parameter_jsonification_prompt<br>
+ === LLM GENERATED PLAN ===<br>
+ Step 1: Get experiment parameter schema<br>
+ → Type: Resource, Name: get_experiment_parameter_schema<br>
+ Step 2: Generate simulation parameters<br>
+ → Type: Prompt, Name: parameter_jsonification_prompt<br>
+ Step 3: Validate parameters<br>
+ → Type: Tool, Name: validate_simulation_parameters<br>
+ Step 4: Run simulation<br>
+ → Type: Tool, Name: run_call_centre_simulation
+ """, unsafe_allow_html=True) + + time.sleep(2) + status_placeholder.info("⚙️ Executing planned steps...") + time.sleep(2) + status_placeholder.success("✅ Simulation complete!") + + else: # Self-Reflective Agent + status_placeholder.info("🔄 Starting Self-Reflective Agent...") + + if reflection_debug: + debug_placeholder.markdown(""" +
+ 🐛 Debug mode enabled - showing reflection details<br>
+ 🧠 Reasoning about simulation parameters...<br>
+ ✅ Parameter validation successful<br>
+ 📊 Parameters formatted successfully<br>
+ 🏥 Running healthcare call centre simulation...
+ """, unsafe_allow_html=True) + + time.sleep(1) + status_placeholder.info("🤔 Generating parameters...") + time.sleep(2) + status_placeholder.info("✅ Parameters validated successfully") + time.sleep(1) + status_placeholder.info("🏥 Running simulation...") + time.sleep(2) + status_placeholder.success("✅ Simulation complete!") + + # Display mock results (in real implementation, these would come from the agents) + sample_parameters = { + "n_operators": 14, + "n_nurses": 12, + "mean_iat": 0.57, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.4, + "run_length": 1000, + "random_seed": 42 + } + + sample_results = { + "01_mean_waiting_time": 2.45, + "02_operator_util": 78.5, + "03_mean_nurse_waiting_time": 1.23, + "04_nurse_util": 65.2, + "05_callback_rate": 39.8 + } + + # Display results + results_placeholder.markdown(""" +
<h3>🎯 Simulation Results</h3>
+ """, unsafe_allow_html=True) + + # Parameters table + st.subheader("📊 Parameters Used") + param_df = pd.DataFrame(list(sample_parameters.items()), columns=['Parameter', 'Value']) + st.dataframe(param_df, use_container_width=True) + + # Results table + st.subheader("📈 Key Performance Indicators") + results_df = pd.DataFrame(list(sample_results.items()), columns=['KPI', 'Value']) + st.dataframe(results_df, use_container_width=True) + + # Store results in session state + st.session_state.simulation_results = { + 'parameters': sample_parameters, + 'results': sample_results, + 'agent_type': agent_type, + 'timestamp': time.strftime("%Y-%m-%d %H:%M:%S") + } + + except Exception as e: + st.error(f"❌ Error during execution: {str(e)}") + + finally: + # Restore stdout and stderr + sys.stdout = original_stdout + sys.stderr = original_stderr + +# Results history section +if st.session_state.simulation_results: + st.subheader("📚 Recent Results") + + with st.expander("View Last Simulation Results", expanded=False): + results = st.session_state.simulation_results + + col1, col2 = st.columns(2) + with col1: + st.write("**Agent Type:**", results['agent_type']) + st.write("**Timestamp:**", results['timestamp']) + + with col2: + if st.button("📋 Copy Parameters JSON"): + st.code(json.dumps(results['parameters'], indent=2)) + + st.write("**Parameters:**") + st.json(results['parameters']) + + st.write("**Results:**") + st.json(results['results']) + +# Footer with helpful information +# st.markdown("---") +# st.markdown(""" +# ### 💡 Tips for Better Results + +# **For Dynamic Planner:** +# - Use gemma3:27b for best planning performance +# - Enable debug mode to see step-by-step execution +# - Try complex requests that require multi-step reasoning + +# **For Self-Reflective Agent:** +# - gemma3:27b and mistral:7b work well for parameter generation +# - The agent will automatically retry if validation fails +# - Watch for reflection patterns in debug mode + +# **Common Request Patterns:** +# - "Simulate X operators, Y nurses and Z% extra demand" +# - "Run scenario with [adjective] staffing and [adjective] call volume" +# - "Test configuration with [specific parameters]" +# """) + +# # Troubleshooting section +# with st.expander("🔧 Troubleshooting", expanded=False): +# st.markdown(""" +# **Common Issues:** + +# 1. **MCP Server Not Running:** +# ```bash +# python mcp_server.py +# ``` +# Server should be available at http://localhost:8001/mcp + +# 2. **Ollama Server Not Running:** +# ```bash +# ollama serve +# ``` +# Then pull required models: +# ```bash +# ollama pull gemma3:27b +# ollama pull llama3:latest +# ollama pull mistral:7b +# ``` + +# 3. **Model Not Found:** +# Check available models: `ollama list` + +# 4. **Connection Errors:** +# - Check firewall settings +# - Ensure ports 8001 and 11434 are available +# - Verify model names match exactly +# """) \ No newline at end of file diff --git a/app2.py b/app2.py new file mode 100644 index 0000000..6363621 --- /dev/null +++ b/app2.py @@ -0,0 +1,486 @@ +""" +This code was written by perplexity.ai labs initially. I've been editing it. +Likely to modify a lot as this isn't quite what I wanted. 
+ +""" +import streamlit as st +import asyncio +import json +import time +import sys +import os +from typing import Optional, Dict, Any, List +import pandas as pd +from datetime import datetime +import requests +import socket +from urllib.parse import urlparse + +# Import the agents +try: + from langchain_ollama import OllamaLLM + from fastmcp import Client + from agent_planning_workflow import main as planning_main + from agent_self_reflection import main as reflection_main + import agent_planning_workflow + import agent_self_reflection + AGENTS_AVAILABLE = True +except ImportError as e: + st.error(f"Import error: {e}. Please ensure all dependencies are installed and agents are accessible.") + AGENTS_AVAILABLE = False + +# Page configuration +st.set_page_config( + page_title="Sim Experimentation Agent Interface", + page_icon="🤖", + layout="wide", + initial_sidebar_state="expanded" +) + + +# Initialize session state +if 'chat_messages' not in st.session_state: + st.session_state.chat_messages = [] +if 'simulation_results' not in st.session_state: + st.session_state.simulation_results = None +if 'agent_busy' not in st.session_state: + st.session_state.agent_busy = False + + +def get_ollama_models(): + """Get list of models from Ollama server""" + try: + response = requests.get("http://localhost:11434/api/tags") + if response.status_code == 200: + data = response.json() + models = [model["name"] for model in data["models"]] + return models + return [] + except Exception as e: + print(f"Error: {e}") + return [] + +# maybe refactor into two seperate functions +def check_server_status(url: str, server_type: str = "generic", timeout: int = 3) -> str: + """Check if a server (Ollama or Simulation MCP) is running with proper protocol support""" + try: + if server_type == "mcp": + + parsed = urlparse(url) + host = parsed.hostname or 'localhost' + port = parsed.port or 8001 + + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + sock.settimeout(timeout) + result = sock.connect_ex((host, port)) + sock.close() + + return "Running" if result == 0 else "Not Running" + + elif server_type == "ollama": + response = requests.get(url, timeout=timeout) + return "Running" if response.status_code == 200 else "Error" + + except Exception as e: + return "Not Running" + +async def run_self_reflection_agent(user_input: str, model_name: str, debug_mode: bool = False) -> Dict[str, Any]: + """Run the actual self-reflection agent""" + try: + # Create the LLM and graph + llm = OllamaLLM(model=model_name, base_url="http://localhost:11434") + compiled_graph = agent_self_reflection.build_graph(llm) + + # Run the agent + final_state = await compiled_graph.ainvoke({ + "user_input": user_input, + "retry_count": 0, + "validation_history": [] + }) + + return { + 'status': 'success', + 'agent_type': 'Self-Reflective', + 'model': model_name, + 'parameters': final_state.get('parameters', {}), + 'simulation_results': final_state.get('simulation_result', {}), + 'formatted_parameters': final_state.get('formatted_parameters', ''), + 'validation_history': final_state.get('validation_history', []), + 'retry_count': final_state.get('retry_count', 0), + 'user_input': user_input, + 'debug_mode': debug_mode + } + + except Exception as e: + import traceback + return { + 'status': 'error', + 'error': str(e), + 'traceback': traceback.format_exc(), + 'agent_type': 'Self-Reflective' + } + +def format_results_for_chat(results: Dict[str, Any]) -> str: + """Format simulation results for display in chat""" + if results.get('status') != 
+
+def format_results_for_chat(results: Dict[str, Any]) -> str:
+    """Format simulation results for display in chat"""
+    if results.get('status') != 'success':
+        return f"❌ **Error:** {results.get('error', 'Unknown error')}"
+
+    # Format the response
+    response = "✅ **Simulation Complete!**\n\n"
+
+    # Add agent info (use .get() so a None placeholder does not print as "Model: None")
+    response += f"**Agent:** {results['agent_type']}\n"
+    if results.get('model'):
+        response += f"**Model:** {results['model']}\n"
+    elif results.get('models'):
+        response += f"**Planning Model:** {results['models']['planning']}\n"
+        response += f"**Summary Model:** {results['models']['summary']}\n"
+
+    # Add validation info for the self-reflection agent
+    if results.get('retry_count', 0) > 0:
+        response += f"**Validation Attempts:** {results['retry_count'] + 1}\n"
+
+    response += "\n"
+
+    # Format key metrics
+    # sim_results = results.get('simulation_results', {})
+
+    # if sim_results:
+    #     response += "**📊 Key Performance Indicators:**\n"
+    #     response += f"• **Mean Wait Time:** {sim_results.get('01_mean_waiting_time', 0):.2f} minutes\n"
+    #     response += f"• **Operator Utilization:** {sim_results.get('02_operator_util', 0):.1f}%\n"
+    #     response += f"• **Nurse Wait Time:** {sim_results.get('03_mean_nurse_waiting_time', 0):.2f} minutes\n"
+    #     response += f"• **Nurse Utilization:** {sim_results.get('04_nurse_util', 0):.1f}%\n"
+    #     response += f"• **Callback Rate:** {sim_results.get('05_callback_rate', 0):.1f}%\n\n"
+
+    # Format parameters
+    params = results.get('parameters', {})
+    if params:
+        response += "**⚙️ Parameters Used:**\n"
+        for key, value in params.items():
+            clean_key = key.replace('_', ' ').title()
+            if isinstance(value, float):
+                response += f"• **{clean_key}:** {value:.3f}\n"
+            else:
+                response += f"• **{clean_key}:** {value}\n"
+
+    return response
+
+
+def format_results_table(results: Dict[str, Any]) -> pd.DataFrame:
+    """
+    Convert simulation results into a DataFrame for display in the Streamlit chat
+    """
+    sim_results = results.get('simulation_results', {})
+    df = pd.DataFrame(
+        list(sim_results.items()), columns=["KPIs", "Values"]
+    )
+
+    return df
+
+
+def display_debug_info(results: Dict[str, Any]) -> str:
+    """Format debug information for chat display"""
+    if not results.get('debug_mode'):
+        return ""
+
+    debug_info = "\n**🐛 Debug Information:**\n"
+
+    if results['agent_type'] == 'Self-Reflective':
+        validation_history = results.get('validation_history', [])
+        if validation_history:
+            debug_info += "\n**Validation History:**\n"
+            for i, attempt in enumerate(validation_history, 1):
+                status = "✅ Success" if attempt.get('validation_result', {}).get('is_valid', False) else "❌ Failed"
+                debug_info += f"Attempt {i}: {status}\n"
+
+                errors = attempt.get('validation_result', {}).get('errors', [])
+                if errors:
+                    debug_info += "Errors: " + ", ".join(errors) + "\n"
+
+    return debug_info
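+
+# A hedged example of how the helpers above compose (the result shape here is
+# assumed for illustration; the real keys come from the agent graph):
+#
+#   demo = {"status": "success", "agent_type": "Self-Reflective",
+#           "model": "gemma3:27b", "parameters": {"n_operators": 14},
+#           "simulation_results": {"01_mean_waiting_time": 2.34}}
+#   chat_text = format_results_for_chat(demo)   # markdown summary string
+#   kpi_table = format_results_table(demo)      # DataFrame with KPIs/Values columns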
+
+# Header
+st.markdown("# 🤖 Simulation Experimentation Agent Interface")
+
+st.markdown(
+    """**Ask the agent simulation questions in natural language and it will run
+    the model and return the results.** Click the buttons below for example
+    questions.""")
+
+# Sidebar configuration
+st.sidebar.header("🔧 Configuration")
+
+# Agent selection
+agent_type = st.sidebar.selectbox(
+    "Select Agent Type",
+    ["Self-Reflective", "Dynamic Planner"],  # Prioritize working agent
+    help="Choose between the two agent architectures"
+)
+
+# Model configuration
+st.sidebar.subheader("🧠 LLM Configuration")
+
+# Get models from Ollama with fallbacks
+available_models = get_ollama_models()
+if not available_models:
+    available_models = ["gemma3:27b", "llama3:7b", "llama3:latest", "mistral:7b"]
+
+# Find default indices
+try:
+    gemma_index = available_models.index("gemma3:27b")
+except ValueError:
+    gemma_index = 0
+
+try:
+    llama_index = available_models.index("llama3:latest")
+except ValueError:
+    llama_index = 0
+
+if agent_type == "Dynamic Planner":
+    planning_model = st.sidebar.selectbox(
+        "Planning Model",
+        available_models,
+        index=gemma_index,
+        help="Model used for reasoning and task planning"
+    )
+
+    summary_model = st.sidebar.selectbox(
+        "Summary Model",
+        available_models,
+        index=llama_index,
+        help="Model used for summarizing parameters and results"
+    )
+
+    debug_mode = st.sidebar.checkbox(
+        "Enable Debug Mode",
+        help="Show detailed workflow and planning information"
+    )
+
+else:  # Self-Reflective Agent
+    reflection_model = st.sidebar.selectbox(
+        "Model",
+        available_models,
+        index=gemma_index,
+        help="Model used for parameter generation and self-reflection"
+    )
+
+    debug_mode = st.sidebar.checkbox(
+        "Enable Debug Mode",
+        help="Show detailed reflection and retry information"
+    )
+
+# Server status
+st.sidebar.subheader("🌐 Server Status")
+mcp_status = check_server_status("http://localhost:8001/mcp", server_type="mcp")
+ollama_status = check_server_status("http://localhost:11434/api/tags", server_type="ollama")
+
+st.sidebar.markdown(f"**MCP Server:** {'🟢' if mcp_status == 'Running' else '🔴'} {mcp_status}")
+st.sidebar.markdown(f"**Ollama Server:** {'🟢' if ollama_status == 'Running' else '🔴'} {ollama_status}")
+
+if mcp_status != "Running":
+    st.sidebar.warning("⚠️ MCP Server not detected. Start with: `python mcp_server.py`")
+if ollama_status != "Running":
+    st.sidebar.warning("⚠️ Ollama Server not detected. Start with: `ollama serve`")
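+
+# For reference, a sketch of the Ollama /api/tags payload that
+# get_ollama_models() parses (abridged; the exact field set may vary by
+# Ollama version):
+#
+#   {"models": [{"name": "gemma3:27b", ...}, {"name": "mistral:7b", ...}]}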
+
+# Clear chat button in sidebar
+if st.sidebar.button("🗑️ Clear Chat"):
+    st.session_state.chat_messages = []
+    st.session_state.simulation_results = None
+    st.rerun()
+
+# Main chat interface
+st.subheader(f"💬 Chat with {agent_type} Agent")
+
+# Chat container
+chat_container = st.container()
+
+with chat_container:
+    # Display chat messages
+    for message in st.session_state.chat_messages:
+        with st.chat_message(message["role"]):
+            st.markdown(message["content"])
+
+# Quick example buttons (shown while the agent is idle)
+if not st.session_state.agent_busy:
+    st.subheader("💡 Quick Examples")
+    example_cols = st.columns(3)
+
+    examples = [
+        "Simulate 14 operators, 12 nurses and 5% extra demand",
+        "Run scenario with high staffing and normal call volume",
+        "Test minimal configuration (5 operators, 3 nurses)"
+    ]
+
+    for i, example in enumerate(examples):
+        with example_cols[i]:
+            if st.button(example, key=f"example_{i}", use_container_width=True):
+                # Add example to chat and process
+                st.session_state.chat_messages.append({"role": "user", "content": example})
+                st.session_state.agent_busy = True
+                st.rerun()
+
+# Chat input
+if prompt := st.chat_input("Enter your simulation request...", disabled=st.session_state.agent_busy):
+    # Add user message to chat
+    st.session_state.chat_messages.append({"role": "user", "content": prompt})
+    st.session_state.agent_busy = True
+    st.rerun()
+
+# Process the latest message if the agent is busy
+if st.session_state.agent_busy and st.session_state.chat_messages:
+    latest_message = st.session_state.chat_messages[-1]
+
+    if latest_message["role"] == "user":
+        user_input = latest_message["content"]
+
+        # Check if servers are ready
+        if mcp_status != "Running" or ollama_status != "Running":
+            error_msg = "❌ **System Not Ready**\n\nPlease ensure both MCP and Ollama servers are running:\n"
+            error_msg += "- MCP Server: `python mcp_server.py`\n"
+            error_msg += "- Ollama Server: `ollama serve`"
+
+            st.session_state.chat_messages.append({"role": "assistant", "content": error_msg})
+            st.session_state.agent_busy = False
+            st.rerun()
+
+        else:
+            # Show processing message
+            with st.chat_message("assistant"):
+                with st.spinner(f"🤖 {agent_type} Agent is processing your request..."):
+
+                    if agent_type == "Self-Reflective" and AGENTS_AVAILABLE:
+                        # Run the actual self-reflection agent
+                        try:
+                            start_time = time.time()
+
+                            # Create progress indicator
+                            progress_placeholder = st.empty()
+                            progress_placeholder.info("🔄 Initializing self-reflection agent...")
+
+                            # Run the agent
+                            result = asyncio.run(run_self_reflection_agent(
+                                user_input, reflection_model, debug_mode
+                            ))
+
+                            execution_time = time.time() - start_time
+                            result['execution_time'] = execution_time
+
+                            progress_placeholder.success(f"✅ Completed in {execution_time:.1f} seconds")
+
+                        except Exception as e:
+                            result = {
+                                'status': 'error',
+                                'error': str(e),
+                                'agent_type': 'Self-Reflective'
+                            }
+
+                    else:
+                        # Mock execution for Dynamic Planner or when agents not available
+                        time.sleep(3)  # Simulate processing
+                        result = {
+                            'status': 'success',
+                            'agent_type': agent_type,
+                            'model': reflection_model if agent_type == "Self-Reflective" else None,
+                            'models': {'planning': planning_model, 'summary': summary_model} if agent_type == "Dynamic Planner" else None,
+                            'parameters': {
+                                'n_operators': 14, 'n_nurses': 12, 'mean_iat': 0.57,
+                                'call_low': 5.0, 'call_mode': 7.0, 'call_high': 10.0,
+                                'callback_prob': 0.4, 'run_length': 1000, 'random_seed': 42
+                            },
+                            # key name matches what the display code reads below
+                            'simulation_results': {
+                                '01_mean_waiting_time': 2.34, '02_operator_util': 76.8,
+                                '03_mean_nurse_waiting_time': 1.45, '04_nurse_util': 68.2,
+                                '05_callback_rate': 41.2
+                            },
+                            'retry_count': 0,
+                            'debug_mode': debug_mode
+                        }
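+
+                    # Note: the real and mock execution branches must emit the
+                    # same result schema -- in particular the key
+                    # 'simulation_results' -- because the display and export
+                    # code below reads result.get('simulation_results', {}).
+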
+                    # Format and add response
+                    response_text = format_results_for_chat(result)
+                    st.markdown(response_text)
+
+                    # Display results table if simulation results exist
+                    sim_results = result.get('simulation_results', {})
+                    if sim_results:
+                        st.write("\n")  # Add spacing
+                        df = pd.DataFrame(
+                            list(sim_results.items()),
+                            columns=["KPIs", "Values"]
+                        )
+
+                        st.markdown("**📊 Key Performance Indicators:**\n")
+                        # st.dataframe(df.round(2), width='stretch')
+                        st.markdown(df.round(2).to_markdown(index=False))
+
+                    # Add debug info if enabled
+                    if debug_mode:
+                        debug_info = display_debug_info(result)
+                        st.markdown(debug_info)
+
+                    # Store message content for history (KPI table embedded as markdown)
+                    response_text_stored = format_results_for_chat(result)
+                    if sim_results:
+                        response_text_stored += "\n\n📊 **Key Performance Indicators:**\n"
+                        response_text_stored += df.round(2).to_markdown(index=False)
+                    if debug_mode:
+                        response_text_stored += display_debug_info(result)
+                    response_text_stored += f"\n\n*Completed at {datetime.now().strftime('%H:%M:%S')}*"
+
+                    st.session_state.chat_messages.append({
+                        "role": "assistant",
+                        "content": response_text_stored,
+                        "has_table": bool(sim_results),
+                        "table_data": sim_results
+                    })
+                    st.session_state.simulation_results = result
+                    st.session_state.agent_busy = False
+                    st.rerun()
+
+# Export options
+if st.session_state.simulation_results:
+    st.subheader("💾 Export Results")
+
+    col1, col2, col3 = st.columns(3)
+
+    results = st.session_state.simulation_results
+
+    with col1:
+        if results.get('parameters'):
+            param_df = pd.DataFrame([
+                {"Parameter": k.replace('_', ' ').title(), "Value": v}
+                for k, v in results['parameters'].items()
+            ])
+            csv_params = param_df.to_csv(index=False)
+            st.download_button(
+                "📊 Parameters CSV",
+                csv_params,
+                file_name=f"parameters_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv",
+                mime="text/csv"
+            )
+
+    with col2:
+        if results.get('simulation_results'):
+            results_df = pd.DataFrame([
+                {"KPI": k.replace('_', ' ').title(), "Value": v}
+                for k, v in results['simulation_results'].items()
+            ])
+            csv_results = results_df.to_csv(index=False)
+            st.download_button(
+                "📈 Results CSV",
+                csv_results,
+                file_name=f"results_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv",
+                mime="text/csv"
+            )
+
+    with col3:
+        full_json = json.dumps(results, indent=2, default=str)
+        st.download_button(
+            "📋 Full Report JSON",
+            full_json,
+            file_name=f"simulation_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
+            mime="application/json"
+        )
+
diff --git a/evals/evals_output.json b/evals/evals_output.json
index 71caedf..62caeec 100644
--- a/evals/evals_output.json
+++ b/evals/evals_output.json
@@ -30,8 +30,8 @@
     },
     "passed": true,
     "context": {
-      "trace_id": "05e9185a30490fa463f00ea5b0e65108",
-      "span_id": "39cfc1324c7aee19"
+      "trace_id": "c717c279cc25965c8b93849431cbe2df",
+      "span_id": "105b7a80e342ca90"
     }
   },
   "example_2": {
@@ -65,8 +65,8 @@
     },
     "passed": true,
     "context": {
-      "trace_id": "95fc97f63b485e37d20ada5d01b2acdb",
-      "span_id": "f66b8c36689fbdb2"
+      "trace_id": "43b85c1dcc267e4c5e648552771de775",
+      "span_id": "26f1c84b0cecb5a6"
     }
   },
   "example_3": {