From ed832ca3e0b503ab55d2ca26bc809be8c93d0f81 Mon Sep 17 00:00:00 2001 From: TomMonks Date: Fri, 1 Aug 2025 16:02:13 +0100 Subject: [PATCH 01/23] feat(pkg): start of MCP sim package --- mcpsim/__init__.py | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 mcpsim/__init__.py diff --git a/mcpsim/__init__.py b/mcpsim/__init__.py new file mode 100644 index 0000000..66e77de --- /dev/null +++ b/mcpsim/__init__.py @@ -0,0 +1,5 @@ +"""Package for managing Model Context Protocol servers for simulation models """ + +__version__ = "0.1.0" +__author__ = "Tom Monks, Alison Harper" \ No newline at end of file From a3abce3288e66ec4fc1bffdea56b80d5bcdd57a3 Mon Sep 17 00:00:00 2001 From: TomMonks Date: Fri, 1 Aug 2025 16:09:32 +0100 Subject: [PATCH 02/23] feat(pkg): +Protocol to adapt sim model interface --- mcpsim/protocol.py | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 mcpsim/protocol.py diff --git a/mcpsim/protocol.py b/mcpsim/protocol.py new file mode 100644 index 0000000..4b3be64 --- /dev/null +++ b/mcpsim/protocol.py @@ -0,0 +1,36 @@ +"""Defines a simple protocol that each simulation model should +use in order to be compatible with the agent. """ + +from typing import Protocol, runtime_checkable, Dict, Any +from abc import abstractmethod + +@runtime_checkable +class SimulationModelAdapter(Protocol): + """Protocol defining the interface for MCP-compatible simulation models.""" + + @abstractmethod + def run_simulation(self, parameters: Dict[str, Any]) -> Dict[str, Any]: + """Execute the simulation with given parameters and return results.""" + ... + + @abstractmethod + def get_parameter_schema(self) -> Dict[str, Any]: + """Return JSON schema for valid simulation parameters.""" + ... + + @abstractmethod + def get_model_description(self) -> str: + """Return human-readable description of the simulation model.""" + ... + + @abstractmethod + def validate_parameters(self, parameters: Dict[str, Any]) -> Dict[str, Any]: + """Validate parameters and return validation results.""" + ... + + @property + @abstractmethod + def model_name(self) -> str: + """Return the name/identifier for this simulation model.""" + ... 
\ No newline at end of file From b6db7c585e5d448f98fd1c7c930b6e311882fe85 Mon Sep 17 00:00:00 2001 From: TomMonks Date: Fri, 1 Aug 2025 16:30:12 +0100 Subject: [PATCH 03/23] feat(examples): call_centre_model --- mcpsim/example_models/__init__.py | 0 mcpsim/example_models/callcentre/__init__.py | 0 .../callcentre/call_centre_mcp.py | 73 ++++ .../callcentre/call_centre_model.py | 374 ++++++++++++++++++ .../callcentre/resources/parameter_prompt.txt | 45 +++ .../callcentre/resources/schema.json | 79 ++++ 6 files changed, 571 insertions(+) create mode 100644 mcpsim/example_models/__init__.py create mode 100644 mcpsim/example_models/callcentre/__init__.py create mode 100644 mcpsim/example_models/callcentre/call_centre_mcp.py create mode 100644 mcpsim/example_models/callcentre/call_centre_model.py create mode 100644 mcpsim/example_models/callcentre/resources/parameter_prompt.txt create mode 100644 mcpsim/example_models/callcentre/resources/schema.json diff --git a/mcpsim/example_models/__init__.py b/mcpsim/example_models/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/mcpsim/example_models/callcentre/__init__.py b/mcpsim/example_models/callcentre/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/mcpsim/example_models/callcentre/call_centre_mcp.py b/mcpsim/example_models/callcentre/call_centre_mcp.py new file mode 100644 index 0000000..25c0fd8 --- /dev/null +++ b/mcpsim/example_models/callcentre/call_centre_mcp.py @@ -0,0 +1,73 @@ +import json +from model import run_simulation_from_dict +from typing import Dict, Any + + +class CallCentreSimulation: + """Call centre simulation implementation following + the SimulationModelAdapter protocol.""" + + def __init__(self, schema_path: str = "resources/schema.json"): + self.schema_path = schema_path + self._schema = None + + @property + def model_name(self) -> str: + return "urgent_care_call_centre" + + def run_simulation(self, parameters: Dict[str, Any]) -> Dict[str, Any]: + """Run the discrete-event healthcare call centre simulation.""" + return run_simulation_from_dict(parameters) + + def get_parameter_schema(self) -> Dict[str, Any]: + """Load and return the parameter schema.""" + if self._schema is None: + with open(self.schema_path) as f: + self._schema = json.load(f) + return self._schema + + def get_model_description(self) -> str: + """Return human-readable description of the call centre model.""" + return ( + "This is a discrete-event simulation of a healthcare call centre. " + "Patients call in, interact with operators, and a subset may require a nurse callback. " + "Simulation components: SimPy queues and resources. Tracks wait times, utilization, and callback rates. " + "Configurable parameters: number of operators and nurses, call durations and rates, etc. 
" + "Sample: 'Run with 14 operators and 5% higher demand.'" + ) + + def validate_parameters(self, parameters: Dict[str, Any]) -> Dict[str, Any]: + """Validate simulation parameters against the experiment schema.""" + schema = self.get_parameter_schema() + errors = [] + + for key, value in parameters.items(): + # Check for unknown parameters + if key not in schema: + errors.append(f"Unknown parameter: {key}") + continue + + spec = schema[key] + expected_type = int if spec["type"] == "int" else float + + # Type validation + if not isinstance(value, expected_type): + errors.append(f"{key} must be {spec['type']}") + continue + + # Range validation + if "minimum" in spec and value < spec["minimum"]: + errors.append(f"{key} below minimum {spec['minimum']}") + if "maximum" in spec and value > spec["maximum"]: + errors.append(f"{key} above maximum {spec['maximum']}") + + # Cross-parameter validation + if all(x in parameters for x in ("call_low", "call_mode", "call_high")): + if not (parameters["call_low"] <= parameters["call_mode"] <= parameters["call_high"]): + errors.append("call_low ≤ call_mode ≤ call_high violated") + + if all(x in parameters for x in ("nurse_consult_low", "nurse_consult_high")): + if not (parameters["nurse_consult_low"] <= parameters["nurse_consult_high"]): + errors.append("nurse_consult_low ≤ nurse_consult_high violated") + + return {"is_valid": len(errors) == 0, "errors": errors} \ No newline at end of file diff --git a/mcpsim/example_models/callcentre/call_centre_model.py b/mcpsim/example_models/callcentre/call_centre_model.py new file mode 100644 index 0000000..ede7143 --- /dev/null +++ b/mcpsim/example_models/callcentre/call_centre_model.py @@ -0,0 +1,374 @@ + +""" +Enhanced Call Centre Simulation Model with Nurse Callbacks + +A discrete event simulation model of a call centre using SimPy. +Extended to include nurse callbacks for 40% of patients. + +Author: Enhanced from Tom Monks' original model +""" + +import numpy as np +import pandas as pd +import simpy +import itertools +import matplotlib.pyplot as plt +import seaborn as sns +import inspect + +# ============================================================================= +# CONSTANTS AND DEFAULT VALUES +# ============================================================================= + +# Default resources +N_OPERATORS = 13 +N_NURSES = 10 + +# Default mean inter-arrival time (exp) +MEAN_IAT = 60 / 100 + +# Default service time parameters (triangular) +CALL_LOW = 5.0 +CALL_MODE = 7.0 +CALL_HIGH = 10.0 + +# Nurse callback parameters +CALLBACK_PROBABILITY = 0.4 # 40% of patients need nurse callback +NURSE_CONSULT_LOW = 10.0 # Uniform distribution 10-20 minutes +NURSE_CONSULT_HIGH = 20.0 + +# Sampling settings +N_STREAMS = 4 # Increased for additional distributions +DEFAULT_RND_SET = 0 + +# Boolean switch to display simulation results as the model runs +TRACE = False + +# Run variables +RESULTS_COLLECTION_PERIOD = 1000 + +# ============================================================================= +# DISTRIBUTION CLASSES +# ============================================================================= + +class Triangular: + """ + Convenience class for the triangular distribution. + Packages up distribution parameters, seed and random generator. 
+ """ + def __init__(self, low, mode, high, random_seed=None): + self.rand = np.random.default_rng(seed=random_seed) + self.low = low + self.high = high + self.mode = mode + + def sample(self, size=None): + return self.rand.triangular(self.low, self.mode, self.high, size=size) + +class Exponential: + """ + Convenience class for the exponential distribution. + Packages up distribution parameters, seed and random generator. + """ + def __init__(self, mean, random_seed=None): + self.rand = np.random.default_rng(seed=random_seed) + self.mean = mean + + def sample(self, size=None): + return self.rand.exponential(self.mean, size=size) + +class Uniform: + """ + Convenience class for the uniform distribution. + Packages up distribution parameters, seed and random generator. + """ + def __init__(self, low, high, random_seed=None): + self.rand = np.random.default_rng(seed=random_seed) + self.low = low + self.high = high + + def sample(self, size=None): + return self.rand.uniform(self.low, self.high, size=size) + +class Bernoulli: + """ + Convenience class for the Bernoulli distribution. + Used for callback decision (40% probability). + """ + def __init__(self, p, random_seed=None): + self.rand = np.random.default_rng(seed=random_seed) + self.p = p + + def sample(self, size=None): + return self.rand.binomial(1, self.p, size=size) + +# ============================================================================= +# EXPERIMENT CLASS +# ============================================================================= + +class Experiment: + """ + Enhanced experiment class with nurse callback functionality. + """ + def __init__( + self, + random_number_set=DEFAULT_RND_SET, + n_operators=N_OPERATORS, + n_nurses=N_NURSES, + mean_iat=MEAN_IAT, + call_low=CALL_LOW, + call_mode=CALL_MODE, + call_high=CALL_HIGH, + callback_prob=CALLBACK_PROBABILITY, + nurse_consult_low=NURSE_CONSULT_LOW, + nurse_consult_high=NURSE_CONSULT_HIGH, + n_streams=N_STREAMS, + ): + # sampling + self.random_number_set = random_number_set + self.n_streams = n_streams + + # store parameters for the run of the model + self.n_operators = n_operators + self.n_nurses = n_nurses + self.mean_iat = mean_iat + self.call_low = call_low + self.call_mode = call_mode + self.call_high = call_high + self.callback_prob = callback_prob + self.nurse_consult_low = nurse_consult_low + self.nurse_consult_high = nurse_consult_high + + # resources: initialized after Environment is created + self.operators = None + self.nurses = None + + # initialise results and sampling + self.init_results_variables() + self.init_sampling() + + def set_random_no_set(self, random_number_set): + self.random_number_set = random_number_set + self.init_sampling() + + def init_sampling(self): + """Create the distributions used by the model""" + # produce n non-overlapping streams + seed_sequence = np.random.SeedSequence(self.random_number_set) + self.seeds = seed_sequence.spawn(self.n_streams) + + # create distributions + self.arrival_dist = Exponential(self.mean_iat, random_seed=self.seeds[0]) + self.call_dist = Triangular( + self.call_low, self.call_mode, self.call_high, random_seed=self.seeds[1] + ) + self.callback_dist = Bernoulli(self.callback_prob, random_seed=self.seeds[2]) + self.nurse_dist = Uniform( + self.nurse_consult_low, self.nurse_consult_high, random_seed=self.seeds[3] + ) + + def init_results_variables(self): + """Initialize all experiment variables used in results collection""" + self.results = {} + + # Original results + self.results["waiting_times"] = [] + 
self.results["total_call_duration"] = 0.0 + + # New nurse results + self.results["nurse_waiting_times"] = [] + self.results["total_nurse_duration"] = 0.0 + self.results["callbacks_requested"] = 0 + self.results["total_patients"] = 0 + +# ============================================================================= +# UTILITY FUNCTIONS +# ============================================================================= + +def trace(msg): + """Turning printing of events on and off.""" + if TRACE: + print(msg) + +# ============================================================================= +# MODEL LOGIC +# ============================================================================= + +def nurse_service(identifier, env, args): + """ + Simulates the nurse callback process + 1. request and wait for a nurse + 2. nurse consultation (uniform) + 3. exit system + """ + # record the time that patient entered nurse queue + start_wait = env.now + + # request a nurse + with args.nurses.request() as req: + yield req + + # record the waiting time for nurse callback + waiting_time = env.now - start_wait + args.results["nurse_waiting_times"].append(waiting_time) + + trace(f"nurse started callback for patient {identifier} at {env.now:.3f}") + + # nurse consultation time + consult_duration = args.nurse_dist.sample() + yield env.timeout(consult_duration) + + # update total nurse duration + args.results["total_nurse_duration"] += consult_duration + + trace(f"nurse callback for patient {identifier} ended at {env.now:.3f}; " + f"waiting time was {waiting_time:.3f}") + +def service(identifier, env, args): + """ + Enhanced service process with nurse callback + 1. request and wait for a call operator + 2. phone triage (triangular) + 3. determine if callback needed (40% probability) + 4. if needed, initiate nurse callback process + 5. 
exit system + """ + # record the time that call entered the queue + start_wait = env.now + + # request an operator + with args.operators.request() as req: + yield req + + # record the waiting time for call to be answered + waiting_time = env.now - start_wait + args.results["waiting_times"].append(waiting_time) + + trace(f"operator answered call {identifier} at {env.now:.3f}") + + # call duration + call_duration = args.call_dist.sample() + yield env.timeout(call_duration) + + # update the total call duration + args.results["total_call_duration"] += call_duration + + trace(f"call {identifier} ended {env.now:.3f}; " + f"waiting time was {waiting_time:.3f}") + + # After operator call, determine if nurse callback is needed + args.results["total_patients"] += 1 + callback_needed = args.callback_dist.sample() + + if callback_needed: + args.results["callbacks_requested"] += 1 + trace(f"patient {identifier} requires nurse callback") + + # Start nurse callback process + env.process(nurse_service(identifier, env, args)) + +def arrivals_generator(env, args): + """Generate patient arrivals with exponential inter-arrival times""" + for caller_count in itertools.count(start=1): + # sample inter-arrival time + inter_arrival_time = args.arrival_dist.sample() + yield env.timeout(inter_arrival_time) + + trace(f"call arrives at: {env.now:.3f}") + + # start service process + env.process(service(caller_count, env, args)) + +# ============================================================================= +# EXPERIMENT EXECUTION FUNCTIONS +# ============================================================================= + +def single_run(experiment, rep=0, rc_period=RESULTS_COLLECTION_PERIOD): + """Perform a single run of the enhanced model""" + run_results = {} + + # reset all result collection variables + experiment.init_results_variables() + + # set random number set + experiment.set_random_no_set(rep) + + # create environment + env = simpy.Environment() + + # create resources + experiment.operators = simpy.Resource(env, capacity=experiment.n_operators) + experiment.nurses = simpy.Resource(env, capacity=experiment.n_nurses) + + # start arrivals process + env.process(arrivals_generator(env, experiment)) + + # run simulation + env.run(until=rc_period) + + # calculate results + run_results["01_mean_waiting_time"] = np.mean(experiment.results["waiting_times"]) + run_results["02_operator_util"] = ( + experiment.results["total_call_duration"] / + (rc_period * experiment.n_operators) + ) * 100.0 + + # Calculate nurse results + if experiment.results["nurse_waiting_times"]: + run_results["03_mean_nurse_waiting_time"] = np.mean(experiment.results["nurse_waiting_times"]) + else: + run_results["03_mean_nurse_waiting_time"] = 0.0 + + run_results["04_nurse_util"] = ( + experiment.results["total_nurse_duration"] / + (rc_period * experiment.n_nurses) + ) * 100.0 + + # Additional metrics + run_results["05_callback_rate"] = ( + experiment.results["callbacks_requested"] / + experiment.results["total_patients"] + ) * 100.0 if experiment.results["total_patients"] > 0 else 0.0 + + return run_results + +def multiple_replications(experiment, rc_period=RESULTS_COLLECTION_PERIOD, n_reps=5): + """Perform multiple replications of the enhanced model""" + results = [single_run(experiment, rep, rc_period) for rep in range(n_reps)] + df_results = pd.DataFrame(results) + df_results.index = np.arange(1, len(df_results) + 1) + df_results.index.name = "rep" + return df_results + +def set_trace(trace_on=True): + """Turn tracing on/off globally""" + 
global TRACE + TRACE = trace_on + + +def run_simulation_from_dict(params: dict): + """ + Wrapper to run the simulation using params supplied in a dictionary. + Only the parameters necessary for the Experiment's constructor are passed; + defaults are used for those not provided. + """ + # Extract constructor signature of the Experiment class + signature = inspect.signature(Experiment.__init__) + + # Build kwargs dict: match keys in 'params' to Experiment.__init__ params + experiment_kwargs = { + key: params.get(key, param.default) + for key, param in signature.parameters.items() + if key != 'self' # exclude 'self' from constructor + } + + # Create Experiment instance with dynamic/default parameters + exp = Experiment(**experiment_kwargs) + + # Separate non-constructor parameters + run_length = params.get("run_length", RESULTS_COLLECTION_PERIOD) + rep_seed = params.get("random_seed", DEFAULT_RND_SET) + + # Run simulation + return single_run(exp, rep=rep_seed, rc_period=run_length) + diff --git a/mcpsim/example_models/callcentre/resources/parameter_prompt.txt b/mcpsim/example_models/callcentre/resources/parameter_prompt.txt new file mode 100644 index 0000000..bcfe6d5 --- /dev/null +++ b/mcpsim/example_models/callcentre/resources/parameter_prompt.txt @@ -0,0 +1,45 @@ +You are a discrete-event simulation assistant. +You help turn natural language into structured simulation parameters. + +Here is the simulation configuration schema: +{schema} + +User request: {user_input} + +{validation_feedback} + +IMPORTANT: +- Return ONLY the parameter VALUES, not the schema structure +- Use the exact parameter names from the schema +- Output a simple JSON object with key-value pairs +- Do NOT include type information, descriptions, or constraints +- CALCULATE all mathematical expressions and provide ONLY the final numerical result +- Do NOT show calculations like "0.6 * 0.95" - provide the computed value like 0.57 +- All values must be final computed numbers, not mathematical expressions + +Example output format: +{{ + "n_operators": 15, + "n_nurses": 8, + "mean_iat": 0.57, + "random_seed": 42, + "run_length": 1000 +}} + +WRONG format (do not do this): +{{ + "mean_iat": "0.6 * 0.95" +}} + +CORRECT format: +{{ + "mean_iat": 0.57 +}} + + +Before providing the JSON: +1. Mentally calculate any mathematical expressions +2. Convert all calculations to final numerical values +3. Then output only the JSON with computed numbers + +JSON Response: diff --git a/mcpsim/example_models/callcentre/resources/schema.json b/mcpsim/example_models/callcentre/resources/schema.json new file mode 100644 index 0000000..66d699d --- /dev/null +++ b/mcpsim/example_models/callcentre/resources/schema.json @@ -0,0 +1,79 @@ +{ + "n_operators": { + "type": "int", + "default": 13, + "minimum": 1, + "maximum": 100, + "description": "Number of call operators (1-100)" + }, + "n_nurses": { + "type": "int", + "default": 10, + "minimum": 1, + "maximum": 50, + "description": "Number of nurses handling callbacks (1-50)" + }, + "mean_iat": { + "type": "float", + "default": 0.6, + "minimum": 0.1, + "maximum": 10.0, + "description": "Mean time between arrivals (minutes) or inter-arrival time. 0.1 = 10 calls/min, 10.0 = 1 call/10min. An increase in demand = a reduction in the inter-arrival time. 
A reduction in demand = an increase in inter-arrival time" + }, + "call_low": { + "type": "float", + "default": 5.0, + "minimum": 1.0, + "maximum": 30.0, + "description": "Low bound for call duration (minutes)" + }, + "call_mode": { + "type": "float", + "default": 7.0, + "minimum": 1.0, + "maximum": 30.0, + "description": "Mode for triangular call distribution (minutes)" + }, + "call_high": { + "type": "float", + "default": 10.0, + "minimum": 1.0, + "maximum": 30.0, + "description": "High bound for call duration (minutes)" + }, + "callback_prob": { + "type": "float", + "default": 0.4, + "minimum": 0.0, + "maximum": 1.0, + "description": "Nurse callback probability (0-1)" + }, + "nurse_consult_low": { + "type": "float", + "default": 10.0, + "minimum": 5.0, + "maximum": 60.0, + "description": "Low bound for nurse consult duration (minutes)" + }, + "nurse_consult_high": { + "type": "float", + "default": 20.0, + "minimum": 5.0, + "maximum": 60.0, + "description": "High bound for nurse consult duration (minutes)" + }, + "random_seed": { + "type": "int", + "default": 0, + "minimum": 0, + "maximum": 999999, + "description": "Random number seed for reproducibility" + }, + "run_length": { + "type": "int", + "default": 1000, + "minimum": 100, + "maximum": 86400, + "description": "Simulation runtime (minutes)" + } +} From 0932d4210183cf9513eefdb371acf1b5c2c81a3c Mon Sep 17 00:00:00 2001 From: TomMonks Date: Fri, 1 Aug 2025 16:30:30 +0100 Subject: [PATCH 04/23] feat(server): added generic sim mcp server code --- mcpsim/server.py | 145 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 145 insertions(+) create mode 100644 mcpsim/server.py diff --git a/mcpsim/server.py b/mcpsim/server.py new file mode 100644 index 0000000..7984f62 --- /dev/null +++ b/mcpsim/server.py @@ -0,0 +1,145 @@ +from fastmcp import FastMCP +from langchain_core.prompts import PromptTemplate +from fastmcp.prompts.prompt import PromptMessage, TextContent + +from mcpsim.protocol import SimulationModelAdapter + +class SimulationMCPServer: + """MCP server that can work with any SimulationModelAdapter implementation.""" + + def __init__(self, simulation_model: SimulationModelAdapter, server_name: str = None): + # validate interface + if not isinstance(simulation_model, SimulationModelAdapter): + missing_methods = [] + for method in ['run_simulation', 'get_parameter_schema', 'get_model_description', + 'validate_parameters']: + if not hasattr(simulation_model, method): + missing_methods.append(method) + if not hasattr(simulation_model, 'model_name'): + missing_methods.append('model_name (property)') + + raise TypeError(f"Object {type(simulation_model)} missing required methods: {missing_methods}") + + self.model = simulation_model + server_name = server_name or f"{simulation_model.model_name} Simulation MCP Server" + self.mcp = FastMCP(server_name) + self._register_tools() + self._register_resources() + self._register_prompts() + + def _register_tools(self): + """Register MCP tools that delegate to the simulation model.""" + + # note investigate how to clean up description + @self.mcp.tool( + name=f"run_{self.model.model_name}_simulation", + description=f""" + Runs a discrete-event {self.model.model_name} simulation with specified parameters, + returning performance metrics. + + Inputs: parameters (dict) — JSON object matching the experiment schema. + Returns: dict with simulation metrics, such as mean wait times and resource utilizations. 
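+
+ Example input (illustrative; omitted keys fall back to the model defaults):
+ {{"n_operators": 14, "mean_iat": 0.57, "run_length": 1000}}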
+ + Tags: ["simulation", "{self.model.model_name}", "experiment"] + """ + ) + def run_simulation(parameters: dict) -> dict: + return self.model.run_simulation(parameters) + + @self.mcp.tool( + name="validate_simulation_parameters", + description=""" + Validate a proposed set of simulation parameters (JSON object) + against the experiment schema. + + Inputs: parameters (dict) + Returns: {"is_valid": bool, "errors": [str, ...]} — status and explanation. + + Tags: ["validation", "parameter_check", "pre_run_check", "schema"] + """ + ) + def validate_parameters(parameters: dict) -> dict: + return self.model.validate_parameters(parameters) + + def _register_resources(self): + """Register MCP resources that delegate to the simulation model.""" + + @self.mcp.resource( + uri="resource://schema/experiment_parameters", + description=""" + Returns the JSON schema defining all allowed input parameters, parameter types, + and value constraints. + + Outputs: dict (JSON schema), sent as a JSON object. + + Tags: ["schema", "parameters", "template"] + """ + ) + def get_schema() -> dict: + return self.model.get_parameter_schema() + + @self.mcp.resource( + uri="resource://model/description", + description=f""" + Provides a natural language description of + the {self.model.model_name} simulation model. + + Outputs: str (text description). + + Tags: ["model", "description", "documentation"] + """ + ) + def get_description() -> str: + return self.model.get_model_description() + + def _register_prompts(self): + """Register MCP prompts for parameter conversion.""" + + @self.mcp.prompt( + name="parameter_jsonification_prompt", + description=""" + INSTRUCTION TO LLM: Convert a user's freeform simulation request + into a JSON object matching a given schema. + + Inputs: + - schema (str): JSON Schema as a string + - user_input (str): User's natural language request + + Returns: PromptMessage (LLM input) guiding the agent to produce valid JSON parameters. + + Tags: ["jsonification", "schema_mapping", "prompt", "parameters"] + """ + ) + def parameter_jsonification_prompt( + schema: str, + user_input: str, + validation_errors: str = "" + ) -> PromptMessage: + with open("resources/parameter_prompt.txt", encoding="utf-8") as f: + prompt_template_text = f.read() + prompt = PromptTemplate.from_template(prompt_template_text) + + # Handle validation error feedback + if validation_errors and validation_errors.strip(): + validation_feedback = ( + "**Validation Feedback:**\n" + "Your last attempt did not pass validation for these reasons:\n" + f"{validation_errors}\n\n" + "Please address the issues above and try again." 
+ ) + else: + validation_feedback = "" + + filled_prompt = prompt.format( + schema=schema, + user_input=user_input, + validation_feedback=validation_feedback + ) + return PromptMessage( + role="user", + content=TextContent(type="text", text=filled_prompt) + ) + + def run(self, **kwargs): + """Start the MCP server.""" + self.mcp.run(**kwargs) \ No newline at end of file From 8039f93e6238f9d043b062f9a53d976407153c1e Mon Sep 17 00:00:00 2001 From: TomMonks Date: Fri, 1 Aug 2025 16:31:35 +0100 Subject: [PATCH 05/23] clean(self_ref): removed commented out models --- agent_self_reflection.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/agent_self_reflection.py b/agent_self_reflection.py index bcb7b50..11c869f 100644 --- a/agent_self_reflection.py +++ b/agent_self_reflection.py @@ -445,20 +445,6 @@ def parse_arguments(): if __name__ == "__main__": - # TM notes: mistral:7b can fail and successed with reflection (or 2nd go) - # qwen2:math:7b is not suitable - # gemma3:27b both variants work well - # deepseek-r1:32b is too slow on 4080 - - #model_name = "gemma3n:e4b" - #model_name = "deepseek-r1:32b" - #model_name = "llama3:latest" - #model_name = "llama3.1:8b" - #model_name = "gemma3:27b" - # model_name = "gemma3:27b-it-qat" - #model_name = "qwen2-math:7b" - #model_name = "mistral:7b" - # Parse command line arguments args = parse_arguments() From 8b69a76336d224a4c2a85412aafa7b321e9a8b10 Mon Sep 17 00:00:00 2001 From: TomMonks Date: Fri, 1 Aug 2025 16:41:24 +0100 Subject: [PATCH 06/23] feat(call_centre): renamed adapter class --- mcpsim/example_models/callcentre/call_centre_mcp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mcpsim/example_models/callcentre/call_centre_mcp.py b/mcpsim/example_models/callcentre/call_centre_mcp.py index 25c0fd8..cbbb9ec 100644 --- a/mcpsim/example_models/callcentre/call_centre_mcp.py +++ b/mcpsim/example_models/callcentre/call_centre_mcp.py @@ -3,7 +3,7 @@ from typing import Dict, Any -class CallCentreSimulation: +class CallCentreSimulationAdapter: """Call centre simulation implementation following the SimulationModelAdapter protocol.""" From 783e5161d77e84170df07a7da2eb4c20727aef34 Mon Sep 17 00:00:00 2001 From: TomMonks Date: Fri, 1 Aug 2025 17:08:39 +0100 Subject: [PATCH 07/23] feat(mcp): setup script to run new framework --- mcpsim/__init__.py | 4 +++- mcpsim/example_models/__init__.py | 3 +++ mcpsim/example_models/callcentre/__init__.py | 2 ++ .../example_models/callcentre/call_centre_mcp.py | 2 +- mcpsim/server.py | 2 +- run_call_centre_server.py | 15 +++++++++++++++ 6 files changed, 25 insertions(+), 3 deletions(-) create mode 100644 run_call_centre_server.py diff --git a/mcpsim/__init__.py b/mcpsim/__init__.py index 66e77de..462e805 100644 --- a/mcpsim/__init__.py +++ b/mcpsim/__init__.py @@ -2,4 +2,6 @@ """ __version__ = "0.1.0" -__author__ = "Tom Monks, Alison Harper" \ No newline at end of file +__author__ = "Tom Monks, Alison Harper" + +from .example_models import CallCentreSimulationAdapter diff --git a/mcpsim/example_models/__init__.py b/mcpsim/example_models/__init__.py index e69de29..f09bbcf 100644 --- a/mcpsim/example_models/__init__.py +++ b/mcpsim/example_models/__init__.py @@ -0,0 +1,3 @@ +# example_models.__init__ +from .callcentre import CallCentreSimulationAdapter + diff --git a/mcpsim/example_models/callcentre/__init__.py b/mcpsim/example_models/callcentre/__init__.py index e69de29..5529771 100644 --- a/mcpsim/example_models/callcentre/__init__.py +++ b/mcpsim/example_models/callcentre/__init__.py @@ 
-0,0 +1,2 @@ +# callcentre.__init__ +from .call_centre_mcp import CallCentreSimulationAdapter \ No newline at end of file diff --git a/mcpsim/example_models/callcentre/call_centre_mcp.py b/mcpsim/example_models/callcentre/call_centre_mcp.py index cbbb9ec..07761c0 100644 --- a/mcpsim/example_models/callcentre/call_centre_mcp.py +++ b/mcpsim/example_models/callcentre/call_centre_mcp.py @@ -13,7 +13,7 @@ def __init__(self, schema_path: str = "resources/schema.json"): @property def model_name(self) -> str: - return "urgent_care_call_centre" + return "call_centre" def run_simulation(self, parameters: Dict[str, Any]) -> Dict[str, Any]: """Run the discrete-event healthcare call centre simulation.""" diff --git a/mcpsim/server.py b/mcpsim/server.py index 7984f62..8f731ce 100644 --- a/mcpsim/server.py +++ b/mcpsim/server.py @@ -43,7 +43,7 @@ def _register_tools(self): Tags: ["simulation", "{self.model.model_name}", "experiment"] """ ) - def run_simulation(parameters: dict) -> dict: + def run_call_centre_simulation(parameters: dict) -> dict: return self.model.run_simulation(parameters) @self.mcp.tool( diff --git a/run_call_centre_server.py b/run_call_centre_server.py new file mode 100644 index 0000000..7330373 --- /dev/null +++ b/run_call_centre_server.py @@ -0,0 +1,15 @@ +from mcpsim.example_models import CallCentreSimulationAdapter +from mcpsim.server import SimulationMCPServer + +def main(): + # Create the simulation model + call_centre_sim = CallCentreSimulationAdapter() + + # Create the MCP server + server = SimulationMCPServer(call_centre_sim) + + # Run the server + server.run(transport="http", host="127.0.0.1", port=8001, path="/mcp") + +if __name__ == "__main__": + main() \ No newline at end of file From 87a17d07c768cc66cb22761d9296cffdf963cc67 Mon Sep 17 00:00:00 2001 From: TomMonks Date: Thu, 7 Aug 2025 15:47:49 +0100 Subject: [PATCH 08/23] fix(planner): memory stores schema under get_schema --- agent_planning_workflow.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/agent_planning_workflow.py b/agent_planning_workflow.py index 05d2e27..2ae1941 100644 --- a/agent_planning_workflow.py +++ b/agent_planning_workflow.py @@ -438,11 +438,16 @@ async def run_plan( prompt = prompt_map[step_name] # Most prompts need schema, user input - schema = memory.get("get_experiment_parameter_schema") + schema = memory.get("get_schema") result = await client.get_prompt( prompt.name, {"schema": schema, "user_input": user_input} ) llm_prompt_text = result.messages[0].content.text + + if debug_mode: + print(f"🐛 {llm_prompt_text}") + + + # Actually run LLM (synchronously!) llm_result = llm.invoke(llm_prompt_text) parameters = json.loads(clean_llm_response(llm_result)) @@ -539,7 +544,7 @@ async def main( logger.debug(f"Response length: {len(response)} characters") logger.debug("Response preview:") logger.debug( - response[:500] + "..." if len(response) > 500 else response + response[:1000] + "..." if len(response) > 1000 else response ) debug_print_plan(plan_steps, logger) @@ -551,7 +556,7 @@ async def main( transient=True, # Removes progress bar after completion ) as progress: task = progress.add_task("executing", total=None) - memory = await run_plan(plan_steps, features, llm, user_input) + memory = await run_plan(plan_steps, features, llm, user_input, debug_mode) progress.remove_task(task) # 4. 
Results From 1f119cca889ff827246a5fb6176d1664b9786197 Mon Sep 17 00:00:00 2001 From: TomMonks Date: Thu, 7 Aug 2025 16:00:05 +0100 Subject: [PATCH 09/23] feat(planner): shorten plan preview to 500 charac --- agent_planning_workflow.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/agent_planning_workflow.py b/agent_planning_workflow.py index 2ae1941..f6ca4b6 100644 --- a/agent_planning_workflow.py +++ b/agent_planning_workflow.py @@ -444,11 +444,7 @@ async def run_plan( ) llm_prompt_text = result.messages[0].content.text - if debug_mode: - print(f"🐛 {llm_prompt_text}") - - - # Actually run LLM (synchronously!) + # run the prompt llm_result = llm.invoke(llm_prompt_text) parameters = json.loads(clean_llm_response(llm_result)) memory["parameters"] = parameters @@ -544,7 +540,7 @@ async def main( logger.debug(f"Response length: {len(response)} characters") logger.debug("Response preview:") logger.debug( - response[:1000] + "..." if len(response) > 1000 else response + response[:500] + "..." if len(response) > 500 else response ) debug_print_plan(plan_steps, logger) From 66bfbd47e8eb3dba005e13eacdccba415288aa47 Mon Sep 17 00:00:00 2001 From: TomMonks Date: Thu, 7 Aug 2025 16:01:54 +0100 Subject: [PATCH 10/23] chore(clean): removed original mcp and model modules --- mcp_server.py | 229 ------------------------------- model.py | 374 -------------------------------------------------- 2 files changed, 603 deletions(-) delete mode 100644 mcp_server.py delete mode 100644 model.py diff --git a/mcp_server.py b/mcp_server.py deleted file mode 100644 index 50d47ce..0000000 --- a/mcp_server.py +++ /dev/null @@ -1,229 +0,0 @@ -""" -Call Centre Simulation MCP Server -================================= - -This module implements the Model Content Protocol (MCP) server for a discrete-event -simulation (DES) model of a healthcare call centre. The server exposes executable -simulation tools, parameter validation utilities, schema/documentation resources, -and prompting interfaces designed for integration with language model (LLM) agents. - -Main Features -------------- -- Run call centre simulations with configurable staffing and demand parameters. -- Validate proposed simulation parameters against a machine-readable JSON schema. -- Provide self-describing resources (schema, natural language model documentation). -- Generate LLM prompts to map user requests onto structured simulation parameter sets. - -Accessible Functionalities --------------------------- -- `run_call_centre_simulation`: Simulate the healthcare call centre and return metrics. -- `validate_simulation_parameters`: Pre-validate parameter sets to prevent runtime errors. -- `get_experiment_parameter_schema`: Obtain the schema for all simulation parameters. -- `get_model_description`: Retrieve a human-oriented description of the simulation model. -- `parameter_jsonification_prompt`: Produce a prompt for LLMs to convert text requests into structured parameters. - -Intended Usage --------------- -This server is intended to be used with agentic or LLM-driven client interfaces, -providing a robust, schema-driven, and discoverable API for advanced simulation reasoning, -configuration generation, and automated experimentation. 
- -""" - -from fastmcp import FastMCP -from langchain_core.prompts import PromptTemplate -from fastmcp.prompts.prompt import PromptMessage, TextContent -from model import run_simulation_from_dict -import json -import re - -mcp = FastMCP("Call Centre Simulation MCP Server") - -@mcp.tool( - name="run_call_centre_simulation", - description=""" -Runs a discrete-event healthcare call centre simulation with specified parameters, returning performance metrics. - -Inputs: parameters (dict) — JSON object matching the experiment schema. -Example: {"operators": 12, "nurses": 5, "call_arrival_rate": 120} - -Returns: dict with simulation metrics, such as mean wait times and resource utilizations. - -Tags: ["simulation", "call_centre", "experiment"] -""") -def run_call_centre_simulation(parameters: dict) -> dict: - """ - Run a discrete-event healthcare call centre simulation. - - Parameters - ---------- - parameters : dict - Simulation configuration as a JSON object matching the experiment parameter schema. - Example: - {"operators": 14, "nurses": 4, "call_arrival_rate": 120} - - Returns - ------- - dict - Simulation results, such as: - { - "mean_wait_time": float, - "operator_utilization": float, - "callback_rate": float, - ... - } - """ - return run_simulation_from_dict(parameters) - -@mcp.resource( - uri="resource://schema/experiment_parameters", - description=""" -Returns the JSON schema defining all allowed input parameters, parameter types, and value constraints. - -Outputs: dict (JSON schema), sent as a JSON object. - -Tags: ["schema", "parameters", "template"] -""") -def get_experiment_parameter_schema() -> dict: - """ - Retrieve the experiment parameter JSON schema. - - Returns - ------- - dict - JSON schema describing all allowable simulation parameters, accepted types, and value constraints. - Example: - { - "operators": {"type": "int", "minimum": 1, "maximum": 50}, - ... - } - """ - with open("resources/schema.json") as f: - return json.load(f) - -@mcp.resource( - uri="resource://model/description", - description=""" -Provides a natural language description of the healthcare call centre simulation model. - -Outputs: str (text description). - -Tags: ["model", "description", "documentation"] -""") -def get_model_description() -> str: - """ - Get a natural language description of the call centre simulation model. - - Returns - ------- - str - Human-readable description of the model's structure, main features, and configurable aspects. - """ - return ( - "This is a discrete-event simulation of a healthcare call centre. " - "Patients call in, interact with operators, and a subset may require a nurse callback. " - "Simulation components: SimPy queues and resources. Tracks wait times, utilization, and callback rates. " - "Configurable parameters: number of operators and nurses, call durations and rates, etc. " - "Sample: 'Run with 14 operators and 5% higher demand.'" - ) - -@mcp.prompt( - name="parameter_jsonification_prompt", - description=""" -INSTRUCTION TO LLM: Convert a user's freeform simulation request into a JSON object matching a given schema. - -Inputs: -- schema (str): JSON Schema as a string -- user_input (str): User's natural language request - -Returns: PromptMessage (LLM input) guiding the agent to produce valid JSON parameters. 
- -Tags: ["jsonification", "schema_mapping", "prompt", "parameters"] -""") -def parameter_jsonification_prompt( - schema: str, - user_input: str, - validation_errors: str = "" -) -> PromptMessage: - with open("resources/parameter_prompt.txt", encoding="utf-8") as f: - prompt_template_text = f.read() - prompt = PromptTemplate.from_template(prompt_template_text) - - # manage validation error str if not "" - if validation_errors and validation_errors.strip(): - validation_feedback = ( - "**Validation Feedback:**\n" - "Your last attempt did not pass validation for these reasons:\n" - f"{validation_errors}\n\n" - "Please address the issues above and try again." - ) - else: - validation_feedback = "" - - filled_prompt = prompt.format( - schema=schema, - user_input=user_input, - validation_feedback=validation_feedback - ) - return PromptMessage( - role="user", - content=TextContent(type="text", text=filled_prompt) - ) - - -@mcp.tool( - name="validate_simulation_parameters", - description=""" -Validate a proposed set of simulation parameters (JSON object) against the experiment schema. - -Inputs: parameters (dict) -Returns: {"is_valid": bool, "errors": [str, ...]} — status and explanation. - -Tags: ["validation", "parameter_check", "pre_run_check", "schema"] -""") -def validate_simulation_parameters(parameters: dict) -> dict: - """ - Validate simulation parameters against the experiment schema. - - Parameters - ---------- - parameters : dict - Proposed parameter set for simulation, as a JSON-compatible dictionary. - - Returns - ------- - dict - { - "is_valid": bool, - "errors": list of str - } - is_valid: True if all parameters are recognized and within allowed ranges. - errors: Descriptive messages for each invalid parameter or failed interparameter constraint. - """ - with open("resources/schema.json") as f: - schema = json.load(f) - errors = [] - for key, value in parameters.items(): - # catch hallucinated parameters - if key not in schema: - errors.append(f"Unknown parameter: {key}") - continue - spec = schema[key] - expected_type = int if spec["type"] == "int" else float - if not isinstance(value, expected_type): - errors.append(f"{key} must be {spec['type']}") - continue - if "minimum" in spec and value < spec["minimum"]: - errors.append(f"{key} below minimum {spec['minimum']}") - if "maximum" in spec and value > spec["maximum"]: - errors.append(f"{key} above maximum {spec['maximum']}") - if all(x in parameters for x in ("call_low", "call_mode", "call_high")): - if not (parameters["call_low"] <= parameters["call_mode"] <= parameters["call_high"]): - errors.append("call_low ≤ call_mode ≤ call_high violated") - if all(x in parameters for x in ("nurse_consult_low", "nurse_consult_high")): - if not (parameters["nurse_consult_low"] <= parameters["nurse_consult_high"]): - errors.append("nurse_consult_low ≤ nurse_consult_high violated") - return {"is_valid": len(errors) == 0, "errors": errors} - -if __name__ == "__main__": - mcp.run(transport="http", host="127.0.0.1", port=8001, path="/mcp") diff --git a/model.py b/model.py deleted file mode 100644 index ede7143..0000000 --- a/model.py +++ /dev/null @@ -1,374 +0,0 @@ - -""" -Enhanced Call Centre Simulation Model with Nurse Callbacks - -A discrete event simulation model of a call centre using SimPy. -Extended to include nurse callbacks for 40% of patients. 
- -Author: Enhanced from Tom Monks' original model -""" - -import numpy as np -import pandas as pd -import simpy -import itertools -import matplotlib.pyplot as plt -import seaborn as sns -import inspect - -# ============================================================================= -# CONSTANTS AND DEFAULT VALUES -# ============================================================================= - -# Default resources -N_OPERATORS = 13 -N_NURSES = 10 - -# Default mean inter-arrival time (exp) -MEAN_IAT = 60 / 100 - -# Default service time parameters (triangular) -CALL_LOW = 5.0 -CALL_MODE = 7.0 -CALL_HIGH = 10.0 - -# Nurse callback parameters -CALLBACK_PROBABILITY = 0.4 # 40% of patients need nurse callback -NURSE_CONSULT_LOW = 10.0 # Uniform distribution 10-20 minutes -NURSE_CONSULT_HIGH = 20.0 - -# Sampling settings -N_STREAMS = 4 # Increased for additional distributions -DEFAULT_RND_SET = 0 - -# Boolean switch to display simulation results as the model runs -TRACE = False - -# Run variables -RESULTS_COLLECTION_PERIOD = 1000 - -# ============================================================================= -# DISTRIBUTION CLASSES -# ============================================================================= - -class Triangular: - """ - Convenience class for the triangular distribution. - Packages up distribution parameters, seed and random generator. - """ - def __init__(self, low, mode, high, random_seed=None): - self.rand = np.random.default_rng(seed=random_seed) - self.low = low - self.high = high - self.mode = mode - - def sample(self, size=None): - return self.rand.triangular(self.low, self.mode, self.high, size=size) - -class Exponential: - """ - Convenience class for the exponential distribution. - Packages up distribution parameters, seed and random generator. - """ - def __init__(self, mean, random_seed=None): - self.rand = np.random.default_rng(seed=random_seed) - self.mean = mean - - def sample(self, size=None): - return self.rand.exponential(self.mean, size=size) - -class Uniform: - """ - Convenience class for the uniform distribution. - Packages up distribution parameters, seed and random generator. - """ - def __init__(self, low, high, random_seed=None): - self.rand = np.random.default_rng(seed=random_seed) - self.low = low - self.high = high - - def sample(self, size=None): - return self.rand.uniform(self.low, self.high, size=size) - -class Bernoulli: - """ - Convenience class for the Bernoulli distribution. - Used for callback decision (40% probability). - """ - def __init__(self, p, random_seed=None): - self.rand = np.random.default_rng(seed=random_seed) - self.p = p - - def sample(self, size=None): - return self.rand.binomial(1, self.p, size=size) - -# ============================================================================= -# EXPERIMENT CLASS -# ============================================================================= - -class Experiment: - """ - Enhanced experiment class with nurse callback functionality. 
- """ - def __init__( - self, - random_number_set=DEFAULT_RND_SET, - n_operators=N_OPERATORS, - n_nurses=N_NURSES, - mean_iat=MEAN_IAT, - call_low=CALL_LOW, - call_mode=CALL_MODE, - call_high=CALL_HIGH, - callback_prob=CALLBACK_PROBABILITY, - nurse_consult_low=NURSE_CONSULT_LOW, - nurse_consult_high=NURSE_CONSULT_HIGH, - n_streams=N_STREAMS, - ): - # sampling - self.random_number_set = random_number_set - self.n_streams = n_streams - - # store parameters for the run of the model - self.n_operators = n_operators - self.n_nurses = n_nurses - self.mean_iat = mean_iat - self.call_low = call_low - self.call_mode = call_mode - self.call_high = call_high - self.callback_prob = callback_prob - self.nurse_consult_low = nurse_consult_low - self.nurse_consult_high = nurse_consult_high - - # resources: initialized after Environment is created - self.operators = None - self.nurses = None - - # initialise results and sampling - self.init_results_variables() - self.init_sampling() - - def set_random_no_set(self, random_number_set): - self.random_number_set = random_number_set - self.init_sampling() - - def init_sampling(self): - """Create the distributions used by the model""" - # produce n non-overlapping streams - seed_sequence = np.random.SeedSequence(self.random_number_set) - self.seeds = seed_sequence.spawn(self.n_streams) - - # create distributions - self.arrival_dist = Exponential(self.mean_iat, random_seed=self.seeds[0]) - self.call_dist = Triangular( - self.call_low, self.call_mode, self.call_high, random_seed=self.seeds[1] - ) - self.callback_dist = Bernoulli(self.callback_prob, random_seed=self.seeds[2]) - self.nurse_dist = Uniform( - self.nurse_consult_low, self.nurse_consult_high, random_seed=self.seeds[3] - ) - - def init_results_variables(self): - """Initialize all experiment variables used in results collection""" - self.results = {} - - # Original results - self.results["waiting_times"] = [] - self.results["total_call_duration"] = 0.0 - - # New nurse results - self.results["nurse_waiting_times"] = [] - self.results["total_nurse_duration"] = 0.0 - self.results["callbacks_requested"] = 0 - self.results["total_patients"] = 0 - -# ============================================================================= -# UTILITY FUNCTIONS -# ============================================================================= - -def trace(msg): - """Turning printing of events on and off.""" - if TRACE: - print(msg) - -# ============================================================================= -# MODEL LOGIC -# ============================================================================= - -def nurse_service(identifier, env, args): - """ - Simulates the nurse callback process - 1. request and wait for a nurse - 2. nurse consultation (uniform) - 3. 
exit system - """ - # record the time that patient entered nurse queue - start_wait = env.now - - # request a nurse - with args.nurses.request() as req: - yield req - - # record the waiting time for nurse callback - waiting_time = env.now - start_wait - args.results["nurse_waiting_times"].append(waiting_time) - - trace(f"nurse started callback for patient {identifier} at {env.now:.3f}") - - # nurse consultation time - consult_duration = args.nurse_dist.sample() - yield env.timeout(consult_duration) - - # update total nurse duration - args.results["total_nurse_duration"] += consult_duration - - trace(f"nurse callback for patient {identifier} ended at {env.now:.3f}; " - f"waiting time was {waiting_time:.3f}") - -def service(identifier, env, args): - """ - Enhanced service process with nurse callback - 1. request and wait for a call operator - 2. phone triage (triangular) - 3. determine if callback needed (40% probability) - 4. if needed, initiate nurse callback process - 5. exit system - """ - # record the time that call entered the queue - start_wait = env.now - - # request an operator - with args.operators.request() as req: - yield req - - # record the waiting time for call to be answered - waiting_time = env.now - start_wait - args.results["waiting_times"].append(waiting_time) - - trace(f"operator answered call {identifier} at {env.now:.3f}") - - # call duration - call_duration = args.call_dist.sample() - yield env.timeout(call_duration) - - # update the total call duration - args.results["total_call_duration"] += call_duration - - trace(f"call {identifier} ended {env.now:.3f}; " - f"waiting time was {waiting_time:.3f}") - - # After operator call, determine if nurse callback is needed - args.results["total_patients"] += 1 - callback_needed = args.callback_dist.sample() - - if callback_needed: - args.results["callbacks_requested"] += 1 - trace(f"patient {identifier} requires nurse callback") - - # Start nurse callback process - env.process(nurse_service(identifier, env, args)) - -def arrivals_generator(env, args): - """Generate patient arrivals with exponential inter-arrival times""" - for caller_count in itertools.count(start=1): - # sample inter-arrival time - inter_arrival_time = args.arrival_dist.sample() - yield env.timeout(inter_arrival_time) - - trace(f"call arrives at: {env.now:.3f}") - - # start service process - env.process(service(caller_count, env, args)) - -# ============================================================================= -# EXPERIMENT EXECUTION FUNCTIONS -# ============================================================================= - -def single_run(experiment, rep=0, rc_period=RESULTS_COLLECTION_PERIOD): - """Perform a single run of the enhanced model""" - run_results = {} - - # reset all result collection variables - experiment.init_results_variables() - - # set random number set - experiment.set_random_no_set(rep) - - # create environment - env = simpy.Environment() - - # create resources - experiment.operators = simpy.Resource(env, capacity=experiment.n_operators) - experiment.nurses = simpy.Resource(env, capacity=experiment.n_nurses) - - # start arrivals process - env.process(arrivals_generator(env, experiment)) - - # run simulation - env.run(until=rc_period) - - # calculate results - run_results["01_mean_waiting_time"] = np.mean(experiment.results["waiting_times"]) - run_results["02_operator_util"] = ( - experiment.results["total_call_duration"] / - (rc_period * experiment.n_operators) - ) * 100.0 - - # Calculate nurse results - if 
experiment.results["nurse_waiting_times"]: - run_results["03_mean_nurse_waiting_time"] = np.mean(experiment.results["nurse_waiting_times"]) - else: - run_results["03_mean_nurse_waiting_time"] = 0.0 - - run_results["04_nurse_util"] = ( - experiment.results["total_nurse_duration"] / - (rc_period * experiment.n_nurses) - ) * 100.0 - - # Additional metrics - run_results["05_callback_rate"] = ( - experiment.results["callbacks_requested"] / - experiment.results["total_patients"] - ) * 100.0 if experiment.results["total_patients"] > 0 else 0.0 - - return run_results - -def multiple_replications(experiment, rc_period=RESULTS_COLLECTION_PERIOD, n_reps=5): - """Perform multiple replications of the enhanced model""" - results = [single_run(experiment, rep, rc_period) for rep in range(n_reps)] - df_results = pd.DataFrame(results) - df_results.index = np.arange(1, len(df_results) + 1) - df_results.index.name = "rep" - return df_results - -def set_trace(trace_on=True): - """Turn tracing on/off globally""" - global TRACE - TRACE = trace_on - - -def run_simulation_from_dict(params: dict): - """ - Wrapper to run the simulation using params supplied in a dictionary. - Only the parameters necessary for the Experiment's constructor are passed; - defaults are used for those not provided. - """ - # Extract constructor signature of the Experiment class - signature = inspect.signature(Experiment.__init__) - - # Build kwargs dict: match keys in 'params' to Experiment.__init__ params - experiment_kwargs = { - key: params.get(key, param.default) - for key, param in signature.parameters.items() - if key != 'self' # exclude 'self' from constructor - } - - # Create Experiment instance with dynamic/default parameters - exp = Experiment(**experiment_kwargs) - - # Separate non-constructor parameters - run_length = params.get("run_length", RESULTS_COLLECTION_PERIOD) - rep_seed = params.get("random_seed", DEFAULT_RND_SET) - - # Run simulation - return single_run(exp, rep=rep_seed, rc_period=run_length) - From 68491f834c7fc068332b33e507ba6ab00848523e Mon Sep 17 00:00:00 2001 From: TomMonks Date: Thu, 7 Aug 2025 16:03:57 +0100 Subject: [PATCH 11/23] chore(clean): removed redun top level resource dir --- resources/parameter_prompt.txt | 45 ------------------- resources/schema.json | 79 ---------------------------------- 2 files changed, 124 deletions(-) delete mode 100644 resources/parameter_prompt.txt delete mode 100644 resources/schema.json diff --git a/resources/parameter_prompt.txt b/resources/parameter_prompt.txt deleted file mode 100644 index bcfe6d5..0000000 --- a/resources/parameter_prompt.txt +++ /dev/null @@ -1,45 +0,0 @@ -You are a discrete-event simulation assistant. -You help turn natural language into structured simulation parameters. 
- -Here is the simulation configuration schema: -{schema} - -User request: {user_input} - -{validation_feedback} - -IMPORTANT: -- Return ONLY the parameter VALUES, not the schema structure -- Use the exact parameter names from the schema -- Output a simple JSON object with key-value pairs -- Do NOT include type information, descriptions, or constraints -- CALCULATE all mathematical expressions and provide ONLY the final numerical result -- Do NOT show calculations like "0.6 * 0.95" - provide the computed value like 0.57 -- All values must be final computed numbers, not mathematical expressions - -Example output format: -{{ - "n_operators": 15, - "n_nurses": 8, - "mean_iat": 0.57, - "random_seed": 42, - "run_length": 1000 -}} - -WRONG format (do not do this): -{{ - "mean_iat": "0.6 * 0.95" -}} - -CORRECT format: -{{ - "mean_iat": 0.57 -}} - - -Before providing the JSON: -1. Mentally calculate any mathematical expressions -2. Convert all calculations to final numerical values -3. Then output only the JSON with computed numbers - -JSON Response: diff --git a/resources/schema.json b/resources/schema.json deleted file mode 100644 index 66d699d..0000000 --- a/resources/schema.json +++ /dev/null @@ -1,79 +0,0 @@ -{ - "n_operators": { - "type": "int", - "default": 13, - "minimum": 1, - "maximum": 100, - "description": "Number of call operators (1-100)" - }, - "n_nurses": { - "type": "int", - "default": 10, - "minimum": 1, - "maximum": 50, - "description": "Number of nurse callbacks (1-50)" - }, - "mean_iat": { - "type": "float", - "default": 0.6, - "minimum": 0.1, - "maximum": 10.0, - "description": "Mean time between arrivals (minutes) or inter-arrival time. 0.1 = 6 calls/min, 10.0 = 1 call/10min. An increase in demand = a reduction in the inter-arrival time. 
-        "description": "Mean time between arrivals (minutes) or inter-arrival time. 0.1 = 10 calls/min, 10.0 = 1 call/10min. An increase in demand = a reduction in the inter-arrival time. A reduction in demand = an increase in inter-arrival time"
-    },
-    "call_low": {
-        "type": "float",
-        "default": 5.0,
-        "minimum": 1.0,
-        "maximum": 30.0,
-        "description": "Low bound for call duration (minutes)"
-    },
-    "call_mode": {
-        "type": "float",
-        "default": 7.0,
-        "minimum": 1.0,
-        "maximum": 30.0,
-        "description": "Mode for triangular call distribution (minutes)"
-    },
-    "call_high": {
-        "type": "float",
-        "default": 10.0,
-        "minimum": 1.0,
-        "maximum": 30.0,
-        "description": "High bound for call duration (minutes)"
-    },
-    "callback_prob": {
-        "type": "float",
-        "default": 0.4,
-        "minimum": 0.0,
-        "maximum": 1.0,
-        "description": "Nurse callback probability (0-1)"
-    },
-    "nurse_consult_low": {
-        "type": "float",
-        "default": 10.0,
-        "minimum": 5.0,
-        "maximum": 60.0,
-        "description": "Low bound for nurse consult duration (minutes)"
-    },
-    "nurse_consult_high": {
-        "type": "float",
-        "default": 20.0,
-        "minimum": 5.0,
-        "maximum": 60.0,
-        "description": "High bound for nurse consult duration (minutes)"
-    },
-    "random_seed": {
-        "type": "int",
-        "default": 0,
-        "minimum": 0,
-        "maximum": 999999,
-        "description": "Random number seed for reproducibility"
-    },
-    "run_length": {
-        "type": "int",
-        "default": 1000,
-        "minimum": 100,
-        "maximum": 86400,
-        "description": "Simulation runtime (minutes)"
-    }
-}

From eeb0dc4f872e9879251b29240b141ca6c2a1062a Mon Sep 17 00:00:00 2001
From: TomMonks
Date: Thu, 7 Aug 2025 16:06:49 +0100
Subject: [PATCH 12/23] docs(readme): updated run mcp instructions

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index f28e731..543cfb0 100644
--- a/README.md
+++ b/README.md
@@ -127,7 +127,7 @@ This project was developed and tested on the following system configuration:
 1. **Start the MCP server**
 
    ```bash
-   python mcp_server.py
+   python run_call_centre_server.py
    ```
 
    Server will be available at `http://localhost:8001/mcp`

From ef412b049bbfbedc7558a342ea5550108952effd Mon Sep 17 00:00:00 2001
From: TomMonks
Date: Mon, 11 Aug 2025 15:38:24 +0100
Subject: [PATCH 13/23] docs(env): +arize-phoenix

---
 environment.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/environment.yml b/environment.yml
index 74973e4..bb0c330 100644
--- a/environment.yml
+++ b/environment.yml
@@ -2,6 +2,7 @@ name: des-agent
 channels:
   - conda-forge
 dependencies:
+  - arize-phoenix
   - fastmcp
   - pandas
  - python=3.11

From 6c43aaea0de5e53bc3b0e29ca2fd3d04e131ead4 Mon Sep 17 00:00:00 2001
From: TomMonks
Date: Mon, 11 Aug 2025 17:01:32 +0100
Subject: [PATCH 14/23] feat(evals): added basic langgraph tracing with arize-phoenix

---
 agent_self_reflection_evals.py        | 471 ++++++++++++++++++
 environment.yml                       |   3 +-
 .../callcentre/call_centre_mcp.py     |   8 +-
 .../resources/parameter_prompt.txt    |   0
 mcpsim/server.py                      |  14 +-
 mcpsim/tracing.py                     |  25 +
 6 files changed, 518 insertions(+), 3 deletions(-)
 create mode 100644 agent_self_reflection_evals.py
 rename mcpsim/{example_models/callcentre => }/resources/parameter_prompt.txt (100%)
 create mode 100644 mcpsim/tracing.py

diff --git a/agent_self_reflection_evals.py b/agent_self_reflection_evals.py
new file mode 100644
index 0000000..5d783f8
--- /dev/null
+++ b/agent_self_reflection_evals.py
@@ -0,0 +1,471 @@
+"""
+Agent Self-Reflection Simulation Parameter Generator
+
+*****************
+WITH EVALS in arize-phoenix
+*****************
+
+This module implements a simple agent that generates and validates simulation
+parameters using natural language input and self-reflection capabilities. The agent
+employs a graph-based workflow to iteratively refine parameters until they meet
+validation requirements.
+
+Overview
+--------
+The agent uses a state machine architecture built with LangGraph to orchestrate
+the parameter generation process. When validation fails, the agent reflects on
+the errors and attempts to correct them through multiple retry cycles.
+
+Core Workflow
+-------------
+1. Schema Retrieval: Fetch parameter schema from MCP server
+2. Parameter Generation: Use LLM to convert natural language to JSON parameters
+3. Validation: Validate parameters against schema constraints
+4. Self-Reflection: On validation failure, analyze errors and retry
+5. Simulation Execution: Run simulation with validated parameters
+6. Results Display: Present formatted results and parameter summaries
+
+Usage
+-----
+Run from command line with optional model specification:
+
+    $ python agent_self_reflection_evals.py --llm gemma3:27b
+    $ python agent_self_reflection_evals.py --llm mistral:7b
+
+The agent will prompt for a natural language simulation description and
+automatically handle parameter generation, validation, and execution.
+
+Hard constraints
+----------------
+MAX_RETRIES : int
+    Maximum number of parameter generation attempts before bailout (default: 4)
+
+Examples
+--------
+Natural language inputs the agent can process:
+- "Simulate 14 operators, 12 nurses and 5% extra demand"
+- "Run scenario with high staffing and normal call volume"
+- "Test configuration with minimal staff"
+
+Notes
+-----
+Requires a running MCP server on localhost:8001 and an Ollama server on localhost:11434.
+Different LLM models show varying performance - gemma3:27b and mistral:7b are
+recommended for reliable parameter generation.
+
+"""
+
+import asyncio, json, re
+from typing import Dict, Any, Optional, TypedDict
+
+from fastmcp import Client
+from langchain_ollama import OllamaLLM
+from langchain_core.prompts import PromptTemplate
+from langgraph.graph import StateGraph, END
+from rich.prompt import Prompt
+from rich.progress import Progress, SpinnerColumn, TextColumn
+from rich.console import Console
+from rich.markdown import Markdown
+
+from functools import partial
+
+import pandas as pd
+
+import argparse
+
+import phoenix as px
+from phoenix.otel import register
+
+import os
+
+from mcpsim.tracing import init_tracing
+
+console = Console()
+
+# maximum tries at jsonifying the parameter list...
+MAX_RETRIES = 4
+
+PARAMETER_TABLE_TEMPLATE = """
+Given the following JSON object representing parameters updated in a simulation model:
+
+{json_object}
+
+Task:
+Convert this JSON object into a Markdown table with two columns: **Parameter** and **Value**.
+Each key should map to its value on a separate row.
+Display only the table in clean Markdown format.
+Do NOT include any text before the table such as 'Here is the output:'
+Please add a graph emoji to the parameter column header.
+
+Output
+
+**Parameters used in simulation**
+
+[markdown table]
+"""
+
+
+class AgentState(TypedDict):
+    user_input: str
+    schema: dict
+    parameters: dict
+    formatted_parameters: str
+    validation: Optional[dict]
+    validation_history: list
+    simulation_result: Optional[dict]
+    retry_count: int
+
+
+# --------------------------- Helper funcs ---------------------------------------- #
+def clean_llm_response(response: Optional[str]) -> str:
+    """
+    Clean LLM response to extract JSON from markdown code blocks.
+
+    Large Language Models often wrap JSON responses in markdown code blocks.
+    This function removes markdown formatting and extracts the JSON content.
+
+    Parameters
+    ----------
+    response : Optional[str]
+        Raw LLM response that may contain markdown formatting
+
+    Returns
+    -------
+    str
+        Cleaned JSON string ready for parsing
+
+    Examples
+    --------
+    >>> clean_llm_response("```{'key':'value'}```")
+    "{'key':'value'}"
+
+    >>> clean_llm_response("```json{'key':'value'}```")
+    "{'key':'value'}"
+
+    >>> clean_llm_response("Some text {'key':'value'} more text")
+    "{'key':'value'}"
+
+    >>> clean_llm_response("")
+    '{}'
+    """
+    if not response:
+        return "{}"
+
+    # Remove markdown code block markers
+    response = response.strip()
+
+    # Remove ```json and ```
+    if response.startswith("```json"):
+        response = response[7:]  # Remove ```json
+    if response.startswith("```"):
+        response = response[3:]  # Remove ```
+    if response.endswith("```"):
+        response = response[:-3]  # Remove closing ```
+
+    # Find JSON object boundaries
+    response = response.strip()
+    start_idx = response.find("{")
+    end_idx = response.rfind("}")
+
+    if start_idx != -1 and end_idx != -1:
+        return response[start_idx : end_idx + 1]
+
+    return response
+
+
+
+
+# --------------------------- graph nodes ------------------------------------ #
+async def fetch_schema(state: Dict[str, Any]) -> Dict[str, Any]:
+    async with Client("http://localhost:8001/mcp") as cl:
+        # Resource URI exactly as exposed by the server
+        res = await cl.read_resource("resource://schema/experiment_parameters")
+        state["schema"] = res[0].text if hasattr(res[0], "text") else res[0]
+    return state
+
+
+async def generate_parameters(state: Dict[str, Any], llm: OllamaLLM) -> Dict[str, Any]:
+    async with Client("http://localhost:8001/mcp") as cl:
+
+        # prompt parameters
+        prompt_vars = {
+            "schema": state["schema"],
+            "user_input": state["user_input"]
+        }
+        # include validation errors if present in state memory
+        validation_errors = state.get("validation", {}).get("errors")
+        if validation_errors:
+            prompt_vars["validation_errors"] = "\n".join(f"- {e}" for e in validation_errors)
+        else:
+            prompt_vars["validation_errors"] = ""  # always supply a string
+
+        # Ask MCP for the ready-made prompt that tells an LLM how to jsonify
+        prompt_resp = await cl.get_prompt("parameter_jsonification_prompt", prompt_vars)
+
+        prompt_text = prompt_resp.messages[0].content.text
+
+        progress_text = "[bold green]🧠 Reasoning about simulation parameters."
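+        # NOTE: on a retry, the previous attempt's validation errors are already
+        # embedded in the prompt via prompt_vars["validation_errors"], giving the
+        # LLM the feedback it needs to self-correct.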
+        if state["retry_count"] > 0:
+            progress_text += f" [RETRY {state['retry_count']}]"
+        with Progress(
+            SpinnerColumn(),
+            TextColumn(progress_text),
+            transient=True,
+        ) as progress:
+            task = progress.add_task("summarising", total=None)
+            llm_out = llm.invoke(prompt_text)
+            progress.remove_task(task)
+
+        cleaned_response = clean_llm_response(llm_out)
+
+        try:
+            state["parameters"] = json.loads(cleaned_response)
+        except json.JSONDecodeError as e:
+            # Force validation failure by setting invalid parameters
+            # and create a mock validation response for self-reflection
+            state["parameters"] = {"__json_parse_error__": True}
+            console.print(f"[yellow]⚠️ JSON parsing failed ({e}) - will retry with feedback")
+        except Exception as e:
+            # Handle other unexpected errors similarly
+            state["parameters"] = {"__unexpected_error__": True}
+            console.print(f"[yellow]⚠️ Unexpected error ({e}) - will retry")
+
+    return state
+
+
+async def validate_parameters(state: Dict[str, Any]) -> Dict[str, Any]:
+    async with Client("http://localhost:8001/mcp") as cl:
+        resp = await cl.call_tool(
+            "validate_simulation_parameters",
+            {"parameters": state["parameters"]},
+        )
+
+        # Add current validation to history before overwriting
+        if "validation_history" not in state:
+            state["validation_history"] = []
+
+        # Store the validation attempt with context
+        state["validation_history"].append({
+            "attempt": state.get("retry_count", 0) + 1,
+            "parameters": state["parameters"].copy(),
+            "validation_result": resp.data.copy()
+        })
+
+        state["validation"] = resp.data
+    return state
+
+def validation_branch(state: Dict[str, Any]) -> str:
+    return "valid" if state["validation"]["is_valid"] else "invalid"
+
+def retry_branch(state: AgentState) -> str:
+    """Routing node: bail out if max retries exceeded, otherwise jsonify."""
+    if state.get("retry_count", 0) >= MAX_RETRIES:
+        return "bail_out"
+    return "jsonify"
+
+def increment_retry(state: AgentState) -> AgentState:
+    """Increment the number of attempts at jsonifying parameters"""
+    state["retry_count"] = state.get("retry_count", 0) + 1
+    report_parameter_reflection_failure(state, MAX_RETRIES)
+    return state
+
+def bail_out_node(state: AgentState) -> AgentState:
+    """Bail-out node used when the max attempts at jsonifying parameters is exceeded"""
+    state["error"] = "Maximum retries exceeded during parameter reflection."
+    return state
+
+async def run_simulation(state: Dict[str, Any]) -> Dict[str, Any]:
+    async with Client("http://localhost:8001/mcp") as cl:
+        resp = await cl.call_tool(
+            "run_call_centre_simulation",
+            {"parameters": state["parameters"]},
+        )
+        state["simulation_result"] = resp.data
+    return state
+
+
+async def summarise_parameters(state: Dict[str, Any], llm: OllamaLLM) -> Dict[str, Any]:
+    """Generates a formatted markdown table of parameters from JSON.
+    Could do this programmatically, but just for fun we will use an LLM"""
+    progress_text = "[bold blue]✏️ Summarising parameters used..."
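+    # Show a transient spinner while the LLM converts the parameter dict into
+    # a Markdown table using PARAMETER_TABLE_TEMPLATE defined above.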
+    with Progress(
+        SpinnerColumn(),
+        TextColumn(progress_text),
+        transient=True,
+    ) as progress:
+        task = progress.add_task("summarising_params", total=None)
+        prompt = PromptTemplate.from_template(PARAMETER_TABLE_TEMPLATE)
+        param_display_prompt = prompt.format(json_object=state["parameters"])
+        llm_result = llm.invoke(param_display_prompt)
+        progress.remove_task(task)
+
+    state["formatted_parameters"] = llm_result
+    return state
+
+# --------------------------- build the graph -------------------------------- #
+def build_graph(llm: OllamaLLM) -> StateGraph:
+    graph = StateGraph(AgentState)  # Using the TypedDict from above
+
+    # 1. create nodes
+    # NB: partial functions are used here to pass extra parameters into a coroutine
+    graph.add_node("get_schema", fetch_schema)
+    graph.add_node("jsonify", partial(generate_parameters, llm=llm))
+    graph.add_node("validate", validate_parameters)
+    graph.add_node("run_sim", run_simulation)
+    graph.add_node("format_params", partial(summarise_parameters, llm=llm))
+    graph.add_node("increment_retry", increment_retry)
+    graph.add_node("bail_out", bail_out_node)
+
+    # 2. create entry point and edges
+    graph.set_entry_point("get_schema")
+    graph.add_edge("get_schema", "jsonify")
+    graph.add_edge("jsonify", "validate")
+    graph.add_conditional_edges(
+        "validate",
+        validation_branch,
+        {"valid": "format_params", "invalid": "increment_retry"}
+    )
+
+    # handles a limited number of retries. links to bail_out (and END)
+    # if retries exceed a hard limit.
+    graph.add_conditional_edges(
+        "increment_retry",
+        retry_branch,
+        {"jsonify": "jsonify", "bail_out": "bail_out"}
+    )
+
+    graph.add_edge("format_params", "run_sim")
+    graph.add_edge("run_sim", END)
+    graph.add_edge("bail_out", END)
+
+    return graph.compile()
+
+def display_results_table(state: AgentState):
+    """
+    Display simulation results to the user
+    """
+    console.print(Markdown("✅ **Simulation complete.**"))
+    df = pd.DataFrame(
+        list(state["simulation_result"].items()), columns=["KPIs", "Values"]
+    )
+    console.print(Markdown(df.round(2).to_markdown(index=False)))
+
+
+def display_param_summary_table(state: AgentState):
+    """
+    Display a summary table of parameters for human-in-loop validation
+    """
+    console.print(Markdown(state["formatted_parameters"]))
+
+def report_parameter_reflection_failure(state: dict, max_retries: int):
+    """
+    Display a clear error message for parameter reflection failure
+    """
+    retry_count = state.get("retry_count", 0)
+    console.print(
+        f"[bold red]❌ Parameter validation failed after {retry_count} attempt{'s' if retry_count!=1 else ''}."
+    )
+    if "validation" in state and "errors" in state["validation"]:
+        errors = state["validation"]["errors"]
+        if errors:
+            console.print("[red]Last validation errors were:")
+            for err in errors:
+                console.print(f"[red]- {err}")
+        else:
+            console.print("[red]No specific validation errors were provided by the server.")
+    else:
+        console.print("[red]No validation error details are available.")
+    console.print(f"[yellow]The agent was unable to generate valid simulation parameters in {retry_count} tries (limit: {max_retries}).")
+    console.print("[yellow]Try rephrasing your request or ensure parameter values are within allowed ranges. Refer to the simulation parameter schema for guidance.")
+
+
+def display_validation_history(state: AgentState):
+    """Display the history of validation attempts"""
+    history = state.get("validation_history", [])
+
+    if not history:
+        return
+
+    console.print(Markdown("🔍 **Parameter Generation Issues**"))
+
+    for entry in history:
+        attempt_num = entry["attempt"]
+        is_valid = entry["validation_result"]["is_valid"]
+        errors = entry["validation_result"]["errors"]
+
+        status = "✅ Success" if is_valid else "❌ Failed"
+        console.print(f"\n**Attempt {attempt_num}:** {status}")
+
+        if errors:
+            console.print("Errors encountered:")
+            for error in errors:
+                console.print(f"{error}")
+
+
+
+async def main(model_name: str) -> None:
+
+    # 0. setup eval server
+    tracer = init_tracing()
+
+    # 1. Setup the graph and LLM
+    llm = OllamaLLM(model=model_name, base_url="http://localhost:11434")
+    compiled_graph = build_graph(llm)
+
+    # 2. Prompt user
+    user_request = Prompt.ask(
+        "Simulation request",
+        default="Simulate 14 operators, 12 nurses and 5% extra demand"
+    )
+
+    # 3. Invoke graph
+    final_state = await compiled_graph.ainvoke({
+        "user_input": user_request,
+        "retry_count": 0,
+        "validation_history": []
+    })
+
+    # 4. Report results
+    console.rule("[bold green]RESULTS")
+    if "simulation_result" in final_state:
+        display_param_summary_table(final_state)
+        display_results_table(final_state)
+
+    retry_count = final_state.get("retry_count", 0)
+    if retry_count > 0:
+        display_validation_history(final_state)
+
+def parse_arguments():
+    """Parse command line arguments for model selection."""
+    parser = argparse.ArgumentParser(
+        description="Simulation Agent Workflow - Experiment with a simulation model using natural language",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+        Examples:
+            python agent_self_reflection_evals.py -l gemma3:27b
+            python agent_self_reflection_evals.py -l mistral:7b
+        """,
+    )
+
+    parser.add_argument(
+        "-l",
+        "--llm",
+        type=str,
+        default="gemma3:27b",
+        help="Model to use for generating parameters (default: gemma3:27b)",
+    )
+
+    return parser.parse_args()
+
+
+
+
+if __name__ == "__main__":
+
+    # Parse command line arguments
+    args = parse_arguments()
+
+    asyncio.run(main(model_name=args.llm))
+
+
+
diff --git a/environment.yml b/environment.yml
index bb0c330..e974ae9 100644
--- a/environment.yml
+++ b/environment.yml
@@ -2,7 +2,6 @@ name: des-agent
 channels:
   - conda-forge
 dependencies:
-  - arize-phoenix
   - fastmcp
   - pandas
   - python=3.11
@@ -15,7 +14,9 @@ dependencies:
   - tabulate
   - langgraph
   - pip:
+    - arize-phoenix-otel
     - langchain
     - langchain-community
     - langchain-core
     - langchain-ollama
+    - openinference-instrumentation-langchain
diff --git a/mcpsim/example_models/callcentre/call_centre_mcp.py b/mcpsim/example_models/callcentre/call_centre_mcp.py
index 07761c0..158f10f 100644
--- a/mcpsim/example_models/callcentre/call_centre_mcp.py
+++ b/mcpsim/example_models/callcentre/call_centre_mcp.py
@@ -1,13 +1,19 @@
 import json
-from model import run_simulation_from_dict
+from .call_centre_model import run_simulation_from_dict
 from typing import Dict, Any
+import os
 
 
 class CallCentreSimulationAdapter:
     """Call centre simulation implementation following
     the SimulationModelAdapter protocol."""
 
     def __init__(self, schema_path: str = "resources/schema.json"):
+
+        # handle path to schema file
+        dir_path = os.path.dirname(os.path.realpath(__file__))
+        schema_path = os.path.join(dir_path, schema_path)
+
         self.schema_path = schema_path
         self._schema = None
 
diff --git a/mcpsim/example_models/callcentre/resources/parameter_prompt.txt b/mcpsim/resources/parameter_prompt.txt
similarity index 100%
rename from mcpsim/example_models/callcentre/resources/parameter_prompt.txt
rename to mcpsim/resources/parameter_prompt.txt
diff --git a/mcpsim/server.py b/mcpsim/server.py
index 8f731ce..88133ea 100644
--- a/mcpsim/server.py
+++ b/mcpsim/server.py
@@ -1,9 +1,16 @@
+import os
+
 from fastmcp import FastMCP
 from langchain_core.prompts import PromptTemplate
 from fastmcp.prompts.prompt import PromptMessage, TextContent
 
 from mcpsim.protocol import SimulationModelAdapter
 
+PARAMETER_PROMPT_PATH = "resources/parameter_prompt.txt"
+
+from mcpsim.tracing import init_tracing  # tracing helper module
+tracer_provider = init_tracing(project_name="sim-agent-mcp-server")
+
 class SimulationMCPServer:
     """MCP server that can work with any SimulationModelAdapter implementation."""
@@ -115,7 +122,12 @@ def parameter_jsonification_prompt(
         user_input: str,
         validation_errors: str = ""
     ) -> PromptMessage:
-        with open("resources/parameter_prompt.txt", encoding="utf-8") as f:
+
+        # handle path to prompt file
+        dir_path = os.path.dirname(os.path.realpath(__file__))
+        parameter_prompt_path = os.path.join(dir_path, PARAMETER_PROMPT_PATH)
+
+        with open(parameter_prompt_path, encoding="utf-8") as f:
             prompt_template_text = f.read()
 
         prompt = PromptTemplate.from_template(prompt_template_text)
diff --git a/mcpsim/tracing.py b/mcpsim/tracing.py
new file mode 100644
index 0000000..940b82f
--- /dev/null
+++ b/mcpsim/tracing.py
@@ -0,0 +1,25 @@
+"""
+Utility module to support tracing of the agent and tools using arize-phoenix
+"""
+
+import os
+from phoenix.otel import register
+
+def init_tracing(project_name: str = "sim-agent", endpoint: str = "http://localhost:6006"):
+    """
+    Initialise tracing
+
+    Parameters
+    ----------
+    project_name: str, optional (default="sim-agent")
+        Name of project
+
+    endpoint: str, optional (default = "http://localhost:6006")
+        URL of the Phoenix collector endpoint.
+
+    Returns
+    -------
+    TracerProvider
+        Configured tracer provider registered with Phoenix.
+
+    """
+    os.environ["PHOENIX_COLLECTOR_ENDPOINT"] = endpoint
+    return register(project_name=project_name, auto_instrument=True)

From 01762187e9929ac15310c361be3bfe8fca819d6c Mon Sep 17 00:00:00 2001
From: TomMonks
Date: Tue, 12 Aug 2025 11:05:19 +0100
Subject: [PATCH 15/23] feat(evals): draft tracing

---
 agent_self_reflection_evals.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/agent_self_reflection_evals.py b/agent_self_reflection_evals.py
index 5d783f8..480f848 100644
--- a/agent_self_reflection_evals.py
+++ b/agent_self_reflection_evals.py
@@ -76,6 +76,10 @@
 import phoenix as px
 from phoenix.otel import register
 
+# used to annotate functions when tracing
+from opentelemetry import trace
+from opentelemetry.trace import Status, StatusCode
+
 import os
 
 from mcpsim.tracing import init_tracing
@@ -234,6 +238,9 @@ async def generate_parameters(state: Dict[str, Any], llm: OllamaLLM) -> Dict[str
 async def validate_parameters(state: Dict[str, Any]) -> Dict[str, Any]:
+
+
+
     async with Client("http://localhost:8001/mcp") as cl:
         resp = await cl.call_tool(
             "validate_simulation_parameters",
             {"parameters": state["parameters"]},
         )
@@ -251,7 +258,9 @@ async def validate_parameters(state: Dict[str, Any]) -> Dict[str, Any]:
             "validation_result": resp.data.copy()
         })
 
-        state["validation"] = resp.data
+        result = resp.data
+        state["validation"] = result
+
     return state
 
 def validation_branch(state: Dict[str, Any]) -> str:
@@ -292,7 +295,6 @@ async def run_simulation(state: Dict[str, Any]) -> Dict[str, Any]:
         state["simulation_result"] = resp.data
         return state
 
-
 async def summarise_parameters(state: Dict[str, Any], llm: OllamaLLM) -> Dict[str, Any]:
     """Generates a formatted markdown table of parameters from JSON.
Could do this programatically, but just for fun we will use a LLM""" @@ -415,7 +417,8 @@ def display_validation_history(state: AgentState): async def main(model_name: str) -> None: # 0. setup eval server - tracer = init_tracing() + # tracer_provider = init_tracing(project_name="sim-agent-evaluation") + # tracer = tracer_provider.get_tracer("langgraph-agent-tracer") # 1. Setup the graph and LLM llm = OllamaLLM(model=model_name, base_url="http://localhost:11434") diff --git a/create_evals.py b/create_evals.py new file mode 100644 index 0000000..e469a4e --- /dev/null +++ b/create_evals.py @@ -0,0 +1,22 @@ +""" +Script to add expected simulation results to the evals dataset +single rep at this stage. +""" +import json +from mcpsim.example_models.callcentre.call_centre_model import run_simulation_from_dict + +# Load test cases from evals_input.json +with open("evals/evals_input.json", "r") as f: + scenarios = json.load(f) + +# Run each scenario once and attach results +for key, obj in scenarios.items(): + params = obj["parameters"] + results = run_simulation_from_dict(params) + obj["expected_results"] = results + +# Save updated scenarios with results to evals.json +with open("evals/evals.json", "w") as f: + json.dump(scenarios, f, indent=2) + +print("Simulation results saved to evals.json") diff --git a/evals/evals.json b/evals/evals.json new file mode 100644 index 0000000..35995bb --- /dev/null +++ b/evals/evals.json @@ -0,0 +1,232 @@ +{ + "example_1": { + "user_input": "Simulate 14 operators, 12 nurses and 5% extra demand", + "parameters": { + "n_operators": 14, + "n_nurses": 12, + "mean_iat": 0.57, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.4, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, + "random_seed": 0, + "run_length": 1000 + }, + "expected_results": { + "01_mean_waiting_time": 2.3504694345055652, + "02_operator_util": 90.86219986414451, + "03_mean_nurse_waiting_time": 4.1106430489118715, + "04_nurse_util": 90.36357093206236, + "05_callback_rate": 42.5531914893617 + } + }, + "example_2": { + "user_input": "Run scenario with double staffing and normal call volume", + "parameters": { + "n_operators": 26, + "n_nurses": 20, + "mean_iat": 0.6, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.4, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, + "random_seed": 0, + "run_length": 1000 + }, + "expected_results": { + "01_mean_waiting_time": 0.0, + "02_operator_util": 46.452122409679156, + "03_mean_nurse_waiting_time": 0.0041045663539504606, + "04_nurse_util": 51.20346053052576, + "05_callback_rate": 42.12121212121212 + } + }, + "example_3": { + "user_input": "Test configuration with 8 operators, 6 nurses, and 20% increased demand", + "parameters": { + "n_operators": 8, + "n_nurses": 6, + "mean_iat": 0.48, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.4, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, + "random_seed": 0, + "run_length": 1000 + }, + "expected_results": { + "01_mean_waiting_time": 242.69198606467467, + "02_operator_util": 99.35481108027054, + "03_mean_nurse_waiting_time": 78.94431797274639, + "04_nurse_util": 97.8390040780754, + "05_callback_rate": 42.48847926267281 + } + }, + "example_4": { + "user_input": "Simulate with 20 operators, 15 nurses, and half the usual call volume", + "parameters": { + "n_operators": 20, + "n_nurses": 15, + "mean_iat": 1.2, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.4, + 
"nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, + "random_seed": 0, + "run_length": 1000 + }, + "expected_results": { + "01_mean_waiting_time": 0.0, + "02_operator_util": 31.21492694742085, + "03_mean_nurse_waiting_time": 0.0, + "04_nurse_util": 35.17900361818962, + "05_callback_rate": 42.17289719626168 + } + }, + "example_5": { + "user_input": "Run with 10 operators, 8 nurses, 30% callback probability, and 10% reduced demand", + "parameters": { + "n_operators": 10, + "n_nurses": 8, + "mean_iat": 0.66, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.3, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, + "random_seed": 0, + "run_length": 1000 + }, + "expected_results": { + "01_mean_waiting_time": 62.26336943615235, + "02_operator_util": 98.91827940297605, + "03_mean_nurse_waiting_time": 2.3551925550777337, + "04_nurse_util": 82.46774067208518, + "05_callback_rate": 33.012583271650634 + } + }, + "example_6": { + "user_input": "Test scenario: 25 operators, 25 nurses, triple demand", + "parameters": { + "n_operators": 25, + "n_nurses": 25, + "mean_iat": 0.2, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.4, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, + "random_seed": 0, + "run_length": 1000 + }, + "expected_results": { + "01_mean_waiting_time": 161.26826119772568, + "02_operator_util": 99.27336565709264, + "03_mean_nurse_waiting_time": 0.4505143843086236, + "04_nurse_util": 83.59706413413956, + "05_callback_rate": 41.69621749408983 + } + }, + "example_7": { + "user_input": "Simulate 15 operators, 12 nurses, call durations 3-6-9 minutes, 2000 minute runtime", + "parameters": { + "n_operators": 15, + "n_nurses": 12, + "mean_iat": 0.6, + "call_low": 3.0, + "call_mode": 6.0, + "call_high": 9.0, + "callback_prob": 0.4, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, + "random_seed": 0, + "run_length": 2000 + }, + "expected_results": { + "01_mean_waiting_time": 0.07192106569784922, + "02_operator_util": 66.74240386374329, + "03_mean_nurse_waiting_time": 2.663802537251699, + "04_nurse_util": 86.43094276820304, + "05_callback_rate": 41.684147437818396 + } + }, + "example_8": { + "user_input": "Run with 6 operators, 4 nurses, 60% callback rate, and 25% higher demand", + "parameters": { + "n_operators": 6, + "n_nurses": 4, + "mean_iat": 0.45, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.6, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, + "random_seed": 0, + "run_length": 1000 + }, + "expected_results": { + "01_mean_waiting_time": 318.98668257451686, + "02_operator_util": 99.56949574531609, + "03_mean_nurse_waiting_time": 196.98753442211358, + "04_nurse_util": 97.69465771729217, + "05_callback_rate": 57.87545787545788 + } + }, + "example_9": { + "user_input": "Test configuration: 18 operators, 14 nurses, nurse consultations 15-25 minutes, normal demand", + "parameters": { + "n_operators": 18, + "n_nurses": 14, + "mean_iat": 0.6, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.4, + "nurse_consult_low": 15.0, + "nurse_consult_high": 25.0, + "random_seed": 0, + "run_length": 1000 + }, + "expected_results": { + "01_mean_waiting_time": 0.08980920202587155, + "02_operator_util": 67.09751014731434, + "03_mean_nurse_waiting_time": 19.85524137214805, + "04_nurse_util": 97.15170780690659, + "05_callback_rate": 42.12121212121212 + } + }, + "example_10": { + "user_input": "Simulate 12 operators, 10 nurses, 50% callback 
probability, and 40% demand reduction", + "parameters": { + "n_operators": 12, + "n_nurses": 10, + "mean_iat": 0.84, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.5, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, + "random_seed": 0, + "run_length": 1000 + }, + "expected_results": { + "01_mean_waiting_time": 0.38280107383630124, + "02_operator_util": 73.44752978233386, + "03_mean_nurse_waiting_time": 4.147154100012408, + "04_nurse_util": 90.70926116406883, + "05_callback_rate": 51.4546965918537 + } + } +} \ No newline at end of file diff --git a/evals/evals_input.json b/evals/evals_input.json new file mode 100644 index 0000000..51fe6c5 --- /dev/null +++ b/evals/evals_input.json @@ -0,0 +1,171 @@ +{ + "example_1": { + "user_input": "Simulate 14 operators, 12 nurses and 5% extra demand", + "parameters": { + "n_operators": 14, + "n_nurses": 12, + "mean_iat": 0.57, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.4, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, + "random_seed": 0, + "run_length": 1000 + } + }, + + "example_2": { + "user_input": "Run scenario with double staffing and normal call volume", + "parameters": { + "n_operators": 26, + "n_nurses": 20, + "mean_iat": 0.6, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.4, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, + "random_seed": 0, + "run_length": 1000 + } + }, + + "example_3": { + "user_input": "Test configuration with 8 operators, 6 nurses, and 20% increased demand", + "parameters": { + "n_operators": 8, + "n_nurses": 6, + "mean_iat": 0.48, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.4, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, + "random_seed": 0, + "run_length": 1000 + } + }, + + "example_4": { + "user_input": "Simulate with 20 operators, 15 nurses, and half the usual call volume", + "parameters": { + "n_operators": 20, + "n_nurses": 15, + "mean_iat": 1.2, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.4, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, + "random_seed": 0, + "run_length": 1000 + } + }, + + "example_5": { + "user_input": "Run with 10 operators, 8 nurses, 30% callback probability, and 10% reduced demand", + "parameters": { + "n_operators": 10, + "n_nurses": 8, + "mean_iat": 0.66, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.3, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, + "random_seed": 0, + "run_length": 1000 + } + }, + + "example_6": { + "user_input": "Test scenario: 25 operators, 25 nurses, triple demand", + "parameters": { + "n_operators": 25, + "n_nurses": 25, + "mean_iat": 0.2, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.4, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, + "random_seed": 0, + "run_length": 1000 + } + }, + + "example_7": { + "user_input": "Simulate 15 operators, 12 nurses, call durations 3-6-9 minutes, 2000 minute runtime", + "parameters": { + "n_operators": 15, + "n_nurses": 12, + "mean_iat": 0.6, + "call_low": 3.0, + "call_mode": 6.0, + "call_high": 9.0, + "callback_prob": 0.4, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, + "random_seed": 0, + "run_length": 2000 + } + }, + + "example_8": { + "user_input": "Run with 6 operators, 4 nurses, 60% callback rate, and 25% higher demand", + "parameters": { + "n_operators": 6, + "n_nurses": 4, + "mean_iat": 
0.45, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.6, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, + "random_seed": 0, + "run_length": 1000 + } + }, + + "example_9": { + "user_input": "Test configuration: 18 operators, 14 nurses, nurse consultations 15-25 minutes, normal demand", + "parameters": { + "n_operators": 18, + "n_nurses": 14, + "mean_iat": 0.6, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.4, + "nurse_consult_low": 15.0, + "nurse_consult_high": 25.0, + "random_seed": 0, + "run_length": 1000 + } + }, + + "example_10": { + "user_input": "Simulate 12 operators, 10 nurses, 50% callback probability, and 40% demand reduction", + "parameters": { + "n_operators": 12, + "n_nurses": 10, + "mean_iat": 0.84, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.5, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, + "random_seed": 0, + "run_length": 1000 + } + } +} diff --git a/mcpsim/server.py b/mcpsim/server.py index 88133ea..ee0f1f9 100644 --- a/mcpsim/server.py +++ b/mcpsim/server.py @@ -9,7 +9,9 @@ PARAMETER_PROMPT_PATH = "resources/parameter_prompt.txt" from mcpsim.tracing import init_tracing # your helper module from previous discussion -tracer_provider = init_tracing(project_name="sim-agent-mcp-server") + +tracer_provider = init_tracing(project_name="sim-agent-evaluation") +tracer = tracer_provider.get_tracer("mcp-server-tracer") class SimulationMCPServer: """MCP server that can work with any SimulationModelAdapter implementation.""" @@ -38,6 +40,7 @@ def _register_tools(self): """Register MCP tools that delegate to the simulation model.""" # note investigate how to clean up description + @tracer.tool(name="MCP.run_call_centre_simulation") @self.mcp.tool( name=f"run_{self.model.model_name}_simulation", description=f""" @@ -53,6 +56,7 @@ def _register_tools(self): def run_call_centre_simulation(parameters: dict) -> dict: return self.model.run_simulation(parameters) + @tracer.tool(name="MCP.validate_simulation_parameters") @self.mcp.tool( name="validate_simulation_parameters", description=""" From aa70d8699145cfe7b1939df2451bf41c0801fb0f Mon Sep 17 00:00:00 2001 From: TomMonks Date: Tue, 12 Aug 2025 12:07:00 +0100 Subject: [PATCH 17/23] feat(evals): added basic eval script --- evals/evals_output.json | 1121 +++++++++++++++++++++++++++++++++++++++ run_evals.py | 157 ++++++ 2 files changed, 1278 insertions(+) create mode 100644 evals/evals_output.json create mode 100644 run_evals.py diff --git a/evals/evals_output.json b/evals/evals_output.json new file mode 100644 index 0000000..9453f85 --- /dev/null +++ b/evals/evals_output.json @@ -0,0 +1,1121 @@ +{ + "example_1": { + "user_input": "Simulate 14 operators, 12 nurses and 5% extra demand", + "parameters": { + "n_operators": 14, + "n_nurses": 12, + "mean_iat": 0.57, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.4, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, + "random_seed": 0, + "run_length": 1000 + }, + "expected_results": { + "01_mean_waiting_time": 2.3504694345055652, + "02_operator_util": 90.86219986414451, + "03_mean_nurse_waiting_time": 4.1106430489118715, + "04_nurse_util": 90.36357093206236, + "05_callback_rate": 42.5531914893617 + }, + "agent_run": { + "simulation_result": { + "01_mean_waiting_time": 2.9621655675237495, + "02_operator_util": 91.36685915350135, + "03_mean_nurse_waiting_time": 2.958020460174451, + "04_nurse_util": 84.3596921204659, + 
"05_callback_rate": 38.860103626943 + }, + "meta": { + "retry_count": 0, + "had_bailout": false, + "validation_history": [ + { + "attempt": 1, + "parameters": { + "n_operators": 14, + "n_nurses": 12, + "mean_iat": 0.57, + "random_seed": 42, + "run_length": 1000 + }, + "validation_result": { + "is_valid": true, + "errors": [] + } + } + ], + "parameters": { + "n_operators": 14, + "n_nurses": 12, + "mean_iat": 0.57, + "random_seed": 42, + "run_length": 1000 + }, + "formatted_parameters": " ```\n**Parameters used in simulation**\n\n| Parameter | Value |\n|-----------|-------|\n| \ud83d\udcca n_operators | 14 |\n| \ud83d\udcca n_nurses | 12 |\n| \ud83d\udcca mean_iat | 0.57 |\n| \ud83d\udcca random_seed| 42 |\n| \ud83d\udcca run_length | 1000 |\n```" + } + }, + "comparison": { + "pass": false, + "details": { + "01_mean_waiting_time": { + "expected": 2.3504694345055652, + "got": 2.9621655675237495, + "match": false + }, + "02_operator_util": { + "expected": 90.86219986414451, + "got": 91.36685915350135, + "match": false + }, + "03_mean_nurse_waiting_time": { + "expected": 4.1106430489118715, + "got": 2.958020460174451, + "match": false + }, + "04_nurse_util": { + "expected": 90.36357093206236, + "got": 84.3596921204659, + "match": false + }, + "05_callback_rate": { + "expected": 42.5531914893617, + "got": 38.860103626943, + "match": false + } + }, + "rtol": 0.001, + "atol": 1e-06 + }, + "passed": false + }, + "example_2": { + "user_input": "Run scenario with double staffing and normal call volume", + "parameters": { + "n_operators": 26, + "n_nurses": 20, + "mean_iat": 0.6, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.4, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, + "random_seed": 0, + "run_length": 1000 + }, + "expected_results": { + "01_mean_waiting_time": 0.0, + "02_operator_util": 46.452122409679156, + "03_mean_nurse_waiting_time": 0.0041045663539504606, + "04_nurse_util": 51.20346053052576, + "05_callback_rate": 42.12121212121212 + }, + "agent_run": { + "simulation_result": { + "01_mean_waiting_time": 0.24486927533243114, + "02_operator_util": 83.1290576750028, + "03_mean_nurse_waiting_time": 212.0429575856493, + "04_nurse_util": 97.94215299366186, + "05_callback_rate": 39.5769361992494 + }, + "meta": { + "retry_count": 0, + "had_bailout": false, + "validation_history": [ + { + "attempt": 1, + "parameters": { + "n_operators": 26, + "n_nurses": 10, + "mean_iat": 0.335, + "random_seed": 42, + "run_length": 1000 + }, + "validation_result": { + "is_valid": true, + "errors": [] + } + } + ], + "parameters": { + "n_operators": 26, + "n_nurses": 10, + "mean_iat": 0.335, + "random_seed": 42, + "run_length": 1000 + }, + "formatted_parameters": " ```markdown\n\ud83d\udcca **Parameters** | **Value**\n-----------------|--------\nn_operators | 26\nn_nurses | 10\nmean_iat | 0.335\nrandom_seed | 42\nrun_length | 1000\n```" + } + }, + "comparison": { + "pass": false, + "details": { + "01_mean_waiting_time": { + "expected": 0.0, + "got": 0.24486927533243114, + "match": false + }, + "02_operator_util": { + "expected": 46.452122409679156, + "got": 83.1290576750028, + "match": false + }, + "03_mean_nurse_waiting_time": { + "expected": 0.0041045663539504606, + "got": 212.0429575856493, + "match": false + }, + "04_nurse_util": { + "expected": 51.20346053052576, + "got": 97.94215299366186, + "match": false + }, + "05_callback_rate": { + "expected": 42.12121212121212, + "got": 39.5769361992494, + "match": false + } + }, + "rtol": 0.001, + "atol": 1e-06 + }, + 
"passed": false + }, + "example_3": { + "user_input": "Test configuration with 8 operators, 6 nurses, and 20% increased demand", + "parameters": { + "n_operators": 8, + "n_nurses": 6, + "mean_iat": 0.48, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.4, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, + "random_seed": 0, + "run_length": 1000 + }, + "expected_results": { + "01_mean_waiting_time": 242.69198606467467, + "02_operator_util": 99.35481108027054, + "03_mean_nurse_waiting_time": 78.94431797274639, + "04_nurse_util": 97.8390040780754, + "05_callback_rate": 42.48847926267281 + }, + "agent_run": { + "simulation_result": { + "01_mean_waiting_time": 258.690360654302, + "02_operator_util": 99.44580381509923, + "03_mean_nurse_waiting_time": 78.94106695388635, + "04_nurse_util": 97.8390040780754, + "05_callback_rate": 42.5414364640884 + }, + "meta": { + "retry_count": 3, + "had_bailout": false, + "validation_history": [ + { + "attempt": 1, + "parameters": { + "__json_parse_error__": true + }, + "validation_result": { + "is_valid": false, + "errors": [ + "Unknown parameter: __json_parse_error__" + ] + } + }, + { + "attempt": 2, + "parameters": { + "__json_parse_error__": true + }, + "validation_result": { + "is_valid": false, + "errors": [ + "Unknown parameter: __json_parse_error__" + ] + } + }, + { + "attempt": 3, + "parameters": { + "__json_parse_error__": true + }, + "validation_result": { + "is_valid": false, + "errors": [ + "Unknown parameter: __json_parse_error__" + ] + } + }, + { + "attempt": 4, + "parameters": { + "n_operators": 8, + "n_nurses": 6, + "mean_iat": 0.45, + "random_seed": 0, + "run_length": 1000 + }, + "validation_result": { + "is_valid": true, + "errors": [] + } + } + ], + "parameters": { + "n_operators": 8, + "n_nurses": 6, + "mean_iat": 0.45, + "random_seed": 0, + "run_length": 1000 + }, + "formatted_parameters": " ```\n**Parameters used in simulation**\n\n| Parameter | Value |\n|-----------|-------|\n| :bar: n_operators | 8 |\n| :bar: n_nurses | 6 |\n| :bar: mean_iat | 0.45 |\n| :bar: random_seed | 0 |\n| :bar: run_length | 1000 |\n```" + } + }, + "comparison": { + "pass": false, + "details": { + "01_mean_waiting_time": { + "expected": 242.69198606467467, + "got": 258.690360654302, + "match": false + }, + "02_operator_util": { + "expected": 99.35481108027054, + "got": 99.44580381509923, + "match": true + }, + "03_mean_nurse_waiting_time": { + "expected": 78.94431797274639, + "got": 78.94106695388635, + "match": true + }, + "04_nurse_util": { + "expected": 97.8390040780754, + "got": 97.8390040780754, + "match": true + }, + "05_callback_rate": { + "expected": 42.48847926267281, + "got": 42.5414364640884, + "match": false + } + }, + "rtol": 0.001, + "atol": 1e-06 + }, + "passed": false + }, + "example_4": { + "user_input": "Simulate with 20 operators, 15 nurses, and half the usual call volume", + "parameters": { + "n_operators": 20, + "n_nurses": 15, + "mean_iat": 1.2, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.4, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, + "random_seed": 0, + "run_length": 1000 + }, + "expected_results": { + "01_mean_waiting_time": 0.0, + "02_operator_util": 31.21492694742085, + "03_mean_nurse_waiting_time": 0.0, + "04_nurse_util": 35.17900361818962, + "05_callback_rate": 42.17289719626168 + }, + "agent_run": { + "simulation_result": { + "01_mean_waiting_time": 0.0, + "02_operator_util": 27.417403035068194, + "03_mean_nurse_waiting_time": 0.0, + 
"04_nurse_util": 28.0192637065108, + "05_callback_rate": 38.22341857335128 + }, + "meta": { + "retry_count": 0, + "had_bailout": false, + "validation_history": [ + { + "attempt": 1, + "parameters": { + "n_operators": 20, + "n_nurses": 15, + "mean_iat": 1.3, + "random_seed": 42, + "run_length": 1000 + }, + "validation_result": { + "is_valid": true, + "errors": [] + } + } + ], + "parameters": { + "n_operators": 20, + "n_nurses": 15, + "mean_iat": 1.3, + "random_seed": 42, + "run_length": 1000 + }, + "formatted_parameters": " ```\n\ud83d\udcca **Parameters used in simulation**\n| Parameter | Value |\n|-----------|-------|\n| n_operators | 20 |\n| n_nurses | 15 |\n| mean_iat | 1.3 |\n| random_seed | 42 |\n| run_length | 1000 |\n```" + } + }, + "comparison": { + "pass": false, + "details": { + "01_mean_waiting_time": { + "expected": 0.0, + "got": 0.0, + "match": true + }, + "02_operator_util": { + "expected": 31.21492694742085, + "got": 27.417403035068194, + "match": false + }, + "03_mean_nurse_waiting_time": { + "expected": 0.0, + "got": 0.0, + "match": true + }, + "04_nurse_util": { + "expected": 35.17900361818962, + "got": 28.0192637065108, + "match": false + }, + "05_callback_rate": { + "expected": 42.17289719626168, + "got": 38.22341857335128, + "match": false + } + }, + "rtol": 0.001, + "atol": 1e-06 + }, + "passed": false + }, + "example_5": { + "user_input": "Run with 10 operators, 8 nurses, 30% callback probability, and 10% reduced demand", + "parameters": { + "n_operators": 10, + "n_nurses": 8, + "mean_iat": 0.66, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.3, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, + "random_seed": 0, + "run_length": 1000 + }, + "expected_results": { + "01_mean_waiting_time": 62.26336943615235, + "02_operator_util": 98.91827940297605, + "03_mean_nurse_waiting_time": 2.3551925550777337, + "04_nurse_util": 82.46774067208518, + "05_callback_rate": 33.012583271650634 + }, + "agent_run": { + "simulation_result": { + "01_mean_waiting_time": 0.6478945342567207, + "02_operator_util": 76.87777817159127, + "03_mean_nurse_waiting_time": 0.30988264770159224, + "04_nurse_util": 65.36044859488209, + "05_callback_rate": 29.18287937743191 + }, + "meta": { + "retry_count": 3, + "had_bailout": false, + "validation_history": [ + { + "attempt": 1, + "parameters": { + "__json_parse_error__": true + }, + "validation_result": { + "is_valid": false, + "errors": [ + "Unknown parameter: __json_parse_error__" + ] + } + }, + { + "attempt": 2, + "parameters": { + "__json_parse_error__": true + }, + "validation_result": { + "is_valid": false, + "errors": [ + "Unknown parameter: __json_parse_error__" + ] + } + }, + { + "attempt": 3, + "parameters": { + "n_operators": 10, + "n_nurses": 8, + "mean_iat": 0.74, + "callback_prob": 0.3, + "call_mode": 4.2, + "call_high": 9.0, + "call_low": 5.1, + "nurse_consult_low": 10.0, + "nurse_consult_high": 18.0, + "random_seed": 42, + "run_length": 600 + }, + "validation_result": { + "is_valid": false, + "errors": [ + "call_low \u2264 call_mode \u2264 call_high violated" + ] + } + }, + { + "attempt": 4, + "parameters": { + "n_operators": 9, + "n_nurses": 8, + "mean_iat": 0.78, + "call_low": 4.2, + "call_mode": 5.64, + "call_high": 6.3, + "callback_prob": 0.3, + "nurse_consult_low": 10.0, + "nurse_consult_high": 18.0, + "random_seed": 42, + "run_length": 1000 + }, + "validation_result": { + "is_valid": true, + "errors": [] + } + } + ], + "parameters": { + "n_operators": 9, + "n_nurses": 8, + "mean_iat": 0.78, 
+ "call_low": 4.2, + "call_mode": 5.64, + "call_high": 6.3, + "callback_prob": 0.3, + "nurse_consult_low": 10.0, + "nurse_consult_high": 18.0, + "random_seed": 42, + "run_length": 1000 + }, + "formatted_parameters": " **Parameters used in simulation**\n\n| \ud83d\udcca Parameter | Value |\n|-------------|---|\n| n_operators | 9 |\n| n_nurses | 8 |\n| mean_iat | 0.78 |\n| call_low | 4.2 |\n| call_mode | 5.64 |\n| call_high | 6.3 |\n| callback_prob | 0.3 |\n| nurse_consult_low | 10.0 |\n| nurse_consult_high | 18.0 |\n| random_seed | 42 |\n| run_length | 1000 |" + } + }, + "comparison": { + "pass": false, + "details": { + "01_mean_waiting_time": { + "expected": 62.26336943615235, + "got": 0.6478945342567207, + "match": false + }, + "02_operator_util": { + "expected": 98.91827940297605, + "got": 76.87777817159127, + "match": false + }, + "03_mean_nurse_waiting_time": { + "expected": 2.3551925550777337, + "got": 0.30988264770159224, + "match": false + }, + "04_nurse_util": { + "expected": 82.46774067208518, + "got": 65.36044859488209, + "match": false + }, + "05_callback_rate": { + "expected": 33.012583271650634, + "got": 29.18287937743191, + "match": false + } + }, + "rtol": 0.001, + "atol": 1e-06 + }, + "passed": false + }, + "example_6": { + "user_input": "Test scenario: 25 operators, 25 nurses, triple demand", + "parameters": { + "n_operators": 25, + "n_nurses": 25, + "mean_iat": 0.2, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.4, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, + "random_seed": 0, + "run_length": 1000 + }, + "expected_results": { + "01_mean_waiting_time": 161.26826119772568, + "02_operator_util": 99.27336565709264, + "03_mean_nurse_waiting_time": 0.4505143843086236, + "04_nurse_util": 83.59706413413956, + "05_callback_rate": 41.69621749408983 + }, + "agent_run": { + "simulation_result": { + "01_mean_waiting_time": 0.5518432002497017, + "02_operator_util": 89.32672670938162, + "03_mean_nurse_waiting_time": 0.12567485568024855, + "04_nurse_util": 74.88123258139942, + "05_callback_rate": 41.477832512315274 + }, + "meta": { + "retry_count": 0, + "had_bailout": false, + "validation_history": [ + { + "attempt": 1, + "parameters": { + "n_operators": 25, + "n_nurses": 25, + "mean_iat": 0.33, + "random_seed": 0, + "run_length": 1000 + }, + "validation_result": { + "is_valid": true, + "errors": [] + } + } + ], + "parameters": { + "n_operators": 25, + "n_nurses": 25, + "mean_iat": 0.33, + "random_seed": 0, + "run_length": 1000 + }, + "formatted_parameters": " ```\n\ud83d\udcca Parameters used in simulation\n| Parameter | Value |\n|-----------|-------|\n| n_operators | 25 |\n| n_nurses | 25 |\n| mean_iat | 0.33 |\n| random_seed| 0 |\n| run_length | 1000 |\n```" + } + }, + "comparison": { + "pass": false, + "details": { + "01_mean_waiting_time": { + "expected": 161.26826119772568, + "got": 0.5518432002497017, + "match": false + }, + "02_operator_util": { + "expected": 99.27336565709264, + "got": 89.32672670938162, + "match": false + }, + "03_mean_nurse_waiting_time": { + "expected": 0.4505143843086236, + "got": 0.12567485568024855, + "match": false + }, + "04_nurse_util": { + "expected": 83.59706413413956, + "got": 74.88123258139942, + "match": false + }, + "05_callback_rate": { + "expected": 41.69621749408983, + "got": 41.477832512315274, + "match": false + } + }, + "rtol": 0.001, + "atol": 1e-06 + }, + "passed": false + }, + "example_7": { + "user_input": "Simulate 15 operators, 12 nurses, call durations 3-6-9 minutes, 2000 minute 
runtime", + "parameters": { + "n_operators": 15, + "n_nurses": 12, + "mean_iat": 0.6, + "call_low": 3.0, + "call_mode": 6.0, + "call_high": 9.0, + "callback_prob": 0.4, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, + "random_seed": 0, + "run_length": 2000 + }, + "expected_results": { + "01_mean_waiting_time": 0.07192106569784922, + "02_operator_util": 66.74240386374329, + "03_mean_nurse_waiting_time": 2.663802537251699, + "04_nurse_util": 86.43094276820304, + "05_callback_rate": 41.684147437818396 + }, + "agent_run": { + "simulation_result": null, + "meta": { + "retry_count": 4, + "had_bailout": false, + "validation_history": [ + { + "attempt": 1, + "parameters": { + "__json_parse_error__": true + }, + "validation_result": { + "is_valid": false, + "errors": [ + "Unknown parameter: __json_parse_error__" + ] + } + }, + { + "attempt": 2, + "parameters": { + "n_operators": 15, + "n_nurses": 12, + "mean_iat": 0.67, + "call_low": 3, + "call_mode": 6, + "call_high": 9, + "callback_prob": 0.4, + "nurse_consult_low": 10, + "nurse_consult_high": 20, + "random_seed": 0, + "run_length": 2000 + }, + "validation_result": { + "is_valid": false, + "errors": [ + "call_low must be float", + "call_mode must be float", + "call_high must be float", + "nurse_consult_low must be float", + "nurse_consult_high must be float" + ] + } + }, + { + "attempt": 3, + "parameters": { + "__json_parse_error__": true + }, + "validation_result": { + "is_valid": false, + "errors": [ + "Unknown parameter: __json_parse_error__" + ] + } + }, + { + "attempt": 4, + "parameters": { + "__json_parse_error__": true + }, + "validation_result": { + "is_valid": false, + "errors": [ + "Unknown parameter: __json_parse_error__" + ] + } + } + ], + "parameters": { + "__json_parse_error__": true + }, + "formatted_parameters": null + } + }, + "comparison": { + "note": "Either expected_results or agent simulation_result missing; comparison skipped." 
+ }, + "passed": false + }, + "example_8": { + "user_input": "Run with 6 operators, 4 nurses, 60% callback rate, and 25% higher demand", + "parameters": { + "n_operators": 6, + "n_nurses": 4, + "mean_iat": 0.45, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.6, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, + "random_seed": 0, + "run_length": 1000 + }, + "expected_results": { + "01_mean_waiting_time": 318.98668257451686, + "02_operator_util": 99.56949574531609, + "03_mean_nurse_waiting_time": 196.98753442211358, + "04_nurse_util": 97.69465771729217, + "05_callback_rate": 57.87545787545788 + }, + "agent_run": { + "simulation_result": { + "01_mean_waiting_time": 330.661658162098, + "02_operator_util": 99.47452901988686, + "03_mean_nurse_waiting_time": 223.8173355488663, + "04_nurse_util": 98.39302830500621, + "05_callback_rate": 61.76470588235294 + }, + "meta": { + "retry_count": 0, + "had_bailout": false, + "validation_history": [ + { + "attempt": 1, + "parameters": { + "n_operators": 6, + "n_nurses": 4, + "mean_iat": 0.42, + "call_mode": 7.8, + "callback_prob": 0.6, + "random_seed": 42, + "run_length": 1000 + }, + "validation_result": { + "is_valid": true, + "errors": [] + } + } + ], + "parameters": { + "n_operators": 6, + "n_nurses": 4, + "mean_iat": 0.42, + "call_mode": 7.8, + "callback_prob": 0.6, + "random_seed": 42, + "run_length": 1000 + }, + "formatted_parameters": " ```\n\ud83d\udcca Parameters used in simulation\n------------------------------\n| Parameter | Value |\n| --------------- | ------- |\n| n_operators | 6 |\n| n_nurses | 4 |\n| mean_iat | 0.42 |\n| call_mode | 7.8 |\n| callback_prob | 0.6 |\n| random_seed | 42 |\n| run_length | 1000 |\n```" + } + }, + "comparison": { + "pass": false, + "details": { + "01_mean_waiting_time": { + "expected": 318.98668257451686, + "got": 330.661658162098, + "match": false + }, + "02_operator_util": { + "expected": 99.56949574531609, + "got": 99.47452901988686, + "match": true + }, + "03_mean_nurse_waiting_time": { + "expected": 196.98753442211358, + "got": 223.8173355488663, + "match": false + }, + "04_nurse_util": { + "expected": 97.69465771729217, + "got": 98.39302830500621, + "match": false + }, + "05_callback_rate": { + "expected": 57.87545787545788, + "got": 61.76470588235294, + "match": false + } + }, + "rtol": 0.001, + "atol": 1e-06 + }, + "passed": false + }, + "example_9": { + "user_input": "Test configuration: 18 operators, 14 nurses, nurse consultations 15-25 minutes, normal demand", + "parameters": { + "n_operators": 18, + "n_nurses": 14, + "mean_iat": 0.6, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.4, + "nurse_consult_low": 15.0, + "nurse_consult_high": 25.0, + "random_seed": 0, + "run_length": 1000 + }, + "expected_results": { + "01_mean_waiting_time": 0.08980920202587155, + "02_operator_util": 67.09751014731434, + "03_mean_nurse_waiting_time": 19.85524137214805, + "04_nurse_util": 97.15170780690659, + "05_callback_rate": 42.12121212121212 + }, + "agent_run": { + "simulation_result": { + "01_mean_waiting_time": 0.08980920202587155, + "02_operator_util": 67.09751014731434, + "03_mean_nurse_waiting_time": 19.85524137214805, + "04_nurse_util": 97.15170780690659, + "05_callback_rate": 42.12121212121212 + }, + "meta": { + "retry_count": 1, + "had_bailout": false, + "validation_history": [ + { + "attempt": 1, + "parameters": { + "__json_parse_error__": true + }, + "validation_result": { + "is_valid": false, + "errors": [ + "Unknown parameter: 
__json_parse_error__" + ] + } + }, + { + "attempt": 2, + "parameters": { + "n_operators": 18, + "n_nurses": 14, + "mean_iat": 0.6, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.4, + "nurse_consult_low": 15.0, + "nurse_consult_high": 25.0, + "random_seed": 0, + "run_length": 1000 + }, + "validation_result": { + "is_valid": true, + "errors": [] + } + } + ], + "parameters": { + "n_operators": 18, + "n_nurses": 14, + "mean_iat": 0.6, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.4, + "nurse_consult_low": 15.0, + "nurse_consult_high": 25.0, + "random_seed": 0, + "run_length": 1000 + }, + "formatted_parameters": " ```md\n\ud83d\udcca **Parameters used in simulation**\n| Parameter | Value |\n| --- | --- |\n| n_operators | 18 |\n| n_nurses | 14 |\n| mean_iat | 0.6 |\n| call_low | 5.0 |\n| call_mode | 7.0 |\n| call_high | 10.0 |\n| callback_prob | 0.4 |\n| nurse_consult_low | 15.0 |\n| nurse_consult_high | 25.0 |\n| random_seed | 0 |\n| run_length | 1000 |\n```" + } + }, + "comparison": { + "pass": true, + "details": { + "01_mean_waiting_time": { + "expected": 0.08980920202587155, + "got": 0.08980920202587155, + "match": true + }, + "02_operator_util": { + "expected": 67.09751014731434, + "got": 67.09751014731434, + "match": true + }, + "03_mean_nurse_waiting_time": { + "expected": 19.85524137214805, + "got": 19.85524137214805, + "match": true + }, + "04_nurse_util": { + "expected": 97.15170780690659, + "got": 97.15170780690659, + "match": true + }, + "05_callback_rate": { + "expected": 42.12121212121212, + "got": 42.12121212121212, + "match": true + } + }, + "rtol": 0.001, + "atol": 1e-06 + }, + "passed": true + }, + "example_10": { + "user_input": "Simulate 12 operators, 10 nurses, 50% callback probability, and 40% demand reduction", + "parameters": { + "n_operators": 12, + "n_nurses": 10, + "mean_iat": 0.84, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.5, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, + "random_seed": 0, + "run_length": 1000 + }, + "expected_results": { + "01_mean_waiting_time": 0.38280107383630124, + "02_operator_util": 73.44752978233386, + "03_mean_nurse_waiting_time": 4.147154100012408, + "04_nurse_util": 90.70926116406883, + "05_callback_rate": 51.4546965918537 + }, + "agent_run": { + "simulation_result": { + "01_mean_waiting_time": 0.0, + "02_operator_util": 23.33484123800376, + "03_mean_nurse_waiting_time": 8.70119617931026, + "04_nurse_util": 87.09750711871568, + "05_callback_rate": 47.963800904977376 + }, + "meta": { + "retry_count": 2, + "had_bailout": false, + "validation_history": [ + { + "attempt": 1, + "parameters": { + "n_operators": 12, + "n_nurses": 10, + "callback_prob": 0.5, + "mean_iat": 0.9, + "call_mode": 3.6, + "call_low": 4.8, + "call_high": 7.2, + "nurse_consult_low": 16, + "nurse_consult_high": 24, + "random_seed": 0, + "run_length": 1000 + }, + "validation_result": { + "is_valid": false, + "errors": [ + "nurse_consult_low must be float", + "nurse_consult_high must be float", + "call_low \u2264 call_mode \u2264 call_high violated" + ] + } + }, + { + "attempt": 2, + "parameters": { + "n_operators": 12, + "n_nurses": 10, + "mean_iat": 0.48, + "callback_prob": 0.5, + "call_low": 6.0, + "call_mode": 7.2, + "call_high": 9.6, + "nurse_consult_low": 10.0, + "nurse_consult_high": 8.0, + "random_seed": 42, + "run_length": 1000 + }, + "validation_result": { + "is_valid": false, + "errors": [ + "nurse_consult_low \u2264 nurse_consult_high violated" 
+ ] + } + }, + { + "attempt": 3, + "parameters": { + "n_operators": 12, + "n_nurses": 10, + "mean_iat": 1.48, + "callback_prob": 0.5, + "call_mode": 3.6, + "call_low": 3.0, + "call_high": 6.0, + "nurse_consult_low": 24.0, + "nurse_consult_high": 32.0, + "random_seed": 42, + "run_length": 1000 + }, + "validation_result": { + "is_valid": true, + "errors": [] + } + } + ], + "parameters": { + "n_operators": 12, + "n_nurses": 10, + "mean_iat": 1.48, + "callback_prob": 0.5, + "call_mode": 3.6, + "call_low": 3.0, + "call_high": 6.0, + "nurse_consult_low": 24.0, + "nurse_consult_high": 32.0, + "random_seed": 42, + "run_length": 1000 + }, + "formatted_parameters": " **Parameters used in simulation**\n\n| :heavy_check_mark: Parameter | Value |\n|-----------------------------|-------|\n| n\\_operators | 12 |\n| n\\_nurses | 10 |\n| mean\\_iat | 1.48 |\n| callback\\_prob | 0.5 |\n| call\\_mode | 3.6 |\n| call\\_low | 3.0 |\n| call\\_high | 6.0 |\n| nurse\\_consult\\_low | 24.0 |\n| nurse\\_consult\\_high | 32.0 |\n| random\\_seed | 42 |\n| run\\_length | 1000 |" + } + }, + "comparison": { + "pass": false, + "details": { + "01_mean_waiting_time": { + "expected": 0.38280107383630124, + "got": 0.0, + "match": false + }, + "02_operator_util": { + "expected": 73.44752978233386, + "got": 23.33484123800376, + "match": false + }, + "03_mean_nurse_waiting_time": { + "expected": 4.147154100012408, + "got": 8.70119617931026, + "match": false + }, + "04_nurse_util": { + "expected": 90.70926116406883, + "got": 87.09750711871568, + "match": false + }, + "05_callback_rate": { + "expected": 51.4546965918537, + "got": 47.963800904977376, + "match": false + } + }, + "rtol": 0.001, + "atol": 1e-06 + }, + "passed": false + } +} \ No newline at end of file diff --git a/run_evals.py b/run_evals.py new file mode 100644 index 0000000..361c137 --- /dev/null +++ b/run_evals.py @@ -0,0 +1,157 @@ +import asyncio +import json +import math +from typing import Dict, Any, Tuple, Optional + +# Import the agent builder and its dependencies from your agent module +# Assumes agent_self_reflection.py exposes `build_graph`, `AgentState`, and a factory for the LLM. +from agent_self_reflection import build_graph, AgentState +from langchain_ollama import OllamaLLM + + +def floats_close(a: Any, b: Any, rtol: float = 1e-3, atol: float = 1e-6) -> bool: + """ + Compare numbers with tolerance, pass-through for non-numeric equality. + Returns True if both are NaN, or within tolerances for floats, or exactly equal otherwise. + """ + # Handle None + if a is None or b is None: + return a is None and b is None + + # Try numeric compare + try: + fa = float(a) + fb = float(b) + # Handle NaNs + if math.isnan(fa) and math.isnan(fb): + return True + return math.isclose(fa, fb, rel_tol=rtol, abs_tol=atol) + except (TypeError, ValueError): + # Non-numeric: exact equality + return a == b + + +def compare_results( + got: Dict[str, Any], + expected: Dict[str, Any], + rtol: float = 1e-3, + atol: float = 1e-6, +) -> Dict[str, Any]: + """ + Compare simulation outputs to expected_results with numeric tolerance. + Returns a dict with per-key comparison, diffs, and overall pass flag. 
+    """
+    keys = sorted(set(got.keys()) | set(expected.keys()))
+    per_key = {}
+    all_pass = True
+
+    for k in keys:
+        g = got.get(k, None)
+        e = expected.get(k, None)
+        ok = floats_close(g, e, rtol=rtol, atol=atol)
+        if not ok:
+            all_pass = False
+        per_key[k] = {
+            "expected": e,
+            "got": g,
+            "match": ok,
+        }
+
+    return {
+        "pass": all_pass,
+        "details": per_key,
+        "rtol": rtol,
+        "atol": atol,
+    }
+
+
+async def run_agent_once(
+    compiled_graph,
+    user_input: str,
+    llm: OllamaLLM,
+    max_retries: int = 4,
+) -> Dict[str, Any]:
+    """
+    Runs the agent end-to-end for a single natural-language user_input.
+    Returns the final state, including simulation_result or error.
+    """
+    state_in: AgentState = {
+        "user_input": user_input,
+        "retry_count": 0,
+        "validation_history": [],
+    }
+    final_state: AgentState = await compiled_graph.ainvoke(state_in)
+    return final_state
+
+
+def extract_agent_simulation_result(final_state: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]], Dict[str, Any]]:
+    """
+    Extract the simulation_result if present and return a compact agent_run summary
+    useful for evaluation logging.
+    """
+    sim_res = final_state.get("simulation_result")
+    agent_run = {
+        "retry_count": final_state.get("retry_count", 0),
+        "had_bailout": final_state.get("error") == "Maximum retries exceeded during parameter reflection.",
+        "validation_history": final_state.get("validation_history", []),
+        "parameters": final_state.get("parameters", {}),
+        "formatted_parameters": final_state.get("formatted_parameters", None),
+    }
+    return sim_res, agent_run
+
+
+async def main(
+    input_json_path: str = "evals/evals.json",
+    output_json_path: str = "evals/evals_output.json",
+    model_name: str = "gemma3:27b",
+    llm_base_url: str = "http://localhost:11434",
+    rtol: float = 1e-3,
+    atol: float = 1e-6,
+) -> None:
+    # 1) Load evals
+    with open(input_json_path, "r") as f:
+        evals = json.load(f)
+
+    # 2) Build LLM and graph once
+    llm = OllamaLLM(model=model_name, base_url=llm_base_url)
+    compiled_graph = build_graph(llm)
+
+    # 3) Run each eval in a loop via the agent
+    for key, case in evals.items():
+        user_input = case.get("user_input", "")
+        expected = case.get("parameters", {}).get("expected_results") or case.get("expected_results")
+        # expected_results sits at the top level of each example after enrichment.
+        # If it is not present yet, expected remains None and the comparison is skipped.
+
+        final_state = await run_agent_once(compiled_graph, user_input, llm)
+        sim_res, agent_run = extract_agent_simulation_result(final_state)
+
+        # Record agent run outputs
+        case["agent_run"] = {
+            "simulation_result": sim_res,
+            "meta": agent_run,
+        }
+
+        # 4) Compare to expected_results if available
+        if expected is not None and sim_res is not None:
+            cmp = compare_results(sim_res, expected, rtol=rtol, atol=atol)
+            case["comparison"] = cmp
+            case["passed"] = bool(cmp["pass"])
+        else:
+            case["comparison"] = {
+                "note": "Either expected_results or agent simulation_result missing; comparison skipped."
+            }
+            case["passed"] = False if expected is not None else None
+
+    # 5) Save enriched evals to the output path
+    with open(output_json_path, "w") as f:
+        json.dump(evals, f, indent=2)
+
+    print(f"Wrote evaluation results to {output_json_path}")
+
+
+if __name__ == "__main__":
+    # For CLI usage:
+    # python run_evals.py
+    # Optional: parameterize via env vars or argparse if desired.
+ asyncio.run(main()) From 1e1502078b6d85ef7868a2ace244c2a65951da22 Mon Sep 17 00:00:00 2001 From: TomMonks Date: Tue, 12 Aug 2025 13:26:29 +0100 Subject: [PATCH 18/23] feat(evals): trial run with gemma3:27b - no phoenix --- create_evals.py => evals/create_evals.py | 0 evals/evals_output.json | 633 ++++++++++------------- 2 files changed, 277 insertions(+), 356 deletions(-) rename create_evals.py => evals/create_evals.py (100%) diff --git a/create_evals.py b/evals/create_evals.py similarity index 100% rename from create_evals.py rename to evals/create_evals.py diff --git a/evals/evals_output.json b/evals/evals_output.json index 9453f85..a5ef66d 100644 --- a/evals/evals_output.json +++ b/evals/evals_output.json @@ -23,11 +23,11 @@ }, "agent_run": { "simulation_result": { - "01_mean_waiting_time": 2.9621655675237495, - "02_operator_util": 91.36685915350135, - "03_mean_nurse_waiting_time": 2.958020460174451, - "04_nurse_util": 84.3596921204659, - "05_callback_rate": 38.860103626943 + "01_mean_waiting_time": 2.3504694345055652, + "02_operator_util": 90.86219986414451, + "03_mean_nurse_waiting_time": 4.1106430489118715, + "04_nurse_util": 90.36357093206236, + "05_callback_rate": 42.5531914893617 }, "meta": { "retry_count": 0, @@ -39,7 +39,13 @@ "n_operators": 14, "n_nurses": 12, "mean_iat": 0.57, - "random_seed": 42, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.4, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, + "random_seed": 0, "run_length": 1000 }, "validation_result": { @@ -52,45 +58,51 @@ "n_operators": 14, "n_nurses": 12, "mean_iat": 0.57, - "random_seed": 42, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.4, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, + "random_seed": 0, "run_length": 1000 }, - "formatted_parameters": " ```\n**Parameters used in simulation**\n\n| Parameter | Value |\n|-----------|-------|\n| \ud83d\udcca n_operators | 14 |\n| \ud83d\udcca n_nurses | 12 |\n| \ud83d\udcca mean_iat | 0.57 |\n| \ud83d\udcca random_seed| 42 |\n| \ud83d\udcca run_length | 1000 |\n```" + "formatted_parameters": "**Parameters used in simulation**\n\n| \ud83d\udcc8 Parameter | Value |\n|---|---|\n| n_operators | 14 |\n| n_nurses | 12 |\n| mean_iat | 0.57 |\n| call_low | 5.0 |\n| call_mode | 7.0 |\n| call_high | 10.0 |\n| callback_prob | 0.4 |\n| nurse_consult_low | 10.0 |\n| nurse_consult_high | 20.0 |\n| random_seed | 0 |\n| run_length | 1000 |\n" } }, "comparison": { - "pass": false, + "pass": true, "details": { "01_mean_waiting_time": { "expected": 2.3504694345055652, - "got": 2.9621655675237495, - "match": false + "got": 2.3504694345055652, + "match": true }, "02_operator_util": { "expected": 90.86219986414451, - "got": 91.36685915350135, - "match": false + "got": 90.86219986414451, + "match": true }, "03_mean_nurse_waiting_time": { "expected": 4.1106430489118715, - "got": 2.958020460174451, - "match": false + "got": 4.1106430489118715, + "match": true }, "04_nurse_util": { "expected": 90.36357093206236, - "got": 84.3596921204659, - "match": false + "got": 90.36357093206236, + "match": true }, "05_callback_rate": { "expected": 42.5531914893617, - "got": 38.860103626943, - "match": false + "got": 42.5531914893617, + "match": true } }, "rtol": 0.001, "atol": 1e-06 }, - "passed": false + "passed": true }, "example_2": { "user_input": "Run scenario with double staffing and normal call volume", @@ -116,11 +128,11 @@ }, "agent_run": { "simulation_result": { - "01_mean_waiting_time": 0.24486927533243114, 
- "02_operator_util": 83.1290576750028, - "03_mean_nurse_waiting_time": 212.0429575856493, - "04_nurse_util": 97.94215299366186, - "05_callback_rate": 39.5769361992494 + "01_mean_waiting_time": 0.0, + "02_operator_util": 46.452122409679156, + "03_mean_nurse_waiting_time": 43.94073145744734, + "04_nurse_util": 97.41291238872327, + "05_callback_rate": 42.12121212121212 }, "meta": { "retry_count": 0, @@ -131,8 +143,14 @@ "parameters": { "n_operators": 26, "n_nurses": 10, - "mean_iat": 0.335, - "random_seed": 42, + "mean_iat": 0.6, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.4, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, + "random_seed": 0, "run_length": 1000 }, "validation_result": { @@ -144,11 +162,17 @@ "parameters": { "n_operators": 26, "n_nurses": 10, - "mean_iat": 0.335, - "random_seed": 42, + "mean_iat": 0.6, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.4, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, + "random_seed": 0, "run_length": 1000 }, - "formatted_parameters": " ```markdown\n\ud83d\udcca **Parameters** | **Value**\n-----------------|--------\nn_operators | 26\nn_nurses | 10\nmean_iat | 0.335\nrandom_seed | 42\nrun_length | 1000\n```" + "formatted_parameters": "**Parameters used in simulation**\n\n| \ud83d\udcc8 Parameter | Value |\n|---|---|\n| n_operators | 26 |\n| n_nurses | 10 |\n| mean_iat | 0.6 |\n| call_low | 5.0 |\n| call_mode | 7.0 |\n| call_high | 10.0 |\n| callback_prob | 0.4 |\n| nurse_consult_low | 10.0 |\n| nurse_consult_high | 20.0 |\n| random_seed | 0 |\n| run_length | 1000 |\n" } }, "comparison": { @@ -156,28 +180,28 @@ "details": { "01_mean_waiting_time": { "expected": 0.0, - "got": 0.24486927533243114, - "match": false + "got": 0.0, + "match": true }, "02_operator_util": { "expected": 46.452122409679156, - "got": 83.1290576750028, - "match": false + "got": 46.452122409679156, + "match": true }, "03_mean_nurse_waiting_time": { "expected": 0.0041045663539504606, - "got": 212.0429575856493, + "got": 43.94073145744734, "match": false }, "04_nurse_util": { "expected": 51.20346053052576, - "got": 97.94215299366186, + "got": 97.41291238872327, "match": false }, "05_callback_rate": { "expected": 42.12121212121212, - "got": 39.5769361992494, - "match": false + "got": 42.12121212121212, + "match": true } }, "rtol": 0.001, @@ -209,58 +233,28 @@ }, "agent_run": { "simulation_result": { - "01_mean_waiting_time": 258.690360654302, - "02_operator_util": 99.44580381509923, - "03_mean_nurse_waiting_time": 78.94106695388635, + "01_mean_waiting_time": 230.08648598465186, + "02_operator_util": 99.35481108027054, + "03_mean_nurse_waiting_time": 78.94871019130441, "04_nurse_util": 97.8390040780754, - "05_callback_rate": 42.5414364640884 + "05_callback_rate": 42.48847926267281 }, "meta": { - "retry_count": 3, + "retry_count": 0, "had_bailout": false, "validation_history": [ { "attempt": 1, - "parameters": { - "__json_parse_error__": true - }, - "validation_result": { - "is_valid": false, - "errors": [ - "Unknown parameter: __json_parse_error__" - ] - } - }, - { - "attempt": 2, - "parameters": { - "__json_parse_error__": true - }, - "validation_result": { - "is_valid": false, - "errors": [ - "Unknown parameter: __json_parse_error__" - ] - } - }, - { - "attempt": 3, - "parameters": { - "__json_parse_error__": true - }, - "validation_result": { - "is_valid": false, - "errors": [ - "Unknown parameter: __json_parse_error__" - ] - } - }, - { - "attempt": 4, "parameters": { "n_operators": 8, 
"n_nurses": 6, - "mean_iat": 0.45, + "mean_iat": 0.504, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.4, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, "random_seed": 0, "run_length": 1000 }, @@ -273,11 +267,17 @@ "parameters": { "n_operators": 8, "n_nurses": 6, - "mean_iat": 0.45, + "mean_iat": 0.504, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.4, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, "random_seed": 0, "run_length": 1000 }, - "formatted_parameters": " ```\n**Parameters used in simulation**\n\n| Parameter | Value |\n|-----------|-------|\n| :bar: n_operators | 8 |\n| :bar: n_nurses | 6 |\n| :bar: mean_iat | 0.45 |\n| :bar: random_seed | 0 |\n| :bar: run_length | 1000 |\n```" + "formatted_parameters": "**Parameters used in simulation**\n\n| \ud83d\udcc8 Parameter | Value |\n|---|---|\n| n_operators | 8 |\n| n_nurses | 6 |\n| mean_iat | 0.504 |\n| call_low | 5.0 |\n| call_mode | 7.0 |\n| call_high | 10.0 |\n| callback_prob | 0.4 |\n| nurse_consult_low | 10.0 |\n| nurse_consult_high | 20.0 |\n| random_seed | 0 |\n| run_length | 1000 |\n" } }, "comparison": { @@ -285,17 +285,17 @@ "details": { "01_mean_waiting_time": { "expected": 242.69198606467467, - "got": 258.690360654302, + "got": 230.08648598465186, "match": false }, "02_operator_util": { "expected": 99.35481108027054, - "got": 99.44580381509923, + "got": 99.35481108027054, "match": true }, "03_mean_nurse_waiting_time": { "expected": 78.94431797274639, - "got": 78.94106695388635, + "got": 78.94871019130441, "match": true }, "04_nurse_util": { @@ -305,8 +305,8 @@ }, "05_callback_rate": { "expected": 42.48847926267281, - "got": 42.5414364640884, - "match": false + "got": 42.48847926267281, + "match": true } }, "rtol": 0.001, @@ -339,10 +339,10 @@ "agent_run": { "simulation_result": { "01_mean_waiting_time": 0.0, - "02_operator_util": 27.417403035068194, + "02_operator_util": 31.21492694742085, "03_mean_nurse_waiting_time": 0.0, - "04_nurse_util": 28.0192637065108, - "05_callback_rate": 38.22341857335128 + "04_nurse_util": 35.17900361818962, + "05_callback_rate": 42.17289719626168 }, "meta": { "retry_count": 0, @@ -353,8 +353,14 @@ "parameters": { "n_operators": 20, "n_nurses": 15, - "mean_iat": 1.3, - "random_seed": 42, + "mean_iat": 1.2, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.4, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, + "random_seed": 0, "run_length": 1000 }, "validation_result": { @@ -366,15 +372,21 @@ "parameters": { "n_operators": 20, "n_nurses": 15, - "mean_iat": 1.3, - "random_seed": 42, + "mean_iat": 1.2, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.4, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, + "random_seed": 0, "run_length": 1000 }, - "formatted_parameters": " ```\n\ud83d\udcca **Parameters used in simulation**\n| Parameter | Value |\n|-----------|-------|\n| n_operators | 20 |\n| n_nurses | 15 |\n| mean_iat | 1.3 |\n| random_seed | 42 |\n| run_length | 1000 |\n```" + "formatted_parameters": "**Parameters used in simulation**\n\n| \ud83d\udcc8 Parameter | Value |\n|---|---|\n| n_operators | 20 |\n| n_nurses | 15 |\n| mean_iat | 1.2 |\n| call_low | 5.0 |\n| call_mode | 7.0 |\n| call_high | 10.0 |\n| callback_prob | 0.4 |\n| nurse_consult_low | 10.0 |\n| nurse_consult_high | 20.0 |\n| random_seed | 0 |\n| run_length | 1000 |\n" } }, "comparison": { - "pass": false, + "pass": true, "details": { "01_mean_waiting_time": 
{ "expected": 0.0, @@ -383,8 +395,8 @@ }, "02_operator_util": { "expected": 31.21492694742085, - "got": 27.417403035068194, - "match": false + "got": 31.21492694742085, + "match": true }, "03_mean_nurse_waiting_time": { "expected": 0.0, @@ -393,19 +405,19 @@ }, "04_nurse_util": { "expected": 35.17900361818962, - "got": 28.0192637065108, - "match": false + "got": 35.17900361818962, + "match": true }, "05_callback_rate": { "expected": 42.17289719626168, - "got": 38.22341857335128, - "match": false + "got": 42.17289719626168, + "match": true } }, "rtol": 0.001, "atol": 1e-06 }, - "passed": false + "passed": true }, "example_5": { "user_input": "Run with 10 operators, 8 nurses, 30% callback probability, and 10% reduced demand", @@ -431,75 +443,29 @@ }, "agent_run": { "simulation_result": { - "01_mean_waiting_time": 0.6478945342567207, - "02_operator_util": 76.87777817159127, - "03_mean_nurse_waiting_time": 0.30988264770159224, - "04_nurse_util": 65.36044859488209, - "05_callback_rate": 29.18287937743191 + "01_mean_waiting_time": 46.3970260873325, + "02_operator_util": 98.6111946659619, + "03_mean_nurse_waiting_time": 2.290083053642536, + "04_nurse_util": 82.07127437115734, + "05_callback_rate": 33.11061618411284 }, "meta": { - "retry_count": 3, + "retry_count": 0, "had_bailout": false, "validation_history": [ { "attempt": 1, - "parameters": { - "__json_parse_error__": true - }, - "validation_result": { - "is_valid": false, - "errors": [ - "Unknown parameter: __json_parse_error__" - ] - } - }, - { - "attempt": 2, - "parameters": { - "__json_parse_error__": true - }, - "validation_result": { - "is_valid": false, - "errors": [ - "Unknown parameter: __json_parse_error__" - ] - } - }, - { - "attempt": 3, "parameters": { "n_operators": 10, "n_nurses": 8, - "mean_iat": 0.74, - "callback_prob": 0.3, - "call_mode": 4.2, - "call_high": 9.0, - "call_low": 5.1, - "nurse_consult_low": 10.0, - "nurse_consult_high": 18.0, - "random_seed": 42, - "run_length": 600 - }, - "validation_result": { - "is_valid": false, - "errors": [ - "call_low \u2264 call_mode \u2264 call_high violated" - ] - } - }, - { - "attempt": 4, - "parameters": { - "n_operators": 9, - "n_nurses": 8, - "mean_iat": 0.78, - "call_low": 4.2, - "call_mode": 5.64, - "call_high": 6.3, + "mean_iat": 0.69, "callback_prob": 0.3, "nurse_consult_low": 10.0, - "nurse_consult_high": 18.0, - "random_seed": 42, + "nurse_consult_high": 20.0, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "random_seed": 0, "run_length": 1000 }, "validation_result": { @@ -509,19 +475,19 @@ } ], "parameters": { - "n_operators": 9, + "n_operators": 10, "n_nurses": 8, - "mean_iat": 0.78, - "call_low": 4.2, - "call_mode": 5.64, - "call_high": 6.3, + "mean_iat": 0.69, "callback_prob": 0.3, "nurse_consult_low": 10.0, - "nurse_consult_high": 18.0, - "random_seed": 42, + "nurse_consult_high": 20.0, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "random_seed": 0, "run_length": 1000 }, - "formatted_parameters": " **Parameters used in simulation**\n\n| \ud83d\udcca Parameter | Value |\n|-------------|---|\n| n_operators | 9 |\n| n_nurses | 8 |\n| mean_iat | 0.78 |\n| call_low | 4.2 |\n| call_mode | 5.64 |\n| call_high | 6.3 |\n| callback_prob | 0.3 |\n| nurse_consult_low | 10.0 |\n| nurse_consult_high | 18.0 |\n| random_seed | 42 |\n| run_length | 1000 |" + "formatted_parameters": "**Parameters used in simulation**\n\n| \ud83d\udcc8 Parameter | Value |\n|---|---|\n| n_operators | 10 |\n| n_nurses | 8 |\n| mean_iat | 0.69 |\n| callback_prob | 0.3 |\n| 
nurse_consult_low | 10.0 |\n| nurse_consult_high | 20.0 |\n| call_low | 5.0 |\n| call_mode | 7.0 |\n| call_high | 10.0 |\n| random_seed | 0 |\n| run_length | 1000 |\n" } }, "comparison": { @@ -529,27 +495,27 @@ "details": { "01_mean_waiting_time": { "expected": 62.26336943615235, - "got": 0.6478945342567207, + "got": 46.3970260873325, "match": false }, "02_operator_util": { "expected": 98.91827940297605, - "got": 76.87777817159127, + "got": 98.6111946659619, "match": false }, "03_mean_nurse_waiting_time": { "expected": 2.3551925550777337, - "got": 0.30988264770159224, + "got": 2.290083053642536, "match": false }, "04_nurse_util": { "expected": 82.46774067208518, - "got": 65.36044859488209, + "got": 82.07127437115734, "match": false }, "05_callback_rate": { "expected": 33.012583271650634, - "got": 29.18287937743191, + "got": 33.11061618411284, "match": false } }, @@ -582,11 +548,11 @@ }, "agent_run": { "simulation_result": { - "01_mean_waiting_time": 0.5518432002497017, - "02_operator_util": 89.32672670938162, - "03_mean_nurse_waiting_time": 0.12567485568024855, - "04_nurse_util": 74.88123258139942, - "05_callback_rate": 41.477832512315274 + "01_mean_waiting_time": 161.26826119772568, + "02_operator_util": 99.27336565709264, + "03_mean_nurse_waiting_time": 0.4505143843086236, + "04_nurse_util": 83.59706413413956, + "05_callback_rate": 41.69621749408983 }, "meta": { "retry_count": 0, @@ -597,7 +563,13 @@ "parameters": { "n_operators": 25, "n_nurses": 25, - "mean_iat": 0.33, + "mean_iat": 0.2, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.4, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, "random_seed": 0, "run_length": 1000 }, @@ -610,46 +582,52 @@ "parameters": { "n_operators": 25, "n_nurses": 25, - "mean_iat": 0.33, + "mean_iat": 0.2, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.4, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, "random_seed": 0, "run_length": 1000 }, - "formatted_parameters": " ```\n\ud83d\udcca Parameters used in simulation\n| Parameter | Value |\n|-----------|-------|\n| n_operators | 25 |\n| n_nurses | 25 |\n| mean_iat | 0.33 |\n| random_seed| 0 |\n| run_length | 1000 |\n```" + "formatted_parameters": "**Parameters used in simulation**\n\n| \ud83d\udcc8 Parameter | Value |\n|---|---|\n| n_operators | 25 |\n| n_nurses | 25 |\n| mean_iat | 0.2 |\n| call_low | 5.0 |\n| call_mode | 7.0 |\n| call_high | 10.0 |\n| callback_prob | 0.4 |\n| nurse_consult_low | 10.0 |\n| nurse_consult_high | 20.0 |\n| random_seed | 0 |\n| run_length | 1000 |\n" } }, "comparison": { - "pass": false, + "pass": true, "details": { "01_mean_waiting_time": { "expected": 161.26826119772568, - "got": 0.5518432002497017, - "match": false + "got": 161.26826119772568, + "match": true }, "02_operator_util": { "expected": 99.27336565709264, - "got": 89.32672670938162, - "match": false + "got": 99.27336565709264, + "match": true }, "03_mean_nurse_waiting_time": { "expected": 0.4505143843086236, - "got": 0.12567485568024855, - "match": false + "got": 0.4505143843086236, + "match": true }, "04_nurse_util": { "expected": 83.59706413413956, - "got": 74.88123258139942, - "match": false + "got": 83.59706413413956, + "match": true }, "05_callback_rate": { "expected": 41.69621749408983, - "got": 41.477832512315274, - "match": false + "got": 41.69621749408983, + "match": true } }, "rtol": 0.001, "atol": 1e-06 }, - "passed": false + "passed": true }, "example_7": { "user_input": "Simulate 15 operators, 12 nurses, call 
durations 3-6-9 minutes, 2000 minute runtime", @@ -674,84 +652,87 @@ "05_callback_rate": 41.684147437818396 }, "agent_run": { - "simulation_result": null, + "simulation_result": { + "01_mean_waiting_time": 0.07192106569784922, + "02_operator_util": 66.74240386374329, + "03_mean_nurse_waiting_time": 2.663802537251699, + "04_nurse_util": 86.43094276820304, + "05_callback_rate": 41.684147437818396 + }, "meta": { - "retry_count": 4, + "retry_count": 0, "had_bailout": false, "validation_history": [ { "attempt": 1, - "parameters": { - "__json_parse_error__": true - }, - "validation_result": { - "is_valid": false, - "errors": [ - "Unknown parameter: __json_parse_error__" - ] - } - }, - { - "attempt": 2, "parameters": { "n_operators": 15, "n_nurses": 12, - "mean_iat": 0.67, - "call_low": 3, - "call_mode": 6, - "call_high": 9, + "mean_iat": 0.6, + "call_low": 3.0, + "call_mode": 6.0, + "call_high": 9.0, "callback_prob": 0.4, - "nurse_consult_low": 10, - "nurse_consult_high": 20, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, "random_seed": 0, "run_length": 2000 }, "validation_result": { - "is_valid": false, - "errors": [ - "call_low must be float", - "call_mode must be float", - "call_high must be float", - "nurse_consult_low must be float", - "nurse_consult_high must be float" - ] - } - }, - { - "attempt": 3, - "parameters": { - "__json_parse_error__": true - }, - "validation_result": { - "is_valid": false, - "errors": [ - "Unknown parameter: __json_parse_error__" - ] - } - }, - { - "attempt": 4, - "parameters": { - "__json_parse_error__": true - }, - "validation_result": { - "is_valid": false, - "errors": [ - "Unknown parameter: __json_parse_error__" - ] + "is_valid": true, + "errors": [] } } ], "parameters": { - "__json_parse_error__": true + "n_operators": 15, + "n_nurses": 12, + "mean_iat": 0.6, + "call_low": 3.0, + "call_mode": 6.0, + "call_high": 9.0, + "callback_prob": 0.4, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, + "random_seed": 0, + "run_length": 2000 }, - "formatted_parameters": null + "formatted_parameters": "**Parameters used in simulation**\n\n| \ud83d\udcca Parameter | Value |\n|---|---|\n| n_operators | 15 |\n| n_nurses | 12 |\n| mean_iat | 0.6 |\n| call_low | 3.0 |\n| call_mode | 6.0 |\n| call_high | 9.0 |\n| callback_prob | 0.4 |\n| nurse_consult_low | 10.0 |\n| nurse_consult_high | 20.0 |\n| random_seed | 0 |\n| run_length | 2000 |\n" } }, "comparison": { - "note": "Either expected_results or agent simulation_result missing; comparison skipped." 
+ "pass": true, + "details": { + "01_mean_waiting_time": { + "expected": 0.07192106569784922, + "got": 0.07192106569784922, + "match": true + }, + "02_operator_util": { + "expected": 66.74240386374329, + "got": 66.74240386374329, + "match": true + }, + "03_mean_nurse_waiting_time": { + "expected": 2.663802537251699, + "got": 2.663802537251699, + "match": true + }, + "04_nurse_util": { + "expected": 86.43094276820304, + "got": 86.43094276820304, + "match": true + }, + "05_callback_rate": { + "expected": 41.684147437818396, + "got": 41.684147437818396, + "match": true + } + }, + "rtol": 0.001, + "atol": 1e-06 }, - "passed": false + "passed": true }, "example_8": { "user_input": "Run with 6 operators, 4 nurses, 60% callback rate, and 25% higher demand", @@ -777,11 +758,11 @@ }, "agent_run": { "simulation_result": { - "01_mean_waiting_time": 330.661658162098, - "02_operator_util": 99.47452901988686, - "03_mean_nurse_waiting_time": 223.8173355488663, - "04_nurse_util": 98.39302830500621, - "05_callback_rate": 61.76470588235294 + "01_mean_waiting_time": 306.74172321599656, + "02_operator_util": 99.45155480236895, + "03_mean_nurse_waiting_time": 196.98238666596234, + "04_nurse_util": 97.69465771729217, + "05_callback_rate": 57.8239608801956 }, "meta": { "retry_count": 0, @@ -792,10 +773,9 @@ "parameters": { "n_operators": 6, "n_nurses": 4, - "mean_iat": 0.42, - "call_mode": 7.8, + "mean_iat": 0.48, "callback_prob": 0.6, - "random_seed": 42, + "random_seed": 0, "run_length": 1000 }, "validation_result": { @@ -807,13 +787,12 @@ "parameters": { "n_operators": 6, "n_nurses": 4, - "mean_iat": 0.42, - "call_mode": 7.8, + "mean_iat": 0.48, "callback_prob": 0.6, - "random_seed": 42, + "random_seed": 0, "run_length": 1000 }, - "formatted_parameters": " ```\n\ud83d\udcca Parameters used in simulation\n------------------------------\n| Parameter | Value |\n| --------------- | ------- |\n| n_operators | 6 |\n| n_nurses | 4 |\n| mean_iat | 0.42 |\n| call_mode | 7.8 |\n| callback_prob | 0.6 |\n| random_seed | 42 |\n| run_length | 1000 |\n```" + "formatted_parameters": "**Parameters used in simulation**\n\n| \ud83d\udcca Parameter | Value |\n|---|---|\n| n_operators | 6 |\n| n_nurses | 4 |\n| mean_iat | 0.48 |\n| callback_prob | 0.6 |\n| random_seed | 0 |\n| run_length | 1000 |\n" } }, "comparison": { @@ -821,28 +800,28 @@ "details": { "01_mean_waiting_time": { "expected": 318.98668257451686, - "got": 330.661658162098, + "got": 306.74172321599656, "match": false }, "02_operator_util": { "expected": 99.56949574531609, - "got": 99.47452901988686, - "match": true + "got": 99.45155480236895, + "match": false }, "03_mean_nurse_waiting_time": { "expected": 196.98753442211358, - "got": 223.8173355488663, - "match": false + "got": 196.98238666596234, + "match": true }, "04_nurse_util": { "expected": 97.69465771729217, - "got": 98.39302830500621, - "match": false + "got": 97.69465771729217, + "match": true }, "05_callback_rate": { "expected": 57.87545787545788, - "got": 61.76470588235294, - "match": false + "got": 57.8239608801956, + "match": true } }, "rtol": 0.001, @@ -881,23 +860,11 @@ "05_callback_rate": 42.12121212121212 }, "meta": { - "retry_count": 1, + "retry_count": 0, "had_bailout": false, "validation_history": [ { "attempt": 1, - "parameters": { - "__json_parse_error__": true - }, - "validation_result": { - "is_valid": false, - "errors": [ - "Unknown parameter: __json_parse_error__" - ] - } - }, - { - "attempt": 2, "parameters": { "n_operators": 18, "n_nurses": 14, @@ -930,7 +897,7 @@ "random_seed": 0, 
"run_length": 1000 }, - "formatted_parameters": " ```md\n\ud83d\udcca **Parameters used in simulation**\n| Parameter | Value |\n| --- | --- |\n| n_operators | 18 |\n| n_nurses | 14 |\n| mean_iat | 0.6 |\n| call_low | 5.0 |\n| call_mode | 7.0 |\n| call_high | 10.0 |\n| callback_prob | 0.4 |\n| nurse_consult_low | 15.0 |\n| nurse_consult_high | 25.0 |\n| random_seed | 0 |\n| run_length | 1000 |\n```" + "formatted_parameters": "**Parameters used in simulation**\n\n| \ud83d\udcc8 Parameter | Value |\n|---|---|\n| n_operators | 18 |\n| n_nurses | 14 |\n| mean_iat | 0.6 |\n| call_low | 5.0 |\n| call_mode | 7.0 |\n| call_high | 10.0 |\n| callback_prob | 0.4 |\n| nurse_consult_low | 15.0 |\n| nurse_consult_high | 25.0 |\n| random_seed | 0 |\n| run_length | 1000 |\n" } }, "comparison": { @@ -991,14 +958,14 @@ }, "agent_run": { "simulation_result": { - "01_mean_waiting_time": 0.0, - "02_operator_util": 23.33484123800376, - "03_mean_nurse_waiting_time": 8.70119617931026, - "04_nurse_util": 87.09750711871568, - "05_callback_rate": 47.963800904977376 + "01_mean_waiting_time": 1.2583822886248166, + "02_operator_util": 84.9856571728051, + "03_mean_nurse_waiting_time": 46.17149830527762, + "04_nurse_util": 97.12003253429981, + "05_callback_rate": 50.43103448275862 }, "meta": { - "retry_count": 2, + "retry_count": 0, "had_bailout": false, "validation_history": [ { @@ -1006,60 +973,14 @@ "parameters": { "n_operators": 12, "n_nurses": 10, + "mean_iat": 0.72, "callback_prob": 0.5, - "mean_iat": 0.9, - "call_mode": 3.6, - "call_low": 4.8, - "call_high": 7.2, - "nurse_consult_low": 16, - "nurse_consult_high": 24, - "random_seed": 0, - "run_length": 1000 - }, - "validation_result": { - "is_valid": false, - "errors": [ - "nurse_consult_low must be float", - "nurse_consult_high must be float", - "call_low \u2264 call_mode \u2264 call_high violated" - ] - } - }, - { - "attempt": 2, - "parameters": { - "n_operators": 12, - "n_nurses": 10, - "mean_iat": 0.48, - "callback_prob": 0.5, - "call_low": 6.0, - "call_mode": 7.2, - "call_high": 9.6, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, "nurse_consult_low": 10.0, - "nurse_consult_high": 8.0, - "random_seed": 42, - "run_length": 1000 - }, - "validation_result": { - "is_valid": false, - "errors": [ - "nurse_consult_low \u2264 nurse_consult_high violated" - ] - } - }, - { - "attempt": 3, - "parameters": { - "n_operators": 12, - "n_nurses": 10, - "mean_iat": 1.48, - "callback_prob": 0.5, - "call_mode": 3.6, - "call_low": 3.0, - "call_high": 6.0, - "nurse_consult_low": 24.0, - "nurse_consult_high": 32.0, - "random_seed": 42, + "nurse_consult_high": 20.0, + "random_seed": 0, "run_length": 1000 }, "validation_result": { @@ -1071,17 +992,17 @@ "parameters": { "n_operators": 12, "n_nurses": 10, - "mean_iat": 1.48, + "mean_iat": 0.72, "callback_prob": 0.5, - "call_mode": 3.6, - "call_low": 3.0, - "call_high": 6.0, - "nurse_consult_low": 24.0, - "nurse_consult_high": 32.0, - "random_seed": 42, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, + "random_seed": 0, "run_length": 1000 }, - "formatted_parameters": " **Parameters used in simulation**\n\n| :heavy_check_mark: Parameter | Value |\n|-----------------------------|-------|\n| n\\_operators | 12 |\n| n\\_nurses | 10 |\n| mean\\_iat | 1.48 |\n| callback\\_prob | 0.5 |\n| call\\_mode | 3.6 |\n| call\\_low | 3.0 |\n| call\\_high | 6.0 |\n| nurse\\_consult\\_low | 24.0 |\n| nurse\\_consult\\_high | 32.0 |\n| random\\_seed | 42 |\n| run\\_length | 
1000 |"
+        "formatted_parameters": "**Parameters used in simulation**\n\n| \ud83d\udcca Parameter | Value |\n|---|---|\n| n_operators | 12 |\n| n_nurses | 10 |\n| mean_iat | 0.72 |\n| callback_prob | 0.5 |\n| call_low | 5.0 |\n| call_mode | 7.0 |\n| call_high | 10.0 |\n| nurse_consult_low | 10.0 |\n| nurse_consult_high | 20.0 |\n| random_seed | 0 |\n| run_length | 1000 |\n"
       }
     },
     "comparison": {
@@ -1089,27 +1010,27 @@
       "details": {
         "01_mean_waiting_time": {
           "expected": 0.38280107383630124,
-          "got": 0.0,
+          "got": 1.2583822886248166,
           "match": false
         },
         "02_operator_util": {
           "expected": 73.44752978233386,
-          "got": 23.33484123800376,
+          "got": 84.9856571728051,
           "match": false
         },
         "03_mean_nurse_waiting_time": {
           "expected": 4.147154100012408,
-          "got": 8.70119617931026,
+          "got": 46.17149830527762,
           "match": false
         },
         "04_nurse_util": {
           "expected": 90.70926116406883,
-          "got": 87.09750711871568,
+          "got": 97.12003253429981,
           "match": false
         },
         "05_callback_rate": {
           "expected": 51.4546965918537,
-          "got": 47.963800904977376,
+          "got": 50.43103448275862,
           "match": false
         }
       },

From 5f896a64522494308bb4a4236df5bcdfe038068a Mon Sep 17 00:00:00 2001
From: TomMonks
Date: Tue, 12 Aug 2025 14:12:35 +0100
Subject: [PATCH 19/23] docs(env) + main arize-phoenix

---
 environment.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/environment.yml b/environment.yml
index e974ae9..1946eb8 100644
--- a/environment.yml
+++ b/environment.yml
@@ -1,4 +1,4 @@
-name: des-agent
+name: sim-agent
 channels:
   - conda-forge
 dependencies:
@@ -14,6 +14,7 @@ dependencies:
   - tabulate
   - langgraph
   - pip:
+    - arize-phoenix
    - arize-phoenix-otel
    - langchain
    - langchain-community

From 42c7a8b2cdd9104e09218cceadac374a3480fba3 Mon Sep 17 00:00:00 2001
From: TomMonks
Date: Tue, 12 Aug 2025 16:15:14 +0100
Subject: [PATCH 20/23] feat(evals): run all evals in one go or limit to n of set

---
 agent_self_reflection.py |   2 +-
 evals/evals_output.json  | 914 +++++----------------------------------
 run_evals.py             | 329 ++++++++------
 3 files changed, 315 insertions(+), 930 deletions(-)

diff --git a/agent_self_reflection.py b/agent_self_reflection.py
index 11c869f..ae6d18f 100644
--- a/agent_self_reflection.py
+++ b/agent_self_reflection.py
@@ -34,7 +34,7 @@ Hard constraints
 -------------
 MAX_RETRIES : int
-    Maximum number of parameter generation attempts before bailout (default: 4)
+    Maximum number of parameter generation attempts before bailout (default: 4)
 
 Examples
 --------
diff --git a/evals/evals_output.json b/evals/evals_output.json
index a5ef66d..063aeb5 100644
--- a/evals/evals_output.json
+++ b/evals/evals_output.json
@@ -21,88 +21,18 @@
       "04_nurse_util": 90.36357093206236,
       "05_callback_rate": 42.5531914893617
     },
-    "agent_run": {
-      "simulation_result": {
-        "01_mean_waiting_time": 2.3504694345055652,
-        "02_operator_util": 90.86219986414451,
-        "03_mean_nurse_waiting_time": 4.1106430489118715,
-        "04_nurse_util": 90.36357093206236,
-        "05_callback_rate": 42.5531914893617
-      },
-      "meta": {
-        "retry_count": 0,
-        "had_bailout": false,
-        "validation_history": [
-          {
-            "attempt": 1,
-            "parameters": {
-              "n_operators": 14,
-              "n_nurses": 12,
-              "mean_iat": 0.57,
-              "call_low": 5.0,
-              "call_mode": 7.0,
-              "call_high": 10.0,
-              "callback_prob": 0.4,
-              "nurse_consult_low": 10.0,
-              "nurse_consult_high": 20.0,
-              "random_seed": 0,
-              "run_length": 1000
-            },
-            "validation_result": {
-              "is_valid": true,
-              "errors": []
-            }
-          }
-        ],
-        "parameters": {
-          "n_operators": 14,
-          "n_nurses": 12,
-          "mean_iat": 0.57,
-          "call_low": 5.0,
-          "call_mode": 7.0,
-          "call_high": 10.0,
- "callback_prob": 0.4, - "nurse_consult_low": 10.0, - "nurse_consult_high": 20.0, - "random_seed": 0, - "run_length": 1000 - }, - "formatted_parameters": "**Parameters used in simulation**\n\n| \ud83d\udcc8 Parameter | Value |\n|---|---|\n| n_operators | 14 |\n| n_nurses | 12 |\n| mean_iat | 0.57 |\n| call_low | 5.0 |\n| call_mode | 7.0 |\n| call_high | 10.0 |\n| callback_prob | 0.4 |\n| nurse_consult_low | 10.0 |\n| nurse_consult_high | 20.0 |\n| random_seed | 0 |\n| run_length | 1000 |\n" - } - }, - "comparison": { - "pass": true, - "details": { - "01_mean_waiting_time": { - "expected": 2.3504694345055652, - "got": 2.3504694345055652, - "match": true - }, - "02_operator_util": { - "expected": 90.86219986414451, - "got": 90.86219986414451, - "match": true - }, - "03_mean_nurse_waiting_time": { - "expected": 4.1106430489118715, - "got": 4.1106430489118715, - "match": true - }, - "04_nurse_util": { - "expected": 90.36357093206236, - "got": 90.36357093206236, - "match": true - }, - "05_callback_rate": { - "expected": 42.5531914893617, - "got": 42.5531914893617, - "match": true - } - }, - "rtol": 0.001, - "atol": 1e-06 + "agent_result": { + "01_mean_waiting_time": 2.3504694345055652, + "02_operator_util": 90.86219986414451, + "03_mean_nurse_waiting_time": 4.1106430489118715, + "04_nurse_util": 90.36357093206236, + "05_callback_rate": 42.5531914893617 }, - "passed": true + "passed": true, + "context": { + "trace_id": "f5261ab566c4e5bebfbea7cc7f28b71c", + "span_id": "fd13f4f202d0d7a5" + } }, "example_2": { "user_input": "Run scenario with double staffing and normal call volume", @@ -126,88 +56,18 @@ "04_nurse_util": 51.20346053052576, "05_callback_rate": 42.12121212121212 }, - "agent_run": { - "simulation_result": { - "01_mean_waiting_time": 0.0, - "02_operator_util": 46.452122409679156, - "03_mean_nurse_waiting_time": 43.94073145744734, - "04_nurse_util": 97.41291238872327, - "05_callback_rate": 42.12121212121212 - }, - "meta": { - "retry_count": 0, - "had_bailout": false, - "validation_history": [ - { - "attempt": 1, - "parameters": { - "n_operators": 26, - "n_nurses": 10, - "mean_iat": 0.6, - "call_low": 5.0, - "call_mode": 7.0, - "call_high": 10.0, - "callback_prob": 0.4, - "nurse_consult_low": 10.0, - "nurse_consult_high": 20.0, - "random_seed": 0, - "run_length": 1000 - }, - "validation_result": { - "is_valid": true, - "errors": [] - } - } - ], - "parameters": { - "n_operators": 26, - "n_nurses": 10, - "mean_iat": 0.6, - "call_low": 5.0, - "call_mode": 7.0, - "call_high": 10.0, - "callback_prob": 0.4, - "nurse_consult_low": 10.0, - "nurse_consult_high": 20.0, - "random_seed": 0, - "run_length": 1000 - }, - "formatted_parameters": "**Parameters used in simulation**\n\n| \ud83d\udcc8 Parameter | Value |\n|---|---|\n| n_operators | 26 |\n| n_nurses | 10 |\n| mean_iat | 0.6 |\n| call_low | 5.0 |\n| call_mode | 7.0 |\n| call_high | 10.0 |\n| callback_prob | 0.4 |\n| nurse_consult_low | 10.0 |\n| nurse_consult_high | 20.0 |\n| random_seed | 0 |\n| run_length | 1000 |\n" - } - }, - "comparison": { - "pass": false, - "details": { - "01_mean_waiting_time": { - "expected": 0.0, - "got": 0.0, - "match": true - }, - "02_operator_util": { - "expected": 46.452122409679156, - "got": 46.452122409679156, - "match": true - }, - "03_mean_nurse_waiting_time": { - "expected": 0.0041045663539504606, - "got": 43.94073145744734, - "match": false - }, - "04_nurse_util": { - "expected": 51.20346053052576, - "got": 97.41291238872327, - "match": false - }, - "05_callback_rate": { - "expected": 42.12121212121212, - 
"got": 42.12121212121212, - "match": true - } - }, - "rtol": 0.001, - "atol": 1e-06 + "agent_result": { + "01_mean_waiting_time": 0.0, + "02_operator_util": 46.452122409679156, + "03_mean_nurse_waiting_time": 43.94073145744734, + "04_nurse_util": 97.41291238872327, + "05_callback_rate": 42.12121212121212 }, - "passed": false + "passed": false, + "context": { + "trace_id": "c93069f691d1a1edbca3886db55bc2cc", + "span_id": "1c4b8ed595ac9a41" + } }, "example_3": { "user_input": "Test configuration with 8 operators, 6 nurses, and 20% increased demand", @@ -231,88 +91,18 @@ "04_nurse_util": 97.8390040780754, "05_callback_rate": 42.48847926267281 }, - "agent_run": { - "simulation_result": { - "01_mean_waiting_time": 230.08648598465186, - "02_operator_util": 99.35481108027054, - "03_mean_nurse_waiting_time": 78.94871019130441, - "04_nurse_util": 97.8390040780754, - "05_callback_rate": 42.48847926267281 - }, - "meta": { - "retry_count": 0, - "had_bailout": false, - "validation_history": [ - { - "attempt": 1, - "parameters": { - "n_operators": 8, - "n_nurses": 6, - "mean_iat": 0.504, - "call_low": 5.0, - "call_mode": 7.0, - "call_high": 10.0, - "callback_prob": 0.4, - "nurse_consult_low": 10.0, - "nurse_consult_high": 20.0, - "random_seed": 0, - "run_length": 1000 - }, - "validation_result": { - "is_valid": true, - "errors": [] - } - } - ], - "parameters": { - "n_operators": 8, - "n_nurses": 6, - "mean_iat": 0.504, - "call_low": 5.0, - "call_mode": 7.0, - "call_high": 10.0, - "callback_prob": 0.4, - "nurse_consult_low": 10.0, - "nurse_consult_high": 20.0, - "random_seed": 0, - "run_length": 1000 - }, - "formatted_parameters": "**Parameters used in simulation**\n\n| \ud83d\udcc8 Parameter | Value |\n|---|---|\n| n_operators | 8 |\n| n_nurses | 6 |\n| mean_iat | 0.504 |\n| call_low | 5.0 |\n| call_mode | 7.0 |\n| call_high | 10.0 |\n| callback_prob | 0.4 |\n| nurse_consult_low | 10.0 |\n| nurse_consult_high | 20.0 |\n| random_seed | 0 |\n| run_length | 1000 |\n" - } - }, - "comparison": { - "pass": false, - "details": { - "01_mean_waiting_time": { - "expected": 242.69198606467467, - "got": 230.08648598465186, - "match": false - }, - "02_operator_util": { - "expected": 99.35481108027054, - "got": 99.35481108027054, - "match": true - }, - "03_mean_nurse_waiting_time": { - "expected": 78.94431797274639, - "got": 78.94871019130441, - "match": true - }, - "04_nurse_util": { - "expected": 97.8390040780754, - "got": 97.8390040780754, - "match": true - }, - "05_callback_rate": { - "expected": 42.48847926267281, - "got": 42.48847926267281, - "match": true - } - }, - "rtol": 0.001, - "atol": 1e-06 + "agent_result": { + "01_mean_waiting_time": 230.08648598465186, + "02_operator_util": 99.35481108027054, + "03_mean_nurse_waiting_time": 78.94871019130441, + "04_nurse_util": 97.8390040780754, + "05_callback_rate": 42.48847926267281 }, - "passed": false + "passed": false, + "context": { + "trace_id": "0f08ea84447ef6e9bdb05a17920807db", + "span_id": "b87894c014f8cf28" + } }, "example_4": { "user_input": "Simulate with 20 operators, 15 nurses, and half the usual call volume", @@ -336,88 +126,18 @@ "04_nurse_util": 35.17900361818962, "05_callback_rate": 42.17289719626168 }, - "agent_run": { - "simulation_result": { - "01_mean_waiting_time": 0.0, - "02_operator_util": 31.21492694742085, - "03_mean_nurse_waiting_time": 0.0, - "04_nurse_util": 35.17900361818962, - "05_callback_rate": 42.17289719626168 - }, - "meta": { - "retry_count": 0, - "had_bailout": false, - "validation_history": [ - { - "attempt": 1, - "parameters": 
{ - "n_operators": 20, - "n_nurses": 15, - "mean_iat": 1.2, - "call_low": 5.0, - "call_mode": 7.0, - "call_high": 10.0, - "callback_prob": 0.4, - "nurse_consult_low": 10.0, - "nurse_consult_high": 20.0, - "random_seed": 0, - "run_length": 1000 - }, - "validation_result": { - "is_valid": true, - "errors": [] - } - } - ], - "parameters": { - "n_operators": 20, - "n_nurses": 15, - "mean_iat": 1.2, - "call_low": 5.0, - "call_mode": 7.0, - "call_high": 10.0, - "callback_prob": 0.4, - "nurse_consult_low": 10.0, - "nurse_consult_high": 20.0, - "random_seed": 0, - "run_length": 1000 - }, - "formatted_parameters": "**Parameters used in simulation**\n\n| \ud83d\udcc8 Parameter | Value |\n|---|---|\n| n_operators | 20 |\n| n_nurses | 15 |\n| mean_iat | 1.2 |\n| call_low | 5.0 |\n| call_mode | 7.0 |\n| call_high | 10.0 |\n| callback_prob | 0.4 |\n| nurse_consult_low | 10.0 |\n| nurse_consult_high | 20.0 |\n| random_seed | 0 |\n| run_length | 1000 |\n" - } - }, - "comparison": { - "pass": true, - "details": { - "01_mean_waiting_time": { - "expected": 0.0, - "got": 0.0, - "match": true - }, - "02_operator_util": { - "expected": 31.21492694742085, - "got": 31.21492694742085, - "match": true - }, - "03_mean_nurse_waiting_time": { - "expected": 0.0, - "got": 0.0, - "match": true - }, - "04_nurse_util": { - "expected": 35.17900361818962, - "got": 35.17900361818962, - "match": true - }, - "05_callback_rate": { - "expected": 42.17289719626168, - "got": 42.17289719626168, - "match": true - } - }, - "rtol": 0.001, - "atol": 1e-06 + "agent_result": { + "01_mean_waiting_time": 0.0, + "02_operator_util": 31.21492694742085, + "03_mean_nurse_waiting_time": 0.0, + "04_nurse_util": 35.17900361818962, + "05_callback_rate": 42.17289719626168 }, - "passed": true + "passed": true, + "context": { + "trace_id": "3a7d4f511089509f19d615d5b04a725d", + "span_id": "926e778adc644f4c" + } }, "example_5": { "user_input": "Run with 10 operators, 8 nurses, 30% callback probability, and 10% reduced demand", @@ -441,88 +161,18 @@ "04_nurse_util": 82.46774067208518, "05_callback_rate": 33.012583271650634 }, - "agent_run": { - "simulation_result": { - "01_mean_waiting_time": 46.3970260873325, - "02_operator_util": 98.6111946659619, - "03_mean_nurse_waiting_time": 2.290083053642536, - "04_nurse_util": 82.07127437115734, - "05_callback_rate": 33.11061618411284 - }, - "meta": { - "retry_count": 0, - "had_bailout": false, - "validation_history": [ - { - "attempt": 1, - "parameters": { - "n_operators": 10, - "n_nurses": 8, - "mean_iat": 0.69, - "callback_prob": 0.3, - "nurse_consult_low": 10.0, - "nurse_consult_high": 20.0, - "call_low": 5.0, - "call_mode": 7.0, - "call_high": 10.0, - "random_seed": 0, - "run_length": 1000 - }, - "validation_result": { - "is_valid": true, - "errors": [] - } - } - ], - "parameters": { - "n_operators": 10, - "n_nurses": 8, - "mean_iat": 0.69, - "callback_prob": 0.3, - "nurse_consult_low": 10.0, - "nurse_consult_high": 20.0, - "call_low": 5.0, - "call_mode": 7.0, - "call_high": 10.0, - "random_seed": 0, - "run_length": 1000 - }, - "formatted_parameters": "**Parameters used in simulation**\n\n| \ud83d\udcc8 Parameter | Value |\n|---|---|\n| n_operators | 10 |\n| n_nurses | 8 |\n| mean_iat | 0.69 |\n| callback_prob | 0.3 |\n| nurse_consult_low | 10.0 |\n| nurse_consult_high | 20.0 |\n| call_low | 5.0 |\n| call_mode | 7.0 |\n| call_high | 10.0 |\n| random_seed | 0 |\n| run_length | 1000 |\n" - } - }, - "comparison": { - "pass": false, - "details": { - "01_mean_waiting_time": { - "expected": 62.26336943615235, - 
"got": 46.3970260873325, - "match": false - }, - "02_operator_util": { - "expected": 98.91827940297605, - "got": 98.6111946659619, - "match": false - }, - "03_mean_nurse_waiting_time": { - "expected": 2.3551925550777337, - "got": 2.290083053642536, - "match": false - }, - "04_nurse_util": { - "expected": 82.46774067208518, - "got": 82.07127437115734, - "match": false - }, - "05_callback_rate": { - "expected": 33.012583271650634, - "got": 33.11061618411284, - "match": false - } - }, - "rtol": 0.001, - "atol": 1e-06 - }, - "passed": false + "agent_result": { + "01_mean_waiting_time": 46.3970260873325, + "02_operator_util": 98.6111946659619, + "03_mean_nurse_waiting_time": 2.290083053642536, + "04_nurse_util": 82.07127437115734, + "05_callback_rate": 33.11061618411284 + }, + "passed": false, + "context": { + "trace_id": "3dd0f874abcf87f832d31fb352632636", + "span_id": "ad80c628897ebbbe" + } }, "example_6": { "user_input": "Test scenario: 25 operators, 25 nurses, triple demand", @@ -546,88 +196,18 @@ "04_nurse_util": 83.59706413413956, "05_callback_rate": 41.69621749408983 }, - "agent_run": { - "simulation_result": { - "01_mean_waiting_time": 161.26826119772568, - "02_operator_util": 99.27336565709264, - "03_mean_nurse_waiting_time": 0.4505143843086236, - "04_nurse_util": 83.59706413413956, - "05_callback_rate": 41.69621749408983 - }, - "meta": { - "retry_count": 0, - "had_bailout": false, - "validation_history": [ - { - "attempt": 1, - "parameters": { - "n_operators": 25, - "n_nurses": 25, - "mean_iat": 0.2, - "call_low": 5.0, - "call_mode": 7.0, - "call_high": 10.0, - "callback_prob": 0.4, - "nurse_consult_low": 10.0, - "nurse_consult_high": 20.0, - "random_seed": 0, - "run_length": 1000 - }, - "validation_result": { - "is_valid": true, - "errors": [] - } - } - ], - "parameters": { - "n_operators": 25, - "n_nurses": 25, - "mean_iat": 0.2, - "call_low": 5.0, - "call_mode": 7.0, - "call_high": 10.0, - "callback_prob": 0.4, - "nurse_consult_low": 10.0, - "nurse_consult_high": 20.0, - "random_seed": 0, - "run_length": 1000 - }, - "formatted_parameters": "**Parameters used in simulation**\n\n| \ud83d\udcc8 Parameter | Value |\n|---|---|\n| n_operators | 25 |\n| n_nurses | 25 |\n| mean_iat | 0.2 |\n| call_low | 5.0 |\n| call_mode | 7.0 |\n| call_high | 10.0 |\n| callback_prob | 0.4 |\n| nurse_consult_low | 10.0 |\n| nurse_consult_high | 20.0 |\n| random_seed | 0 |\n| run_length | 1000 |\n" - } - }, - "comparison": { - "pass": true, - "details": { - "01_mean_waiting_time": { - "expected": 161.26826119772568, - "got": 161.26826119772568, - "match": true - }, - "02_operator_util": { - "expected": 99.27336565709264, - "got": 99.27336565709264, - "match": true - }, - "03_mean_nurse_waiting_time": { - "expected": 0.4505143843086236, - "got": 0.4505143843086236, - "match": true - }, - "04_nurse_util": { - "expected": 83.59706413413956, - "got": 83.59706413413956, - "match": true - }, - "05_callback_rate": { - "expected": 41.69621749408983, - "got": 41.69621749408983, - "match": true - } - }, - "rtol": 0.001, - "atol": 1e-06 + "agent_result": { + "01_mean_waiting_time": 161.26826119772568, + "02_operator_util": 99.27336565709264, + "03_mean_nurse_waiting_time": 0.4505143843086236, + "04_nurse_util": 83.59706413413956, + "05_callback_rate": 41.69621749408983 }, - "passed": true + "passed": true, + "context": { + "trace_id": "7665b2b29408259e148fccd619ac80e6", + "span_id": "06181825b8538a83" + } }, "example_7": { "user_input": "Simulate 15 operators, 12 nurses, call durations 3-6-9 minutes, 2000 minute 
runtime", @@ -651,88 +231,18 @@ "04_nurse_util": 86.43094276820304, "05_callback_rate": 41.684147437818396 }, - "agent_run": { - "simulation_result": { - "01_mean_waiting_time": 0.07192106569784922, - "02_operator_util": 66.74240386374329, - "03_mean_nurse_waiting_time": 2.663802537251699, - "04_nurse_util": 86.43094276820304, - "05_callback_rate": 41.684147437818396 - }, - "meta": { - "retry_count": 0, - "had_bailout": false, - "validation_history": [ - { - "attempt": 1, - "parameters": { - "n_operators": 15, - "n_nurses": 12, - "mean_iat": 0.6, - "call_low": 3.0, - "call_mode": 6.0, - "call_high": 9.0, - "callback_prob": 0.4, - "nurse_consult_low": 10.0, - "nurse_consult_high": 20.0, - "random_seed": 0, - "run_length": 2000 - }, - "validation_result": { - "is_valid": true, - "errors": [] - } - } - ], - "parameters": { - "n_operators": 15, - "n_nurses": 12, - "mean_iat": 0.6, - "call_low": 3.0, - "call_mode": 6.0, - "call_high": 9.0, - "callback_prob": 0.4, - "nurse_consult_low": 10.0, - "nurse_consult_high": 20.0, - "random_seed": 0, - "run_length": 2000 - }, - "formatted_parameters": "**Parameters used in simulation**\n\n| \ud83d\udcca Parameter | Value |\n|---|---|\n| n_operators | 15 |\n| n_nurses | 12 |\n| mean_iat | 0.6 |\n| call_low | 3.0 |\n| call_mode | 6.0 |\n| call_high | 9.0 |\n| callback_prob | 0.4 |\n| nurse_consult_low | 10.0 |\n| nurse_consult_high | 20.0 |\n| random_seed | 0 |\n| run_length | 2000 |\n" - } - }, - "comparison": { - "pass": true, - "details": { - "01_mean_waiting_time": { - "expected": 0.07192106569784922, - "got": 0.07192106569784922, - "match": true - }, - "02_operator_util": { - "expected": 66.74240386374329, - "got": 66.74240386374329, - "match": true - }, - "03_mean_nurse_waiting_time": { - "expected": 2.663802537251699, - "got": 2.663802537251699, - "match": true - }, - "04_nurse_util": { - "expected": 86.43094276820304, - "got": 86.43094276820304, - "match": true - }, - "05_callback_rate": { - "expected": 41.684147437818396, - "got": 41.684147437818396, - "match": true - } - }, - "rtol": 0.001, - "atol": 1e-06 + "agent_result": { + "01_mean_waiting_time": 0.07192106569784922, + "02_operator_util": 66.74240386374329, + "03_mean_nurse_waiting_time": 2.663802537251699, + "04_nurse_util": 86.43094276820304, + "05_callback_rate": 41.684147437818396 }, - "passed": true + "passed": true, + "context": { + "trace_id": "351c928786d9364f8bf28983dd16c0a8", + "span_id": "6bffbbff22841ac8" + } }, "example_8": { "user_input": "Run with 6 operators, 4 nurses, 60% callback rate, and 25% higher demand", @@ -756,78 +266,18 @@ "04_nurse_util": 97.69465771729217, "05_callback_rate": 57.87545787545788 }, - "agent_run": { - "simulation_result": { - "01_mean_waiting_time": 306.74172321599656, - "02_operator_util": 99.45155480236895, - "03_mean_nurse_waiting_time": 196.98238666596234, - "04_nurse_util": 97.69465771729217, - "05_callback_rate": 57.8239608801956 - }, - "meta": { - "retry_count": 0, - "had_bailout": false, - "validation_history": [ - { - "attempt": 1, - "parameters": { - "n_operators": 6, - "n_nurses": 4, - "mean_iat": 0.48, - "callback_prob": 0.6, - "random_seed": 0, - "run_length": 1000 - }, - "validation_result": { - "is_valid": true, - "errors": [] - } - } - ], - "parameters": { - "n_operators": 6, - "n_nurses": 4, - "mean_iat": 0.48, - "callback_prob": 0.6, - "random_seed": 0, - "run_length": 1000 - }, - "formatted_parameters": "**Parameters used in simulation**\n\n| \ud83d\udcca Parameter | Value |\n|---|---|\n| n_operators | 6 |\n| n_nurses | 4 |\n| 
mean_iat | 0.48 |\n| callback_prob | 0.6 |\n| random_seed | 0 |\n| run_length | 1000 |\n" - } - }, - "comparison": { - "pass": false, - "details": { - "01_mean_waiting_time": { - "expected": 318.98668257451686, - "got": 306.74172321599656, - "match": false - }, - "02_operator_util": { - "expected": 99.56949574531609, - "got": 99.45155480236895, - "match": false - }, - "03_mean_nurse_waiting_time": { - "expected": 196.98753442211358, - "got": 196.98238666596234, - "match": true - }, - "04_nurse_util": { - "expected": 97.69465771729217, - "got": 97.69465771729217, - "match": true - }, - "05_callback_rate": { - "expected": 57.87545787545788, - "got": 57.8239608801956, - "match": true - } - }, - "rtol": 0.001, - "atol": 1e-06 + "agent_result": { + "01_mean_waiting_time": 306.74172321599656, + "02_operator_util": 99.45155480236895, + "03_mean_nurse_waiting_time": 196.98238666596234, + "04_nurse_util": 97.69465771729217, + "05_callback_rate": 57.8239608801956 }, - "passed": false + "passed": false, + "context": { + "trace_id": "774aa223e001e166a7d7c906779f1807", + "span_id": "784b26c1ca10d345" + } }, "example_9": { "user_input": "Test configuration: 18 operators, 14 nurses, nurse consultations 15-25 minutes, normal demand", @@ -851,88 +301,18 @@ "04_nurse_util": 97.15170780690659, "05_callback_rate": 42.12121212121212 }, - "agent_run": { - "simulation_result": { - "01_mean_waiting_time": 0.08980920202587155, - "02_operator_util": 67.09751014731434, - "03_mean_nurse_waiting_time": 19.85524137214805, - "04_nurse_util": 97.15170780690659, - "05_callback_rate": 42.12121212121212 - }, - "meta": { - "retry_count": 0, - "had_bailout": false, - "validation_history": [ - { - "attempt": 1, - "parameters": { - "n_operators": 18, - "n_nurses": 14, - "mean_iat": 0.6, - "call_low": 5.0, - "call_mode": 7.0, - "call_high": 10.0, - "callback_prob": 0.4, - "nurse_consult_low": 15.0, - "nurse_consult_high": 25.0, - "random_seed": 0, - "run_length": 1000 - }, - "validation_result": { - "is_valid": true, - "errors": [] - } - } - ], - "parameters": { - "n_operators": 18, - "n_nurses": 14, - "mean_iat": 0.6, - "call_low": 5.0, - "call_mode": 7.0, - "call_high": 10.0, - "callback_prob": 0.4, - "nurse_consult_low": 15.0, - "nurse_consult_high": 25.0, - "random_seed": 0, - "run_length": 1000 - }, - "formatted_parameters": "**Parameters used in simulation**\n\n| \ud83d\udcc8 Parameter | Value |\n|---|---|\n| n_operators | 18 |\n| n_nurses | 14 |\n| mean_iat | 0.6 |\n| call_low | 5.0 |\n| call_mode | 7.0 |\n| call_high | 10.0 |\n| callback_prob | 0.4 |\n| nurse_consult_low | 15.0 |\n| nurse_consult_high | 25.0 |\n| random_seed | 0 |\n| run_length | 1000 |\n" - } - }, - "comparison": { - "pass": true, - "details": { - "01_mean_waiting_time": { - "expected": 0.08980920202587155, - "got": 0.08980920202587155, - "match": true - }, - "02_operator_util": { - "expected": 67.09751014731434, - "got": 67.09751014731434, - "match": true - }, - "03_mean_nurse_waiting_time": { - "expected": 19.85524137214805, - "got": 19.85524137214805, - "match": true - }, - "04_nurse_util": { - "expected": 97.15170780690659, - "got": 97.15170780690659, - "match": true - }, - "05_callback_rate": { - "expected": 42.12121212121212, - "got": 42.12121212121212, - "match": true - } - }, - "rtol": 0.001, - "atol": 1e-06 + "agent_result": { + "01_mean_waiting_time": 0.08980920202587155, + "02_operator_util": 67.09751014731434, + "03_mean_nurse_waiting_time": 19.85524137214805, + "04_nurse_util": 97.15170780690659, + "05_callback_rate": 42.12121212121212 }, - 
"passed": true + "passed": true, + "context": { + "trace_id": "d8b2aedeb917ec4191b1c6a7fc467721", + "span_id": "14f5bf6c2a7008c2" + } }, "example_10": { "user_input": "Simulate 12 operators, 10 nurses, 50% callback probability, and 40% demand reduction", @@ -956,87 +336,17 @@ "04_nurse_util": 90.70926116406883, "05_callback_rate": 51.4546965918537 }, - "agent_run": { - "simulation_result": { - "01_mean_waiting_time": 1.2583822886248166, - "02_operator_util": 84.9856571728051, - "03_mean_nurse_waiting_time": 46.17149830527762, - "04_nurse_util": 97.12003253429981, - "05_callback_rate": 50.43103448275862 - }, - "meta": { - "retry_count": 0, - "had_bailout": false, - "validation_history": [ - { - "attempt": 1, - "parameters": { - "n_operators": 12, - "n_nurses": 10, - "mean_iat": 0.72, - "callback_prob": 0.5, - "call_low": 5.0, - "call_mode": 7.0, - "call_high": 10.0, - "nurse_consult_low": 10.0, - "nurse_consult_high": 20.0, - "random_seed": 0, - "run_length": 1000 - }, - "validation_result": { - "is_valid": true, - "errors": [] - } - } - ], - "parameters": { - "n_operators": 12, - "n_nurses": 10, - "mean_iat": 0.72, - "callback_prob": 0.5, - "call_low": 5.0, - "call_mode": 7.0, - "call_high": 10.0, - "nurse_consult_low": 10.0, - "nurse_consult_high": 20.0, - "random_seed": 0, - "run_length": 1000 - }, - "formatted_parameters": "**Parameters used in simulation**\n\n| \ud83d\udcca Parameter | Value |\n|---|---|\n| n_operators | 12 |\n| n_nurses | 10 |\n| mean_iat | 0.72 |\n| callback_prob | 0.5 |\n| call_low | 5.0 |\n| call_mode | 7.0 |\n| call_high | 10.0 |\n| nurse_consult_low | 10.0 |\n| nurse_consult_high | 20.0 |\n| random_seed | 0 |\n| run_length | 1000 |\n" - } - }, - "comparison": { - "pass": false, - "details": { - "01_mean_waiting_time": { - "expected": 0.38280107383630124, - "got": 1.2583822886248166, - "match": false - }, - "02_operator_util": { - "expected": 73.44752978233386, - "got": 84.9856571728051, - "match": false - }, - "03_mean_nurse_waiting_time": { - "expected": 4.147154100012408, - "got": 46.17149830527762, - "match": false - }, - "04_nurse_util": { - "expected": 90.70926116406883, - "got": 97.12003253429981, - "match": false - }, - "05_callback_rate": { - "expected": 51.4546965918537, - "got": 50.43103448275862, - "match": false - } - }, - "rtol": 0.001, - "atol": 1e-06 - }, - "passed": false + "agent_result": { + "01_mean_waiting_time": 1.2583822886248166, + "02_operator_util": 84.9856571728051, + "03_mean_nurse_waiting_time": 46.17149830527762, + "04_nurse_util": 97.12003253429981, + "05_callback_rate": 50.43103448275862 + }, + "passed": false, + "context": { + "trace_id": "38fc463cffe7950265a08d9e4f57c852", + "span_id": "910e28879b5d85b8" + } } } \ No newline at end of file diff --git a/run_evals.py b/run_evals.py index 361c137..a262ebf 100644 --- a/run_evals.py +++ b/run_evals.py @@ -1,157 +1,232 @@ import asyncio import json import math -from typing import Dict, Any, Tuple, Optional +import pandas as pd +import numpy as np +from typing import Dict, Any, Optional -# Import the agent builder and its dependencies from your agent module -# Assumes agent_self_reflection.py exposes `build_graph`, `AgentState`, and a factory for the LLM. 
from agent_self_reflection import build_graph, AgentState from langchain_ollama import OllamaLLM +from mcpsim.tracing import init_tracing +import phoenix as px +from phoenix.trace import SpanEvaluations +import argparse +import itertools -def floats_close(a: Any, b: Any, rtol: float = 1e-3, atol: float = 1e-6) -> bool: - """ - Compare numbers with tolerance, pass-through for non-numeric equality. - Returns True if both are NaN, or within tolerances for floats, or exactly equal otherwise. - """ - # Handle None - if a is None or b is None: - return a is None and b is None +# Import OpenTelemetry trace API to capture span context +from opentelemetry import trace + +# Initialize a global tracer from your helper +tracer_provider = init_tracing(project_name="mcp-agent-evaluation", endpoint="http://localhost:6006") +tracer = tracer_provider.get_tracer("eval-runner-tracer") - # Try numeric compare + +# ----------------- Comparison helpers (unchanged) ----------------- +def floats_close(a: Any, b: Any, rtol: float = 1e-3, atol: float = 1e-6) -> bool: + if a is None or b is None: return a is None and b is None try: - fa = float(a) - fb = float(b) - # Handle NaNs - if math.isnan(fa) and math.isnan(fb): - return True + fa, fb = float(a), float(b) + if math.isnan(fa) and math.isnan(fb): return True return math.isclose(fa, fb, rel_tol=rtol, abs_tol=atol) - except (TypeError, ValueError): - # Non-numeric: exact equality - return a == b - + except (TypeError, ValueError): return a == b def compare_results( got: Dict[str, Any], expected: Dict[str, Any], rtol: float = 1e-3, - atol: float = 1e-6, -) -> Dict[str, Any]: + atol: float = 1e-6 +) -> bool: + """Compare two dictionaries of numerical simulation results. + + This function provides a robust way to check if two dictionaries, + representing simulation outputs, are numerically equivalent within a + specified tolerance. It enforces that all metrics must be numeric + and that both dictionaries must have the exact same set of keys. + + Parameters + ---------- + got : dict + The dictionary of actual results obtained from a simulation run. + All values are expected to be numeric (int or float). + expected : dict + The dictionary of expected results to compare against. + All values are expected to be numeric (int or float). + rtol : float, optional + The relative tolerance parameter for `numpy.allclose`. + Default is 1e-3. + atol : float, optional + The absolute tolerance parameter for `numpy.allclose`. + Default is 1e-6. + + Returns + ------- + bool + True if the dictionaries are a match, False otherwise. A match + requires that: + 1. Both dictionaries have the identical set of keys. + 2. All values in both dictionaries are numeric. + 3. All corresponding numeric values are close, as determined by + `numpy.allclose` with the given tolerances. + + Examples + -------- + >>> got = {'metric_a': 1.0001, 'metric_b': 200.0} + >>> expected = {'metric_a': 1.0, 'metric_b': 200.5} + >>> compare_results(got, expected, rtol=1e-2) + True + + >>> got = {'metric_a': 1.01, 'metric_b': 200.0} + >>> expected = {'metric_a': 1.0, 'metric_b': 200.0} + >>> compare_results(got, expected, rtol=1e-3) + False + + >>> got = {'metric_a': 1.0, 'metric_b': 'fail'} + >>> expected = {'metric_a': 1.0, 'metric_b': 2.0} + >>> compare_results(got, expected) + False + + >>> got = {'metric_a': 1.0} + >>> expected = {'metric_a': 1.0, 'metric_b': 2.0} + >>> compare_results(got, expected) + False """ - Compare simulation outputs to expected_results with numeric tolerance. 
- Returns a dict with per-key comparison, diffs, and overall pass flag. + if expected is None or got is None: + return False + + s_got = pd.Series(got) + s_expected = pd.Series(expected) + + # 1. Check for structural differences (different keys). + if set(s_got.index) != set(s_expected.index): + return False + + # If both are empty but have same (no) keys, they match. + # if s_got.empty: + # return True + + # 2. Verify that ALL values in BOTH series are numeric. + # pd.api.types.is_number is a robust way to check for int/float. + if not (s_got.apply(pd.api.types.is_number).all() and + s_expected.apply(pd.api.types.is_number).all()): + return False + + # 3. Align and compare using NumPy's tolerance-based function. + # We already checked for key equality, so we can align `expected` to `got`. + s_expected_aligned = s_expected.loc[s_got.index] + + # `np.allclose` is the gold standard for comparing arrays of floats. + return np.allclose( + s_got.values, + s_expected_aligned.values, + rtol=rtol, + atol=atol, + equal_nan=True # Considers two NaN values to be equal. + ) + +# ----------------- Agent run helpers (unchanged) ----------------- +async def run_agent_once(compiled_graph, user_input: str, llm: OllamaLLM) -> AgentState: + state_in: AgentState = {"user_input": user_input, "retry_count": 0, "validation_history": []} + return await compiled_graph.ainvoke(state_in) + +def extract_sim_result(state: AgentState) -> Optional[Dict[str, Any]]: + return state.get("simulation_result") + +# ---------------- UPDATED Bulk Ingest Function ---------------- +def bulk_ingest_to_phoenix(json_path: str, eval_name: str = "Simulation Agent Eval"): """ - keys = sorted(set(got.keys()) | set(expected.keys())) - per_key = {} - all_pass = True - - for k in keys: - g = got.get(k, None) - e = expected.get(k, None) - ok = floats_close(g, e, rtol=rtol, atol=atol) - if not ok: - all_pass = False - per_key[k] = { - "expected": e, - "got": g, - "match": ok, - } - - return { - "pass": all_pass, - "details": per_key, - "rtol": rtol, - "atol": atol, - } - - -async def run_agent_once( - compiled_graph, - user_input: str, - llm: OllamaLLM, - max_retries: int = 4, -) -> Dict[str, Any]: + Loads an enriched evals.json file and bulk-ingests into Phoenix, + now including score, label, and explanation columns. """ - Runs the agent end-to-end for a single natural-language user_input. - Returns the final state, including simulation_result or error. - """ - state_in: AgentState = { - "user_input": user_input, - "retry_count": 0, - "validation_history": [], - } - final_state: AgentState = await compiled_graph.ainvoke(state_in) - return final_state - + with open(json_path, "r") as f: + evals = json.load(f) -def extract_agent_simulation_result(final_state: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]], Dict[str, Any]]: + eval_records = [] + for ex_name, case in evals.items(): + context = case.get("context") + if not context or "span_id" not in context or "trace_id" not in context: + print(f"⚠️ Skipping '{ex_name}': missing trace/span context in {json_path}") + continue + + is_passed = bool(case.get("passed")) + + # **FIX:** Add all three required columns: score, label, and explanation. + eval_records.append({ + "context.trace_id": context["trace_id"], + "context.span_id": context["span_id"], + "example_id": ex_name, + "score": 1 if is_passed else 0, + "label": "Pass" if is_passed else "Fail", + "explanation": "Agent result matched expected values within tolerance." 
if is_passed + else "Agent result did not match expected values.", + }) + + if not eval_records: + print("No valid records found to ingest. Did you run the agent first to generate evals.json?") + return + + eval_df = pd.DataFrame(eval_records) + eval_df = eval_df.set_index("context.span_id") + + client = px.Client() + client.log_evaluations(SpanEvaluations(eval_name=eval_name, dataframe=eval_df)) + print(f"[✓] Pushed {len(eval_df)} eval rows to Phoenix under '{eval_name}'") + +# ---------------- UPDATED Main eval runner ---------------- +async def run_all_and_save(model_name: str = "gemma3:27b", limit: int = None): """ - Extract the simulation_result if present and return a compact agent_run summary - useful for evaluation logging. + Runs the full evaluation pipeline and saves an enriched evals.json + that now includes the necessary trace/span context for Phoenix. """ - sim_res = final_state.get("simulation_result") - agent_run = { - "retry_count": final_state.get("retry_count", 0), - "had_bailout": final_state.get("error") == "Maximum retries exceeded during parameter reflection.", - "validation_history": final_state.get("validation_history", []), - "parameters": final_state.get("parameters", {}), - "formatted_parameters": final_state.get("formatted_parameters", None), - } - return sim_res, agent_run - - -async def main( - input_json_path: str = "evals/evals.json", - output_json_path: str = "evals/evals_output.json", - model_name: str = "gemma3:27b", - llm_base_url: str = "http://localhost:11434", - rtol: float = 1e-3, - atol: float = 1e-6, -) -> None: - # 1) Load evals - with open(input_json_path, "r") as f: + with open("evals/evals.json", "r") as f: evals = json.load(f) - # 2) Build LLM and graph once - llm = OllamaLLM(model=model_name, base_url=llm_base_url) + llm = OllamaLLM(model=model_name, base_url="http://localhost:11434") compiled_graph = build_graph(llm) - # 3) Run each eval in a loop via the agent - for key, case in evals.items(): - user_input = case.get("user_input", "") - expected = case.get("parameters", {}).get("expected_results") or case.get("expected_results") - # In your earlier structure, expected_results is at the top level of each example after enrichment. - # If not present yet, this will remain None and comparison will be skipped. - - final_state = await run_agent_once(compiled_graph, user_input, llm) - sim_res, agent_run = extract_agent_simulation_result(final_state) - - # Record agent run outputs - case["agent_run"] = { - "simulation_result": sim_res, - "meta": agent_run, - } - - # 4) Compare to expected_results if available - if expected is not None and sim_res is not None: - cmp = compare_results(sim_res, expected, rtol=rtol, atol=atol) - case["comparison"] = cmp - case["passed"] = bool(cmp["pass"]) - else: - case["comparison"] = { - "note": "Either expected_results or agent simulation_result missing; comparison skipped." 
- } - case["passed"] = False if expected is not None else None - - # 5) Save enriched evals to evals.json - with open(output_json_path, "w") as f: + # Use islice to limit the loop if a limit is provided + items_to_process = itertools.islice(evals.items(), limit) if limit else evals.items() + + for ex_name, case in items_to_process: + # **FIX:** Create a parent span for each eval run to capture its context + with tracer.start_as_current_span(f"eval_run: {ex_name}") as span: + # Capture the context from the currently active span + span_context = span.get_span_context() + trace_id = f"{span_context.trace_id:032x}" + span_id = f"{span_context.span_id:016x}" + + # Run the agent pipeline + final_state = await run_agent_once(compiled_graph, case["user_input"], llm) + + # Process results + got = extract_sim_result(final_state) + passed = compare_results(got, case.get("expected_results")) + + # Store results and the new context back into the dictionary + case["agent_result"] = got + case["passed"] = passed + case["context"] = {"trace_id": trace_id, "span_id": span_id} + + # Optionally add attributes to the span + span.set_attribute("eval.passed", passed) + span.set_attribute("eval.example_id", ex_name) + + + with open("evals/evals_output.json", "w") as f: json.dump(evals, f, indent=2) - print(f"Wrote evaluation results to {output_json_path}") - + print("[✓] Saved enriched evals.json with trace/span context.") if __name__ == "__main__": - # For CLI usage: - # python eval_runner.py - # Optional: parameterize via env vars or argparse if desired. - asyncio.run(main()) + + parser = argparse.ArgumentParser(description="Run evals and/or bulk-ingest into Phoenix") + parser.add_argument("--skip-run", action="store_true", help="Skip agent runs and just bulk-ingest existing evals.json") + parser.add_argument("--eval-name", default="Simulation Agent Eval") + parser.add_argument("--limit", type=int, default=None, help="Limit the number of evaluations to run for debugging.") + args = parser.parse_args() + + if args.skip_run: + bulk_ingest_to_phoenix("evals/evals_output.json", eval_name=args.eval_name) + else: + asyncio.run(run_all_and_save(model_name="gemma3:27b", limit=args.limit)) + bulk_ingest_to_phoenix("evals/evals_output.json", eval_name=args.eval_name) From 56a08f8ca1fc2a3b36ff1dd6c66d42a3d038856b Mon Sep 17 00:00:00 2001 From: TomMonks Date: Tue, 12 Aug 2025 18:03:35 +0100 Subject: [PATCH 21/23] feat(callcentre): improved schema desc + improved json prompt --- evals/evals_output.json | 86 +++++++++---------- .../callcentre/resources/schema.json | 6 +- mcpsim/resources/parameter_prompt2.txt | 35 ++++++++ mcpsim/resources/parameter_prompt3.txt | 33 +++++++ mcpsim/server.py | 3 +- run_evals.py | 16 ++-- 6 files changed, 122 insertions(+), 57 deletions(-) create mode 100644 mcpsim/resources/parameter_prompt2.txt create mode 100644 mcpsim/resources/parameter_prompt3.txt diff --git a/evals/evals_output.json b/evals/evals_output.json index 063aeb5..ed2acdb 100644 --- a/evals/evals_output.json +++ b/evals/evals_output.json @@ -30,8 +30,8 @@ }, "passed": true, "context": { - "trace_id": "f5261ab566c4e5bebfbea7cc7f28b71c", - "span_id": "fd13f4f202d0d7a5" + "trace_id": "0934a40e20fdfe070060c5388c844116", + "span_id": "31ed66eb214272b6" } }, "example_2": { @@ -59,14 +59,14 @@ "agent_result": { "01_mean_waiting_time": 0.0, "02_operator_util": 46.452122409679156, - "03_mean_nurse_waiting_time": 43.94073145744734, - "04_nurse_util": 97.41291238872327, + "03_mean_nurse_waiting_time": 0.0041045663539504606, + 
"04_nurse_util": 51.20346053052576, "05_callback_rate": 42.12121212121212 }, - "passed": false, + "passed": true, "context": { - "trace_id": "c93069f691d1a1edbca3886db55bc2cc", - "span_id": "1c4b8ed595ac9a41" + "trace_id": "77ca44b81bf36c073755141a05677ead", + "span_id": "caec062bc22d0352" } }, "example_3": { @@ -92,16 +92,16 @@ "05_callback_rate": 42.48847926267281 }, "agent_result": { - "01_mean_waiting_time": 230.08648598465186, + "01_mean_waiting_time": 242.69198606467467, "02_operator_util": 99.35481108027054, - "03_mean_nurse_waiting_time": 78.94871019130441, + "03_mean_nurse_waiting_time": 78.94431797274639, "04_nurse_util": 97.8390040780754, "05_callback_rate": 42.48847926267281 }, - "passed": false, + "passed": true, "context": { - "trace_id": "0f08ea84447ef6e9bdb05a17920807db", - "span_id": "b87894c014f8cf28" + "trace_id": "5f9634ef46fc071a31c5cf1110340b65", + "span_id": "e753ea4ba67d52a2" } }, "example_4": { @@ -135,8 +135,8 @@ }, "passed": true, "context": { - "trace_id": "3a7d4f511089509f19d615d5b04a725d", - "span_id": "926e778adc644f4c" + "trace_id": "00c51788956b474f796f4f43e50f83fe", + "span_id": "53ded666630ded84" } }, "example_5": { @@ -162,16 +162,16 @@ "05_callback_rate": 33.012583271650634 }, "agent_result": { - "01_mean_waiting_time": 46.3970260873325, - "02_operator_util": 98.6111946659619, - "03_mean_nurse_waiting_time": 2.290083053642536, - "04_nurse_util": 82.07127437115734, - "05_callback_rate": 33.11061618411284 + "01_mean_waiting_time": 62.26336943615235, + "02_operator_util": 98.91827940297605, + "03_mean_nurse_waiting_time": 2.3551925550777337, + "04_nurse_util": 82.46774067208518, + "05_callback_rate": 33.012583271650634 }, - "passed": false, + "passed": true, "context": { - "trace_id": "3dd0f874abcf87f832d31fb352632636", - "span_id": "ad80c628897ebbbe" + "trace_id": "bf05c8a6a1ec39d65c9968fb83601548", + "span_id": "948c6aad19a4740b" } }, "example_6": { @@ -205,8 +205,8 @@ }, "passed": true, "context": { - "trace_id": "7665b2b29408259e148fccd619ac80e6", - "span_id": "06181825b8538a83" + "trace_id": "0ab69e3e17a7e78309b5fe6167ea5ca7", + "span_id": "ea7b6b1569d943e2" } }, "example_7": { @@ -240,8 +240,8 @@ }, "passed": true, "context": { - "trace_id": "351c928786d9364f8bf28983dd16c0a8", - "span_id": "6bffbbff22841ac8" + "trace_id": "0f483bd78e24ae73065112e5d7a12b5a", + "span_id": "e1e58275b3473a9c" } }, "example_8": { @@ -267,16 +267,16 @@ "05_callback_rate": 57.87545787545788 }, "agent_result": { - "01_mean_waiting_time": 306.74172321599656, - "02_operator_util": 99.45155480236895, - "03_mean_nurse_waiting_time": 196.98238666596234, + "01_mean_waiting_time": 318.98668257451686, + "02_operator_util": 99.56949574531609, + "03_mean_nurse_waiting_time": 196.98753442211358, "04_nurse_util": 97.69465771729217, - "05_callback_rate": 57.8239608801956 + "05_callback_rate": 57.87545787545788 }, - "passed": false, + "passed": true, "context": { - "trace_id": "774aa223e001e166a7d7c906779f1807", - "span_id": "784b26c1ca10d345" + "trace_id": "df871b1e2f654ad8de10bbd51fbdaefe", + "span_id": "52f47e95d1fb0985" } }, "example_9": { @@ -310,8 +310,8 @@ }, "passed": true, "context": { - "trace_id": "d8b2aedeb917ec4191b1c6a7fc467721", - "span_id": "14f5bf6c2a7008c2" + "trace_id": "6d1203e63e2c1ffde945f0e20d33307f", + "span_id": "5b1a97dca34eaa49" } }, "example_10": { @@ -337,16 +337,16 @@ "05_callback_rate": 51.4546965918537 }, "agent_result": { - "01_mean_waiting_time": 1.2583822886248166, - "02_operator_util": 84.9856571728051, - "03_mean_nurse_waiting_time": 46.17149830527762, 
- "04_nurse_util": 97.12003253429981, - "05_callback_rate": 50.43103448275862 + "01_mean_waiting_time": 0.38280107383630124, + "02_operator_util": 73.44752978233386, + "03_mean_nurse_waiting_time": 4.147154100012408, + "04_nurse_util": 90.70926116406883, + "05_callback_rate": 51.4546965918537 }, - "passed": false, + "passed": true, "context": { - "trace_id": "38fc463cffe7950265a08d9e4f57c852", - "span_id": "910e28879b5d85b8" + "trace_id": "4bce453daf28c8f5912d2c76c14d173d", + "span_id": "d1ad3e87babd3758" } } } \ No newline at end of file diff --git a/mcpsim/example_models/callcentre/resources/schema.json b/mcpsim/example_models/callcentre/resources/schema.json index 66d699d..6f8cba6 100644 --- a/mcpsim/example_models/callcentre/resources/schema.json +++ b/mcpsim/example_models/callcentre/resources/schema.json @@ -4,21 +4,21 @@ "default": 13, "minimum": 1, "maximum": 100, - "description": "Number of call operators (1-100)" + "description": "Number of call operator staff on duty (1-100)" }, "n_nurses": { "type": "int", "default": 10, "minimum": 1, "maximum": 50, - "description": "Number of nurse callbacks (1-50)" + "description": "Number of nurse staff on duty (1-50)" }, "mean_iat": { "type": "float", "default": 0.6, "minimum": 0.1, "maximum": 10.0, - "description": "Mean time between arrivals (minutes) or inter-arrival time. 0.1 = 6 calls/min, 10.0 = 1 call/10min. An increase in demand = a reduction in the inter-arrival time. A reduction in demand = an increase in inter-arrival time" + "description": "Mean inter-arrival time in minutes (average time between arrivals). Baseline = 0.6 min.\n\nDemand changes:\n1. Percentage change form: If demand increases by fraction X, set mean_iat = baseline × (1 - X). If demand decreases by fraction X, set mean_iat = baseline × (1 + X).\n2. Multiplicative change form: If arrivals are multiplied by factor M (e.g., 'half the calls' = M=0.5, 'double the calls' = M=2), set mean_iat = baseline × (1 / M)." }, "call_low": { "type": "float", diff --git a/mcpsim/resources/parameter_prompt2.txt b/mcpsim/resources/parameter_prompt2.txt new file mode 100644 index 0000000..4ae6a20 --- /dev/null +++ b/mcpsim/resources/parameter_prompt2.txt @@ -0,0 +1,35 @@ +You are a discrete-event simulation assistant. +Your job is to convert a plain-English request into computed JSON parameter values for a simulation. + +Here is the simulation configuration schema: +{schema} + +User request: {user_input} + +{validation_feedback} + +INSTRUCTIONS: +1. Use the schema default for any parameter not supplied in the request. +2. Apply any percentage/fractional demand change directly to `mean_iat` using: + - Increase in demand (fraction X): `mean_iat = mean_iat × (1 - X)` + - Decrease in demand (fraction X): `mean_iat = mean_iat × (1 + X)` +3. Perform all calculations mentally — never output an expression, only the Final computed number. +4. Return all parameters present in the schema, filled with calculated or default values. +5. 
Final output must be a valid JSON object with: + - Keys exactly as in the schema + - Numeric values only (no strings, no units) + +Example output format +{{ + "n_operators": 12, + "n_nurses": 10, + "mean_iat": 0.81, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.4, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, + "random_seed": 0, + "run_length": 1000 +}} \ No newline at end of file diff --git a/mcpsim/resources/parameter_prompt3.txt b/mcpsim/resources/parameter_prompt3.txt new file mode 100644 index 0000000..4ce8404 --- /dev/null +++ b/mcpsim/resources/parameter_prompt3.txt @@ -0,0 +1,33 @@ +You are a discrete-event simulation assistant. +Your job is to convert a plain-English request into computed JSON parameter values for a simulation. + +Here is the simulation configuration schema: +{schema} + +User request: {user_input} + +{validation_feedback} + +INSTRUCTIONS: +1. Use the schema default for any parameter not supplied in the request. +2. Adjust `mean_iat` according to the rules in its description: + - If change is given as a percentage/fraction of demand, scale baseline directly by (1 − X) for increases or (1 + X) for decreases. + - If change is given as a multiple of arrivals, first set arrival multiplier M, then mean_iat = baseline × (1 / M). +3. Perform all calculations mentally — never output an expression, only the Final computed number. +4. Return all parameters present in the schema, filled with calculated or default values. +5. Keys must be exactly as in the schema, numbers only, no strings or units. + +Example output format +{{ + "n_operators": 12, + "n_nurses": 10, + "mean_iat": 0.81, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.4, + "nurse_consult_low": 10.0, + "nurse_consult_high": 20.0, + "random_seed": 0, + "run_length": 1000 +}} \ No newline at end of file diff --git a/mcpsim/server.py b/mcpsim/server.py index ee0f1f9..27ffffd 100644 --- a/mcpsim/server.py +++ b/mcpsim/server.py @@ -6,7 +6,8 @@ from mcpsim.protocol import SimulationModelAdapter -PARAMETER_PROMPT_PATH = "resources/parameter_prompt.txt" +# modified for experimentation... +PARAMETER_PROMPT_PATH = "resources/parameter_prompt2.txt" from mcpsim.tracing import init_tracing # your helper module from previous discussion diff --git a/run_evals.py b/run_evals.py index a262ebf..6470e01 100644 --- a/run_evals.py +++ b/run_evals.py @@ -17,12 +17,12 @@ # Import OpenTelemetry trace API to capture span context from opentelemetry import trace -# Initialize a global tracer from your helper +# Initialize a global tracer tracer_provider = init_tracing(project_name="mcp-agent-evaluation", endpoint="http://localhost:6006") tracer = tracer_provider.get_tracer("eval-runner-tracer") -# ----------------- Comparison helpers (unchanged) ----------------- +# ----------------- Comparison helpers ----------------- def floats_close(a: Any, b: Any, rtol: float = 1e-3, atol: float = 1e-6) -> bool: if a is None or b is None: return a is None and b is None try: @@ -100,10 +100,6 @@ def compare_results( # 1. Check for structural differences (different keys). if set(s_got.index) != set(s_expected.index): return False - - # If both are empty but have same (no) keys, they match. - # if s_got.empty: - # return True # 2. Verify that ALL values in BOTH series are numeric. # pd.api.types.is_number is a robust way to check for int/float. @@ -124,7 +120,7 @@ def compare_results( equal_nan=True # Considers two NaN values to be equal. 
) -# ----------------- Agent run helpers (unchanged) ----------------- +# ----------------- Agent run helpers ---------------- async def run_agent_once(compiled_graph, user_input: str, llm: OllamaLLM) -> AgentState: state_in: AgentState = {"user_input": user_input, "retry_count": 0, "validation_history": []} return await compiled_graph.ainvoke(state_in) @@ -132,7 +128,7 @@ async def run_agent_once(compiled_graph, user_input: str, llm: OllamaLLM) -> Age def extract_sim_result(state: AgentState) -> Optional[Dict[str, Any]]: return state.get("simulation_result") -# ---------------- UPDATED Bulk Ingest Function ---------------- +# ---------------- Bulk Ingest Function for Phoenix ---------------- def bulk_ingest_to_phoenix(json_path: str, eval_name: str = "Simulation Agent Eval"): """ Loads an enriched evals.json file and bulk-ingests into Phoenix, @@ -172,7 +168,7 @@ def bulk_ingest_to_phoenix(json_path: str, eval_name: str = "Simulation Agent Ev client.log_evaluations(SpanEvaluations(eval_name=eval_name, dataframe=eval_df)) print(f"[✓] Pushed {len(eval_df)} eval rows to Phoenix under '{eval_name}'") -# ---------------- UPDATED Main eval runner ---------------- +# ---------------- Main eval runner ---------------- async def run_all_and_save(model_name: str = "gemma3:27b", limit: int = None): """ Runs the full evaluation pipeline and saves an enriched evals.json @@ -228,5 +224,5 @@ async def run_all_and_save(model_name: str = "gemma3:27b", limit: int = None): if args.skip_run: bulk_ingest_to_phoenix("evals/evals_output.json", eval_name=args.eval_name) else: - asyncio.run(run_all_and_save(model_name="gemma3:27b", limit=args.limit)) + asyncio.run(run_all_and_save(model_name="gpt-oss:20b", limit=args.limit)) bulk_ingest_to_phoenix("evals/evals_output.json", eval_name=args.eval_name) From b832d9da214e85b38808ba59ebe5d41440134993 Mon Sep 17 00:00:00 2001 From: TomMonks Date: Wed, 13 Aug 2025 16:14:59 +0100 Subject: [PATCH 22/23] feat(evals): create script now uploads to phoenix as dataset --- create_evals.py | 46 ++++++++++++++++++ evals/create_evals.py | 22 --------- evals/evals_output.json | 104 ++-------------------------------------- 3 files changed, 50 insertions(+), 122 deletions(-) create mode 100644 create_evals.py delete mode 100644 evals/create_evals.py diff --git a/create_evals.py b/create_evals.py new file mode 100644 index 0000000..07a445f --- /dev/null +++ b/create_evals.py @@ -0,0 +1,46 @@ +""" +Script to add expected simulation results to the evals dataset +single rep at this stage. 
+""" +import pandas as pd +import phoenix as px + +import json +from datetime import datetime + +# sim-agent imports +from mcpsim.example_models.callcentre.call_centre_model import run_simulation_from_dict +from mcpsim.tracing import init_tracing + +# Load test cases from evals_input.json +with open("evals/evals_input.json", "r") as f: + scenarios = json.load(f) + +# Run each scenario once and attach results +for key, obj in scenarios.items(): + params = obj["parameters"] + results = run_simulation_from_dict(params) + obj["expected_results"] = results + +# Save updated scenarios with results to evals.json +with open("evals/evals.json", "w") as f: + json.dump(scenarios, f, indent=2) + +# ------------- upload evals to phoenix -------------------------- +# Initialize a global tracer +tracer_provider = init_tracing(project_name="mcp-agent-evaluation", endpoint="http://localhost:6006") + +# convert to dataframe and transpose +df_evals = pd.DataFrame(scenarios).T + +# create a dataset consisting of input questions and expected outputs +now = datetime.now().strftime("%Y-%m-%d %H:%M:%S") +phoenix_client = px.Client() +dataset = phoenix_client.upload_dataset( + dataframe=df_evals, + dataset_name=f"initial_evals-{now}", + input_keys=["user_input"], + output_keys=["parameters", "expected_results"] +) + +print("Simulation results saved to evals.json") diff --git a/evals/create_evals.py b/evals/create_evals.py deleted file mode 100644 index e469a4e..0000000 --- a/evals/create_evals.py +++ /dev/null @@ -1,22 +0,0 @@ -""" -Script to add expected simulation results to the evals dataset -single rep at this stage. -""" -import json -from mcpsim.example_models.callcentre.call_centre_model import run_simulation_from_dict - -# Load test cases from evals_input.json -with open("evals/evals_input.json", "r") as f: - scenarios = json.load(f) - -# Run each scenario once and attach results -for key, obj in scenarios.items(): - params = obj["parameters"] - results = run_simulation_from_dict(params) - obj["expected_results"] = results - -# Save updated scenarios with results to evals.json -with open("evals/evals.json", "w") as f: - json.dump(scenarios, f, indent=2) - -print("Simulation results saved to evals.json") diff --git a/evals/evals_output.json b/evals/evals_output.json index ed2acdb..71caedf 100644 --- a/evals/evals_output.json +++ b/evals/evals_output.json @@ -30,8 +30,8 @@ }, "passed": true, "context": { - "trace_id": "0934a40e20fdfe070060c5388c844116", - "span_id": "31ed66eb214272b6" + "trace_id": "05e9185a30490fa463f00ea5b0e65108", + "span_id": "39cfc1324c7aee19" } }, "example_2": { @@ -65,8 +65,8 @@ }, "passed": true, "context": { - "trace_id": "77ca44b81bf36c073755141a05677ead", - "span_id": "caec062bc22d0352" + "trace_id": "95fc97f63b485e37d20ada5d01b2acdb", + "span_id": "f66b8c36689fbdb2" } }, "example_3": { @@ -90,18 +90,6 @@ "03_mean_nurse_waiting_time": 78.94431797274639, "04_nurse_util": 97.8390040780754, "05_callback_rate": 42.48847926267281 - }, - "agent_result": { - "01_mean_waiting_time": 242.69198606467467, - "02_operator_util": 99.35481108027054, - "03_mean_nurse_waiting_time": 78.94431797274639, - "04_nurse_util": 97.8390040780754, - "05_callback_rate": 42.48847926267281 - }, - "passed": true, - "context": { - "trace_id": "5f9634ef46fc071a31c5cf1110340b65", - "span_id": "e753ea4ba67d52a2" } }, "example_4": { @@ -125,18 +113,6 @@ "03_mean_nurse_waiting_time": 0.0, "04_nurse_util": 35.17900361818962, "05_callback_rate": 42.17289719626168 - }, - "agent_result": { - "01_mean_waiting_time": 0.0, 
- "02_operator_util": 31.21492694742085, - "03_mean_nurse_waiting_time": 0.0, - "04_nurse_util": 35.17900361818962, - "05_callback_rate": 42.17289719626168 - }, - "passed": true, - "context": { - "trace_id": "00c51788956b474f796f4f43e50f83fe", - "span_id": "53ded666630ded84" } }, "example_5": { @@ -160,18 +136,6 @@ "03_mean_nurse_waiting_time": 2.3551925550777337, "04_nurse_util": 82.46774067208518, "05_callback_rate": 33.012583271650634 - }, - "agent_result": { - "01_mean_waiting_time": 62.26336943615235, - "02_operator_util": 98.91827940297605, - "03_mean_nurse_waiting_time": 2.3551925550777337, - "04_nurse_util": 82.46774067208518, - "05_callback_rate": 33.012583271650634 - }, - "passed": true, - "context": { - "trace_id": "bf05c8a6a1ec39d65c9968fb83601548", - "span_id": "948c6aad19a4740b" } }, "example_6": { @@ -195,18 +159,6 @@ "03_mean_nurse_waiting_time": 0.4505143843086236, "04_nurse_util": 83.59706413413956, "05_callback_rate": 41.69621749408983 - }, - "agent_result": { - "01_mean_waiting_time": 161.26826119772568, - "02_operator_util": 99.27336565709264, - "03_mean_nurse_waiting_time": 0.4505143843086236, - "04_nurse_util": 83.59706413413956, - "05_callback_rate": 41.69621749408983 - }, - "passed": true, - "context": { - "trace_id": "0ab69e3e17a7e78309b5fe6167ea5ca7", - "span_id": "ea7b6b1569d943e2" } }, "example_7": { @@ -230,18 +182,6 @@ "03_mean_nurse_waiting_time": 2.663802537251699, "04_nurse_util": 86.43094276820304, "05_callback_rate": 41.684147437818396 - }, - "agent_result": { - "01_mean_waiting_time": 0.07192106569784922, - "02_operator_util": 66.74240386374329, - "03_mean_nurse_waiting_time": 2.663802537251699, - "04_nurse_util": 86.43094276820304, - "05_callback_rate": 41.684147437818396 - }, - "passed": true, - "context": { - "trace_id": "0f483bd78e24ae73065112e5d7a12b5a", - "span_id": "e1e58275b3473a9c" } }, "example_8": { @@ -265,18 +205,6 @@ "03_mean_nurse_waiting_time": 196.98753442211358, "04_nurse_util": 97.69465771729217, "05_callback_rate": 57.87545787545788 - }, - "agent_result": { - "01_mean_waiting_time": 318.98668257451686, - "02_operator_util": 99.56949574531609, - "03_mean_nurse_waiting_time": 196.98753442211358, - "04_nurse_util": 97.69465771729217, - "05_callback_rate": 57.87545787545788 - }, - "passed": true, - "context": { - "trace_id": "df871b1e2f654ad8de10bbd51fbdaefe", - "span_id": "52f47e95d1fb0985" } }, "example_9": { @@ -300,18 +228,6 @@ "03_mean_nurse_waiting_time": 19.85524137214805, "04_nurse_util": 97.15170780690659, "05_callback_rate": 42.12121212121212 - }, - "agent_result": { - "01_mean_waiting_time": 0.08980920202587155, - "02_operator_util": 67.09751014731434, - "03_mean_nurse_waiting_time": 19.85524137214805, - "04_nurse_util": 97.15170780690659, - "05_callback_rate": 42.12121212121212 - }, - "passed": true, - "context": { - "trace_id": "6d1203e63e2c1ffde945f0e20d33307f", - "span_id": "5b1a97dca34eaa49" } }, "example_10": { @@ -335,18 +251,6 @@ "03_mean_nurse_waiting_time": 4.147154100012408, "04_nurse_util": 90.70926116406883, "05_callback_rate": 51.4546965918537 - }, - "agent_result": { - "01_mean_waiting_time": 0.38280107383630124, - "02_operator_util": 73.44752978233386, - "03_mean_nurse_waiting_time": 4.147154100012408, - "04_nurse_util": 90.70926116406883, - "05_callback_rate": 51.4546965918537 - }, - "passed": true, - "context": { - "trace_id": "4bce453daf28c8f5912d2c76c14d173d", - "span_id": "d1ad3e87babd3758" } } } \ No newline at end of file From 6476f765b5c71c9fea4a5cdb8ad120d16d2d38ca Mon Sep 17 00:00:00 2001 From: 
TomMonks Date: Tue, 30 Sep 2025 16:59:31 +0100 Subject: [PATCH 23/23] feat(app): draft of a basic streamlit app --- app.py | 501 ++++++++++++++++++++++++++++++++++++++++ app2.py | 486 ++++++++++++++++++++++++++++++++++++++ evals/evals_output.json | 8 +- 3 files changed, 991 insertions(+), 4 deletions(-) create mode 100644 app.py create mode 100644 app2.py diff --git a/app.py b/app.py new file mode 100644 index 0000000..c44cdb4 --- /dev/null +++ b/app.py @@ -0,0 +1,501 @@ + +import streamlit as st +import asyncio +import json +import subprocess +import time +import threading +import queue +from io import StringIO +import sys +from contextlib import redirect_stdout, redirect_stderr +import os +from typing import Optional, Dict, Any +import pandas as pd + +# Import the agents (assuming they're in the same directory or on path) +try: + # These would be your actual imports + from langchain_ollama import OllamaLLM + from fastmcp import Client + from agent_planning_workflow import main as planning_main + from agent_self_reflection import main as reflection_main + import agent_planning_workflow + import agent_self_reflection +except ImportError as e: + st.error(f"Import error: {e}. Please ensure all dependencies are installed and agents are accessible.") + +# Page configuration +st.set_page_config( + page_title="DES Agent Interface", + page_icon="🤖", + layout="wide", + initial_sidebar_state="expanded" +) + +# Custom CSS for better styling +st.markdown(""" + +""", unsafe_allow_html=True) + +# Initialize session state +if 'execution_logs' not in st.session_state: + st.session_state.execution_logs = [] +if 'simulation_results' not in st.session_state: + st.session_state.simulation_results = None +if 'mcp_server_status' not in st.session_state: + st.session_state.mcp_server_status = "Unknown" + + +import requests + +def get_ollama_models(): + """Get list of models from Ollama server""" + try: + response = requests.get("http://localhost:11434/api/tags") + if response.status_code == 200: + data = response.json() + models = [model["name"] for model in data["models"]] + return models + return [] + except Exception as e: + print(f"Error: {e}") + return [] + + + +# Header +st.markdown('
<h1>🤖 DES Agent Interface</h1>
', unsafe_allow_html=True)
+
+# Sidebar configuration
+st.sidebar.header("🔧 Configuration")
+
+# Agent selection
+agent_type = st.sidebar.selectbox(
+    "Select Agent Type",
+    ["Dynamic Planner", "Self-Reflective"],
+    help="Choose between the two agent architectures"
+)
+
+# Model configuration based on agent type
+st.sidebar.subheader("🧠 LLM Configuration")
+
+# get the model list from Ollama and look up the default indices
+available_models = get_ollama_models()
+try:
+    gemma_index = available_models.index("gemma3:27b")
+except ValueError:
+    gemma_index = 0
+
+try:
+    llama_index = available_models.index("llama3:latest")
+except ValueError:
+    llama_index = 0
+
+if agent_type == "Dynamic Planner":
+    planning_model = st.sidebar.selectbox(
+        "Planning Model",
+        available_models,
+        index=gemma_index,  # Default to gemma3:27b
+        help="Model used for reasoning and task planning"
+    )
+
+    summary_model = st.sidebar.selectbox(
+        "Summary Model",
+        available_models,
+        index=llama_index,  # Default to llama3:latest
+        help="Model used for summarizing parameters and results"
+    )
+
+    planning_debug = st.sidebar.checkbox(
+        "Enable Planning Debug Mode",
+        help="Show detailed workflow and planning information"
+    )
+
+else:  # Self-Reflective Agent
+    reflection_model = st.sidebar.selectbox(
+        "Reflection Model",
+        available_models,
+        index=gemma_index,  # Default to gemma3:27b
+        help="Model used for parameter generation and self-reflection"
+    )
+
+    reflection_debug = st.sidebar.checkbox(
+        "Enable Reflection Debug Mode",
+        help="Show detailed reflection and retry information"
+    )
+
+# Server status check
+st.sidebar.subheader("🌐 Server Status")
+
+def check_server_status(url: str, server_type: str = "generic", timeout: int = 3) -> str:
+    """Check if a server is running with proper protocol support"""
+    try:
+        if server_type == "mcp":
+            # MCP uses JSON-RPC - a plain TCP connect tells us if the port is listening
+            import socket
+            from urllib.parse import urlparse
+
+            parsed = urlparse(url)
+            host = parsed.hostname or 'localhost'
+            port = parsed.port or 8001
+
+            sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+            sock.settimeout(timeout)
+            result = sock.connect_ex((host, port))
+            sock.close()
+
+            return "Running" if result == 0 else "Not Running"
+
+        elif server_type == "ollama":
+            response = requests.get(url, timeout=timeout)
+            return "Running" if response.status_code == 200 else "Error"
+
+        # Unknown server types fall through here rather than returning None
+        return "Unknown"
+
+    except Exception:
+        return "Not Running"
+
+# Server status indicators
+mcp_status = check_server_status("http://localhost:8001/mcp", server_type="mcp")
+ollama_status = check_server_status("http://localhost:11434/api/tags", server_type="ollama")
+
+st.sidebar.markdown(f"**MCP Server:** {'🟢' if mcp_status == 'Running' else '🔴'} {mcp_status}")
+st.sidebar.markdown(f"**Ollama Server:** {'🟢' if ollama_status == 'Running' else '🔴'} {ollama_status}")
+
+if mcp_status != "Running":
+    st.sidebar.warning("⚠️ MCP Server not detected. Start with: `python mcp_server.py`")
+if ollama_status != "Running":
+    st.sidebar.warning("⚠️ Ollama Server not detected. 
Start with: `ollama serve`") + +# Main interface +col1, col2 = st.columns([2, 1]) + +with col1: + st.subheader("📝 Simulation Request") + + # Pre-defined examples + example_requests = [ + "Simulate 14 operators, 12 nurses and 5% extra demand", + "Run scenario with high staffing and normal call volume", + "Test configuration with minimal staff (5 operators, 3 nurses)", + "Simulate peak hours with 20 operators, 15 nurses and 20% increased demand", + "Run baseline scenario with default parameters" + ] + + selected_example = st.selectbox( + "Quick Examples:", + ["Custom Input"] + example_requests, + help="Select a pre-defined example or choose 'Custom Input'" + ) + + if selected_example == "Custom Input": + user_input = st.text_area( + "Enter your simulation request:", + placeholder="e.g., Simulate 14 operators, 12 nurses and 5% extra demand", + height=100 + ) + else: + user_input = st.text_area( + "Simulation request:", + value=selected_example, + height=100 + ) + +with col2: + st.subheader("🎯 Agent Information") + + if agent_type == "Dynamic Planner": + st.markdown(""" +
+        <b>🧠 Dynamic Planner Agent</b><br>
+        <b>Features:</b><br>
+          • Dual LLM architecture<br>
+          • Dynamic task planning<br>
+          • Memory-driven execution<br>
+          • Step-by-step workflow<br>
+        <b>Models:</b><br>
+          • Planning: """ + planning_model + """<br>
+          • Summary: """ + summary_model + """
+        """, unsafe_allow_html=True)
+    else:
+        st.markdown("""
+        <b>🔄 Self-Reflective Agent</b><br>
+        <b>Features:</b><br>
+          • LangGraph state machine<br>
+          • Validation-driven learning<br>
+          • Bounded retry logic<br>
+          • Error analysis & reflection<br>
+        <b>Model:</b> """ + reflection_model + """
+ """, unsafe_allow_html=True) + +# Execution section +st.subheader("🚀 Execute Simulation") + +# Create execution button +if st.button("Run Simulation", type="primary", disabled=(not user_input.strip())): + if mcp_status != "Running" or ollama_status != "Running": + #if ollama_status != "Running": + st.error("❌ Please ensure both MCP and Ollama servers are running before executing.") + else: + with st.spinner("🤖 Agent is working..."): + # Create placeholders for live updates + status_placeholder = st.empty() + debug_placeholder = st.empty() + results_placeholder = st.empty() + + # Capture agent execution + class StreamCapture: + def __init__(self): + self.logs = [] + self.current_status = "Initializing..." + + def write(self, text): + if text.strip(): + self.logs.append(text.strip()) + # Update status based on content + if "Planning modelling task" in text: + self.current_status = "🧠 Planning simulation task..." + elif "Executing the plan" in text: + self.current_status = "⚙️ Executing plan steps..." + elif "Reasoning about simulation parameters" in text: + self.current_status = "🤔 Generating parameters..." + elif "Summarising parameters" in text: + self.current_status = "📊 Summarizing results..." + elif "Simulation complete" in text: + self.current_status = "✅ Simulation complete!" + + def flush(self): + pass + + capture = StreamCapture() + + try: + # Redirect stdout and stderr to capture agent output + original_stdout = sys.stdout + original_stderr = sys.stderr + sys.stdout = capture + sys.stderr = capture + + # Execute the selected agent + if agent_type == "Dynamic Planner": + # Simulate the agent execution (in real implementation, you'd call the actual functions) + status_placeholder.info("🧠 Starting Dynamic Planner Agent...") + + # In real implementation, you would do: + # result = asyncio.run(planning_main(planning_model, summary_model, planning_debug)) + + # For demonstration, simulate the process + time.sleep(1) + status_placeholder.info("📋 Generating execution plan...") + + if planning_debug: + debug_placeholder.markdown(""" +
+ 🐛 Debug mode enabled - showing workflow details<br>
+ === MCP SERVER CAPABILITIES ===<br>
+ 🔧 Available tools (3): run_call_centre_simulation, validate_simulation_parameters<br>
+ 📚 Available resources (2): schema/experiment_parameters, model/description<br>
+ 📝 Available prompts (1): parameter_jsonification_prompt<br>
+ === LLM GENERATED PLAN ===<br>
+ Step 1: Get experiment parameter schema<br>
+ → Type: Resource, Name: get_experiment_parameter_schema<br>
+ Step 2: Generate simulation parameters<br>
+ → Type: Prompt, Name: parameter_jsonification_prompt<br>
+ Step 3: Validate parameters<br>
+ → Type: Tool, Name: validate_simulation_parameters<br>
+ Step 4: Run simulation<br>
+ → Type: Tool, Name: run_call_centre_simulation
+ """, unsafe_allow_html=True) + + time.sleep(2) + status_placeholder.info("⚙️ Executing planned steps...") + time.sleep(2) + status_placeholder.success("✅ Simulation complete!") + + else: # Self-Reflective Agent + status_placeholder.info("🔄 Starting Self-Reflective Agent...") + + if reflection_debug: + debug_placeholder.markdown(""" +
+ 🐛 Debug mode enabled - showing reflection details<br>
+ 🧠 Reasoning about simulation parameters...<br>
+ ✅ Parameter validation successful<br>
+ 📊 Parameters formatted successfully<br>
+ 🏥 Running healthcare call centre simulation...
+ """, unsafe_allow_html=True) + + time.sleep(1) + status_placeholder.info("🤔 Generating parameters...") + time.sleep(2) + status_placeholder.info("✅ Parameters validated successfully") + time.sleep(1) + status_placeholder.info("🏥 Running simulation...") + time.sleep(2) + status_placeholder.success("✅ Simulation complete!") + + # Display mock results (in real implementation, these would come from the agents) + sample_parameters = { + "n_operators": 14, + "n_nurses": 12, + "mean_iat": 0.57, + "call_low": 5.0, + "call_mode": 7.0, + "call_high": 10.0, + "callback_prob": 0.4, + "run_length": 1000, + "random_seed": 42 + } + + sample_results = { + "01_mean_waiting_time": 2.45, + "02_operator_util": 78.5, + "03_mean_nurse_waiting_time": 1.23, + "04_nurse_util": 65.2, + "05_callback_rate": 39.8 + } + + # Display results + results_placeholder.markdown(""" +
<h3>🎯 Simulation Results</h3>
+ """, unsafe_allow_html=True) + + # Parameters table + st.subheader("📊 Parameters Used") + param_df = pd.DataFrame(list(sample_parameters.items()), columns=['Parameter', 'Value']) + st.dataframe(param_df, use_container_width=True) + + # Results table + st.subheader("📈 Key Performance Indicators") + results_df = pd.DataFrame(list(sample_results.items()), columns=['KPI', 'Value']) + st.dataframe(results_df, use_container_width=True) + + # Store results in session state + st.session_state.simulation_results = { + 'parameters': sample_parameters, + 'results': sample_results, + 'agent_type': agent_type, + 'timestamp': time.strftime("%Y-%m-%d %H:%M:%S") + } + + except Exception as e: + st.error(f"❌ Error during execution: {str(e)}") + + finally: + # Restore stdout and stderr + sys.stdout = original_stdout + sys.stderr = original_stderr + +# Results history section +if st.session_state.simulation_results: + st.subheader("📚 Recent Results") + + with st.expander("View Last Simulation Results", expanded=False): + results = st.session_state.simulation_results + + col1, col2 = st.columns(2) + with col1: + st.write("**Agent Type:**", results['agent_type']) + st.write("**Timestamp:**", results['timestamp']) + + with col2: + if st.button("📋 Copy Parameters JSON"): + st.code(json.dumps(results['parameters'], indent=2)) + + st.write("**Parameters:**") + st.json(results['parameters']) + + st.write("**Results:**") + st.json(results['results']) + +# Footer with helpful information +# st.markdown("---") +# st.markdown(""" +# ### 💡 Tips for Better Results + +# **For Dynamic Planner:** +# - Use gemma3:27b for best planning performance +# - Enable debug mode to see step-by-step execution +# - Try complex requests that require multi-step reasoning + +# **For Self-Reflective Agent:** +# - gemma3:27b and mistral:7b work well for parameter generation +# - The agent will automatically retry if validation fails +# - Watch for reflection patterns in debug mode + +# **Common Request Patterns:** +# - "Simulate X operators, Y nurses and Z% extra demand" +# - "Run scenario with [adjective] staffing and [adjective] call volume" +# - "Test configuration with [specific parameters]" +# """) + +# # Troubleshooting section +# with st.expander("🔧 Troubleshooting", expanded=False): +# st.markdown(""" +# **Common Issues:** + +# 1. **MCP Server Not Running:** +# ```bash +# python mcp_server.py +# ``` +# Server should be available at http://localhost:8001/mcp + +# 2. **Ollama Server Not Running:** +# ```bash +# ollama serve +# ``` +# Then pull required models: +# ```bash +# ollama pull gemma3:27b +# ollama pull llama3:latest +# ollama pull mistral:7b +# ``` + +# 3. **Model Not Found:** +# Check available models: `ollama list` + +# 4. **Connection Errors:** +# - Check firewall settings +# - Ensure ports 8001 and 11434 are available +# - Verify model names match exactly +# """) \ No newline at end of file diff --git a/app2.py b/app2.py new file mode 100644 index 0000000..6363621 --- /dev/null +++ b/app2.py @@ -0,0 +1,486 @@ +""" +This code was written by perplexity.ai labs initially. I've been editing it. +Likely to modify a lot as this isn't quite what I wanted. 
+ +""" +import streamlit as st +import asyncio +import json +import time +import sys +import os +from typing import Optional, Dict, Any, List +import pandas as pd +from datetime import datetime +import requests +import socket +from urllib.parse import urlparse + +# Import the agents +try: + from langchain_ollama import OllamaLLM + from fastmcp import Client + from agent_planning_workflow import main as planning_main + from agent_self_reflection import main as reflection_main + import agent_planning_workflow + import agent_self_reflection + AGENTS_AVAILABLE = True +except ImportError as e: + st.error(f"Import error: {e}. Please ensure all dependencies are installed and agents are accessible.") + AGENTS_AVAILABLE = False + +# Page configuration +st.set_page_config( + page_title="Sim Experimentation Agent Interface", + page_icon="🤖", + layout="wide", + initial_sidebar_state="expanded" +) + + +# Initialize session state +if 'chat_messages' not in st.session_state: + st.session_state.chat_messages = [] +if 'simulation_results' not in st.session_state: + st.session_state.simulation_results = None +if 'agent_busy' not in st.session_state: + st.session_state.agent_busy = False + + +def get_ollama_models(): + """Get list of models from Ollama server""" + try: + response = requests.get("http://localhost:11434/api/tags") + if response.status_code == 200: + data = response.json() + models = [model["name"] for model in data["models"]] + return models + return [] + except Exception as e: + print(f"Error: {e}") + return [] + +# maybe refactor into two seperate functions +def check_server_status(url: str, server_type: str = "generic", timeout: int = 3) -> str: + """Check if a server (Ollama or Simulation MCP) is running with proper protocol support""" + try: + if server_type == "mcp": + + parsed = urlparse(url) + host = parsed.hostname or 'localhost' + port = parsed.port or 8001 + + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + sock.settimeout(timeout) + result = sock.connect_ex((host, port)) + sock.close() + + return "Running" if result == 0 else "Not Running" + + elif server_type == "ollama": + response = requests.get(url, timeout=timeout) + return "Running" if response.status_code == 200 else "Error" + + except Exception as e: + return "Not Running" + +async def run_self_reflection_agent(user_input: str, model_name: str, debug_mode: bool = False) -> Dict[str, Any]: + """Run the actual self-reflection agent""" + try: + # Create the LLM and graph + llm = OllamaLLM(model=model_name, base_url="http://localhost:11434") + compiled_graph = agent_self_reflection.build_graph(llm) + + # Run the agent + final_state = await compiled_graph.ainvoke({ + "user_input": user_input, + "retry_count": 0, + "validation_history": [] + }) + + return { + 'status': 'success', + 'agent_type': 'Self-Reflective', + 'model': model_name, + 'parameters': final_state.get('parameters', {}), + 'simulation_results': final_state.get('simulation_result', {}), + 'formatted_parameters': final_state.get('formatted_parameters', ''), + 'validation_history': final_state.get('validation_history', []), + 'retry_count': final_state.get('retry_count', 0), + 'user_input': user_input, + 'debug_mode': debug_mode + } + + except Exception as e: + import traceback + return { + 'status': 'error', + 'error': str(e), + 'traceback': traceback.format_exc(), + 'agent_type': 'Self-Reflective' + } + +def format_results_for_chat(results: Dict[str, Any]) -> str: + """Format simulation results for display in chat""" + if results.get('status') != 
+
+def format_results_for_chat(results: Dict[str, Any]) -> str:
+    """Format simulation results for display in chat"""
+    if results.get('status') != 'success':
+        return f"❌ **Error:** {results.get('error', 'Unknown error')}"
+
+    # Format the response
+    response = "✅ **Simulation Complete!**\n\n"
+
+    # Add agent info (use .get() so a None placeholder does not print as "Model: None")
+    response += f"**Agent:** {results['agent_type']}\n"
+    if results.get('model'):
+        response += f"**Model:** {results['model']}\n"
+    elif results.get('models'):
+        response += f"**Planning Model:** {results['models']['planning']}\n"
+        response += f"**Summary Model:** {results['models']['summary']}\n"
+
+    # Add validation info for the self-reflection agent
+    if results.get('retry_count', 0) > 0:
+        response += f"**Validation Attempts:** {results['retry_count'] + 1}\n"
+
+    response += "\n"
+
+    # Format key metrics
+    # sim_results = results.get('simulation_results', {})
+
+    # if sim_results:
+    #     response += "**📊 Key Performance Indicators:**\n"
+    #     response += f"• **Mean Wait Time:** {sim_results.get('01_mean_waiting_time', 0):.2f} minutes\n"
+    #     response += f"• **Operator Utilization:** {sim_results.get('02_operator_util', 0):.1f}%\n"
+    #     response += f"• **Nurse Wait Time:** {sim_results.get('03_mean_nurse_waiting_time', 0):.2f} minutes\n"
+    #     response += f"• **Nurse Utilization:** {sim_results.get('04_nurse_util', 0):.1f}%\n"
+    #     response += f"• **Callback Rate:** {sim_results.get('05_callback_rate', 0):.1f}%\n\n"
+
+    # Format parameters
+    params = results.get('parameters', {})
+    if params:
+        response += "**⚙️ Parameters Used:**\n"
+        for key, value in params.items():
+            clean_key = key.replace('_', ' ').title()
+            if isinstance(value, float):
+                response += f"• **{clean_key}:** {value:.3f}\n"
+            else:
+                response += f"• **{clean_key}:** {value}\n"
+
+    return response
+
+
+def format_results_table(results: Dict[str, Any]) -> pd.DataFrame:
+    """
+    Convert simulation results into a DataFrame for display in the Streamlit chat
+    """
+    sim_results = results.get('simulation_results', {})
+    df = pd.DataFrame(
+        list(sim_results.items()), columns=["KPIs", "Values"]
+    )
+
+    return df
+
+
+def display_debug_info(results: Dict[str, Any]) -> str:
+    """Format debug information for chat display"""
+    if not results.get('debug_mode'):
+        return ""
+
+    debug_info = "\n**🐛 Debug Information:**\n"
+
+    if results['agent_type'] == 'Self-Reflective':
+        validation_history = results.get('validation_history', [])
+        if validation_history:
+            debug_info += "\n**Validation History:**\n"
+            for i, attempt in enumerate(validation_history, 1):
+                status = "✅ Success" if attempt.get('validation_result', {}).get('is_valid', False) else "❌ Failed"
+                debug_info += f"Attempt {i}: {status}\n"
+
+                errors = attempt.get('validation_result', {}).get('errors', [])
+                if errors:
+                    debug_info += "Errors: " + ", ".join(errors) + "\n"
+
+    return debug_info
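+
+# A hedged example of how the helpers above compose (the result shape here is
+# assumed for illustration; the real keys come from the agent graph):
+#
+#   demo = {"status": "success", "agent_type": "Self-Reflective",
+#           "model": "gemma3:27b", "parameters": {"n_operators": 14},
+#           "simulation_results": {"01_mean_waiting_time": 2.34}}
+#   chat_text = format_results_for_chat(demo)   # markdown summary string
+#   kpi_table = format_results_table(demo)      # DataFrame with KPIs/Values columns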
+
+# Header
+st.markdown("# 🤖 Simulation Experimentation Agent Interface")
+
+st.markdown(
+    """**Ask the agent simulation questions in natural language and it will run
+    the model and return the results.** Click the buttons below for example
+    questions.""")
+
+# Sidebar configuration
+st.sidebar.header("🔧 Configuration")
+
+# Agent selection
+agent_type = st.sidebar.selectbox(
+    "Select Agent Type",
+    ["Self-Reflective", "Dynamic Planner"],  # Prioritize working agent
+    help="Choose between the two agent architectures"
+)
+
+# Model configuration
+st.sidebar.subheader("🧠 LLM Configuration")
+
+# Get models from Ollama with fallbacks
+available_models = get_ollama_models()
+if not available_models:
+    available_models = ["gemma3:27b", "llama3:7b", "llama3:latest", "mistral:7b"]
+
+# Find default indices
+try:
+    gemma_index = available_models.index("gemma3:27b")
+except ValueError:
+    gemma_index = 0
+
+try:
+    llama_index = available_models.index("llama3:latest")
+except ValueError:
+    llama_index = 0
+
+if agent_type == "Dynamic Planner":
+    planning_model = st.sidebar.selectbox(
+        "Planning Model",
+        available_models,
+        index=gemma_index,
+        help="Model used for reasoning and task planning"
+    )
+
+    summary_model = st.sidebar.selectbox(
+        "Summary Model",
+        available_models,
+        index=llama_index,
+        help="Model used for summarizing parameters and results"
+    )
+
+    debug_mode = st.sidebar.checkbox(
+        "Enable Debug Mode",
+        help="Show detailed workflow and planning information"
+    )
+
+else:  # Self-Reflective Agent
+    reflection_model = st.sidebar.selectbox(
+        "Model",
+        available_models,
+        index=gemma_index,
+        help="Model used for parameter generation and self-reflection"
+    )
+
+    debug_mode = st.sidebar.checkbox(
+        "Enable Debug Mode",
+        help="Show detailed reflection and retry information"
+    )
+
+# Server status
+st.sidebar.subheader("🌐 Server Status")
+mcp_status = check_server_status("http://localhost:8001/mcp", server_type="mcp")
+ollama_status = check_server_status("http://localhost:11434/api/tags", server_type="ollama")
+
+st.sidebar.markdown(f"**MCP Server:** {'🟢' if mcp_status == 'Running' else '🔴'} {mcp_status}")
+st.sidebar.markdown(f"**Ollama Server:** {'🟢' if ollama_status == 'Running' else '🔴'} {ollama_status}")
+
+if mcp_status != "Running":
+    st.sidebar.warning("⚠️ MCP Server not detected. Start with: `python mcp_server.py`")
+if ollama_status != "Running":
+    st.sidebar.warning("⚠️ Ollama Server not detected. Start with: `ollama serve`")
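+
+# For reference, a sketch of the Ollama /api/tags payload that
+# get_ollama_models() parses (abridged; the exact field set may vary by
+# Ollama version):
+#
+#   {"models": [{"name": "gemma3:27b", ...}, {"name": "mistral:7b", ...}]}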
+
+# Clear chat button in sidebar
+if st.sidebar.button("🗑️ Clear Chat"):
+    st.session_state.chat_messages = []
+    st.session_state.simulation_results = None
+    st.rerun()
+
+# Main chat interface
+st.subheader(f"💬 Chat with {agent_type} Agent")
+
+# Chat container
+chat_container = st.container()
+
+with chat_container:
+    # Display chat messages
+    for message in st.session_state.chat_messages:
+        with st.chat_message(message["role"]):
+            st.markdown(message["content"])
+
+# Quick example buttons (shown while the agent is idle)
+if not st.session_state.agent_busy:
+    st.subheader("💡 Quick Examples")
+    example_cols = st.columns(3)
+
+    examples = [
+        "Simulate 14 operators, 12 nurses and 5% extra demand",
+        "Run scenario with high staffing and normal call volume",
+        "Test minimal configuration (5 operators, 3 nurses)"
+    ]
+
+    for i, example in enumerate(examples):
+        with example_cols[i]:
+            if st.button(example, key=f"example_{i}", use_container_width=True):
+                # Add example to chat and process
+                st.session_state.chat_messages.append({"role": "user", "content": example})
+                st.session_state.agent_busy = True
+                st.rerun()
+
+# Chat input
+if prompt := st.chat_input("Enter your simulation request...", disabled=st.session_state.agent_busy):
+    # Add user message to chat
+    st.session_state.chat_messages.append({"role": "user", "content": prompt})
+    st.session_state.agent_busy = True
+    st.rerun()
+
+# Process the latest message if the agent is busy
+if st.session_state.agent_busy and st.session_state.chat_messages:
+    latest_message = st.session_state.chat_messages[-1]
+
+    if latest_message["role"] == "user":
+        user_input = latest_message["content"]
+
+        # Check if servers are ready
+        if mcp_status != "Running" or ollama_status != "Running":
+            error_msg = "❌ **System Not Ready**\n\nPlease ensure both MCP and Ollama servers are running:\n"
+            error_msg += "- MCP Server: `python mcp_server.py`\n"
+            error_msg += "- Ollama Server: `ollama serve`"
+
+            st.session_state.chat_messages.append({"role": "assistant", "content": error_msg})
+            st.session_state.agent_busy = False
+            st.rerun()
+
+        else:
+            # Show processing message
+            with st.chat_message("assistant"):
+                with st.spinner(f"🤖 {agent_type} Agent is processing your request..."):
+
+                    if agent_type == "Self-Reflective" and AGENTS_AVAILABLE:
+                        # Run the actual self-reflection agent
+                        try:
+                            start_time = time.time()
+
+                            # Create progress indicator
+                            progress_placeholder = st.empty()
+                            progress_placeholder.info("🔄 Initializing self-reflection agent...")
+
+                            # Run the agent
+                            result = asyncio.run(run_self_reflection_agent(
+                                user_input, reflection_model, debug_mode
+                            ))
+
+                            execution_time = time.time() - start_time
+                            result['execution_time'] = execution_time
+
+                            progress_placeholder.success(f"✅ Completed in {execution_time:.1f} seconds")
+
+                        except Exception as e:
+                            result = {
+                                'status': 'error',
+                                'error': str(e),
+                                'agent_type': 'Self-Reflective'
+                            }
+
+                    else:
+                        # Mock execution for Dynamic Planner or when agents not available
+                        time.sleep(3)  # Simulate processing
+                        result = {
+                            'status': 'success',
+                            'agent_type': agent_type,
+                            'model': reflection_model if agent_type == "Self-Reflective" else None,
+                            'models': {'planning': planning_model, 'summary': summary_model} if agent_type == "Dynamic Planner" else None,
+                            'parameters': {
+                                'n_operators': 14, 'n_nurses': 12, 'mean_iat': 0.57,
+                                'call_low': 5.0, 'call_mode': 7.0, 'call_high': 10.0,
+                                'callback_prob': 0.4, 'run_length': 1000, 'random_seed': 42
+                            },
+                            # key name matches what the display code reads below
+                            'simulation_results': {
+                                '01_mean_waiting_time': 2.34, '02_operator_util': 76.8,
+                                '03_mean_nurse_waiting_time': 1.45, '04_nurse_util': 68.2,
+                                '05_callback_rate': 41.2
+                            },
+                            'retry_count': 0,
+                            'debug_mode': debug_mode
+                        }
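+
+                    # Note: the real and mock execution branches must emit the
+                    # same result schema -- in particular the key
+                    # 'simulation_results' -- because the display and export
+                    # code below reads result.get('simulation_results', {}).
+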
+                    # Format and add response
+                    response_text = format_results_for_chat(result)
+                    st.markdown(response_text)
+
+                    # Display results table if simulation results exist
+                    sim_results = result.get('simulation_results', {})
+                    if sim_results:
+                        st.write("\n")  # Add spacing
+                        df = pd.DataFrame(
+                            list(sim_results.items()),
+                            columns=["KPIs", "Values"]
+                        )
+
+                        st.markdown("**📊 Key Performance Indicators:**\n")
+                        # st.dataframe(df.round(2), width='stretch')
+                        st.markdown(df.round(2).to_markdown(index=False))
+
+                    # Add debug info if enabled
+                    if debug_mode:
+                        debug_info = display_debug_info(result)
+                        st.markdown(debug_info)
+
+                    # Store message content for history (KPI table embedded as markdown)
+                    response_text_stored = format_results_for_chat(result)
+                    if sim_results:
+                        response_text_stored += "\n\n📊 **Key Performance Indicators:**\n"
+                        response_text_stored += df.round(2).to_markdown(index=False)
+                    if debug_mode:
+                        response_text_stored += display_debug_info(result)
+                    response_text_stored += f"\n\n*Completed at {datetime.now().strftime('%H:%M:%S')}*"
+
+                    st.session_state.chat_messages.append({
+                        "role": "assistant",
+                        "content": response_text_stored,
+                        "has_table": bool(sim_results),
+                        "table_data": sim_results
+                    })
+                    st.session_state.simulation_results = result
+                    st.session_state.agent_busy = False
+                    st.rerun()
+
+# Export options
+if st.session_state.simulation_results:
+    st.subheader("💾 Export Results")
+
+    col1, col2, col3 = st.columns(3)
+
+    results = st.session_state.simulation_results
+
+    with col1:
+        if results.get('parameters'):
+            param_df = pd.DataFrame([
+                {"Parameter": k.replace('_', ' ').title(), "Value": v}
+                for k, v in results['parameters'].items()
+            ])
+            csv_params = param_df.to_csv(index=False)
+            st.download_button(
+                "📊 Parameters CSV",
+                csv_params,
+                file_name=f"parameters_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv",
+                mime="text/csv"
+            )
+
+    with col2:
+        if results.get('simulation_results'):
+            results_df = pd.DataFrame([
+                {"KPI": k.replace('_', ' ').title(), "Value": v}
+                for k, v in results['simulation_results'].items()
+            ])
+            csv_results = results_df.to_csv(index=False)
+            st.download_button(
+                "📈 Results CSV",
+                csv_results,
+                file_name=f"results_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv",
+                mime="text/csv"
+            )
+
+    with col3:
+        full_json = json.dumps(results, indent=2, default=str)
+        st.download_button(
+            "📋 Full Report JSON",
+            full_json,
+            file_name=f"simulation_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
+            mime="application/json"
+        )
+
diff --git a/evals/evals_output.json b/evals/evals_output.json
index 71caedf..62caeec 100644
--- a/evals/evals_output.json
+++ b/evals/evals_output.json
@@ -30,8 +30,8 @@
     },
     "passed": true,
     "context": {
-      "trace_id": "05e9185a30490fa463f00ea5b0e65108",
-      "span_id": "39cfc1324c7aee19"
+      "trace_id": "c717c279cc25965c8b93849431cbe2df",
+      "span_id": "105b7a80e342ca90"
     }
   },
   "example_2": {
@@ -65,8 +65,8 @@
     },
     "passed": true,
     "context": {
-      "trace_id": "95fc97f63b485e37d20ada5d01b2acdb",
-      "span_id": "f66b8c36689fbdb2"
+      "trace_id": "43b85c1dcc267e4c5e648552771de775",
+      "span_id": "26f1c84b0cecb5a6"
     }
   },
   "example_3": {