In [1]:
from agent_verify.agent import Agent
import pandas as pd
import json
import pickle
import os
import tiktoken
import re

In [2]:
def chat_completion_request_openai(prompt):
    client = Agent()
    messages = [{"role": "user", "content": prompt}]
    chat_response = client.llm_client.complete(model=client.model_name, messages=messages)
    if chat_response.choices:
        completion_text = chat_response.choices[0].message.content
    else:
        completion_text = None
    return completion_text

In [3]:

definitions = open("taxonomy_definitions_examples/definitions.txt", "r").read()

def openai_evaluator(trace, definitions=definitions, examples=''):
    prompt = (
    "Below I will provide a multiagent system trace. provide me an analysis of the failure modes and inefficiencies as I will say below. \n"
    "In the traces, analyze the system behaviour."
    "There are several failure modes in multiagent systems I identified. I will provide them below. Tell me if you encounter any of them, as a binary yes or no. \n"
    "Also, give me a one sentence (be brief) summary of the problems with the inefficiencies or failure modes in the trace. Only mark a failure mode if you can provide an example of it in the trace, and specify that in your summary at the end"
    "Also tell me whether the task is successfully completed or not, as a binary yes or no."
    "At the very end, I provide you with the definitions of the failure modes and inefficiencies. After the definitions, I will provide you with examples of the failure modes and inefficiencies for you to understand them better."
    "Tell me if you encounter any of them between the @@ symbols as I will say below, as a binary yes or no."
    "Here are the things you should answer. Start after the @@ sign and end before the next @@ sign (do not include the @@ symbols in your answer):"
    "*** begin of things you should answer *** @@"
    "A. Freeform text summary of the problems with the inefficiencies or failure modes in the trace: <summary>"
    "B. Whether the task is successfully completed or not: <yes or no>"
    "C. Whether you encounter any of the failure modes or inefficiencies:"
    "1.1 Disobey Task Specification: <yes or no>"
    "1.2 Disobey Role Specification: <yes or no>"
    "1.3 Step Repetition: <yes or no>"
    "1.4 Loss of Conversation History: <yes or no>"
    "1.5 Unaware of Termination Conditions: <yes or no>"
    "2.1 Conversation Reset: <yes or no>"
    "2.2 Fail to Ask for Clarification: <yes or no>"
    "2.3 Task Derailment: <yes or no>"
    "2.4 Information Withholding: <yes or no>"
    "2.5 Ignored Other Agent's Input: <yes or no>"
    "2.6 Action-Reasoning Mismatch: <yes or no>"
    "3.1 Premature Termination: <yes or no>"
    "3.2 No or Incorrect Verification: <yes or no>"
    "3.3 Weak Verification: <yes or no>"
    "@@*** end of your answer ***"
    "An example answer is: \n"
    "A. The task is not completed due to disobeying role specification as agents went rogue and started to chat with each other instead of completing the task. Agents derailed and verifier is not strong enough to detect it.\n"
    "B. no \n"
    "C. \n"
    "1.1 no \n"
    "1.2 no \n"
    "1.3 no \n"
    "1.4 no \n"
    "1.5 no \n"
    "1.6 yes \n"
    "2.1 no \n"
    "2.2 no \n"
    "2.3 yes \n"
    "2.4 no \n"
    "2.5 no \n"
    "2.6 yes \n"
    "2.7 no \n"
    "3.1 no \n"
    "3.2 yes \n"
    "3.3 no \n"   
    "Here is the trace: \n"
    f"{trace}"
    "Also, here are the explanations (definitions) of the failure modes and inefficiencies: \n"
    f"{definitions} \n"
    "Here are some examples of the failure modes and inefficiencies: \n"
    f"{examples}"
)
    return chat_completion_request_openai(prompt)

In [4]:
examples = open("taxonomy_definitions_examples/examples.txt", "r").read()

In [5]:
def parse_responses(responses):
    """
    Parse the LLM responses to extract yes/no answers for each failure mode.
    
    Args:
        responses: List of LLM responses evaluating traces
        
    Returns:
        Dictionary mapping failure mode codes to lists of binary values (0 for no, 1 for yes)
    """
    import re
    
    # Initialize dictionary with empty lists for each failure mode
    failure_modes = {
        '1.1': [], '1.2': [], '1.3': [], '1.4': [], '1.5': [],
        '2.1': [], '2.2': [], '2.3': [], '2.4': [], '2.5': [], '2.6': [],
        '3.1': [], '3.2': [], '3.3': []
    }
    
    for i, response in enumerate(responses):
        try:
            # Clean up the response - remove @@ markers if present
            cleaned_response = response.strip()
            if cleaned_response.startswith('@@'):
                cleaned_response = cleaned_response[2:]
            if cleaned_response.endswith('@@'):
                cleaned_response = cleaned_response[:-2]
            
            # Process each failure mode
            for mode in failure_modes.keys():
                # Various patterns to match different response formats
                patterns = [
                    # Format with C. prefix and colon
                    rf"C\..*?{mode}.*?(yes|no)",
                    # Format with just C prefix without dot
                    rf"C{mode}\s+(yes|no)",
                    # Format with mode directly (with or without spaces)
                    rf"{mode}\s*[:]\s*(yes|no)",
                    rf"{mode}\s+(yes|no)",
                    # Format with newlines
                    rf"{mode}\s*\n\s*(yes|no)",
                    # Format with C prefix and newlines
                    rf"C\.{mode}\s*\n\s*(yes|no)"
                ]
                
                found = False
                for pattern in patterns:
                    matches = re.findall(pattern, cleaned_response, re.IGNORECASE | re.DOTALL)
                    if matches:
                        # Use the first match
                        value = 1 if matches[0].lower() == 'yes' else 0
                        failure_modes[mode].append(value)
                        found = True
                        break
                
                if not found:
                    # If we still can't find a match, try a more general approach
                    # Look for the mode number followed by any text and then yes/no
                    general_pattern = rf"(?:C\.)?{mode}.*?(yes|no)"
                    match = re.search(general_pattern, cleaned_response, re.IGNORECASE | re.DOTALL)
                    
                    if match:
                        value = 1 if match.group(1).lower() == 'yes' else 0
                        failure_modes[mode].append(value)
                    else:
                        # If all attempts fail, default to 'no'
                        print(f"Warning: Could not find mode {mode} in response {i}")
                        failure_modes[mode].append(0)
                    
        except Exception as e:
            print(f"Error processing response {i}: {e}")
            # If there's an error, default to 'no' for all modes for this response
            for mode in failure_modes:
                if len(failure_modes[mode]) <= i:  # Only append if we haven't already
                    failure_modes[mode].append(0)
    
    # Ensure all lists have the same length
    max_length = max(len(values) for values in failure_modes.values())
    for mode in failure_modes:
        if len(failure_modes[mode]) < max_length:
            failure_modes[mode].extend([0] * (max_length - len(failure_modes[mode])))
    
    return failure_modes


## LLM As a Judge - Tau Bench

In [6]:
full_trace_list = []
for i in range(2):
    trace = open(f"trajectories/tau_bench/trajectories/{i}", "r").read()
    full_trace_list.append(trace)

In [7]:
openai_results = []
dirname = 'saved_results_tau_bench'
os.makedirs(dirname, exist_ok=True)

for i in range(len(full_trace_list)):
    
    if len(full_trace_list[i] + examples) > 1048570:
            full_trace_list[i] = full_trace_list[i][:1048570 - len(examples)]


    try:
        openai_evaluation = openai_evaluator(full_trace_list[i], examples=examples)
        openai_results.append(openai_evaluation)
        
        # Save the current results after each evaluation
        with open(f'{dirname}/o1_results_checkpoint.pkl', 'wb') as f:
            pickle.dump(openai_results, f)
            
        # Optional: Save a backup copy every 10 evaluations
        if (i + 1) % 10 == 0:
            with open(f'{dirname}/o1_results_backup_{i+1}.pkl', 'wb') as f:
                pickle.dump(openai_results, f)
                
        print(f"Completed and saved evaluation {i+1}/{len(full_trace_list)}")
    except Exception as e:
        print(f"Error on evaluation {i+1}: {str(e)}")
        # Save results even if there's an error
        with open(f'{dirname}/o1_results_checkpoint.pkl', 'wb') as f:
            pickle.dump(openai_results, f)

Completed and saved evaluation 1/2
Completed and saved evaluation 2/2


In [8]:
def get_failures_per_test(failure_modes):
    failures_per_test = {}
    for i in range(len(failure_modes[list(failure_modes.keys())[0]])):
        failures_per_test[i] = []
        for mode in failure_modes.keys():
            if failure_modes[mode][i] == 1:
                failures_per_test[i].append(mode)
        
    return failures_per_test

In [9]:
o1_results = openai_results
failure_mode_results_o1 = parse_responses(o1_results)
res = (get_failures_per_test(failure_mode_results_o1))
raw_logs = json.load(open("trajectories/tau_bench/raw_logs/tool-calling-none-0.1_range_0-100_user-none-llm_06232025.json", "r"))
done_but_problems = 0
l = []
not_done_but_no_problems = 0
for i in range(len(res)):
    if raw_logs[i]['reward'] > 0 and len(res[i]) > 0:
        done_but_problems += 1
        l.append((i, res[i]))
    elif raw_logs[i]['reward'] < 1 and len(res[i]) == 0:
        not_done_but_no_problems += 1
    print(f"Test {i}: Reward: {raw_logs[i]['reward']} Failures: {res[i]}")  # Print failures for each test
    # print(f"Test {i}:  Failures: {res[i]}")  # Print failures for each test
print(f"Total tests with done but problems: {done_but_problems}")
print(f"Total tests with not done but no problems: {not_done_but_no_problems}")
print(l)

Test 0: Reward: 1.0 Failures: []
Test 1: Reward: 1.0 Failures: []
Total tests with done but problems: 0
Total tests with not done but no problems: 0
[]


## LLM As a Judge - BFCL

In [10]:
from bfcl_trajectory_loader import get_all_bfcl_trajectories
# Load all BFCL trajectories
full_trace_list = get_all_bfcl_trajectories()
full_trace_list = [str(i) for i in full_trace_list]  # Convert to string if needed

In [11]:
full_trace_list = full_trace_list[:100]  # Limit to first 2 traces for testing
print(len(full_trace_list))

100


In [12]:
openai_results = []
dirname = 'saved_results_bfcl'
os.makedirs(dirname, exist_ok=True)

for i in range(len(full_trace_list)):
    
    if len(full_trace_list[i] + examples) > 1048570:
            full_trace_list[i] = full_trace_list[i][:1048570 - len(examples)]


    try:
        openai_evaluation = openai_evaluator(full_trace_list[i], examples=examples)
        openai_results.append(openai_evaluation)
        
        # Save the current results after each evaluation
        with open(f'{dirname}/o1_results_checkpoint.pkl', 'wb') as f:
            pickle.dump(openai_results, f)
            
        # Optional: Save a backup copy every 10 evaluations
        if (i + 1) % 10 == 0:
            with open(f'{dirname}/o1_results_backup_{i+1}.pkl', 'wb') as f:
                pickle.dump(openai_results, f)
                
        print(f"Completed and saved evaluation {i+1}/{len(full_trace_list)}")
    except Exception as e:
        print(f"Error on evaluation {i+1}: {str(e)}")
        # Save results even if there's an error
        with open(f'{dirname}/o1_results_checkpoint.pkl', 'wb') as f:
            pickle.dump(openai_results, f)

Completed and saved evaluation 1/100
Completed and saved evaluation 2/100
Completed and saved evaluation 3/100
Completed and saved evaluation 4/100
Completed and saved evaluation 5/100
Completed and saved evaluation 6/100
Completed and saved evaluation 7/100
Completed and saved evaluation 8/100
Completed and saved evaluation 9/100
Completed and saved evaluation 10/100
Completed and saved evaluation 11/100
Completed and saved evaluation 12/100
Completed and saved evaluation 13/100
Completed and saved evaluation 14/100
Completed and saved evaluation 15/100
Completed and saved evaluation 16/100
Completed and saved evaluation 17/100
Completed and saved evaluation 18/100
Completed and saved evaluation 19/100
Completed and saved evaluation 20/100
Completed and saved evaluation 21/100
Completed and saved evaluation 22/100
Completed and saved evaluation 23/100
Completed and saved evaluation 24/100
Completed and saved evaluation 25/100
Completed and saved evaluation 26/100
Completed and saved e

In [13]:
o1_results = openai_results
failure_mode_results_o1 = parse_responses(o1_results)
for mode, values in failure_mode_results_o1.items():
    print(f"{mode}: {values[:5]} (total yes: {sum(values)}/{len(values)}, {round(sum(values)/len(values)*100, 2)}%)")

1.1: [0, 0, 0, 0, 0] (total yes: 9/100, 9.0%)
1.2: [0, 0, 0, 0, 0] (total yes: 0/100, 0.0%)
1.3: [0, 0, 0, 0, 0] (total yes: 0/100, 0.0%)
1.4: [0, 0, 0, 0, 0] (total yes: 0/100, 0.0%)
1.5: [0, 0, 0, 0, 0] (total yes: 1/100, 1.0%)
2.1: [0, 0, 0, 0, 0] (total yes: 0/100, 0.0%)
2.2: [0, 0, 0, 0, 0] (total yes: 21/100, 21.0%)
2.3: [0, 0, 0, 0, 0] (total yes: 5/100, 5.0%)
2.4: [0, 0, 0, 0, 0] (total yes: 0/100, 0.0%)
2.5: [0, 0, 0, 0, 0] (total yes: 0/100, 0.0%)
2.6: [0, 0, 0, 0, 0] (total yes: 3/100, 3.0%)
3.1: [0, 0, 0, 0, 0] (total yes: 14/100, 14.0%)
3.2: [0, 0, 0, 0, 0] (total yes: 8/100, 8.0%)
3.3: [0, 0, 0, 0, 0] (total yes: 5/100, 5.0%)


## LLM As a Judge - FLASH

In [14]:
from flash_trajectory_loader import get_all_flash_trajectories
# Load all flash trajectories
full_trace_list = get_all_flash_trajectories()
full_trace_list = [str(i) for i in full_trace_list]  # Convert to string if needed

In [15]:
full_trace_list = full_trace_list[:100]  # Limit to first 2 traces for testing
print((full_trace_list[0]))

[{'timestamp': '2025-07-24T03:39:08.013404', 'source': 'user', 'message': 'Diagnose the incident 445308210. Incident description: Tip Session repave stuck due to active containers. Container Details:\nteam name - NSMConfidentialComputing\nnodeID - 152076538\ncontainerList - 1348992VIXLC, 2959320THASS, 2460743SYENG, 1098082HTCJM', 'type': 'OrchestrationEvent'}, {'timestamp': '2025-07-24T03:39:08.016505', 'source': 'Orchestrator (thought)', 'message': 'Initial plan:\n\nWe are working to address the following user request:\n\nDiagnose the incident 445308210. Incident description: Tip Session repave stuck due to active containers. Container Details:\nteam name - NSMConfidentialComputing\nnodeID - 152076538\ncontainerList - 1348992VIXLC, 2959320THASS, 2460743SYENG, 1098082HTCJM\n\n\nTo answer this request we have assembled the following team:\n\nCoder: A helpful and general-purpose AI assistant that has strong language skills, Python skills, and Linux command line skills.\nExecutor: A compu

In [16]:
openai_results = []
dirname = 'saved_results_flash'
os.makedirs(dirname, exist_ok=True)

for i in range(len(full_trace_list)):
    
    if len(full_trace_list[i] + examples) > 1048570:
            full_trace_list[i] = full_trace_list[i][:1048570 - len(examples)]


    try:
        openai_evaluation = openai_evaluator(full_trace_list[i], examples=examples)
        openai_results.append(openai_evaluation)
        
        # Save the current results after each evaluation
        with open(f'{dirname}/o1_results_checkpoint.pkl', 'wb') as f:
            pickle.dump(openai_results, f)
            
        # Optional: Save a backup copy every 10 evaluations
        if (i + 1) % 10 == 0:
            with open(f'{dirname}/o1_results_backup_{i+1}.pkl', 'wb') as f:
                pickle.dump(openai_results, f)
                
        print(f"Completed and saved evaluation {i+1}/{len(full_trace_list)}")
    except Exception as e:
        print(f"Error on evaluation {i+1}: {str(e)}")
        # Save results even if there's an error
        with open(f'{dirname}/o1_results_checkpoint.pkl', 'wb') as f:
            pickle.dump(openai_results, f)

Completed and saved evaluation 1/42
Completed and saved evaluation 2/42
Completed and saved evaluation 3/42
Completed and saved evaluation 4/42
Completed and saved evaluation 5/42
Completed and saved evaluation 6/42
Completed and saved evaluation 7/42
Completed and saved evaluation 8/42
Completed and saved evaluation 9/42
Completed and saved evaluation 10/42
Completed and saved evaluation 11/42
Completed and saved evaluation 12/42
Completed and saved evaluation 13/42
Completed and saved evaluation 14/42
Completed and saved evaluation 15/42
Completed and saved evaluation 16/42
Completed and saved evaluation 17/42
Completed and saved evaluation 18/42
Completed and saved evaluation 19/42
Completed and saved evaluation 20/42
Completed and saved evaluation 21/42
Completed and saved evaluation 22/42
Completed and saved evaluation 23/42
Completed and saved evaluation 24/42
Completed and saved evaluation 25/42
Completed and saved evaluation 26/42
Completed and saved evaluation 27/42
Completed 

In [17]:
o1_results = openai_results
failure_mode_results_o1 = parse_responses(o1_results)
for mode, values in failure_mode_results_o1.items():
    print(f"{mode}: {values[:5]} (total yes: {sum(values)}/{len(values)}, {round(sum(values)/len(values)*100, 2)}%)")

1.1: [0, 0, 0, 0, 0] (total yes: 0/42, 0.0%)
1.2: [0, 0, 0, 0, 0] (total yes: 0/42, 0.0%)
1.3: [0, 1, 0, 0, 0] (total yes: 4/42, 9.52%)
1.4: [0, 0, 0, 0, 0] (total yes: 0/42, 0.0%)
1.5: [1, 0, 0, 0, 0] (total yes: 1/42, 2.38%)
2.1: [0, 0, 0, 0, 0] (total yes: 2/42, 4.76%)
2.2: [0, 0, 0, 0, 0] (total yes: 6/42, 14.29%)
2.3: [0, 0, 0, 0, 0] (total yes: 0/42, 0.0%)
2.4: [0, 0, 0, 0, 0] (total yes: 0/42, 0.0%)
2.5: [0, 0, 0, 0, 0] (total yes: 0/42, 0.0%)
2.6: [0, 0, 0, 0, 0] (total yes: 6/42, 14.29%)
3.1: [1, 0, 0, 0, 0] (total yes: 11/42, 26.19%)
3.2: [0, 0, 1, 0, 0] (total yes: 8/42, 19.05%)
3.3: [1, 0, 0, 0, 0] (total yes: 5/42, 11.9%)
