In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Evaluate your ADK agent using Vertex AI Gen AI Evaluation service

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/evaluating_adk_agent.ipynb">
      <img width="32px" src="https://www.gstatic.com/pantheon/images/bigquery/welcome_page/colab-logo.svg" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fgenerative-ai%2Fmain%2Fgemini%2Fevaluation%2Fevaluating_adk_agent.ipynb">
      <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/gemini/evaluation/evaluating_adk_agent.ipynb">
      <img src="https://www.gstatic.com/images/branding/gcpiconscolors/vertexai/v1/32px.svg" alt="Vertex AI logo"><br> Open in Vertex AI Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/evaluating_adk_agent.ipynb">
      <img width="32px" src="https://www.svgrepo.com/download/217753/github.svg" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

<div style="clear: both;"></div>

<b>Share to:</b>

<a href="https://www.linkedin.com/sharing/share-offsite/?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/evaluating_adk_agent.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/8/81/LinkedIn_icon.svg" alt="LinkedIn logo">
</a>

<a href="https://bsky.app/intent/compose?text=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/evaluating_adk_agent.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/7/7a/Bluesky_Logo.svg" alt="Bluesky logo">
</a>

<a href="https://twitter.com/intent/tweet?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/evaluating_adk_agent.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/5a/X_icon_2.svg" alt="X logo">
</a>

<a href="https://reddit.com/submit?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/evaluating_adk_agent.ipynb" target="_blank">
  <img width="20px" src="https://redditinc.com/hubfs/Reddit%20Inc/Brand/Reddit_Logo.png" alt="Reddit logo">
</a>

<a href="https://www.facebook.com/sharer/sharer.php?u=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/evaluating_adk_agent.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/51/Facebook_f_logo_%282019%29.svg" alt="Facebook logo">
</a>

| Author(s) |
| --- |
| [Ivan Nardini](https://github.com/inardini) |

## Overview

Agent Development Kit (ADK in short) is a flexible and modular open source framework for developing and deploying AI agents. While ADK has its own evaluation module, using Vertex AI Gen AI Evaluation provides a toolkit of quality controlled and explainable methods and metrics to evaluate any generative model or application, including agents, and benchmark the evaluation results against your own judgment, using your own evaluation criteria.

This tutorial shows how to evaluate an ADK agent using Vertex AI Gen AI Evaluation for agent evaluation.

The steps performed include:

* Build local agent using ADK
* Prepare Agent Evaluation dataset
* Single tool usage evaluation
* Trajectory evaluation
* Response evaluation

## Get started

### Install Google Gen AI SDK and other required packages


In [None]:
%pip install --upgrade --quiet 'google-adk'

Note: you may need to restart the kernel to use updated packages.


In [None]:
%pip install ruamel.yaml --no-deps
%pip install --upgrade --quiet 'google-cloud-aiplatform[evaluation]'

### Authenticate your notebook environment (Colab only)

If you're running this notebook on Google Colab, run the cell below to authenticate your environment.

In [None]:
import sys

if "google.colab" in sys.modules:
    from google.colab import auth

    auth.authenticate_user()

### Set Google Cloud project information

To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).

Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

In [1]:
# Use the environment variable if the user doesn't provide Project ID.
import os

import vertexai

PROJECT_ID = "tough-entry-462814-h3"  # @param {type: "string", placeholder: "[your-project-id]", isTemplate: true}
# if not PROJECT_ID or PROJECT_ID == "tough-entry-462814-h3":
#    PROJECT_ID = str(os.environ.get("GOOGLE_CLOUD_PROJECT"))

LOCATION = os.environ.get("GOOGLE_CLOUD_REGION", "us-central1")

BUCKET_NAME = "qliu3-asp-demo"  # @param {type: "string", placeholder: "[your-bucket-name]", isTemplate: true}
BUCKET_URI = f"gs://{BUCKET_NAME}"

!gsutil mb -l {LOCATION} {BUCKET_URI}

os.environ["GOOGLE_CLOUD_PROJECT"] = PROJECT_ID
os.environ["GOOGLE_CLOUD_LOCATION"] = LOCATION
os.environ["GOOGLE_GENAI_USE_VERTEXAI"] = "True"

EXPERIMENT_NAME = "evaluate-adk-agent"  # @param {type:"string"}

vertexai.init(project=PROJECT_ID, location=LOCATION, experiment=EXPERIMENT_NAME)

/bin/bash: gsutil: command not found


E0000 00:00:1758551212.514830 15380293 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1758551214.248683 15380293 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1758551215.579774 15380293 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1758551217.115221 15380293 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1758551217.117087 15380293 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1758551218.479715 15380293 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1758551218.481462 15380293 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1758551218.485274 15380293 al

## Import libraries

Import tutorial libraries.

In [2]:
import json
import asyncio

# General
import random
import string
from typing import Any

from IPython.display import HTML, Markdown, display
from google.adk.agents import Agent

# Build agent with adk
from google.adk.events import Event
from google.adk.runners import Runner
from google.adk.sessions import InMemorySessionService

# Evaluate agent
from google.cloud import aiplatform
from google.genai import types
import pandas as pd
import plotly.graph_objects as go
from vertexai.preview.evaluation import EvalTask
from vertexai.preview.evaluation.metrics import (
    PointwiseMetric,
    PointwiseMetricPromptTemplate,
    TrajectorySingleToolUse,
)

## Define helper functions

Initiate a set of helper functions to print tutorial results.

In [6]:
def get_id(length: int = 8) -> str:
    """Generate a uuid of a specified length (default=8)."""
    return "".join(random.choices(string.ascii_lowercase + string.digits, k=length))


def parse_adk_output_to_dictionary(events: list[Event], *, as_json: bool = False):
    """
    Parse ADK event output into a structured dictionary format,
    with the predicted trajectory dumped as a JSON string.

    """

    final_response = ""
    trajectory = []

    for event in events:
        if not getattr(event, "content", None) or not getattr(event.content, "parts", None):
            continue
        for part in event.content.parts:
            if getattr(part, "function_call", None):
                info = {
                    "tool_name": part.function_call.name,
                    "tool_input": dict(part.function_call.args),
                }
                if info not in trajectory:
                    trajectory.append(info)
            if event.content.role == "model" and getattr(part, "text", None):
                final_response = part.text.strip()

    if as_json:
        trajectory_out = json.dumps(trajectory)
    else:
        trajectory_out = trajectory

    return {"response": final_response, "predicted_trajectory": trajectory_out}


def format_output_as_markdown(output: dict) -> str:
    """Convert the output dictionary to a formatted markdown string."""
    markdown = "### AI Response\n" + output["response"] + "\n\n"
    if output["predicted_trajectory"]:
        markdown += "### Function Calls\n"
        for call in output["predicted_trajectory"]:
            markdown += f"- **Function**: `{call['tool_name']}`\n"
            markdown += "  - **Arguments**\n"
            for key, value in call["tool_input"].items():
                markdown += f"    - `{key}`: `{value}`\n"
    return markdown


def display_eval_report(eval_result: pd.DataFrame) -> None:
    """Display the evaluation results."""
    display(Markdown("### Summary Metrics"))
    display(
        pd.DataFrame(
            eval_result.summary_metrics.items(), columns=["metric", "value"]
        )
    )
    if getattr(eval_result, "metrics_table", None) is not None:
        display(Markdown("### Row‑wise Metrics"))
        display(eval_result.metrics_table.head())


def display_drilldown(row: pd.Series) -> None:
    """Displays a drill-down view for trajectory data within a row."""

    style = "white-space: pre-wrap; width: 800px; overflow-x: auto;"

    if not (
        isinstance(row["predicted_trajectory"], list)
        and isinstance(row["reference_trajectory"], list)
    ):
        return

    for predicted_trajectory, reference_trajectory in zip(
        row["predicted_trajectory"], row["reference_trajectory"]
    ):
        display(
            HTML(
                f"<h3>Tool Names:</h3><div style='{style}'>{predicted_trajectory['tool_name'], reference_trajectory['tool_name']}</div>"
            )
        )

        if not (
            isinstance(predicted_trajectory.get("tool_input"), dict)
            and isinstance(reference_trajectory.get("tool_input"), dict)
        ):
            continue

        for tool_input_key in predicted_trajectory["tool_input"]:
            print("Tool Input Key: ", tool_input_key)

            if tool_input_key in reference_trajectory["tool_input"]:
                print(
                    "Tool Values: ",
                    predicted_trajectory["tool_input"][tool_input_key],
                    reference_trajectory["tool_input"][tool_input_key],
                )
            else:
                print(
                    "Tool Values: ",
                    predicted_trajectory["tool_input"][tool_input_key],
                    "N/A",
                )
        print("\n")
    display(HTML("<hr>"))


def display_dataframe_rows(
    df: pd.DataFrame,
    columns: list[str] | None = None,
    num_rows: int = 3,
    display_drilldown: bool = False,
) -> None:
    """Displays a subset of rows from a DataFrame, optionally including a drill-down view."""

    if columns:
        df = df[columns]

    base_style = "font-family: monospace; font-size: 14px; white-space: pre-wrap; width: auto; overflow-x: auto;"
    header_style = base_style + "font-weight: bold;"

    for _, row in df.head(num_rows).iterrows():
        for column in df.columns:
            display(
                HTML(
                    f"<span style='{header_style}'>{column.replace('_', ' ').title()}: </span>"
                )
            )
            display(HTML(f"<span style='{base_style}'>{row[column]}</span><br>"))

        display(HTML("<hr>"))

        if (
            display_drilldown
            and "predicted_trajectory" in df.columns
            and "reference_trajectory" in df.columns
        ):
            display_drilldown(row)


def plot_bar_plot(
    eval_result: pd.DataFrame, title: str, metrics: list[str] = None
) -> None:
    fig = go.Figure()
    data = []

    summary_metrics = eval_result.summary_metrics
    if metrics:
        summary_metrics = {
            k: summary_metrics[k]
            for k, v in summary_metrics.items()
            if any(selected_metric in k for selected_metric in metrics)
        }

    data.append(
        go.Bar(
            x=list(summary_metrics.keys()),
            y=list(summary_metrics.values()),
            name=title,
        )
    )

    fig = go.Figure(data=data)

    # Change the bar mode
    fig.update_layout(barmode="group")
    fig.show()


def display_radar_plot(eval_results, title: str, metrics=None):
    """Plot the radar plot."""
    fig = go.Figure()
    summary_metrics = eval_results.summary_metrics
    if metrics:
        summary_metrics = {
            k: summary_metrics[k]
            for k, v in summary_metrics.items()
            if any(selected_metric in k for selected_metric in metrics)
        }

    min_val = min(summary_metrics.values())
    max_val = max(summary_metrics.values())

    fig.add_trace(
        go.Scatterpolar(
            r=list(summary_metrics.values()),
            theta=list(summary_metrics.keys()),
            fill="toself",
            name=title,
        )
    )
    fig.update_layout(
        title=title,
        polar=dict(radialaxis=dict(visible=True, range=[min_val, max_val])),
        showlegend=True,
    )
    fig.show()

## Build ADK agent

Build your application using ADK, including the Gemini model and custom tools that you define.

### Set tools

To start, set the tools that a customer support agent needs to do their job.

In [7]:
def get_product_details(product_name: str):
    """Gathers basic details about a product."""
    details = {
        "smartphone": "A cutting-edge smartphone with advanced camera features and lightning-fast processing.",
        "usb charger": "A super fast and light usb charger",
        "shoes": "High-performance running shoes designed for comfort, support, and speed.",
        "headphones": "Wireless headphones with advanced noise cancellation technology for immersive audio.",
        "speaker": "A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.",
    }
    return details.get(product_name, "Product details not found.")


def get_product_price(product_name: str):
    """Gathers price about a product."""
    details = {
        "smartphone": 500,
        "usb charger": 10,
        "shoes": 100,
        "headphones": 50,
        "speaker": 80,
    }
    return details.get(product_name, "Product price not found.")

### Set the model

Choose which Gemini AI model your agent will use. If you're curious about Gemini and its different capabilities, take a look at [the official documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models) for more details.

In [8]:
model = "gemini-2.0-flash"

### Assemble the agent

The Vertex AI Gen AI Evaluation works directly with 'Queryable' agents, and also lets you add your own custom functions with a specific structure (signature).

In this case, you assemble the agent using a custom function. The function triggers the agent for a given input and parse the agent outcome to extract the response and called tools.

In [9]:
async def agent_parsed_outcome(query):
   app_name = "product_research_app"
   user_id = "user_one"
   session_id = "session_one"
   
   product_research_agent = Agent(
       name="ProductResearchAgent",
       model=model,
       description="An agent that performs product research.",
       instruction=f"""
       Analyze this user request: '{query}'.
       If the request is about price, use get_product_price tool.
       Otherwise, use get_product_details tool to get product information.
       """,
       tools=[get_product_details, get_product_price],
   )

   session_service = InMemorySessionService()
   await session_service.create_session(
       app_name=app_name, user_id=user_id, session_id=session_id
   )

   runner = Runner(
       agent=product_research_agent, app_name=app_name, session_service=session_service
   )

   content = types.Content(role="user", parts=[types.Part(text=query)])
   events = [event async for event in runner.run_async(user_id=user_id, session_id=session_id, new_message=content)]
   
   return parse_adk_output_to_dictionary(events)


In [10]:
# --- Sync wrapper for Vertex‑AI evaluation
def agent_parsed_outcome_sync(prompt: str):
    result = asyncio.run(agent_parsed_outcome(prompt))
    result["predicted_trajectory"] = json.dumps(result["predicted_trajectory"])
    return result

### Test the agent

Query your agent.

In [11]:
response = await agent_parsed_outcome(query="Get product details for shoes")
display(Markdown(format_output_as_markdown(response)))



### AI Response
OK. I have gathered product details for shoes. The details are: High-performance running shoes designed for comfort, support, and speed.

### Function Calls
- **Function**: `get_product_details`
  - **Arguments**
    - `product_name`: `shoes`


In [12]:
response = await agent_parsed_outcome(query="Get product price for shoes")
display(Markdown(format_output_as_markdown(response)))



### AI Response
The price for shoes is $100.

### Function Calls
- **Function**: `get_product_price`
  - **Arguments**
    - `product_name`: `shoes`


## Evaluating a ADK agent with Vertex AI Gen AI Evaluation

When working with AI agents, it's important to keep track of their performance and how well they're working. You can look at this in two main ways: **monitoring** and **observability**.

Monitoring focuses on how well your agent is performing specific tasks:

* **Single Tool Selection**: Is the agent choosing the right tools for the job?

* **Multiple Tool Selection (or Trajectory)**: Is the agent making logical choices in the order it uses tools?

* **Response generation**: Is the agent's output good, and does it make sense based on the tools it used?

Observability is about understanding the overall health of the agent:

* **Latency**: How long does it take the agent to respond?

* **Failure Rate**: How often does the agent fail to produce a response?

Vertex AI Gen AI Evaluation service helps you to assess all of these aspects both while you are prototyping the agent or after you deploy it in production. It provides [pre-built evaluation criteria and metrics](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) so you can see exactly how your agents are doing and identify areas for improvement.

### Prepare Agent Evaluation dataset

To evaluate your AI agent using the Vertex AI Gen AI Evaluation service, you need a specific dataset depending on what aspects you want to evaluate of your agent.  

This dataset should include the prompts given to the agent. It can also contain the ideal or expected response (ground truth) and the intended sequence of tool calls the agent should take (reference trajectory) representing the sequence of tools you expect agent calls for each given prompt.

> Optionally, you can provide both generated responses and predicted trajectory (**Bring-Your-Own-Dataset scenario**).

Below you have an example of dataset you might have with a customer support agent with user prompt and the reference trajectory.

In [13]:
eval_data = {
    "prompt": [
        "Get price for smartphone",
        "Get product details and price for headphones",
        "Get details for usb charger",
        "Get product details and price for shoes",
        "Get product details for speaker?",
    ],
    "predicted_trajectory": [
        [
            {
                "tool_name": "get_product_price",
                "tool_input": {"product_name": "smartphone"},
            }
        ],
        [
            {
                "tool_name": "get_product_details",
                "tool_input": {"product_name": "headphones"},
            },
            {
                "tool_name": "get_product_price",
                "tool_input": {"product_name": "headphones"},
            },
        ],
        [
            {
                "tool_name": "get_product_details",
                "tool_input": {"product_name": "usb charger"},
            }
        ],
        [
            {
                "tool_name": "get_product_details",
                "tool_input": {"product_name": "shoes"},
            },
            {"tool_name": "get_product_price", "tool_input": {"product_name": "shoes"}},
        ],
        [
            {
                "tool_name": "get_product_details",
                "tool_input": {"product_name": "speaker"},
            }
        ],
    ],
}

eval_sample_dataset = pd.DataFrame(eval_data)

Print some samples from the dataset.

In [14]:
display_dataframe_rows(eval_sample_dataset, num_rows=3)

### Single tool usage evaluation

After you've set your AI agent and the evaluation dataset, you start evaluating if the agent is choosing the correct single tool for a given task.


#### Set single tool usage metrics

The `trajectory_single_tool_use` metric in Vertex AI Gen AI Evaluation gives you a quick way to evaluate whether your agent is using the tool you expect it to use, regardless of any specific tool order. It's a basic but useful way to start evaluating if the right tool was used at some point during the agent's process.

To use the `trajectory_single_tool_use` metric, you need to set what tool should have been used for a particular user's request. For example, if a user asks to "send an email", you might expect the agent to use an "send_email" tool, and you'd specify that tool's name when using this metric.


In [15]:
single_tool_usage_metrics = [TrajectorySingleToolUse(tool_name="get_product_price")]

#### Run an evaluation task

To run the evaluation, you initiate an `EvalTask` using the pre-defined dataset (`eval_sample_dataset`) and metrics (`single_tool_usage_metrics` in this case) within an experiment. Then, you run the evaluation using agent_parsed_outcome function and assigns a unique identifier to this specific evaluation run, storing and visualizing the evaluation results.


In [16]:
EXPERIMENT_RUN = f"single-metric-eval-{get_id()}"

single_tool_call_eval_task = EvalTask(
    dataset=eval_sample_dataset,
    metrics=single_tool_usage_metrics,
    experiment=EXPERIMENT_NAME,
    output_uri_prefix=BUCKET_URI + "/single-metric-eval",
)

single_tool_call_eval_result = single_tool_call_eval_task.evaluate(
    runnable=agent_parsed_outcome_sync, experiment_run_name=EXPERIMENT_RUN
)

display_eval_report(single_tool_call_eval_result)

E0000 00:00:1758551291.663509 15380293 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1758551293.072448 15380293 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.


E0000 00:00:1758551294.634264 15380293 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1758551296.167569 15380293 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1758551296.169257 15380293 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1758551297.446439 15380293 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1758551298.710640 15380293 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1758551298.712360 15380293 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1758551300.160598 15380293 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1758551300.162075 15380293 al

Associating projects/552550918814/locations/us-central1/metadataStores/default/contexts/evaluate-adk-agent-single-metric-eval-n7pppi7g to Experiment: evaluate-adk-agent


E0000 00:00:1758551309.069156 15380293 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.


Logging Eval experiment evaluation metadata: {'output_file': 'gs://qliu3-asp-demo/single-metric-eval/eval_results_2025-09-22-22-28-11-564e2.csv'}


E0000 00:00:1758551310.607921 15380293 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1758551311.880724 15380293 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
 20%|██        | 1/5 [00:03<00:14,  3.54s/it]

When a `runnable` is provided, trajectory is generated dynamically by the runnable, so the pre-existing `predicted_trajectory` column provided in the evaluation dataset is not used.


 40%|████      | 2/5 [00:03<00:04,  1.62s/it]

When a `runnable` is provided, trajectory is generated dynamically by the runnable, so the pre-existing `predicted_trajectory` column provided in the evaluation dataset is not used.


 80%|████████  | 4/5 [00:04<00:00,  1.15it/s]

When a `runnable` is provided, trajectory is generated dynamically by the runnable, so the pre-existing `predicted_trajectory` column provided in the evaluation dataset is not used.
When a `runnable` is provided, trajectory is generated dynamically by the runnable, so the pre-existing `predicted_trajectory` column provided in the evaluation dataset is not used.
When a `runnable` is provided, trajectory is generated dynamically by the runnable, so the pre-existing `predicted_trajectory` column provided in the evaluation dataset is not used.


100%|██████████| 5/5 [00:04<00:00,  1.06it/s]

All 5 responses are successfully generated from the runnable.
Computing metrics with a total of 5 Vertex Gen AI Evaluation Service API requests.



100%|██████████| 5/5 [00:01<00:00,  3.31it/s]

All 5 metric requests are successfully computed.
Evaluation Took:1.5199813528452069 seconds



E0000 00:00:1758551318.159812 15380293 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1758551319.511923 15380293 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1758551321.049993 15380293 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1758551322.267911 15380293 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.


### Summary Metrics

Unnamed: 0,metric,value
0,row_count,5.0
1,trajectory_single_tool_use/mean,0.6
2,trajectory_single_tool_use/std,0.547723
3,latency_in_seconds/mean,4.148279
4,latency_in_seconds/std,0.539222
5,failure/mean,0.0
6,failure/std,0.0


### Row‑wise Metrics

Unnamed: 0,prompt,predicted_trajectory,response,latency_in_seconds,failure,trajectory_single_tool_use/score
0,Get price for smartphone,"[{""tool_name"": ""get_product_price"", ""tool_inpu...",The price for smartphone is $500.,3.534606,0,1.0
1,Get product details and price for headphones,"[{""tool_name"": ""get_product_details"", ""tool_in...",OK. I have gathered the product details and pr...,3.815661,0,1.0
2,Get details for usb charger,"[{""tool_name"": ""get_product_details"", ""tool_in...",OK. I have the details for the usb charger. It...,4.706401,0,0.0
3,Get product details and price for shoes,"[{""tool_name"": ""get_product_details"", ""tool_in...",OK. shoes: High-performance running shoes desi...,3.96225,0,1.0
4,Get product details for speaker?,"[{""tool_name"": ""get_product_details"", ""tool_in...",OK. I have retrieved product details for the s...,4.722479,0,0.0


#### Visualize evaluation results

Use some helper functions to visualize a sample of evaluation result.

In [17]:
display_dataframe_rows(single_tool_call_eval_result.metrics_table, num_rows=3)

### Trajectory Evaluation

After evaluating the agent's ability to select the single most appropriate tool for a given task, you generalize the evaluation by analyzing the tool sequence choices with respect to the user input (trajectory). This assesses whether the agent not only chooses the right tools but also utilizes them in a rational and effective order.

#### Set trajectory metrics

To evaluate agent's trajectory, Vertex AI Gen AI Evaluation provides several ground-truth based metrics:

* `trajectory_exact_match`: identical trajectories (same actions, same order)

* `trajectory_in_order_match`: reference actions present in predicted trajectory, in order (extras allowed)

* `trajectory_any_order_match`: all reference actions present in predicted trajectory (order, extras don't matter).

* `trajectory_precision`: proportion of predicted actions present in reference

* `trajectory_recall`: proportion of reference actions present in predicted.  

All metrics score 0 or 1, except `trajectory_precision` and `trajectory_recall` which range from 0 to 1.

In [18]:
trajectory_metrics = [
    "trajectory_exact_match",
    "trajectory_in_order_match",
    "trajectory_any_order_match",
    "trajectory_precision",
    "trajectory_recall",
]

#### Run an evaluation task

Submit an evaluation by running `evaluate` method of the new `EvalTask`.

In [19]:
EXPERIMENT_RUN = f"trajectory-{get_id()}"

trajectory_eval_task = EvalTask(
    dataset=eval_sample_dataset,
    metrics=trajectory_metrics,
    experiment=EXPERIMENT_NAME,
    output_uri_prefix=BUCKET_URI + "/multiple-metric-eval",
)

trajectory_eval_result = trajectory_eval_task.evaluate(
    runnable=agent_parsed_outcome_sync, experiment_run_name=EXPERIMENT_RUN
)

display_eval_report(trajectory_eval_result)

E0000 00:00:1758551343.804196 15380293 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1758551345.076459 15380293 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.


E0000 00:00:1758551346.549567 15380293 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1758551347.857578 15380293 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1758551347.859522 15380293 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1758551349.312366 15380293 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1758551350.847861 15380293 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1758551350.849837 15380293 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1758551352.383824 15380293 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1758551352.385739 15380293 al

Associating projects/552550918814/locations/us-central1/metadataStores/default/contexts/evaluate-adk-agent-trajectory-0x8ue9hq to Experiment: evaluate-adk-agent


E0000 00:00:1758551360.791476 15380293 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.


Logging Eval experiment evaluation metadata: {'output_file': 'gs://qliu3-asp-demo/multiple-metric-eval/eval_results_2025-09-22-22-29-03-779b6.csv'}


E0000 00:00:1758551362.090726 15380293 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1758551363.442720 15380293 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
100%|██████████| 5/5 [00:04<00:00,  1.24s/it]

When a `runnable` is provided, trajectory is generated dynamically by the runnable, so the pre-existing `predicted_trajectory` column provided in the evaluation dataset is not used.
When a `runnable` is provided, trajectory is generated dynamically by the runnable, so the pre-existing `predicted_trajectory` column provided in the evaluation dataset is not used.
When a `runnable` is provided, trajectory is generated dynamically by the runnable, so the pre-existing `predicted_trajectory` column provided in the evaluation dataset is not used.
When a `runnable` is provided, trajectory is generated dynamically by the runnable, so the pre-existing `predicted_trajectory` column provided in the evaluation dataset is not used.
When a `runnable` is provided, trajectory is generated dynamically by the runnable, so the pre-existing `predicted_trajectory` column provided in the evaluation dataset is not used.


100%|██████████| 5/5 [00:04<00:00,  1.08it/s]

All 5 responses are successfully generated from the runnable.
Computing metrics with a total of 25 Vertex Gen AI Evaluation Service API requests.



100%|██████████| 25/25 [00:02<00:00,  8.93it/s]

25 errors encountered during evaluation. Continue to compute summary metrics for the rest of the dataset.
Error encountered for metric trajectory_exact_match at dataset index 0: Error: 400 List of found errors:	1.Field: trajectory_exact_match_input.instances[0].reference_trajectory; Message: Required field is not set.	 [field_violations {
  field: "trajectory_exact_match_input.instances[0].reference_trajectory"
  description: "Required field is not set."
}
]
Error encountered for metric trajectory_exact_match at dataset index 1: Error: 400 List of found errors:	1.Field: trajectory_exact_match_input.instances[0].reference_trajectory; Message: Required field is not set.	 [field_violations {
  field: "trajectory_exact_match_input.instances[0].reference_trajectory"
  description: "Required field is not set."
}
]
Error encountered for metric trajectory_exact_match at dataset index 2: Error: 400 List of found errors:	1.Field: trajectory_exact_match_input.instances[0].reference_trajectory; Me


E0000 00:00:1758551370.899785 15380293 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1758551372.194886 15380293 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1758551373.489686 15380293 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1758551374.705046 15380293 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.


### Summary Metrics

Unnamed: 0,metric,value
0,row_count,5.0
1,trajectory_exact_match/mean,
2,trajectory_exact_match/std,
3,trajectory_in_order_match/mean,
4,trajectory_in_order_match/std,
5,trajectory_any_order_match/mean,
6,trajectory_any_order_match/std,
7,trajectory_precision/mean,
8,trajectory_precision/std,
9,trajectory_recall/mean,


### Row‑wise Metrics

Unnamed: 0,prompt,predicted_trajectory,response,latency_in_seconds,failure,trajectory_exact_match/score,trajectory_in_order_match/score,trajectory_any_order_match/score,trajectory_precision/score,trajectory_recall/score
0,Get price for smartphone,"[{""tool_name"": ""get_product_price"", ""tool_inpu...",The price for smartphone is $500.,4.610552,0,,,,,
1,Get product details and price for headphones,"[{""tool_name"": ""get_product_details"", ""tool_in...",OK. I have gathered the product details and pr...,4.087599,0,,,,,
2,Get details for usb charger,"[{""tool_name"": ""get_product_details"", ""tool_in...",OK. I have the details for usb charger. It is ...,4.009379,0,,,,,
3,Get product details and price for shoes,"[{""tool_name"": ""get_product_details"", ""tool_in...",OK. I have retrieved the product details and p...,4.003705,0,,,,,
4,Get product details for speaker?,"[{""tool_name"": ""get_product_details"", ""tool_in...",OK. I have retrieved product details for the s...,3.998338,0,,,,,


#### Visualize evaluation results

Print and visualize a sample of evaluation results.

In [20]:
display_dataframe_rows(trajectory_eval_result.metrics_table, num_rows=3)

In [21]:
plot_bar_plot(
    trajectory_eval_result,
    title="Trajectory Metrics",
    metrics=[f"{metric}/mean" for metric in trajectory_metrics],
)

### Evaluate final response

Similar to model evaluation, you can evaluate the final response of the agent using Vertex AI Gen AI Evaluation.

#### Set response metrics

After agent inference, Vertex AI Gen AI Evaluation provides several metrics to evaluate generated responses. You can use computation-based metrics to compare the response to a reference (if needed) and using existing or custom model-based metrics to determine the quality of the final response.

Check out the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval) to learn more.


In [22]:
response_metrics = ["safety", "coherence"]

#### Run an evaluation task

To evaluate agent's generated responses, use the `evaluate` method of the EvalTask class.

In [23]:
EXPERIMENT_RUN = f"response-{get_id()}"

response_eval_task = EvalTask(
    dataset=eval_sample_dataset,
    metrics=response_metrics,
    experiment=EXPERIMENT_NAME,
    output_uri_prefix=BUCKET_URI + "/response-metric-eval",
)

response_eval_result = response_eval_task.evaluate(
    runnable=agent_parsed_outcome_sync, experiment_run_name=EXPERIMENT_RUN
)

display_eval_report(response_eval_result)

E0000 00:00:1758551430.301118 15380293 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1758551431.641583 15380293 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.


E0000 00:00:1758551433.180446 15380293 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1758551434.716199 15380293 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1758551434.717753 15380293 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1758551435.978135 15380293 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1758551437.718726 15380293 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1758551437.721085 15380293 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1758551439.325976 15380293 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1758551439.327666 15380293 al

Associating projects/552550918814/locations/us-central1/metadataStores/default/contexts/evaluate-adk-agent-response-j5bdg6m3 to Experiment: evaluate-adk-agent


E0000 00:00:1758551448.238053 15380293 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.


Logging Eval experiment evaluation metadata: {'output_file': 'gs://qliu3-asp-demo/response-metric-eval/eval_results_2025-09-22-22-30-30-4f329.csv'}


E0000 00:00:1758551449.504012 15380293 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1758551450.788642 15380293 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
 40%|████      | 2/5 [00:06<00:08,  2.68s/it]

When a `runnable` is provided, trajectory is generated dynamically by the runnable, so the pre-existing `predicted_trajectory` column provided in the evaluation dataset is not used.


 60%|██████    | 3/5 [00:06<00:03,  1.51s/it]

When a `runnable` is provided, trajectory is generated dynamically by the runnable, so the pre-existing `predicted_trajectory` column provided in the evaluation dataset is not used.


 80%|████████  | 4/5 [00:07<00:01,  1.28s/it]

When a `runnable` is provided, trajectory is generated dynamically by the runnable, so the pre-existing `predicted_trajectory` column provided in the evaluation dataset is not used.
When a `runnable` is provided, trajectory is generated dynamically by the runnable, so the pre-existing `predicted_trajectory` column provided in the evaluation dataset is not used.
When a `runnable` is provided, trajectory is generated dynamically by the runnable, so the pre-existing `predicted_trajectory` column provided in the evaluation dataset is not used.


100%|██████████| 5/5 [00:07<00:00,  1.41s/it]

All 5 responses are successfully generated from the runnable.
Computing metrics with a total of 10 Vertex Gen AI Evaluation Service API requests.



100%|██████████| 10/10 [00:22<00:00,  2.23s/it]

All 10 metric requests are successfully computed.
Evaluation Took:22.33367842505686 seconds



E0000 00:00:1758551480.199584 15380293 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1758551481.726642 15380293 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1758551483.024731 15380293 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1758551484.492722 15380293 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.


### Summary Metrics

Unnamed: 0,metric,value
0,row_count,5.0
1,safety/mean,1.0
2,safety/std,0.0
3,coherence/mean,4.6
4,coherence/std,0.894427
5,latency_in_seconds/mean,6.220488
6,latency_in_seconds/std,0.911208
7,failure/mean,0.0
8,failure/std,0.0


### Row‑wise Metrics

Unnamed: 0,prompt,predicted_trajectory,response,latency_in_seconds,failure,safety/explanation,safety/score,coherence/explanation,coherence/score
0,Get price for smartphone,"[{""tool_name"": ""get_product_price"", ""tool_inpu...",The price for smartphone is $500.,6.006195,0,The response is a neutral statement providing ...,1.0,The response directly and clearly answers the ...,5.0
1,Get product details and price for headphones,"[{""tool_name"": ""get_product_details"", ""tool_in...",OK. I found product details are: Wireless head...,6.130865,0,The response is safe as it does not contain an...,1.0,The response perfectly addresses the prompt by...,5.0
2,Get details for usb charger,"[{""tool_name"": ""get_product_details"", ""tool_in...",OK. I have the details for the usb charger. It...,7.055513,0,The response is completely safe and free from ...,1.0,The response demonstrates a logical flow and c...,3.0
3,Get product details and price for shoes,"[{""tool_name"": ""get_product_details"", ""tool_in...",OK. shoes are high-performance running shoes d...,7.056963,0,The response is safe as it does not contain an...,1.0,"The response is a single, concise sentence tha...",5.0
4,Get product details for speaker?,"[{""tool_name"": ""get_product_details"", ""tool_in...",OK. I have the following product details for s...,4.852903,0,The response is safe as it contains no hate sp...,1.0,The response is completely coherent; it direct...,5.0


#### Visualize evaluation results


Print new evaluation result sample.

In [24]:
display_dataframe_rows(response_eval_result.metrics_table, num_rows=3)

### Evaluate generated response conditioned by tool choosing

When evaluating AI agents that interact with environments, standard text generation metrics like coherence may not be sufficient. This is because these metrics primarily focus on text structure, while agent responses should be assessed based on their effectiveness within the environment.

Instead, use custom metrics that assess whether the agent's response logically follows from its tools choices like the one you have in this section.

#### Define a custom metric

According to the [documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval#model-based-metrics), you can define a prompt template for evaluating whether an AI agent's response follows logically from its actions by setting up criteria and a rating system for this evaluation.

Define a `criteria` to set the evaluation guidelines and a `pointwise_rating_rubric` to provide a scoring system (1 or 0). Then use a `PointwiseMetricPromptTemplate` to create the template using these components.


In [25]:
criteria = {
    "Follows trajectory": (
        "Evaluate whether the agent's response logically follows from the "
        "sequence of actions it took. Consider these sub-points:\n"
        "  - Does the response reflect the information gathered during the trajectory?\n"
        "  - Is the response consistent with the goals and constraints of the task?\n"
        "  - Are there any unexpected or illogical jumps in reasoning?\n"
        "Provide specific examples from the trajectory and response to support your evaluation."
    )
}

pointwise_rating_rubric = {
    "1": "Follows trajectory",
    "0": "Does not follow trajectory",
}

response_follows_trajectory_prompt_template = PointwiseMetricPromptTemplate(
    criteria=criteria,
    rating_rubric=pointwise_rating_rubric,
    input_variables=["prompt", "predicted_trajectory"],
)

Print the prompt_data of this template containing the combined criteria and rubric information ready for use in an evaluation.

In [26]:
print(response_follows_trajectory_prompt_template.prompt_data)

# Instruction
You are an expert evaluator. Your task is to evaluate the quality of the responses generated by AI models. We will provide you with the user prompt and an AI-generated responses.
You should first read the user input carefully for analyzing the task, and then evaluate the quality of the responses based on the Criteria provided in the Evaluation section below.
You will assign the response a rating following the Rating Rubric and Evaluation Steps. Give step by step explanations for your rating, and only choose ratings from the Rating Rubric.


# Evaluation
## Criteria
Follows trajectory: Evaluate whether the agent's response logically follows from the sequence of actions it took. Consider these sub-points:
  - Does the response reflect the information gathered during the trajectory?
  - Is the response consistent with the goals and constraints of the task?
  - Are there any unexpected or illogical jumps in reasoning?
Provide specific examples from the trajectory and response

After you define the evaluation prompt template, set up the associated metric to evaluate how well a response follows a specific trajectory. The `PointwiseMetric` creates a metric where `response_follows_trajectory` is the metric's name and `response_follows_trajectory_prompt_template` provides instructions or context for evaluation you set up before.


In [27]:
response_follows_trajectory_metric = PointwiseMetric(
    metric="response_follows_trajectory",
    metric_prompt_template=response_follows_trajectory_prompt_template,
)

#### Set response metrics

Set new generated response evaluation metrics by including the custom metric.


In [28]:
response_tool_metrics = [
    "trajectory_exact_match",
    "trajectory_in_order_match",
    "safety",
    response_follows_trajectory_metric,
]

#### Run an evaluation task

Run a new agent's evaluation.

In [32]:
EXPERIMENT_RUN = f"response-over-tools-{get_id()}"

response_eval_tool_task = EvalTask(
    dataset=eval_sample_dataset,
    metrics=response_tool_metrics,
    experiment=EXPERIMENT_NAME,
    output_uri_prefix=BUCKET_URI + "/reasoning-metric-eval",
)

response_eval_tool_result = response_eval_tool_task.evaluate(
    # Uncomment the line below if you are providing the agent with an unparsed dataset
    #runnable=agent_parsed_outcome_sync, 
    experiment_run_name=EXPERIMENT_RUN
)

display_eval_report(response_eval_tool_result)

E0000 00:00:1758551791.836682 15380293 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1758551793.328432 15380293 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.


E0000 00:00:1758551794.583294 15380293 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1758551795.950004 15380293 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1758551795.951768 15380293 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1758551797.194487 15380293 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1758551798.551466 15380293 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1758551798.552897 15380293 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1758551800.086555 15380293 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1758551800.088689 15380293 al

Associating projects/552550918814/locations/us-central1/metadataStores/default/contexts/evaluate-adk-agent-response-over-tools-thumgwrh to Experiment: evaluate-adk-agent


E0000 00:00:1758551808.160304 15380293 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.


Logging Eval experiment evaluation metadata: {'output_file': 'gs://qliu3-asp-demo/reasoning-metric-eval/eval_results_2025-09-22-22-36-31-c37f2.csv'}


E0000 00:00:1758551809.416035 15380293 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1758551810.735698 15380293 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1758551810.737055 15380293 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.


ValueError: Cannot find the `response` column in the evaluation dataset to fill the metric prompt template for `safety` metric. Please check if the column is present in the evaluation dataset, or provide a key-value pair in `metric_column_mapping` parameter of `EvalTask` to map it to a different column name. The evaluation dataset columns are ['prompt', 'predicted_trajectory'].

#### Visualize evaluation results

Visualize evaluation result sample.

In [31]:
display_dataframe_rows(response_eval_tool_result.metrics_table, num_rows=3)

NameError: name 'response_eval_tool_result' is not defined

In [46]:
plot_bar_plot(
    response_eval_tool_result,
    title="Response Metrics",
    metrics=[f"{metric}/mean" for metric in response_tool_metrics],
)

NameError: name 'response_eval_tool_result' is not defined

## Bonus: Bring-Your-Own-Dataset (BYOD) and evaluate a LangGraph agent using Vertex AI Gen AI Evaluation

In Bring Your Own Dataset (BYOD) [scenarios](https://cloud.google.com/vertex-ai/generative-ai/docs/models/evaluation-dataset), you provide both the predicted trajectory and the generated response from the agent.


### Bring your own evaluation dataset

Define the evaluation dataset with the predicted trajectory and the generated response.

In [47]:
byod_eval_data = {
    "prompt": [
        "Get price for smartphone",
        "Get product details and price for headphones",
        "Get details for usb charger",
        "Get product details and price for shoes",
        "Get product details for speaker?",
    ],
    "reference_trajectory": [
        [
            {
                "tool_name": "get_product_price",
                "tool_input": {"product_name": "smartphone"},
            }
        ],
        [
            {
                "tool_name": "get_product_details",
                "tool_input": {"product_name": "headphones"},
            },
            {
                "tool_name": "get_product_price",
                "tool_input": {"product_name": "headphones"},
            },
        ],
        [
            {
                "tool_name": "get_product_details",
                "tool_input": {"product_name": "usb charger"},
            }
        ],
        [
            {
                "tool_name": "get_product_details",
                "tool_input": {"product_name": "shoes"},
            },
            {"tool_name": "get_product_price", "tool_input": {"product_name": "shoes"}},
        ],
        [
            {
                "tool_name": "get_product_details",
                "tool_input": {"product_name": "speaker"},
            }
        ],
    ],
    "predicted_trajectory": [
        [
            {
                "tool_name": "get_product_price",
                "tool_input": {"product_name": "smartphone"},
            }
        ],
        [
            {
                "tool_name": "get_product_details",
                "tool_input": {"product_name": "headphones"},
            },
            {
                "tool_name": "get_product_price",
                "tool_input": {"product_name": "headphones"},
            },
        ],
        [
            {
                "tool_name": "get_product_details",
                "tool_input": {"product_name": "usb charger"},
            }
        ],
        [
            {
                "tool_name": "get_product_details",
                "tool_input": {"product_name": "shoes"},
            },
            {"tool_name": "get_product_price", "tool_input": {"product_name": "shoes"}},
        ],
        [
            {
                "tool_name": "get_product_details",
                "tool_input": {"product_name": "speaker"},
            }
        ],
    ],
    "response": [
        "500",
        "50",
        "A super fast and light usb charger",
        "100",
        "A voice-controlled smart speaker that plays music, sets alarms, and controls smart home devices.",
    ],
}

byod_eval_sample_dataset = pd.DataFrame(byod_eval_data)
byod_eval_sample_dataset["predicted_trajectory"] = byod_eval_sample_dataset[
    "predicted_trajectory"
].apply(json.dumps)
byod_eval_sample_dataset["reference_trajectory"] = byod_eval_sample_dataset[
    "reference_trajectory"
].apply(json.dumps)
byod_eval_sample_dataset["response"] = byod_eval_sample_dataset["response"].apply(json.dumps)

### Run an evaluation task

Run a new agent's evaluation using your own dataset and the same setting of the latest evaluation.

In [48]:
EXPERIMENT_RUN_NAME = f"response-over-tools-byod-{get_id()}"

byod_response_eval_tool_task = EvalTask(
    dataset=byod_eval_sample_dataset,
    metrics=response_tool_metrics,
    experiment=EXPERIMENT_NAME,
    output_uri_prefix=BUCKET_URI + "/byod-eval",
)

byod_response_eval_tool_result = byod_response_eval_tool_task.evaluate(
    experiment_run_name=EXPERIMENT_RUN_NAME
)

display_eval_report(byod_response_eval_tool_result)

E0000 00:00:1758550722.868727 15353925 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1758550724.369626 15353925 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.


E0000 00:00:1758550725.653714 15353925 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1758550726.981926 15353925 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1758550726.984898 15353925 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1758550728.253341 15353925 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1758550729.459209 15353925 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1758550729.461217 15353925 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1758550730.689712 15353925 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1758550730.691183 15353925 al

Associating projects/552550918814/locations/us-central1/metadataStores/default/contexts/evaluate-adk-agent-response-over-tools-byod-wcs6tw7x to Experiment: evaluate-adk-agent


E0000 00:00:1758550739.483860 15353925 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.


INFO:vertexai.preview.evaluation.eval_task:Logging Eval experiment evaluation metadata: {'output_file': 'gs://qliu3-asp-demo/byod-eval/eval_results_2025-09-22-22-18-42-1b982.csv'}
E0000 00:00:1758550740.730087 15353925 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
INFO:vertexai.preview.evaluation._evaluation:Computing metrics with a total of 20 Vertex Gen AI Evaluation Service API requests.
E0000 00:00:1758550742.028801 15353925 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
100%|██████████| 20/20 [00:21<00:00,  1.06s/it]
INFO:vertexai.preview.evaluation._evaluation:All 20 metric requests are successfully computed.
INFO:vertexai.preview.evaluation._evaluation:Evaluation Took:21.296961749903858 seconds
E0000 00:00:1758550763.327926 15353925 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1758550764.612520 15353925 alts_crede

### Summary Metrics

Unnamed: 0,metric,value
0,row_count,5.0
1,trajectory_exact_match/mean,1.0
2,trajectory_exact_match/std,0.0
3,trajectory_in_order_match/mean,1.0
4,trajectory_in_order_match/std,0.0
5,safety/mean,1.0
6,safety/std,0.0
7,response_follows_trajectory/mean,0.6
8,response_follows_trajectory/std,0.547723


### Row‑wise Metrics

Unnamed: 0,prompt,reference_trajectory,predicted_trajectory,response,trajectory_exact_match/score,trajectory_in_order_match/score,safety/explanation,safety/score,response_follows_trajectory/explanation,response_follows_trajectory/score
0,Get price for smartphone,"[{""tool_name"": ""get_product_price"", ""tool_inpu...","[{""tool_name"": ""get_product_price"", ""tool_inpu...","""500""",1.0,1.0,The response '500' is a simple numerical answe...,1.0,The trajectory indicates a tool call to `get_p...,1.0
1,Get product details and price for headphones,"[{""tool_name"": ""get_product_details"", ""tool_in...","[{""tool_name"": ""get_product_details"", ""tool_in...","""50""",1.0,1.0,The response '50' is a simple number and conta...,1.0,The trajectory indicates that both product det...,0.0
2,Get details for usb charger,"[{""tool_name"": ""get_product_details"", ""tool_in...","[{""tool_name"": ""get_product_details"", ""tool_in...","""A super fast and light usb charger""",1.0,1.0,The response is a harmless description of a US...,1.0,The AI correctly identified the need to use th...,1.0
3,Get product details and price for shoes,"[{""tool_name"": ""get_product_details"", ""tool_in...","[{""tool_name"": ""get_product_details"", ""tool_in...","""100""",1.0,1.0,The response '100' is a simple number and cont...,1.0,The response '100' only provides a price and c...,0.0
4,Get product details for speaker?,"[{""tool_name"": ""get_product_details"", ""tool_in...","[{""tool_name"": ""get_product_details"", ""tool_in...","""A voice-controlled smart speaker that plays m...",1.0,1.0,The response is a neutral and informative desc...,1.0,The response provides relevant product details...,1.0


### Visualize evaluation results

Visualize evaluation result sample.

In [49]:
display_dataframe_rows(byod_response_eval_tool_result.metrics_table, num_rows=3)

In [50]:
display_radar_plot(
    byod_response_eval_tool_result,
    title="ADK agent evaluation",
    metrics=[f"{metric}/mean" for metric in response_tool_metrics],
)

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

## Cleaning up


In [17]:
delete_experiment = True

if delete_experiment:
    try:
        experiment = aiplatform.Experiment(EXPERIMENT_NAME)
        experiment.delete(delete_backing_tensorboard_runs=True)
    except Exception as e:
        print(e)

E0000 00:00:1758551168.804220 15376864 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1758551170.114528 15376864 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1758551171.500477 15376864 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1758551173.185648 15380080 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1758551173.188413 15380081 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1758551174.722427 15380081 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1758551174.723048 15380080 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1758551174.724507 15380080 al

Deleting TensorboardRun : projects/552550918814/locations/us-central1/tensorboards/4561726409084502016/experiments/evaluate-adk-agent/runs/trajectory-jpqyv4nn
TensorboardRun deleted. . Resource name: projects/552550918814/locations/us-central1/tensorboards/4561726409084502016/experiments/evaluate-adk-agent/runs/trajectory-jpqyv4nn
Deleting TensorboardRun resource: projects/552550918814/locations/us-central1/tensorboards/4561726409084502016/experiments/evaluate-adk-agent/runs/trajectory-jpqyv4nn
Delete TensorboardRun backing LRO: projects/552550918814/locations/us-central1/tensorboards/4561726409084502016/experiments/evaluate-adk-agent/operations/8910945687563141120
TensorboardRun resource projects/552550918814/locations/us-central1/tensorboards/4561726409084502016/experiments/evaluate-adk-agent/runs/trajectory-jpqyv4nn deleted.
Deleting Artifact : projects/552550918814/locations/us-central1/metadataStores/default/artifacts/evaluate-adk-agent-trajectory-jpqyv4nn-tb-run


E0000 00:00:1758551177.990236 15376864 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.


Artifact deleted. . Resource name: projects/552550918814/locations/us-central1/metadataStores/default/artifacts/evaluate-adk-agent-trajectory-jpqyv4nn-tb-run
Deleting Artifact resource: projects/552550918814/locations/us-central1/metadataStores/default/artifacts/evaluate-adk-agent-trajectory-jpqyv4nn-tb-run
Delete Artifact backing LRO: projects/552550918814/locations/us-central1/metadataStores/default/artifacts/evaluate-adk-agent-trajectory-jpqyv4nn-tb-run/operations/5893533937224908800
Artifact resource projects/552550918814/locations/us-central1/metadataStores/default/artifacts/evaluate-adk-agent-trajectory-jpqyv4nn-tb-run deleted.
Deleting Context : projects/552550918814/locations/us-central1/metadataStores/default/contexts/evaluate-adk-agent-trajectory-jpqyv4nn


E0000 00:00:1758551179.288424 15376864 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.


Context deleted. . Resource name: projects/552550918814/locations/us-central1/metadataStores/default/contexts/evaluate-adk-agent-trajectory-jpqyv4nn
Deleting Context resource: projects/552550918814/locations/us-central1/metadataStores/default/contexts/evaluate-adk-agent-trajectory-jpqyv4nn
Delete Context backing LRO: projects/552550918814/locations/us-central1/metadataStores/default/contexts/evaluate-adk-agent-trajectory-jpqyv4nn/operations/3148308489365880832
Context resource projects/552550918814/locations/us-central1/metadataStores/default/contexts/evaluate-adk-agent-trajectory-jpqyv4nn deleted.
Deleting TensorboardRun : projects/552550918814/locations/us-central1/tensorboards/4561726409084502016/experiments/evaluate-adk-agent/runs/single-metric-eval-m1boq8ut
TensorboardRun deleted. . Resource name: projects/552550918814/locations/us-central1/tensorboards/4561726409084502016/experiments/evaluate-adk-agent/runs/single-metric-eval-m1boq8ut
Deleting TensorboardRun resource: projects/55

E0000 00:00:1758551180.918605 15376864 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.


Artifact deleted. . Resource name: projects/552550918814/locations/us-central1/metadataStores/default/artifacts/evaluate-adk-agent-single-metric-eval-m1boq8ut-tb-run
Deleting Artifact resource: projects/552550918814/locations/us-central1/metadataStores/default/artifacts/evaluate-adk-agent-single-metric-eval-m1boq8ut-tb-run
Delete Artifact backing LRO: projects/552550918814/locations/us-central1/metadataStores/default/artifacts/evaluate-adk-agent-single-metric-eval-m1boq8ut-tb-run/operations/4086464586742497280
Artifact resource projects/552550918814/locations/us-central1/metadataStores/default/artifacts/evaluate-adk-agent-single-metric-eval-m1boq8ut-tb-run deleted.
Deleting Context : projects/552550918814/locations/us-central1/metadataStores/default/contexts/evaluate-adk-agent-single-metric-eval-m1boq8ut


E0000 00:00:1758551182.217792 15376864 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.


Context deleted. . Resource name: projects/552550918814/locations/us-central1/metadataStores/default/contexts/evaluate-adk-agent-single-metric-eval-m1boq8ut
Deleting Context resource: projects/552550918814/locations/us-central1/metadataStores/default/contexts/evaluate-adk-agent-single-metric-eval-m1boq8ut
Delete Context backing LRO: projects/552550918814/locations/us-central1/metadataStores/default/contexts/evaluate-adk-agent-single-metric-eval-m1boq8ut/operations/5231223317024735232
Context resource projects/552550918814/locations/us-central1/metadataStores/default/contexts/evaluate-adk-agent-single-metric-eval-m1boq8ut deleted.
Deleting Context : projects/552550918814/locations/us-central1/metadataStores/default/contexts/evaluate-adk-agent


E0000 00:00:1758551183.634951 15376864 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.


Context deleted. . Resource name: projects/552550918814/locations/us-central1/metadataStores/default/contexts/evaluate-adk-agent
Deleting Context resource: projects/552550918814/locations/us-central1/metadataStores/default/contexts/evaluate-adk-agent
Delete Context backing LRO: projects/552550918814/locations/us-central1/metadataStores/default/contexts/evaluate-adk-agent/operations/3587690928011214848
Context resource projects/552550918814/locations/us-central1/metadataStores/default/contexts/evaluate-adk-agent deleted.


In [None]:
"not finished"