# In [None]:
import vertexai
from vertexai.evaluation import EvalTask
# Import agent evaluation metrics from the preview module
from vertexai.preview.evaluation import metrics
from vertexai.generative_models import GenerativeModel # Only if you're evaluating a pure text model too, otherwise optional
import pandas as pd
from typing import List, Dict, Any

# --- Configuration ---
# IMPORTANT: both values below are placeholders; substitute your own
# GCP project ID and region before running.
PROJECT_ID = "your-gcp-project-id"
LOCATION = "us-central1"  # region name, e.g. "us-central1"

# Report the target project/region, then initialize the Vertex AI SDK with them.
print(
    "Initializing Vertex AI with "
    f"project: {PROJECT_ID}, location: {LOCATION}"
)
vertexai.init(location=LOCATION, project=PROJECT_ID)

# --- Define Your Agent (as a Callable) ---
# This is a placeholder. In a real scenario, `callable_agent` would be a
# function or method that calls your deployed Vertex AI agent and captures
# its tool calls and final response.
# The returned dictionary MUST contain "predicted_trajectory" and, optionally,
# "response" (if you are also evaluating the text response).
# A trajectory is a list of tool-call dictionaries keyed by "tool_name" and
# "tool_input" (the Vertex AI trajectory format), e.g.
#   [{"tool_name": "my_tool", "tool_input": {"param1": "value"}}]


def my_vertex_ai_agent_simulator(user_input: str) -> Dict[str, Any]:
    """Simulate a Vertex AI agent's behavior for demonstration.

    The request is lowercased and checked for the keywords "weather", "news"
    and "reminder", in that order; the first match selects a single canned
    tool call. The tool arguments are fixed and are NOT extracted from the
    request (e.g. the weather location is always "Wildomar"). In a real
    application this function would call your deployed agent instead.

    Args:
        user_input: The user query to route.

    Returns:
        A dict with two keys:
          - "predicted_trajectory": a list holding zero or one tool call,
            each shaped as {"tool_name": ..., "tool_input": {...}}.
          - "response": the simulated final text answer.
    """
    # Lowercase once so every keyword test is case-insensitive.
    query = user_input.lower()
    predicted_trajectory: List[Dict[str, Any]] = []

    if "weather" in query:
        predicted_trajectory.append(
            {"tool_name": "get_weather", "tool_input": {"location": "Wildomar"}}
        )
        final_response = f"I am checking the weather for Wildomar based on your request: '{user_input}'."
    elif "news" in query:
        predicted_trajectory.append(
            {"tool_name": "get_news", "tool_input": {"topic": "AI"}}
        )
        final_response = f"Fetching the latest news on AI based on your request: '{user_input}'."
    elif "reminder" in query:
        predicted_trajectory.append(
            {
                "tool_name": "set_reminder",
                "tool_input": {"time": "tomorrow 9 AM", "task": "call friend"},
            }
        )
        final_response = f"I am setting a reminder for you based on: '{user_input}'."
    else:
        # No keyword matched: the trajectory stays empty (no tool used).
        final_response = f"I couldn't find a specific tool for your request: '{user_input}'. Providing a general response."

    return {
        "predicted_trajectory": predicted_trajectory,
        "response": final_response,
    }


# Assign your agent callable to a variable that EvalTask can use
callable_agent = my_vertex_ai_agent_simulator

# --- Prepare Your Evaluation Dataset ---
# This DataFrame contains:
#   - 'input': The user query to send to the agent.
#   - 'reference_trajectory': The ground truth (expected) tool calls for the agent.
#     A list of {"tool_name": ..., "tool_input": {...}} dictionaries, i.e. the
#     same format the agent returns in "predicted_trajectory". Use an empty
#     list when no tool is expected.
#   - 'reference_response' (Optional): The expected final text response from the agent.
#     Used if you're also evaluating the text generation quality.
# Some references intentionally differ from what the simulator produces
# (the reminder time, the London location) so the metrics show mismatches.

eval_dataset = pd.DataFrame({
    "input": [
        "What's the weather like in Wildomar right now?",
        "Tell me the latest news about artificial intelligence.",
        "Can you help me set a reminder for tomorrow?",
        "What is the capital of California?",  # Expected: No tool used
        "Check the weather in London.",  # Another weather query
    ],
    "reference_trajectory": [
        [{"tool_name": "get_weather", "tool_input": {"location": "Wildomar"}}],
        [{"tool_name": "get_news", "tool_input": {"topic": "AI"}}],
        [{"tool_name": "set_reminder", "tool_input": {"time": "tomorrow", "task": "call friend"}}],
        [],  # No tool expected for this
        [{"tool_name": "get_weather", "tool_input": {"location": "London"}}],
    ],
    "reference_response": [
        "The current weather in Wildomar is sunny.",
        "Here's the latest on AI: [summary of news].",
        "Reminder successfully set for tomorrow.",
        "The capital of California is Sacramento.",
        "The weather in London is cloudy.",
    ],
})

print("\n--- Agent Evaluation Dataset (first 3 rows) ---")
print(eval_dataset.head(3))
print("-" * 40)

# --- Define Agent Evaluation Metrics ---
# Trajectory metrics come in two forms in the Vertex AI evaluation SDK:
#   - built-in computation metrics, requested by their string names;
#   - TrajectorySingleToolUse, a class in vertexai.preview.evaluation.metrics
#     that checks whether one specific tool appears in the predicted trajectory.
#
# Every TrajectorySingleToolUse instance reports under the same metric name
# ("trajectory_single_tool_use"), so only one is listed here. To check another
# tool (e.g. "get_news" or "set_reminder"), run a separate EvalTask whose
# metric list uses that tool name instead.
# NOTE(review): confirm the one-instance-per-task limit against your SDK version.

agent_metrics = [
    metrics.TrajectorySingleToolUse(tool_name="get_weather"),  # Was 'get_weather' used?
    "trajectory_exact_match",      # Did the predicted trajectory exactly match the reference?
    "trajectory_any_order_match",  # Were all reference tool calls made, regardless of order?
    "trajectory_precision",        # What proportion of predicted tool calls were correct?
    "trajectory_recall",           # What proportion of reference tool calls were predicted?
    # Metrics for the agent's final text response can be appended here as
    # well, e.g. a model-based metric such as
    # metrics.MetricPromptTemplateExamples.Pointwise.COHERENCE, or a custom
    # metrics.PointwiseMetric built from your own prompt template.
]

# --- Create and Run the EvalTask ---
# For agent evaluation the agent is passed to evaluate() as `runnable`
# (instead of `model`). That parameter exists only on the preview EvalTask,
# so import it here to make sure the preview class is the one in use.
from vertexai.preview.evaluation import EvalTask

print(f"\n--- Starting Agent Evaluation with {len(eval_dataset)} instances ---")

eval_task = EvalTask(
    # The SDK reads the runnable's input from a column named "prompt", so
    # expose the dataset's "input" column under that name. rename() returns
    # a copy and is a no-op when there is no "input" column.
    dataset=eval_dataset.rename(columns={"input": "prompt"}),
    metrics=agent_metrics,
    # experiment_run_name below needs an experiment to log to.
    # NOTE(review): this creates/uses a Vertex AI experiment with this name
    # in the configured project — adjust the name as needed.
    experiment="agent-tool-selection-eval",
)

# The runnable is called once per prompt. Its "response" and
# "predicted_trajectory" keys become dataset columns of the same names, and
# the trajectory metrics score "predicted_trajectory" against the dataset's
# "reference_trajectory" column. EvalTask takes no column-name arguments for these.
result = eval_task.evaluate(
    runnable=callable_agent,  # Pass your agent (or its wrapper function) here
    experiment_run_name="agent-single-tool-selection-full-example",
)

# --- Display Results ---
# `result` is an EvalResult: aggregate scores live in `summary_metrics`
# (a dict) and per-row details in `metrics_table` (a pandas DataFrame).
print("\n--- Agent Evaluation Metrics (Aggregate Scores) ---")
# Summary score for each metric (e.g. mean precision/recall over all rows).
print(result.summary_metrics)

print("\n--- Agent Evaluation Results DataFrame (Per-Instance Details) ---")
# One row per dataset instance, including predicted_trajectory,
# reference_trajectory, and the individual metric scores.
print(result.metrics_table)

print("\n--- Evaluation Complete ---")