In [3]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
import logging

# Turn the hud telemetry logger up to DEBUG (to watch auto-detection) while
# the root logger configured by basicConfig stays at INFO.
logger = logging.getLogger("hud.telemetry")
logger.setLevel(logging.DEBUG)
logging.basicConfig(level=logging.INFO)

In [10]:
from datasets import Dataset

# Task configurations, modelled on gmail_local.py.
# The same problem id is used by the setup tool, the evaluate tool and the metadata.
problem_id = "forward-series-b-deck-to-billgates"

# Arguments passed to the `docker` command that starts the gmail MCP server.
docker_args = [
    "run",
    "-i",  # interactive mode for stdio MCP server
    "--rm",  # remove the container after it exits
    "-p",
    "6080:6080",  # map port 6080 to the host for noVNC
    "hudpython/gmail-clone:latest",  # use hud gmail image
]

task_configs = [
    dict(
        prompt="Open Sent mail, search for the Series B pitch deck, forward it to billgates@microsoft.com, and mark the original message as important.",
        mcp_config={"gmail": {"command": "docker", "args": docker_args}},
        setup_tool={"name": "setup", "arguments": {"problem_id": problem_id}},
        evaluate_tool={"name": "evaluate", "arguments": {"problem_id": problem_id}},
        metadata={"id": problem_id},
    )
]

# Wrap the plain dicts in a HuggingFace dataset and name it for auto-detection
dataset = Dataset.from_list(task_configs)
dataset.info.builder_name = "gmail_tasks"

# Quick summary of what was built
print(f"Created dataset with {len(dataset)} tasks")
print(f"Dataset columns: {dataset.column_names}")
print(f"\nFirst task: {dataset[0]}")

Created dataset with 1 tasks
Dataset columns: ['prompt', 'mcp_config', 'setup_tool', 'evaluate_tool', 'metadata']

First task: {'prompt': 'Open Sent mail, search for the Series B pitch deck, forward it to billgates@microsoft.com, and mark the original message as important.', 'mcp_config': {'gmail': {'args': ['run', '-i', '--rm', '-p', '6080:6080', 'hudpython/gmail-clone:latest'], 'command': 'docker'}}, 'setup_tool': {'arguments': {'problem_id': 'forward-series-b-deck-to-billgates'}, 'name': 'setup'}, 'evaluate_tool': {'arguments': {'problem_id': 'forward-series-b-deck-to-billgates'}, 'name': 'evaluate'}, 'metadata': {'id': 'forward-series-b-deck-to-billgates'}}


In [None]:
from hud.datasets import TaskConfig
from hud.mcp.claude import ClaudeMCPAgent
from mcp_use import MCPClient

# Build a TaskConfig from the first dataset row and show it
task = TaskConfig(**dataset[0])
print(task)

# MCP client for the environment, configured from the task's mcp_config
client = MCPClient.from_dict({"mcp_config": task.mcp_config})

# Claude agent bound to that client; only the "computer" tool is allowed
agent = ClaudeMCPAgent(
    client=client,
    allowed_tools=["computer"],
    initial_screenshot=True,
)

id=None prompt='Open Sent mail, search for the Series B pitch deck, forward it to billgates@microsoft.com, and mark the original message as important.' mcp_config={'gmail': {'args': ['run', '-i', '--rm', '-p', '6080:6080', 'hudpython/gmail-clone:latest'], 'command': 'docker'}} setup_tool=CallToolRequestParams(meta=None, name='setup', arguments={'problem_id': 'forward-series-b-deck-to-billgates'}) evaluate_tool=CallToolRequestParams(meta=None, name='evaluate', arguments={'problem_id': 'forward-series-b-deck-to-billgates'}) metadata={'id': 'forward-series-b-deck-to-billgates'}


In [None]:
import hud

with hud.trace("test-claude"):  # Trace the agent execution
    # Initialization should happen inside the trace
    await agent.initialize(task)

    # Setup the task
    setup_result = await agent.call_tool(task.setup_tool)

    # Create initial messages
    messages = await agent.create_initial_messages(task.prompt, None)

    # Run the agent
    for step in range(5):
        print(f"Step {step}/{5}")

        # Get agent response
        response = await agent.get_model_response(messages, step)
        print(
            f"Agent response: {response.content[:200] if response.content else 'No content'}..."
        )  # Truncated

        # Check if agent wants to use tools
        tool_results = []
        if response.tool_calls:
            for tool_call in response.tool_calls:
                print(f"Using tool: {tool_call.name}")
                tool_result = await agent.call_tool(tool_call)
                tool_results.append(tool_result)
                print(f"Tool result: {tool_result}")

        # Format tool results for the model for the next step
        tool_messages = await agent.format_tool_results(response.tool_calls, tool_results)
        messages.extend(tool_messages)

    # Evaluate the task
    eval_result = await agent.call_tool(task.evaluate_tool)
    print(f"Evaluation result: {eval_result}")

[DEBUG] 2025-08-04 04:04:53,677 | hud.telemetry.instrumentation.mcp | MCPInstrumentor: install() called (context-aware mode)
[DEBUG] 2025-08-04 04:04:53,678 | hud.telemetry.instrumentation.mcp | Successfully wrapped BaseSession methods for context-aware telemetry
[DEBUG] 2025-08-04 04:04:53,678 | hud.telemetry.instrumentation.mcp | MCP instrumentation installed (context-aware mode)
[DEBUG] 2025-08-04 04:04:53,679 | hud.telemetry.instrumentation.registry | Installed instrumentor: mcp
[INFO] 2025-08-04 04:04:53,680 | hud.telemetry | Telemetry initialized.
[INFO] 2025-08-04 04:04:53,680 | hud.telemetry | See your agent live at https://app.hud.so/trace/8fbb7772-7291-4b3d-b123-3ecf52dc4fea
[DEBUG] 2025-08-04 04:04:53,680 | hud.telemetry | Starting trace 8fbb7772-7291-4b3d-b123-3ecf52dc4fea (Name: test-claude)
[DEBUG] 2025-08-04 04:04:53,681 | hud.telemetry | Telemetry: Worker thread not alive, starting new one.
[DEBUG] 2025-08-04 04:04:53,681 | hud.telemetry | Telemetry worker thread: Start

ERROR:mcp_use:Error in StdioConnectionManager task: fileno


2025-08-04 04:04:53,744 - mcp_use - ERROR - Failed to connect to MCP implementation: fileno


ERROR:mcp_use:Failed to connect to MCP implementation: fileno


[DEBUG] 2025-08-04 04:04:53,746 | hud.telemetry | Finishing trace 8fbb7772-7291-4b3d-b123-3ecf52dc4fea after 0.07 seconds
[DEBUG] 2025-08-04 04:04:53,746 | hud.telemetry | Updated task run 8fbb7772-7291-4b3d-b123-3ecf52dc4fea status to ERROR: fileno
[DEBUG] 2025-08-04 04:04:53,747 | hud.telemetry | Flushed 0 MCP calls for trace 8fbb7772-7291-4b3d-b123-3ecf52dc4fea
[INFO] 2025-08-04 04:04:53,748 | hud.telemetry | View trace at https://app.hud.so/trace/8fbb7772-7291-4b3d-b123-3ecf52dc4fea
[DEBUG] 2025-08-04 04:04:53,972 | hud.telemetry | Updating status for task run 8fbb7772-7291-4b3d-b123-3ecf52dc4fea to initializing
[DEBUG] 2025-08-04 04:04:54,236 | hud.telemetry | Updating status for task run 8fbb7772-7291-4b3d-b123-3ecf52dc4fea to error


UnsupportedOperation: fileno

INFO:httpx:HTTP Request: POST https://orchestration.hud.so/hud-gym/api/v2/task_runs/8fbb7772-7291-4b3d-b123-3ecf52dc4fea/status "HTTP/1.1 200 OK"


[DEBUG] 2025-08-04 04:04:54,730 | hud.telemetry | Successfully updated status for task run 8fbb7772-7291-4b3d-b123-3ecf52dc4fea to initializing


INFO:httpx:HTTP Request: POST https://orchestration.hud.so/hud-gym/api/v2/task_runs/8fbb7772-7291-4b3d-b123-3ecf52dc4fea/status "HTTP/1.1 200 OK"


[DEBUG] 2025-08-04 04:04:54,735 | hud.telemetry | Successfully updated status for task run 8fbb7772-7291-4b3d-b123-3ecf52dc4fea to error


In [None]:
# Close all sessions held by the MCP client created earlier in this notebook.
# Run this cell after the traced run, including when that run raised.
await client.close_all_sessions()

In [None]:
from hud.datasets import run_dataset

# Run the full dataset like so:
# `dataset` and `ClaudeMCPAgent` come from earlier cells of this notebook.
# NOTE(review): every task's mcp_config publishes host port 6080, so with
# max_concurrent=2 a dataset of more than one task would presumably collide on
# that port -- confirm before adding tasks.
results = await run_dataset(
    name="automated_gmail_test",
    dataset=dataset,  # Raw HF dataset
    agent_class=ClaudeMCPAgent,
    max_concurrent=2,
)