# Building and Evaluating LlamaIndex CodeAct Agent

You can install all the dependencies for this tutorial using:

In [1]:
%pip install llama-index-llms-google-genai llama-index -q

Note: you may need to restart the kernel to use updated packages.


We’ll use a `.env` file to manage API keys securely. You can also set them manually as environment variables, but for this tutorial, we’ll go ahead with a `.env` setup.  

Also include `.env` in your `.gitignore` to avoid accidentally exposing sensitive API keys.

In [3]:
from dotenv import load_dotenv

# Load the API keys kept in the local `.env` file. The bare call is the last
# expression of the cell, so its return value is shown as the cell output.
load_dotenv()

True

## Building CodeAct Agent

### Defining the Tools and the LLM

First, let's configure the LLM we want to use, and provide some functions that we can use in our code.

In [4]:
from llama_index.llms.google_genai import GoogleGenAI
from google.genai import types

# Gemini client reused by every CodeActAgent constructed in this notebook.
llm = GoogleGenAI(
    model="gemini-2.5-flash",
    generation_config=types.GenerateContentConfig(
        thinking_config=types.ThinkingConfig(thinking_budget=0)  # Disables thinking
    ),
)


# Define a few helper functions
def add(a: int, b: int) -> int:
    """Return the sum of ``a`` and ``b``."""
    total = a + b
    return total


def subtract(a: int, b: int) -> int:
    """Return ``a`` minus ``b``."""
    difference = a - b
    return difference


def multiply(a: int, b: int) -> int:
    """Return the product of ``a`` and ``b``."""
    product = a * b
    return product


def divide(a: int, b: int) -> float:
    """Return ``a`` divided by ``b`` using true division (``/``).

    Raises:
        ZeroDivisionError: if ``b`` is zero.
    """
    quotient = a / b
    return quotient

Both GOOGLE_API_KEY and GEMINI_API_KEY are set. Using GOOGLE_API_KEY.


### Create a Code Executor

CodeAct Agent works by writing Python code (not calling tools directly) to orchestrate logic, loops, and variable management. The code executor is essential because it:

- Runs the agent's generated Python code in a controlled environment
- Maintains execution state across multiple code blocks (variables persist)  
- Captures outputs, errors, and return values for the agent to inspect
- Enables the agent to see results, debug issues, and continue intelligently

The executor acts as the bridge between the agent's code plans and actual execution, forming the core feedback loop that makes CodeAct effective.

In [5]:
from typing import Any, Dict, Tuple
import io
import contextlib
import ast
import traceback


class SimpleCodeExecutor:
    """
    A simple code executor that runs Python code with state persistence.

    The same ``globals`` and ``locals`` dictionaries are handed to every
    execution, so names defined by one snippet are visible to the next one.

    Because two separate mappings are used, Python runs each snippet as if it
    were the body of a class definition: top-level statements see both
    mappings, but functions defined by a snippet only see ``globals``.

    NOTE: not safe for production use! Snippets run through ``exec``/``eval``
    inside the current process without any sandboxing. Use with caution.
    """

    def __init__(self, locals: Dict[str, Any], globals: Dict[str, Any]):
        """
        Initialize the code executor.

        Args:
            locals: Local variables to use in the execution context
            globals: Global variables to use in the execution context
        """
        # State that persists between executions (the dicts are stored by
        # reference and mutated in place by every run).
        self.globals = globals
        self.locals = locals

    def execute(self, code: str) -> str:
        """
        Execute Python code and capture its output and trailing-expression value.

        The snippet is parsed once. If its last statement is a bare expression,
        that expression is evaluated separately so its value can be reported,
        much like the last line of a notebook cell. The snippet is executed
        exactly once, even when it raises.

        Args:
            code: Python code to execute

        Returns:
            A single string made of, in order: captured stdout, captured stderr
            (on a new line), an ``Error: <type>: <message>`` line followed by the
            traceback if the snippet raised, and finally the value of the
            trailing expression (after a blank line) when it is not ``None``.
        """
        stdout = io.StringIO()
        stderr = io.StringIO()

        return_value = None
        error_report = ""
        try:
            with contextlib.redirect_stdout(stdout), contextlib.redirect_stderr(stderr):
                # A SyntaxError raised here is reported like any runtime error.
                tree = ast.parse(code)

                # Detach a trailing expression at the AST level. Splitting the
                # source text instead breaks on multi-line expressions,
                # trailing newlines and `stmt; expr` lines.
                trailing_expr = None
                if tree.body and isinstance(tree.body[-1], ast.Expr):
                    trailing_expr = ast.Expression(body=tree.body.pop().value)

                if tree.body:
                    exec(
                        compile(tree, "<agent_code>", "exec"),
                        self.globals,
                        self.locals,
                    )
                if trailing_expr is not None:
                    return_value = eval(
                        compile(trailing_expr, "<agent_code>", "eval"),
                        self.globals,
                        self.locals,
                    )
        except Exception as e:
            # Capture exception information; the snippet is NOT re-run, so its
            # side effects happen at most once.
            error_report = f"Error: {type(e).__name__}: {str(e)}\n"
            error_report += traceback.format_exc()

        # Keep whatever was printed, including output produced before a failure,
        # so the agent can see how far the snippet got.
        output = stdout.getvalue()
        if stderr.getvalue():
            output += "\n" + stderr.getvalue()
        if error_report:
            output = f"{output}\n{error_report}" if output else error_report

        if return_value is not None:
            output += "\n\n" + str(return_value)

        return output

In [6]:
# Single executor instance whose `execute` method is handed to the agents below;
# the two dicts are the persistent namespaces that generated code runs against.
code_executor = SimpleCodeExecutor(
    # give access to our functions defined above
    locals={
        "add": add,
        "subtract": subtract,
        "multiply": multiply,
        "divide": divide,
    },
    globals={
        # give access to all builtins
        "__builtins__": __builtins__,
        # give access to numpy
        "np": __import__("numpy"),
    },
)

### Setup the CodeAct Agent

In [8]:
from llama_index.core.agent.workflow import CodeActAgent
from llama_index.core.workflow import Context

# The same four helpers are passed as `tools` here and exposed in the executor's
# namespace above, so code the agent writes can call them by name.
agent = CodeActAgent(
    code_execute_fn=code_executor.execute,
    llm=llm,
    tools=[add, subtract, multiply, divide],
)

### Use the Agent

In [9]:
from llama_index.core.agent.workflow import (
    ToolCall,
    ToolCallResult,
    AgentStream,
)


async def run_agent_verbose(agent, query):
    """Run ``agent`` on ``query`` and echo its progress to stdout.

    Prints the user query, then for every streamed event: the execution result
    for ``ToolCallResult``, the parsed code for ``ToolCall``, and the raw text
    delta for ``AgentStream``. Other event types are ignored.

    Returns:
        Whatever awaiting the run handler yields once the stream is exhausted.
    """
    run_handle = agent.run(query)
    print(f"User:  {query}")

    async for evt in run_handle.stream_events():
        # ToolCallResult is checked before ToolCall, as in the original ordering.
        if isinstance(evt, ToolCallResult):
            print(f"\n-----------\nCode execution result:\n{evt.tool_output}")
            continue
        if isinstance(evt, ToolCall):
            print(f"\n-----------\nParsed code:\n{evt.tool_kwargs['code']}")
            continue
        if isinstance(evt, AgentStream):
            print(f"{evt.delta}", end="", flush=True)

    final_result = await run_handle
    return final_result

In [10]:
# Top-level `await` works here because the notebook cell runs inside an event loop.
response = await run_agent_verbose(
    agent, "Add 5 and 3, then multiply the result by 2"
)

User:  Add 5 and 3, then multiply the result by 2
<execute>
result_add = add(5, 3)
final_result = multiply(result_add, 2)
print(final_result)
</execute>
The result is 16.
-----------
Parsed code:
result_add = add(5, 3)
final_result = multiply(result_add, 2)
print(final_result)

-----------
Code execution result:
16

The result of adding 5 and 3, then multiplying the sum by 2 is 16.

## Evaluating the LlamaIndex CodeAct Agent with W&B Weave

When using Weave for evaluation, you need three main components:

1. **Dataset**: A collection of queries or inputs you want to evaluate your application on.  

2. **Model**: This is an abstraction that represents the application you want to evaluate. It’s not a literal machine learning model, but a wrapper provided by Weave that defines how your application handles input and produces output.  

3. **Scorers**: These are the metrics or scoring functions that assess how well your application performs on the dataset. For example, they might check correctness or retrieval quality.

### Initializing the Project and Creating the Dataset

In [None]:
import weave
from weave import Dataset

# Start a Weave session for this project; the dataset published below is logged to it.
weave.init(project_name="llama_index_evaluations")

# Each row carries the `query` fed to the model and the `reference` string that
# the deterministic correctness scorer looks for in the agent's final answer.
eval_dataset = Dataset(
    name="codeAct-agent-evaluation-dataset-1",
    rows=[
        {
            "id": "1",
            "query": "Add 5 and 3, then multiply the result by 2",
            "reference": "16",
        },
        {
            "id": "2",
            "query": "Calculate the sum of the first 10 fibonacci numbers, assuming the first fibonacci number is 0",
            # 0 + 1 + 1 + 2 + 3 + 5 + 8 + 13 + 21 + 34
            "reference": "88",
        },
        {
            "id": "3",
            "query": "Calculate the sum of all numbers from 1 to 10",
            "reference": "55",
        },
    ],
)


weave.publish(eval_dataset)

  from .autonotebook import tqdm as notebook_tqdm
[36m[1mweave[0m: Logged in as Weights & Biases user: siddharth-plaksha.
[36m[1mweave[0m: View Weave data at https://wandb.ai/deep-learning-assignments/llama_index_evaluations/weave
[36m[1mweave[0m: 📦 Published to https://wandb.ai/deep-learning-assignments/llama_index_evaluations/weave/objects/codeAct-agent-evaluation-dataset-1/versions/7VqrXTd9pa6JWTEcw6eO4Wv62UkjwGvhKK3F1zwQmyc


### Setting the Model

In [12]:
import weave
import asyncio
from llama_index.core.agent.workflow import AgentOutput
from llama_index.core.agent.workflow import CodeActAgent


class LlamaIndexCodeActAgent(weave.Model):
    """Weave model wrapper that answers each query with a freshly built CodeActAgent."""

    @weave.op()
    async def predict(self, query: str) -> tuple:
        """
        Run a new CodeActAgent on ``query``.

        Args:
            query: The natural-language task for the agent.

        Returns:
            A 2-tuple ``(response, tool_descriptions)``: the result of awaiting
            the agent run, and whatever ``agent._get_tool_descriptions`` returns
            for the agent's tools. The scorers unpack this tuple.
        """
        # NOTE(review): every prediction reuses the notebook-level `code_executor`,
        # so variables defined while answering one query remain visible to the next.
        agent = CodeActAgent(
            code_execute_fn=code_executor.execute,
            tools=[add, subtract, multiply, divide],
            llm=llm,
        )
        # This method is itself a coroutine, so the run is awaited on the loop
        # that is already driving it. `asyncio.run()` refuses to start while
        # another event loop is running in the same thread unless asyncio has
        # been monkey-patched.
        response = await agent.run(query)
        # TODO: Look for better way to get tool description
        return response, agent._get_tool_descriptions(tools=agent.tools)

### Defining the Scorers

To evaluate the performance of a CodeAct agent, we designed two LLM-based scorers focused on different aspects of correctness:  

1. Tool Usage Scorer (CodeActToolUsageScorer):
This checks whether the agent used the correct tools with valid function calls and appropriate parameter names/types. It ensures the code structure adheres to the tool definitions.  

2. Task Completion Scorer (CodeActTaskCompletionScorer):
This verifies whether the generated Python code successfully completes the task as per the user query. It considers the tool usage, execution output, and final result.

In [13]:
import weave
from textwrap import dedent
from typing import Dict
from weave.scorers.scorer_types import LLMScorer
from pydantic import BaseModel, Field


# Structured verdict for the task-completion judge. The task-completion scorer
# passes this class as `response_format` and validates the returned JSON with it.
class CodeActTaskCompletionResponse(BaseModel):
    # Free-text justification for the score.
    reason: str = Field(
        description="Step-by-step reasoning about whether the agent's generated code correctly completed the user's task using valid tool calls and execution"
    )
    # 1 = task completed; 0 = failure or incorrect tool usage.
    score: int = Field(
        description="Binary score indicating if the task was successfully completed (1 for success, 0 for failure or incorrect tool usage)"
    )


class CodeActTaskCompletionScorer(LLMScorer):
    # LLM-as-judge scorer: asks `model_id` whether the agent's code solved the task.
    name: str = "codeact_task_completion"
    prompt_template: str = dedent(
        """
You are evaluating a **CodeAct** agent—a type of LLM agent that generates and executes **Python code** to perform a user-specified task. CodeAct integrates with a Python interpreter, allowing the agent to call tools, run logic, self-debug, and iteratively refine its code.

You are provided with:
- A list of available **tool functions** and their signatures.
- The **agent-generated Python code** snippet that was executed.
- The **execution result**, including stdout or any errors.
- The **original user query/task**.

Your task:
- Verify that the agent’s code **completes the user’s task correctly**.
- Confirm it only uses **valid tools** with correct parameter names and types.
- Ensure the code **executes successfully** (or handles errors intentionally and correctly).

Think step by step about:

1. Was the tool usage valid and correct?
2. Did the execution output achieve the goal described in the query?

Task:
{task}

Tool Definitions:
{tool_desc}

Code:
{code}

Execution Output/Error:
{execution_result}

Then provide:

Reasoning:
<your detailed step-by-step reasoning>

Final Score (0 or 1):
- **1** = Task was correctly completed with valid tool usage and successful execution.
- **0** = Task not completed, or there was invalid tool use or runtime error.
"""
    )
    model_id: str = "gemini/gemini-2.0-flash"

    @weave.op
    async def score(self, output: tuple, query: str) -> Dict:
        """
        Judge whether the agent's generated code completed ``query``.

        Args:
            output: The ``(agent_output, tool_desc)`` pair produced by the model.
            query: The task text given to the agent.

        Returns:
            The judge's verdict as a dict with ``reason`` and ``score`` keys.
        """
        agent_output, tool_desc = output

        # Only the first response block is read (assumes a text-only answer).
        # NOTE(review): this final answer is what fills the prompt's
        # "Execution Output/Error" slot, not the raw executor output.
        final_answer = agent_output.response.blocks[0].text

        # Gather every code snippet the agent submitted, in call order.
        code_snippets = []
        for call in agent_output.tool_calls:
            code_snippets.append(call.tool_kwargs["code"])
        joined_code = "\n".join(code_snippets)

        judge_prompt = self.prompt_template.format(
            task=query,
            code=joined_code,
            tool_desc=tool_desc,
            execution_result=final_answer,
        )
        completion = await self._acompletion(
            messages=[{"role": "user", "content": judge_prompt}],
            response_format=CodeActTaskCompletionResponse,
            model=self.model_id,
        )

        raw_verdict = completion.choices[0].message.content
        verdict = CodeActTaskCompletionResponse.model_validate_json(raw_verdict)
        return verdict.model_dump()


# Structured verdict for the tool-usage judge. The tool-usage scorer passes this
# class as `response_format` and validates the returned JSON with it.
class CodeActToolUsageResponse(BaseModel):
    # Free-text justification for the score.
    reason: str = Field(
        description="Step‑by‑step reasoning about whether the CodeAct agent’s Python code correctly used the provided tools with valid function calls"
    )
    # 1 = all tool calls valid and correctly structured; 0 otherwise.
    score: int = Field(
        description="Binary score indicating whether all tool calls were valid and correctly structured (1 for correct tool usage, 0 otherwise)"
    )


class CodeActToolUsageScorer(LLMScorer):
    # LLM-as-judge scorer: asks `model_id` whether the generated code only calls
    # the provided tools with correct parameters.
    name: str = "codeact_tool_usage_correctness"
    prompt_template: str = dedent(
        """
You are evaluating a **CodeAct** agent, a type of LLM agent that generates and executes **Python code** to call tools directly. CodeAct agents write code snippets, run them, and can self-debug by observing results.

You’re provided with:
- A list of **available tool functions** and their signatures.
- The **Python code** snippet the agent generated.

Your task:
- Verify the agent only calls **valid, provided tools**.
- Check that every function call uses **correct parameter names and types**.

Think step by step about each function call.

Tool Definitions:
{tool_desc}

Code:
{code}

Reasoning:
<your detailed reasoning goes here>

Final Verdict (0 or 1):
- Return **1** if all tool usage is correct and runs without issues.
- Return **0** otherwise.
"""
    )
    model_id: str = "gemini/gemini-2.0-flash"

    @weave.op
    async def score(self, output: tuple, query: str) -> Dict:
        """
        Judge whether the agent's generated code used the tools correctly.

        Args:
            output: The ``(agent_output, tool_desc)`` pair produced by the model.
            query: The task text; accepted but not used in the prompt.

        Returns:
            The judge's verdict as a dict with ``reason`` and ``score`` keys.
        """
        agent_output, tool_desc = output

        # Gather every code snippet the agent submitted, in call order.
        code_snippets = []
        for call in agent_output.tool_calls:
            code_snippets.append(call.tool_kwargs["code"])
        joined_code = "\n".join(code_snippets)

        judge_prompt = self.prompt_template.format(
            code=joined_code,
            tool_desc=tool_desc,
        )
        completion = await self._acompletion(
            messages=[{"role": "user", "content": judge_prompt}],
            response_format=CodeActToolUsageResponse,
            model=self.model_id,
        )

        raw_verdict = completion.choices[0].message.content
        verdict = CodeActToolUsageResponse.model_validate_json(raw_verdict)
        return verdict.model_dump()

Design deterministic evaluation metrics whenever possible to avoid subjective LLM judgments. Simple approaches like string matching or text embedding similarity checks can be highly effective and reliable compared to LLM-based evaluation if you know your data.

In this implementation, we've structured test cases so expected answers appear directly in the agent's output. Since the agent performs deterministic calculations (like sums and products), we can evaluate correctness by simply checking if the reference answer exists in the response.

While this method may not offer the highest level of flexibility, it is extremely efficient—both in terms of speed and cost—making it a practical choice for many evaluation scenarios.

In [14]:
class AgentResponseCorrectnessScorer(weave.Scorer):
    # Deterministic scorer: checks that the reference answer appears in the
    # agent's final answer, without calling an LLM.
    name: str = "agent_response_correctness"

    @weave.op
    def score(self, output: tuple, reference: str) -> Dict:
        """
        Check whether ``reference`` appears in the agent's final answer.

        The match is a literal substring search, with one refinement: when the
        reference starts or ends with a digit, that edge must not touch another
        digit in the answer (and a leading digit must not follow a decimal
        point). So reference "16" matches "is 16." and "16.0" but not "160",
        "116" or "3.16". An empty reference never matches.

        Args:
            output: The ``(agent_output, tool_desc)`` pair produced by the model;
                only the first response block of ``agent_output`` is read.
            reference: The expected answer text from the dataset row.

        Returns:
            Dict with a human-readable ``reason`` and a binary ``score``.
        """
        import re  # local import: this cell has no import block of its own

        agent_output, _ = output
        final_answer = agent_output.response.blocks[0].text
        expected = reference.strip()

        if not expected:
            # "" is a substring of every string; treating it as found would give
            # full marks to rows whose reference is missing.
            is_present = False
        else:
            pattern = re.escape(expected)
            if expected[0].isdigit():
                pattern = r"(?<![\d.])" + pattern
            if expected[-1].isdigit():
                pattern += r"(?!\d)"
            is_present = re.search(pattern, final_answer) is not None

        return {
            "reason": f"Reference {'found' if is_present else 'not found'} in agent response.",
            "score": int(is_present),
        }

### Performing Evaluations

In [15]:
# Two LLM-judge scorers plus the deterministic reference check.
task_completion_scorer = CodeActTaskCompletionScorer()
tool_usage_scorer = CodeActToolUsageScorer()
correctness = AgentResponseCorrectnessScorer()

# Every dataset row is run through the model and then scored by all three scorers.
evaluation = weave.Evaluation(
    dataset=eval_dataset,
    scorers=[task_completion_scorer, tool_usage_scorer, correctness],
)

llama_index_codeAct_model = LlamaIndexCodeActAgent()

In [16]:
import asyncio
import nest_asyncio

# The notebook kernel already runs an event loop, and plain `asyncio.run` refuses
# to start inside a running loop; nest_asyncio patches asyncio to allow the
# nested call below.
nest_asyncio.apply()

asyncio.run(evaluation.evaluate(llama_index_codeAct_model))

[36m[1mweave[0m: 🍩 https://wandb.ai/deep-learning-assignments/llama_index_evaluations/r/call/0197dddc-ac6c-75b3-9424-b11dbb2f8b82
[36m[1mweave[0m: Evaluated 1 of 3 examples
[36m[1mweave[0m: Evaluated 2 of 3 examples
[36m[1mweave[0m: Evaluated 3 of 3 examples
[36m[1mweave[0m: Evaluation summary {
[36m[1mweave[0m:   "codeact_task_completion": {
[36m[1mweave[0m:     "score": {
[36m[1mweave[0m:       "mean": 1.0
[36m[1mweave[0m:     }
[36m[1mweave[0m:   },
[36m[1mweave[0m:   "codeact_tool_usage_correctness": {
[36m[1mweave[0m:     "score": {
[36m[1mweave[0m:       "mean": 1.0
[36m[1mweave[0m:     }
[36m[1mweave[0m:   },
[36m[1mweave[0m:   "agent_response_correctness": {
[36m[1mweave[0m:     "score": {
[36m[1mweave[0m:       "mean": 1.0
[36m[1mweave[0m:     }
[36m[1mweave[0m:   },
[36m[1mweave[0m:   "model_latency": {
[36m[1mweave[0m:     "mean": 3.2990194161732993
[36m[1mweave[0m:   }
[36m[1mweave[0m: }


{'codeact_task_completion': {'score': {'mean': 1.0}},
 'codeact_tool_usage_correctness': {'score': {'mean': 1.0}},
 'agent_response_correctness': {'score': {'mean': 1.0}},
 'model_latency': {'mean': 3.2990194161732993}}