# Our First LLMAgent — Hailstone

In [1]:
LOGGING_ENABLED = True

In [2]:
import logging

from llm_agents_from_scratch.logger import enable_console_logging

if LOGGING_ENABLED:
    enable_console_logging(logging.INFO)

## Define the Hailstone Tool

In [3]:
from pydantic import BaseModel

from llm_agents_from_scratch.tools import PydanticFunctionTool


# Input schema for the `next_number` tool: a single integer argument.
# NOTE(review): pydantic copies a model's docstring into the JSON schema
# description, so the docstring below may be visible to the LLM through the
# tool definition — confirm before rewording it.
class AlgoParams(BaseModel):
    """Params for next_number."""

    # Current number in the sequence; `next_number` reads it to compute the
    # following term.
    x: int


def next_number(params: AlgoParams) -> int:
    """Generate the next number of the sequence."""
    # Docstring intentionally left unchanged: presumably the tool wrapper
    # surfaces it to the LLM as the tool description — confirm before editing.
    current = params.x
    is_even = current % 2 == 0
    # Hailstone rule: halve an even number, otherwise map n -> 3n + 1.
    return current // 2 if is_even else 3 * current + 1


# convert our Python function to a BaseTool
tool = PydanticFunctionTool(next_number)

## Define our backbone LLM

In [4]:
from llm_agents_from_scratch.llms import OllamaLLM

In [5]:
llm = OllamaLLM(model="qwen2.5:3b")

## Define the LLMAgent

In [6]:
from llm_agents_from_scratch import LLMAgent

In [7]:
llm_agent = LLMAgent(
    llm=llm,
    tools=[tool],
)

## Define the Hailstone Task

In [8]:
from llm_agents_from_scratch.data_structures import Task

In [9]:
# Task prompt for one Hailstone run; `{x}` is filled with the starting number
# via `str.format`. The backslash after the opening quotes suppresses the
# leading newline and the closing quotes follow the last tag directly, so the
# literal needs no post-processing.
instruction_template = """\
You are given a tool, `next_number`, that generates the next number in the
sequence given the current number.

Start with the number x={x}.

<rules>
CALL `next_number` on the current number x
STOP AND WAIT for the result.
REPEAT this step-by-step process until the number 1 is reached.
FINAL RESULT: When you receive the number 1, provide the complete sequence you
observed from start to finish (including the starting number x and ending number
1).
</rules>

<warnings>
NEVER fabricate or simulate tool call results
NEVER make multiple tool calls in one response
STOP and WAIT - ALWAYS wait for the actual tool response before deciding next
steps
</warnings>"""

In [10]:
number = 4
sequence = [4, 2, 1]  # correct Hailstone sequence
task = Task(
    instruction=instruction_template.format(x=number),
)

In [11]:
handler = llm_agent.run(task, max_steps=5)

INFO (llm_agents_fs.LLMAgent) :      🚀 Starting task: You are given a tool, `next_number`, that generates the next number in the
sequence given the current number.

Start with the number ...[TRUNCATED]
INFO (llm_agents_fs.TaskHandler) :      ⚙️ Processing Step: You are given a tool, `next_number`, that generates the next number in the
sequence given the current number.

Start with the numb...[TRUNCATED]
INFO (llm_agents_fs.TaskHandler) :      🛠️ Executing Tool Call: next_number
INFO (llm_agents_fs.TaskHandler) :      ✅ Successful Tool Call: 2
INFO (llm_agents_fs.TaskHandler) :      ✅ Step Result: <tool_call>
{"name": "next_number", "arguments": {"x":2}}
</tool_call>
INFO (llm_agents_fs.TaskHandler) :      🧠 New Step: The current number is now 2. I need to make the following tool call(s):
INFO (llm_agents_fs.TaskHandler) :      ⚙️ Processing Step: The current number is now 2. I need to make the following tool call(s):
INFO (llm_agents_fs.TaskHandler) :      🛠️ Executing Tool Call: next_

In [12]:
handler.done()

True

### See the TaskResult

In [13]:
# Fetch the finished task's result from the handler.
# NOTE(review): the preceding cell had to confirm `handler.done()` is True,
# which suggests `run()` returns before the task completes. On a fresh
# "Run All" this line may execute too early — confirm how `result()` behaves
# on an unfinished handler, or wait for completion first.
result = handler.result()

In [14]:
print(result)

[4, 2, 1]


### See the Rollout

In [15]:
print(handler.rollout)

=== Task Step Start ===

💬 assistant: The current instruction is 'You are given a tool, `next_number`, that generates the next number in the
sequence given the current number.

Start with the number x=4.

<rules>
CALL `next_number` on the current number x
STOP AND WAIT for the result.
REPEAT this step-by-step process until the number 1 is reached.
FINAL RESULT: When you receive the number 1, provide the complete sequence you
observed from start to finish (including the starting number x and ending number
1).
</rules>

NEVER fabricate or simulate tool call results
NEVER make multiple tool calls in one response
STOP and WAIT - ALWAYS wait for the actual tool response before deciding next
steps

💬 assistant: I need to make the following tool call(s):

{
    "tool_name": "next_number",
    "arguments": {
        "x": 4
    }
}.

💬 tool: {
    "tool_call": {
        "tool_name": "next_number",
        "arguments": {
            "x": 4
        }
    },
    "content": "2",
    "error": false


In [16]:
# number of task steps executed
handler.step_counter

2

## Evaluation

### Task Result Correctness Evaluation

#### Setup Judge LLM

In [None]:
correctness_judge_llm = OllamaLLM(model="qwen2.5:3b")

In [17]:
# Prompt for the result-correctness judge. `{number}`, `{sequence}` and
# `{result}` are filled via `str.format`. The literal is written without a
# leading or trailing newline, so no stripping is required.
judge_prompt_template = """\
You are an evaluation assistant. Given a number and its correct sequence, use
them to assess whether another assistant's final result contains the correct
sequence.

<number>
{number}
</number>

<sequence>
{sequence}
</sequence>

<result>
{result}
</result>"""

#### Structured outputs for final result correctness evaluation

In [18]:
from pydantic import Field


# Structured-output schema for the correctness judge: a single boolean verdict.
# It is passed as `mdl=` to `structured_output` in the evaluation cells.
class ResultCorrectnessEval(BaseModel):
    """Correctness of result evaluation."""

    # The description string is part of the schema given to `structured_output`;
    # presumably the judge LLM sees it — confirm before rewording.
    correct: bool = Field(
        description="True if the assistant's final result contains the correct sequence. False otherwise.",
    )

In [19]:
correctness_eval = await llm.structured_output(
    prompt=judge_prompt_template.format(
        number=number,
        sequence=sequence,
        result=str(handler.result()),
    ),
    mdl=ResultCorrectnessEval,
)

In [20]:
correctness_eval

ResultCorrectnessEval(correct=True)

### LLM Agent Trajectory Evaluation

#### LLM Judge Setup

In [21]:
trajectory_judge_llm = OllamaLLM(model="qwen2.5:3b")

In [22]:
# Prompt for the trajectory judge. `{instruction}`, `{sequence}` and
# `{trajectory}` are filled via `str.format`. The literal is written without a
# leading or trailing newline, so no stripping is required.
trajectory_judge_prompt_template = """\
You are an evaluation assistant for evaluating another assistant's reasoning
trajectory for solving a sequence generation task. The instruction and correct
sequence is provided in addition to the trajectory of the assistant. Use the
provided rubric to provide an evaluation of the trajectory.

INSTRUCTION:
{instruction}

SEQUENCE:
{sequence}

TRAJECTORY:
{trajectory}

RUBRIC:
Reasoning (0-5): Does the assistant follow logical steps and make sound
decisions?
Process (0-5): Does the assistant follow proper tool usage and protocols?

Provide a score, with 0 being lowest and 5 the highest, for each dimension and
brief justification.

WARNINGS:
DO NOT DEDUCT FOR THE SAME ERROR MORE THAN ONCE. If an issue affects
multiple dimensions, choose the most relevant dimension to dock points."""

In [23]:
from typing import Literal


# Verdict for a single rubric dimension: a bounded score plus its rationale.
class DimensionScore(BaseModel):
    """Model for a dimension score."""

    # Literal restricts the value to the integers 0-5, matching the 0-5 scale
    # stated in the trajectory judge prompt's rubric.
    score: Literal[0, 1, 2, 3, 4, 5] = Field(
        description="Overall score on the dimension.",
    )
    # Free-text explanation accompanying the score.
    justification: str = Field(
        description="Justification for the score.",
    )


# Structured-output schema for the trajectory judge; passed as `mdl=` to
# `structured_output` in the trajectory evaluation cells.
class TrajectoryEval(BaseModel):
    """Evaluation of LLM agent trajectory."""

    # One DimensionScore per rubric dimension; the field names mirror the
    # "Reasoning" and "Process" dimensions named in the judge prompt.
    reasoning: DimensionScore
    process: DimensionScore

In [24]:
trajectory_eval = await trajectory_judge_llm.structured_output(
    prompt=trajectory_judge_prompt_template.format(
        instruction=instruction_template.format(x=number),
        sequence=sequence,
        trajectory=handler.rollout,
    ),
    mdl=TrajectoryEval,
)

In [25]:
print(trajectory_eval.model_dump_json(indent=4))

{
    "reasoning": {
        "score": 4,
        "justification": "The assistant followed logical steps and made sound decisions throughout the task. The reasoning is clear and aligns with the provided rubric. They correctly applied the `next_number` tool at each step and stopped when they reached the final number 1. However, there is a minor oversight in the presentation of the result where the sequence should be presented as [4, 2, 1], not just described. "
    },
    "process": {
        "score": 5,
        "justification": "The assistant followed proper tool usage and protocols consistently throughout the task. They did not fabricate or simulate tool calls, nor made multiple calls in one response. They waited for actual tool responses before making further steps, adhering to the instructions provided."
    }
}


### Number Steps Taken Eval

For this evaluation, we compare the number of steps taken to the minimum number of steps required to perform the task successfully, which is derived by subtracting 1 from the correct sequence's length.

In [26]:
min_required_steps = len(sequence) - 1
handler.step_counter - min_required_steps

0

## Hailstone Benchmark

In [27]:
from dataclasses import dataclass


@dataclass
class HailstoneBenchmarkExample:
    """A starting number paired with its reference Hailstone sequence."""

    # Starting number handed to the agent.
    number: int
    # Reference sequence the agent's result is compared against.
    sequence: list[int]

    @property
    def min_steps(self) -> int:
        """Minimum number of TaskSteps for the Task: sequence length minus one."""
        transitions = len(self.sequence) - 1
        return transitions

In [29]:
# Benchmark cases, in fixed order. Each tuple is a complete reference
# sequence whose first element is the starting number; it is unpacked into
# `number` and a fresh `sequence` list for each example.
benchmark: list[HailstoneBenchmarkExample] = [
    HailstoneBenchmarkExample(number=start, sequence=[start, *rest])
    for start, *rest in (
        (11, 34, 17, 52, 26, 13, 40, 20, 10, 5, 16, 8, 4, 2, 1),
        (13, 40, 20, 10, 5, 16, 8, 4, 2, 1),
        (4, 2, 1),
        (3, 10, 5, 16, 8, 4, 2, 1),
        (2, 1),
    )
]

### Run tasks

In [30]:
# Start one agent run per benchmark example, in benchmark order, and keep the
# returned handlers so results can be collected in later cells.
handlers = [
    llm_agent.run(
        Task(instruction=instruction_template.format(x=example.number)),
        max_steps=30,
    )
    for example in benchmark
]

INFO (llm_agents_fs.LLMAgent) :      🚀 Starting task: You are given a tool, `next_number`, that generates the next number in the
sequence given the current number.

Start with the number ...[TRUNCATED]
INFO (llm_agents_fs.TaskHandler) :      ⚙️ Processing Step: You are given a tool, `next_number`, that generates the next number in the
sequence given the current number.

Start with the numb...[TRUNCATED]
INFO (llm_agents_fs.LLMAgent) :      🚀 Starting task: You are given a tool, `next_number`, that generates the next number in the
sequence given the current number.

Start with the number ...[TRUNCATED]
INFO (llm_agents_fs.TaskHandler) :      ⚙️ Processing Step: You are given a tool, `next_number`, that generates the next number in the
sequence given the current number.

Start with the numb...[TRUNCATED]
INFO (llm_agents_fs.LLMAgent) :      🚀 Starting task: You are given a tool, `next_number`, that generates the next number in the
sequence given the current number.

Start with the number

In [35]:
[h.done() for h in handlers]

[True, True, True, True, True]

In [36]:
# Show each benchmark outcome: whatever `exception()` returns when it is
# truthy, otherwise the handler's result.
# NOTE(review): assumes `exception()` returns a falsy value (presumably None)
# for a successful run and that every handler has finished — the cell above
# checks `done()` for that reason; confirm before relying on this in Run All.
[h.exception() or h.result() for h in handlers]

[TaskResult(task_id='2c92cdd1-1e3b-4fff-b7ba-f96e2de0dfa8', content='The complete sequence is: 11 → 34 → 17 → 52 → 26 → 13 → 5 → 16 → 8 → 4 → 2 → **1**. The process has now ended with reaching the number `1`.'),
 TaskResult(task_id='4eae7613-f35f-489f-96c0-cec014726bab', content='[13, 40, 20, 10]'),
 TaskResult(task_id='ff810336-5109-41e1-b164-fd4eae557a0e', content='The complete sequence generated from starting number 4 and ending with 1 is as follows: \n\n- Sequence: [4, 2, 1]'),
 TaskResult(task_id='8dae678b-5025-4d0a-bb14-785cb13f7179', content='3, 8, 4, 16, 8, 4'),
 TaskResult(task_id='c62f7636-2120-4344-89b3-860d285f3f12', content='[2, 1]')]

### Run Correctness Evaluation

In [38]:
import asyncio

In [40]:
eval_async_tasks = []
for ex, handler in zip(benchmark, handlers, strict=False):
    async_task = correctness_judge_llm.structured_output(
        prompt=judge_prompt_template.format(
            number=ex.number,
            sequence=ex.sequence,
            result=str(handler.exception() or handler.result()),
        ),
        mdl=ResultCorrectnessEval,
    )
    eval_async_tasks.append(async_task)

correctness_evals = await asyncio.gather(*eval_async_tasks)
correctness_evals

[ResultCorrectnessEval(correct=True),
 ResultCorrectnessEval(correct=False),
 ResultCorrectnessEval(correct=True),
 ResultCorrectnessEval(correct=False),
 ResultCorrectnessEval(correct=True)]

### Run Trajectory Evaluation

In [42]:
eval_async_tasks = []
for ex, handler in zip(benchmark, handlers, strict=False):
    async_task = trajectory_judge_llm.structured_output(
        prompt=trajectory_judge_prompt_template.format(
            instruction=instruction_template.format(x=ex.number),
            sequence=ex.sequence,
            trajectory=handler.rollout,
        ),
        mdl=TrajectoryEval,
    )
    eval_async_tasks.append(async_task)

trajectory_evals = await asyncio.gather(*eval_async_tasks)
trajectory_evals

[TrajectoryEval(reasoning=DimensionScore(score=4, justification="The assistant follows logical steps and makes sound decisions throughout the process. The sequence of calls to `next_number` is correctly applied, and the reasoning for each step is clear and consistent. However, there was a minor oversight in the final call where the instruction should have been `x=1`, not another round with x=2. This could be seen as an error, but since it does not significantly alter the outcome, it's scored at 4."), process=DimensionScore(score=5, justification='The assistant follows proper tool usage and protocols by consistently calling `next_number` with the correct value (`x`) for each step. The logic of the sequence is sound and maintains accuracy throughout the process. There were no protocol violations or misuse of tools.')),
 TrajectoryEval(reasoning=DimensionScore(score=3, justification="The assistant correctly follows logical steps in generating the sequence, but there are some procedural is

### Run Number Steps Evaluation

In [43]:
# Steps taken minus the minimum required, per benchmark example. Positive
# means extra steps; negative means fewer steps than the reference sequence
# has transitions.
# strict=True makes a length mismatch between `handlers` and `benchmark`
# raise ValueError instead of silently truncating the comparison.
num_steps_diff = [
    (h.step_counter - ex.min_steps)
    for h, ex in zip(handlers, benchmark, strict=True)
]
num_steps_diff

[1, -5, 1, -2, 3]

### Create Eval summary report

In [47]:
!uv pip install pandas tabulate -q

In [44]:
import pandas as pd

In [45]:
# Pull the two rubric scores out once; the overall score is their mean.
reasoning_scores = [ev.reasoning.score for ev in trajectory_evals]
process_scores = [ev.process.score for ev in trajectory_evals]

# One row per benchmark example, one column per metric.
report = pd.DataFrame(
    {
        "result_correctness": [int(ev.correct) for ev in correctness_evals],
        "number_steps_diff": num_steps_diff,
        "trajectory_reasoning": reasoning_scores,
        "trajectory_process": process_scores,
        "trajectory_overall": [
            (reasoning + process) / 2
            for reasoning, process in zip(
                reasoning_scores,
                process_scores,
                strict=True,
            )
        ],
    },
)
# Append a summary row holding each column's mean, rounded to 2 decimals.
report.loc["Average"] = report.mean().round(2)
report

Unnamed: 0,result_correctness,number_steps_diff,trajectory_reasoning,trajectory_process,trajectory_overall
0,1.0,1.0,4.0,5.0,4.5
1,0.0,-5.0,3.0,4.0,3.5
2,1.0,1.0,3.0,4.0,3.5
3,0.0,-2.0,3.0,2.0,2.5
4,1.0,3.0,3.0,4.0,3.5
Average,0.6,-0.4,3.2,3.8,3.5


## Previous Benchmark Runs

### Qwen2.5-72b

_Ran on 7xA40 with runpod_

|         |   result_correctness |   number_steps_diff |   trajectory_reasoning |   trajectory_process |   trajectory_overall |
|:--------|---------------------:|--------------------:|-----------------------:|---------------------:|---------------------:|
| 0       |                    1 |                15   |                    4   |                  4   |                  4   |
| 1       |                    1 |                10   |                    4   |                  5   |                  4.5 |
| 2       |                    1 |                 1   |                    5   |                  5   |                  5   |
| 3       |                    1 |                 4   |                    4   |                  5   |                  4.5 |
| 4       |                    1 |                 1   |                    4   |                  5   |                  4.5 |
| Average |                    1 |                 6.2 |                    4.2 |                  4.8 |                  4.5 |

### Qwen2.5-3b

|         |   result_correctness |   number_steps_diff |   trajectory_reasoning |   trajectory_process |   trajectory_overall |
|:--------|---------------------:|--------------------:|-----------------------:|---------------------:|---------------------:|
| 0       |                  0   |                 -10 |                    3   |                  2   |                  2.5 |
| 1       |                  0   |                  -6 |                    3   |                  4   |                  3.5 |
| 2       |                  1   |                  -1 |                    2   |                  0   |                  1   |
| 3       |                  1   |                   1 |                    4   |                  3   |                  3.5 |
| 4       |                  0   |                   1 |                    2   |                  3   |                  2.5 |
| Average |                  0.4 |                  -3 |                    2.8 |                  2.4 |                  2.6 |