# Capstone 1 (from Chapter 5) ‚Äî Monte Carlo Estimation of Pi

## Setup

In [1]:
import logging
import os

from llm_agents_from_scratch.logger import enable_console_logging

### Constants

In [2]:
IS_ON_RUNPOD = "RUNPOD_POD_ID" in os.environ
LOGGING_ENABLED = True
LOGGING_LEVEL = logging.INFO

# for task execution
MAX_STEPS = 20
NUM_REPLICATIONS = 10

In [3]:
# Install additional dependencies for notebook
if IS_ON_RUNPOD:
    !uv pip install numpy pandas --system
else:
    !uv pip install numpy pandas

Audited 2 packages in 0.95ms


In [4]:
# maybe enable logging
if LOGGING_ENABLED:
    enable_console_logging(LOGGING_LEVEL)

## LLMs

In [5]:
if IS_ON_RUNPOD:
    backbone_llm = os.getenv("OLLAMA_MODEL")
    judge_llm = os.getenv("OLLAMA_MODEL")
else:
    backbone_llm = "qwen3:8b"
    judge_llm = "qwen3:8b"

In [6]:
print(f"Backbone LLM: {backbone_llm}")
print(f"Judge LLM: {judge_llm}")

Backbone LLM: qwen3:8b
Judge LLM: qwen3:8b


## Build Tools

In [7]:
import uuid

import numpy as np
from pydantic import BaseModel, Field, computed_field

from llm_agents_from_scratch.tools import PydanticFunctionTool

### Tool: `generate_random_sample()`

In [8]:
# Global registry to store samples
SAMPLE_REGISTRY: dict[str, list[tuple[float, float]]] = {}

In [9]:
class RandomSampleParams(BaseModel):
    """Params for generate_random_sample."""

    n: int = Field(description="The number of random points to generate")


class RandomSample(BaseModel):
    """Result from generate_random_sample."""

    sample_id: str = Field(
        description="Pass this sample_id to monte_carlo_estimate",
    )

    @computed_field
    @property
    def sample_size(
        self,
    ) -> int:
        """Determine n from SAMPLE_REGISTRY."""
        return len(SAMPLE_REGISTRY[self.sample_id])


def generate_random_sample(params: RandomSampleParams) -> RandomSample:
    """Generate n random points in [-1, 1] √ó [-1, 1].

    Returns a sample_id. Pass this sample_id directly to monte_carlo_estimate.
    """
    orig_pts = np.random.uniform(size=(params.n, 2))
    transformed = 2 * orig_pts - 1

    sample_id = str(uuid.uuid4())
    SAMPLE_REGISTRY[sample_id] = [tuple(pt) for pt in transformed.tolist()]

    return RandomSample(sample_id=sample_id)

In [10]:
# test generate_random_sample() function
rs = generate_random_sample(RandomSampleParams(n=5000))
rs.model_dump()

{'sample_id': 'f5be161a-a1ce-48bf-b3db-afdf84d53aa6', 'sample_size': 5000}

In [11]:
# create tool
random_sample_tool = PydanticFunctionTool(generate_random_sample)
random_sample_tool.parameters_json_schema

{'description': 'Params for generate_random_sample.',
 'properties': {'n': {'description': 'The number of random points to generate',
   'title': 'N',
   'type': 'integer'}},
 'required': ['n'],
 'title': 'RandomSampleParams',
 'type': 'object'}

### Tool: `add_more_points()`

In [12]:
class AddPointsParams(BaseModel):
    """Params for add_more_points_to_sample."""

    sample_id: str = Field(
        description="The sample_id of the sample to augment",
    )
    n: int = Field(description="The number of random points to generate")


def add_more_points_to_sample(params: AddPointsParams) -> RandomSample:
    """Add n more random points to an existing random sample.

    Returns a sample_id and the total number of points.
    """
    orig_pts = np.random.uniform(size=(params.n, 2))
    transformed = 2 * orig_pts - 1

    # augment sample
    SAMPLE_REGISTRY[params.sample_id] += [
        tuple(pt) for pt in transformed.tolist()
    ]

    return RandomSample(sample_id=params.sample_id)

In [13]:
# test add_more_points_to_sample() function
rs = add_more_points_to_sample(AddPointsParams(n=1000, sample_id=rs.sample_id))
str(rs)

"sample_id='f5be161a-a1ce-48bf-b3db-afdf84d53aa6' sample_size=6000"

In [14]:
# create tool
add_more_points_tool = PydanticFunctionTool(add_more_points_to_sample)
add_more_points_tool.parameters_json_schema

{'description': 'Params for add_more_points_to_sample.',
 'properties': {'sample_id': {'description': 'The sample_id of the sample to augment',
   'title': 'Sample Id',
   'type': 'string'},
  'n': {'description': 'The number of random points to generate',
   'title': 'N',
   'type': 'integer'}},
 'required': ['sample_id', 'n'],
 'title': 'AddPointsParams',
 'type': 'object'}

### Tool: `monte_carlo_estimate()`

In [15]:
class MonteCarloEstimateParams(BaseModel):
    """Params for monte_carlo_estimate."""

    sample_id: str = Field(
        description="The sample_id returned by generate_random_sample",
    )


class MonteCarloEstimateResults(BaseModel):
    """Results for monte_carlo_estimate."""

    sample_id: str
    sample_size: int
    estimate: float


def monte_carlo_estimate(
    params: MonteCarloEstimateParams,
) -> MonteCarloEstimateResults:
    """Estimate pi using Monte Carlo method.

    Args:
        params: Contains sample_id from generate_random_sample.

    Returns:
        Estimate of pi (float).
    """
    points = SAMPLE_REGISTRY[params.sample_id]
    n = len(points)
    inside = sum((x**2 + y**2) < 1 for x, y in points)
    return MonteCarloEstimateResults(
        estimate=(inside / n) * 4,
        sample_id=params.sample_id,
        sample_size=n,
    )

In [16]:
pi_estimate = monte_carlo_estimate(
    MonteCarloEstimateParams(sample_id=rs.sample_id),
)
pi_estimate

MonteCarloEstimateResults(sample_id='f5be161a-a1ce-48bf-b3db-afdf84d53aa6', sample_size=6000, estimate=3.1226666666666665)

In [17]:
monte_carlo_estimate_tool = PydanticFunctionTool(monte_carlo_estimate)
monte_carlo_estimate_tool.parameters_json_schema

{'description': 'Params for monte_carlo_estimate.',
 'properties': {'sample_id': {'description': 'The sample_id returned by generate_random_sample',
   'title': 'Sample Id',
   'type': 'string'}},
 'required': ['sample_id'],
 'title': 'MonteCarloEstimateParams',
 'type': 'object'}

## Define our LLMAgent

In [18]:
from llm_agents_from_scratch import LLMAgent
from llm_agents_from_scratch.llms import OllamaLLM

llm = OllamaLLM(backbone_llm)
llm_agent = LLMAgent(
    llm=llm,
    tools=[
        random_sample_tool,
        add_more_points_tool,
        monte_carlo_estimate_tool,
    ],
)

## Define the Task

In [19]:
from llm_agents_from_scratch.data_structures import Task

In [20]:
instruction_template = """
You are tasked with estimating pi using Monte Carlo methods.

TARGET: Get an estimate accurate to 3 decimal places.
Success means the first three digits after the decimal point are 142 (since
pi ‚âà 3.142...). In other words, the estimate should be between 3.1415
and 3.1425.

<tools>
1. generate_random_sample(n) ‚Üí Creates sample, returns sample_id and sample_size
2. add_more_points_to_sample(sample_id, n) ‚Üí Adds n points, returns updated
   sample_size
3. monte_carlo_estimate(sample_id) ‚Üí Returns pi estimate (float)
</tools>

<algorithm>
1. Call generate_random_sample(100000) to start with 1M points
2. Call monte_carlo_estimate(sample_id) to get estimate
3. Check: is the estimate between 3.1415 and 3.1425?
   - YES ‚Üí Report success and STOP
   - NO ‚Üí Continue to step 4
4. Call add_more_points_to_sample, doubling the points each time:
   - First add: 1 hundred thousand
   - Second add: 2 hundred thousand
   - Third add: 4 hundred thousand
   - And so on, doubling each iteration
5. After adding points, go back to step 2

Exponential growth ensures faster convergence while demonstrating adaptive
sampling.
</algorithm>

<critical_rules>
- If the task is not complete, your response MUST contain a tool call
- Do not just describe what you plan to do‚Äîactually call the tool
- Do not stop until the estimate falls within the target range
- Keep track of your iteration to calculate the correct doubling amount
</critical_rules>

Begin by calling generate_random_sample(100000).
""".strip()

In [21]:
task = Task(
    instruction=instruction_template,
)

## Perform the Task

In [22]:
handler = llm_agent.run(task, max_steps=MAX_STEPS)

INFO (llm_agents_fs.LLMAgent) :      üöÄ Starting task: You are tasked with estimating pi using Monte Carlo methods.

TARGET: Get an estimate accurate to 3 decimal places.
Success means the...[TRUNCATED]
INFO (llm_agents_fs.TaskHandler) :      ‚öôÔ∏è Processing Step: You are tasked with estimating pi using Monte Carlo methods.

TARGET: Get an estimate accurate to 3 decimal places.
Success means ...[TRUNCATED]
INFO (llm_agents_fs.TaskHandler) :      üõ†Ô∏è Executing Tool Call: generate_random_sample
INFO (llm_agents_fs.TaskHandler) :      ‚úÖ Successful Tool Call: sample_id='39470047-67c8-4d47-bc42-0f44644cb3f5' sample_size=100000
INFO (llm_agents_fs.TaskHandler) :      ‚úÖ Step Result: I need to call the monte_carlo_estimate tool with the sample_id '39470047-67c8-4d47-bc42-0f44644cb3f5' to get the pi estimate.
INFO (llm_agents_fs.TaskHandler) :      üß† New Step: I need to call the monte_carlo_estimate tool with the sample_id '39470047-67c8-4d47-bc42-0f44644cb3f5' to get the pi esti

In [25]:
# if need to cancel uncomment code below
# handler.cancel()  # noqa: ERA001

In [30]:
handler.done()

True

In [31]:
print(handler.rollout)

=== Task Step Start ===

üí¨ assistant: My current instruction is 'You are tasked with estimating pi using Monte Carlo methods.

TARGET: Get an estimate accurate to 3 decimal places.
Success means the first three digits after the decimal point are 142 (since
pi ‚âà 3.142...). In other words, the estimate should be between 3.1415
and 3.1425.

<tools>
1. generate_random_sample(n) ‚Üí Creates sample, returns sample_id and sample_size
2. add_more_points_to_sample(sample_id, n) ‚Üí Adds n points, returns updated
   sample_size
3. monte_carlo_estimate(sample_id) ‚Üí Returns pi estimate (float)
</tools>

<algorithm>
1. Call generate_random_sample(100000) to start with 1M points
2. Call monte_carlo_estimate(sample_id) to get estimate
3. Check: is the estimate between 3.1415 and 3.1425?
   - YES ‚Üí Report success and STOP
   - NO ‚Üí Continue to step 4
4. Call add_more_points_to_sample, doubling the points each time:
   - First add: 1 hundred thousand
   - Second add: 2 hundred thousand
   - 

In [32]:
result = handler.exception() or handler.result()
result

llm_agents_from_scratch.errors.agent.MaxStepsReachedError('Max steps reached.')

## Evaluation

In [34]:
trajectory_judge = OllamaLLM(model=judge_llm)

In [35]:
class TrajectoryJudgment(BaseModel):
    """Rubric for evaluating a Monte Carlo pi estimation agent trajectory."""

    reached_target_precision: bool = Field(
        description="True if agent achieved estimate that rounds to 3.142",
    )

    completed_without_max_steps: bool = Field(
        description=(
            "True if agent completed task without hitting max steps limit"
        ),
    )

    always_added_points_before_reestimating: bool = Field(
        description=(
            "False if agent called monte_carlo_estimate consecutively more "
            "than once before adding points"
        ),
    )

    reused_sample: bool = Field(
        description=(
            "True if agent used add_more_points_to_sample to grow the sample "
            "instead of creating new samples"
        ),
    )

    no_false_completion: bool = Field(
        description=(
            "True if agent only claimed success when the actual tool result "
            "showed 3.142. False if agent claimed convergence based on a "
            "fabricated or misread estimate."
        ),
    )

    no_missed_completion: bool = Field(
        description=(
            "True if agent stopped when estimate reached 3.142. False if "
            "agent continued adding points after already achieving target."
        ),
    )

    largest_sample_size: int | None = Field(
        description=(
            "The largest sample size achieved during the trajectory, "
            "or None if not determinable from tool outputs"
        ),
    )

    summary: str = Field(
        description="One sentence summary of trajectory quality",
    )

In [39]:
judge_prompt_template = """Evaluate this Monte Carlo pi estimation trajectory.

The agent had three tools:
- `generate_random_sample(n)` - Creates NEW sample
- `add_more_points_to_sample(sample_id, n)` - Adds points to EXISTING sample
- `monte_carlo_estimate(sample_id)` - Returns pi estimate

Correct behavior:
1. Create sample once
2. Estimate ‚Üí if not between 3.1415 and 3.1425,
   add points ‚Üí re-estimate ‚Üí repeat

<final_result>
{result}
</final_result>

<trajectory>
{trajectory}
</trajectory>

Evaluate and submit your judgment.""".strip()

In [40]:
trajectory_eval = await trajectory_judge.structured_output(
    prompt=judge_prompt_template.format(
        result=str(result),
        trajectory=handler.rollout,
    ),
    mdl=TrajectoryJudgment,
)

In [41]:
print(trajectory_eval.model_dump_json(indent=4))

{
    "reached_target_precision": true,
    "completed_without_max_steps": true,
    "always_added_points_before_reestimating": true,
    "reused_sample": false,
    "no_false_completion": true,
    "no_missed_completion": true,
    "largest_sample_size": 1600000,
    "summary": "The agent successfully completed the task by iteratively increasing the sample size and re-estimating œÄ until the target precision range (3.1415 to 3.1425) was achieved. The final estimate of 3.14209 with 1,600,000 samples met the criteria. All actions followed the required sequence of adding points before re-estimating, and no errors or false completions occurred."
}


### Replications

In this section, we'll repeat the task multiple times to get a more robust evaluation of our LLM agent's performance.

In [42]:
handlers = []
for _ in range(NUM_REPLICATIONS):
    h = llm_agent.run(task, max_steps=10)
    handlers.append(h)

INFO (llm_agents_fs.LLMAgent) :      üöÄ Starting task: You are tasked with estimating pi using Monte Carlo methods.

TARGET: Get an estimate accurate to 3 decimal places.
Success means the...[TRUNCATED]
INFO (llm_agents_fs.TaskHandler) :      ‚öôÔ∏è Processing Step: You are tasked with estimating pi using Monte Carlo methods.

TARGET: Get an estimate accurate to 3 decimal places.
Success means ...[TRUNCATED]
INFO (llm_agents_fs.LLMAgent) :      üöÄ Starting task: You are tasked with estimating pi using Monte Carlo methods.

TARGET: Get an estimate accurate to 3 decimal places.
Success means the...[TRUNCATED]
INFO (llm_agents_fs.TaskHandler) :      ‚öôÔ∏è Processing Step: You are tasked with estimating pi using Monte Carlo methods.

TARGET: Get an estimate accurate to 3 decimal places.
Success means ...[TRUNCATED]
INFO (llm_agents_fs.LLMAgent) :      üöÄ Starting task: You are tasked with estimating pi using Monte Carlo methods.

TARGET: Get an estimate accurate to 3 decimal places.

In [48]:
[h.done() for h in handlers]

[True, True, True]

#### Trajectory Evaluations

In [49]:
import asyncio

In [50]:
eval_async_tasks = []
for handler in handlers:
    async_task = trajectory_judge.structured_output(
        prompt=judge_prompt_template.format(
            result=str(handler.exception() or handler.result()),
            trajectory=handler.rollout,
        ),
        mdl=TrajectoryJudgment,
    )
    eval_async_tasks.append(async_task)

trajectory_evals = await asyncio.gather(*eval_async_tasks)
trajectory_evals

[TrajectoryJudgment(reached_target_precision=False, completed_without_max_steps=False, always_added_points_before_reestimating=True, reused_sample=False, no_false_completion=True, no_missed_completion=True, largest_sample_size=1600000, summary='The agent followed the doubling strategy consistently but did not reach the target precision within the maximum allowed steps. The largest sample size used was 1,600,000 points. No errors in completion or reuse of samples were detected.'),
 TrajectoryJudgment(reached_target_precision=False, completed_without_max_steps=False, always_added_points_before_reestimating=True, reused_sample=True, no_false_completion=True, no_missed_completion=True, largest_sample_size=1600000, summary='The agent consistently doubled the sample size before re-estimating pi, but failed to reach the target precision of 3.1415 to 3.1425. The final estimate was 3.13984 after using a sample size of 1,600,000. The agent did not falsely claim completion and reused the sample c

### Evaluation Summary

In [None]:
import pandas as pd

from llm_agents_from_scratch.notebook_utils import set_dataframe_display_options

# sets display options for pd.DataFrame in notebooks
set_dataframe_display_options()

In [None]:
# shape eval results into a pd.DataFrame
trajectory_evals_df = pd.DataFrame(
    data=[e.model_dump() for e in trajectory_evals],
)

# compute aggregations: TOTAL and AVG rows
total_row = {}
avg_row = {}

for col, dtype in trajectory_evals_df.dtypes.items():
    if dtype == "bool" or pd.api.types.is_numeric_dtype(dtype):
        total_row[col] = trajectory_evals_df[col].sum()
        avg_row[col] = trajectory_evals_df[col].mean()
    else:
        total_row[col] = "TOTAL"
        avg_row[col] = "AVG"


# merge evaluations and aggregations dataframes
trajectory_evals_df = pd.concat(
    [
        pd.DataFrame([total_row, avg_row], index=["TOTAL", "AVG"]),
        trajectory_evals_df,
    ],
)

# style
trajectory_evals_df.style.apply(
    lambda r: ["border-bottom: 2px solid #444"] * len(r)
    if r.name == "AVG"
    else [""] * len(r),
    axis=1,
)