In [3]:
import pandas as pd
import plotly.express as px
import numpy as np


def bootstrap_confidence_interval(values, *, n_boot: int = 1000, ci: int = 95, seed: int = 42):
    """
    Estimate the mean of `values` and a percentile-bootstrap confidence interval for it.

    Args:
        values: array-like of numeric values; converted to a float NumPy array.
        n_boot (int): number of bootstrap resamples to draw.
        ci (int): confidence level in percent, e.g. 95 for a 95 % CI.
            Must lie strictly between 0 and 100.
        seed (int): seed for the random generator, so repeated calls with the
            same inputs give the same interval.

    Returns:
        A pair ``(mean, (lower, upper))`` where ``mean`` is the mean of the
        input and ``lower``/``upper`` are the interval bounds (absolute values,
        not offsets from the mean). For empty input all three are NaN.

    Raises:
        AssertionError: if ``ci`` is not strictly between 0 and 100.
    """
    assert 0 < ci < 100, "CI must be between 0 and 100"

    samples = np.asarray(values, dtype=float)
    sample_count = samples.size
    if sample_count == 0:
        # Nothing to resample from: report NaN for the mean and both bounds.
        missing = np.nan
        return missing, (missing, missing)

    center = samples.mean()

    # Each row of `draws` holds the indices of one with-replacement resample
    # that has the same size as the original sample.
    generator = np.random.default_rng(seed)
    draws = generator.choice(sample_count, size=(n_boot, sample_count), replace=True)
    boot_means = samples[draws].mean(axis=1)

    # Cut an equal share off each tail of the resampled means,
    # e.g. 2.5 % per side for a 95 % CI.
    tail = (100 - ci) / 2
    bounds = np.percentile(boot_means, [tail, 100 - tail])

    return center, (bounds[0], bounds[1])


def analyze_results(file_name):
    """
    Summarise every column of a JSON results file by its mean and bootstrap CI.

    Args:
        file_name: path (or anything `pd.read_json` accepts) of the results file.
            Each column is treated as one independent series of numeric outcomes.

    Returns:
        A DataFrame with one row per input column and the columns
        ``n_conversions`` (the original column label), ``mean``, ``lower`` and
        ``upper``. Note that ``lower`` and ``upper`` are the *distances* from
        the mean to the interval bounds (``mean - lower_bound`` and
        ``upper_bound - mean``), not the bounds themselves.
    """
    raw_runs = pd.read_json(file_name)

    summary_rows = {}
    for label, outcomes in raw_runs.items():
        center, (ci_low, ci_high) = bootstrap_confidence_interval(outcomes)
        # Store offsets from the mean rather than absolute interval bounds.
        summary_rows[label] = [center, center - ci_low, ci_high - center]

    summary = pd.DataFrame.from_dict(
        summary_rows,
        orient="index",
        columns=["mean", "lower", "upper"],
    )
    # Turn the column labels (currently the index) into a regular column.
    summary = summary.reset_index()
    return summary.rename(columns={"index": "n_conversions"})


# Summarise the JSON-agent benchmark file and tag every row with its agent type.
results_json = analyze_results("results/benchmark_results_json.json").assign(agent_type="json")

# Code-agent results are currently disabled (presumably re-enabled by uncommenting
# the lines below and adding `results_code` to the list passed to `pd.concat`).
# results_code = analyze_results("results/benchmark_results_code.json")
# results_code["agent_type"] = "code"

# Stack the per-agent summaries row-wise into a single frame.
results = pd.concat([results_json])

display(results)

Unnamed: 0,n_conversions,mean,lower,upper,agent_type
0,10,0.99,0.02,0.01,json
1,20,1.0,0.0,0.0,json
2,30,0.99,0.02,0.01,json
3,40,1.0,0.0,0.0,json
4,50,0.22,0.08,0.08025,json
5,60,0.35,0.09,0.1,json
6,70,0.25,0.08,0.08,json
7,80,0.18,0.07,0.08,json
8,90,0.21,0.08,0.08,json
9,100,0.22,0.08,0.08025,json


In [None]:
# Bar chart of the mean success rate per operation count, one colour per agent type.
# `error_y` / `error_y_minus` name the columns that size the error bar above and
# below each bar, so the bars may be asymmetric.
# `agent_type` holds strings, so plotly uses its discrete colour sequence; no
# continuous colour scale is passed because it would be ignored.
fig = px.bar(
    results,
    x="n_conversions",
    y="mean",
    error_y="upper",
    error_y_minus="lower",
    color="agent_type",
)

# Update legend labels (a selector that matches no trace is a no-op, so this is
# safe when only one agent type is present in `results`).
fig.update_traces(name="Code", selector=dict(name="code"))
fig.update_traces(name="JSON (built-in)", selector=dict(name="json"))

fig.update_layout(
    title="<b>Comparing the reliability of code and JSON agents on repeated operations, with GPT-4o</b>",
    xaxis_title="<b>Number of Operations</b>",
    yaxis_title="<b>Success Rate</b>",
    legend_title_text="Agent Type",
    barmode="group",
    width=1000,
    height=500,
    bargap=0.4,
    # Success rate is a proportion, so pin the axis to [0, 1].
    yaxis=dict(range=[0, 1]),
)

fig.show()
