In [None]:
from pathlib import Path
import ast
import statistics as stats

# Prices in USD per 1M tokens, keyed by the model-name fragment that is
# searched for in the stats-file path.
PRICES = {"gpt-4o": {"input": 2.5, "output": 10.0},
          "gpt-5": {"input": 1.25, "output": 10.0}}


def _mean_std(values):
    """Return ``(mean, sample standard deviation)`` of *values*.

    An empty sequence yields ``(nan, nan)`` and a single value yields
    ``(value, 0.0)``, because ``statistics.stdev`` needs at least two points.
    """
    if not values:
        return float("nan"), float("nan")
    if len(values) == 1:
        return float(values[0]), 0.0
    return stats.mean(values), stats.stdev(values)


def compute_token_and_cost_stats(path_str):
    """Summarise per-request token usage and USD cost from a stats file.

    Every non-blank line of the file is parsed with ``ast.literal_eval`` and
    is expected to be a dict literal. Lines that contain a ``"price"`` key or
    any key ending in ``"_final"`` are skipped (they are not counted as
    requests).

    The path itself selects how a line is interpreted:

    * if it contains ``"langchain"``, the token counts of a request are the
      sums of the ``*_draft``, ``*_reflect`` and ``*_revise`` entries;
      otherwise ``prompt_tokens`` / ``completion_tokens`` / ``total_tokens``
      are read directly;
    * the first ``PRICES`` key found in the path selects the price table used
      for the cost of each request.

    Args:
        path_str: Location of the stats file (``str`` or ``os.PathLike``).

    Returns:
        A dict with ``n_requests`` and the mean / standard deviation of the
        prompt, completion and total token counts and of the cost in USD.
        The same figures are also printed.

    Raises:
        ValueError: If a request has to be priced but the path names none of
            the models in ``PRICES``.
    """
    path = Path(path_str)
    # Normalise once so the substring checks also work for Path arguments.
    path_text = str(path_str)

    # Both decisions depend only on the path, so make them once up front.
    is_langchain = "langchain" in path_text
    # Dict order gives "gpt-4o" precedence over "gpt-5", as before.
    model = next((name for name in PRICES if name in path_text), None)

    prompt, comp, total, cost = [], [], [], []

    for raw in path.read_text().splitlines():
        line = raw.strip()
        if not line:
            continue
        record = ast.literal_eval(line)
        if any(key.endswith("_final") for key in record) or ("price" in record):
            continue

        if is_langchain:
            stages = ("draft", "reflect", "revise")
            p = sum(record[f"prompt_tokens_{stage}"] for stage in stages)
            c = sum(record[f"completion_tokens_{stage}"] for stage in stages)
            t = sum(record[f"total_tokens_{stage}"] for stage in stages)
        else:
            p = record["prompt_tokens"]
            c = record["completion_tokens"]
            t = record["total_tokens"]

        # Raised lazily so a file without any countable request still yields
        # the all-NaN summary even when the model cannot be identified.
        if model is None:
            raise ValueError(
                f"Cannot determine model pricing from path {path_text!r}; "
                f"expected it to contain one of {sorted(PRICES)}"
            )

        prompt.append(p)
        comp.append(c)
        total.append(t)

        # cost per request in USD
        price = PRICES[model]
        cost.append((p / 1000000) * price["input"] + (c / 1000000) * price["output"])

    prompt_mean, prompt_std = _mean_std(prompt)
    comp_mean, comp_std = _mean_std(comp)
    total_mean, total_std = _mean_std(total)
    cost_mean, cost_std = _mean_std(cost)

    stats_dict = {
        "n_requests": len(total),
        "prompt_mean": prompt_mean,
        "prompt_std": prompt_std,
        "completion_mean": comp_mean,
        "completion_std": comp_std,
        "total_mean": total_mean,
        "total_std": total_std,
        "cost_mean_usd": cost_mean,
        "cost_std_usd": cost_std,
    }
    print(f"""Prompt mean: {stats_dict["prompt_mean"]} ({stats_dict["prompt_std"]})
Completion mean: {stats_dict["completion_mean"]} ({stats_dict["completion_std"]})
Total mean: {stats_dict["total_mean"]} ({stats_dict["total_std"]})
Cost mean: {stats_dict["cost_mean_usd"]} ({stats_dict["cost_std_usd"]})
""")
    return stats_dict

In [22]:
import os
# Report token and cost statistics for every run stored under the results
# folder. Each run is a sub-directory that holds its own stats.txt.
results_folder = "./results/fin/"
# os.listdir returns entries in arbitrary order; sort for a reproducible report.
for name in sorted(os.listdir(results_folder)):
    stats_path = os.path.join(results_folder, name, "stats.txt")
    # Skip anything that is not a run directory with a stats file
    # (e.g. .DS_Store or other stray files) instead of crashing on it.
    if not os.path.isfile(stats_path):
        continue
    print(name)
    compute_token_and_cost_stats(stats_path)

gpt-5-2025-08-07-2025-09-01_15-34-16_add_instr
Prompt mean: 153.9 (10.967949597228596)
Completion mean: 2859.66 (1400.2808262076926)
Total mean: 3013.56 (1409.0041230512813)
Cost mean: 0.028788975 (0.014013695134386052)

gpt-5-2025-08-07-2025-09-29_21-27-34_langchain
Prompt mean: 18672.84 (2980.437930840338)
Completion mean: 6589.18 (3729.3705475890283)
Total mean: 25262.02 (6198.307222795855)
Cost mean: 0.08923285 (0.04000059549862849)

gpt-4o-2025-09-13_19-54-25_10_shot_RAG
Prompt mean: 2009.92 (925.1742216513403)
Completion mean: 135.76 (80.38315388544684)
Total mean: 2145.68 (989.1829330826735)
Cost mean: 0.0063824 (0.002982895925445743)

gpt-4o-2025-10-02_16-35-16_huge_prompt_UM
Prompt mean: 13059.9 (10.967949597228596)
Completion mean: 121.58 (68.07591500730366)
Total mean: 13181.48 (78.12163200903512)
Cost mean: 0.03386555 (0.0007056160085560781)

gpt-4o-2025-10-02_14-46-30_langchain
Prompt mean: 19227.9 (3350.30590896913)
Completion mean: 312.06 (201.84287787023544)
Total mean: