In [1]:
import pandas as pd
import json
import pickle
from pathlib import Path
from pprint import pprint

#### for the given runs -> check which tasks were solved by how many runs

In [2]:
# Result directories of runs 1-5; every directory name is the shared prefix
# followed by "<run number>_<date stamp>".
_run_prefix = 'results_terminal_bench_singleton_gpt-oss-20b_run_'
_run_stamps = {1: '010925', 2: '030925', 3: '030925', 4: '050925', 5: '050925'}
runs = [f"{_run_prefix}{number}_{stamp}" for number, stamp in _run_stamps.items()]

In [3]:
# Build a table of which tasks were solved by which runs.
#
# Run directories are resolved relative to the current working directory, so
# the notebook has to be started from the folder that contains them; otherwise
# `iterdir()` raises FileNotFoundError.

# get all solved and unsolved tasks per run
unresolved_tasks = dict()
resolved_tasks = dict()
all_tasks = set()
for r in runs:
    r_dir = Path.cwd() / r
    # results.json is read from the "*_terminal_bench_logs" sub-directory of the run;
    # sorting makes the choice deterministic should there ever be more than one
    tb_log_dirs = sorted(d for d in r_dir.iterdir() if d.is_dir() and d.name.endswith("_terminal_bench_logs"))
    if not tb_log_dirs:
        raise FileNotFoundError(f"no '*_terminal_bench_logs' directory found in {r_dir}")
    r_dir = tb_log_dirs[0]
    r_data = json.loads((r_dir / "results.json").read_text())
    resolved_tasks[r] = r_data['resolved_ids']
    unresolved_tasks[r] = r_data['unresolved_ids']
    all_tasks.update(r_data['resolved_ids'])
    all_tasks.update(r_data['unresolved_ids'])

# count how many times a task was solved
# sorted(): set iteration order varies between kernel sessions, which would
# otherwise shuffle tasks that have the same count
tasks = sorted(all_tasks)
# sets give constant-time membership tests instead of scanning a list per task
resolved_lookup = {r: set(ids) for r, ids in resolved_tasks.items()}
solved_by_runs = [
    # 1-based position of each run in `runs` that resolved the task
    [r_i + 1 for r_i, r in enumerate(runs) if t in resolved_lookup[r]]
    for t in tasks
]
task_counts = pd.DataFrame({
    "tasks": tasks,
    "solved_by_runs": solved_by_runs
})
task_counts['counts'] = task_counts['solved_by_runs'].apply(len)
# keep only tasks solved at least once, most frequently solved first; the stable
# sort keeps the alphabetical task order among equal counts
task_counts = (
    task_counts[task_counts["counts"] > 0]
    .sort_values(by="counts", ascending=False, kind="stable")
)
task_counts

FileNotFoundError: [Errno 2] No such file or directory: '/Users/linus/Documents/TUM/MA/code/agentu/src/benchmark/terminal_bench/analytics/results_terminal_bench_singleton_gpt-oss-20b_run_1_010925'

#### tool usages over all runs

In [None]:
# Rebind `runs` to the result directories of runs 3, 4 and 5 only.
_run_prefix = 'results_terminal_bench_singleton_gpt-oss-20b_run_'
runs = [_run_prefix + suffix for suffix in ('3_030925', '4_050925', '5_050925')]

In [None]:
# For each run, sum the tool call counts recorded in all of the run's
# "*_exec_meta_data.json" files and print one {tool: total calls} dict per run.
for r in runs:
    all_tool_counts = dict()
    r_dir = Path.cwd() / r
    for f in r_dir.iterdir():
        if f.is_file() and f.name.endswith("_exec_meta_data.json"):
            tool_counts = json.loads(f.read_text())['agent_usage_stats']['tool_call_counts']
            for tool, count in tool_counts.items():
                # add the recorded number of calls; adding 1 would only count
                # how many files mention the tool, not how often it was called
                all_tool_counts[tool] = all_tool_counts.get(tool, 0) + count
    # label the output so the per-run dicts can be told apart
    print(r)
    pprint(all_tool_counts)


#### tasks where the "Terminal" tool was used

In [None]:
# Print "[<Terminal call count>] - <file name>" for every "*_exec_meta_data.json"
# file of run 5 whose tool call counts contain a 'Terminal' entry.
r_dir = Path.cwd() / 'results_terminal_bench_singleton_gpt-oss-20b_run_5_050925'
for f in r_dir.iterdir():
    # skip directories and anything that is not an exec meta data file
    if not (f.is_file() and f.name.endswith("_exec_meta_data.json")):
        continue
    meta_data = json.loads(f.read_text())
    tool_counts = meta_data['agent_usage_stats']['tool_call_counts']
    if 'Terminal' not in tool_counts:
        continue
    print(f"[{tool_counts['Terminal']}] - {f.name}")

#### 

#### concat all log data (tb logs and our logs)

In [None]:
# Load the terminal-bench results.json of one run and index its per-task
# results by task instruction.
run_dir = Path.cwd() / 'results_terminal_bench_singleton_gpt-oss-20b_run_6_40_070925'

# get tb results data
# results.json is read from the "*_terminal_bench_logs" sub-directory of the run;
# fail with a descriptive error instead of an IndexError when it is missing
tb_log_dirs = sorted(d for d in run_dir.iterdir() if d.is_dir() and d.name.endswith("_terminal_bench_logs"))
if not tb_log_dirs:
    raise FileNotFoundError(f"no '*_terminal_bench_logs' directory found in {run_dir}")
results_dir = tb_log_dirs[0]
results_data = json.loads((results_dir / "results.json").read_text())

# collect tb tasks and their logging data
tb_data = dict()
duplicate_instructions = 0
for result in results_data['results']:
    # the instruction text is the dict key, so a second result with the same
    # instruction replaces the first one; count these to make the loss visible
    if result['instruction'] in tb_data:
        duplicate_instructions += 1
    tb_data[result['instruction']] = {
        'task_id': result['task_id'],
        'instruction': result['instruction'],
        'is_resolved': bool(result['is_resolved']),
        'failure_mode': result['failure_mode'],
        # `or 0` maps missing (None) token counts to 0
        'total_input_tokens': int(result['total_input_tokens'] or 0),
        'total_output_tokens': int(result['total_output_tokens'] or 0)
    }
if duplicate_instructions:
    print(f"warning: {duplicate_instructions} result(s) share an instruction with an earlier result and replaced it")

# number of tasks: total, resolved, unresolved
resolved_count = sum(1 for d in tb_data.values() if d['is_resolved'])
print(len(tb_data))
print(resolved_count)
print(len(tb_data) - resolved_count)

In [None]:
# Read every "*_exec_meta_data.json" file in the run directory and index the
# parsed content by its 'task_instruction' value, then print the entry count.
meta_files = [
    path for path in run_dir.iterdir()
    if path.is_file() and path.name.endswith("_exec_meta_data.json")
]
parsed_logs = (json.loads(path.read_text()) for path in meta_files)
our_data = {entry['task_instruction']: entry for entry in parsed_logs}
print(len(our_data))

In [None]:
# Merge both log dicts: every terminal-bench entry is kept, and where our own
# logs contain the same instruction their fields are merged on top of it.
logs_data = {
    instruction: (tb_entry | our_data[instruction]) if instruction in our_data else tb_entry
    for instruction, tb_entry in tb_data.items()
}
# how many terminal-bench tasks do / do not have an entry in our logs
matches = sum(1 for instruction in tb_data if instruction in our_data)
no_matches = len(tb_data) - matches
print(f"matches: {matches}; no_matches: {no_matches}")

# number of merged entries: total, resolved, unresolved
resolved_total = sum(1 for entry in logs_data.values() if entry['is_resolved'])
print(len(logs_data))
print(resolved_total)
print(len(logs_data) - resolved_total)

# show the first merged entry as an example
pprint(list(logs_data.items())[0])

In [None]:
# count all prompt tokens and completions tokens
# Sums the token counts over all merged log entries, once from our own logs
# ('agent_usage_stats') and once from the terminal-bench results.
prompt_tokens_our_logs = 0
completion_tokens_our_logs = 0
total_input_tokens_tb_logs = 0
total_output_tokens_tb_logs = 0
for logs in logs_data.values():
    total_input_tokens_tb_logs += logs['total_input_tokens']
    total_output_tokens_tb_logs += logs['total_output_tokens']
    # entries without a match in our logs carry only the terminal-bench fields,
    # so they have no 'agent_usage_stats'; skip them instead of raising KeyError
    usage_stats = logs.get('agent_usage_stats')
    if usage_stats is None:
        continue
    prompt_tokens_our_logs += sum(usage_stats['prompt_tokens'])
    completion_tokens_our_logs += sum(usage_stats['completion_tokens'])
print(f"prompt_tokens_our_logs: {prompt_tokens_our_logs}")
print(f"completion_tokens_our_logs: {completion_tokens_our_logs}")
print(f"total_input_tokens_tb_logs: {total_input_tokens_tb_logs}")
print(f"total_output_tokens_tb_logs: {total_output_tokens_tb_logs}")