In [None]:
import json
from pathlib import Path

# Input directory that is scanned for result files, and the two output
# directories that receive the front half and the back half of each file.
IN_DIR = Path("./eval")
OUT1, OUT2 = Path("./eval_1"), Path("./eval_2")

# Create both output directories up front; existing directories are fine.
for _out_dir in (OUT1, OUT2):
    _out_dir.mkdir(parents=True, exist_ok=True)

def recalc_stats(data: dict, new_results: list) -> dict:
    """Return a shallow copy of *data* whose summary fields describe *new_results*.

    Args:
        data: Original result document; it is copied, not modified.
        new_results: Result entries that replace ``data["results"]``. Each
            entry is queried with ``.get("is_skipped")`` and
            ``.get("is_correct")``; missing flags count as false.

    Returns:
        A new dict with ``results``, ``num_items_total``, ``num_evaluated``,
        ``num_stopped``, ``num_skipped`` and ``accuracy_no_skipped`` set.
        ``num_stopped`` is set to the same value as ``num_evaluated``, and
        the accuracy is a percentage string with two decimals (``"0.00%"``
        when nothing was evaluated).
    """
    # Entries that were actually evaluated (i.e. not flagged as skipped).
    evaluated = [r for r in new_results if not r.get("is_skipped", False)]

    n_total = len(new_results)
    n_eval = len(evaluated)
    n_correct = sum(1 for r in evaluated if r.get("is_correct", False))

    if n_eval > 0:
        accuracy = f"{(n_correct / n_eval) * 100:.2f}%"
    else:
        accuracy = "0.00%"

    updated = dict(data)
    updated.update(
        {
            "results": new_results,
            "num_items_total": n_total,
            "num_evaluated": n_eval,
            "num_stopped": n_eval,
            "num_skipped": n_total - n_eval,
            "accuracy_no_skipped": accuracy,
        }
    )
    return updated

def count_metrics(results: list):
    """Tally skip/correct counts and accuracies for a list of result entries.

    Each entry is queried with ``.get("is_skipped")`` and
    ``.get("is_correct")``; missing flags count as false. A skipped entry is
    never counted as correct.

    Returns:
        A dict with the integer counts ``correct_excl``, ``eval_cnt``,
        ``total_cnt`` and ``skipped``, plus two float percentages:
        ``acc_excl`` (correct / evaluated) and ``acc_incl`` (correct / total).
        A percentage is ``0.0`` when its denominator is zero.
    """
    total = len(results)
    skipped = 0
    correct = 0
    for entry in results:
        if entry.get("is_skipped", False):
            skipped += 1
        elif entry.get("is_correct", False):
            correct += 1

    evaluated = total - skipped

    if evaluated > 0:
        acc_excl = correct / evaluated * 100
    else:
        acc_excl = 0.0

    if total > 0:
        acc_incl = correct / total * 100
    else:
        acc_incl = 0.0

    return {
        "correct_excl": correct,
        "eval_cnt": evaluated,
        "total_cnt": total,
        "skipped": skipped,
        "acc_excl": acc_excl,
        "acc_incl": acc_incl,
    }

# One summary tuple per input file, printed after all files are written.
processed = []

# Running totals over all files: agg_1 for the front halves, agg_2 for the
# back halves.
_AGG_KEYS = ("correct_excl", "eval_cnt", "total_cnt", "skipped")
agg_1 = dict.fromkeys(_AGG_KEYS, 0)
agg_2 = dict.fromkeys(_AGG_KEYS, 0)

for path in sorted(IN_DIR.glob("*.json")):
    with path.open("r", encoding="utf-8") as f:
        data = json.load(f)

    results = data.get("results", [])
    n = len(results)

    # Split at the midpoint; with an odd count the back half gets the extra
    # entry.
    n_front = n // 2
    front, back = results[:n_front], results[n_front:]

    # Recompute both summaries before anything is written to disk.
    data_front = recalc_stats(data, front)
    data_back = recalc_stats(data, back)

    # Each half is saved under the original file name in its own directory.
    out1 = OUT1 / path.name
    out2 = OUT2 / path.name
    for _target, _payload in ((out1, data_front), (out2, data_back)):
        with _target.open("w", encoding="utf-8") as f:
            json.dump(_payload, f, ensure_ascii=False, indent=2)

    m1 = count_metrics(front)
    m2 = count_metrics(back)

    processed.append((
        path.name,
        len(front), len(back),
        f"{m1['acc_excl']:.2f}%", f"{m2['acc_excl']:.2f}%",
        f"{m1['acc_incl']:.2f}%", f"{m2['acc_incl']:.2f}%",
    ))

    # Fold this file's counts into the running totals.
    for _agg, _metrics in ((agg_1, m1), (agg_2, m2)):
        for k in _agg:
            _agg[k] += _metrics[k]

# Per-file report: front-half and back-half sizes and accuracies.
for name, nf, nb, accf_ex, accb_ex, accf_in, accb_in in processed:
    _front_msg = f"eval_1(front)={nf}개 acc_excl_skip={accf_ex} | acc_incl_skip={accf_in}"
    _back_msg = f"eval_2(back)={nb}개 acc_excl_skip={accb_ex} | acc_incl_skip={accb_in}"
    print(f"{name}: {_front_msg}  ||  {_back_msg}")

def pretty_overall(agg):
    """Return ``(acc_excl, acc_incl)`` percentages from aggregated counts.

    ``acc_excl`` divides ``agg["correct_excl"]`` by ``agg["eval_cnt"]`` and
    ``acc_incl`` divides it by ``agg["total_cnt"]``; each is ``0.0`` when its
    denominator is not positive.
    """

    def _percent_of(denominator_key):
        # The denominator is looked up first so that "correct_excl" is only
        # read when a division actually takes place.
        denominator = agg[denominator_key]
        if denominator > 0:
            return agg["correct_excl"] / denominator * 100
        return 0.0

    return _percent_of("eval_cnt"), _percent_of("total_cnt")

# Overall accuracies of each half, aggregated across every processed file.
acc1_ex, acc1_in = pretty_overall(agg_1)
acc2_ex, acc2_in = pretty_overall(agg_2)

# One summary line per half; the labels are padded so the colons line up.
for _label, _agg, _acc_ex, _acc_in in (
    ("eval_1(front): ", agg_1, acc1_ex, acc1_in),
    ("eval_2(back) : ", agg_2, acc2_ex, acc2_in),
):
    _counts = (
        f"correct={_agg['correct_excl']}, "
        f"eval={_agg['eval_cnt']}, total={_agg['total_cnt']}, skipped={_agg['skipped']}"
    )
    _rates = f"acc_excl_skip={_acc_ex:.2f}% | acc_incl_skip={_acc_in:.2f}%"
    print(f"{_label}{_counts} -> {_rates}")

def peek(dir_name: str, file_name: str):
    """Print the path of a saved JSON file together with its summary fields.

    Loads ``dir_name/file_name`` as UTF-8 JSON and prints the subset of the
    summary keys (model id, item counts, accuracy) that are present in it.
    """
    target = Path(dir_name) / file_name
    document = json.loads(target.read_text(encoding="utf-8"))

    summary_keys = (
        "model_id",
        "num_items_total",
        "num_evaluated",
        "num_stopped",
        "num_skipped",
        "accuracy_no_skipped",
    )
    # Keys absent from the file are left out rather than reported as missing.
    head = {}
    for key in summary_keys:
        if key in document:
            head[key] = document[key]

    print(target, "->", head)


=== 분할/저장 요약 (파일별) ===
results_ds_cloth_20250913_222703.json: eval_1(front)=238개 acc_excl_skip=60.08% | acc_incl_skip=60.08%  ||  eval_2(back)=238개 acc_excl_skip=65.55% | acc_incl_skip=65.55%
results_ds_korean_20250913_222607.json: eval_1(front)=62개 acc_excl_skip=82.26% | acc_incl_skip=82.26%  ||  eval_2(back)=62개 acc_excl_skip=85.48% | acc_incl_skip=85.48%
results_ds_race_high_long_20250913_222903.json: eval_1(front)=262개 acc_excl_skip=72.14% | acc_incl_skip=72.14%  ||  eval_2(back)=263개 acc_excl_skip=76.81% | acc_incl_skip=76.81%
results_ds_race_high_short_20250913_223008.json: eval_1(front)=262개 acc_excl_skip=82.82% | acc_incl_skip=82.82%  ||  eval_2(back)=263개 acc_excl_skip=72.24% | acc_incl_skip=72.24%
results_ds_race_middle_long_20250913_222726.json: eval_1(front)=87개 acc_excl_skip=77.01% | acc_incl_skip=77.01%  ||  eval_2(back)=88개 acc_excl_skip=84.09% | acc_incl_skip=84.09%
results_ds_race_middle_short_20250913_222747.json: eval_1(front)=87개 acc_excl_skip=78.16% | acc_incl_skip