## Imports, Setup, Helper Functions

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

THRESHOLDS = [0.25, 0.5, 0.75, 0.9]

In [None]:
def decide(p: float, t: float) -> bool:
    # decision rule: True if model answers
    # assumption: model only answers when p > t
    return p > t


def penalty_score(pred: str, gold: str, p: float, t: float) -> float:
    # our penalty adjusted scoring
    # if p <= t -> the model abstains -> score = 0.
    # if p > t and prediction == gold -> score = +1.
    # if p > t and prediction != gold -> score = - (p * t) / (1 - t)
    if p <= t:
        return 0.0
    if pred == gold:
        return 1.0
    return round(- (p * t) / (1.0 - t),2)


def accuracy_at_threshold(df: pd.DataFrame, t: float) -> float:
    # fraction of answered questions that are correct
    # acc@t = (correct with p>t) / (# answered with p>t)
    answered = df["confidence"] > t
    answered_n = answered.sum()
    if answered_n == 0:
        return 0.0
    correct = (df["predicted_answer"] == df["answer"]) & answered
    return round(100*float(correct.sum() / answered_n),2)

def coverage(df: pd.DataFrame, t: float) -> float:
    # frac of total questions the model answers
    # cov = (# p>t) / total

    if len(df) == 0:
        return 0.0
    return round(100*float((df["confidence"] > t).sum() / len(df)),2)

def penalty_adjusted_mean(df: pd.DataFrame, t: float) -> float:
    # mean penalty-adjusted score across all rows 
    # includes abstains = 0

    scores = [
        penalty_score(r.predicted_answer, r.answer, float(r.confidence), t)
        for r in df.itertuples(index=False)
    ]
    return round(float(np.mean(scores)) if scores else 0.0,2)

def overconfidence_rate(df: pd.DataFrame, t: float) -> float:
    # fraction of questions where the model is wrong but p>t
    # over = (# wrong & p>t) / total
    if len(df) == 0:
        return 0.0
    mask = (df["predicted_answer"] != df["answer"]) & (df["confidence"] > t)
    return round(100*float(mask.sum() / len(df)))


## Qwen Evaluation

In [None]:
CSV_PATH = Path("../inference/outputs/qwen-gpqa.csv")
OUTPUT_PATH = Path("outputs")

In [None]:
df = pd.read_csv(CSV_PATH)

df["answer"] = df["answer"].astype(str).str.strip().str.upper()
df["predicted_answer"] = df["predicted_answer"].astype(str).str.strip().str.upper()
df["confidence"] = round(100*pd.to_numeric(df["confidence"], errors="coerce").fillna(0.0),2)
df["threshold"] = pd.to_numeric(df["threshold"], errors="coerce")

print(f"Loaded {len(df)} rows from {CSV_PATH.name}")
print("Thresholds found:", sorted(df["threshold"].unique()))
df.head(3)

✅ Loaded 1020 rows from qwen-gpqa.csv
Thresholds found: [np.float64(0.25), np.float64(0.5), np.float64(0.75), np.float64(0.9)]


Unnamed: 0,id,threshold,question,choices,answer,predicted_answer,confidence
0,203,0.25,Identify the correct sequence of reagents for ...,"['1. NaH; CH3CH2Br 2. H2SO4, HNO3 3. Fe-HCl 4....",D,IDK,66.67
1,266,0.25,There is a C-NOT gate where the condition is t...,['U_{C-NOT}\\left|\\psi\\right\\rangle =\\alph...,A,B,100.0
2,152,0.25,Two stars are being studied. It has been obser...,"['ln(2) = [ (T_1 - T_2) / (T1*T2)]', 'ln(2) = ...",A,IDK,100.0


In [None]:
dfs_by_t = {t: df[df["threshold"] == t].copy() for t in THRESHOLDS}

for t in THRESHOLDS:
    print(f"t={t}: {len(dfs_by_t[t])} rows")


t=0.25: 255 rows
t=0.5: 255 rows
t=0.75: 255 rows
t=0.9: 255 rows


In [None]:
metrics_rows = []

for t in THRESHOLDS:
    df_t = dfs_by_t[t]

    metrics_rows.append({
        "threshold": t,
        "accuracy_at_t": accuracy_at_threshold(df_t, t),
        "coverage": coverage(df_t, t),
        "penalty_mean": penalty_adjusted_mean(df_t, t),
        "overconf_rate": round(overconfidence_rate(df_t, t),2),
        "answered_n": int((df_t["confidence"] > t).sum()),
        "total_n": len(df_t)
    })

metrics_df = pd.DataFrame(metrics_rows).sort_values("threshold").reset_index(drop=True)
metrics_df

Unnamed: 0,threshold,accuracy_at_t,coverage,penalty_mean,overconf_rate,answered_n,total_n
0,0.25,5.74,95.69,-28.31,90,244,255
1,0.5,4.53,95.29,-85.05,91,243,255
2,0.75,6.58,95.29,-254.84,89,243,255
3,0.9,4.96,94.9,-763.72,90,242,255


In [None]:
eval_table = (
    metrics_df[["threshold", "accuracy_at_t", "coverage", "penalty_mean", "overconf_rate"]]
    .set_index("threshold")
    .sort_index()
)

print("4×4 Evaluation Table:")
display(eval_table)

eval_table.to_csv(OUTPUT_PATH / "qwen-gpqa-metric-eval.csv")
print(f"Saved results in: {OUTPUT_PATH}")


4×4 Evaluation Table:


Unnamed: 0_level_0,accuracy_at_t,coverage,penalty_mean,overconf_rate
threshold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.25,5.74,95.69,-28.31,90
0.5,4.53,95.29,-85.05,91
0.75,6.58,95.29,-254.84,89
0.9,4.96,94.9,-763.72,90


Saved results in: outputs


## Baseline Evaluation


In [140]:
coverage_floor = 0.3

eligible = eval_table[eval_table["coverage"] >= coverage_floor]
print("DEBUG — eligible rows:")
display(eligible)

best_row = eligible.loc[eligible["accuracy_at_t"].idxmax()]
t_star = best_row.name

print(f"Selected t* = {t_star}")
display(best_row)

DEBUG — eligible rows:


Unnamed: 0_level_0,accuracy_at_t,coverage,penalty_mean,overconf_rate
threshold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.25,5.74,95.69,-28.31,90
0.5,4.53,95.29,-85.05,91
0.75,6.58,95.29,-254.84,89
0.9,4.96,94.9,-763.72,90


Selected t* = 0.75


accuracy_at_t      6.58
coverage          95.29
penalty_mean    -254.84
overconf_rate     89.00
Name: 0.75, dtype: float64

In [None]:
import numpy as np
df = pd.read_csv(CSV_PATH)
binary_acc = np.mean(df["predicted_answer"] == df["answer"])
wrong_rate = 1 - binary_acc
binary_row = {
    "accuracy_at_t": round(100*binary_acc,2),
    "coverage": 1.0,
    "penalty_mean": np.nan,  
    "overconf_rate": round(100*wrong_rate,2),
}

abstain_row = {
    "accuracy_at_t": 0.0,
    "coverage": 0.0,
    "penalty_mean": 0.0,
    "overconf_rate": 0.0,
}

print("Binary-grading baseline:")
display(pd.DataFrame([binary_row], index=["Binary"]))
print("Always-abstain baseline:")
display(pd.DataFrame([abstain_row], index=["Abstain"]))

Binary-grading baseline:


Unnamed: 0,accuracy_at_t,coverage,penalty_mean,overconf_rate
Binary,5.2,1.0,,94.8


Always-abstain baseline:


Unnamed: 0,accuracy_at_t,coverage,penalty_mean,overconf_rate
Abstain,0.0,0.0,0.0,0.0


In [None]:
headline_df = pd.DataFrame([
    best_row[["accuracy_at_t", "coverage", "penalty_mean", "overconf_rate"]],
    pd.Series(binary_row),
    pd.Series(abstain_row)
], index=[f"Confidence-aware (t*={t_star})", "Binary grading", "Always abstain"])

print("=== 3x4 Headline Evaluation Table ===")
display(headline_df)
headline_df.to_csv(OUTPUT_PATH / "qwen-gpqa-baseline-eval.csv")


=== 3x4 Headline Evaluation Table ===


Unnamed: 0,accuracy_at_t,coverage,penalty_mean,overconf_rate
Confidence-aware (t*=0.75),6.58,95.29,-254.84,89.0
Binary grading,5.2,1.0,,94.8
Always abstain,0.0,0.0,0.0,0.0


## GPT Evaluation

In [None]:
CSV_PATH = Path("../inference/outputs/gpt-gpqa.csv")
OUTPUT_PATH = Path("outputs")


In [None]:
df = pd.read_csv(CSV_PATH)

df["answer"] = df["answer"].astype(str).str.strip().str.upper()
df["predicted_answer"] = df["predicted_answer"].astype(str).str.strip().str.upper()
df["confidence"] = round(100*pd.to_numeric(df["confidence"], errors="coerce").fillna(0.0),2)
df["threshold"] = pd.to_numeric(df["threshold"], errors="coerce")

print(f"Loaded {len(df)} rows from {CSV_PATH.name}")
print("Thresholds found:", sorted(df["threshold"].unique()))
df.head(3)

✅ Loaded 1020 rows from gpt-gpqa.csv
Thresholds found: [np.float64(0.25), np.float64(0.5), np.float64(0.75), np.float64(0.9)]


Unnamed: 0,id,threshold,question,choices,answer,predicted_answer,confidence
0,203,0.25,Identify the correct sequence of reagents for ...,"['1. NaH; CH3CH2Br 2. H2SO4, HNO3 3. Fe-HCl 4....",D,B,66.67
1,266,0.25,There is a C-NOT gate where the condition is t...,['U_{C-NOT}\\left|\\psi\\right\\rangle =\\alph...,A,D,66.67
2,152,0.25,Two stars are being studied. It has been obser...,"['ln(2) = [ (T_1 - T_2) / (T1*T2)]', 'ln(2) = ...",A,A,100.0


In [None]:
dfs_by_t = {t: df[df["threshold"] == t].copy() for t in THRESHOLDS}

for t in THRESHOLDS:
    print(f"t={t}: {len(dfs_by_t[t])} rows")


t=0.25: 255 rows
t=0.5: 255 rows
t=0.75: 255 rows
t=0.9: 255 rows


In [None]:
metrics_rows = []

for t in THRESHOLDS:
    df_t = dfs_by_t[t]

    metrics_rows.append({
        "threshold": t,
        "accuracy_at_t": accuracy_at_threshold(df_t, t),
        "coverage": coverage(df_t, t),
        "penalty_mean": penalty_adjusted_mean(df_t, t),
        "overconf_rate": overconfidence_rate(df_t, t),
        "answered_n": int((df_t["confidence"] > t).sum()),
        "total_n": len(df_t)
    })

metrics_df = pd.DataFrame(metrics_rows).sort_values("threshold").reset_index(drop=True)


In [None]:
eval_table = (
    metrics_df[["threshold", "accuracy_at_t", "coverage", "penalty_mean", "overconf_rate"]]
    .set_index("threshold")
    .sort_index()
)

print("4×4 Evaluation Table:")
display(eval_table)

eval_table.to_csv(OUTPUT_PATH / "gpt-gpqa-metric-eval.csv")

print(f"Saved results in: {OUTPUT_PATH}")


4×4 Evaluation Table:


Unnamed: 0_level_0,accuracy_at_t,coverage,penalty_mean,overconf_rate
threshold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.25,33.46,99.61,-18.22,66
0.5,33.86,99.61,-54.24,66
0.75,29.92,99.61,-172.33,70
0.9,31.1,99.61,-506.22,69


Saved results in: outputs


### Baseline Evaluation

In [148]:
coverage_floor = 0.3

eligible = eval_table[eval_table["coverage"] >= coverage_floor]
print("DEBUG — eligible rows:")
display(eligible)

best_row = eligible.loc[eligible["accuracy_at_t"].idxmax()]
t_star = best_row.name

print(f"Selected t* = {t_star}")
display(best_row)

DEBUG — eligible rows:


Unnamed: 0_level_0,accuracy_at_t,coverage,penalty_mean,overconf_rate
threshold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.25,33.46,99.61,-18.22,66
0.5,33.86,99.61,-54.24,66
0.75,29.92,99.61,-172.33,70
0.9,31.1,99.61,-506.22,69


Selected t* = 0.5


accuracy_at_t    33.86
coverage         99.61
penalty_mean    -54.24
overconf_rate    66.00
Name: 0.5, dtype: float64

In [None]:
import numpy as np
df = pd.read_csv(CSV_PATH)

binary_acc = np.mean(df["predicted_answer"] == df["answer"])

wrong_rate = 1 - binary_acc

binary_row = {
    "accuracy_at_t": round(100*binary_acc,2),
    "coverage": 1.0,
    "penalty_mean": np.nan,
    "overconf_rate": round(100*wrong_rate,2),
}

abstain_row = {
    "accuracy_at_t": 0.0,
    "coverage": 0.0,
    "penalty_mean": 0.0,
    "overconf_rate": 0.0,
}

print("Binary-grading baseline:")
display(pd.DataFrame([binary_row], index=["Binary"]))
print("Always-abstain baseline:")
display(pd.DataFrame([abstain_row], index=["Abstain"]))


Binary-grading baseline:


Unnamed: 0,accuracy_at_t,coverage,penalty_mean,overconf_rate
Binary,31.96,1.0,,68.04


Always-abstain baseline:


Unnamed: 0,accuracy_at_t,coverage,penalty_mean,overconf_rate
Abstain,0.0,0.0,0.0,0.0


In [None]:
headline_df = pd.DataFrame([
    best_row[["accuracy_at_t", "coverage", "penalty_mean", "overconf_rate"]],
    pd.Series(binary_row),
    pd.Series(abstain_row)
], index=[f"Confidence-aware (t*={t_star})", "Binary grading", "Always abstain"])

print("=== 3x4 Headline Evaluation Table ===")
display(headline_df)
headline_df.to_csv(OUTPUT_PATH / "gpt-gpqa-baseline-eval.csv")


=== 3x4 Headline Evaluation Table ===


Unnamed: 0,accuracy_at_t,coverage,penalty_mean,overconf_rate
Confidence-aware (t*=0.5),33.86,99.61,-54.24,66.0
Binary grading,31.96,1.0,,68.04
Always abstain,0.0,0.0,0.0,0.0


## Claude Evaluation


In [None]:
CSV_PATH = Path("../inference/outputs/claude-gpqa.csv")
OUTPUT_PATH = Path("outputs")


In [None]:
df = pd.read_csv(CSV_PATH)

df["answer"] = df["answer"].astype(str).str.strip().str.upper()
df["predicted_answer"] = df["predicted_answer"].astype(str).str.strip().str.upper()
df["confidence"] = round(100*pd.to_numeric(df["confidence"], errors="coerce").fillna(0.0),2)
df["threshold"] = pd.to_numeric(df["threshold"], errors="coerce")

print(f"Loaded {len(df)} rows from {CSV_PATH.name}")
print("Thresholds found:", sorted(df["threshold"].unique()))
df.head(3)

✅ Loaded 1020 rows from claude-gpqa.csv
Thresholds found: [np.float64(0.25), np.float64(0.5), np.float64(0.75), np.float64(0.9)]


Unnamed: 0,id,threshold,question,choices,answer,predicted_answer,confidence
0,203,0.25,Identify the correct sequence of reagents for ...,"['1. NaH; CH3CH2Br 2. H2SO4, HNO3 3. Fe-HCl 4....",D,I,100.0
1,266,0.25,There is a C-NOT gate where the condition is t...,['U_{C-NOT}\\left|\\psi\\right\\rangle =\\alph...,A,I,100.0
2,152,0.25,Two stars are being studied. It has been obser...,"['ln(2) = [ (T_1 - T_2) / (T1*T2)]', 'ln(2) = ...",A,I,100.0


In [None]:
dfs_by_t = {t: df[df["threshold"] == t].copy() for t in THRESHOLDS}

for t in THRESHOLDS:
    print(f"t={t}: {len(dfs_by_t[t])} rows")


t=0.25: 255 rows
t=0.5: 255 rows
t=0.75: 255 rows
t=0.9: 255 rows


In [None]:
metrics_rows = []

for t in THRESHOLDS:
    df_t = dfs_by_t[t]

    metrics_rows.append({
        "threshold": t,
        "accuracy_at_t": accuracy_at_threshold(df_t, t),
        "coverage": coverage(df_t, t),
        "penalty_mean": penalty_adjusted_mean(df_t, t),
        "overconf_rate": overconfidence_rate(df_t, t),
        "answered_n": int((df_t["confidence"] > t).sum()),
        "total_n": len(df_t)
    })

metrics_df = pd.DataFrame(metrics_rows).sort_values("threshold").reset_index(drop=True)


In [None]:
eval_table = (
    metrics_df[["threshold", "accuracy_at_t", "coverage", "penalty_mean", "overconf_rate"]]
    .set_index("threshold")
    .sort_index()
)
print("4×4 Evaluation Table:")
display(eval_table)

eval_table.to_csv(OUTPUT_PATH / "claude-gpqa-metric-eval.csv")

print(f"Saved results in: {OUTPUT_PATH}")


4×4 Evaluation Table:


Unnamed: 0_level_0,accuracy_at_t,coverage,penalty_mean,overconf_rate
threshold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.25,3.43,91.37,-29.08,88
0.5,2.92,94.12,-89.08,91
0.75,2.88,95.29,-276.31,93
0.9,2.44,96.47,-839.27,94


Saved results in: outputs


### Baseline Evaluation

In [156]:
coverage_floor = 0.3

eligible = eval_table[eval_table["coverage"] >= coverage_floor]
print("DEBUG — eligible rows:")
display(eligible)

best_row = eligible.loc[eligible["accuracy_at_t"].idxmax()]
t_star = best_row.name

print(f"Selected t* = {t_star}")
display(best_row)

DEBUG — eligible rows:


Unnamed: 0_level_0,accuracy_at_t,coverage,penalty_mean,overconf_rate
threshold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.25,3.43,91.37,-29.08,88
0.5,2.92,94.12,-89.08,91
0.75,2.88,95.29,-276.31,93
0.9,2.44,96.47,-839.27,94


Selected t* = 0.25


accuracy_at_t     3.43
coverage         91.37
penalty_mean    -29.08
overconf_rate    88.00
Name: 0.25, dtype: float64

In [None]:
import numpy as np

df = pd.read_csv(CSV_PATH)

binary_acc = np.mean(df["predicted_answer"] == df["answer"])

wrong_rate = 1 - binary_acc

binary_row = {
    "accuracy_at_t": round(100*binary_acc,2),
    "coverage": 1.0,
    "penalty_mean": np.nan,
    "overconf_rate": round(100*wrong_rate,2),
}

abstain_row = {
    "accuracy_at_t": 0.0,
    "coverage": 0.0,
    "penalty_mean": 0.0,
    "overconf_rate": 0.0,
}

print("Binary-grading baseline:")
display(pd.DataFrame([binary_row], index=["Binary"]))
print("Always-abstain baseline:")
display(pd.DataFrame([abstain_row], index=["Abstain"]))


Binary-grading baseline:


Unnamed: 0,accuracy_at_t,coverage,penalty_mean,overconf_rate
Binary,2.75,1.0,,97.25


Always-abstain baseline:


Unnamed: 0,accuracy_at_t,coverage,penalty_mean,overconf_rate
Abstain,0.0,0.0,0.0,0.0


In [None]:
headline_df = pd.DataFrame([
    best_row[["accuracy_at_t", "coverage", "penalty_mean", "overconf_rate"]],
    pd.Series(binary_row),
    pd.Series(abstain_row)
], index=[f"Confidence-aware (t*={t_star})", "Binary grading", "Always abstain"])

print("=== 3x4 Headline Evaluation Table ===")
display(headline_df)
headline_df.to_csv(OUTPUT_PATH / "claude-gpqa-baseline-eval.csv")


=== 3x4 Headline Evaluation Table ===


Unnamed: 0,accuracy_at_t,coverage,penalty_mean,overconf_rate
Confidence-aware (t*=0.25),3.43,91.37,-29.08,88.0
Binary grading,2.75,1.0,,97.25
Always abstain,0.0,0.0,0.0,0.0


## LLama Evaluation

In [None]:
CSV_PATH = Path("../inference/outputs/llama-gpqa.csv")
OUTPUT_PATH = Path("outputs")


In [None]:
df = pd.read_csv(CSV_PATH)

df["answer"] = df["answer"].astype(str).str.strip().str.upper()
df["predicted_answer"] = df["predicted_answer"].astype(str).str.strip().str.upper()
df["confidence"] = round(100*pd.to_numeric(df["confidence"], errors="coerce").fillna(0.0),2)
df["threshold"] = pd.to_numeric(df["threshold"], errors="coerce")

print(f"Loaded {len(df)} rows from {CSV_PATH.name}")
print("Thresholds found:", sorted(df["threshold"].unique()))
df.head(3)

✅ Loaded 1020 rows from llama-gpqa.csv
Thresholds found: [np.float64(0.25), np.float64(0.5), np.float64(0.75), np.float64(0.9)]


Unnamed: 0,id,threshold,question,choices,answer,predicted_answer,confidence
0,203,0.25,Identify the correct sequence of reagents for ...,"['1. NaH; CH3CH2Br 2. H2SO4, HNO3 3. Fe-HCl 4....",D,T,100.0
1,266,0.25,There is a C-NOT gate where the condition is t...,['U_{C-NOT}\\left|\\psi\\right\\rangle =\\alph...,A,IDK,0.0
2,152,0.25,Two stars are being studied. It has been obser...,"['ln(2) = [ (T_1 - T_2) / (T1*T2)]', 'ln(2) = ...",A,A,66.67


In [None]:
dfs_by_t = {t: df[df["threshold"] == t].copy() for t in THRESHOLDS}

for t in THRESHOLDS:
    print(f"t={t}: {len(dfs_by_t[t])} rows")


t=0.25: 255 rows
t=0.5: 255 rows
t=0.75: 255 rows
t=0.9: 255 rows


In [None]:
metrics_rows = []

for t in THRESHOLDS:
    df_t = dfs_by_t[t]

    metrics_rows.append({
        "threshold": t,
        "accuracy_at_t": accuracy_at_threshold(df_t, t),
        "coverage": coverage(df_t, t),
        "penalty_mean": penalty_adjusted_mean(df_t, t),
        "overconf_rate": overconfidence_rate(df_t, t),
        "answered_n": int((df_t["confidence"] > t).sum()),
        "total_n": len(df_t)
    })

metrics_df = pd.DataFrame(metrics_rows).sort_values("threshold").reset_index(drop=True)


In [None]:
eval_table = (
    metrics_df[["threshold", "accuracy_at_t", "coverage", "penalty_mean", "overconf_rate"]]
    .set_index("threshold")
    .sort_index()
)

print("4×4 Evaluation Table:")
display(eval_table)

eval_table.to_csv(OUTPUT_PATH / "llama-gpqa-metric-eval.csv")
print(f"Saved results in: {OUTPUT_PATH}")


4×4 Evaluation Table:


Unnamed: 0_level_0,accuracy_at_t,coverage,penalty_mean,overconf_rate
threshold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.25,10.83,61.57,-15.8,55
0.5,10.78,65.49,-50.78,58
0.75,11.54,61.18,-139.44,54
0.9,13.12,62.75,-412.68,55


Saved results in: outputs


### Baseline Evaluation

In [164]:
coverage_floor = 0.3

eligible = eval_table[eval_table["coverage"] >= coverage_floor]
print("DEBUG — eligible rows:")
display(eligible)

best_row = eligible.loc[eligible["accuracy_at_t"].idxmax()]
t_star = best_row.name

print(f"Selected t* = {t_star}")
display(best_row)

DEBUG — eligible rows:


Unnamed: 0_level_0,accuracy_at_t,coverage,penalty_mean,overconf_rate
threshold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.25,10.83,61.57,-15.8,55
0.5,10.78,65.49,-50.78,58
0.75,11.54,61.18,-139.44,54
0.9,13.12,62.75,-412.68,55


Selected t* = 0.9


accuracy_at_t     13.12
coverage          62.75
penalty_mean    -412.68
overconf_rate     55.00
Name: 0.9, dtype: float64

In [None]:
import numpy as np

df = pd.read_csv(CSV_PATH)

binary_acc = np.mean(df["predicted_answer"] == df["answer"])

wrong_rate = 1 - binary_acc

binary_row = {
    "accuracy_at_t": round(100*binary_acc,2),
    "coverage": 1.0,
    "penalty_mean": np.nan,
    "overconf_rate": round(100*wrong_rate,2),
}

abstain_row = {
    "accuracy_at_t": 0.0,
    "coverage": 0.0,
    "penalty_mean": 0.0,
    "overconf_rate": 0.0,
}

print("Binary-grading baseline:")
display(pd.DataFrame([binary_row], index=["Binary"]))
print("Always-abstain baseline:")
display(pd.DataFrame([abstain_row], index=["Abstain"]))


Binary-grading baseline:


Unnamed: 0,accuracy_at_t,coverage,penalty_mean,overconf_rate
Binary,7.25,1.0,,92.75


Always-abstain baseline:


Unnamed: 0,accuracy_at_t,coverage,penalty_mean,overconf_rate
Abstain,0.0,0.0,0.0,0.0


In [None]:
headline_df = pd.DataFrame([
    best_row[["accuracy_at_t", "coverage", "penalty_mean", "overconf_rate"]],
    pd.Series(binary_row),
    pd.Series(abstain_row)
], index=[f"Confidence-aware (t*={t_star})", "Binary grading", "Always abstain"])

print("=== 3x4 Headline Evaluation Table ===")
display(headline_df)
headline_df.to_csv(OUTPUT_PATH / "llama-gpqa-baseline-eval.csv")


=== 3x4 Headline Evaluation Table ===


Unnamed: 0,accuracy_at_t,coverage,penalty_mean,overconf_rate
Confidence-aware (t*=0.9),13.12,62.75,-412.68,55.0
Binary grading,7.25,1.0,,92.75
Always abstain,0.0,0.0,0.0,0.0


## Gemini Evaluation

In [None]:
CSV_PATH = Path("../inference/outputs/gemini-gpqa.csv")
OUTPUT_PATH = Path("outputs")


In [None]:
df = pd.read_csv(CSV_PATH)

df["answer"] = df["answer"].astype(str).str.strip().str.upper()
df["predicted_answer"] = df["predicted_answer"].astype(str).str.strip().str.upper()
df["confidence"] = round(100*pd.to_numeric(df["confidence"], errors="coerce").fillna(0.0),2)
df["threshold"] = pd.to_numeric(df["threshold"], errors="coerce")

print(f"Loaded {len(df)} rows from {CSV_PATH.name}")
print("Thresholds found:", sorted(df["threshold"].unique()))
df.head(3)

✅ Loaded 1020 rows from gemini-gpqa.csv
Thresholds found: [np.float64(0.25), np.float64(0.5), np.float64(0.75), np.float64(0.9)]


Unnamed: 0,id,threshold,question,choices,answer,predicted_answer,confidence
0,203,0.25,Identify the correct sequence of reagents for ...,"['1. NaH; CH3CH2Br 2. H2SO4, HNO3 3. Fe-HCl 4....",D,IDK,100.0
1,266,0.25,There is a C-NOT gate where the condition is t...,['U_{C-NOT}\\left|\\psi\\right\\rangle =\\alph...,A,C,75.0
2,152,0.25,Two stars are being studied. It has been obser...,"['ln(2) = [ (T_1 - T_2) / (T1*T2)]', 'ln(2) = ...",A,IDK,0.0


In [None]:
dfs_by_t = {t: df[df["threshold"] == t].copy() for t in THRESHOLDS}

for t in THRESHOLDS:
    print(f"t={t}: {len(dfs_by_t[t])} rows")


t=0.25: 255 rows
t=0.5: 255 rows
t=0.75: 255 rows
t=0.9: 255 rows


In [None]:
metrics_rows = []

for t in THRESHOLDS:
    df_t = dfs_by_t[t]

    metrics_rows.append({
        "threshold": t,
        "accuracy_at_t": accuracy_at_threshold(df_t, t),
        "coverage": coverage(df_t, t),
        "penalty_mean": penalty_adjusted_mean(df_t, t),
        "overconf_rate": overconfidence_rate(df_t, t),
        "answered_n": int((df_t["confidence"] > t).sum()),
        "total_n": len(df_t)
    })

metrics_df = pd.DataFrame(metrics_rows).sort_values("threshold").reset_index(drop=True)


In [None]:
eval_table = (
    metrics_df[["threshold", "accuracy_at_t", "coverage", "penalty_mean", "overconf_rate"]]
    .set_index("threshold")
    .sort_index()
)

print("4×4 Evaluation Table:")
display(eval_table)

eval_table.to_csv(OUTPUT_PATH / "gemini-gpqa-metric-eval.csv")

print(f"Saved results in: {OUTPUT_PATH}")


4×4 Evaluation Table:


Unnamed: 0_level_0,accuracy_at_t,coverage,penalty_mean,overconf_rate
threshold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.25,20.71,54.9,-12.92,44
0.5,21.53,56.47,-39.32,44
0.75,22.92,56.47,-118.6,44
0.9,20.69,56.86,-376.47,45


Saved results in: outputs


### Baseline Evaluation

In [172]:
coverage_floor = 0.3

eligible = eval_table[eval_table["coverage"] >= coverage_floor]
print("DEBUG — eligible rows:")
display(eligible)

best_row = eligible.loc[eligible["accuracy_at_t"].idxmax()]
t_star = best_row.name

print(f"Selected t* = {t_star}")
display(best_row)

DEBUG — eligible rows:


Unnamed: 0_level_0,accuracy_at_t,coverage,penalty_mean,overconf_rate
threshold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.25,20.71,54.9,-12.92,44
0.5,21.53,56.47,-39.32,44
0.75,22.92,56.47,-118.6,44
0.9,20.69,56.86,-376.47,45


Selected t* = 0.75


accuracy_at_t     22.92
coverage          56.47
penalty_mean    -118.60
overconf_rate     44.00
Name: 0.75, dtype: float64

In [None]:
import numpy as np

df = pd.read_csv(CSV_PATH)

binary_acc = np.mean(df["predicted_answer"] == df["answer"])

wrong_rate = 1 - binary_acc

binary_row = {
    "accuracy_at_t": round(100*binary_acc,2),
    "coverage": 1.0,
    "penalty_mean": np.nan,
    "overconf_rate": round(100*wrong_rate,2),
}

abstain_row = {
    "accuracy_at_t": 0.0,
    "coverage": 0.0,
    "penalty_mean": 0.0,
    "overconf_rate": 0.0,
}

print("Binary-grading baseline:")
display(pd.DataFrame([binary_row], index=["Binary"]))
print("Always-abstain baseline:")
display(pd.DataFrame([abstain_row], index=["Abstain"]))

Binary-grading baseline:


Unnamed: 0,accuracy_at_t,coverage,penalty_mean,overconf_rate
Binary,12.06,1.0,,87.94


Always-abstain baseline:


Unnamed: 0,accuracy_at_t,coverage,penalty_mean,overconf_rate
Abstain,0.0,0.0,0.0,0.0


In [None]:
headline_df = pd.DataFrame([
    best_row[["accuracy_at_t", "coverage", "penalty_mean", "overconf_rate"]],
    pd.Series(binary_row),
    pd.Series(abstain_row)
], index=[f"Confidence-aware (t*={t_star})", "Binary grading", "Always abstain"])

print("=== 3x4 Headline Evaluation Table ===")
display(headline_df)
headline_df.to_csv(OUTPUT_PATH / "gemini-gpqa-baseline-eval.csv")


=== 3x4 Headline Evaluation Table ===


Unnamed: 0,accuracy_at_t,coverage,penalty_mean,overconf_rate
Confidence-aware (t*=0.75),22.92,56.47,-118.6,44.0
Binary grading,12.06,1.0,,87.94
Always abstain,0.0,0.0,0.0,0.0


## Mistral Evaluation

In [None]:
CSV_PATH = Path("../inference/outputs/mistral-gpqa.csv")
OUTPUT_PATH = Path("outputs")

In [None]:
df = pd.read_csv(CSV_PATH)

df["answer"] = df["answer"].astype(str).str.strip().str.upper()
df["predicted_answer"] = df["predicted_answer"].astype(str).str.strip().str.upper()
df["confidence"] = round(100*pd.to_numeric(df["confidence"], errors="coerce").fillna(0.0),2)
df["threshold"] = pd.to_numeric(df["threshold"], errors="coerce")

print(f"Loaded {len(df)} rows from {CSV_PATH.name}")
print("Thresholds found:", sorted(df["threshold"].unique()))
df.head(3)

✅ Loaded 1020 rows from mistral-gpqa.csv
Thresholds found: [np.float64(0.25), np.float64(0.5), np.float64(0.75), np.float64(0.9)]


Unnamed: 0,id,threshold,question,choices,answer,predicted_answer,confidence
0,203,0.25,Identify the correct sequence of reagents for ...,"['1. NaH; CH3CH2Br 2. H2SO4, HNO3 3. Fe-HCl 4....",D,A,33.33
1,266,0.25,There is a C-NOT gate where the condition is t...,['U_{C-NOT}\\left|\\psi\\right\\rangle =\\alph...,A,B,100.0
2,152,0.25,Two stars are being studied. It has been obser...,"['ln(2) = [ (T_1 - T_2) / (T1*T2)]', 'ln(2) = ...",A,A,100.0


In [None]:
dfs_by_t = {t: df[df["threshold"] == t].copy() for t in THRESHOLDS}

for t in THRESHOLDS:
    print(f"t={t}: {len(dfs_by_t[t])} rows")

t=0.25: 255 rows
t=0.5: 255 rows
t=0.75: 255 rows
t=0.9: 255 rows


In [None]:
metrics_rows = []

for t in THRESHOLDS:
    df_t = dfs_by_t[t]

    metrics_rows.append({
        "threshold": t,
        "accuracy_at_t": accuracy_at_threshold(df_t, t),
        "coverage": coverage(df_t, t),
        "penalty_mean": penalty_adjusted_mean(df_t, t),
        "overconf_rate": overconfidence_rate(df_t, t),
        "answered_n": int((df_t["confidence"] > t).sum()),
        "total_n": len(df_t)
    })

metrics_df = pd.DataFrame(metrics_rows).sort_values("threshold").reset_index(drop=True)

In [None]:
eval_table = (
    metrics_df[["threshold", "accuracy_at_t", "coverage", "penalty_mean", "overconf_rate"]]
    .set_index("threshold")
    .sort_index()
)

print("4×4 Evaluation Table:")
display(eval_table)

eval_table.to_csv(OUTPUT_PATH / "mistral-gpqa-metric-eval.csv")

print(f"Saved results in: {OUTPUT_PATH}")

4×4 Evaluation Table:


Unnamed: 0_level_0,accuracy_at_t,coverage,penalty_mean,overconf_rate
threshold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.25,20.78,100.0,-23.08,79
0.5,20.39,100.0,-68.3,80
0.75,18.43,100.0,-215.64,82
0.9,19.61,100.0,-634.21,80


Saved results in: outputs


In [180]:
coverage_floor = 0.3

eligible = eval_table[eval_table["coverage"] >= coverage_floor]
print("DEBUG — eligible rows:")
display(eligible)

best_row = eligible.loc[eligible["accuracy_at_t"].idxmax()]
t_star = best_row.name

print(f"Selected t* = {t_star}")
display(best_row)

DEBUG — eligible rows:


Unnamed: 0_level_0,accuracy_at_t,coverage,penalty_mean,overconf_rate
threshold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.25,20.78,100.0,-23.08,79
0.5,20.39,100.0,-68.3,80
0.75,18.43,100.0,-215.64,82
0.9,19.61,100.0,-634.21,80


Selected t* = 0.25


accuracy_at_t     20.78
coverage         100.00
penalty_mean     -23.08
overconf_rate     79.00
Name: 0.25, dtype: float64

In [None]:
import numpy as np

df = pd.read_csv(CSV_PATH)

binary_acc = np.mean(df["predicted_answer"] == df["answer"])

wrong_rate = 1 - binary_acc

binary_row = {
    "accuracy_at_t": round(100*binary_acc,2),
    "coverage": 1.0,
    "penalty_mean": np.nan,
    "overconf_rate": round(100*wrong_rate,2),
}

abstain_row = {
    "accuracy_at_t": 0.0,
    "coverage": 0.0,
    "penalty_mean": 0.0,
    "overconf_rate": 0.0,
}

print("Binary-grading baseline:")
display(pd.DataFrame([binary_row], index=["Binary"]))
print("Always-abstain baseline:")
display(pd.DataFrame([abstain_row], index=["Abstain"]))


Binary-grading baseline:


Unnamed: 0,accuracy_at_t,coverage,penalty_mean,overconf_rate
Binary,19.8,1.0,,80.2


Always-abstain baseline:


Unnamed: 0,accuracy_at_t,coverage,penalty_mean,overconf_rate
Abstain,0.0,0.0,0.0,0.0


In [None]:
headline_df = pd.DataFrame([
    best_row[["accuracy_at_t", "coverage", "penalty_mean", "overconf_rate"]],
    pd.Series(binary_row),
    pd.Series(abstain_row)
], index=[f"Confidence-aware (t*={t_star})", "Binary grading", "Always abstain"])

print("=== 3x4 Headline Evaluation Table ===")
display(headline_df)
headline_df.to_csv(OUTPUT_PATH / "mistral-gpqa-baseline-eval.csv")


=== 3x4 Headline Evaluation Table ===


Unnamed: 0,accuracy_at_t,coverage,penalty_mean,overconf_rate
Confidence-aware (t*=0.25),20.78,100.0,-23.08,79.0
Binary grading,19.8,1.0,,80.2
Always abstain,0.0,0.0,0.0,0.0
