## Imports, Setup, Helper Functions

In [122]:
import pandas as pd
import numpy as np
from pathlib import Path

# Common thresholds (used for all model–dataset combinations)
THRESHOLDS = [0.25, 0.5, 0.75, 0.9]

In [123]:
# ===== Cell 2 — Metric functions =====

def decide(p: float, t: float) -> bool:
    """
    Answer/abstain rule.

    Returns True when the confidence p is strictly greater than the threshold t
    (the model 'answers'); p == t counts as an abstention.
    """
    return t < p


def penalty_score(pred: str, gold: str, p: float, t: float) -> float:
    """
    Penalty-adjusted score for a single prediction (as defined in the paper/proposal):
      - If not (p > t)              → the model abstains → score = 0.
      - If p > t and pred == gold   → score = +1.
      - If p > t and pred != gold   → score = - (p * t) / (1 - t)
        (a confidence-proportional penalty for being overconfident).

    Abstention is tested as `not (p > t)` instead of `p <= t` so that a NaN
    confidence counts as an abstention (score 0) rather than falling through
    to the answered branches and producing +1 or a NaN penalty. This matches
    the vectorised metrics, where `confidence > t` is False for NaN.

    Assumes 0 <= t < 1; the penalty divides by (1 - t).

    NOTE(review): `pred` is compared literally, so an explicit "IDK"
    prediction with p > t is scored as a wrong answer — confirm this is intended.
    """
    # Written as a negated '>' on purpose: any comparison with NaN is False.
    if not (p > t):
        return 0.0
    if pred == gold:
        return 1.0
    return - (p * t) / (1.0 - t)


#Metric 1
def accuracy_at_threshold(df: pd.DataFrame, t: float) -> float:
    """
    Accuracy@t: share of *answered* questions (confidence > t) that are correct.

      accuracy = (# rows with confidence > t and predicted_answer == answer)
                 / (# rows with confidence > t)

    Returns 0.0 when no row is answered at this threshold.
    """
    is_answered = df["confidence"].gt(t)
    n_answered = int(is_answered.sum())
    if not n_answered:
        return 0.0
    is_correct = df["predicted_answer"].eq(df["answer"])
    n_correct = int((is_correct & is_answered).sum())
    return n_correct / n_answered

#Metric 2
def coverage(df: pd.DataFrame, t: float) -> float:
    """
    Coverage: share of all questions the model *answers* (confidence > t).

      coverage = (# rows with confidence > t) / (# rows)

    Returns 0.0 for an empty frame.
    """
    total = len(df)
    if not total:
        return 0.0
    n_answered = int(df["confidence"].gt(t).sum())
    return n_answered / total

#Metric 3
def penalty_adjusted_mean(df: pd.DataFrame, t: float) -> float:
    """
    Average of penalty_score over every row of df (abstentions included,
    each contributing 0). Returns 0.0 for an empty frame.
    """
    columns = zip(df["predicted_answer"], df["answer"], df["confidence"])
    per_row = [
        penalty_score(pred, gold, float(conf), t)
        for pred, gold, conf in columns
    ]
    if not per_row:
        return 0.0
    return float(np.mean(per_row))

#Metric 4
def overconfidence_rate(df: pd.DataFrame, t: float) -> float:
    """
    Overconfidence rate: share of all questions answered (confidence > t)
    with a prediction that differs from the gold answer.

      rate = (# rows with predicted_answer != answer and confidence > t) / (# rows)

    Returns 0.0 for an empty frame.
    """
    total = len(df)
    if total == 0:
        return 0.0
    is_wrong = df["predicted_answer"].ne(df["answer"])
    is_confident = df["confidence"].gt(t)
    return int((is_wrong & is_confident).sum()) / total


## Qwen Evaluation

In [124]:
CSV_PATH = Path("../inference/outputs/qwen-gpqa.csv")  # predictions evaluated in this section
OUTPUT_PATH = Path("outputs")
# Create the results directory up front: DataFrame.to_csv does not create
# missing directories, so the save cells in this section would otherwise fail
# on a fresh checkout.
OUTPUT_PATH.mkdir(parents=True, exist_ok=True)


In [125]:
# Read the prediction file, then normalise labels and numeric columns.
df = pd.read_csv(CSV_PATH)
df = df.assign(
    # Labels: compared as stripped, upper-cased strings
    answer=df["answer"].astype(str).str.strip().str.upper(),
    predicted_answer=df["predicted_answer"].astype(str).str.strip().str.upper(),
    # Unparseable confidences become 0.0
    confidence=pd.to_numeric(df["confidence"], errors="coerce").fillna(0.0),
    threshold=pd.to_numeric(df["threshold"], errors="coerce"),
)

found_thresholds = sorted(df["threshold"].unique())
print(f"✅ Loaded {len(df)} rows from {CSV_PATH.name}")
print("Thresholds found:", found_thresholds)
df.head(3)

✅ Loaded 1020 rows from qwen-gpqa.csv
Thresholds found: [np.float64(0.25), np.float64(0.5), np.float64(0.75), np.float64(0.9)]


Unnamed: 0,id,threshold,question,choices,answer,predicted_answer,confidence
0,203,0.25,Identify the correct sequence of reagents for ...,"['1. NaH; CH3CH2Br 2. H2SO4, HNO3 3. Fe-HCl 4....",D,IDK,0.666667
1,266,0.25,There is a C-NOT gate where the condition is t...,['U_{C-NOT}\\left|\\psi\\right\\rangle =\\alph...,A,B,1.0
2,152,0.25,Two stars are being studied. It has been obser...,"['ln(2) = [ (T_1 - T_2) / (T1*T2)]', 'ln(2) = ...",A,IDK,1.0


In [126]:
# ===== Cell 4 — One DataFrame per threshold =====
# Each entry is an independent copy of the rows recorded at that threshold.
dfs_by_t = {}
for t in THRESHOLDS:
    dfs_by_t[t] = df.loc[df["threshold"] == t].copy()

for t in THRESHOLDS:
    print(f"t={t}: {dfs_by_t[t].shape[0]} rows")


t=0.25: 255 rows
t=0.5: 255 rows
t=0.75: 255 rows
t=0.9: 255 rows


In [127]:
# ===== Cell 5 — Metrics for every threshold =====
# One record per threshold, computed on that threshold's own subset.
metrics_rows = []

for t in THRESHOLDS:
    df_t = dfs_by_t[t]
    answered_n = int((df_t["confidence"] > t).sum())
    row = dict(
        threshold=t,
        accuracy_at_t=accuracy_at_threshold(df_t, t),
        coverage=coverage(df_t, t),
        penalty_mean=penalty_adjusted_mean(df_t, t),
        overconf_rate=overconfidence_rate(df_t, t),
        answered_n=answered_n,
        total_n=len(df_t),
    )
    metrics_rows.append(row)

metrics_df = pd.DataFrame(metrics_rows)
metrics_df = metrics_df.sort_values("threshold").reset_index(drop=True)
metrics_df


Unnamed: 0,threshold,accuracy_at_t,coverage,penalty_mean,overconf_rate,answered_n,total_n
0,0.25,0.057377,0.956863,-0.228758,0.901961,244,255
1,0.5,0.04386,0.894118,-0.784314,0.854902,228,255
2,0.75,0.046296,0.847059,-2.358824,0.807843,216,255
3,0.9,0.045685,0.772549,-6.6,0.737255,197,255


In [128]:
# ===== Cell 6 — 4×4 evaluation table =====
# Rows: thresholds (ascending). Columns: the four headline metrics.
metric_columns = ["accuracy_at_t", "coverage", "penalty_mean", "overconf_rate"]
eval_table = metrics_df.set_index("threshold")[metric_columns].sort_index()

print("4×4 Evaluation Table:")
display(eval_table)

# Persist the table (a single CSV is written for this model).
eval_csv = OUTPUT_PATH / "qwen-gpqa-metric-eval.csv"
eval_table.to_csv(eval_csv)

print(f"Saved results in: {OUTPUT_PATH}")


4×4 Evaluation Table:


Unnamed: 0_level_0,accuracy_at_t,coverage,penalty_mean,overconf_rate
threshold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.25,0.057377,0.956863,-0.228758,0.901961
0.5,0.04386,0.894118,-0.784314,0.854902
0.75,0.046296,0.847059,-2.358824,0.807843
0.9,0.045685,0.772549,-6.6,0.737255


Saved results in: outputs


### Baseline Evaluation


In [129]:
# Pick the operating threshold t*: among thresholds whose coverage is at least
# `coverage_floor`, take the one with the highest accuracy on answered questions.
coverage_floor = 0.3

eligible = eval_table[eval_table["coverage"] >= coverage_floor]
print("DEBUG — eligible rows:")
display(eligible)

# idxmax() on an empty selection raises an opaque ValueError, so fail early
# with a message that names the cause.
if eligible.empty:
    raise ValueError(
        f"No threshold reaches coverage >= {coverage_floor}; "
        "lower coverage_floor or check the predictions file."
    )

# Ties resolve to the first maximal row, i.e. the lowest threshold, because
# eval_table is sorted by threshold.
best_row = eligible.loc[eligible["accuracy_at_t"].idxmax()]
t_star = best_row.name

print(f"Selected t* = {t_star}")
display(best_row)

DEBUG — eligible rows:


Unnamed: 0_level_0,accuracy_at_t,coverage,penalty_mean,overconf_rate
threshold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.25,0.057377,0.956863,-0.228758,0.901961
0.5,0.04386,0.894118,-0.784314,0.854902
0.75,0.046296,0.847059,-2.358824,0.807843
0.9,0.045685,0.772549,-6.6,0.737255


Selected t* = 0.25


accuracy_at_t    0.057377
coverage         0.956863
penalty_mean    -0.228758
overconf_rate    0.901961
Name: 0.25, dtype: float64

In [130]:
# ===== Baselines — binary grading and always-abstain =====

# Reuse `df` as loaded and normalised earlier in this section. Re-reading the
# raw CSV here would compare un-normalised strings (no strip/upper), so the
# baseline could disagree with the per-threshold metrics, and it would also
# overwrite `df` with the raw data.

# Overall accuracy when every row is graded, regardless of confidence
binary_acc = float((df["predicted_answer"] == df["answer"]).mean())

# Wrong rate (reported as the overconfidence rate of this baseline)
wrong_rate = 1 - binary_acc

# Binary baseline metrics (answers everything)
binary_row = {
    "accuracy_at_t": binary_acc,
    "coverage": 1.0,
    "penalty_mean": np.nan,  # you can fill with projected penalty if desired
    "overconf_rate": wrong_rate,
}

# Always-abstain baseline metrics
abstain_row = {
    "accuracy_at_t": 0.0,
    "coverage": 0.0,
    "penalty_mean": 0.0,
    "overconf_rate": 0.0,
}

print("Binary-grading baseline:")
display(pd.DataFrame([binary_row], index=["Binary"]))
print("Always-abstain baseline:")
display(pd.DataFrame([abstain_row], index=["Abstain"]))


Binary-grading baseline:


Unnamed: 0,accuracy_at_t,coverage,penalty_mean,overconf_rate
Binary,0.051961,1.0,,0.948039


Always-abstain baseline:


Unnamed: 0,accuracy_at_t,coverage,penalty_mean,overconf_rate
Abstain,0.0,0.0,0.0,0.0


In [131]:
# ===== Headline table: confidence-aware vs. the two baselines (3x4) =====
headline_columns = ["accuracy_at_t", "coverage", "penalty_mean", "overconf_rate"]
headline_labels = [
    f"Confidence-aware (t*={t_star})",
    "Binary grading",
    "Always abstain",
]
headline_rows = [
    best_row[headline_columns],
    pd.Series(binary_row),
    pd.Series(abstain_row),
]
headline_df = pd.DataFrame(headline_rows, index=headline_labels)

print("=== 3x4 Headline Evaluation Table ===")
display(headline_df)

# Persist the comparison for this model.
headline_df.to_csv(OUTPUT_PATH / "qwen-gpqa-baseline-eval.csv")


=== 3x4 Headline Evaluation Table ===


Unnamed: 0,accuracy_at_t,coverage,penalty_mean,overconf_rate
Confidence-aware (t*=0.25),0.057377,0.956863,-0.228758,0.901961
Binary grading,0.051961,1.0,,0.948039
Always abstain,0.0,0.0,0.0,0.0


## GPT Evaluation

In [132]:
CSV_PATH = Path("../inference/outputs/gpt-gpqa.csv")  # predictions evaluated in this section
OUTPUT_PATH = Path("outputs")
# Create the results directory up front: DataFrame.to_csv does not create
# missing directories, so the save cells in this section would otherwise fail
# on a fresh checkout.
OUTPUT_PATH.mkdir(parents=True, exist_ok=True)


In [133]:
# Read the prediction file, then normalise labels and numeric columns.
df = pd.read_csv(CSV_PATH)
df = df.assign(
    # Labels: compared as stripped, upper-cased strings
    answer=df["answer"].astype(str).str.strip().str.upper(),
    predicted_answer=df["predicted_answer"].astype(str).str.strip().str.upper(),
    # Unparseable confidences become 0.0
    confidence=pd.to_numeric(df["confidence"], errors="coerce").fillna(0.0),
    threshold=pd.to_numeric(df["threshold"], errors="coerce"),
)

found_thresholds = sorted(df["threshold"].unique())
print(f"✅ Loaded {len(df)} rows from {CSV_PATH.name}")
print("Thresholds found:", found_thresholds)
df.head(3)

✅ Loaded 1020 rows from gpt-gpqa.csv
Thresholds found: [np.float64(0.25), np.float64(0.5), np.float64(0.75), np.float64(0.9)]


Unnamed: 0,id,threshold,question,choices,answer,predicted_answer,confidence
0,203,0.25,Identify the correct sequence of reagents for ...,"['1. NaH; CH3CH2Br 2. H2SO4, HNO3 3. Fe-HCl 4....",D,B,0.666667
1,266,0.25,There is a C-NOT gate where the condition is t...,['U_{C-NOT}\\left|\\psi\\right\\rangle =\\alph...,A,D,0.666667
2,152,0.25,Two stars are being studied. It has been obser...,"['ln(2) = [ (T_1 - T_2) / (T1*T2)]', 'ln(2) = ...",A,A,1.0


In [134]:
# ===== Cell 4 — One DataFrame per threshold =====
# Each entry is an independent copy of the rows recorded at that threshold.
dfs_by_t = {}
for t in THRESHOLDS:
    dfs_by_t[t] = df.loc[df["threshold"] == t].copy()

for t in THRESHOLDS:
    print(f"t={t}: {dfs_by_t[t].shape[0]} rows")


t=0.25: 255 rows
t=0.5: 255 rows
t=0.75: 255 rows
t=0.9: 255 rows


In [135]:
# ===== Cell 5 — Metrics for every threshold =====
# One record per threshold, computed on that threshold's own subset.
metrics_rows = []

for t in THRESHOLDS:
    df_t = dfs_by_t[t]
    answered_n = int((df_t["confidence"] > t).sum())
    row = dict(
        threshold=t,
        accuracy_at_t=accuracy_at_threshold(df_t, t),
        coverage=coverage(df_t, t),
        penalty_mean=penalty_adjusted_mean(df_t, t),
        overconf_rate=overconfidence_rate(df_t, t),
        answered_n=answered_n,
        total_n=len(df_t),
    )
    metrics_rows.append(row)

metrics_df = pd.DataFrame(metrics_rows)
metrics_df = metrics_df.sort_values("threshold").reset_index(drop=True)


In [136]:
# ===== Cell 6 — 4×4 evaluation table =====
# Rows: thresholds (ascending). Columns: the four headline metrics.
metric_columns = ["accuracy_at_t", "coverage", "penalty_mean", "overconf_rate"]
eval_table = metrics_df.set_index("threshold")[metric_columns].sort_index()

print("4×4 Evaluation Table:")
display(eval_table)

# Persist the table (a single CSV is written for this model).
eval_csv = OUTPUT_PATH / "gpt-gpqa-metric-eval.csv"
eval_table.to_csv(eval_csv)

print(f"Saved results in: {OUTPUT_PATH}")


4×4 Evaluation Table:


Unnamed: 0_level_0,accuracy_at_t,coverage,penalty_mean,overconf_rate
threshold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.25,0.334646,0.996078,0.147756,0.662745
0.5,0.348624,0.854902,-0.2,0.556863
0.75,0.338798,0.717647,-1.111765,0.47451
0.9,0.40146,0.537255,-2.678431,0.321569


Saved results in: outputs


### Baseline Evaluation

In [137]:
# Pick the operating threshold t*: among thresholds whose coverage is at least
# `coverage_floor`, take the one with the highest accuracy on answered questions.
coverage_floor = 0.3

eligible = eval_table[eval_table["coverage"] >= coverage_floor]
print("DEBUG — eligible rows:")
display(eligible)

# idxmax() on an empty selection raises an opaque ValueError, so fail early
# with a message that names the cause.
if eligible.empty:
    raise ValueError(
        f"No threshold reaches coverage >= {coverage_floor}; "
        "lower coverage_floor or check the predictions file."
    )

# Ties resolve to the first maximal row, i.e. the lowest threshold, because
# eval_table is sorted by threshold.
best_row = eligible.loc[eligible["accuracy_at_t"].idxmax()]
t_star = best_row.name

print(f"Selected t* = {t_star}")
display(best_row)

DEBUG — eligible rows:


Unnamed: 0_level_0,accuracy_at_t,coverage,penalty_mean,overconf_rate
threshold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.25,0.334646,0.996078,0.147756,0.662745
0.5,0.348624,0.854902,-0.2,0.556863
0.75,0.338798,0.717647,-1.111765,0.47451
0.9,0.40146,0.537255,-2.678431,0.321569


Selected t* = 0.9


accuracy_at_t    0.401460
coverage         0.537255
penalty_mean    -2.678431
overconf_rate    0.321569
Name: 0.9, dtype: float64

In [138]:
# ===== Baselines — binary grading and always-abstain =====

# Reuse `df` as loaded and normalised earlier in this section. Re-reading the
# raw CSV here would compare un-normalised strings (no strip/upper), so the
# baseline could disagree with the per-threshold metrics, and it would also
# overwrite `df` with the raw data.

# Overall accuracy when every row is graded, regardless of confidence
binary_acc = float((df["predicted_answer"] == df["answer"]).mean())

# Wrong rate (reported as the overconfidence rate of this baseline)
wrong_rate = 1 - binary_acc

# Binary baseline metrics (answers everything)
binary_row = {
    "accuracy_at_t": binary_acc,
    "coverage": 1.0,
    "penalty_mean": np.nan,  # you can fill with projected penalty if desired
    "overconf_rate": wrong_rate,
}

# Always-abstain baseline metrics
abstain_row = {
    "accuracy_at_t": 0.0,
    "coverage": 0.0,
    "penalty_mean": 0.0,
    "overconf_rate": 0.0,
}

print("Binary-grading baseline:")
display(pd.DataFrame([binary_row], index=["Binary"]))
print("Always-abstain baseline:")
display(pd.DataFrame([abstain_row], index=["Abstain"]))


Binary-grading baseline:


Unnamed: 0,accuracy_at_t,coverage,penalty_mean,overconf_rate
Binary,0.319608,1.0,,0.680392


Always-abstain baseline:


Unnamed: 0,accuracy_at_t,coverage,penalty_mean,overconf_rate
Abstain,0.0,0.0,0.0,0.0


In [139]:
# ===== Headline table: confidence-aware vs. the two baselines (3x4) =====
headline_columns = ["accuracy_at_t", "coverage", "penalty_mean", "overconf_rate"]
headline_labels = [
    f"Confidence-aware (t*={t_star})",
    "Binary grading",
    "Always abstain",
]
headline_rows = [
    best_row[headline_columns],
    pd.Series(binary_row),
    pd.Series(abstain_row),
]
headline_df = pd.DataFrame(headline_rows, index=headline_labels)

print("=== 3x4 Headline Evaluation Table ===")
display(headline_df)

# Persist the comparison for this model.
headline_df.to_csv(OUTPUT_PATH / "gpt-gpqa-baseline-eval.csv")


=== 3x4 Headline Evaluation Table ===


Unnamed: 0,accuracy_at_t,coverage,penalty_mean,overconf_rate
Confidence-aware (t*=0.9),0.40146,0.537255,-2.678431,0.321569
Binary grading,0.319608,1.0,,0.680392
Always abstain,0.0,0.0,0.0,0.0


## Claude Evaluation


In [140]:
CSV_PATH = Path("../inference/outputs/claude-gpqa.csv")  # predictions evaluated in this section
OUTPUT_PATH = Path("outputs")
# Create the results directory up front: DataFrame.to_csv does not create
# missing directories, so the save cells in this section would otherwise fail
# on a fresh checkout.
OUTPUT_PATH.mkdir(parents=True, exist_ok=True)


In [141]:
# Read the prediction file, then normalise labels and numeric columns.
df = pd.read_csv(CSV_PATH)
df = df.assign(
    # Labels: compared as stripped, upper-cased strings
    answer=df["answer"].astype(str).str.strip().str.upper(),
    predicted_answer=df["predicted_answer"].astype(str).str.strip().str.upper(),
    # Unparseable confidences become 0.0
    confidence=pd.to_numeric(df["confidence"], errors="coerce").fillna(0.0),
    threshold=pd.to_numeric(df["threshold"], errors="coerce"),
)

found_thresholds = sorted(df["threshold"].unique())
print(f"✅ Loaded {len(df)} rows from {CSV_PATH.name}")
print("Thresholds found:", found_thresholds)
df.head(3)

✅ Loaded 1020 rows from claude-gpqa.csv
Thresholds found: [np.float64(0.25), np.float64(0.5), np.float64(0.75), np.float64(0.9)]


Unnamed: 0,id,threshold,question,choices,answer,predicted_answer,confidence
0,203,0.25,Identify the correct sequence of reagents for ...,"['1. NaH; CH3CH2Br 2. H2SO4, HNO3 3. Fe-HCl 4....",D,I,1.0
1,266,0.25,There is a C-NOT gate where the condition is t...,['U_{C-NOT}\\left|\\psi\\right\\rangle =\\alph...,A,I,1.0
2,152,0.25,Two stars are being studied. It has been obser...,"['ln(2) = [ (T_1 - T_2) / (T1*T2)]', 'ln(2) = ...",A,I,1.0


In [142]:
# ===== Cell 4 — One DataFrame per threshold =====
# Each entry is an independent copy of the rows recorded at that threshold.
dfs_by_t = {}
for t in THRESHOLDS:
    dfs_by_t[t] = df.loc[df["threshold"] == t].copy()

for t in THRESHOLDS:
    print(f"t={t}: {dfs_by_t[t].shape[0]} rows")


t=0.25: 255 rows
t=0.5: 255 rows
t=0.75: 255 rows
t=0.9: 255 rows


In [143]:
# ===== Cell 5 — Metrics for every threshold =====
# One record per threshold, computed on that threshold's own subset.
metrics_rows = []

for t in THRESHOLDS:
    df_t = dfs_by_t[t]
    answered_n = int((df_t["confidence"] > t).sum())
    row = dict(
        threshold=t,
        accuracy_at_t=accuracy_at_threshold(df_t, t),
        coverage=coverage(df_t, t),
        penalty_mean=penalty_adjusted_mean(df_t, t),
        overconf_rate=overconfidence_rate(df_t, t),
        answered_n=answered_n,
        total_n=len(df_t),
    )
    metrics_rows.append(row)

metrics_df = pd.DataFrame(metrics_rows)
metrics_df = metrics_df.sort_values("threshold").reset_index(drop=True)


In [144]:
# ===== Cell 6 — 4×4 evaluation table =====
# Rows: thresholds (ascending). Columns: the four headline metrics.
metric_columns = ["accuracy_at_t", "coverage", "penalty_mean", "overconf_rate"]
eval_table = metrics_df.set_index("threshold")[metric_columns].sort_index()

print("4×4 Evaluation Table:")
display(eval_table)

# Persist the table (a single CSV is written for this model).
eval_csv = OUTPUT_PATH / "claude-gpqa-metric-eval.csv"
eval_table.to_csv(eval_csv)

print(f"Saved results in: {OUTPUT_PATH}")


4×4 Evaluation Table:


Unnamed: 0_level_0,accuracy_at_t,coverage,penalty_mean,overconf_rate
threshold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.25,0.034335,0.913725,-0.259804,0.882353
0.5,0.025751,0.913725,-0.855817,0.890196
0.75,0.029046,0.945098,-2.721176,0.917647
0.9,0.021097,0.929412,-8.168627,0.909804


Saved results in: outputs


### Baseline Evaluation

In [145]:
# Pick the operating threshold t*: among thresholds whose coverage is at least
# `coverage_floor`, take the one with the highest accuracy on answered questions.
coverage_floor = 0.3

eligible = eval_table[eval_table["coverage"] >= coverage_floor]
print("DEBUG — eligible rows:")
display(eligible)

# idxmax() on an empty selection raises an opaque ValueError, so fail early
# with a message that names the cause.
if eligible.empty:
    raise ValueError(
        f"No threshold reaches coverage >= {coverage_floor}; "
        "lower coverage_floor or check the predictions file."
    )

# Ties resolve to the first maximal row, i.e. the lowest threshold, because
# eval_table is sorted by threshold.
best_row = eligible.loc[eligible["accuracy_at_t"].idxmax()]
t_star = best_row.name

print(f"Selected t* = {t_star}")
display(best_row)

DEBUG — eligible rows:


Unnamed: 0_level_0,accuracy_at_t,coverage,penalty_mean,overconf_rate
threshold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.25,0.034335,0.913725,-0.259804,0.882353
0.5,0.025751,0.913725,-0.855817,0.890196
0.75,0.029046,0.945098,-2.721176,0.917647
0.9,0.021097,0.929412,-8.168627,0.909804


Selected t* = 0.25


accuracy_at_t    0.034335
coverage         0.913725
penalty_mean    -0.259804
overconf_rate    0.882353
Name: 0.25, dtype: float64

In [146]:
# ===== Baselines — binary grading and always-abstain =====

# Reuse `df` as loaded and normalised earlier in this section. Re-reading the
# raw CSV here would compare un-normalised strings (no strip/upper), so the
# baseline could disagree with the per-threshold metrics, and it would also
# overwrite `df` with the raw data.

# Overall accuracy when every row is graded, regardless of confidence
binary_acc = float((df["predicted_answer"] == df["answer"]).mean())

# Wrong rate (reported as the overconfidence rate of this baseline)
wrong_rate = 1 - binary_acc

# Binary baseline metrics (answers everything)
binary_row = {
    "accuracy_at_t": binary_acc,
    "coverage": 1.0,
    "penalty_mean": np.nan,  # you can fill with projected penalty if desired
    "overconf_rate": wrong_rate,
}

# Always-abstain baseline metrics
abstain_row = {
    "accuracy_at_t": 0.0,
    "coverage": 0.0,
    "penalty_mean": 0.0,
    "overconf_rate": 0.0,
}

print("Binary-grading baseline:")
display(pd.DataFrame([binary_row], index=["Binary"]))
print("Always-abstain baseline:")
display(pd.DataFrame([abstain_row], index=["Abstain"]))


Binary-grading baseline:


Unnamed: 0,accuracy_at_t,coverage,penalty_mean,overconf_rate
Binary,0.027451,1.0,,0.972549


Always-abstain baseline:


Unnamed: 0,accuracy_at_t,coverage,penalty_mean,overconf_rate
Abstain,0.0,0.0,0.0,0.0


In [147]:
# ===== Headline table: confidence-aware vs. the two baselines (3x4) =====
headline_columns = ["accuracy_at_t", "coverage", "penalty_mean", "overconf_rate"]
headline_labels = [
    f"Confidence-aware (t*={t_star})",
    "Binary grading",
    "Always abstain",
]
headline_rows = [
    best_row[headline_columns],
    pd.Series(binary_row),
    pd.Series(abstain_row),
]
headline_df = pd.DataFrame(headline_rows, index=headline_labels)

print("=== 3x4 Headline Evaluation Table ===")
display(headline_df)

# Persist the comparison for this model.
headline_df.to_csv(OUTPUT_PATH / "claude-gpqa-baseline-eval.csv")


=== 3x4 Headline Evaluation Table ===


Unnamed: 0,accuracy_at_t,coverage,penalty_mean,overconf_rate
Confidence-aware (t*=0.25),0.034335,0.913725,-0.259804,0.882353
Binary grading,0.027451,1.0,,0.972549
Always abstain,0.0,0.0,0.0,0.0


## Llama Evaluation

In [148]:
CSV_PATH = Path("../inference/outputs/llama-gpqa.csv")  # predictions evaluated in this section
OUTPUT_PATH = Path("outputs")
# Create the results directory up front: DataFrame.to_csv does not create
# missing directories, so the save cells in this section would otherwise fail
# on a fresh checkout.
OUTPUT_PATH.mkdir(parents=True, exist_ok=True)


In [149]:
# Read the prediction file, then normalise labels and numeric columns.
df = pd.read_csv(CSV_PATH)
df = df.assign(
    # Labels: compared as stripped, upper-cased strings
    answer=df["answer"].astype(str).str.strip().str.upper(),
    predicted_answer=df["predicted_answer"].astype(str).str.strip().str.upper(),
    # Unparseable confidences become 0.0
    confidence=pd.to_numeric(df["confidence"], errors="coerce").fillna(0.0),
    threshold=pd.to_numeric(df["threshold"], errors="coerce"),
)

found_thresholds = sorted(df["threshold"].unique())
print(f"✅ Loaded {len(df)} rows from {CSV_PATH.name}")
print("Thresholds found:", found_thresholds)
df.head(3)

✅ Loaded 1020 rows from llama-gpqa.csv
Thresholds found: [np.float64(0.25), np.float64(0.5), np.float64(0.75), np.float64(0.9)]


Unnamed: 0,id,threshold,question,choices,answer,predicted_answer,confidence
0,203,0.25,Identify the correct sequence of reagents for ...,"['1. NaH; CH3CH2Br 2. H2SO4, HNO3 3. Fe-HCl 4....",D,T,1.0
1,266,0.25,There is a C-NOT gate where the condition is t...,['U_{C-NOT}\\left|\\psi\\right\\rangle =\\alph...,A,IDK,0.0
2,152,0.25,Two stars are being studied. It has been obser...,"['ln(2) = [ (T_1 - T_2) / (T1*T2)]', 'ln(2) = ...",A,A,0.666667


In [150]:
# ===== Cell 4 — One DataFrame per threshold =====
# Each entry is an independent copy of the rows recorded at that threshold.
dfs_by_t = {}
for t in THRESHOLDS:
    dfs_by_t[t] = df.loc[df["threshold"] == t].copy()

for t in THRESHOLDS:
    print(f"t={t}: {dfs_by_t[t].shape[0]} rows")


t=0.25: 255 rows
t=0.5: 255 rows
t=0.75: 255 rows
t=0.9: 255 rows


In [151]:
# ===== Cell 5 — Metrics for every threshold =====
# One record per threshold, computed on that threshold's own subset.
metrics_rows = []

for t in THRESHOLDS:
    df_t = dfs_by_t[t]
    answered_n = int((df_t["confidence"] > t).sum())
    row = dict(
        threshold=t,
        accuracy_at_t=accuracy_at_threshold(df_t, t),
        coverage=coverage(df_t, t),
        penalty_mean=penalty_adjusted_mean(df_t, t),
        overconf_rate=overconfidence_rate(df_t, t),
        answered_n=answered_n,
        total_n=len(df_t),
    )
    metrics_rows.append(row)

metrics_df = pd.DataFrame(metrics_rows)
metrics_df = metrics_df.sort_values("threshold").reset_index(drop=True)


In [152]:
# ===== Cell 6 — 4×4 evaluation table =====
# Rows: thresholds (ascending). Columns: the four headline metrics.
metric_columns = ["accuracy_at_t", "coverage", "penalty_mean", "overconf_rate"]
eval_table = metrics_df.set_index("threshold")[metric_columns].sort_index()

print("4×4 Evaluation Table:")
display(eval_table)

# Persist the table (a single CSV is written for this model).
eval_csv = OUTPUT_PATH / "llama-gpqa-metric-eval.csv"
eval_table.to_csv(eval_csv)

print(f"Saved results in: {OUTPUT_PATH}")


4×4 Evaluation Table:


Unnamed: 0_level_0,accuracy_at_t,coverage,penalty_mean,overconf_rate
threshold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.25,0.10828,0.615686,-0.092048,0.54902
0.5,0.106061,0.517647,-0.394771,0.462745
0.75,0.131579,0.447059,-1.105882,0.388235
0.9,0.151786,0.439216,-3.286275,0.372549


Saved results in: outputs


### Baseline Evaluation

In [153]:
# Pick the operating threshold t*: among thresholds whose coverage is at least
# `coverage_floor`, take the one with the highest accuracy on answered questions.
coverage_floor = 0.3

eligible = eval_table[eval_table["coverage"] >= coverage_floor]
print("DEBUG — eligible rows:")
display(eligible)

# idxmax() on an empty selection raises an opaque ValueError, so fail early
# with a message that names the cause.
if eligible.empty:
    raise ValueError(
        f"No threshold reaches coverage >= {coverage_floor}; "
        "lower coverage_floor or check the predictions file."
    )

# Ties resolve to the first maximal row, i.e. the lowest threshold, because
# eval_table is sorted by threshold.
best_row = eligible.loc[eligible["accuracy_at_t"].idxmax()]
t_star = best_row.name

print(f"Selected t* = {t_star}")
display(best_row)

DEBUG — eligible rows:


Unnamed: 0_level_0,accuracy_at_t,coverage,penalty_mean,overconf_rate
threshold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.25,0.10828,0.615686,-0.092048,0.54902
0.5,0.106061,0.517647,-0.394771,0.462745
0.75,0.131579,0.447059,-1.105882,0.388235
0.9,0.151786,0.439216,-3.286275,0.372549


Selected t* = 0.9


accuracy_at_t    0.151786
coverage         0.439216
penalty_mean    -3.286275
overconf_rate    0.372549
Name: 0.9, dtype: float64

In [154]:
# ===== Baselines — binary grading and always-abstain =====

# Reuse `df` as loaded and normalised earlier in this section. Re-reading the
# raw CSV here would compare un-normalised strings (no strip/upper), so the
# baseline could disagree with the per-threshold metrics, and it would also
# overwrite `df` with the raw data.

# Overall accuracy when every row is graded, regardless of confidence
binary_acc = float((df["predicted_answer"] == df["answer"]).mean())

# Wrong rate (reported as the overconfidence rate of this baseline)
wrong_rate = 1 - binary_acc

# Binary baseline metrics (answers everything)
binary_row = {
    "accuracy_at_t": binary_acc,
    "coverage": 1.0,
    "penalty_mean": np.nan,  # you can fill with projected penalty if desired
    "overconf_rate": wrong_rate,
}

# Always-abstain baseline metrics
abstain_row = {
    "accuracy_at_t": 0.0,
    "coverage": 0.0,
    "penalty_mean": 0.0,
    "overconf_rate": 0.0,
}

print("Binary-grading baseline:")
display(pd.DataFrame([binary_row], index=["Binary"]))
print("Always-abstain baseline:")
display(pd.DataFrame([abstain_row], index=["Abstain"]))


Binary-grading baseline:


Unnamed: 0,accuracy_at_t,coverage,penalty_mean,overconf_rate
Binary,0.072549,1.0,,0.927451


Always-abstain baseline:


Unnamed: 0,accuracy_at_t,coverage,penalty_mean,overconf_rate
Abstain,0.0,0.0,0.0,0.0


In [155]:
# ===== Headline table: confidence-aware vs. the two baselines (3x4) =====
headline_columns = ["accuracy_at_t", "coverage", "penalty_mean", "overconf_rate"]
headline_labels = [
    f"Confidence-aware (t*={t_star})",
    "Binary grading",
    "Always abstain",
]
headline_rows = [
    best_row[headline_columns],
    pd.Series(binary_row),
    pd.Series(abstain_row),
]
headline_df = pd.DataFrame(headline_rows, index=headline_labels)

print("=== 3x4 Headline Evaluation Table ===")
display(headline_df)

# Persist the comparison for this model.
headline_df.to_csv(OUTPUT_PATH / "llama-gpqa-baseline-eval.csv")


=== 3x4 Headline Evaluation Table ===


Unnamed: 0,accuracy_at_t,coverage,penalty_mean,overconf_rate
Confidence-aware (t*=0.9),0.151786,0.439216,-3.286275,0.372549
Binary grading,0.072549,1.0,,0.927451
Always abstain,0.0,0.0,0.0,0.0


## Gemini Evaluation

In [156]:
CSV_PATH = Path("../inference/outputs/gemini-gpqa.csv")  # predictions evaluated in this section
OUTPUT_PATH = Path("outputs")
# Create the results directory up front: DataFrame.to_csv does not create
# missing directories, so the save cells in this section would otherwise fail
# on a fresh checkout.
OUTPUT_PATH.mkdir(parents=True, exist_ok=True)


In [157]:
# Read the prediction file, then normalise labels and numeric columns.
df = pd.read_csv(CSV_PATH)
df = df.assign(
    # Labels: compared as stripped, upper-cased strings
    answer=df["answer"].astype(str).str.strip().str.upper(),
    predicted_answer=df["predicted_answer"].astype(str).str.strip().str.upper(),
    # Unparseable confidences become 0.0
    confidence=pd.to_numeric(df["confidence"], errors="coerce").fillna(0.0),
    threshold=pd.to_numeric(df["threshold"], errors="coerce"),
)

found_thresholds = sorted(df["threshold"].unique())
print(f"✅ Loaded {len(df)} rows from {CSV_PATH.name}")
print("Thresholds found:", found_thresholds)
df.head(3)

✅ Loaded 1020 rows from gemini-gpqa.csv
Thresholds found: [np.float64(0.25), np.float64(0.5), np.float64(0.75), np.float64(0.9)]


Unnamed: 0,id,threshold,question,choices,answer,predicted_answer,confidence
0,203,0.25,Identify the correct sequence of reagents for ...,"['1. NaH; CH3CH2Br 2. H2SO4, HNO3 3. Fe-HCl 4....",D,IDK,1.0
1,266,0.25,There is a C-NOT gate where the condition is t...,['U_{C-NOT}\\left|\\psi\\right\\rangle =\\alph...,A,C,0.75
2,152,0.25,Two stars are being studied. It has been obser...,"['ln(2) = [ (T_1 - T_2) / (T1*T2)]', 'ln(2) = ...",A,IDK,0.0


In [158]:
# ===== Cell 4 — One DataFrame per threshold =====
# Each entry is an independent copy of the rows recorded at that threshold.
dfs_by_t = {}
for t in THRESHOLDS:
    dfs_by_t[t] = df.loc[df["threshold"] == t].copy()

for t in THRESHOLDS:
    print(f"t={t}: {dfs_by_t[t].shape[0]} rows")


t=0.25: 255 rows
t=0.5: 255 rows
t=0.75: 255 rows
t=0.9: 255 rows


In [159]:
# ===== Cell 5 — Metrics for every threshold =====
# One record per threshold, computed on that threshold's own subset.
metrics_rows = []

for t in THRESHOLDS:
    df_t = dfs_by_t[t]
    answered_n = int((df_t["confidence"] > t).sum())
    row = dict(
        threshold=t,
        accuracy_at_t=accuracy_at_threshold(df_t, t),
        coverage=coverage(df_t, t),
        penalty_mean=penalty_adjusted_mean(df_t, t),
        overconf_rate=overconfidence_rate(df_t, t),
        answered_n=answered_n,
        total_n=len(df_t),
    )
    metrics_rows.append(row)

metrics_df = pd.DataFrame(metrics_rows)
metrics_df = metrics_df.sort_values("threshold").reset_index(drop=True)


In [160]:
# ===== Cell 6 — 4×4 evaluation table =====
# Rows: thresholds (ascending). Columns: the four headline metrics.
metric_columns = ["accuracy_at_t", "coverage", "penalty_mean", "overconf_rate"]
eval_table = metrics_df.set_index("threshold")[metric_columns].sort_index()

print("4×4 Evaluation Table:")
display(eval_table)

# Persist the table (a single CSV is written for this model).
eval_csv = OUTPUT_PATH / "gemini-gpqa-metric-eval.csv"
eval_table.to_csv(eval_csv)

print(f"Saved results in: {OUTPUT_PATH}")


4×4 Evaluation Table:


Unnamed: 0_level_0,accuracy_at_t,coverage,penalty_mean,overconf_rate
threshold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.25,0.207143,0.54902,-0.016645,0.435294
0.5,0.234375,0.501961,-0.248627,0.384314
0.75,0.225,0.470588,-0.962353,0.364706
0.9,0.189655,0.454902,-3.231373,0.368627


Saved results in: outputs


### Baseline Evaluation

In [161]:
# Pick the operating threshold t*: among thresholds whose coverage is at least
# `coverage_floor`, take the one with the highest accuracy on answered questions.
coverage_floor = 0.3

eligible = eval_table[eval_table["coverage"] >= coverage_floor]
print("DEBUG — eligible rows:")
display(eligible)

# idxmax() on an empty selection raises an opaque ValueError, so fail early
# with a message that names the cause.
if eligible.empty:
    raise ValueError(
        f"No threshold reaches coverage >= {coverage_floor}; "
        "lower coverage_floor or check the predictions file."
    )

# Ties resolve to the first maximal row, i.e. the lowest threshold, because
# eval_table is sorted by threshold.
best_row = eligible.loc[eligible["accuracy_at_t"].idxmax()]
t_star = best_row.name

print(f"Selected t* = {t_star}")
display(best_row)

DEBUG — eligible rows:


Unnamed: 0_level_0,accuracy_at_t,coverage,penalty_mean,overconf_rate
threshold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.25,0.207143,0.54902,-0.016645,0.435294
0.5,0.234375,0.501961,-0.248627,0.384314
0.75,0.225,0.470588,-0.962353,0.364706
0.9,0.189655,0.454902,-3.231373,0.368627


Selected t* = 0.5


accuracy_at_t    0.234375
coverage         0.501961
penalty_mean    -0.248627
overconf_rate    0.384314
Name: 0.5, dtype: float64

In [162]:
# ===== Baselines — binary grading and always-abstain =====

# Reuse `df` as loaded and normalised earlier in this section. Re-reading the
# raw CSV here would compare un-normalised strings (no strip/upper), so the
# baseline could disagree with the per-threshold metrics, and it would also
# overwrite `df` with the raw data.

# Overall accuracy when every row is graded, regardless of confidence
binary_acc = float((df["predicted_answer"] == df["answer"]).mean())

# Wrong rate (reported as the overconfidence rate of this baseline)
wrong_rate = 1 - binary_acc

# Binary baseline metrics (answers everything)
binary_row = {
    "accuracy_at_t": binary_acc,
    "coverage": 1.0,
    "penalty_mean": np.nan,  # you can fill with projected penalty if desired
    "overconf_rate": wrong_rate,
}

# Always-abstain baseline metrics
abstain_row = {
    "accuracy_at_t": 0.0,
    "coverage": 0.0,
    "penalty_mean": 0.0,
    "overconf_rate": 0.0,
}

print("Binary-grading baseline:")
display(pd.DataFrame([binary_row], index=["Binary"]))
print("Always-abstain baseline:")
display(pd.DataFrame([abstain_row], index=["Abstain"]))


Binary-grading baseline:


Unnamed: 0,accuracy_at_t,coverage,penalty_mean,overconf_rate
Binary,0.120588,1.0,,0.879412


Always-abstain baseline:


Unnamed: 0,accuracy_at_t,coverage,penalty_mean,overconf_rate
Abstain,0.0,0.0,0.0,0.0


In [163]:
# ===== Headline table: confidence-aware vs. the two baselines (3x4) =====
headline_columns = ["accuracy_at_t", "coverage", "penalty_mean", "overconf_rate"]
headline_labels = [
    f"Confidence-aware (t*={t_star})",
    "Binary grading",
    "Always abstain",
]
headline_rows = [
    best_row[headline_columns],
    pd.Series(binary_row),
    pd.Series(abstain_row),
]
headline_df = pd.DataFrame(headline_rows, index=headline_labels)

print("=== 3x4 Headline Evaluation Table ===")
display(headline_df)

# Persist the comparison for this model.
headline_df.to_csv(OUTPUT_PATH / "gemini-gpqa-baseline-eval.csv")


=== 3x4 Headline Evaluation Table ===


Unnamed: 0,accuracy_at_t,coverage,penalty_mean,overconf_rate
Confidence-aware (t*=0.5),0.234375,0.501961,-0.248627,0.384314
Binary grading,0.120588,1.0,,0.879412
Always abstain,0.0,0.0,0.0,0.0


## Mistral Evaluation