## Imports, Setup, Helper Functions

In [194]:
import pandas as pd
import numpy as np
from pathlib import Path

# Common thresholds (used for all model–dataset combinations)
THRESHOLDS = [0.25, 0.5, 0.75, 0.9]

In [195]:
# ===== Cell 2 — Metric functions =====

def decide(p: float, t: float) -> bool:
    """
    Answer/abstain rule shared by all metrics.

    Args:
        p: the model's confidence for a question.
        t: the confidence threshold.

    Returns:
        True when the confidence is strictly above the threshold (the model
        answers); False otherwise, including when p equals t (abstain).
    """
    exceeds_threshold = p > t
    return exceeds_threshold


def penalty_score(pred: str, gold: str, p: float, t: float) -> float:
    """
    Score a single question under the penalty-adjusted scheme.

    Args:
        pred: predicted answer label.
        gold: reference answer label.
        p: model confidence for this prediction.
        t: confidence threshold.

    Returns:
        0.0 when p <= t (treated as an abstention),
        1.0 when the model answers (p > t) and pred equals gold,
        -(p * t) / (1 - t) when the model answers and is wrong, so the
        penalty grows with both the confidence and the threshold.
    """
    # Abstention: no reward and no penalty.
    if p <= t:
        return 0.0
    # Answered: full credit if right, confidence-scaled penalty if wrong.
    return 1.0 if pred == gold else -(p * t) / (1.0 - t)


#Metric 1
def accuracy_at_threshold(df: pd.DataFrame, t: float) -> float:
    """
    Accuracy@t: share of answered questions that are correct.

    A row counts as answered when its "confidence" is strictly greater than t.
    The result is (# answered and predicted_answer == answer) / (# answered).

    Args:
        df: frame with "confidence", "predicted_answer" and "answer" columns.
        t: confidence threshold.

    Returns:
        The accuracy over answered rows, or 0.0 when no row is answered.
    """
    is_answered = df["confidence"].gt(t)
    n_answered = is_answered.sum()
    # Nothing answered: report 0.0 instead of dividing by zero.
    if n_answered == 0:
        return 0.0
    is_correct = df["predicted_answer"].eq(df["answer"])
    n_correct_answered = (is_correct & is_answered).sum()
    return float(n_correct_answered / n_answered)

#Metric 2
def coverage(df: pd.DataFrame, t: float) -> float:
    """
    Coverage: share of all questions the model answers.

    Args:
        df: frame with a "confidence" column.
        t: confidence threshold; a row is answered when confidence > t.

    Returns:
        (# rows with confidence > t) / (# rows), or 0.0 for an empty frame.
    """
    total = len(df)
    # Empty input: defined as zero coverage (also avoids dividing by zero).
    if total == 0:
        return 0.0
    answered_n = (df["confidence"] > t).sum()
    return float(answered_n / total)

#Metric 3
def penalty_adjusted_mean(df: pd.DataFrame, t: float) -> float:
    """
    Average penalty-adjusted score over every row of df.

    Each row is scored with penalty_score using its predicted_answer, answer
    and confidence; abstentions score 0 and still count in the denominator.

    Args:
        df: frame with "predicted_answer", "answer" and "confidence" columns.
        t: confidence threshold passed to penalty_score.

    Returns:
        The mean of the per-row scores, or 0.0 when df has no rows.
    """
    per_row_scores = []
    for row in df.itertuples(index=False):
        per_row_scores.append(
            penalty_score(row.predicted_answer, row.answer, float(row.confidence), t)
        )
    if not per_row_scores:
        return 0.0
    return float(np.mean(per_row_scores))

#Metric 4
def overconfidence_rate(df: pd.DataFrame, t: float) -> float:
    """
    Overconfidence rate: share of all questions answered wrongly.

    A row is counted when predicted_answer differs from answer AND its
    confidence is strictly greater than t. The denominator is every row,
    not only the answered ones.

    Args:
        df: frame with "predicted_answer", "answer" and "confidence" columns.
        t: confidence threshold.

    Returns:
        (# wrong and confidence > t) / (# rows), or 0.0 for an empty frame.
    """
    n_rows = len(df)
    if n_rows == 0:
        return 0.0
    is_wrong = df["predicted_answer"].ne(df["answer"])
    is_confident = df["confidence"].gt(t)
    return float((is_wrong & is_confident).sum() / n_rows)


## Qwen Evaluation

In [196]:
# MMLU - Qwen evaluation: input predictions and output directory.
CSV_PATH = Path("../inference/outputs/qwen-mmlu.csv")  # <-- change this for each run
OUTPUT_PATH = Path("outputs")
# The save cells write into OUTPUT_PATH; create it up front so to_csv does not
# fail with "non-existent directory" on a fresh checkout.
OUTPUT_PATH.mkdir(parents=True, exist_ok=True)


In [197]:
df = pd.read_csv(CSV_PATH)

# Label columns: trimmed, upper-cased strings so that e.g. "b " matches "B".
for label_col in ("answer", "predicted_answer"):
    df[label_col] = df[label_col].astype(str).str.strip().str.upper()

# Numeric columns: a missing/unparseable confidence becomes 0.0 (it can never
# clear a positive threshold); an unparseable threshold becomes NaN.
df["confidence"] = pd.to_numeric(df["confidence"], errors="coerce").fillna(0.0)
df["threshold"] = pd.to_numeric(df["threshold"], errors="coerce")

n_loaded = len(df)
print(f"✅ Loaded {n_loaded} rows from {CSV_PATH.name}")
thresholds_found = sorted(df["threshold"].unique())
print("Thresholds found:", thresholds_found)
df.head(3)

✅ Loaded 3400 rows from qwen-mmlu.csv
Thresholds found: [np.float64(0.25), np.float64(0.5), np.float64(0.75), np.float64(0.9)]


Unnamed: 0,id,threshold,question,choices,answer,predicted_answer,confidence
0,596,0.25,A manufacturer is currently selling 2000 units...,['$2.50' '$1.90' '$2.70' '$2.60' '$1.80' '$2.2...,F,IDK,0.5
1,814,0.25,Fred Lowes is a typewriter salesman. He receiv...,['$210' '$200' '$225' '$175' '$195' '$150' '$2...,I,B,0.833333
2,817,0.25,"Mary Redmond purchased a $28,500 home with 20%...",['$305' '$190' '$171' '$285.50' '$399' '$323' ...,F,IDK,1.0


In [198]:
# ===== Split the predictions into one DataFrame per threshold =====
# Each threshold gets an independent copy of its rows; the row count is
# printed as a quick sanity check.
dfs_by_t = {}
for t in THRESHOLDS:
    dfs_by_t[t] = df.loc[df["threshold"] == t].copy()
    print(f"t={t}: {len(dfs_by_t[t])} rows")


t=0.25: 850 rows
t=0.5: 850 rows
t=0.75: 850 rows
t=0.9: 850 rows


In [199]:
# ===== Per-threshold metrics: one summary record per threshold =====
metrics_rows = []

for t in THRESHOLDS:
    df_t = dfs_by_t[t]
    # Rows whose confidence is strictly above the threshold count as answered.
    n_answered = int((df_t["confidence"] > t).sum())
    record = dict(
        threshold=t,
        accuracy_at_t=accuracy_at_threshold(df_t, t),
        coverage=coverage(df_t, t),
        penalty_mean=penalty_adjusted_mean(df_t, t),
        overconf_rate=overconfidence_rate(df_t, t),
        answered_n=n_answered,
        total_n=len(df_t),
    )
    metrics_rows.append(record)

metrics_df = (
    pd.DataFrame(metrics_rows)
    .sort_values("threshold")
    .reset_index(drop=True)
)
metrics_df


Unnamed: 0,threshold,accuracy_at_t,coverage,penalty_mean,overconf_rate,answered_n,total_n
0,0.25,0.07971,0.974118,-0.17019,0.896471,828,850
1,0.5,0.094183,0.849412,-0.601471,0.769412,722,850
2,0.75,0.081481,0.635294,-1.636941,0.583529,540,850
3,0.9,0.081013,0.464706,-3.805882,0.427059,395,850


In [200]:
# ===== Compact evaluation table: one row per threshold, one column per metric =====
metric_columns = ["accuracy_at_t", "coverage", "penalty_mean", "overconf_rate"]
eval_table = metrics_df.set_index("threshold")[metric_columns].sort_index()

print("4×4 Evaluation Table:")
display(eval_table)

# Only this compact table is written; the answered_n / total_n columns of
# metrics_df are not saved.
results_file = OUTPUT_PATH / "qwen-mmlu-metric-eval.csv"
eval_table.to_csv(results_file)

print(f"Saved results in: {OUTPUT_PATH}")


4×4 Evaluation Table:


Unnamed: 0_level_0,accuracy_at_t,coverage,penalty_mean,overconf_rate
threshold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.25,0.07971,0.974118,-0.17019,0.896471
0.5,0.094183,0.849412,-0.601471,0.769412
0.75,0.081481,0.635294,-1.636941,0.583529
0.9,0.081013,0.464706,-3.805882,0.427059


Saved results in: outputs


### Baseline Evaluation

In [201]:
# Minimum share of questions a threshold must answer to be eligible as t*.
coverage_floor = 0.3

eligible = eval_table[eval_table["coverage"] >= coverage_floor]
print("DEBUG — eligible rows:")
display(eligible)

# idxmax() on an empty selection fails with an opaque message; stop early and
# say what actually went wrong.
if eligible.empty:
    raise ValueError(
        f"No threshold reaches coverage >= {coverage_floor}; cannot select t*."
    )

# t* = the eligible threshold with the highest accuracy on answered questions.
# idxmax returns the first maximal label, so ties resolve to the earliest row
# of eval_table.
best_row = eligible.loc[eligible["accuracy_at_t"].idxmax()]
t_star = best_row.name

print(f"Selected t* = {t_star}")
display(best_row)

DEBUG — eligible rows:


Unnamed: 0_level_0,accuracy_at_t,coverage,penalty_mean,overconf_rate
threshold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.25,0.07971,0.974118,-0.17019,0.896471
0.5,0.094183,0.849412,-0.601471,0.769412
0.75,0.081481,0.635294,-1.636941,0.583529
0.9,0.081013,0.464706,-3.805882,0.427059


Selected t* = 0.5


accuracy_at_t    0.094183
coverage         0.849412
penalty_mean    -0.601471
overconf_rate    0.769412
Name: 0.5, dtype: float64

In [202]:
# ===== Binary-grading and always-abstain baselines =====

# Reload the prediction CSV and apply the SAME label normalization as the load
# cell. Comparing the raw strings would treat case/whitespace variants
# (e.g. "b " vs "B") as wrong and make this baseline inconsistent with the
# per-threshold metrics.
df = pd.read_csv(CSV_PATH)
df["answer"] = df["answer"].astype(str).str.strip().str.upper()
df["predicted_answer"] = df["predicted_answer"].astype(str).str.strip().str.upper()

# Overall accuracy, pooled over every row of the CSV (all threshold runs).
# NOTE(review): abstention tokens stored in predicted_answer (e.g. "IDK") are
# counted as wrong answers here — confirm this is the intended baseline.
binary_acc = np.mean(df["predicted_answer"] == df["answer"])

# Wrong rate (for overconfidence)
wrong_rate = 1 - binary_acc

# Binary baseline metrics (answers everything, so every wrong answer is "overconfident")
binary_row = {
    "accuracy_at_t": binary_acc,
    "coverage": 1.0,
    "penalty_mean": np.nan,  # you can fill with projected penalty if desired
    "overconf_rate": wrong_rate,
}

# Always-abstain baseline metrics: never answers, so every metric is zero
abstain_row = {
    "accuracy_at_t": 0.0,
    "coverage": 0.0,
    "penalty_mean": 0.0,
    "overconf_rate": 0.0,
}

print("Binary-grading baseline:")
display(pd.DataFrame([binary_row], index=["Binary"]))
print("Always-abstain baseline:")
display(pd.DataFrame([abstain_row], index=["Abstain"]))


Binary-grading baseline:


Unnamed: 0,accuracy_at_t,coverage,penalty_mean,overconf_rate
Binary,0.078824,1.0,,0.921176


Always-abstain baseline:


Unnamed: 0,accuracy_at_t,coverage,penalty_mean,overconf_rate
Abstain,0.0,0.0,0.0,0.0


In [203]:
# ===== Headline table: best confidence-aware threshold vs. the two baselines =====
headline_cols = ["accuracy_at_t", "coverage", "penalty_mean", "overconf_rate"]
headline_rows = [
    best_row[headline_cols],
    pd.Series(binary_row),
    pd.Series(abstain_row),
]
headline_labels = [
    f"Confidence-aware (t*={t_star})",
    "Binary grading",
    "Always abstain",
]
headline_df = pd.DataFrame(headline_rows, index=headline_labels)

print("=== 3x4 Headline Evaluation Table ===")
display(headline_df)
headline_df.to_csv(OUTPUT_PATH / "qwen-mmlu-baseline-eval.csv")


=== 3x4 Headline Evaluation Table ===


Unnamed: 0,accuracy_at_t,coverage,penalty_mean,overconf_rate
Confidence-aware (t*=0.5),0.094183,0.849412,-0.601471,0.769412
Binary grading,0.078824,1.0,,0.921176
Always abstain,0.0,0.0,0.0,0.0


## GPT Evaluation

In [204]:
# MMLU - GPT evaluation: input predictions and output directory.
CSV_PATH = Path("../inference/outputs/gpt-mmlu.csv")  # <-- change this for each run
OUTPUT_PATH = Path("outputs")
# The save cells write into OUTPUT_PATH; create it up front so to_csv does not
# fail with "non-existent directory" on a fresh checkout.
OUTPUT_PATH.mkdir(parents=True, exist_ok=True)


In [205]:
df = pd.read_csv(CSV_PATH)

# Label columns: trimmed, upper-cased strings so that e.g. "b " matches "B".
for label_col in ("answer", "predicted_answer"):
    df[label_col] = df[label_col].astype(str).str.strip().str.upper()

# Numeric columns: a missing/unparseable confidence becomes 0.0 (it can never
# clear a positive threshold); an unparseable threshold becomes NaN.
df["confidence"] = pd.to_numeric(df["confidence"], errors="coerce").fillna(0.0)
df["threshold"] = pd.to_numeric(df["threshold"], errors="coerce")

n_loaded = len(df)
print(f"✅ Loaded {n_loaded} rows from {CSV_PATH.name}")
thresholds_found = sorted(df["threshold"].unique())
print("Thresholds found:", thresholds_found)
df.head(3)

✅ Loaded 3400 rows from gpt-mmlu.csv
Thresholds found: [np.float64(0.25), np.float64(0.5), np.float64(0.75), np.float64(0.9)]


Unnamed: 0,id,threshold,question,choices,answer,predicted_answer,confidence
0,596,0.25,A manufacturer is currently selling 2000 units...,['$2.50' '$1.90' '$2.70' '$2.60' '$1.80' '$2.2...,F,D,0.333333
1,814,0.25,Fred Lowes is a typewriter salesman. He receiv...,['$210' '$200' '$225' '$175' '$195' '$150' '$2...,I,B,0.666667
2,817,0.25,"Mary Redmond purchased a $28,500 home with 20%...",['$305' '$190' '$171' '$285.50' '$399' '$323' ...,F,C,0.5


In [206]:
# ===== Split the predictions into one DataFrame per threshold =====
# Each threshold gets an independent copy of its rows; the row count is
# printed as a quick sanity check.
dfs_by_t = {}
for t in THRESHOLDS:
    dfs_by_t[t] = df.loc[df["threshold"] == t].copy()
    print(f"t={t}: {len(dfs_by_t[t])} rows")


t=0.25: 850 rows
t=0.5: 850 rows
t=0.75: 850 rows
t=0.9: 850 rows


In [207]:
# ===== Per-threshold metrics: one summary record per threshold =====
metrics_rows = []

for t in THRESHOLDS:
    df_t = dfs_by_t[t]
    # Rows whose confidence is strictly above the threshold count as answered.
    n_answered = int((df_t["confidence"] > t).sum())
    record = dict(
        threshold=t,
        accuracy_at_t=accuracy_at_threshold(df_t, t),
        coverage=coverage(df_t, t),
        penalty_mean=penalty_adjusted_mean(df_t, t),
        overconf_rate=overconfidence_rate(df_t, t),
        answered_n=n_answered,
        total_n=len(df_t),
    )
    metrics_rows.append(record)

metrics_df = (
    pd.DataFrame(metrics_rows)
    .sort_values("threshold")
    .reset_index(drop=True)
)
metrics_df


Unnamed: 0,threshold,accuracy_at_t,coverage,penalty_mean,overconf_rate,answered_n,total_n
0,0.25,0.197647,1.0,-0.011542,0.802353,850,850
1,0.5,0.208517,0.801176,-0.386275,0.634118,681,850
2,0.75,0.248,0.441176,-0.834118,0.331765,375,850
3,0.9,0.0,0.0,0.0,0.0,0,850


In [208]:
# ===== Compact evaluation table: one row per threshold, one column per metric =====
metric_columns = ["accuracy_at_t", "coverage", "penalty_mean", "overconf_rate"]
eval_table = metrics_df.set_index("threshold")[metric_columns].sort_index()

print("4×4 Evaluation Table:")
display(eval_table)

# Only this compact table is written; the answered_n / total_n columns of
# metrics_df are not saved.
results_file = OUTPUT_PATH / "gpt-mmlu-metric-eval.csv"
eval_table.to_csv(results_file)

print(f"Saved results in: {OUTPUT_PATH}")


4×4 Evaluation Table:


Unnamed: 0_level_0,accuracy_at_t,coverage,penalty_mean,overconf_rate
threshold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.25,0.197647,1.0,-0.011542,0.802353
0.5,0.208517,0.801176,-0.386275,0.634118
0.75,0.248,0.441176,-0.834118,0.331765
0.9,0.0,0.0,0.0,0.0


Saved results in: outputs


### Baseline Evaluation

In [209]:
# Minimum share of questions a threshold must answer to be eligible as t*.
coverage_floor = 0.3

eligible = eval_table[eval_table["coverage"] >= coverage_floor]
print("DEBUG — eligible rows:")
display(eligible)

# idxmax() on an empty selection fails with an opaque message; stop early and
# say what actually went wrong.
if eligible.empty:
    raise ValueError(
        f"No threshold reaches coverage >= {coverage_floor}; cannot select t*."
    )

# t* = the eligible threshold with the highest accuracy on answered questions.
# idxmax returns the first maximal label, so ties resolve to the earliest row
# of eval_table.
best_row = eligible.loc[eligible["accuracy_at_t"].idxmax()]
t_star = best_row.name

print(f"Selected t* = {t_star}")
display(best_row)

DEBUG — eligible rows:


Unnamed: 0_level_0,accuracy_at_t,coverage,penalty_mean,overconf_rate
threshold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.25,0.197647,1.0,-0.011542,0.802353
0.5,0.208517,0.801176,-0.386275,0.634118
0.75,0.248,0.441176,-0.834118,0.331765


Selected t* = 0.75


accuracy_at_t    0.248000
coverage         0.441176
penalty_mean    -0.834118
overconf_rate    0.331765
Name: 0.75, dtype: float64

In [210]:
# ===== Binary-grading and always-abstain baselines =====

# Reload the prediction CSV and apply the SAME label normalization as the load
# cell. Comparing the raw strings would treat case/whitespace variants
# (e.g. "b " vs "B") as wrong and make this baseline inconsistent with the
# per-threshold metrics.
df = pd.read_csv(CSV_PATH)
df["answer"] = df["answer"].astype(str).str.strip().str.upper()
df["predicted_answer"] = df["predicted_answer"].astype(str).str.strip().str.upper()

# Overall accuracy, pooled over every row of the CSV (all threshold runs).
# NOTE(review): abstention tokens stored in predicted_answer (e.g. "IDK") are
# counted as wrong answers here — confirm this is the intended baseline.
binary_acc = np.mean(df["predicted_answer"] == df["answer"])

# Wrong rate (for overconfidence)
wrong_rate = 1 - binary_acc

# Binary baseline metrics (answers everything, so every wrong answer is "overconfident")
binary_row = {
    "accuracy_at_t": binary_acc,
    "coverage": 1.0,
    "penalty_mean": np.nan,  # you can fill with projected penalty if desired
    "overconf_rate": wrong_rate,
}

# Always-abstain baseline metrics: never answers, so every metric is zero
abstain_row = {
    "accuracy_at_t": 0.0,
    "coverage": 0.0,
    "penalty_mean": 0.0,
    "overconf_rate": 0.0,
}

print("Binary-grading baseline:")
display(pd.DataFrame([binary_row], index=["Binary"]))
print("Always-abstain baseline:")
display(pd.DataFrame([abstain_row], index=["Abstain"]))


Binary-grading baseline:


Unnamed: 0,accuracy_at_t,coverage,penalty_mean,overconf_rate
Binary,0.134118,1.0,,0.865882


Always-abstain baseline:


Unnamed: 0,accuracy_at_t,coverage,penalty_mean,overconf_rate
Abstain,0.0,0.0,0.0,0.0


In [211]:
# ===== Headline table: best confidence-aware threshold vs. the two baselines =====
headline_cols = ["accuracy_at_t", "coverage", "penalty_mean", "overconf_rate"]
headline_rows = [
    best_row[headline_cols],
    pd.Series(binary_row),
    pd.Series(abstain_row),
]
headline_labels = [
    f"Confidence-aware (t*={t_star})",
    "Binary grading",
    "Always abstain",
]
headline_df = pd.DataFrame(headline_rows, index=headline_labels)

print("=== 3x4 Headline Evaluation Table ===")
display(headline_df)
headline_df.to_csv(OUTPUT_PATH / "gpt-mmlu-baseline-eval.csv")


=== 3x4 Headline Evaluation Table ===


Unnamed: 0,accuracy_at_t,coverage,penalty_mean,overconf_rate
Confidence-aware (t*=0.75),0.248,0.441176,-0.834118,0.331765
Binary grading,0.134118,1.0,,0.865882
Always abstain,0.0,0.0,0.0,0.0


## Claude Evaluation

In [212]:
# MMLU - Claude evaluation: input predictions and output directory.
CSV_PATH = Path("../inference/outputs/claude-mmlu.csv")  # <-- change this for each run
OUTPUT_PATH = Path("outputs")
# The save cells write into OUTPUT_PATH; create it up front so to_csv does not
# fail with "non-existent directory" on a fresh checkout.
OUTPUT_PATH.mkdir(parents=True, exist_ok=True)


In [213]:
df = pd.read_csv(CSV_PATH)

# Label columns: trimmed, upper-cased strings so that e.g. "b " matches "B".
for label_col in ("answer", "predicted_answer"):
    df[label_col] = df[label_col].astype(str).str.strip().str.upper()

# Numeric columns: a missing/unparseable confidence becomes 0.0 (it can never
# clear a positive threshold); an unparseable threshold becomes NaN.
df["confidence"] = pd.to_numeric(df["confidence"], errors="coerce").fillna(0.0)
df["threshold"] = pd.to_numeric(df["threshold"], errors="coerce")

n_loaded = len(df)
print(f"✅ Loaded {n_loaded} rows from {CSV_PATH.name}")
thresholds_found = sorted(df["threshold"].unique())
print("Thresholds found:", thresholds_found)
df.head(3)

✅ Loaded 3400 rows from claude-mmlu.csv
Thresholds found: [np.float64(0.25), np.float64(0.5), np.float64(0.75), np.float64(0.9)]


Unnamed: 0,id,threshold,question,choices,answer,predicted_answer,confidence
0,596,0.25,A manufacturer is currently selling 2000 units...,['$2.50' '$1.90' '$2.70' '$2.60' '$1.80' '$2.2...,F,I,1.0
1,814,0.25,Fred Lowes is a typewriter salesman. He receiv...,['$210' '$200' '$225' '$175' '$195' '$150' '$2...,I,I,1.0
2,817,0.25,"Mary Redmond purchased a $28,500 home with 20%...",['$305' '$190' '$171' '$285.50' '$399' '$323' ...,F,I,1.0


In [214]:
# ===== Split the predictions into one DataFrame per threshold =====
# Each threshold gets an independent copy of its rows; the row count is
# printed as a quick sanity check.
dfs_by_t = {}
for t in THRESHOLDS:
    dfs_by_t[t] = df.loc[df["threshold"] == t].copy()
    print(f"t={t}: {len(dfs_by_t[t])} rows")


t=0.25: 850 rows
t=0.5: 850 rows
t=0.75: 850 rows
t=0.9: 850 rows


In [215]:
# ===== Per-threshold metrics: one summary record per threshold =====
metrics_rows = []

for t in THRESHOLDS:
    df_t = dfs_by_t[t]
    # Rows whose confidence is strictly above the threshold count as answered.
    n_answered = int((df_t["confidence"] > t).sum())
    record = dict(
        threshold=t,
        accuracy_at_t=accuracy_at_threshold(df_t, t),
        coverage=coverage(df_t, t),
        penalty_mean=penalty_adjusted_mean(df_t, t),
        overconf_rate=overconfidence_rate(df_t, t),
        answered_n=n_answered,
        total_n=len(df_t),
    )
    metrics_rows.append(record)

metrics_df = (
    pd.DataFrame(metrics_rows)
    .sort_values("threshold")
    .reset_index(drop=True)
)



In [216]:
# ===== Compact evaluation table: one row per threshold, one column per metric =====
metric_columns = ["accuracy_at_t", "coverage", "penalty_mean", "overconf_rate"]
eval_table = metrics_df.set_index("threshold")[metric_columns].sort_index()

print("4×4 Evaluation Table:")
display(eval_table)

# Only this compact table is written; the answered_n / total_n columns of
# metrics_df are not saved.
results_file = OUTPUT_PATH / "claude-mmlu-metric-eval.csv"
eval_table.to_csv(results_file)

print(f"Saved results in: {OUTPUT_PATH}")


4×4 Evaluation Table:


Unnamed: 0_level_0,accuracy_at_t,coverage,penalty_mean,overconf_rate
threshold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.25,0.134454,0.98,-0.133105,0.848235
0.5,0.137107,0.935294,-0.642902,0.807059
0.75,0.129032,0.911765,-2.230353,0.794118
0.9,0.110957,0.848235,-6.692941,0.754118


Saved results in: outputs


### Baseline Evaluation

In [217]:
# Minimum share of questions a threshold must answer to be eligible as t*.
coverage_floor = 0.3

eligible = eval_table[eval_table["coverage"] >= coverage_floor]
print("DEBUG — eligible rows:")
display(eligible)

# idxmax() on an empty selection fails with an opaque message; stop early and
# say what actually went wrong.
if eligible.empty:
    raise ValueError(
        f"No threshold reaches coverage >= {coverage_floor}; cannot select t*."
    )

# t* = the eligible threshold with the highest accuracy on answered questions.
# idxmax returns the first maximal label, so ties resolve to the earliest row
# of eval_table.
best_row = eligible.loc[eligible["accuracy_at_t"].idxmax()]
t_star = best_row.name

print(f"Selected t* = {t_star}")
display(best_row)

DEBUG — eligible rows:


Unnamed: 0_level_0,accuracy_at_t,coverage,penalty_mean,overconf_rate
threshold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.25,0.134454,0.98,-0.133105,0.848235
0.5,0.137107,0.935294,-0.642902,0.807059
0.75,0.129032,0.911765,-2.230353,0.794118
0.9,0.110957,0.848235,-6.692941,0.754118


Selected t* = 0.5


accuracy_at_t    0.137107
coverage         0.935294
penalty_mean    -0.642902
overconf_rate    0.807059
Name: 0.5, dtype: float64

In [218]:
# ===== Binary-grading and always-abstain baselines =====

# Reload the prediction CSV and apply the SAME label normalization as the load
# cell. Comparing the raw strings would treat case/whitespace variants
# (e.g. "b " vs "B") as wrong and make this baseline inconsistent with the
# per-threshold metrics.
df = pd.read_csv(CSV_PATH)
df["answer"] = df["answer"].astype(str).str.strip().str.upper()
df["predicted_answer"] = df["predicted_answer"].astype(str).str.strip().str.upper()

# Overall accuracy, pooled over every row of the CSV (all threshold runs).
# NOTE(review): abstention tokens stored in predicted_answer (e.g. "IDK") are
# counted as wrong answers here — confirm this is the intended baseline.
binary_acc = np.mean(df["predicted_answer"] == df["answer"])

# Wrong rate (for overconfidence)
wrong_rate = 1 - binary_acc

# Binary baseline metrics (answers everything, so every wrong answer is "overconfident")
binary_row = {
    "accuracy_at_t": binary_acc,
    "coverage": 1.0,
    "penalty_mean": np.nan,  # you can fill with projected penalty if desired
    "overconf_rate": wrong_rate,
}

# Always-abstain baseline metrics: never answers, so every metric is zero
abstain_row = {
    "accuracy_at_t": 0.0,
    "coverage": 0.0,
    "penalty_mean": 0.0,
    "overconf_rate": 0.0,
}

print("Binary-grading baseline:")
display(pd.DataFrame([binary_row], index=["Binary"]))
print("Always-abstain baseline:")
display(pd.DataFrame([abstain_row], index=["Abstain"]))


Binary-grading baseline:


Unnamed: 0,accuracy_at_t,coverage,penalty_mean,overconf_rate
Binary,0.127647,1.0,,0.872353


Always-abstain baseline:


Unnamed: 0,accuracy_at_t,coverage,penalty_mean,overconf_rate
Abstain,0.0,0.0,0.0,0.0


In [219]:
# ===== Headline table: best confidence-aware threshold vs. the two baselines =====
headline_cols = ["accuracy_at_t", "coverage", "penalty_mean", "overconf_rate"]
headline_rows = [
    best_row[headline_cols],
    pd.Series(binary_row),
    pd.Series(abstain_row),
]
headline_labels = [
    f"Confidence-aware (t*={t_star})",
    "Binary grading",
    "Always abstain",
]
headline_df = pd.DataFrame(headline_rows, index=headline_labels)

print("=== 3x4 Headline Evaluation Table ===")
display(headline_df)
headline_df.to_csv(OUTPUT_PATH / "claude-mmlu-baseline-eval.csv")


=== 3x4 Headline Evaluation Table ===


Unnamed: 0,accuracy_at_t,coverage,penalty_mean,overconf_rate
Confidence-aware (t*=0.5),0.137107,0.935294,-0.642902,0.807059
Binary grading,0.127647,1.0,,0.872353
Always abstain,0.0,0.0,0.0,0.0


## Llama Evaluation

In [220]:
# MMLU - Llama evaluation: input predictions and output directory.
CSV_PATH = Path("../inference/outputs/llama-mmlu.csv")  # <-- change this for each run
OUTPUT_PATH = Path("outputs")


In [221]:
# df = pd.read_csv(CSV_PATH)

# # Normalize strings & types
# df["answer"] = df["answer"].astype(str).str.strip().str.upper()
# df["predicted_answer"] = df["predicted_answer"].astype(str).str.strip().str.upper()
# df["confidence"] = pd.to_numeric(df["confidence"], errors="coerce").fillna(0.0)
# df["threshold"] = pd.to_numeric(df["threshold"], errors="coerce")

# print(f"✅ Loaded {len(df)} rows from {CSV_PATH.name}")
# print("Thresholds found:", sorted(df["threshold"].unique()))
# df.head(3)

In [222]:
# # ===== Cell 4 — Split into per-threshold DataFrames =====
# dfs_by_t = {t: df[df["threshold"] == t].copy() for t in THRESHOLDS}

# for t in THRESHOLDS:
#     print(f"t={t}: {len(dfs_by_t[t])} rows")


In [223]:
# # ===== Cell 5 — Compute metrics per threshold =====
# metrics_rows = []

# for t in THRESHOLDS:
#     df_t = dfs_by_t[t]

#     metrics_rows.append({
#         "threshold": t,
#         "accuracy_at_t": accuracy_at_threshold(df_t, t),
#         "coverage": coverage(df_t, t),
#         "penalty_mean": penalty_adjusted_mean(df_t, t),
#         "overconf_rate": overconfidence_rate(df_t, t),
#         "answered_n": int((df_t["confidence"] > t).sum()),
#         "total_n": len(df_t)
#     })

# metrics_df = pd.DataFrame(metrics_rows).sort_values("threshold").reset_index(drop=True)


In [224]:
# # ===== Cell 6 — 4×4 evaluation table =====
# eval_table = (
#     metrics_df[["threshold", "accuracy_at_t", "coverage", "penalty_mean", "overconf_rate"]]
#     .set_index("threshold")
#     .sort_index()
# )

# # Display
# print("4×4 Evaluation Table:")
# display(eval_table)

# # Save both detailed and compact tables
# eval_table.to_csv(OUTPUT_PATH / "llama-mmlu-metric-eval.csv")

# print(f"Saved results in: {OUTPUT_PATH}")


### Baseline

In [225]:
# coverage_floor = 0.3

# eligible = eval_table[eval_table["coverage"] >= coverage_floor]
# print("DEBUG — eligible rows:")
# display(eligible)

# best_row = eligible.loc[eligible["accuracy_at_t"].idxmax()]
# t_star = best_row.name

# print(f"Selected t* = {t_star}")
# display(best_row)

In [226]:
# # ===== Cell 2 – Compute binary-grading and always-abstain baselines =====
# import numpy as np

# # Load the original prediction CSV (same used to compute metrics_df)
# df = pd.read_csv(CSV_PATH)

# # Compute overall accuracy
# binary_acc = np.mean(df["predicted_answer"] == df["answer"])

# # Wrong rate (for overconfidence)
# wrong_rate = 1 - binary_acc

# # Binary baseline metrics (answers everything)
# binary_row = {
#     "accuracy_at_t": binary_acc,
#     "coverage": 1.0,
#     "penalty_mean": np.nan,  # you can fill with projected penalty if desired
#     "overconf_rate": wrong_rate,
# }

# # Always-abstain baseline metrics
# abstain_row = {
#     "accuracy_at_t": 0.0,
#     "coverage": 0.0,
#     "penalty_mean": 0.0,
#     "overconf_rate": 0.0,
# }

# print("Binary-grading baseline:")
# display(pd.DataFrame([binary_row], index=["Binary"]))
# print("Always-abstain baseline:")
# display(pd.DataFrame([abstain_row], index=["Abstain"]))


In [227]:
# # ===== Cell 3 – Build final 3x4 headline table =====

# headline_df = pd.DataFrame([
#     best_row[["accuracy_at_t", "coverage", "penalty_mean", "overconf_rate"]],
#     pd.Series(binary_row),
#     pd.Series(abstain_row)
# ], index=[f"Confidence-aware (t*={t_star})", "Binary grading", "Always abstain"])

# print("=== 3x4 Headline Evaluation Table ===")
# display(headline_df)
# headline_df.to_csv(OUTPUT_PATH / "llama-mmlu-baseline-eval.csv")


## Gemini Evaluation

In [228]:
# MMLU - Gemini evaluation: input predictions and output directory.
CSV_PATH = Path("../inference/outputs/gemini-mmlu.csv")  # <-- change this for each run
OUTPUT_PATH = Path("outputs")
# The save cells write into OUTPUT_PATH; create it up front so to_csv does not
# fail with "non-existent directory" on a fresh checkout.
OUTPUT_PATH.mkdir(parents=True, exist_ok=True)


In [229]:
df = pd.read_csv(CSV_PATH)

# Label columns: trimmed, upper-cased strings so that e.g. "b " matches "B".
for label_col in ("answer", "predicted_answer"):
    df[label_col] = df[label_col].astype(str).str.strip().str.upper()

# Numeric columns: a missing/unparseable confidence becomes 0.0 (it can never
# clear a positive threshold); an unparseable threshold becomes NaN.
df["confidence"] = pd.to_numeric(df["confidence"], errors="coerce").fillna(0.0)
df["threshold"] = pd.to_numeric(df["threshold"], errors="coerce")

n_loaded = len(df)
print(f"✅ Loaded {n_loaded} rows from {CSV_PATH.name}")
thresholds_found = sorted(df["threshold"].unique())
print("Thresholds found:", thresholds_found)
df.head(3)

✅ Loaded 3400 rows from gemini-mmlu.csv
Thresholds found: [np.float64(0.25), np.float64(0.5), np.float64(0.75), np.float64(0.9)]


Unnamed: 0,id,threshold,question,choices,answer,predicted_answer,confidence
0,596,0.25,A manufacturer is currently selling 2000 units...,['$2.50' '$1.90' '$2.70' '$2.60' '$1.80' '$2.2...,F,X,1.0
1,814,0.25,Fred Lowes is a typewriter salesman. He receiv...,['$210' '$200' '$225' '$175' '$195' '$150' '$2...,I,S,1.0
2,817,0.25,"Mary Redmond purchased a $28,500 home with 20%...",['$305' '$190' '$171' '$285.50' '$399' '$323' ...,F,S,1.0


In [230]:
# ===== Split the predictions into one DataFrame per threshold =====
# Each threshold gets an independent copy of its rows; the row count is
# printed as a quick sanity check.
dfs_by_t = {}
for t in THRESHOLDS:
    dfs_by_t[t] = df.loc[df["threshold"] == t].copy()
    print(f"t={t}: {len(dfs_by_t[t])} rows")


t=0.25: 850 rows
t=0.5: 850 rows
t=0.75: 850 rows
t=0.9: 850 rows


In [231]:
# ===== Per-threshold metrics: one summary record per threshold =====
metrics_rows = []

for t in THRESHOLDS:
    df_t = dfs_by_t[t]
    # Rows whose confidence is strictly above the threshold count as answered.
    n_answered = int((df_t["confidence"] > t).sum())
    record = dict(
        threshold=t,
        accuracy_at_t=accuracy_at_threshold(df_t, t),
        coverage=coverage(df_t, t),
        penalty_mean=penalty_adjusted_mean(df_t, t),
        overconf_rate=overconfidence_rate(df_t, t),
        answered_n=n_answered,
        total_n=len(df_t),
    )
    metrics_rows.append(record)

metrics_df = (
    pd.DataFrame(metrics_rows)
    .sort_values("threshold")
    .reset_index(drop=True)
)
metrics_df


Unnamed: 0,threshold,accuracy_at_t,coverage,penalty_mean,overconf_rate,answered_n,total_n
0,0.25,0.123693,0.675294,-0.087301,0.591765,574,850
1,0.5,0.133231,0.768235,-0.515431,0.665882,653,850
2,0.75,0.138739,0.652941,-1.559059,0.562353,555,850
3,0.9,0.139831,0.555294,-4.221176,0.477647,472,850


In [232]:
# ===== Compact evaluation table: one row per threshold, one column per metric =====
metric_columns = ["accuracy_at_t", "coverage", "penalty_mean", "overconf_rate"]
eval_table = metrics_df.set_index("threshold")[metric_columns].sort_index()

print("4×4 Evaluation Table:")
display(eval_table)

# Only this compact table is written; the answered_n / total_n columns of
# metrics_df are not saved.
results_file = OUTPUT_PATH / "gemini-mmlu-metric-eval.csv"
eval_table.to_csv(results_file)

print(f"Saved results in: {OUTPUT_PATH}")


4×4 Evaluation Table:


Unnamed: 0_level_0,accuracy_at_t,coverage,penalty_mean,overconf_rate
threshold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.25,0.123693,0.675294,-0.087301,0.591765
0.5,0.133231,0.768235,-0.515431,0.665882
0.75,0.138739,0.652941,-1.559059,0.562353
0.9,0.139831,0.555294,-4.221176,0.477647


Saved results in: outputs


### Baseline Evaluation

In [233]:
# Minimum share of questions a threshold must answer to be eligible as t*.
coverage_floor = 0.3

eligible = eval_table[eval_table["coverage"] >= coverage_floor]
print("DEBUG — eligible rows:")
display(eligible)

# idxmax() on an empty selection fails with an opaque message; stop early and
# say what actually went wrong.
if eligible.empty:
    raise ValueError(
        f"No threshold reaches coverage >= {coverage_floor}; cannot select t*."
    )

# t* = the eligible threshold with the highest accuracy on answered questions.
# idxmax returns the first maximal label, so ties resolve to the earliest row
# of eval_table.
best_row = eligible.loc[eligible["accuracy_at_t"].idxmax()]
t_star = best_row.name

print(f"Selected t* = {t_star}")
display(best_row)

DEBUG — eligible rows:


Unnamed: 0_level_0,accuracy_at_t,coverage,penalty_mean,overconf_rate
threshold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.25,0.123693,0.675294,-0.087301,0.591765
0.5,0.133231,0.768235,-0.515431,0.665882
0.75,0.138739,0.652941,-1.559059,0.562353
0.9,0.139831,0.555294,-4.221176,0.477647


Selected t* = 0.9


accuracy_at_t    0.139831
coverage         0.555294
penalty_mean    -4.221176
overconf_rate    0.477647
Name: 0.9, dtype: float64

In [234]:
# ===== Binary-grading and always-abstain baselines =====

# Reload the prediction CSV and apply the SAME label normalization as the load
# cell. Comparing the raw strings would treat case/whitespace variants
# (e.g. "b " vs "B") as wrong and make this baseline inconsistent with the
# per-threshold metrics.
df = pd.read_csv(CSV_PATH)
df["answer"] = df["answer"].astype(str).str.strip().str.upper()
df["predicted_answer"] = df["predicted_answer"].astype(str).str.strip().str.upper()

# Overall accuracy, pooled over every row of the CSV (all threshold runs).
# NOTE(review): abstention tokens stored in predicted_answer (e.g. "IDK") are
# counted as wrong answers here — confirm this is the intended baseline.
binary_acc = np.mean(df["predicted_answer"] == df["answer"])

# Wrong rate (for overconfidence)
wrong_rate = 1 - binary_acc

# Binary baseline metrics (answers everything, so every wrong answer is "overconfident")
binary_row = {
    "accuracy_at_t": binary_acc,
    "coverage": 1.0,
    "penalty_mean": np.nan,  # you can fill with projected penalty if desired
    "overconf_rate": wrong_rate,
}

# Always-abstain baseline metrics: never answers, so every metric is zero
abstain_row = {
    "accuracy_at_t": 0.0,
    "coverage": 0.0,
    "penalty_mean": 0.0,
    "overconf_rate": 0.0,
}

print("Binary-grading baseline:")
display(pd.DataFrame([binary_row], index=["Binary"]))
print("Always-abstain baseline:")
display(pd.DataFrame([abstain_row], index=["Abstain"]))


Binary-grading baseline:


Unnamed: 0,accuracy_at_t,coverage,penalty_mean,overconf_rate
Binary,0.105588,1.0,,0.894412


Always-abstain baseline:


Unnamed: 0,accuracy_at_t,coverage,penalty_mean,overconf_rate
Abstain,0.0,0.0,0.0,0.0


In [235]:
# ===== Headline table: best confidence-aware threshold vs. the two baselines =====
headline_cols = ["accuracy_at_t", "coverage", "penalty_mean", "overconf_rate"]
headline_rows = [
    best_row[headline_cols],
    pd.Series(binary_row),
    pd.Series(abstain_row),
]
headline_labels = [
    f"Confidence-aware (t*={t_star})",
    "Binary grading",
    "Always abstain",
]
headline_df = pd.DataFrame(headline_rows, index=headline_labels)

print("=== 3x4 Headline Evaluation Table ===")
display(headline_df)
headline_df.to_csv(OUTPUT_PATH / "gemini-mmlu-baseline-eval.csv")


=== 3x4 Headline Evaluation Table ===


Unnamed: 0,accuracy_at_t,coverage,penalty_mean,overconf_rate
Confidence-aware (t*=0.9),0.139831,0.555294,-4.221176,0.477647
Binary grading,0.105588,1.0,,0.894412
Always abstain,0.0,0.0,0.0,0.0


## Mistral Evaluation

In [236]:
# # MMLU - Mistral evaluation
# CSV_PATH = Path("../inference/outputs/mistral-mmlu.csv")  # <-- change this for each run
# OUTPUT_PATH = Path("outputs")


In [237]:
# df = pd.read_csv(CSV_PATH)

# # Normalize strings & types
# df["answer"] = df["answer"].astype(str).str.strip().str.upper()
# df["predicted_answer"] = df["predicted_answer"].astype(str).str.strip().str.upper()
# df["confidence"] = pd.to_numeric(df["confidence"], errors="coerce").fillna(0.0)
# df["threshold"] = pd.to_numeric(df["threshold"], errors="coerce")

# print(f"✅ Loaded {len(df)} rows from {CSV_PATH.name}")
# print("Thresholds found:", sorted(df["threshold"].unique()))
# df.head(3)

In [238]:
# # ===== Cell 4 — Split into per-threshold DataFrames =====
# dfs_by_t = {t: df[df["threshold"] == t].copy() for t in THRESHOLDS}

# for t in THRESHOLDS:
#     print(f"t={t}: {len(dfs_by_t[t])} rows")


In [239]:
# # ===== Cell 5 — Compute metrics per threshold =====
# metrics_rows = []

# for t in THRESHOLDS:
#     df_t = dfs_by_t[t]

#     metrics_rows.append({
#         "threshold": t,
#         "accuracy_at_t": accuracy_at_threshold(df_t, t),
#         "coverage": coverage(df_t, t),
#         "penalty_mean": penalty_adjusted_mean(df_t, t),
#         "overconf_rate": overconfidence_rate(df_t, t),
#         "answered_n": int((df_t["confidence"] > t).sum()),
#         "total_n": len(df_t)
#     })

# metrics_df = pd.DataFrame(metrics_rows).sort_values("threshold").reset_index(drop=True)
# metrics_df


In [240]:
# # ===== Cell 6 — 4×4 evaluation table =====
# eval_table = (
#     metrics_df[["threshold", "accuracy_at_t", "coverage", "penalty_mean", "overconf_rate"]]
#     .set_index("threshold")
#     .sort_index()
# )

# # Display
# print("4×4 Evaluation Table:")
# display(eval_table)

# # Save both detailed and compact tables
# eval_table.to_csv(OUTPUT_PATH / "mistral-mmlu-metric-eval.csv")

# print(f"Saved results in: {OUTPUT_PATH}")


### Baseline

In [241]:
# coverage_floor = 0.3

# eligible = eval_table[eval_table["coverage"] >= coverage_floor]
# print("DEBUG — eligible rows:")
# display(eligible)

# best_row = eligible.loc[eligible["accuracy_at_t"].idxmax()]
# t_star = best_row.name

# print(f"Selected t* = {t_star}")
# display(best_row)

In [242]:
# # ===== Cell 2 – Compute binary-grading and always-abstain baselines =====
# import numpy as np

# # Load the original prediction CSV (same used to compute metrics_df)
# df = pd.read_csv(CSV_PATH)

# # Compute overall accuracy
# binary_acc = np.mean(df["predicted_answer"] == df["answer"])

# # Wrong rate (for overconfidence)
# wrong_rate = 1 - binary_acc

# # Binary baseline metrics (answers everything)
# binary_row = {
#     "accuracy_at_t": binary_acc,
#     "coverage": 1.0,
#     "penalty_mean": np.nan,  # you can fill with projected penalty if desired
#     "overconf_rate": wrong_rate,
# }

# # Always-abstain baseline metrics
# abstain_row = {
#     "accuracy_at_t": 0.0,
#     "coverage": 0.0,
#     "penalty_mean": 0.0,
#     "overconf_rate": 0.0,
# }

# print("Binary-grading baseline:")
# display(pd.DataFrame([binary_row], index=["Binary"]))
# print("Always-abstain baseline:")
# display(pd.DataFrame([abstain_row], index=["Abstain"]))


In [243]:
# # ===== Cell 3 – Build final 3x4 headline table =====

# headline_df = pd.DataFrame([
#     best_row[["accuracy_at_t", "coverage", "penalty_mean", "overconf_rate"]],
#     pd.Series(binary_row),
#     pd.Series(abstain_row)
# ], index=[f"Confidence-aware (t*={t_star})", "Binary grading", "Always abstain"])

# print("=== 3x4 Headline Evaluation Table ===")
# display(headline_df)
# headline_df.to_csv(OUTPUT_PATH / "mistral-mmlu-baseline-eval.csv")
