# Imports, Setup, Helper Functions


In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

# Common thresholds (used for all model–dataset combinations)
THRESHOLDS = [0.25, 0.5, 0.75, 0.9]

In [2]:
# ===== Cell 2 — Metric functions =====

def decide(p: float, t: float) -> bool:
    """
    Decision rule: returns True if the model 'answers' the question.
    We assume the model only answers when its confidence p exceeds the threshold t.
    """
    return p > t


def penalty_score(pred: str, gold: str, p: float, t: float) -> float:
    """
    Penalty-adjusted score (as defined in the paper/proposal):
      - If p <= t  → the model abstains → score = 0.
      - If p > t and prediction == gold → score = +1.
      - If p > t and prediction != gold → score = - (p * t) / (1 - t)
        (this is a confidence-proportional penalty for being overconfident).
    """
    if p <= t:
        return 0.0
    if pred == gold:
        return 1.0
    return - (p * t) / (1.0 - t)


#Metric 1
def accuracy_at_threshold(df: pd.DataFrame, t: float) -> float:
    """
    Accuracy@t:
      Fraction of *answered* questions that are correct.
      = (# correct with p>t) / (# answered with p>t)
    If the model abstains on all (no p>t), returns 0.0.
    """
    answered = df["confidence"] > t
    answered_n = answered.sum()
    if answered_n == 0:
        return 0.0
    correct = (df["predicted_answer"] == df["answer"]) & answered
    return float(correct.sum() / answered_n)

#Metric 2
def coverage(df: pd.DataFrame, t: float) -> float:
    """
    Coverage:
      Fraction of total questions that the model *answers*.
      = (# p>t) / total
    """
    if len(df) == 0:
        return 0.0
    return float((df["confidence"] > t).sum() / len(df))

#Metric 3
def penalty_adjusted_mean(df: pd.DataFrame, t: float) -> float:
    """
    Mean penalty-adjusted score across all rows (including abstains).
    Abstentions contribute 0.
    """
    scores = [
        penalty_score(r.predicted_answer, r.answer, float(r.confidence), t)
        for r in df.itertuples(index=False)
    ]
    return float(np.mean(scores)) if scores else 0.0

#Metric 4
def overconfidence_rate(df: pd.DataFrame, t: float) -> float:
    """
    Overconfidence rate:
      Fraction of questions where the model is *wrong* but still confident (p>t).
      = (# wrong & p>t) / total
    """
    if len(df) == 0:
        return 0.0
    mask = (df["predicted_answer"] != df["answer"]) & (df["confidence"] > t)
    return float(mask.sum() / len(df))


## GPQA -Qwen Evaluation

In [3]:
#GPQA - Qwen Evalution 
BASE_PATH = Path("gpqa-dataset/qwen")  # <-- change this for each run
OUT_DIR = BASE_PATH / "evaluations"
OUT_DIR.mkdir(parents=True, exist_ok=True)

In [4]:
# ===== Cell 3 — Load model–dataset results =====
# Example: /gpqa-dataset/qwen_results/qwen25_1.5b_gpqa_outputs_by_threshold.csv
csv_path = BASE_PATH / "data/qwen25_1.5b_gpqa_outputs_by_threshold.csv"


df = pd.read_csv(csv_path)

In [5]:

# Normalize strings & types
df["answer"] = df["answer"].astype(str).str.strip().str.upper()
df["predicted_answer"] = df["predicted_answer"].astype(str).str.strip().str.upper()
df["confidence"] = pd.to_numeric(df["confidence"], errors="coerce").fillna(0.0)
df["threshold"] = pd.to_numeric(df["threshold"], errors="coerce")

print(f"✅ Loaded {len(df)} rows from {csv_path.name}")
print("Thresholds found:", sorted(df["threshold"].unique()))
df.head(3)

✅ Loaded 1020 rows from qwen25_1.5b_gpqa_outputs_by_threshold.csv
Thresholds found: [np.float64(0.25), np.float64(0.5), np.float64(0.75), np.float64(0.9)]


Unnamed: 0,id,threshold,question,choices,answer,predicted_answer,confidence
0,203,0.25,Identify the correct sequence of reagents for ...,"['1. NaH; CH3CH2Br 2. H2SO4, HNO3 3. Fe-HCl 4....",D,IDK,0.666667
1,266,0.25,There is a C-NOT gate where the condition is t...,['U_{C-NOT}\\left|\\psi\\right\\rangle =\\alph...,A,B,1.0
2,152,0.25,Two stars are being studied. It has been obser...,"['ln(2) = [ (T_1 - T_2) / (T1*T2)]', 'ln(2) = ...",A,IDK,1.0


In [6]:
# ===== Cell 4 — Split into per-threshold DataFrames =====
dfs_by_t = {t: df[df["threshold"] == t].copy() for t in THRESHOLDS}

for t in THRESHOLDS:
    print(f"t={t}: {len(dfs_by_t[t])} rows")


t=0.25: 255 rows
t=0.5: 255 rows
t=0.75: 255 rows
t=0.9: 255 rows


In [7]:
# ===== Cell 5 — Compute metrics per threshold =====
metrics_rows = []

for t in THRESHOLDS:
    df_t = dfs_by_t[t]

    metrics_rows.append({
        "threshold": t,
        "accuracy_at_t": accuracy_at_threshold(df_t, t),
        "coverage": coverage(df_t, t),
        "penalty_mean": penalty_adjusted_mean(df_t, t),
        "overconf_rate": overconfidence_rate(df_t, t),
        "answered_n": int((df_t["confidence"] > t).sum()),
        "total_n": len(df_t)
    })

metrics_df = pd.DataFrame(metrics_rows).sort_values("threshold").reset_index(drop=True)
metrics_df


Unnamed: 0,threshold,accuracy_at_t,coverage,penalty_mean,overconf_rate,answered_n,total_n
0,0.25,0.057377,0.956863,-0.228758,0.901961,244,255
1,0.5,0.04386,0.894118,-0.784314,0.854902,228,255
2,0.75,0.046296,0.847059,-2.358824,0.807843,216,255
3,0.9,0.045685,0.772549,-6.6,0.737255,197,255


In [8]:
# ===== Cell 6 — 4×4 evaluation table =====
eval_table = (
    metrics_df[["threshold", "accuracy_at_t", "coverage", "penalty_mean", "overconf_rate"]]
    .set_index("threshold")
    .sort_index()
)

# Display
print("4×4 Evaluation Table:")
display(eval_table)

# Save both detailed and compact tables
metrics_df.to_csv(OUT_DIR / "metrics_by_threshold.csv", index=False)
eval_table.to_csv(OUT_DIR / "evaluation_table_4x4.csv")

print(f"Saved results in: {OUT_DIR}")


4×4 Evaluation Table:


Unnamed: 0_level_0,accuracy_at_t,coverage,penalty_mean,overconf_rate
threshold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.25,0.057377,0.956863,-0.228758,0.901961
0.5,0.04386,0.894118,-0.784314,0.854902
0.75,0.046296,0.847059,-2.358824,0.807843
0.9,0.045685,0.772549,-6.6,0.737255


Saved results in: gpqa-dataset/qwen/evaluations


# Comparison against baselines

In [9]:
coverage_floor = 0.3

eligible = eval_table[eval_table["coverage"] >= coverage_floor]
print("DEBUG — eligible rows:")
display(eligible)

best_row = eligible.loc[eligible["accuracy_at_t"].idxmax()]
t_star = best_row.name

print(f"Selected t* = {t_star}")
display(best_row)

DEBUG — eligible rows:


Unnamed: 0_level_0,accuracy_at_t,coverage,penalty_mean,overconf_rate
threshold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.25,0.057377,0.956863,-0.228758,0.901961
0.5,0.04386,0.894118,-0.784314,0.854902
0.75,0.046296,0.847059,-2.358824,0.807843
0.9,0.045685,0.772549,-6.6,0.737255


Selected t* = 0.25


accuracy_at_t    0.057377
coverage         0.956863
penalty_mean    -0.228758
overconf_rate    0.901961
Name: 0.25, dtype: float64

In [10]:
# ===== Cell 2 – Compute binary-grading and always-abstain baselines =====
import numpy as np

# Load the original prediction CSV (same used to compute metrics_df)
df = pd.read_csv("gpqa-dataset/qwen/data/qwen25_1.5b_gpqa_outputs_by_threshold.csv")

# Compute overall accuracy
binary_acc = np.mean(df["predicted_answer"] == df["answer"])

# Wrong rate (for overconfidence)
wrong_rate = 1 - binary_acc

# Binary baseline metrics (answers everything)
binary_row = {
    "accuracy_at_t": binary_acc,
    "coverage": 1.0,
    "penalty_mean": np.nan,  # you can fill with projected penalty if desired
    "overconf_rate": wrong_rate,
}

# Always-abstain baseline metrics
abstain_row = {
    "accuracy_at_t": 0.0,
    "coverage": 0.0,
    "penalty_mean": 0.0,
    "overconf_rate": 0.0,
}

print("Binary-grading baseline:")
display(pd.DataFrame([binary_row], index=["Binary"]))
print("Always-abstain baseline:")
display(pd.DataFrame([abstain_row], index=["Abstain"]))


Binary-grading baseline:


Unnamed: 0,accuracy_at_t,coverage,penalty_mean,overconf_rate
Binary,0.051961,1.0,,0.948039


Always-abstain baseline:


Unnamed: 0,accuracy_at_t,coverage,penalty_mean,overconf_rate
Abstain,0.0,0.0,0.0,0.0


In [11]:
# ===== Cell 3 – Build final 3x4 headline table =====

headline_df = pd.DataFrame([
    best_row[["accuracy_at_t", "coverage", "penalty_mean", "overconf_rate"]],
    pd.Series(binary_row),
    pd.Series(abstain_row)
], index=[f"Confidence-aware (t*={t_star})", "Binary grading", "Always abstain"])

print("=== 3x4 Headline Evaluation Table ===")
display(headline_df)

# Save to disk
headline_path = f"gpqa-dataset/qwen/evaluations/baseline_comparison.csv"
headline_df.to_csv(headline_path)
print(f"Saved headline table → {headline_path}")


=== 3x4 Headline Evaluation Table ===


Unnamed: 0,accuracy_at_t,coverage,penalty_mean,overconf_rate
Confidence-aware (t*=0.25),0.057377,0.956863,-0.228758,0.901961
Binary grading,0.051961,1.0,,0.948039
Always abstain,0.0,0.0,0.0,0.0


Saved headline table → gpqa-dataset/qwen/evaluations/baseline_comparison.csv


# GPQA - GPT 4-o mini Evalution 


In [12]:
#MMLU - Gpt4 Evalution 
BASE_PATH = Path("gpqa-dataset/gpt4")  # <-- change this for each run
OUT_DIR = BASE_PATH / "evaluations"
OUT_DIR.mkdir(parents=True, exist_ok=True)

In [13]:
import os
print(os.getcwd())


/Users/stutisinghal/Documents/fall25/NLP/nlp_proj/eval/llm-eval


In [14]:
# ===== Cell 3 — Load model–dataset results =====

csv_path = BASE_PATH / "data/gpt4_gpqa_outputs_by_threshold.csv"

df = pd.read_csv(csv_path)


In [15]:

# Normalize strings & types
df["answer"] = df["answer"].astype(str).str.strip().str.upper()
df["predicted_answer"] = df["predicted_answer"].astype(str).str.strip().str.upper()
df["confidence"] = pd.to_numeric(df["confidence"], errors="coerce").fillna(0.0)
df["threshold"] = pd.to_numeric(df["threshold"], errors="coerce")

print(f"✅ Loaded {len(df)} rows from {csv_path.name}")
print("Thresholds found:", sorted(df["threshold"].unique()))
df.head(3)

✅ Loaded 1020 rows from gpt4_gpqa_outputs_by_threshold.csv
Thresholds found: [np.float64(0.25), np.float64(0.5), np.float64(0.75), np.float64(0.9)]


Unnamed: 0,id,threshold,question,choices,answer,predicted_answer,confidence
0,203,0.25,Identify the correct sequence of reagents for ...,"['1. NaH; CH3CH2Br 2. H2SO4, HNO3 3. Fe-HCl 4....",D,B,0.666667
1,266,0.25,There is a C-NOT gate where the condition is t...,['U_{C-NOT}\\left|\\psi\\right\\rangle =\\alph...,A,D,0.666667
2,152,0.25,Two stars are being studied. It has been obser...,"['ln(2) = [ (T_1 - T_2) / (T1*T2)]', 'ln(2) = ...",A,A,1.0


In [16]:
# ===== Cell 4 — Split into per-threshold DataFrames =====
dfs_by_t = {t: df[df["threshold"] == t].copy() for t in THRESHOLDS}

for t in THRESHOLDS:
    print(f"t={t}: {len(dfs_by_t[t])} rows")


t=0.25: 255 rows
t=0.5: 255 rows
t=0.75: 255 rows
t=0.9: 255 rows


In [17]:
# ===== Cell 5 — Compute metrics per threshold =====
metrics_rows = []

for t in THRESHOLDS:
    df_t = dfs_by_t[t]

    metrics_rows.append({
        "threshold": t,
        "accuracy_at_t": accuracy_at_threshold(df_t, t),
        "coverage": coverage(df_t, t),
        "penalty_mean": penalty_adjusted_mean(df_t, t),
        "overconf_rate": overconfidence_rate(df_t, t),
        "answered_n": int((df_t["confidence"] > t).sum()),
        "total_n": len(df_t)
    })

metrics_df = pd.DataFrame(metrics_rows).sort_values("threshold").reset_index(drop=True)
metrics_df


Unnamed: 0,threshold,accuracy_at_t,coverage,penalty_mean,overconf_rate,answered_n,total_n
0,0.25,0.334646,0.996078,0.147756,0.662745,254,255
1,0.5,0.348624,0.854902,-0.2,0.556863,218,255
2,0.75,0.338798,0.717647,-1.111765,0.47451,183,255
3,0.9,0.40146,0.537255,-2.678431,0.321569,137,255


In [18]:
# ===== Cell 6 — 4×4 evaluation table =====
eval_table = (
    metrics_df[["threshold", "accuracy_at_t", "coverage", "penalty_mean", "overconf_rate"]]
    .set_index("threshold")
    .sort_index()
)

# Display
print("4×4 Evaluation Table:")
display(eval_table)

# Save both detailed and compact tables
metrics_df.to_csv(OUT_DIR / "metrics_by_threshold.csv", index=False)
eval_table.to_csv(OUT_DIR / "evaluation_table_4x4.csv")

print(f"Saved results in: {OUT_DIR}")


4×4 Evaluation Table:


Unnamed: 0_level_0,accuracy_at_t,coverage,penalty_mean,overconf_rate
threshold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.25,0.334646,0.996078,0.147756,0.662745
0.5,0.348624,0.854902,-0.2,0.556863
0.75,0.338798,0.717647,-1.111765,0.47451
0.9,0.40146,0.537255,-2.678431,0.321569


Saved results in: gpqa-dataset/gpt4/evaluations


# GPQA-GPT 4 Baseline Comparison

In [19]:
coverage_floor = 0.3

eligible = eval_table[eval_table["coverage"] >= coverage_floor]
print("DEBUG — eligible rows:")
display(eligible)

best_row = eligible.loc[eligible["accuracy_at_t"].idxmax()]
t_star = best_row.name

print(f"Selected t* = {t_star}")
display(best_row)

DEBUG — eligible rows:


Unnamed: 0_level_0,accuracy_at_t,coverage,penalty_mean,overconf_rate
threshold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.25,0.334646,0.996078,0.147756,0.662745
0.5,0.348624,0.854902,-0.2,0.556863
0.75,0.338798,0.717647,-1.111765,0.47451
0.9,0.40146,0.537255,-2.678431,0.321569


Selected t* = 0.9


accuracy_at_t    0.401460
coverage         0.537255
penalty_mean    -2.678431
overconf_rate    0.321569
Name: 0.9, dtype: float64

In [20]:
# ===== Cell 2 – Compute binary-grading and always-abstain baselines =====
import numpy as np

# Load the original prediction CSV (same used to compute metrics_df)
df = pd.read_csv("gpqa-dataset/qwen/data/qwen25_1.5b_gpqa_outputs_by_threshold.csv")

# Compute overall accuracy
binary_acc = np.mean(df["predicted_answer"] == df["answer"])

# Wrong rate (for overconfidence)
wrong_rate = 1 - binary_acc

# Binary baseline metrics (answers everything)
binary_row = {
    "accuracy_at_t": binary_acc,
    "coverage": 1.0,
    "penalty_mean": np.nan,  # you can fill with projected penalty if desired
    "overconf_rate": wrong_rate,
}

# Always-abstain baseline metrics
abstain_row = {
    "accuracy_at_t": 0.0,
    "coverage": 0.0,
    "penalty_mean": 0.0,
    "overconf_rate": 0.0,
}

print("Binary-grading baseline:")
display(pd.DataFrame([binary_row], index=["Binary"]))
print("Always-abstain baseline:")
display(pd.DataFrame([abstain_row], index=["Abstain"]))


Binary-grading baseline:


Unnamed: 0,accuracy_at_t,coverage,penalty_mean,overconf_rate
Binary,0.051961,1.0,,0.948039


Always-abstain baseline:


Unnamed: 0,accuracy_at_t,coverage,penalty_mean,overconf_rate
Abstain,0.0,0.0,0.0,0.0


In [21]:
# ===== Cell 3 – Build final 3x4 headline table =====

headline_df = pd.DataFrame([
    best_row[["accuracy_at_t", "coverage", "penalty_mean", "overconf_rate"]],
    pd.Series(binary_row),
    pd.Series(abstain_row)
], index=[f"Confidence-aware (t*={t_star})", "Binary grading", "Always abstain"])

print("=== 3x4 Headline Evaluation Table ===")
display(headline_df)

# Save to disk
headline_path = f"gpqa-dataset/qwen/evaluations/baseline_comparison.csv"
headline_df.to_csv(headline_path)
print(f"Saved headline table → {headline_path}")


=== 3x4 Headline Evaluation Table ===


Unnamed: 0,accuracy_at_t,coverage,penalty_mean,overconf_rate
Confidence-aware (t*=0.9),0.40146,0.537255,-2.678431,0.321569
Binary grading,0.051961,1.0,,0.948039
Always abstain,0.0,0.0,0.0,0.0


Saved headline table → gpqa-dataset/qwen/evaluations/baseline_comparison.csv


# MMLU - GPT 4 o mini Evaluation

In [22]:
#MMLU - Gpt4 Evalution 
BASE_PATH = Path("mmlu-dataset/gpt4")  # <-- change this for each run
OUT_DIR = BASE_PATH / "evaluations"
OUT_DIR.mkdir(parents=True, exist_ok=True)

In [23]:
import os
print(os.getcwd())


/Users/stutisinghal/Documents/fall25/NLP/nlp_proj/eval/llm-eval


In [24]:
# ===== Cell 3 — Load model–dataset results =====

csv_path = BASE_PATH / "data/gpt4_api_mmlu_outputs_by_threshold.csv"
df = pd.read_csv(csv_path)


In [25]:

# Normalize strings & types
df["answer"] = df["answer"].astype(str).str.strip().str.upper()
df["predicted_answer"] = df["predicted_answer"].astype(str).str.strip().str.upper()
df["confidence"] = pd.to_numeric(df["confidence"], errors="coerce").fillna(0.0)
df["threshold"] = pd.to_numeric(df["threshold"], errors="coerce")

print(f"✅ Loaded {len(df)} rows from {csv_path.name}")
print("Thresholds found:", sorted(df["threshold"].unique()))
df.head(3)

✅ Loaded 3400 rows from gpt4_api_mmlu_outputs_by_threshold.csv
Thresholds found: [np.float64(0.25), np.float64(0.5), np.float64(0.75), np.float64(0.9)]


Unnamed: 0,id,threshold,question,choices,answer,predicted_answer,confidence
0,596,0.25,A manufacturer is currently selling 2000 units...,['$2.50' '$1.90' '$2.70' '$2.60' '$1.80' '$2.2...,F,C,0.833333
1,814,0.25,Fred Lowes is a typewriter salesman. He receiv...,['$210' '$200' '$225' '$175' '$195' '$150' '$2...,I,B,0.5
2,817,0.25,"Mary Redmond purchased a $28,500 home with 20%...",['$305' '$190' '$171' '$285.50' '$399' '$323' ...,F,A,0.666667


In [26]:
# ===== Cell 4 — Split into per-threshold DataFrames =====
dfs_by_t = {t: df[df["threshold"] == t].copy() for t in THRESHOLDS}

for t in THRESHOLDS:
    print(f"t={t}: {len(dfs_by_t[t])} rows")


t=0.25: 850 rows
t=0.5: 850 rows
t=0.75: 850 rows
t=0.9: 850 rows


In [27]:
import numpy as np

# Number of rows you want per threshold
N = 250

# Set seed so results are reproducible
np.random.seed(42)

# Subsample each threshold's dataframe to 250 rows
dfs_by_t_small = {}

for t in THRESHOLDS:
    df_t = dfs_by_t[t]
    
    if len(df_t) > N:
        df_sampled = df_t.sample(n=N, random_state=42)
    else:
        df_sampled = df_t.copy()
    
    dfs_by_t_small[t] = df_sampled

# Print sizes to confirm
for t in THRESHOLDS:
    print(f"t={t}: {len(dfs_by_t_small[t])} rows")


t=0.25: 250 rows
t=0.5: 250 rows
t=0.75: 250 rows
t=0.9: 250 rows


In [28]:
# ===== Cell 5 — Compute metrics per threshold =====
metrics_rows = []

for t in THRESHOLDS:
    df_t = dfs_by_t[t]

    metrics_rows.append({
        "threshold": t,
        "accuracy_at_t": accuracy_at_threshold(df_t, t),
        "coverage": coverage(df_t, t),
        "penalty_mean": penalty_adjusted_mean(df_t, t),
        "overconf_rate": overconfidence_rate(df_t, t),
        "answered_n": int((df_t["confidence"] > t).sum()),
        "total_n": len(df_t)
    })

metrics_df = pd.DataFrame(metrics_rows).sort_values("threshold").reset_index(drop=True)
metrics_df


Unnamed: 0,threshold,accuracy_at_t,coverage,penalty_mean,overconf_rate,answered_n,total_n
0,0.25,0.172544,0.934118,-0.043046,0.772941,794,850
1,0.5,0.189139,0.628235,-0.332843,0.509412,534,850
2,0.75,0.0,0.0,0.0,0.0,0,850
3,0.9,0.0,0.0,0.0,0.0,0,850


In [29]:
# ===== Cell 6 — 4×4 evaluation table =====
eval_table = (
    metrics_df[["threshold", "accuracy_at_t", "coverage", "penalty_mean", "overconf_rate"]]
    .set_index("threshold")
    .sort_index()
)

# Display
print("4×4 Evaluation Table:")
display(eval_table)

# Save both detailed and compact tables
metrics_df.to_csv(OUT_DIR / "metrics_by_threshold.csv", index=False)
eval_table.to_csv(OUT_DIR / "evaluation_table_4x4.csv")

print(f"Saved results in: {OUT_DIR}")


4×4 Evaluation Table:


Unnamed: 0_level_0,accuracy_at_t,coverage,penalty_mean,overconf_rate
threshold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.25,0.172544,0.934118,-0.043046,0.772941
0.5,0.189139,0.628235,-0.332843,0.509412
0.75,0.0,0.0,0.0,0.0
0.9,0.0,0.0,0.0,0.0


Saved results in: mmlu-dataset/gpt4/evaluations


# Comparison against baselines

In [30]:
coverage_floor = 0.3

eligible = eval_table[eval_table["coverage"] >= coverage_floor]
print("DEBUG — eligible rows:")
display(eligible)

best_row = eligible.loc[eligible["accuracy_at_t"].idxmax()]
t_star = best_row.name

print(f"Selected t* = {t_star}")
display(best_row)

DEBUG — eligible rows:


Unnamed: 0_level_0,accuracy_at_t,coverage,penalty_mean,overconf_rate
threshold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.25,0.172544,0.934118,-0.043046,0.772941
0.5,0.189139,0.628235,-0.332843,0.509412


Selected t* = 0.5


accuracy_at_t    0.189139
coverage         0.628235
penalty_mean    -0.332843
overconf_rate    0.509412
Name: 0.5, dtype: float64

In [31]:
# ===== Cell 2 – Compute binary-grading and always-abstain baselines =====
import numpy as np

# Load the original prediction CSV (same used to compute metrics_df)
df = pd.read_csv("gpqa-dataset/qwen/data/qwen25_1.5b_gpqa_outputs_by_threshold.csv")

# Compute overall accuracy
binary_acc = np.mean(df["predicted_answer"] == df["answer"])

# Wrong rate (for overconfidence)
wrong_rate = 1 - binary_acc

# Binary baseline metrics (answers everything)
binary_row = {
    "accuracy_at_t": binary_acc,
    "coverage": 1.0,
    "penalty_mean": np.nan,  # you can fill with projected penalty if desired
    "overconf_rate": wrong_rate,
}

# Always-abstain baseline metrics
abstain_row = {
    "accuracy_at_t": 0.0,
    "coverage": 0.0,
    "penalty_mean": 0.0,
    "overconf_rate": 0.0,
}

print("Binary-grading baseline:")
display(pd.DataFrame([binary_row], index=["Binary"]))
print("Always-abstain baseline:")
display(pd.DataFrame([abstain_row], index=["Abstain"]))


Binary-grading baseline:


Unnamed: 0,accuracy_at_t,coverage,penalty_mean,overconf_rate
Binary,0.051961,1.0,,0.948039


Always-abstain baseline:


Unnamed: 0,accuracy_at_t,coverage,penalty_mean,overconf_rate
Abstain,0.0,0.0,0.0,0.0


In [32]:
# ===== Cell 3 – Build final 3x4 headline table =====

headline_df = pd.DataFrame([
    best_row[["accuracy_at_t", "coverage", "penalty_mean", "overconf_rate"]],
    pd.Series(binary_row),
    pd.Series(abstain_row)
], index=[f"Confidence-aware (t*={t_star})", "Binary grading", "Always abstain"])

print("=== 3x4 Headline Evaluation Table ===")
display(headline_df)

# Save to disk
headline_path = f"gpqa-dataset/qwen/evaluations/baseline_comparison.csv"
headline_df.to_csv(headline_path)
print(f"Saved headline table → {headline_path}")


=== 3x4 Headline Evaluation Table ===


Unnamed: 0,accuracy_at_t,coverage,penalty_mean,overconf_rate
Confidence-aware (t*=0.5),0.189139,0.628235,-0.332843,0.509412
Binary grading,0.051961,1.0,,0.948039
Always abstain,0.0,0.0,0.0,0.0


Saved headline table → gpqa-dataset/qwen/evaluations/baseline_comparison.csv


# MMLU -Qwen Evaluation

In [33]:
#MMLU - Gpt4 Evalution 
BASE_PATH = Path("mmlu-dataset/qwen")  # <-- change this for each run
OUT_DIR = BASE_PATH / "evaluations"
OUT_DIR.mkdir(parents=True, exist_ok=True)

In [34]:
# ===== Cell 3 — Load model–dataset results =====

csv_path = BASE_PATH / "data/qwen25_1.5b_mmlu_outputs_by_threshold.csv"
df = pd.read_csv(csv_path)


In [35]:

# Normalize strings & types
df["answer"] = df["answer"].astype(str).str.strip().str.upper()
df["predicted_answer"] = df["predicted_answer"].astype(str).str.strip().str.upper()
df["confidence"] = pd.to_numeric(df["confidence"], errors="coerce").fillna(0.0)
df["threshold"] = pd.to_numeric(df["threshold"], errors="coerce")

print(f"✅ Loaded {len(df)} rows from {csv_path.name}")
print("Thresholds found:", sorted(df["threshold"].unique()))
df.head(3)

✅ Loaded 3400 rows from qwen25_1.5b_mmlu_outputs_by_threshold.csv
Thresholds found: [np.float64(0.25), np.float64(0.5), np.float64(0.75), np.float64(0.9)]


Unnamed: 0,id,threshold,question,choices,answer,predicted_answer,confidence
0,596,0.25,A manufacturer is currently selling 2000 units...,['$2.50' '$1.90' '$2.70' '$2.60' '$1.80' '$2.2...,F,IDK,0.5
1,814,0.25,Fred Lowes is a typewriter salesman. He receiv...,['$210' '$200' '$225' '$175' '$195' '$150' '$2...,I,B,0.833333
2,817,0.25,"Mary Redmond purchased a $28,500 home with 20%...",['$305' '$190' '$171' '$285.50' '$399' '$323' ...,F,IDK,1.0


In [36]:
# ===== Cell 4 — Split into per-threshold DataFrames =====
dfs_by_t = {t: df[df["threshold"] == t].copy() for t in THRESHOLDS}

for t in THRESHOLDS:
    print(f"t={t}: {len(dfs_by_t[t])} rows")


t=0.25: 850 rows
t=0.5: 850 rows
t=0.75: 850 rows
t=0.9: 850 rows


In [37]:
import numpy as np

# Number of rows you want per threshold
N = 250

# Set seed so results are reproducible
np.random.seed(42)

# Subsample each threshold's dataframe to 250 rows
dfs_by_t_small = {}

for t in THRESHOLDS:
    df_t = dfs_by_t[t]
    
    if len(df_t) > N:
        df_sampled = df_t.sample(n=N, random_state=42)
    else:
        df_sampled = df_t.copy()
    
    dfs_by_t_small[t] = df_sampled

# Print sizes to confirm
for t in THRESHOLDS:
    print(f"t={t}: {len(dfs_by_t_small[t])} rows")


t=0.25: 250 rows
t=0.5: 250 rows
t=0.75: 250 rows
t=0.9: 250 rows


In [38]:
# ===== Cell 5 — Compute metrics per threshold =====
metrics_rows = []

for t in THRESHOLDS:
    df_t = dfs_by_t[t]

    metrics_rows.append({
        "threshold": t,
        "accuracy_at_t": accuracy_at_threshold(df_t, t),
        "coverage": coverage(df_t, t),
        "penalty_mean": penalty_adjusted_mean(df_t, t),
        "overconf_rate": overconfidence_rate(df_t, t),
        "answered_n": int((df_t["confidence"] > t).sum()),
        "total_n": len(df_t)
    })

metrics_df = pd.DataFrame(metrics_rows).sort_values("threshold").reset_index(drop=True)
metrics_df


Unnamed: 0,threshold,accuracy_at_t,coverage,penalty_mean,overconf_rate,answered_n,total_n
0,0.25,0.07971,0.974118,-0.17019,0.896471,828,850
1,0.5,0.094183,0.849412,-0.601471,0.769412,722,850
2,0.75,0.081481,0.635294,-1.636941,0.583529,540,850
3,0.9,0.081013,0.464706,-3.805882,0.427059,395,850


In [39]:
# ===== Cell 6 — 4×4 evaluation table =====
eval_table = (
    metrics_df[["threshold", "accuracy_at_t", "coverage", "penalty_mean", "overconf_rate"]]
    .set_index("threshold")
    .sort_index()
)

# Display
print("4×4 Evaluation Table:")
display(eval_table)

# Save both detailed and compact tables
metrics_df.to_csv(OUT_DIR / "metrics_by_threshold.csv", index=False)
eval_table.to_csv(OUT_DIR / "evaluation_table_4x4.csv")

print(f"Saved results in: {OUT_DIR}")


4×4 Evaluation Table:


Unnamed: 0_level_0,accuracy_at_t,coverage,penalty_mean,overconf_rate
threshold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.25,0.07971,0.974118,-0.17019,0.896471
0.5,0.094183,0.849412,-0.601471,0.769412
0.75,0.081481,0.635294,-1.636941,0.583529
0.9,0.081013,0.464706,-3.805882,0.427059


Saved results in: mmlu-dataset/qwen/evaluations


# Comparison aganst baselines

In [40]:
coverage_floor = 0.3

eligible = eval_table[eval_table["coverage"] >= coverage_floor]
print("DEBUG — eligible rows:")
display(eligible)

best_row = eligible.loc[eligible["accuracy_at_t"].idxmax()]
t_star = best_row.name

print(f"Selected t* = {t_star}")
display(best_row)

DEBUG — eligible rows:


Unnamed: 0_level_0,accuracy_at_t,coverage,penalty_mean,overconf_rate
threshold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.25,0.07971,0.974118,-0.17019,0.896471
0.5,0.094183,0.849412,-0.601471,0.769412
0.75,0.081481,0.635294,-1.636941,0.583529
0.9,0.081013,0.464706,-3.805882,0.427059


Selected t* = 0.5


accuracy_at_t    0.094183
coverage         0.849412
penalty_mean    -0.601471
overconf_rate    0.769412
Name: 0.5, dtype: float64

In [41]:
# ===== Cell 2 – Compute binary-grading and always-abstain baselines =====
import numpy as np

# Load the original prediction CSV (same used to compute metrics_df)
df = pd.read_csv("gpqa-dataset/qwen/data/qwen25_1.5b_gpqa_outputs_by_threshold.csv")

# Compute overall accuracy
binary_acc = np.mean(df["predicted_answer"] == df["answer"])

# Wrong rate (for overconfidence)
wrong_rate = 1 - binary_acc

# Binary baseline metrics (answers everything)
binary_row = {
    "accuracy_at_t": binary_acc,
    "coverage": 1.0,
    "penalty_mean": np.nan,  # you can fill with projected penalty if desired
    "overconf_rate": wrong_rate,
}

# Always-abstain baseline metrics
abstain_row = {
    "accuracy_at_t": 0.0,
    "coverage": 0.0,
    "penalty_mean": 0.0,
    "overconf_rate": 0.0,
}

print("Binary-grading baseline:")
display(pd.DataFrame([binary_row], index=["Binary"]))
print("Always-abstain baseline:")
display(pd.DataFrame([abstain_row], index=["Abstain"]))


Binary-grading baseline:


Unnamed: 0,accuracy_at_t,coverage,penalty_mean,overconf_rate
Binary,0.051961,1.0,,0.948039


Always-abstain baseline:


Unnamed: 0,accuracy_at_t,coverage,penalty_mean,overconf_rate
Abstain,0.0,0.0,0.0,0.0


In [42]:
# ===== Cell 3 – Build final 3x4 headline table =====

headline_df = pd.DataFrame([
    best_row[["accuracy_at_t", "coverage", "penalty_mean", "overconf_rate"]],
    pd.Series(binary_row),
    pd.Series(abstain_row)
], index=[f"Confidence-aware (t*={t_star})", "Binary grading", "Always abstain"])

print("=== 3x4 Headline Evaluation Table ===")
display(headline_df)

# Save to disk
headline_path = f"gpqa-dataset/qwen/evaluations/baseline_comparison.csv"
headline_df.to_csv(headline_path)
print(f"Saved headline table → {headline_path}")


=== 3x4 Headline Evaluation Table ===


Unnamed: 0,accuracy_at_t,coverage,penalty_mean,overconf_rate
Confidence-aware (t*=0.5),0.094183,0.849412,-0.601471,0.769412
Binary grading,0.051961,1.0,,0.948039
Always abstain,0.0,0.0,0.0,0.0


Saved headline table → gpqa-dataset/qwen/evaluations/baseline_comparison.csv
