# ScienceQA vPGM evaluation (129 questions)
Evaluate vPGM output quality on 129 ScienceQA questions, report accuracy/calibration, and prepare comparisons against alternative prompting strategies.

> To run locally, activate your environment first: `source C:/Users/shafi/anaconda3/Scripts/activate PGM`


In [1]:
%matplotlib inline
import json
from pathlib import Path
from typing import Dict, Any, List, Optional, Tuple

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datasets import load_dataset

sns.set_theme(style="whitegrid")


  from .autonotebook import tqdm as notebook_tqdm


## Load vPGM results and ScienceQA ground truth
The notebook expects a JSONL file named `scienceqa_vpgm_results_129.jsonl` in the working directory (one JSON object per line; blank lines are ignored). If that file is missing, it falls back to `scienceqa_vpgm_results.jsonl`; if neither file exists, the cell raises `FileNotFoundError`.


In [2]:
# Locate the vPGM results file: prefer the 129-question export, otherwise
# use the generic file name, and fail loudly when neither is present.
results_path = Path("scienceqa_vpgm_results_129.jsonl")
if not results_path.exists():
    alt_path = Path("scienceqa_vpgm_results.jsonl")
    if not alt_path.exists():
        raise FileNotFoundError(f"Could not find {results_path}. Place the vPGM JSONL file in the working directory.")
    print(f"Primary file {results_path} not found; falling back to {alt_path}.")
    results_path = alt_path

# Parse one JSON object per line, ignoring lines that are blank after stripping.
with results_path.open("r", encoding="utf-8") as f:
    stripped_lines = (raw.strip() for raw in f)
    instances: List[Dict[str, Any]] = [json.loads(text) for text in stripped_lines if text]

print(f"Loaded {len(instances)} vPGM instances from {results_path}")


Primary file scienceqa_vpgm_results_129.jsonl not found; falling back to scienceqa_vpgm_results.jsonl.
Loaded 129 vPGM instances from scienceqa_vpgm_results.jsonl


In [3]:
# Identifier handed to `datasets.load_dataset`.
dataset_name = "derek-thomas/ScienceQA"
# No split argument is given; the result is consumed split-by-split through
# `.items()` further down in this cell.
raw_dataset = load_dataset(dataset_name)

def extract_question_id(example: Dict[str, Any]) -> Optional[str]:
    """Return the example's question id as a string, or ``None`` if it has none.

    The candidate field names are tried in a fixed priority order; the first
    one that is present with a non-``None`` value wins.
    """
    candidate_keys = ("scienceqa_id", "id", "qid", "question_id", "questionid", "question_id_str")
    present_values = (
        example[name]
        for name in candidate_keys
        if name in example and example[name] is not None
    )
    found = next(present_values, None)
    return None if found is None else str(found)

# Index every ScienceQA example by its question id so vPGM results can be
# joined to ground truth. When the same id occurs in more than one split,
# the example seen last replaces the earlier one.
id_to_example: Dict[str, Dict[str, Any]] = {}
skipped_without_id = 0
first_skipped_fields: Optional[List[str]] = None
for split_name, split_ds in raw_dataset.items():
    for ex in split_ds:
        qid = extract_question_id(ex)
        if qid is None:
            # Keep a tally (and one sample of the available fields) instead of
            # dropping the example silently: an empty index makes every later
            # cell report "no evaluable rows" with no hint as to why.
            skipped_without_id += 1
            if first_skipped_fields is None:
                first_skipped_fields = [str(field) for field in ex]
            continue
        id_to_example[qid] = ex

split_sizes = {name: len(ds) for name, ds in raw_dataset.items()}
print(f"Available splits: {split_sizes}")
print(f"Mapped {len(id_to_example)} unique ScienceQA questions across splits")
if skipped_without_id:
    print(
        f"Warning: {skipped_without_id} examples had no recognised id field and were skipped "
        f"(fields available on the first skipped example: {first_skipped_fields})."
    )
if not id_to_example:
    print(
        "Warning: no ScienceQA example could be indexed by id; "
        "vPGM results cannot be matched to ground truth until an id field is available."
    )


Available splits: {'train': 12726, 'validation': 4241, 'test': 4241}
Mapped 0 unique ScienceQA questions across splits


## Build evaluation table
Helpers below align vPGM outputs with ScienceQA choices, resolve predicted answers, and assemble a tidy DataFrame for analysis.


In [4]:
def normalize_text(value: Any) -> str:
    """Return ``str(value)`` with surrounding whitespace removed, lower-cased."""
    as_text = str(value)
    trimmed = as_text.strip()
    return trimmed.lower()

def option_label_to_index(label: Any, num_options: int) -> Optional[int]:
    """Translate an option label into a 0-based option index.

    Accepted labels (after ``str()`` and whitespace stripping):

    * a decimal number: used as a 0-based index when it is in range;
      a number exactly equal to ``num_options`` is tolerated as a 1-based
      label for the last option;
    * a single letter: ``a``/``A`` maps to 0, ``b``/``B`` to 1, and so on.

    Args:
        label: The label to interpret; ``None`` is allowed.
        num_options: Number of options the label may refer to.

    Returns:
        The 0-based index, or ``None`` when the label is ``None``, is not a
        recognised form, or falls outside ``range(num_options)``.
    """
    if label is None:
        return None
    text = str(label).strip()
    # Use isdecimal(), not isdigit(): isdigit() also accepts characters such
    # as superscript digits that int() cannot parse, which would raise
    # ValueError instead of yielding "no index".
    if text.isdecimal():
        idx = int(text)
        if 0 <= idx < num_options:
            return idx
        if 1 <= idx <= num_options:
            return idx - 1  # tolerate 1-based labels
    if len(text) == 1 and text.isalpha():
        idx = ord(text.lower()) - ord("a")
        if 0 <= idx < num_options:
            return idx
    return None

def find_choice_index_by_text(option_text: Any, choices: List[Any]) -> int:
    """Return the position of the first choice whose normalised text equals
    the normalised ``option_text``, or ``-1`` when no choice matches."""
    wanted = normalize_text(option_text)
    matching_positions = (
        position
        for position, candidate in enumerate(choices)
        if normalize_text(candidate) == wanted
    )
    return next(matching_positions, -1)

def resolve_prob_vector(option_probs: Dict[str, float], observed_options: List[Any], choices: List[Any]) -> np.ndarray:
    """Align per-option probabilities with the order of ``choices``.

    Each key of ``option_probs`` is first read as a label (index or letter)
    into ``observed_options`` and the referenced option text is looked up in
    ``choices``; if that fails, the key itself is looked up as choice text.
    Keys that cannot be placed are dropped. The result is renormalised to sum
    to 1 whenever its total is positive; otherwise it is returned as is.
    """
    n_observed = len(observed_options)
    n_choices = len(choices)
    aligned = np.zeros(n_choices, dtype=float)

    for label, prob in option_probs.items():
        target = None
        position = option_label_to_index(label, n_observed)
        if position is not None and position < n_observed:
            target = find_choice_index_by_text(observed_options[position], choices)
        if target is None or target < 0:
            # Label route failed; treat the key as the option text itself.
            target = find_choice_index_by_text(label, choices)
        if 0 <= target < n_choices:
            aligned[target] = float(prob)

    total = aligned.sum()
    if total > 0:
        aligned = aligned / total
    return aligned

def resolve_selected_answer(selected_answer: Any, observed_options: List[Any], choices: List[Any], prob_vector: Optional[np.ndarray] = None) -> Tuple[Any, Optional[int]]:
    """Map the model's selected answer onto an entry of ``choices``.

    Resolution order:

    1. ``selected_answer`` matches a choice by normalised text.
    2. ``selected_answer`` is a label (index or letter) into
       ``observed_options``, and the referenced option text matches a choice.
       This mirrors how ``resolve_prob_vector`` interprets labels, so the
       predicted answer and the probability vector agree even when the
       observed options are ordered differently from ``choices``.
    3. ``selected_answer`` is a label directly into ``choices``.
    4. The highest-probability entry of ``prob_vector``, but only when the
       vector carries positive mass.

    Args:
        selected_answer: Raw answer emitted by the model; ``None`` is treated
            as an empty string.
        observed_options: Options as recorded with the model output.
        choices: Ground-truth choice list.
        prob_vector: Optional probabilities aligned with ``choices``.

    Returns:
        ``(choice, index)`` on success, otherwise ``(selected_answer, None)``.
    """
    if selected_answer is None:
        selected_answer = ""

    direct_idx = find_choice_index_by_text(selected_answer, choices)
    if direct_idx >= 0:
        return choices[direct_idx], direct_idx

    # A text match against observed_options would use the same normalisation
    # as the direct match above and so could never succeed where that failed;
    # only the label interpretation is worth trying here.
    observed_idx = option_label_to_index(selected_answer, len(observed_options))
    if observed_idx is not None:
        matched_idx = find_choice_index_by_text(observed_options[observed_idx], choices)
        if matched_idx >= 0:
            return choices[matched_idx], matched_idx

    idx_from_label = option_label_to_index(selected_answer, len(choices))
    if idx_from_label is not None:
        return choices[idx_from_label], idx_from_label

    # argmax of an all-zero vector is index 0, which would fabricate a
    # prediction for the first choice; require some positive mass instead.
    if prob_vector is not None and prob_vector.size and np.any(prob_vector > 0):
        idx = int(np.argmax(prob_vector))
        if idx < len(choices):
            return choices[idx], idx

    return selected_answer, None

def compute_brier_score_row(prob_vector: np.ndarray, correct_idx: Optional[int]) -> float:
    """Return the per-question Brier score for one probability vector.

    The score is the mean, over options, of the squared difference between
    the predicted probability and the one-hot target for ``correct_idx``.

    Args:
        prob_vector: Predicted probabilities, one entry per option.
        correct_idx: 0-based index of the correct option.

    Returns:
        The score as a float, or NaN when ``correct_idx`` is ``None``, the
        vector is empty, or ``correct_idx`` does not address an entry of the
        vector.
    """
    if correct_idx is None or prob_vector.size == 0:
        return np.nan
    # An out-of-range index has no one-hot target; scoring against an
    # all-zero target would produce a plausible-looking but meaningless value.
    if not 0 <= correct_idx < prob_vector.size:
        return np.nan
    y = np.zeros_like(prob_vector)
    y[correct_idx] = 1.0
    return float(np.mean((prob_vector - y) ** 2))

def build_reliability_curve(df: pd.DataFrame, n_bins: int = 10) -> pd.DataFrame:
    """Summarise confidence versus accuracy in equal-width confidence bins.

    Args:
        df: Frame with a ``pred_confidence`` column (values expected in
            ``[0, 1]``) and an ``is_correct`` column (booleans, NaN allowed).
        n_bins: Number of equal-width bins spanning ``[0, 1]``.

    Returns:
        One row per *non-empty* bin, ordered by bin, with columns
        ``bin_count``, ``bin_confidence_mean``, ``bin_accuracy``,
        ``bin_left`` and ``bin_right``. Rows whose confidence is missing or
        outside ``[0, 1]`` are ignored. An empty frame with these columns is
        returned when nothing can be binned.
    """
    columns = ["bin_count", "bin_confidence_mean", "bin_accuracy", "bin_left", "bin_right"]
    bins = np.linspace(0.0, 1.0, n_bins + 1)

    tmp = df.dropna(subset=["pred_confidence"]).copy()
    tmp["conf_bin"] = pd.cut(tmp["pred_confidence"], bins=bins, include_lowest=True, labels=False)
    # pd.cut yields NaN for values outside the bin range; those rows cannot be
    # attributed to any bin.
    tmp = tmp.dropna(subset=["conf_bin"])
    if tmp.empty:
        return pd.DataFrame(columns=columns)

    tmp["conf_bin"] = tmp["conf_bin"].astype(int)
    # is_correct may be an object column mixing booleans and NaN; make it
    # numeric so the group-wise mean is well defined.
    tmp["is_correct"] = tmp["is_correct"].astype(float)

    curve = tmp.groupby("conf_bin").agg(
        bin_count=("pred_confidence", "size"),
        bin_confidence_mean=("pred_confidence", "mean"),
        bin_accuracy=("is_correct", "mean"),
    )

    # Only non-empty bins appear in the grouped result, so look the edges up by
    # bin number; assigning all n_bins edges would fail whenever a bin is empty.
    bin_numbers = curve.index.to_numpy()
    curve["bin_left"] = bins[bin_numbers]
    curve["bin_right"] = bins[bin_numbers + 1]
    return curve.reset_index(drop=True)


In [5]:
# Build one evaluation row per vPGM instance that can be joined to a ScienceQA
# example, then assemble the rows into `df` (all joined rows) and `eval_df`
# (rows that also have a ground-truth answer text).
rows: List[Dict[str, Any]] = []
# Ids (or None) of instances that had no entry in `id_to_example`.
missing_ground_truth: List[str] = []

for inst in instances:
    # The join key is read from the instance's "question_meta" mapping and
    # stringified, because `id_to_example` is keyed by strings.
    meta = inst.get("question_meta", {})
    scienceqa_id_raw = meta.get("scienceqa_id")
    scienceqa_id = str(scienceqa_id_raw) if scienceqa_id_raw is not None else None
    example = id_to_example.get(scienceqa_id)
    if example is None:
        # No matching ScienceQA example: record the id and skip the instance.
        missing_ground_truth.append(scienceqa_id)
        continue

    # Ground truth: "answer" is used as an index into "choices"; the answer
    # text stays None when the index is missing or out of range.
    choices = example.get("choices", [])
    gt_answer_index = example.get("answer")
    gt_answer_text = None
    if gt_answer_index is not None and 0 <= gt_answer_index < len(choices):
        gt_answer_text = choices[gt_answer_index]

    # Model output: align the option probabilities with `choices`; confidence
    # is the largest aligned probability (NaN when there are no choices).
    observed_options = inst.get("observed", {}).get("options", [])
    option_probs = inst.get("answer_posterior", {}).get("option_probabilities", {})
    prob_vector = resolve_prob_vector(option_probs, observed_options, choices)
    pred_confidence = float(np.nanmax(prob_vector)) if prob_vector.size else np.nan

    selected_answer = inst.get("answer_posterior", {}).get("selected_answer")
    pred_answer_text, pred_answer_index = resolve_selected_answer(
        selected_answer, observed_options, choices, prob_vector
    )

    # Correctness is left as NaN unless both the predicted and the
    # ground-truth indices are known.
    is_correct = np.nan
    if pred_answer_index is not None and gt_answer_index is not None:
        is_correct = pred_answer_index == gt_answer_index

    brier_score = compute_brier_score_row(prob_vector, gt_answer_index)

    rows.append(
        {
            "scienceqa_id": scienceqa_id,
            "subject": meta.get("subject"),
            "topic": meta.get("topic"),
            "category": meta.get("category"),
            "skill": meta.get("skill"),
            "grade": meta.get("grade"),
            "pred_selected_answer": pred_answer_text,
            # Probabilities keyed by the stringified choice text.
            "pred_option_probs": {str(choice): float(prob_vector[i]) for i, choice in enumerate(choices)},
            "pred_confidence": pred_confidence,
            "gt_answer_index": gt_answer_index,
            "gt_answer_text": gt_answer_text,
            "is_correct": is_correct,
            "brier_score": brier_score,
        }
    )

# Explicit column list so that an empty `rows` still yields a frame with the
# expected columns.
columns = [
    "scienceqa_id",
    "subject",
    "topic",
    "category",
    "skill",
    "grade",
    "pred_selected_answer",
    "pred_option_probs",
    "pred_confidence",
    "gt_answer_index",
    "gt_answer_text",
    "is_correct",
    "brier_score",
]
df = pd.DataFrame(rows, columns=columns)

# Evaluation subset: rows that have a ground-truth answer text.
eval_df = df.dropna(subset=["gt_answer_text"]).copy() if not df.empty else df.copy()

print(f"Constructed DataFrame with {len(df)} rows; {len(eval_df)} have ground truth available for evaluation.")
if missing_ground_truth:
    print(f"Warning: {len(missing_ground_truth)} instances missing ground truth mapping (first few: {missing_ground_truth[:5]})")
if df.empty:
    print("No rows were constructed. Check that scienceqa_id values in the results file match the ScienceQA dataset ids.")

# Last expression of the cell: show the first rows of the table.
df.head()



Constructed DataFrame with 0 rows; 0 have ground truth available for evaluation.
No rows were constructed. Check that scienceqa_id values in the results file match the ScienceQA dataset ids.


Unnamed: 0,scienceqa_id,subject,topic,category,skill,grade,pred_selected_answer,pred_option_probs,pred_confidence,gt_answer_index,gt_answer_text,is_correct,brier_score


## Metrics: accuracy, Brier score, and calibration
Compute overall accuracy, per-subject accuracy, Brier score, and reliability curve (calibration).


In [7]:
# Placeholder results, so that later cells can run even when there is nothing
# to evaluate.
overall_accuracy = mean_brier_score = avg_calibration_gap = float("nan")
subject_accuracy = pd.Series(dtype=float)
calibration_curve = pd.DataFrame(
    columns=["bin_count", "bin_confidence_mean", "bin_accuracy", "bin_left", "bin_right"]
)

if not eval_df.empty:
    # Headline metrics.
    overall_accuracy = eval_df["is_correct"].mean()
    per_subject = eval_df.groupby("subject")["is_correct"].mean()
    subject_accuracy = per_subject.sort_values(ascending=False)
    mean_brier_score = eval_df["brier_score"].mean()

    # Calibration: per-bin gap between mean confidence and accuracy, then the
    # unweighted mean of those gaps across bins.
    calibration_curve = build_reliability_curve(eval_df, n_bins=10)
    gap_by_bin = calibration_curve["bin_confidence_mean"] - calibration_curve["bin_accuracy"]
    calibration_curve["confidence_minus_accuracy"] = gap_by_bin
    avg_calibration_gap = calibration_curve["confidence_minus_accuracy"].mean()

    print(f"Overall accuracy: {overall_accuracy:.3f}")
    print(f"Mean Brier score: {mean_brier_score:.3f}")
    print(f"Average calibration gap (confidence - accuracy): {avg_calibration_gap:.3f}")
    print("Accuracy by subject:")
    display(subject_accuracy)
else:
    print("No evaluable rows found. Verify that the results file has ScienceQA ids matching the dataset and rerun.")

# Last expression of the cell: show the per-bin calibration table.
calibration_curve



No evaluable rows found. Verify that the results file has ScienceQA ids matching the dataset and rerun.


Unnamed: 0,bin_count,bin_confidence_mean,bin_accuracy,bin_left,bin_right


## Visualizations
The plots below show model confidence distribution, calibration (reliability diagram), accuracy by subject, and Brier score distribution.


In [8]:
# Histogram of the model's confidence (largest option probability) over the
# evaluated questions.
if eval_df.empty:
    print("No confidence data to plot.")
else:
    confidences = eval_df["pred_confidence"].dropna()
    plt.figure(figsize=(7, 4))
    sns.histplot(confidences, bins=10, color="steelblue", edgecolor="white")
    plt.xlabel("Predicted confidence (max probability)")
    plt.ylabel("Count of questions")
    # Report the number of questions actually plotted rather than a fixed
    # count: the fallback results file or unmatched ids can change it.
    plt.title(f"vPGM confidence distribution ({len(confidences)} questions)")
    plt.tight_layout()
    plt.show()



No confidence data to plot.


In [None]:
# Reliability diagram: per-bin empirical accuracy against mean confidence,
# with the diagonal as the perfectly calibrated reference.
if calibration_curve.empty:
    print("No calibration data to plot.")
else:
    # Number of questions that contributed to the curve; used in the title
    # instead of a fixed count so the label always matches the data.
    n_binned_questions = int(calibration_curve["bin_count"].sum())
    plt.figure(figsize=(6, 6))
    plt.plot(
        calibration_curve["bin_confidence_mean"],
        calibration_curve["bin_accuracy"],
        marker="o",
        linestyle="-",
        label="vPGM"
    )
    plt.plot([0, 1], [0, 1], linestyle="--", color="gray", label="Perfect calibration")
    plt.xlabel("Mean predicted confidence")
    plt.ylabel("Empirical accuracy")
    plt.title(f"Reliability diagram for vPGM answers ({n_binned_questions} questions)")
    plt.legend()
    plt.tight_layout()
    plt.show()



In [None]:
# Bar chart of accuracy per subject, in the order held by `subject_accuracy`.
if subject_accuracy.empty:
    print("No subject-level accuracy data to plot.")
else:
    plt.figure(figsize=(8, 4))
    subj_order = subject_accuracy.index
    # Tie the palette to an explicit hue: passing `palette` without `hue` is
    # deprecated in recent seaborn releases. `dodge=False` keeps one
    # full-width bar per subject.
    ax = sns.barplot(
        x=subj_order,
        y=subject_accuracy.values,
        hue=subj_order,
        palette="viridis",
        dodge=False,
    )
    # The hue duplicates the x axis, so a legend would add nothing.
    legend = ax.get_legend()
    if legend is not None:
        legend.remove()
    plt.ylabel("Accuracy")
    plt.xlabel("Subject")
    plt.title("Accuracy by subject")
    plt.xticks(rotation=45, ha="right")
    plt.ylim(0, 1)
    plt.tight_layout()
    plt.show()



In [None]:
# Histogram of the per-question Brier scores (missing scores are left out).
if not eval_df.empty:
    brier_values = eval_df["brier_score"].dropna()
    plt.figure(figsize=(7, 4))
    sns.histplot(brier_values, bins=10, color="darkorange", edgecolor="white")
    plt.xlabel("Per-question Brier score")
    plt.ylabel("Count of questions")
    plt.title("Distribution of Brier scores")
    plt.tight_layout()
    plt.show()
else:
    print("No Brier score data to plot.")



## Summary and comparisons
The cell below auto-generates a brief summary once the notebook has been run. Use it to interpret accuracy, calibration, and subject-wise differences. Add notes comparing against other prompting pipelines (Plain GPT-4 CoT, GPT-4 CoT + vPGM, structure-free prompting baselines, and standard VQA baselines) after you run those baselines.


In [None]:
from IPython.display import Markdown, display

# Render a short Markdown summary of the metrics computed above. The guard
# matches the other cells: with nothing to evaluate there is nothing to
# summarise.
if eval_df.empty:
    print("No evaluable rows found; nothing to summarize.")
else:
    top_subjects = subject_accuracy.head(3)
    summary_lines = [
        f"- Overall accuracy on evaluated set: **{overall_accuracy:.3f}**",
        f"- Mean Brier score: **{mean_brier_score:.3f}**",
        f"- Average calibration gap (confidence - accuracy): **{avg_calibration_gap:.3f}** (positive = overconfident)",
        f"- Top subjects by accuracy: {', '.join([f'{subj} ({acc:.2f})' for subj, acc in top_subjects.items()])}",
        "- Check the reliability diagram for signs of over/underconfidence; large positive gaps indicate overconfidence.",
        "- Add baseline results here (Plain GPT-4 CoT, GPT-4 CoT + vPGM, structure-free prompting, VQA baselines) to contextualize vPGM performance.",
    ]

    # Join with an escaped newline; a literal line break inside the quotes is
    # a syntax error.
    display(Markdown("\n".join(["### Quick takeaways"] + summary_lines)))
