In [1]:
# --- Insert this as the FIRST code cell in the notebook ---
import sys
from pathlib import Path
import os

# Print current working dir (helpful check)
print("Notebook cwd:", Path.cwd())

# Walk upwards until we find a folder that contains "src"
proj_root = Path.cwd()
while not (proj_root / "src").exists():
    if proj_root.parent == proj_root:
        raise RuntimeError("Could not find project root containing 'src' by walking up from cwd. "
                           "Either run the notebook from the repo root, or create src/__init__.py, or set PYTHONPATH.")
    proj_root = proj_root.parent

proj_root = proj_root.resolve()
sys.path.insert(0, str(proj_root))
print("Added project root to sys.path:", proj_root)
print("sys.path[0] ->", sys.path[0])

# Quick sanity checks
print("src exists at:", (proj_root / "src").exists())
print("Listing project root contents:", os.listdir(proj_root)[:50])


Notebook cwd: d:\MSc_AI\MSc_Project\NeuroSummarize\notebooks
Added project root to sys.path: D:\MSc_AI\MSc_Project\NeuroSummarize
sys.path[0] -> D:\MSc_AI\MSc_Project\NeuroSummarize
src exists at: True
Listing project root contents: ['.git', '.vscode', 'app.py', 'assets', 'config', 'data', 'nmslib', 'notebooks', 'outputs', 'README.md', 'reports', 'requirements.txt', 'run_pipeline.py', 'src', 'tools']



# Imports & Setup

In [32]:
from pathlib import Path
import json, pandas as pd
from src.evaluate_models import load_gold_dataset, evaluate_one_model_on_dataset, plot_grouped_metrics, plot_radar_normalized, plot_metric_distributions
from src.summarize import Summarizer
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import yaml

In [3]:
# 1) reload the module so the updated function is used
import importlib
import src.evaluate_models as eval_models
importlib.reload(eval_models)

# 2) (optional) import the function directly for convenience
from src.evaluate_models import evaluate_one_model_on_dataset

# 3) confirm the function source (sanity check)
import inspect
print(inspect.getsource(evaluate_one_model_on_dataset))

def evaluate_one_model_on_dataset(model_id: str, items: List[Dict], summarizer,
                                  max_samples: int = None, save_dir: Path = None) -> Dict:
    """
    Evaluate a summarizer on a list of items, returning aggregated metrics.

    Defensive / robustness improvements:
    - Initialize `row` early so it cannot raise UnboundLocalError.
    - Use safe lookups for etype score values.
    - Capture and continue on per-item summarization exceptions.
    - When saving CSV, include the union of all keys (so dynamic `entity_f_*` columns are preserved).
    """
    results_rows = []
    agg = {
        'model': model_id,
        'n': 0,
        'rouge_l_clin': [], 'bleu_clin': [],
        'rouge_l_lay': [], 'bleu_lay': [],
        'entity_p': [], 'entity_r': [], 'entity_f': [],
        'hall_rate': [], 'likert': []
    }

    for idx, item in enumerate(items):
        if max_samples and idx >= max_samples:
            break

        # --- Basic case fields ---
       

In [24]:
DATA_DIR = Path("data/gold/pending")
RESULTS_DIR = Path("results/eval_notebook")
RESULTS_DIR.mkdir(parents=True, exist_ok=True)
models = ["openai:gpt-3.5-turbo", "groq:llama3-8b-8192", "sshleifer/distilbart-cnn-12-6"]
MAX_SAMPLES = 200
DEVICE = None

In [25]:
# Show current working directory
print("Current working directory:", os.getcwd())

# Look for any gold folders under project
for p in Path(".").rglob("gold"):
    print("Found gold folder at:", p.resolve())
    files = list(p.glob("*.json"))
    print(f"  Contains {len(files)} JSON files")
    if files:
        print("  Example:", files[0].name)

Current working directory: d:\MSc_AI\MSc_Project\NeuroSummarize\notebooks


In [28]:
cwd = Path().resolve()
root = None
for parent in [cwd] + list(cwd.parents):
    if (parent / "data" / "gold").exists():
        root = parent
        break

if root is None:
    raise RuntimeError("Could not locate the project root containing data/gold")

print("Project root detected as:", root)

# --- Step 2: choose dataset ---
USE_PENDING = False   # set True for pending predictions, False for curated gold

gold_dir = root / "data" / "gold" / "pending"

# --- Step 3: list JSON files ---
json_files = list(gold_dir.glob("*.json"))
print(f"Found {len(json_files)} JSON files in {gold_dir}")

if json_files:
    print("Example file:", json_files[0].name)

Project root detected as: D:\MSc_AI\MSc_Project\NeuroSummarize
Found 134 JSON files in D:\MSc_AI\MSc_Project\NeuroSummarize\data\gold\pending
Example file: sub-10159_pending.json


In [29]:
def load_all_predictions(gold_dir=gold_dir):
    """Load all saved prediction JSONs from gold_dir into memory."""
    data = []
    for f in gold_dir.glob("*.json"):
        try:
            with open(f, "r", encoding="utf-8") as fh:
                obj = json.load(fh)
                data.append(obj)
        except Exception as e:
            print(f"Skipping {f.name}: {e}")
    print(f"Loaded {len(data)} prediction JSONs from {gold_dir}")
    return data

all_preds = load_all_predictions()
print("First prediction keys:", list(all_preds[0].keys()) if all_preds else "No data found")

Loaded 134 prediction JSONs from D:\MSc_AI\MSc_Project\NeuroSummarize\data\gold\pending
First prediction keys: ['id', 'report', 'clinical_summary_model', 'lay_summary_model', 'pred_entities']


In [None]:
# # Remove OpenAI models if quota issues
# models = [m for m in models if not m.startswith("openai:")]
# print("Models for evaluation (after filtering):", models)

In [45]:
# Limit number of samples for quicker testing
MAX_SAMPLES = 20  

# Load gold dataset and sanity-check

In [41]:
GROQ_API_KEY = os.getenv("GROQ_API_KEY") or config.get("models", {}).get("groq", {}).get("api_key")
if not GROQ_API_KEY:
    print("Groq API key not found")

In [47]:
out_dir = RESULTS_DIR / model_id.replace(":", "_")
out_dir.mkdir(parents=True, exist_ok=True)  # you already do this

# Ensure evaluate_one_model_on_dataset doesn’t add another subfolder automatically
# OR modify evaluate function to create the directory:
per_case_csv = out_dir / f"{model_id.replace(':','_')}_per_case.csv"
per_case_csv.parent.mkdir(parents=True, exist_ok=True)  # <-- ensures folder exists

In [49]:
items = load_gold_dataset(gold_dir)
summaries = []
per_case_csvs = []

for model_id in models:
    print("Running:", model_id)
    out_dir = RESULTS_DIR / model_id.replace(":", "_")
    out_dir.mkdir(parents=True, exist_ok=True)

    # Cached result files
    per_case_file = out_dir / f"{model_id.replace(':','_')}_per_case.csv"
    summary_file   = out_dir / "summary.json"

    if per_case_file.exists() and summary_file.exists():
        print(f" → Skipping {model_id}, results already cached.")
        with open(summary_file, "r", encoding="utf-8") as fh:
            summary = json.load(fh)
    else:
        print(f" → Evaluating {model_id} ... this may take a while")
        s = Summarizer(clinical_model=model_id, lay_model=model_id, device=DEVICE)
        summary = evaluate_one_model_on_dataset(
            model_id, items, s, max_samples=MAX_SAMPLES, save_dir=out_dir
        )
        # Save summary to JSON for future reuse
        with open(summary_file, "w", encoding="utf-8") as fh:
            json.dump(summary, fh, indent=2)

    summaries.append(summary)
    if per_case_file.exists():
        per_case_csvs.append(per_case_file)

# Aggregate results
pd.DataFrame(summaries).to_csv(RESULTS_DIR / "aggregate_summary.csv", index=False)
with open(RESULTS_DIR / "aggregate_summary.json","w") as fh:
    json.dump(summaries, fh, indent=2)


Loaded 134 items from D:\MSc_AI\MSc_Project\NeuroSummarize\data\gold\pending
Running: openai:gpt-3.5-turbo
 → Evaluating openai:gpt-3.5-turbo ... this may take a while
Running: groq:llama3-8b-8192
 → Evaluating groq:llama3-8b-8192 ... this may take a while
Running: facebook/bart-large-cnn
 → Evaluating facebook/bart-large-cnn ... this may take a while


In [50]:
len(summaries)

3

In [51]:
print("RESULTS_DIR =", RESULTS_DIR.resolve())
print("Expected JSON:", (RESULTS_DIR / "aggregate_summary.json").resolve())

RESULTS_DIR = D:\MSc_AI\MSc_Project\NeuroSummarize\notebooks\results\eval_notebook
Expected JSON: D:\MSc_AI\MSc_Project\NeuroSummarize\notebooks\results\eval_notebook\aggregate_summary.json


In [53]:
all_preds = load_all_predictions()  # from gold or pending
print("Loaded", len(all_preds), "cases")

Loaded 134 prediction JSONs from D:\MSc_AI\MSc_Project\NeuroSummarize\data\gold\pending
Loaded 134 cases


In [59]:
results = []

for model_id in models:
    summary = evaluate_one_model_on_dataset(model_id, items, s, max_samples=MAX_SAMPLES)
    agg = summary['summary']  # This has mean metrics already
    results.append({
        "model": model_id,
        "n_cases": agg['n'],
        "rouge_clinical": agg['mean_rouge_l_clin'],
        "rouge_lay": agg['mean_rouge_l_lay'],
        "bleu_clinical": agg['mean_bleu_clin'],
        "bleu_lay": agg['mean_bleu_lay'],
        "entity_f1": agg['mean_entity_f'],
        "hallucination_rate": agg['mean_hall_rate'],
        "utility_mean": agg['mean_likert']
    })

df_results = pd.DataFrame(results)
df_results


Unnamed: 0,model,n_cases,rouge_clinical,rouge_lay,bleu_clinical,bleu_lay,entity_f1,hallucination_rate,utility_mean
0,openai:gpt-3.5-turbo,0,,,,,,,
1,groq:llama3-8b-8192,0,,,,,,,
2,facebook/bart-large-cnn,0,,,,,,,
