In [1]:
# Install required dependencies
%pip install -q --upgrade pip
%pip install -q statsmodels scikit-image seaborn matplotlib pandas scipy opencv-python-headless

# Segment Anything
try:
    import segment_anything  # noqa: F401
except Exception:
    %pip install -q git+https://github.com/facebookresearch/segment-anything.git

print("✅ Dependencies installation cell executed.")


[0mNote: you may need to restart the kernel to use updated packages.
[0mNote: you may need to restart the kernel to use updated packages.
✅ Dependencies installation cell executed.


In [2]:
# Dependency installation / verification (runs in Python, no magics)
import sys
import subprocess
import importlib

def _pip_install(args):
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", *args])

def ensure_pkg(module_name: str, pip_spec: str | None = None) -> None:
    try:
        importlib.import_module(module_name)
        print(f"{module_name} OK")
        return
    except Exception:
        pass
    spec = pip_spec or module_name
    print(f"Installing {spec} ...")
    try:
        _pip_install([spec])
    except subprocess.CalledProcessError:
        # retry with --user to avoid touching system packages
        try:
            print(f"Retrying {spec} with --user ...")
            _pip_install(["--user", spec])
        except subprocess.CalledProcessError:
            # special-case tiatoolbox to avoid distutils uninstall issues (e.g., blinker)
            if module_name == "tiatoolbox":
                try:
                    print("Retrying tiatoolbox with --user and --ignore-installed blinker ...")
                    _pip_install(["--user", "tiatoolbox", "--ignore-installed", "blinker"]) 
                except subprocess.CalledProcessError as e:
                    raise e
            else:
                raise
    importlib.import_module(module_name)
    print(f"{module_name} OK")

# Core scientific & plotting
for mod, spec in [
    ("statsmodels", "statsmodels"),
    ("skimage", "scikit-image"),
    ("seaborn", "seaborn"),
    ("matplotlib", "matplotlib"),
    ("pandas", "pandas"),
    ("scipy", "scipy"),
    ("PIL", "Pillow"),
    ("cv2", "opencv-python-headless"),
]:
    ensure_pkg(mod, spec)

# Segment Anything
try:
    importlib.import_module("segment_anything")
    print("segment_anything OK")
except Exception:
    print("Installing Segment Anything from GitHub ...")
    _pip_install(["git+https://github.com/facebookresearch/segment-anything.git"])
    importlib.import_module("segment_anything")
    print("segment_anything OK")

# TIAToolbox (official HoVer-Net)
try:
    ensure_pkg("tiatoolbox", "tiatoolbox")
except subprocess.CalledProcessError:
    print("❗ tiatoolbox installation failed. Consider using a virtual environment and re-running this cell:\n  python -m venv .venv && source .venv/bin/activate && python -m pip install -U pip && pip install tiatoolbox")
    raise

# TorchVision (fallback HoverNet uses it; attempt install if missing)
try:
    importlib.import_module("torchvision")
    print("torchvision OK")
except Exception:
    try:
        print("Installing torchvision ... (ensure it matches your torch version)")
        _pip_install(["torchvision"])
        importlib.import_module("torchvision")
        print("torchvision OK")
    except Exception:
        print("Warning: torchvision install failed; if using TIAToolbox HoVer-Net this is optional.")

# Torch presence + CUDA check
try:
    import torch  # noqa: F401
    import torch as _torch
    print("torch OK, CUDA:", _torch.cuda.is_available())
except Exception:
    print("Warning: torch not available. Install a platform-appropriate torch manually if needed.")

print("✅ Dependency check complete.")


statsmodels OK
skimage OK
seaborn OK
matplotlib OK
pandas OK
scipy OK
PIL OK
cv2 OK
segment_anything OK
tiatoolbox OK
torchvision OK
torch OK, CUDA: True
✅ Dependency check complete.


# RQ1: SAM Variants vs Established Models on PanNuke

**Research Question**: Do different variants of the Segment Anything Model (SAM), including domain-adapted PathoSAM, achieve competitive or superior nuclei instance segmentation performance on PanNuke compared to established models (HoVer-Net, CellViT, LKCell) and a U-Net baseline?

- **H0 (Null)**: SAM variants do not significantly outperform established models in mPQ or detection F1.
- **H1 (Alt.)**: At least one SAM variant significantly outperforms baselines in mPQ or detection F1.

### What this notebook does
- Loads PanNuke tiles via a reusable dataset
- Runs inference for available models:
  - SAM variants (if checkpoints available)
  - U-Net baseline (checkpoint-gated)
- Converts predictions to instance masks and computes: PQ, object F1, AJI, Dice
- Performs paired statistics with multiple-comparison correction
- Saves CSVs, figures, and an HTML report under `reports/rq1`

Note: HoVer-Net, CellViT, and LKCell slots are scaffolded for future integration; this notebook focuses on SAM variants and a U-Net baseline to establish a robust, reproducible evaluation pipeline.


In [3]:
import os
from pathlib import Path
import json
import random
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
import seaborn as sns

# Project-local imports
import sys
if "__file__" in globals():
    SRC_DIR = Path(__file__).resolve().parent
else:
    SRC_DIR = Path.cwd()
sys.path.append(str(SRC_DIR))
from datasets.pannuke_tissue_dataset import PanNukeTissueDataset
from models.unet import UNet

# Metrics (instance-aware)
from metrics.seg_metrics import (
    reconstruct_instances,
    dice_coefficient,
    aji_aggregated_jaccard,
    pq_panoptic,
    f1_object,
)

# Reproducibility & device
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

device = torch.device('cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu')
print("Device:", device)

# Paths
PROJECT_ROOT = SRC_DIR.parent
DATASET_TISSUES = PROJECT_ROOT / "dataset_tissues"
REPORTS_DIR = PROJECT_ROOT / "reports" / "rq1"
FIG_DIR = REPORTS_DIR / "figures"
CSV_DIR = REPORTS_DIR / "tables"
for d in [REPORTS_DIR, FIG_DIR, CSV_DIR]:
    d.mkdir(parents=True, exist_ok=True)

sns.set_style("whitegrid")
sns.set_context("notebook")


Device: cuda


In [4]:
# Dataset setup
available_tissues = [p.name for p in DATASET_TISSUES.iterdir() if p.is_dir()]
print("Tissues:", len(available_tissues))

IMG_SIZE = 256
BATCH_SIZE = 6

# Simple transforms via dataset defaults; they already resize/normalize if needed

def make_loader(tissue: str, split: str = "test") -> DataLoader:
    ds = PanNukeTissueDataset(
        str(DATASET_TISSUES / tissue),
        split=split,
        image_transform=None,
        target_transform=None,
    )
    return DataLoader(ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=2)

# Small EDA count table
records = []
for t in sorted(available_tissues):
    for split in ["train", "val", "test"]:
        try:
            ds = PanNukeTissueDataset(str(DATASET_TISSUES / t), split=split)
            records.append({"tissue": t, "split": split, "n": len(ds)})
        except Exception:
            pass
eda_df = pd.DataFrame(records).pivot(index="tissue", columns="split", values="n").fillna(0).astype(int)
eda_df.head()


Tissues: 5


split,test,train,val
tissue,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Adrenal_gland,88,314,35
Bile-duct,84,302,34
Breast,471,1692,188
Colon,288,1036,116
Esophagus,85,305,34


In [None]:
# U-Net baseline loader (checkpoint optional)

def load_unet_checkpoint(ckpt_path: Path, num_classes: int = 7) -> nn.Module:
    model = UNet(in_channels=3, num_classes=num_classes)
    if ckpt_path.exists():
        state = torch.load(ckpt_path, map_location=device)
        # allow raw state or dict
        state_dict = state.get('model_state', state)
        model.load_state_dict(state_dict, strict=False)
        print(f"Loaded U-Net weights from {ckpt_path}")
    else:
        print(f"U-Net checkpoint not found at {ckpt_path}; using randomly initialized model")
    model.to(device).eval()
    return model

# SAM wrapper (automatic mask generation -> instance map)
try:
    from segment_anything import sam_model_registry, SamAutomaticMaskGenerator, SamPredictor
    SAM_AVAILABLE = True
except Exception as e:
    print("SAM not available:", e)
    SAM_AVAILABLE = False

class SAMWrapper:
    def __init__(self, model_type: str, checkpoint: str | None = None):
        assert SAM_AVAILABLE, "segment_anything not installed"
        if checkpoint and os.path.isfile(checkpoint):
            self.sam = sam_model_registry[model_type](checkpoint=checkpoint)
        else:
            self.sam = sam_model_registry[model_type]()
        # move model to selected device for GPU/MPS acceleration
        self.sam.to(device)
        self.predictor = SamPredictor(self.sam)
        # Use single-crop to avoid torchvision box_area 1D bug; adjust for stability and speed
        self.mask_gen = SamAutomaticMaskGenerator(
            model=self.sam,
            points_per_side=8,
            pred_iou_thresh=0.7,
            stability_score_thresh=0.92,
            crop_n_layers=0,  # single crop to avoid multi-crop NMS path
            crop_n_points_downscale_factor=2,
            min_mask_region_area=80,
        )

    def _masks_to_instances(self, masks: List[Dict], min_area: int = 50) -> np.ndarray:
        """Convert SAM masks (list of dicts) to instance map."""
        if not masks:
            return np.zeros((256, 256), dtype=np.int32)
        
        # Get image dimensions from first mask
        first_seg = masks[0]['segmentation']
        H, W = first_seg.shape
        inst = np.zeros((H, W), dtype=np.int32)
        
        # Sort masks by area (largest first) for better instance assignment
        masks_sorted = sorted(masks, key=lambda x: x['area'], reverse=True)
        
        instance_id = 1
        for mask_dict in masks_sorted:
            seg = mask_dict['segmentation']
            area = mask_dict['area']
            
            # Skip small masks
            if area < min_area:
                continue
                
            # Convert to boolean mask
            mask = seg.astype(bool)
            
            # Only assign to pixels not already assigned to an instance
            available_pixels = (inst == 0) & mask
            if available_pixels.sum() > min_area // 2:  # At least half the min area available
                inst[available_pixels] = instance_id
                instance_id += 1
        
        return inst

    def _predict_instances_fallback(self, image_np: np.ndarray) -> np.ndarray:
        # Grid prompt fallback using SamPredictor
        self.predictor.set_image(image_np)
        H, W = image_np.shape[:2]
        grid = 6  # Reduced for speed
        ys = np.linspace(H//4, 3*H//4, grid, dtype=np.int32)
        xs = np.linspace(W//4, 3*W//4, grid, dtype=np.int32)
        pts = np.stack(np.meshgrid(xs, ys), axis=-1).reshape(-1, 2)
        
        fake_masks = []  # Create fake mask dicts similar to SAM format
        batch = 32
        for i in range(0, len(pts), batch):
            coords = pts[i:i+batch]
            labels = np.ones((coords.shape[0],), dtype=np.int32)
            try:
                m, _, _ = self.predictor.predict(point_coords=coords, point_labels=labels, multimask_output=False)
                # m shape: (N, H, W)
                for j in range(m.shape[0]):
                    mask = m[j] > 0
                    if mask.sum() > 50:  # Only keep reasonable sized masks
                        fake_masks.append({
                            'segmentation': mask,
                            'area': int(mask.sum())
                        })
            except Exception as e:
                print(f"Fallback prediction error: {e}")
                continue
        
        if not fake_masks:
            return np.zeros((H, W), dtype=np.int32)
        return self._masks_to_instances(fake_masks)

    @torch.no_grad()
    def predict_instances(self, image_np: np.ndarray) -> np.ndarray:
        # image_np: HxWx3 uint8
        try:
            masks = self.mask_gen.generate(image_np)
            if masks:
                # masks is a list of dicts with 'segmentation', 'area', etc.
                return self._masks_to_instances(masks)
            # no masks -> fallback
            return self._predict_instances_fallback(image_np)
        except Exception as e:
            print(f"SAM generation failed: {e}")
            # Fallback using point grid prompts
            return self._predict_instances_fallback(image_np)


In [None]:
# Inference helpers
from PIL import Image

@torch.no_grad()
def unet_predict_instances(model: nn.Module, img_tensor: torch.Tensor) -> np.ndarray:
    # img_tensor: 3xHxW (normalized)
    model.eval()
    logits = model(img_tensor.unsqueeze(0).to(device))
    sem = torch.argmax(logits, dim=1).squeeze(0).detach().cpu().numpy().astype(np.uint8)
    # derive boundary from semantic changes (FIXED)
    from scipy import ndimage
    foreground = (sem > 0).astype(np.uint8)
    eroded = ndimage.binary_erosion(foreground).astype(np.uint8)
    boundary = (foreground - eroded).astype(np.uint8)
    inst = reconstruct_instances(sem, boundary)
    return inst


def tensor_to_uint8(rgb_tensor: torch.Tensor) -> np.ndarray:
    # approximate inverse of default normalization for visualization/SAM
    arr = rgb_tensor.permute(1,2,0).cpu().numpy()
    arr = arr * np.array([0.229, 0.224, 0.225])[None,None,:] + np.array([0.485, 0.456, 0.406])[None,None,:]
    arr = np.clip(arr, 0, 1)
    return (arr * 255).astype(np.uint8)


def evaluate_on_tissue(tissue: str, models: Dict[str, object], n_limit: int | None = None) -> List[Dict]:
    loader = make_loader(tissue, split="test")
    results = []
    seen = 0
    for batch_idx, batch in enumerate(loader):
        images, targets = batch  # targets are semantic gt
        for b in range(images.shape[0]):
            if n_limit is not None and seen >= n_limit:
                return results
            img_t = images[b]
            gt_sem = targets[b].numpy()
            # GT instance reconstruction from sem + boundary (FIXED)
            from scipy import ndimage
            foreground = (gt_sem > 0).astype(np.uint8)
            eroded = ndimage.binary_erosion(foreground).astype(np.uint8)
            gt_boundary = (foreground - eroded).astype(np.uint8)
            gt_inst = reconstruct_instances(gt_sem, gt_boundary)
            
            # Debug first image
            if batch_idx == 0 and b == 0:
                print(f"\n=== DEBUG FIRST IMAGE ===")
                print(f"GT semantic shape: {gt_sem.shape}, unique: {np.unique(gt_sem)}")
                print(f"GT boundary sum: {gt_boundary.sum()}")
                print(f"GT instances shape: {gt_inst.shape}, unique instances: {len(np.unique(gt_inst))-1}")
            
            # Per-image id for pairing
            image_id = f"{tissue}/test/{batch_idx:05d}_{b}"
            # Evaluate each model (generic: if wrapper exposes predict_instances, use it)
            for name, model in models.items():
                if hasattr(model, 'predict_instances'):
                    img_u8 = tensor_to_uint8(img_t)
                    pred_inst = model.predict_instances(img_u8)
                else:
                    pred_inst = unet_predict_instances(model, img_t)
                
                # Debug first prediction
                if batch_idx == 0 and b == 0:
                    print(f"{name} pred shape: {pred_inst.shape}, unique instances: {len(np.unique(pred_inst))-1}")
                    print(f"{name} pred max: {pred_inst.max()}, nonzero pixels: {(pred_inst > 0).sum()}")
                
                # Metrics
                pq = pq_panoptic(gt_inst, pred_inst)
                f1o = f1_object(gt_inst, pred_inst)
                aji = aji_aggregated_jaccard(gt_inst, pred_inst)
                dice = dice_coefficient(gt_sem, (pred_inst > 0).astype(np.uint8), num_classes=2, ignore_background=False)
                
                # Debug first metrics
                if batch_idx == 0 and b == 0:
                    print(f"{name} metrics: PQ={pq:.3f}, F1={f1o:.3f}, AJI={aji:.3f}, Dice={dice:.3f}")
                
                results.append({
                    "tissue": tissue,
                    "image_id": image_id,
                    "model": name,
                    "pq": pq,
                    "f1_object": f1o,
                    "aji": aji,
                    "dice_bin": dice,
                })
            seen += 1
    return results


In [7]:
# Configure models (gate by availability)
MODELS: Dict[str, object] = {}

# U-Net baseline checkpoint (update if you have a trained model)
UNET_CKPT = PROJECT_ROOT / "artifacts" / "rq3_enhanced" / "checkpoints" / "unet_original_enhanced_best.pth"
MODELS["unet_baseline"] = load_unet_checkpoint(UNET_CKPT, num_classes=7)

# SAM variants (require segment_anything + optional checkpoints)
if SAM_AVAILABLE:
    try:
        MODELS["sam_vit_b"] = SAMWrapper("vit_b")
    except Exception as e:
        print("Skipping sam_vit_b:", e)
    try:
        MODELS["sam_vit_l"] = SAMWrapper("vit_l")
    except Exception as e:
        print("Skipping sam_vit_l:", e)
    try:
        MODELS["sam_vit_h"] = SAMWrapper("vit_h")
    except Exception as e:
        print("Skipping sam_vit_h:", e)

print("Models configured:", list(MODELS.keys()))


  state = torch.load(ckpt_path, map_location=device)



Loaded U-Net weights from /workspace/HistoPathologyResearch/artifacts/rq3_enhanced/checkpoints/unet_original_enhanced_best.pth
Models configured: ['unet_baseline', 'sam_vit_b', 'sam_vit_l', 'sam_vit_h']


In [8]:
# TIAToolbox HoVer-Net integration removed as requested
TIA_AVAILABLE = False


In [9]:
# HoVer-Net (placeholder) removed as requested
print('Models configured:', list(MODELS.keys()))


Models configured: ['unet_baseline', 'sam_vit_b', 'sam_vit_l', 'sam_vit_h']


In [None]:
# DEBUG: Test single image evaluation
print("=== DEBUGGING SINGLE IMAGE ===")
tissue = "Adrenal_gland"
debug_rows = evaluate_on_tissue(tissue, MODELS, n_limit=1)
print(f"Debug results: {len(debug_rows)} rows")
for row in debug_rows:
    print(f"  {row['model']}: PQ={row['pq']:.3f}, F1={row['f1_object']:.3f}, AJI={row['aji']:.3f}")
print("=== END DEBUG ===\n")


In [10]:
# Run evaluation across tissues (global cap of 100 images)
GLOBAL_LIMIT = 100
ALL_ROWS: List[Dict] = []
processed_images = 0

for tissue in sorted(available_tissues):
    remaining = GLOBAL_LIMIT - processed_images
    if remaining <= 0:
        break
    print(f"Evaluating {tissue} (remaining images: {remaining}) ...")
    rows = evaluate_on_tissue(tissue, MODELS, n_limit=remaining)
    ALL_ROWS.extend(rows)
    if rows:
        new_imgs = len({r["image_id"] for r in rows})
        processed_images += new_imgs

print(f"Processed images: {processed_images}")
res_df = pd.DataFrame(ALL_ROWS)
print(res_df.head())

# Save per-image table
csv_path = CSV_DIR / "per_image_instance_metrics.csv"
res_df.to_csv(csv_path, index=False)
print("Saved:", csv_path)


Evaluating Adrenal_gland (remaining images: 100) ...
Evaluating Bile-duct (remaining images: 12) ...
Processed images: 100
          tissue                    image_id          model   pq  f1_object  \
0  Adrenal_gland  Adrenal_gland/test/00000_0  unet_baseline  0.0        0.0   
1  Adrenal_gland  Adrenal_gland/test/00000_0      sam_vit_b  0.0        0.0   
2  Adrenal_gland  Adrenal_gland/test/00000_0      sam_vit_l  0.0        0.0   
3  Adrenal_gland  Adrenal_gland/test/00000_0      sam_vit_h  0.0        0.0   
4  Adrenal_gland  Adrenal_gland/test/00000_1  unet_baseline  0.0        0.0   

        aji  dice_bin  
0  0.000000  0.427778  
1  0.004918  0.386383  
2  0.005273  0.301359  
3  0.005777  0.332589  
4  0.000000  0.451921  
Saved: /workspace/HistoPathologyResearch/reports/rq1/tables/per_image_instance_metrics.csv


In [12]:
metrics_cols = ["pq", "f1_object", "aji", "dice_bin"]
summary = res_df.groupby("model")[metrics_cols].agg(["mean", "std", "count"]).round(4)
summary

Unnamed: 0_level_0,pq,pq,pq,f1_object,f1_object,f1_object,aji,aji,aji,dice_bin,dice_bin,dice_bin
Unnamed: 0_level_1,mean,std,count,mean,std,count,mean,std,count,mean,std,count
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
sam_vit_b,0.0,0.0,100,0.0,0.0,100,0.0056,0.0045,100,0.4122,0.0175,100
sam_vit_h,0.0,0.0,100,0.0,0.0,100,0.0059,0.0047,100,0.321,0.0262,100
sam_vit_l,0.0,0.0,100,0.0,0.0,100,0.005,0.0042,100,0.3344,0.0383,100
unet_baseline,0.0,0.0,100,0.0,0.0,100,0.0,0.0,100,0.7913,0.2226,100


In [13]:
# Pairwise statistical analysis with BH correction
from itertools import combinations
from statsmodels.stats.multitest import multipletests
from scipy.stats import ttest_rel, wilcoxon

metrics = ["pq", "f1_object", "aji", "dice_bin"]
models = res_df["model"].unique().tolist()

pairwise_rows = []
for m1, m2 in combinations(models, 2):
    df1 = res_df[res_df.model == m1].set_index(["tissue", "image_id"])  # align by image
    df2 = res_df[res_df.model == m2].set_index(["tissue", "image_id"])  # same ids
    common_idx = df1.index.intersection(df2.index)
    if len(common_idx) < 5:
        continue
    for metric in metrics:
        x = df1.loc[common_idx, metric].values
        y = df2.loc[common_idx, metric].values
        if len(x) != len(y) or len(x) < 5:
            continue
        # Paired tests
        t_stat, t_p = ttest_rel(x, y, nan_policy='omit')
        try:
            w_stat, w_p = wilcoxon(x, y)
        except Exception:
            w_stat, w_p = np.nan, np.nan
        diff = np.nanmean(x - y)
        pairwise_rows.append({
            "model1": m1,
            "model2": m2,
            "metric": metric,
            "n": int(len(x)),
            "mean_diff": float(diff),
            "t_p": float(t_p) if np.isfinite(t_p) else 1.0,
            "w_p": float(w_p) if np.isfinite(w_p) else 1.0,
        })

pairwise_df = pd.DataFrame(pairwise_rows)
if not pairwise_df.empty:
    # BH correction per metric separately
    corrected = []
    for metric, g in pairwise_df.groupby("metric"):
        for col in ["t_p", "w_p"]:
            rej, p_bh, _, _ = multipletests(g[col].values, method='fdr_bh')
            g[col+"_bh"] = p_bh
            g[col+"_sig_bh"] = rej
        corrected.append(g)
    pairwise_df = pd.concat(corrected, ignore_index=True)

pairwise_csv = CSV_DIR / "pairwise_stats_bh.csv"
pairwise_df.to_csv(pairwise_csv, index=False)
print("Saved:", pairwise_csv)

pairwise_df.head()


Saved: /workspace/HistoPathologyResearch/reports/rq1/tables/pairwise_stats_bh.csv


Unnamed: 0,model1,model2,metric,n,mean_diff,t_p,w_p,t_p_bh,t_p_sig_bh,w_p_bh,w_p_sig_bh
0,unet_baseline,sam_vit_b,aji,100,-0.00565,3.887519e-22,3.89656e-18,1.166256e-21,True,7.79312e-18,True
1,unet_baseline,sam_vit_l,aji,100,-0.005019,6.5362e-21,3.89656e-18,1.30724e-20,True,7.79312e-18,True
2,unet_baseline,sam_vit_h,aji,100,-0.005943,1.986319e-22,3.89656e-18,1.166256e-21,True,7.79312e-18,True
3,sam_vit_b,sam_vit_l,aji,100,0.000631,8.51205e-06,7.459505e-06,1.021446e-05,True,8.951406e-06,True
4,sam_vit_b,sam_vit_h,aji,100,-0.000293,0.002345791,0.001037475,0.002345791,True,0.001037475,True


In [14]:
# Plots
plt.figure(figsize=(8,4))
sns.boxplot(data=res_df, x="model", y="pq")
plt.xticks(rotation=30, ha='right')
plt.title("PQ by model")
plt.tight_layout()
fig_path1 = FIG_DIR / "pq_by_model.png"
plt.savefig(fig_path1, dpi=200)
plt.close()

plt.figure(figsize=(8,4))
sns.boxplot(data=res_df, x="model", y="f1_object")
plt.xticks(rotation=30, ha='right')
plt.title("Object F1 by model")
plt.tight_layout()
fig_path2 = FIG_DIR / "f1_by_model.png"
plt.savefig(fig_path2, dpi=200)
plt.close()

print("Saved:", fig_path1)
print("Saved:", fig_path2)


Saved: /workspace/HistoPathologyResearch/reports/rq1/figures/pq_by_model.png
Saved: /workspace/HistoPathologyResearch/reports/rq1/figures/f1_by_model.png


In [15]:
# Per-tissue paired Wilcoxon tests (BH corrected)
from itertools import product

metrics_primary = ["pq", "f1_object"]
sam_models = [m for m in res_df.model.unique() if m.startswith("sam")]
established = [m for m in ["hovernet", "cellvit", "lkcell"] if m in res_df.model.unique()]

rows = []
for tissue in sorted(res_df.tissue.unique()):
    df_t = res_df[res_df.tissue == tissue].set_index(["tissue", "image_id"])  # align pairs
    for sam, est, metric in product(sam_models, established, metrics_primary):
        a = df_t[df_t.model == sam][metric]
        b = df_t[df_t.model == est][metric]
        idx = a.index.intersection(b.index)
        if len(idx) < 5:
            continue
        x, y = a.loc[idx].values, b.loc[idx].values
        try:
            stat, p = wilcoxon(x, y)
        except Exception:
            p = 1.0
        rows.append({
            "tissue": tissue,
            "sam": sam,
            "established": est,
            "metric": metric,
            "n": int(len(idx)),
            "mean_diff": float(np.nanmean(x - y)),
            "wilcoxon_p": float(p)
        })

tissue_df = pd.DataFrame(rows)
if not tissue_df.empty:
    outs = []
    for metric, g in tissue_df.groupby("metric"):
        rej, p_bh, _, _ = multipletests(g["wilcoxon_p"].values, method="fdr_bh")
        g = g.assign(wilcoxon_p_bh=p_bh, sig_bh=rej)
        outs.append(g)
    tissue_df_bh = pd.concat(outs, ignore_index=True)
else:
    tissue_df_bh = pd.DataFrame(columns=["tissue","sam","established","metric","n","mean_diff","wilcoxon_p","wilcoxon_p_bh","sig_bh"])

per_tissue_csv = CSV_DIR / "per_tissue_wilcoxon_bh.csv"
tissue_df_bh.to_csv(per_tissue_csv, index=False)
print("Saved:", per_tissue_csv)

tissue_df_bh.head()


Saved: /workspace/HistoPathologyResearch/reports/rq1/tables/per_tissue_wilcoxon_bh.csv


Unnamed: 0,tissue,sam,established,metric,n,mean_diff,wilcoxon_p,wilcoxon_p_bh,sig_bh


In [16]:
# HTML report (with per-tissue section)
from datetime import datetime

report_html = f"""
<!DOCTYPE html>
<html><head><meta charset='utf-8'><title>RQ1 - SAM Variants vs Baselines</title></head>
<body style='font-family:Segoe UI,Arial,sans-serif; margin:40px;'>
<h1>RQ1: SAM Variants vs Established Models on PanNuke</h1>
<p><em>Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</em></p>
<h2>Per-image metrics</h2>
<p>Saved CSV: {csv_path.name}</p>
<h2>Summary (by model)</h2>
{summary.to_html()}
<h2>Pairwise statistics (BH corrected)</h2>
{pairwise_df.head(50).to_html(index=False) if 'pairwise_df' in globals() and not pairwise_df.empty else '<p>No pairwise results.</p>'}
<h2>Per-tissue paired Wilcoxon (BH corrected)</h2>
{tissue_df_bh.head(100).to_html(index=False) if 'tissue_df_bh' in globals() and not tissue_df_bh.empty else '<p>No per-tissue results.</p>'}
<h2>Figures</h2>
<ul>
  <li>{fig_path1.name}</li>
  <li>{fig_path2.name}</li>
</ul>
</body></html>
"""
html_path = REPORTS_DIR / "RQ1_SAM_Variants_Report.html"
html_path.write_text(report_html, encoding='utf-8')
print("Saved:", html_path)


Saved: /workspace/HistoPathologyResearch/reports/rq1/RQ1_SAM_Variants_Report.html


In [17]:
# HTML report
from datetime import datetime

report_html = f"""
<!DOCTYPE html>
<html><head><meta charset='utf-8'><title>RQ1 - SAM Variants vs Baselines</title></head>
<body style='font-family:Segoe UI,Arial,sans-serif; margin:40px;'>
<h1>RQ1: SAM Variants vs Established Models on PanNuke</h1>
<p><em>Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</em></p>
<h2>Per-image metrics</h2>
<p>Saved CSV: {csv_path.name}</p>
<h2>Summary (by model)</h2>
{summary.to_html()}
<h2>Pairwise statistics (BH corrected)</h2>
{pairwise_df.head(50).to_html(index=False)}
<h2>Figures</h2>
<ul>
  <li>{fig_path1.name}</li>
  <li>{fig_path2.name}</li>
</ul>
</body></html>
"""
html_path = REPORTS_DIR / "RQ1_SAM_Variants_Report.html"
html_path.write_text(report_html, encoding='utf-8')
print("Saved:", html_path)


Saved: /workspace/HistoPathologyResearch/reports/rq1/RQ1_SAM_Variants_Report.html
