# Model Evaluation - Metrics

Run model evaluations and save results to CSV files in `outputs/eval-metrics/`.

In [None]:
import sys
from pathlib import Path
sys.path.insert(0, str(Path.cwd().parent))

import torch
import yaml
import albumentations as A
import pandas as pd
import torchmetrics
from torch.utils.data import DataLoader

from src.data import NpzSegmentationDataset
from src.models.smp import SMPMulticlassSegmentationModel

# Run on the GPU when torch reports one as available; otherwise use the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Device: {DEVICE}")

# Directory that receives the CSV result tables; created on first run and
# left untouched if it already exists.
OUTPUT_DIR = Path("..") / "outputs" / "eval-metrics"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

In [46]:
def evaluate_model(config_path, ckpt_path, split="val", *, batch_size=8,
                   num_workers=4, class_index=1):
    """Evaluate a checkpoint on one data split and return per-class metrics.

    Args:
        config_path: Path to the YAML config. ``model.init_args`` and
            ``data.init_args`` are read from it.
        ckpt_path: Checkpoint handed to
            ``SMPMulticlassSegmentationModel.load_from_checkpoint``.
        split: ``"val"`` or ``"test"``; selects ``val_chip_dir`` or
            ``test_chip_dir`` from the data arguments.
        batch_size: Batch size used by the evaluation ``DataLoader``.
        num_workers: Worker count used by the evaluation ``DataLoader``.
        class_index: Index of the class whose metrics are reported. The
            default of 1 is the seagrass class.

    Returns:
        A dict with float values under the keys ``"IoU"``, ``"Precision"``
        and ``"Recall"`` for the selected class.

    Raises:
        ValueError: If ``split`` is neither ``"val"`` nor ``"test"``.
    """
    # Reject unknown splits up front: silently falling back to the test set
    # on a typo would produce numbers for the wrong data.
    split_keys = {"val": "val_chip_dir", "test": "test_chip_dir"}
    if split not in split_keys:
        raise ValueError(f"split must be 'val' or 'test', got {split!r}")

    # Load config
    with open(config_path, encoding="utf-8") as f:
        config = yaml.safe_load(f)

    model_args = config["model"]["init_args"]
    data_args = config["data"]["init_args"]
    chip_dir = data_args[split_keys[split]]

    # Load dataset (the config's test-time transforms are used for both splits)
    test_transforms = A.from_dict(data_args["test_transforms"])
    dataset = NpzSegmentationDataset(chip_dir, transforms=test_transforms)
    loader = DataLoader(dataset, batch_size=batch_size,
                        num_workers=num_workers, shuffle=False)

    # Load model
    model = SMPMulticlassSegmentationModel.load_from_checkpoint(ckpt_path, map_location=DEVICE)
    try:
        model.eval()
        model.to(DEVICE)

        # Create metrics; average="none" keeps one value per class so a
        # single class can be picked out afterwards.
        metric_kwargs = {
            "task": "multiclass",
            "num_classes": model_args["num_classes"],
            "ignore_index": model_args.get("ignore_index", -100),
            "average": "none",
        }
        metrics = {
            "IoU": torchmetrics.JaccardIndex(**metric_kwargs).to(DEVICE),
            "Precision": torchmetrics.Precision(**metric_kwargs).to(DEVICE),
            "Recall": torchmetrics.Recall(**metric_kwargs).to(DEVICE),
        }

        # Evaluate
        with torch.no_grad():
            for images, labels in loader:
                images, labels = images.to(DEVICE), labels.to(DEVICE)
                # Class probabilities along dim 1 of the model output.
                probs = torch.softmax(model(images), dim=1)
                for metric in metrics.values():
                    metric.update(probs, labels)

        # Report only the requested class (seagrass by default).
        return {
            metric_name: metric.compute()[class_index].item()
            for metric_name, metric in metrics.items()
        }
    finally:
        # Cleanup runs even when evaluation fails, so a broken checkpoint
        # does not leave this model's memory cached for the next evaluation.
        del model
        torch.cuda.empty_cache()

## 1. Architecture Experiment

In [None]:
# Config/checkpoint pair for every model in the architecture experiment,
# keyed by the row label used in the results table.
ARCH_MODELS = {
    label: {"config": config_file, "ckpt": ckpt_file}
    for label, config_file, ckpt_file in (
        (
            "UNet_512",
            "../configs/seagrass-rgb/architecture-experiment/unetpp_resnet34_512.yaml",
            "../seagrass-rgb/qjfpb4m8/checkpoints/unetpp_resnet34_512_epoch-199_val-iou-0.7050.ckpt",
        ),
        (
            "UNet_1024",
            "../configs/seagrass-rgb/architecture-experiment/unetpp_resnet34_1024.yaml",
            "../seagrass-rgb/mdqn7se0/checkpoints/unetpp_resnet34_1024_epoch-199_val-iou-0.7247.ckpt",
        ),
        (
            "Seg_512",
            "../configs/seagrass-rgb/architecture-experiment/segformer_mitb2_512.yaml",
            "../seagrass-rgb/3uav2blr/checkpoints/segformer_mitb2_512_epoch-199_val-iou-0.7425.ckpt",
        ),
        (
            "Seg_1024",
            "../configs/seagrass-rgb/architecture-experiment/segformer_mitb2_1024.yaml",
            "../seagrass-rgb/jhf1t0ih/checkpoints/segformer_mitb2_1024_epoch-199_val-iou-0.7909.ckpt",
        ),
    )
}

# Score each model on the validation split, reporting IoU as it finishes.
arch_results = {}
for name, paths in ARCH_MODELS.items():
    print(f"Evaluating {name}...")
    arch_results[name] = evaluate_model(
        config_path=paths["config"],
        ckpt_path=paths["ckpt"],
        split="val",
    )
    print(f"  IoU: {arch_results[name]['IoU']:.4f}")

# One row per model, one column per metric; saved, then shown as cell output.
arch_df = pd.DataFrame(arch_results).transpose()
arch_df.to_csv(OUTPUT_DIR / "architecture_experiment.csv")
arch_df

## 2. Augmentation Experiment

In [None]:
# Config/checkpoint pair for every model in the augmentation experiment,
# keyed by the row label used in the results table.
# NOTE(review): an earlier comment here said the Baseline checkpoint was not
# available, yet a Baseline entry is listed below - confirm the path exists
# before running this cell.
AUG_MODELS = {
    label: {"config": config_file, "ckpt": ckpt_file}
    for label, config_file, ckpt_file in (
        (
            "Baseline",
            "../configs/seagrass-rgb/augmentation-experiment/segformer_baseline_aug.yaml",
            "/mnt/class_data/sdalgarno/checkpoints/augmentation-experiment/baseline/last.ckpt",
        ),
        (
            "Default",
            "../configs/seagrass-rgb/augmentation-experiment/segformer_default_aug.yaml",
            "/mnt/class_data/sdalgarno/checkpoints/augmentation-experiment/default/last.ckpt",
        ),
        (
            "Scale",
            "../configs/seagrass-rgb/augmentation-experiment/segformer_scale_aug.yaml",
            "/mnt/class_data/sdalgarno/checkpoints/augmentation-experiment/scale/last.ckpt",
        ),
        (
            "Domain",
            "../configs/seagrass-rgb/augmentation-experiment/segformer_domain_aug.yaml",
            "/mnt/class_data/sdalgarno/checkpoints/augmentation-experiment/domain/last.ckpt",
        ),
    )
}

# Score each model on the validation split, reporting IoU as it finishes.
aug_results = {}
for name, paths in AUG_MODELS.items():
    print(f"Evaluating {name}...")
    aug_results[name] = evaluate_model(
        config_path=paths["config"],
        ckpt_path=paths["ckpt"],
        split="val",
    )
    print(f"  IoU: {aug_results[name]['IoU']:.4f}")

# One row per model, one column per metric; saved, then shown as cell output.
aug_df = pd.DataFrame(aug_results).transpose()
aug_df.to_csv(OUTPUT_DIR / "augmentation_experiment.csv")
aug_df

## 3. Regional Cross-Validation

In [None]:
# Config/checkpoint pair for each regional cross-validation model, keyed by
# the region label used in the results table.
REGIONAL_MODELS = {
    label: {"config": config_file, "ckpt": ckpt_file}
    for label, config_file, ckpt_file in (
        (
            "North",
            "../configs/seagrass-rgb/regional-cv/segformer_cv_north.yaml",
            "/mnt/class_data/sdalgarno/checkpoints/regional-cv/north/last-v1.ckpt",
        ),
        (
            "Central",
            "../configs/seagrass-rgb/regional-cv/segformer_cv_central.yaml",
            "/mnt/class_data/sdalgarno/checkpoints/regional-cv/central/last-v1.ckpt",
        ),
        (
            "South",
            "../configs/seagrass-rgb/regional-cv/segformer_cv_south.yaml",
            "/mnt/class_data/sdalgarno/checkpoints/regional-cv/south/last-v1.ckpt",
        ),
    )
}

# Each regional model is scored on its test split only; one record per region.
regional_results = []
for region, paths in REGIONAL_MODELS.items():
    print(f"Evaluating {region}...")
    test_metrics = evaluate_model(
        config_path=paths["config"],
        ckpt_path=paths["ckpt"],
        split="test",
    )
    regional_results.append({
        "Model": region,
        **{metric: test_metrics[metric] for metric in ("IoU", "Precision", "Recall")},
    })
    print(f"  Test IoU: {test_metrics['IoU']:.4f}")

# Index the table by region label, save it, then show it as cell output.
regional_df = (
    pd.DataFrame(regional_results)
    .set_index("Model")
)
regional_df.to_csv(OUTPUT_DIR / "regional_cv.csv")
regional_df

## 4. Final Model Evaluation

In [None]:
# Config and checkpoint of the final model.
FINAL_MODEL = {
    "config": "../configs/seagrass-rgb/segformer_train50.yaml",
    "ckpt": (
        "/mnt/class_data/sdalgarno/checkpoints/segformer-train50/"
        "segformer_train50_epoch-299_val-iou-0.8837.ckpt"
    ),
}

# Score the same checkpoint on the validation split first, then the test split.
print("Evaluating Final Model...")
final_val, final_test = (
    evaluate_model(FINAL_MODEL["config"], FINAL_MODEL["ckpt"], split=split_name)
    for split_name in ("val", "test")
)

# One row per split ("Val", "Test"), one column per metric.
final_results = pd.DataFrame({"Val": final_val, "Test": final_test}).transpose()

final_results.to_csv(OUTPUT_DIR / "final_model.csv")
print("\nFinal Model Metrics:")
final_results