# Model Evaluation - Metrics

Run model evaluations and save results to CSV files in `outputs/eval-metrics/`.

In [1]:
import sys
from pathlib import Path
sys.path.insert(0, str(Path.cwd().parent))

import torch
import yaml
import albumentations as A
import pandas as pd
import torchmetrics
from torch.utils.data import DataLoader

from src.data import NpzSegmentationDataset
from src.models.smp import SMPMulticlassSegmentationModel

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Device: {DEVICE}")

# Every results CSV written by this notebook goes here; the directory (and any
# missing parents) is created up front so later `to_csv` calls cannot fail on it.
OUTPUT_DIR = Path("..") / "outputs" / "eval-metrics"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

Device: cuda


In [2]:
def evaluate_model(config_path, ckpt_path, split="val", *, class_index=1,
                   batch_size=8, num_workers=4):
    """Evaluate a checkpoint on one data split and return metrics for one class.

    Args:
        config_path: Path to the YAML config. It must provide
            ``model.init_args`` (``num_classes`` and, optionally,
            ``ignore_index``) and ``data.init_args`` (``val_chip_dir``,
            ``test_chip_dir`` and ``test_transforms``).
        ckpt_path: Checkpoint handed to
            ``SMPMulticlassSegmentationModel.load_from_checkpoint``.
        split: ``"val"`` or ``"test"``; selects which chip directory is read.
        class_index: Index of the class whose metrics are returned. Defaults
            to 1, which this notebook treats as the seagrass class.
        batch_size: Batch size of the evaluation ``DataLoader``.
        num_workers: ``num_workers`` of the evaluation ``DataLoader``.

    Returns:
        dict mapping ``"IoU"``, ``"Precision"``, ``"Recall"`` and ``"F1"`` to
        the Python float computed for ``class_index``.

    Raises:
        ValueError: If ``split`` is not ``"val"`` or ``"test"``, or if
            ``class_index`` is outside ``[0, num_classes)``.
    """
    # Fail fast on an unknown split. Previously anything other than "val"
    # (e.g. a typo such as "Test" or "valid") silently evaluated the test set.
    chip_dir_keys = {"val": "val_chip_dir", "test": "test_chip_dir"}
    if split not in chip_dir_keys:
        raise ValueError(
            f"split must be one of {sorted(chip_dir_keys)}, got {split!r}"
        )

    # Load config
    with open(config_path) as f:
        config = yaml.safe_load(f)

    model_args = config["model"]["init_args"]
    data_args = config["data"]["init_args"]

    num_classes = model_args["num_classes"]
    if not 0 <= class_index < num_classes:
        raise ValueError(
            f"class_index must be in [0, {num_classes}), got {class_index!r}"
        )
    # Falls back to -100 when the config does not set an ignore index.
    ignore_index = model_args.get("ignore_index", -100)

    # Load dataset. The config's test-time transforms are used for both splits.
    test_transforms = A.from_dict(data_args["test_transforms"])
    dataset = NpzSegmentationDataset(
        data_args[chip_dir_keys[split]], transforms=test_transforms
    )
    loader = DataLoader(
        dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False
    )

    # Create metrics. All four share the same settings; average="none" keeps
    # one value per class so a single class can be picked out afterwards.
    metric_kwargs = dict(
        task="multiclass",
        num_classes=num_classes,
        ignore_index=ignore_index,
        average="none",
    )
    metric_classes = {
        "IoU": torchmetrics.JaccardIndex,
        "Precision": torchmetrics.Precision,
        "Recall": torchmetrics.Recall,
        "F1": torchmetrics.F1Score,
    }
    metrics = {
        name: metric_cls(**metric_kwargs).to(DEVICE)
        for name, metric_cls in metric_classes.items()
    }

    # Load model
    model = SMPMulticlassSegmentationModel.load_from_checkpoint(
        ckpt_path, map_location=DEVICE
    )
    try:
        model.eval()
        model.to(DEVICE)

        # Evaluate
        with torch.no_grad():
            for images, labels in loader:
                images, labels = images.to(DEVICE), labels.to(DEVICE)
                probs = torch.softmax(model(images), dim=1)
                for metric in metrics.values():
                    metric.update(probs, labels)

        # Keep only the requested class from each per-class result vector.
        return {
            name: metric.compute()[class_index].item()
            for name, metric in metrics.items()
        }
    finally:
        # Cleanup runs even when evaluation raises, so a failed run does not
        # leave this function holding the model while later cells execute.
        del model
        torch.cuda.empty_cache()

## 1. Architecture Experiment

In [3]:
# Config/checkpoint pair for every model in the architecture experiment,
# keyed by the row label used in the results table.
# NOTE(review): the val-iou embedded in each checkpoint filename (e.g. 0.7050)
# is higher than the seagrass IoU this cell last printed (e.g. 0.4669) —
# presumably the training-time metric is computed differently; confirm before
# comparing the two.
ARCH_MODELS = {
    "UNet++_512": dict(
        config="../configs/seagrass-rgb/architecture-experiment/unetpp_resnet34_512.yaml",
        ckpt="../seagrass-rgb/qjfpb4m8/checkpoints/unetpp_resnet34_512_epoch-199_val-iou-0.7050.ckpt",
    ),
    "UNet++_1024": dict(
        config="../configs/seagrass-rgb/architecture-experiment/unetpp_resnet34_1024.yaml",
        ckpt="../seagrass-rgb/mdqn7se0/checkpoints/unetpp_resnet34_1024_epoch-199_val-iou-0.7247.ckpt",
    ),
    "SegFormer_512": dict(
        config="../configs/seagrass-rgb/architecture-experiment/segformer_mitb2_512.yaml",
        ckpt="../seagrass-rgb/3uav2blr/checkpoints/segformer_mitb2_512_epoch-199_val-iou-0.7425.ckpt",
    ),
    "SegFormer_1024": dict(
        config="../configs/seagrass-rgb/architecture-experiment/segformer_mitb2_1024.yaml",
        ckpt="../seagrass-rgb/jhf1t0ih/checkpoints/segformer_mitb2_1024_epoch-199_val-iou-0.7909.ckpt",
    ),
}

# Score each model on the validation split, reporting progress as we go.
arch_results = {}
for model_label, model_paths in ARCH_MODELS.items():
    print(f"Evaluating {model_label}...")
    model_metrics = evaluate_model(
        model_paths["config"], model_paths["ckpt"], split="val"
    )
    arch_results[model_label] = model_metrics
    print(f"  IoU: {model_metrics['IoU']:.4f}")

# One row per model, one column per metric; persist, then display.
arch_df = pd.DataFrame(arch_results).transpose()
arch_df.to_csv(OUTPUT_DIR / "architecture_experiment.csv")
arch_df

Evaluating UNet++_512...
  IoU: 0.4669
Evaluating UNet++_1024...
  IoU: 0.4890
Evaluating SegFormer_512...
  IoU: 0.5606
Evaluating SegFormer_1024...
  IoU: 0.6252


Unnamed: 0,IoU,Precision,Recall,F1
UNet++_512,0.466905,0.819864,0.520278,0.636585
UNet++_1024,0.488956,0.852443,0.534167,0.656777
SegFormer_512,0.560615,0.872952,0.61042,0.718454
SegFormer_1024,0.625214,0.859664,0.696277,0.769392


## 2. Augmentation Experiment

In [4]:
# Config/checkpoint pair for every augmentation variant, keyed by the row
# label used in the results table.
# NOTE(review): this cell used to carry the remark "Baseline checkpoint not
# available", yet a Baseline checkpoint is configured below and the recorded
# output shows it being evaluated — confirm the path points at the intended run.
AUG_MODELS = {
    "Baseline": dict(
        config="../configs/seagrass-rgb/augmentation-experiment/segformer_baseline_aug.yaml",
        ckpt="/mnt/class_data/sdalgarno/checkpoints/augmentation-experiment/baseline/last.ckpt",
    ),
    "Default": dict(
        config="../configs/seagrass-rgb/augmentation-experiment/segformer_default_aug.yaml",
        ckpt="/mnt/class_data/sdalgarno/checkpoints/augmentation-experiment/default/last.ckpt",
    ),
    "Scale": dict(
        config="../configs/seagrass-rgb/augmentation-experiment/segformer_scale_aug.yaml",
        ckpt="/mnt/class_data/sdalgarno/checkpoints/augmentation-experiment/scale/last.ckpt",
    ),
    "Domain": dict(
        config="../configs/seagrass-rgb/augmentation-experiment/segformer_domain_aug.yaml",
        ckpt="/mnt/class_data/sdalgarno/checkpoints/augmentation-experiment/domain/last.ckpt",
    ),
}

# Score each variant on the validation split, reporting progress as we go.
aug_results = {}
for variant_label, variant_paths in AUG_MODELS.items():
    print(f"Evaluating {variant_label}...")
    variant_metrics = evaluate_model(
        variant_paths["config"], variant_paths["ckpt"], split="val"
    )
    aug_results[variant_label] = variant_metrics
    print(f"  IoU: {variant_metrics['IoU']:.4f}")

# One row per variant, one column per metric; persist, then display.
aug_df = pd.DataFrame(aug_results).transpose()
aug_df.to_csv(OUTPUT_DIR / "augmentation_experiment.csv")
aug_df

Evaluating Baseline...
  IoU: 0.6064
Evaluating Default...
  IoU: 0.6380
Evaluating Scale...
  IoU: 0.6510
Evaluating Domain...
  IoU: 0.6681


Unnamed: 0,IoU,Precision,Recall,F1
Baseline,0.606425,0.875451,0.663685,0.755
Default,0.637995,0.866598,0.707478,0.778995
Scale,0.650954,0.864077,0.725214,0.788579
Domain,0.668088,0.867999,0.743642,0.801022


## 3. Regional Cross-Validation

In [5]:
# Config/checkpoint pair for each regional cross-validation model, keyed by
# region label.
REGIONAL_MODELS = {
    "North": dict(
        config="../configs/seagrass-rgb/regional-cv/segformer_cv_north.yaml",
        ckpt="/mnt/class_data/sdalgarno/checkpoints/regional-cv/north/last-v1.ckpt",
    ),
    "Central": dict(
        config="../configs/seagrass-rgb/regional-cv/segformer_cv_central.yaml",
        ckpt="/mnt/class_data/sdalgarno/checkpoints/regional-cv/central/last-v1.ckpt",
    ),
    "South": dict(
        config="../configs/seagrass-rgb/regional-cv/segformer_cv_south.yaml",
        ckpt="/mnt/class_data/sdalgarno/checkpoints/regional-cv/south/last-v1.ckpt",
    ),
}

# Metric columns copied into every result row, in table order.
REGIONAL_METRIC_KEYS = ("IoU", "Precision", "Recall", "F1")

# These models are scored on the test split only.
regional_results = []
for region_label, region_paths in REGIONAL_MODELS.items():
    print(f"Evaluating {region_label}...")
    test_metrics = evaluate_model(
        region_paths["config"], region_paths["ckpt"], split="test"
    )
    row = {"Model": region_label}
    row.update((key, test_metrics[key]) for key in REGIONAL_METRIC_KEYS)
    regional_results.append(row)
    print(f"  Test IoU: {test_metrics['IoU']:.4f}")

# One row per region, indexed by the region label; persist, then display.
regional_table = pd.DataFrame(regional_results)
regional_df = regional_table.set_index("Model")
regional_df.to_csv(OUTPUT_DIR / "regional_cv.csv")
regional_df

Evaluating North...
  Test IoU: 0.5182
Evaluating Central...
  Test IoU: 0.5681
Evaluating South...
  Test IoU: 0.6265


Unnamed: 0_level_0,IoU,Precision,Recall,F1
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
North,0.518212,0.639764,0.731724,0.682661
Central,0.568066,0.889637,0.611134,0.724544
South,0.626513,0.94578,0.649854,0.770376


## 4. Final Model Evaluation

In [6]:
# Config and checkpoint of the final model.
FINAL_MODEL = dict(
    config="../configs/seagrass-rgb/segformer_train50.yaml",
    ckpt="/mnt/class_data/sdalgarno/checkpoints/segformer-train50/segformer_train50_epoch-299_val-iou-0.8837.ckpt",
)

# Score the same checkpoint on the validation split first, then the test split.
print("Evaluating Final Model...")
final_val, final_test = [
    evaluate_model(FINAL_MODEL["config"], FINAL_MODEL["ckpt"], split=split_name)
    for split_name in ("val", "test")
]

# One row per split ("Val", "Test"), one column per metric.
final_by_split = {"Val": final_val, "Test": final_test}
final_results = pd.DataFrame(final_by_split).transpose()

final_results.to_csv(OUTPUT_DIR / "final_model.csv")
print("\nFinal Model Metrics:")
final_results

Evaluating Final Model...

Final Model Metrics:


Unnamed: 0,IoU,Precision,Recall,F1
Val,0.653107,0.818018,0.764131,0.790157
Test,0.701688,0.859956,0.792214,0.824696
