In [None]:
import torch
import pandas as pd
import numpy as np
from collections import defaultdict
from torchvision.ops import nms, box_iou
from torch import optim, nn, utils, Tensor
from torchmetrics.detection.mean_ap import MeanAveragePrecision

from anchor_utils import AnchorGenerator as AnchorGenerator
from torchvision.models.detection.rpn import AnchorGenerator as AnchorGeneratorRCNN, RPNHead

from model import RetinaDataset, RetinaNet, collate

from faster_rcnn import RCNNDataset, FasterRCNNModel
from faster_rcnn import collate as collate_rcnn

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

def evaluate_model_rcnn(model, dataloader, device, iou_threshold=0.5, score_threshold=0.,
                        negative_score_threshold=0.2):
    """
    Evaluates Faster R-CNN model on the test data and returns the mAP, IoU, and specificity.

    Images whose target contains no boxes are treated as negative samples and
    only contribute to the specificity. Every other image is passed to the mAP
    metric and contributes one value to the average IoU.

    Args:
        model: Model to evaluate. Called as ``model(images)``; must return one dict per
            image with 'boxes', 'scores' and 'labels' entries (tensors or lists).
        dataloader: Dataloader of test data. Yields ``(images, targets)`` where ``targets``
            is a dict holding per-image lists under 'boxes' and 'labels'.
        device: Device to use for evaluation.
        iou_threshold: The IoU threshold for NMS.
        score_threshold: Confidence score threshold for filtering predictions.
        negative_score_threshold: A negative sample counts as a false positive when at
            least one kept prediction scores above this value. Defaults to 0.2, the
            value that used to be hard-coded.

    Returns:
        final_map: Computed mAP metrics
        average_iou: Average IoU score over images that have ground-truth boxes
            (0.0 if there are none).
        specificity: TN / (TN + FP) over negative samples (1.0 if there are none).
    """
    map_metric = MeanAveragePrecision(extended_summary=True)
    iou_scores = []

    tn = 0
    fp = 0

    model.eval()
    model.to(device)

    with torch.no_grad():
        for images, targets in dataloader:
            images = images.to(device)
            targets = {k: [v.to(device) for v in t] for k, t in targets.items()}

            outputs = model(images)

            # Convert to the expected format for the metric
            formatted_targets = [{'boxes': b, 'labels': l.int()} for b, l in zip(targets['boxes'], targets['labels'])]

            preds = []
            for output in outputs:
                # Convert list outputs to tensors first: the comparison and mask
                # indexing below are tensor operations and fail on plain lists.
                boxes, scores, labels = (
                    torch.tensor(value, device=device) if isinstance(value, list) else value
                    for value in (output['boxes'], output['scores'], output['labels'])
                )

                high_conf_indices = scores > score_threshold
                boxes = boxes[high_conf_indices]
                scores = scores[high_conf_indices]
                labels = labels[high_conf_indices]

                if boxes.numel() > 0:  # Check if there are any boxes
                    # Apply NMS (the kept indices come back ordered by decreasing score)
                    keep = nms(boxes, scores, iou_threshold)
                    preds.append({
                        'boxes': boxes[keep],
                        'labels': labels[keep].int(),
                        'scores': scores[keep]
                    })
                else:
                    # If there are no boxes, just append an empty prediction
                    preds.append({
                        'boxes': torch.empty((0, 4), device=device),
                        'labels': torch.empty((0,), dtype=torch.int64, device=device),
                        'scores': torch.empty((0,), device=device)
                    })

            for target, pred in zip(formatted_targets, preds):
                if target['boxes'].numel() == 0:
                    # Negative sample: any sufficiently confident prediction is a false positive.
                    if (pred['scores'] > negative_score_threshold).any():
                        fp += 1
                    else:
                        tn += 1
                    continue

                # Update the metric with the current image
                map_metric.update([pred], [target])

                # Calculate IoU for each image
                if len(pred['boxes']) > 0 and len(target['boxes']) > 0:
                    iou_matrix = box_iou(pred['boxes'], target['boxes'])
                    # Score the top-k predictions (k = min(#predictions, #targets)) against
                    # their best-matching ground-truth box. Pairing prediction i with target i
                    # by index (the matrix diagonal) is arbitrary when there are several boxes.
                    top_k = min(iou_matrix.shape)
                    iou = iou_matrix[:top_k].max(dim=1).values.mean().item()
                else:
                    iou = 0.0
                iou_scores.append(iou)

    # Compute the final mAP score
    final_map = map_metric.compute()
    average_iou = sum(iou_scores) / len(iou_scores) if iou_scores else 0.0

    # Calculate specificity: TN / (TN + FP)
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 1.0

    print(f"Mean Average Precision (mAP): {final_map['map']}")
    print(f"Mean Average Precision (mAP_50): {final_map['map_50']}")
    print(f"Average IoU: {average_iou}")
    print(f"Precision on negative samples: {specificity:.4f}")

    return final_map, average_iou, specificity


def evaluate_model_retina(model, dataloader, device, iou_threshold=0.5, score_threshold=0.,
                          negative_score_threshold=0.2):
    """
    Evaluates RetinaNet model on the test data and returns the mAP, IoU, and specificity.

    Images whose target contains no boxes are treated as negative samples and
    only contribute to the specificity. Every other image is passed to the mAP
    metric and contributes one value to the average IoU.

    Args:
        model: Model to evaluate. Called as ``model(images)``; element ``[1]`` of the
            result must hold one dict per image with 'bbox', 'scores' and 'labels'
            entries (tensors or lists).
        dataloader: Dataloader of test data. Yields ``(images, targets)`` where ``targets``
            is a dict holding per-image lists under 'bbox' and 'labels'.
        device: Device to use for evaluation.
        iou_threshold: The IoU threshold for NMS.
        score_threshold: Confidence score threshold for filtering predictions.
        negative_score_threshold: A negative sample counts as a false positive when at
            least one kept prediction scores above this value. Defaults to 0.2, the
            value that used to be hard-coded.

    Returns:
        final_map: Computed mAP metrics
        average_iou: Average IoU score over images that have ground-truth boxes
            (0.0 if there are none).
        specificity: TN / (TN + FP) over negative samples (1.0 if there are none).
    """
    map_metric = MeanAveragePrecision(extended_summary=True)
    iou_scores = []

    tn = 0
    fp = 0

    model.eval()
    model.to(device)

    with torch.no_grad():
        for images, targets in dataloader:
            images = images.to(device)
            targets = {k: [v.to(device) for v in t] for k, t in targets.items()}

            outputs = model(images)

            # Convert to the expected format for the metric
            formatted_targets = [{'boxes': b, 'labels': l.int()} for b, l in zip(targets['bbox'], targets['labels'])]

            preds = []
            # The per-image detections are the second element of the model output.
            for output in outputs[1]:
                # Convert list outputs to tensors first: the comparison and mask
                # indexing below are tensor operations and fail on plain lists.
                boxes, scores, labels = (
                    torch.tensor(value, device=device) if isinstance(value, list) else value
                    for value in (output['bbox'], output['scores'], output['labels'])
                )

                high_conf_indices = scores > score_threshold
                boxes = boxes[high_conf_indices]
                scores = scores[high_conf_indices]
                labels = labels[high_conf_indices]

                if boxes.numel() > 0:  # Check if there are any boxes
                    # Apply NMS (the kept indices come back ordered by decreasing score)
                    keep = nms(boxes, scores, iou_threshold)
                    preds.append({
                        'boxes': boxes[keep],
                        'labels': labels[keep].int(),
                        'scores': scores[keep]
                    })
                else:
                    # If there are no boxes, just append an empty prediction
                    preds.append({
                        'boxes': torch.empty((0, 4), device=device),
                        'labels': torch.empty((0,), dtype=torch.int64, device=device),
                        'scores': torch.empty((0,), device=device)
                    })

            for target, pred in zip(formatted_targets, preds):
                if target['boxes'].numel() == 0:
                    # Negative sample: any sufficiently confident prediction is a false positive.
                    if (pred['scores'] > negative_score_threshold).any():
                        fp += 1
                    else:
                        tn += 1
                    continue

                # Update the metric with the current image
                map_metric.update([pred], [target])

                # Calculate IoU for each image
                if len(pred['boxes']) > 0 and len(target['boxes']) > 0:
                    iou_matrix = box_iou(pred['boxes'], target['boxes'])
                    # Score the top-k predictions (k = min(#predictions, #targets)) against
                    # their best-matching ground-truth box. Pairing prediction i with target i
                    # by index (the matrix diagonal) is arbitrary when there are several boxes.
                    top_k = min(iou_matrix.shape)
                    iou = iou_matrix[:top_k].max(dim=1).values.mean().item()
                else:
                    iou = 0.0
                iou_scores.append(iou)

    # Compute the final mAP score
    final_map = map_metric.compute()
    average_iou = sum(iou_scores) / len(iou_scores) if iou_scores else 0.0

    # Calculate specificity: TN / (TN + FP)
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 1.0

    print(f"Mean Average Precision (mAP): {final_map['map']}")
    print(f"Mean Average Precision (mAP_50): {final_map['map_50']}")
    print(f"Average IoU: {average_iou}")
    print(f"Precision on negative samples: {specificity:.4f}")

    return final_map, average_iou, specificity


def _update_class_counts(pred, target, iou_threshold, per_class_tp, per_class_fp, per_class_fn):
    """
    Updates the per-class TP/FP/FN counters for one image that has ground-truth boxes.

    Each prediction is compared with the ground-truth box it overlaps most. It is a
    true positive when that IoU exceeds ``iou_threshold``, the labels agree and the
    ground-truth box has not already been claimed by an earlier prediction; otherwise
    it is a false positive. Ground-truth boxes left unclaimed are false negatives.

    Args:
        pred: Dict with 'boxes' and 'labels' of the predictions, in the order they
            should claim ground-truth boxes.
        target: Dict with 'boxes' and 'labels' of the ground truth.
        iou_threshold: Minimum IoU (exclusive) for a match.
        per_class_tp, per_class_fp, per_class_fn: Counters keyed by label, updated in place.

    Returns:
        List with the best IoU of every prediction (empty if there are no predictions
        or no ground-truth boxes).
    """
    pred_labels = pred['labels'].tolist()
    gt_labels = target['labels'].tolist()
    best_ious = []
    claimed_gt = set()

    if len(pred['boxes']) > 0 and len(target['boxes']) > 0:
        iou_matrix = box_iou(pred['boxes'], target['boxes'])
        max_ious, max_indices = iou_matrix.max(dim=1)  # Best matching ground truth for each prediction
        best_ious = max_ious.tolist()

        for pred_label, iou, gt_idx in zip(pred_labels, best_ious, max_indices.tolist()):
            if iou > iou_threshold and pred_label == gt_labels[gt_idx] and gt_idx not in claimed_gt:
                claimed_gt.add(gt_idx)
                per_class_tp[pred_label] += 1
            else:
                # Poor overlap, wrong label, or a duplicate detection of a claimed box
                per_class_fp[pred_label] += 1

    # Ground-truth boxes that no prediction claimed (all of them if there were no predictions)
    for gt_idx, gt_label in enumerate(gt_labels):
        if gt_idx not in claimed_gt:
            per_class_fn[gt_label] += 1

    return best_ious


def evaluate_multi_retina(model, dataloader, device, iou_threshold=0.5, score_threshold=0.0):
    """
    Evaluates the given model on the test data and returns the Mean Average Precision (mAP) for images with targets,
    and the specificity for negative samples (images without targets). Also calculates precision, recall, and F1-score
    per label.

    Args:
        model: The model to evaluate. Called as ``model(images)``; element ``[1]`` of the result
            must hold one dict per image with 'bbox', 'scores' and 'labels' entries.
        dataloader: DataLoader containing the test data. Yields ``(images, targets)`` where
            ``targets`` is a dict holding per-image lists under 'bbox' and 'labels'.
        device: The device (CPU or GPU) to use for evaluation.
        iou_threshold: The IoU threshold for Non-Max Suppression (NMS) and true positive determination.
        score_threshold: The confidence score threshold for filtering predictions. A negative
            sample with any prediction above it counts as a false positive.

    Returns:
        final_map: The computed Mean Average Precision (mAP) metrics.
        average_iou: The average, over all predictions, of the IoU with their best-matching
            ground-truth box (images with targets but no predictions contribute a single 0.0).
        specificity: TN / (TN + FP) over negative samples (1.0 if there are none).
        class_metrics: Dictionary with precision, recall, and F1-score for each label.
    """

    map_metric = MeanAveragePrecision(extended_summary=True)
    iou_scores = []

    tn, fp = 0, 0  # For negative samples

    # Track true positives, false positives, and false negatives for each class
    per_class_tp = defaultdict(int)
    per_class_fp = defaultdict(int)
    per_class_fn = defaultdict(int)

    model.eval()
    model.to(device)

    with torch.no_grad():
        for images, targets in dataloader:
            images = images.to(device)
            targets = {k: [v.to(device) for v in t] for k, t in targets.items()}

            outputs = model(images)

            # Convert targets into the required format for the metric
            formatted_targets = [{'boxes': b, 'labels': l.int()} for b, l in zip(targets['bbox'], targets['labels'])]

            preds = []
            # The per-image detections are the second element of the model output.
            for output in outputs[1]:
                boxes, scores, labels = (
                    torch.tensor(value, device=device) if isinstance(value, list) else value
                    for value in (output['bbox'], output['scores'], output['labels'])
                )

                # Drop low-confidence predictions, as documented for score_threshold
                high_conf_indices = scores > score_threshold
                boxes = boxes[high_conf_indices]
                scores = scores[high_conf_indices]
                labels = labels[high_conf_indices]

                if boxes.numel() > 0:  # If there are predicted boxes
                    # Apply Non-Max Suppression (NMS); kept indices are ordered by decreasing score
                    keep = nms(boxes, scores, iou_threshold)
                    preds.append({
                        'boxes': boxes[keep],
                        'labels': labels[keep].int(),
                        'scores': scores[keep]
                    })
                else:
                    preds.append({
                        'boxes': torch.empty((0, 4), device=device),
                        'labels': torch.empty((0,), dtype=torch.int64, device=device),
                        'scores': torch.empty((0,), device=device)
                    })

            for target, pred in zip(formatted_targets, preds):
                if target['boxes'].numel() == 0:  # If no ground-truth boxes (negative samples)
                    if (pred['scores'] > score_threshold).any():
                        fp += 1
                    else:
                        tn += 1
                    continue

                # Update the mAP metric with the current image
                map_metric.update([pred], [target])

                best_ious = _update_class_counts(
                    pred, target, iou_threshold, per_class_tp, per_class_fp, per_class_fn
                )
                if best_ious:
                    # Average IoU uses every prediction, regardless of the threshold
                    iou_scores.extend(best_ious)
                else:
                    iou_scores.append(0.0)  # No predictions or ground truths, IoU = 0

    # Compute final mAP score
    final_map = map_metric.compute()
    average_iou = sum(iou_scores) / len(iou_scores) if iou_scores else 0.0
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 1.0

    # Calculate precision, recall, and F1-score for each label
    class_metrics = {}
    for label in sorted(set(per_class_tp) | set(per_class_fp) | set(per_class_fn)):
        # Distinct names so the negative-sample counters above are not shadowed
        label_tp = per_class_tp.get(label, 0)
        label_fp = per_class_fp.get(label, 0)
        label_fn = per_class_fn.get(label, 0)

        precision = label_tp / (label_tp + label_fp) if (label_tp + label_fp) > 0 else 0.0
        recall = label_tp / (label_tp + label_fn) if (label_tp + label_fn) > 0 else 0.0
        f1_score = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0

        class_metrics[label] = {
            'precision': precision,
            'recall': recall,
            'f1_score': f1_score
        }

    # Print or log the results
    print(f"Mean Average Precision (mAP): {final_map['map']:.4f}")
    print(f"Mean Average Precision (mAP@50): {final_map['map_50']:.4f}")
    print(f"Average IoU: {average_iou:.4f}")
    print(f"Precision on negative samples: {specificity:.4f}")

    print("\nPer Label Metrics:")
    for label, metrics in class_metrics.items():
        print(f"Label {label}: Precision: {metrics['precision']:.4f}, Recall: {metrics['recall']:.4f}, F1: {metrics['f1_score']:.4f}")

    return final_map, average_iou, specificity, class_metrics


Example of how to run an evaluation (in this case, we evaluate a RetinaNet model).

In [None]:
# Use checkpoint path to model you want to evaluate
checkpoint_path = '/vol/biomedic3/bglocker/mscproj24/mrm123/slurm_scripts/Retinanet Models UE0ns/4mb70yay/checkpoints/best_unfiltered_embed_0ns_0.ckpt'

# Anchor ratios and scales handed to the model when restoring the checkpoint.
# NOTE(review): presumably these must match the values used for training this checkpoint - confirm.
anchor_ratios = [1.0, 1.1912243160876392, 0.83947245409187]
anchor_scales = [0.6701667817494404, 0.43826872391648763, 1.0929571608034148]
model = RetinaNet.load_from_checkpoint(checkpoint_path, ratios=anchor_ratios, scales=anchor_scales)

test_dataset = RetinaDataset(csv_file='csv_files/unfiltered_embed_0ns/test.csv', augmentation=False)
# Evaluation gains nothing from shuffling; a fixed order keeps runs comparable.
test_loader = utils.data.DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate)

final_map, average_iou, ns_tn_rate = evaluate_model_retina(model, test_loader, device, iou_threshold=0.5, score_threshold=0.0)

In [None]:
import numpy as np

# Define a function to calculate the mean and uncertainty
def calculate_mean_uncertainty(metrics):
    """
    Summarises metrics collected from several trained copies of the same model.

    Args:
        metrics: dictionary mapping a metric name to the list of values obtained
            across runs. The 'map' entry is special: each of its values is a
            mapping whose 'map' and 'map_50' items expose ``.item()``.

    Returns:
        results: dictionary mapping each metric name to ``{'mean', 'std_dev'}``.
            The 'map' entry is expanded into separate 'map' and 'map_50' results.
    """
    def _summarise(samples):
        # Mean and (population) standard deviation of one metric across runs.
        return {'mean': np.mean(samples), 'std_dev': np.std(samples)}

    results = {}
    for key, values in metrics.items():
        if key != 'map':
            results[key] = _summarise(values)
            continue

        # The mAP output is a mapping per run; pull out the two scalars of interest.
        for sub_key in ('map', 'map_50'):
            results[sub_key] = _summarise([run[sub_key].item() for run in values])

    return results