# Comparing DINOv3 Backbones within neurOS

This notebook demonstrates how to integrate the **DINOv3** backbone into the
`neurOS` framework and how to compare different model sizes on synthetic
neuroscience datasets.  We implement a simple segmentation task where the
goal is to predict a binary mask (foreground vs. background) from image
patches.  Although the backbone here is a deterministic placeholder, the
pipeline mirrors what you would do with real DINOv3 weights:

1. Instantiate the backbone for a given variant (e.g. ConvNeXt‑Tiny or ViT‑Large).
2. Generate or load images and corresponding binary masks.
3. Extract patch features for each image.
4. Train a classifier (here, logistic regression) on the patch features.
5. Evaluate the classifier on a held‑out test set using metrics like
   accuracy and F1 score.

We repeat this procedure for three synthetic modalities—electron microscopy
(EM), MRI and histology—and compare the performance of the **CNX‑Tiny** and
**ViT‑Large** variants.  Because the placeholder's features are randomised
rather than produced by real DINOv3 weights, any difference you observe is
due to chance; however, the example shows how to structure your experiments.


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

from neuros.plugins.cv.dinov3_backbone import DINOv3Backbone


def generate_synthetic_dataset(num_images: int, size: int = 256, mode: str = "em", seed: int = 42):
    '''
    Generate a synthetic dataset of images and binary masks.

    The random generator is created inside the function from ``seed``, so two
    calls with the same arguments return identical datasets.

    Parameters
    ----------
    num_images: int
        Number of images to generate.
    size: int, optional
        Spatial resolution of the square images.
    mode: str, optional
        The type of modality to simulate.  Supported values are:
        ``"em"`` for electron microscopy (Gaussian noise with 3-6 brightened
        disks), ``"mri"`` for MRI (diagonal intensity gradient with a
        saturated central disk) and ``"histology"`` for histology (bright
        noisy background with one darker disk).
    seed: int, optional
        Seed passed to ``numpy.random.default_rng``.  The default (42)
        reproduces the historical, fixed-seed behaviour.

    Returns
    -------
    images: list of numpy.ndarray
        ``uint8`` RGB images of shape (size, size, 3) with values in [0, 255].
    masks: list of numpy.ndarray
        ``int32`` binary masks of shape (size, size) with values 0 or 1.

    Raises
    ------
    ValueError
        If ``mode`` is not one of the supported values.  The check runs
        before any image is generated.

    Notes
    -----
    The ``"mri"`` branch draws nothing from the random generator, so every
    MRI image/mask pair in the returned lists is identical.
    '''
    if mode not in ("em", "mri", "histology"):
        raise ValueError(f"Unsupported mode: {mode}")

    rng = np.random.default_rng(seed)
    # Pixel coordinate grids do not depend on the image, so build them once
    # instead of once per disk.
    rows, cols = np.ogrid[:size, :size]

    images = []
    masks = []
    for _ in range(num_images):
        mask = np.zeros((size, size), dtype=np.int32)
        if mode == "em":
            # EM: single-channel Gaussian noise with brightened circular spots.
            img = rng.normal(loc=0.5, scale=0.15, size=(size, size, 1)).clip(0, 1)
            num_spots = rng.integers(3, 7)
            # ``integers`` needs low < high; for size < 10 both bounds would
            # collapse to 0, so widen the upper bound by one in that case.
            r_low = size // 20
            r_high = max(r_low + 1, size // 10)
            for _spot in range(num_spots):
                cx, cy = rng.integers(0, size, size=2)
                r = rng.integers(r_low, r_high)
                circle = (cols - cx) ** 2 + (rows - cy) ** 2 <= r ** 2
                mask[circle] = 1
                img[circle] += 0.5
            img = img.clip(0, 1)
            img_rgb = np.repeat(img, 3, axis=2)
        elif mode == "mri":
            # MRI: smooth diagonal gradient with a saturated central lesion.
            axis = np.linspace(-1, 1, size)
            X, Y = np.meshgrid(axis, axis)
            img = 0.5 + 0.5 * (X + Y) / 2
            lesion = (X ** 2 + Y ** 2) < 0.2 ** 2
            mask[lesion] = 1
            img[lesion] = 1.0
            img_rgb = np.repeat(img[:, :, None], 3, axis=2)
        else:
            # Histology: bright noisy background with a darker "nucleus" disk.
            base = rng.uniform(0.8, 1.0, size=(size, size, 3))
            noise = rng.normal(0, 0.05, size=(size, size, 3))
            img_rgb = (base + noise).clip(0, 1)
            # Same low < high guard as above; only matters for size < 2.
            c_low = size // 4
            c_high = max(c_low + 1, 3 * size // 4)
            cx, cy = rng.integers(c_low, c_high, size=2)
            r = size // 6
            nucleus = (cols - cx) ** 2 + (rows - cy) ** 2 <= r ** 2
            mask[nucleus] = 1
            img_rgb[nucleus] -= 0.4
            img_rgb = img_rgb.clip(0, 1)
        images.append((img_rgb * 255).astype(np.uint8))
        masks.append(mask)
    return images, masks


def flatten_dataset(images, masks, backbone):
    '''
    Convert images and masks into patch features and labels for classifier training.

    Each image is embedded on its own with ``backbone.embed([img])[0]``.  The
    top-left ``grid_size * patch_size`` square of the matching mask is split
    into ``patch_size`` x ``patch_size`` tiles, and a tile is labelled 1 only
    when strictly more than half of its pixels are foreground (an exact tie
    is labelled 0).

    Parameters
    ----------
    images: list of numpy.ndarray
        List of RGB images.
    masks: list of numpy.ndarray
        Corresponding binary masks, one 2-D array per image.
    backbone: DINOv3Backbone
        Backbone instance used to produce features.  Only its ``embed``
        method and its ``grid_size`` and ``patch_size`` attributes are used.

    Returns
    -------
    features: numpy.ndarray
        Array of shape (M, C) where M is the total number of patches across
        images and C is the feature dimension.  For empty input the shape is
        (0, 0) because the feature dimension cannot be known without running
        the backbone.
    labels: numpy.ndarray
        Binary ``int32`` labels of length M.

    Raises
    ------
    ValueError
        If ``images`` and ``masks`` differ in length, if a mask is smaller
        than the patch grid, or if the backbone returns a number of patch
        features different from ``grid_size ** 2``.
    '''
    if len(images) != len(masks):
        # zip() would silently drop the unmatched tail.
        raise ValueError(
            f"images and masks must have the same length, "
            f"got {len(images)} and {len(masks)}"
        )
    if len(images) == 0:
        # np.concatenate rejects an empty list of arrays.
        return np.empty((0, 0)), np.empty((0,), dtype=np.int32)

    all_feats = []
    all_labels = []
    for img, mask in zip(images, masks):
        patch_feats = backbone.embed([img])[0]  # (N, C)
        # Read the geometry after embedding, as the original code did, in
        # case the backbone updates it per input.
        grid_size = backbone.grid_size
        patch_size = backbone.patch_size
        extent = grid_size * patch_size

        mask = np.asarray(mask)
        if mask.shape[0] < extent or mask.shape[1] < extent:
            raise ValueError(
                f"mask of shape {mask.shape} is smaller than the "
                f"{extent}x{extent} area covered by the patch grid"
            )
        if len(patch_feats) != grid_size * grid_size:
            # Otherwise features and labels would be silently misaligned.
            raise ValueError(
                f"backbone returned {len(patch_feats)} patch features, "
                f"expected {grid_size * grid_size}"
            )

        # Downsample mask to patch grid by majority voting.
        mask_cropped = mask[:extent, :extent]
        tiles = mask_cropped.reshape(grid_size, patch_size, grid_size, patch_size)
        foreground_per_patch = tiles.sum(axis=(1, 3))
        patch_labels = (foreground_per_patch > (patch_size * patch_size / 2)).astype(np.int32)

        all_feats.append(patch_feats)
        all_labels.append(patch_labels.flatten())
    return np.concatenate(all_feats, axis=0), np.concatenate(all_labels, axis=0)


def train_and_evaluate(backbone_id, mode, *, num_images=12, num_train=8, size=128):
    '''
    Train a logistic regression on patch features and evaluate on test data.

    A synthetic dataset is generated for ``mode``, the first ``num_train``
    images are used for training and the remaining ones for testing.  When
    the training labels contain fewer than two classes no classifier is
    fitted; the single training class is predicted for every test patch
    instead.

    Parameters
    ----------
    backbone_id: str
        Identifier passed to ``DINOv3Backbone(model_id=...)``.
    mode: str
        Modality passed to ``generate_synthetic_dataset``.
    num_images: int, optional
        Total number of synthetic images to generate (keyword-only).
    num_train: int, optional
        Number of leading images used for training (keyword-only).
    size: int, optional
        Side length of the generated images (keyword-only).

    Returns
    -------
    acc: float
        Patch-level accuracy on the test images.
    f1: float
        Patch-level F1 score of the positive class on the test images;
        0 when the score is undefined (no positive labels or predictions).

    Raises
    ------
    ValueError
        If ``num_train`` does not leave at least one training image and one
        test image.
    '''
    if not 0 < num_train < num_images:
        raise ValueError(
            f"num_train must satisfy 0 < num_train < num_images, "
            f"got num_train={num_train}, num_images={num_images}"
        )

    backbone = DINOv3Backbone(model_id=backbone_id)
    images, masks = generate_synthetic_dataset(num_images, size=size, mode=mode)

    # Leading images train the classifier, the remainder are held out.
    X_train, y_train = flatten_dataset(images[:num_train], masks[:num_train], backbone)
    X_test, y_test = flatten_dataset(images[num_train:], masks[num_train:], backbone)

    if len(np.unique(y_train)) < 2:
        # Skip fitting on single-class data and fall back to a constant
        # baseline.  With a single class every training label is the
        # majority label.
        majority = y_train[0] if len(y_train) > 0 else 0
        preds = np.full_like(y_test, majority)
    else:
        clf = LogisticRegression(max_iter=200)
        clf.fit(X_train, y_train)
        preds = clf.predict(X_test)

    acc = accuracy_score(y_test, preds)
    # zero_division=0 yields 0 (without a warning) when F1 is undefined, and
    # the real score otherwise -- including for the constant baseline above.
    f1 = f1_score(y_test, preds, zero_division=0)
    return acc, f1


In [None]:
# Run every (modality, backbone variant) pair and collect accuracy / F1.
# `results` is keyed first by modality, then by the variant's display name.
modalities = ["em", "mri", "histology"]
model_variants = {"CNX-Tiny": "cnx-tiny", "ViT-Large": "vit-large"}
results = {}
for mod in modalities:
    per_model = results.setdefault(mod, {})
    for name in model_variants:
        acc, f1 = train_and_evaluate(model_variants[name], mod)
        per_model[name] = dict(accuracy=acc, f1=f1)
        print(f"{name} on {mod}: accuracy={acc:.3f}, F1={f1:.3f}")
# Bare expression so the notebook displays the collected metrics.
results


In [None]:
# Draw one bar chart per metric, side by side.  Within each chart the bars
# are ordered modality-major, with one bar per model variant.
fig, axes = plt.subplots(1, 2, figsize=(10, 4))
for ax, metric in zip(axes, ["accuracy", "f1"]):
    heights = [
        results[mod][name][metric]
        for mod in modalities
        for name in model_variants
    ]
    # Tick label is "<modality>-<first part of the variant name>".
    tick_labels = [
        f"{mod}-{name.split('-')[0]}"
        for mod in modalities
        for name in model_variants
    ]
    positions = range(len(heights))
    ax.bar(positions, heights)
    ax.set_xticks(positions)
    ax.set_xticklabels(tick_labels, rotation=45, ha='right')
    ax.set_ylabel(metric.capitalize())
    ax.set_title(f"{metric.capitalize()} by modality and model")
    # Both metrics lie in [0, 1]; a fixed range keeps the panels comparable.
    ax.set_ylim(0, 1)
plt.tight_layout()
plt.show()
