# CSIRO - Image2Biomass with DINOv2 ViT
Fine-tunes Meta's DINOv2 Vision Transformer (via `timm`) on the CSIRO Image2Biomass dataset to predict five pasture biomass measurements from aerial imagery.


## Approach Overview
We load the provided train/test CSVs, reshape labels into per-image targets, and split folds by image. A DINOv2 backbone feeds a lightweight regression head, optimizing Smooth L1 losses across all biomass components. After training, we ensemble fold checkpoints to generate predictions and write the final `submission.csv`.


In [16]:
!pip install -q timm==0.9.16 albumentations==1.4.14 scikit-learn==1.3.2 scipy==1.11.4


In [27]:
import os
import math
import random
import time
from pathlib import Path
from dataclasses import dataclass, asdict
import numpy as np
import pandas as pd
from PIL import Image
import torch 
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from torch.optim.lr_scheduler import OneCycleLR
from sklearn.model_selection import GroupKFold
import timm
from timm.data import create_transform
print(torch.__version__)


2.6.0+cu124


In [29]:
@dataclass
class cfg:
    data_dir: Path = Path('/kaggle/input/csiro-biomass')
    output_dir: Path = Path('.')
    seed: int = 2024
    img_size: int = 518
    batch_size: int = 8
    num_workers: int = 4
    epochs: int = 5
    lr: float = 2e-4
    min_lr: float = 1e-6
    weight_decay: float = 1e-4
    backbone: str = 'vit_base_patch14_dinov2'
    checkpoint_dir: Path = Path('/kaggle/input/dinov2/pytorch/giant/1')
    hidden_dim: int = 512
    drop_rate: float = 0.1
    n_folds: int = 5
    train_folds: tuple = (0,)
    target_names: tuple = ('Dry_Clover_g','Dry_Dead_g','Dry_Green_g','Dry_Total_g','GDM_g')
    num_targets: int = 5
    use_amp: bool = True
    grad_accum_steps: int = 1

cfg = CFG()

possible_dirs = [
    Path('/kaggle/input/csiro-biomass'),
    Path('/kaggle/input/CSIRO-I2B'),
    Path('csiro-biomass'),
]
for p in possible_dirs:
    if p.exists():
        cfg.data_dir = p
        break
print(f"Using data from: {cfg.data_dir}")

Using data from: /kaggle/input/csiro-biomass


In [30]:
def seed_everything(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_everything(cfg.seed)


In [31]:
train_df = pd.read_csv(cfg.data_dir / 'train.csv')
test_df = pd.read_csv(cfg.data_dir / 'test.csv')

train_targets = train_df.pivot_table(
    index='image_path',
    columns='target_name',
    values='target'
).reset_index().copy()
train_targets.columns = ['image_path', *list(cfg.target_names)]
train_targets[list(cfg.target_names)] = train_targets[list(cfg.target_names)].fillna(0.0).astype(float)

train_targets['fold'] = -1
kf = GroupKFold(n_splits=cfg.n_folds)
for fold, (_, val_idx) in enumerate(kf.split(train_targets, groups=train_targets['image_path'])):
    train_targets.loc[val_idx, 'fold'] = fold

print(train_targets.head())
print(train_targets['fold'].value_counts())

               image_path  Dry_Clover_g  Dry_Dead_g  Dry_Green_g  Dry_Total_g  \
0  train/ID1011485656.jpg        0.0000     31.9984      16.2751      48.2735   
1  train/ID1012260530.jpg        0.0000      0.0000       7.6000       7.6000   
2  train/ID1025234388.jpg        6.0500      0.0000       0.0000       6.0500   
3  train/ID1028611175.jpg        0.0000     30.9703      24.2376      55.2079   
4  train/ID1035947949.jpg        0.4343     23.2239      10.5261      34.1844   

     GDM_g  fold  
0  16.2750     0  
1   7.6000     4  
2   6.0500     3  
3  24.2376     2  
4  10.9605     1  
fold
0    72
1    72
4    71
3    71
2    71
Name: count, dtype: int64


In [32]:
class BiomassDataset(Dataset):
    def __init__(self, df: pd.DataFrame, mode: str, transform=None):
        self.df = df.reset_index(drop=True)
        self.mode = mode
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        image_path = cfg.data_dir / row['image_path']
        image = Image.open(image_path).convert('RGB')
        if self.transform:
            image = self.transform(image)
        else:
            image = transforms.ToTensor()(image)
        sample = {
            'pixel_values': image,
            'image_path': row['image_path']
        }
        if self.mode != 'test':
            targets = row[list(cfg.target_names)].astype(float).values
            target = torch.tensor(targets, dtype=torch.float32)
            sample['targets'] = target
        return sample

In [33]:
from torchvision import transforms

def get_transforms(is_train=True):
    return create_transform(
        input_size=(3, cfg.img_size, cfg.img_size),
        is_training=is_train,
        auto_augment='rand-m9-mstd0.5-inc1' if is_train else None,
        interpolation='bicubic',
        re_prob=0.25 if is_train else 0.0,
        re_mode='pixel',
        re_count=1,
    )

In [34]:
class DinoRegressor(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.backbone = timm.create_model(
            cfg.backbone,
            pretrained=False,
            num_classes=0,
            global_pool='' 
        )
        checkpoint_path = None
        if cfg.checkpoint_dir is not None:
            ckpt_dir = Path(cfg.checkpoint_dir)
            if ckpt_dir.exists():
                for pattern in ('*.pth', '*.pt', '*.bin', '*.safetensors'):
                    candidates = sorted(ckpt_dir.rglob(pattern))
                    if candidates:
                        checkpoint_path = candidates[0]
                        break
        if checkpoint_path is None:
            raise FileNotFoundError('No DINOv2 checkpoint found under ' + str(cfg.checkpoint_dir))
        state = torch.load(checkpoint_path, map_location='cpu')
        if isinstance(state, dict):
            for key in ('state_dict', 'model', 'model_state'):
                if key in state:
                    state = state[key]
                    break
        missing, unexpected = self.backbone.load_state_dict(state, strict=False)
        if missing or unexpected:
            print('Loaded checkpoint with missing keys:', missing)
            print('Unexpected keys:', unexpected)
        else:
            print(f'Successfully loaded weights from {checkpoint_path}')
        in_features = self.backbone.num_features
        self.head = nn.Sequential(
            nn.LayerNorm(in_features),
            nn.Dropout(cfg.drop_rate),
            nn.Linear(in_features, cfg.hidden_dim),
            nn.GELU(),
            nn.Dropout(cfg.drop_rate),
            nn.Linear(cfg.hidden_dim, cfg.num_targets)
        )

    def forward(self, pixel_values):
        feats = self.backbone(pixel_values)
        if feats.ndim == 3:
            feats = feats.mean(dim=1)
        return self.head(feats)

In [35]:
class AverageMeter:
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count if self.count != 0 else 0

def get_dataloaders(train_df, valid_df):
    train_dataset = BiomassDataset(train_df, mode='train', transform=get_transforms(True))
    valid_dataset = BiomassDataset(valid_df, mode='valid', transform=get_transforms(False))

    train_loader = DataLoader(
        train_dataset,
        batch_size=cfg.batch_size,
        shuffle=True,
        num_workers=cfg.num_workers,
        pin_memory=True,
        drop_last=True,
    )
    valid_loader = DataLoader(
        valid_dataset,
        batch_size=cfg.batch_size,
        shuffle=False,
        num_workers=cfg.num_workers,
        pin_memory=True,
        drop_last=False,
    )
    return train_loader, valid_loader


In [36]:
def train_one_epoch(model, loader, criterion, optimizer, scheduler, scaler, device):
    model.train()
    loss_meter = AverageMeter()

    for step, batch in enumerate(loader):
        images = batch['pixel_values'].to(device, non_blocking=True)
        targets = batch['targets'].to(device, non_blocking=True)
        optimizer.zero_grad()
        with torch.cuda.amp.autocast(enabled=cfg.use_amp):
            preds = model(images)
            loss = criterion(preds, targets)
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        if scheduler is not None:
            scheduler.step()
        loss_meter.update(loss.item(), images.size(0))
    return loss_meter.avg


def validate_one_epoch(model, loader, criterion, device):
    model.eval()
    loss_meter = AverageMeter()
    preds_list = []
    with torch.no_grad():
        for batch in loader:
            images = batch['pixel_values'].to(device, non_blocking=True)
            targets = batch['targets'].to(device, non_blocking=True)
            with torch.cuda.amp.autocast(enabled=cfg.use_amp):
                preds = model(images)
                loss = criterion(preds, targets)
            loss_meter.update(loss.item(), images.size(0))
            preds_list.append(preds.detach().cpu())
    predictions = torch.cat(preds_list).numpy()
    return loss_meter.avg, predictions


In [37]:
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
criterion = nn.SmoothL1Loss()
oof_predictions = np.zeros((len(train_targets), cfg.num_targets), dtype=np.float32)
test_images = pd.DataFrame({'image_path': test_df['image_path'].unique()})

all_fold_models = []

for fold in range(cfg.n_folds):
    if fold not in cfg.train_folds:
        continue
    print(f"===== Fold {fold} =====")
    train_split = train_targets[train_targets['fold'] != fold].reset_index(drop=True)
    valid_split = train_targets[train_targets['fold'] == fold].reset_index(drop=True)
    train_loader, valid_loader = get_dataloaders(train_split, valid_split)

    model = DinoRegressor(cfg).to(DEVICE)
    optimizer = AdamW(model.parameters(), lr=cfg.lr, weight_decay=cfg.weight_decay)
    steps_per_epoch = len(train_loader)
    scheduler = OneCycleLR(
        optimizer,
        max_lr=cfg.lr,
        epochs=cfg.epochs,
        steps_per_epoch=steps_per_epoch,
        pct_start=0.1,
        div_factor=25,
        final_div_factor=cfg.lr / cfg.min_lr,
    )
    scaler = torch.cuda.amp.GradScaler(enabled=cfg.use_amp)

    best_loss = float('inf')
    best_state = None

    for epoch in range(cfg.epochs):
        start = time.time()
        train_loss = train_one_epoch(model, train_loader, criterion, optimizer, scheduler, scaler, DEVICE)
        val_loss, val_preds = validate_one_epoch(model, valid_loader, criterion, DEVICE)
        duration = time.time() - start
        print(f"Epoch {epoch+1}/{cfg.epochs} | train: {train_loss:.4f} | valid: {val_loss:.4f} | {duration:.1f}s")
        if val_loss < best_loss:
            best_loss = val_loss
            best_state = model.state_dict()
    model.load_state_dict(best_state)
    all_fold_models.append(best_state)

    fold_idx = valid_split.index
    oof_predictions[fold_idx] = val_preds

np.save(cfg.output_dir / 'oof_predictions.npy', oof_predictions)


===== Fold 0 =====
Loaded checkpoint with missing keys: ['cls_token', 'pos_embed', 'patch_embed.proj.weight', 'patch_embed.proj.bias', 'blocks.0.norm1.weight', 'blocks.0.norm1.bias', 'blocks.0.attn.qkv.weight', 'blocks.0.attn.qkv.bias', 'blocks.0.attn.proj.weight', 'blocks.0.attn.proj.bias', 'blocks.0.ls1.gamma', 'blocks.0.norm2.weight', 'blocks.0.norm2.bias', 'blocks.0.mlp.fc1.weight', 'blocks.0.mlp.fc1.bias', 'blocks.0.mlp.fc2.weight', 'blocks.0.mlp.fc2.bias', 'blocks.0.ls2.gamma', 'blocks.1.norm1.weight', 'blocks.1.norm1.bias', 'blocks.1.attn.qkv.weight', 'blocks.1.attn.qkv.bias', 'blocks.1.attn.proj.weight', 'blocks.1.attn.proj.bias', 'blocks.1.ls1.gamma', 'blocks.1.norm2.weight', 'blocks.1.norm2.bias', 'blocks.1.mlp.fc1.weight', 'blocks.1.mlp.fc1.bias', 'blocks.1.mlp.fc2.weight', 'blocks.1.mlp.fc2.bias', 'blocks.1.ls2.gamma', 'blocks.2.norm1.weight', 'blocks.2.norm1.bias', 'blocks.2.attn.qkv.weight', 'blocks.2.attn.qkv.bias', 'blocks.2.attn.proj.weight', 'blocks.2.attn.proj.bias',

  scaler = torch.cuda.amp.GradScaler(enabled=cfg.use_amp)
  with torch.cuda.amp.autocast(enabled=cfg.use_amp):
  with torch.cuda.amp.autocast(enabled=cfg.use_amp):


Epoch 1/5 | train: 22.1410 | valid: 17.3100 | 25.5s
Epoch 2/5 | train: 14.8649 | valid: 14.7429 | 26.1s
Epoch 3/5 | train: 13.5501 | valid: 14.5006 | 26.5s
Epoch 4/5 | train: 13.6078 | valid: 14.4928 | 26.8s
Epoch 5/5 | train: 13.5618 | valid: 14.5186 | 27.3s


In [38]:
@torch.no_grad()
def infer(model_states, df):
    models = []
    for state in model_states:
        model = DinoRegressor(cfg).to(DEVICE)
        model.load_state_dict(state, strict=True)
        model.eval()
        models.append(model)
    dataset = BiomassDataset(df, mode='test', transform=get_transforms(False))
    loader = DataLoader(
        dataset,
        batch_size=cfg.batch_size,
        shuffle=False,
        num_workers=cfg.num_workers,
        pin_memory=True,
    )
    image_preds = {}
    for batch in loader:
        images = batch['pixel_values'].to(DEVICE, non_blocking=True)
        preds = torch.zeros((images.size(0), cfg.num_targets), device=DEVICE)
        for model in models:
            preds += model(images)
        preds /= len(models)
        preds = preds.detach().cpu().numpy()
        for path, pred in zip(batch['image_path'], preds):
            image_preds[path] = pred
    return image_preds

ensemble_preds = infer(all_fold_models, test_images)


Loaded checkpoint with missing keys: ['cls_token', 'pos_embed', 'patch_embed.proj.weight', 'patch_embed.proj.bias', 'blocks.0.norm1.weight', 'blocks.0.norm1.bias', 'blocks.0.attn.qkv.weight', 'blocks.0.attn.qkv.bias', 'blocks.0.attn.proj.weight', 'blocks.0.attn.proj.bias', 'blocks.0.ls1.gamma', 'blocks.0.norm2.weight', 'blocks.0.norm2.bias', 'blocks.0.mlp.fc1.weight', 'blocks.0.mlp.fc1.bias', 'blocks.0.mlp.fc2.weight', 'blocks.0.mlp.fc2.bias', 'blocks.0.ls2.gamma', 'blocks.1.norm1.weight', 'blocks.1.norm1.bias', 'blocks.1.attn.qkv.weight', 'blocks.1.attn.qkv.bias', 'blocks.1.attn.proj.weight', 'blocks.1.attn.proj.bias', 'blocks.1.ls1.gamma', 'blocks.1.norm2.weight', 'blocks.1.norm2.bias', 'blocks.1.mlp.fc1.weight', 'blocks.1.mlp.fc1.bias', 'blocks.1.mlp.fc2.weight', 'blocks.1.mlp.fc2.bias', 'blocks.1.ls2.gamma', 'blocks.2.norm1.weight', 'blocks.2.norm1.bias', 'blocks.2.attn.qkv.weight', 'blocks.2.attn.qkv.bias', 'blocks.2.attn.proj.weight', 'blocks.2.attn.proj.bias', 'blocks.2.ls1.gamm

In [39]:
submission = test_df.copy()
submission['target'] = submission.apply(
    lambda row: ensemble_preds[row['image_path']][cfg.target_names.index(row['target_name'])],
    axis=1
)
submission[['sample_id', 'target']].to_csv('submission.csv', index=False)
submission.head()


Unnamed: 0,sample_id,image_path,target_name,target
0,ID1001187975__Dry_Clover_g,test/ID1001187975.jpg,Dry_Clover_g,1.690462
1,ID1001187975__Dry_Dead_g,test/ID1001187975.jpg,Dry_Dead_g,8.660296
2,ID1001187975__Dry_Green_g,test/ID1001187975.jpg,Dry_Green_g,21.079224
3,ID1001187975__Dry_Total_g,test/ID1001187975.jpg,Dry_Total_g,40.596779
4,ID1001187975__GDM_g,test/ID1001187975.jpg,GDM_g,26.444044
