# V123: MLP Head LP-FT KFold

**V121 的問題**: Linear head 是瓶頸 (R²=0.4099)。MLP 可以捕捉非線性特徵組合。

**Strategy**: 2-stage，但 Stage 1 改為 MLP 訓練（而非 Ridge）
1. **Stage 1 (LP)**: 凍結 BioCLIP backbone → 只訓練 MLP head（用快取特徵，快）
2. **Stage 2 (FT)**: 載入 Stage 1 MLP 權重 → 解凍 backbone → fine-tune (backbone LR=1e-6)

**MLP Head**: `Linear(768,256) → BN → ReLU → Dropout(0.1) → Linear(256,3)`

**與 V121 的差異**:
- Stage 1 用 MLP on cached features（非 Ridge）
- Head 有非線性層
- 共用 V121 的 features_cache.pkl（省 5 分鐘）

In [None]:
# Cell 1: Setup
# Installs the runtime dependencies, mounts Google Drive, and defines the
# project paths used by every later cell. Colab-only: relies on the IPython
# `!` shell escape and on `google.colab`.
!pip install -q open_clip_torch datasets transformers scikit-learn

from google.colab import drive
drive.mount('/content/drive')

# PROJECT:  root folder of the project on the mounted Drive.
# SAVE_DIR: where this notebook (V123) writes caches, checkpoints and results.
# V121_DIR: output folder of the earlier V121 run; only read from, to reuse
#           its feature cache.
PROJECT  = '/content/drive/MyDrive/Hackathon_NSF_Beetles'
SAVE_DIR = f'{PROJECT}/4_models/v123_mlp_lpft_kfold'
V121_DIR = f'{PROJECT}/4_models/v121_lpft_kfold'  # reuse feature cache

import os, json, time, pickle, math
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.cuda.amp import autocast, GradScaler
from torch.utils.data import DataLoader, TensorDataset
from torchvision import transforms
from sklearn.model_selection import KFold
from sklearn.linear_model import RidgeCV
from sklearn.metrics import r2_score
from datasets import load_dataset
import open_clip

# Select the compute device once; every later cell moves models/tensors to it.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Device: {device}')
if device == 'cuda':
    # Report the GPU model and its total memory (GB) for the run log.
    print(f'GPU: {torch.cuda.get_device_name()}')
    print(f'Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB')

# Create the output folder up front so later cells can write into it directly.
os.makedirs(SAVE_DIR, exist_ok=True)
print(f'Save dir: {SAVE_DIR}')

In [None]:
# Cell 2: Load dataset + BioCLIP preprocess
# Downloads/loads the Hugging Face dataset; the 'train' and 'validation'
# splits are both used below.
print('Loading sentinel-beetles dataset...')
ds = load_dataset('imageomics/sentinel-beetles')
print(f'Train: {len(ds["train"])}, Val: {len(ds["validation"])}')

# Only the third return value (an image preprocessing transform) is kept; the
# model instantiated by this call is discarded.
# NOTE(review): in open_clip the third value is the inference-time transform —
# confirm against the installed version.
_, _, bioclip_preprocess = open_clip.create_model_and_transforms(
    'hf-hub:imageomics/bioclip-2', output_dict=True, require_pretrained=True)

# Light augmentation pipeline (random horizontal flip + mild colour jitter).
# NOTE(review): not referenced by any other cell visible here —
# BeetleImageDataset defines its own identical `_aug`.
train_augmentation = transforms.Compose([
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.1),
])

# Dataset columns used as the three regression targets; their order fixes the
# column order of every label / prediction array in this notebook.
TARGETS = ['SPEI_30d', 'SPEI_1y', 'SPEI_2y']

class SimpleConcat:
    """Read-only view that presents two indexable datasets as one sequence.

    Items ``0 .. len(ds1) - 1`` come from ``ds1``; the remaining indices map
    onto ``ds2``. Nothing is copied: every lookup is forwarded to the
    underlying dataset.

    Args:
        ds1: first dataset; must support ``len()`` and integer indexing.
        ds2: second dataset; same requirements.
    """

    def __init__(self, ds1, ds2):
        # len(ds1) is cached because it is the split point of every lookup.
        self.ds1, self.ds2, self.len1 = ds1, ds2, len(ds1)

    def __len__(self):
        return self.len1 + len(self.ds2)

    def __getitem__(self, idx):
        """Return the item at position ``idx`` of the concatenation.

        Accepts anything ``int()`` can convert (e.g. NumPy integers).
        Negative indices count from the end of the *combined* sequence.

        Raises:
            IndexError: if ``idx`` is outside ``[-len(self), len(self))``.
        """
        idx = int(idx)
        total = len(self)
        if idx < 0:
            # Normalise against the combined length; otherwise a negative
            # index would be forwarded to ds1 and silently pick from it.
            idx += total
        if not 0 <= idx < total:
            raise IndexError(f'index out of range for SimpleConcat of length {total}')
        return self.ds1[idx] if idx < self.len1 else self.ds2[idx - self.len1]

# Merge the 'train' and 'validation' splits into one indexable pool; the
# K-fold split later in the notebook is drawn over range(N).
full_ds = SimpleConcat(ds['train'], ds['validation'])
N = len(full_ds)
print(f'Total samples: {N}')

In [None]:
# Cell 3: Load frozen features
# Produces `all_features` (N, 768) and `all_labels` (N, 3), either from a
# cache on Drive or by running the frozen BioCLIP backbone over every image.
# The V121 cache is preferred: per the original note it comes from the same
# backbone, so the features are interchangeable.
V121_CACHE = f'{V121_DIR}/features_cache.pkl'
V123_CACHE = f'{SAVE_DIR}/features_cache.pkl'

# Set only when THIS execution of the cell computes the features itself.
# An explicit flag (instead of probing the kernel namespace) guarantees that a
# re-run always reloads from the cache it reports using, rather than silently
# keeping whatever `all_features` was left over from an earlier run.
features_freshly_extracted = False

if os.path.exists(V121_CACHE):
    print(f'Reusing V121 cache: {V121_CACHE}')
    cache_path = V121_CACHE
elif os.path.exists(V123_CACHE):
    print(f'Using V123 cache: {V123_CACHE}')
    cache_path = V123_CACHE
else:
    print('No cache found → extracting features (~5 min)...')
    bioclip_model, _, _ = open_clip.create_model_and_transforms(
        'hf-hub:imageomics/bioclip-2', output_dict=True, require_pretrained=True)
    bioclip_model = bioclip_model.to(device).eval()

    all_features, all_labels = [], []
    batch_size = 64
    for start in range(0, N, batch_size):
        end = min(start + batch_size, N)
        images, labels = [], []
        for i in range(start, end):
            row = full_ds[i]
            # 'file_path' holds the image object itself (it is .convert()-ed).
            images.append(bioclip_preprocess(row['file_path'].convert('RGB')))
            labels.append([row[t] for t in TARGETS])
        images_t = torch.stack(images).to(device)
        with torch.no_grad(), autocast():
            feat = bioclip_model(images_t)['image_features']
        # Store as float32 regardless of the autocast compute dtype.
        all_features.append(feat.float().cpu().numpy())
        all_labels.append(np.array(labels))
        if (start // batch_size) % 50 == 0:
            print(f'  {start}/{N} ({100*start/N:.0f}%)')

    all_features = np.concatenate(all_features)
    all_labels   = np.concatenate(all_labels)
    with open(V123_CACHE, 'wb') as f:
        pickle.dump({'features': all_features, 'labels': all_labels}, f)
    cache_path = V123_CACHE
    features_freshly_extracted = True
    # Free the extraction-only backbone before the training cells run.
    del bioclip_model; torch.cuda.empty_cache()

if not features_freshly_extracted:
    # NOTE(security): pickle.load can execute arbitrary code. Only load cache
    # files that were written by this project.
    with open(cache_path, 'rb') as f:
        cache = pickle.load(f)
    all_features = cache['features']
    all_labels   = cache['labels']

# === Alignment Safety Guard ===
# The cached sample count must match the current dataset exactly, otherwise
# the OOF folds would be misaligned with the labels.
assert all_features.shape[0] == N, (
    f'Cache size mismatch! cache={all_features.shape[0]}, dataset={N}. '
    f'Delete {cache_path} and re-extract.'
)
assert all_features.shape[1] == 768, f'Feature dim mismatch: {all_features.shape[1]} != 768'
assert all_labels.shape == (N, 3), f'Label shape mismatch: {all_labels.shape}'
print(f'Alignment check passed: {N} samples, features {all_features.shape}')

print(f'Feature std: {all_features.std():.4f}')

In [None]:
# Cell 3b: Pre-cache images as uint8 tensors so that Stage 2 does not leave
# the GPU idle waiting on image I/O.
#
# Only Resize(224, BICUBIC) + CenterCrop(224) are applied here, giving one
# uint8 tensor of shape (3, 224, 224) per image (~150 KB each).
# Augmentation (RandomFlip + ColorJitter) and Normalize are still applied
# on the fly in BeetleImageDataset.__getitem__.
# Original estimate: ~2000 images ≈ 300 MB of RAM, which fits comfortably on Colab.
#
IMG_CACHE_PATH = f'{SAVE_DIR}/image_cache.pt'

# Deterministic geometry-only preprocessing; output stays uint8 to keep the
# cache small.
_resize_crop = transforms.Compose([
    transforms.Resize(224, interpolation=transforms.InterpolationMode.BICUBIC),
    transforms.CenterCrop(224),
    transforms.PILToTensor(),  # → uint8 (3, 224, 224)
])

if os.path.exists(IMG_CACHE_PATH):
    # Fast path: reuse the tensor saved by an earlier run.
    print(f'Loading image cache from Drive...')
    t0 = time.time()
    image_cache = torch.load(IMG_CACHE_PATH, map_location='cpu')
    print(f'Loaded: {image_cache.shape} {image_cache.dtype}  '
          f'{image_cache.element_size() * image_cache.nelement() / 1e6:.0f} MB  '
          f'({time.time()-t0:.1f}s)')
else:
    # Slow path: decode every image once, fill a preallocated tensor in
    # dataset order, then persist it for later runs.
    print(f'Building image cache ({N} images)...')
    t0 = time.time()
    image_cache = torch.zeros((N, 3, 224, 224), dtype=torch.uint8)
    for i in range(N):
        row = full_ds[i]
        image_cache[i] = _resize_crop(row['file_path'].convert('RGB'))
        if i % 500 == 0:
            print(f'  {i}/{N}  ({time.time()-t0:.0f}s)')
    torch.save(image_cache, IMG_CACHE_PATH)
    mb = image_cache.element_size() * image_cache.nelement() / 1e6
    print(f'Done: {mb:.0f} MB → {IMG_CACHE_PATH}  ({time.time()-t0:.0f}s)')

# Shape check: the cache must hold exactly one entry per dataset sample so it
# can be indexed with the same indices as all_features / all_labels.
assert image_cache.shape == (N, 3, 224, 224), \
    f'Image cache shape mismatch: {image_cache.shape} vs expected ({N}, 3, 224, 224)'
print('Image cache ready ✓')

In [None]:
# Cell 4: Config
# Single source of hyper-parameters for both training stages. `lp_*` keys are
# read by stage1_train_mlp, `ft_*` keys by stage2_finetune; `lp_hidden` and
# `lp_dropout` also define the head architecture rebuilt in Stage 2.
CONFIG = {
    'n_folds': 5,
    'seed': 42,
    # Stage 1: MLP on frozen features
    'lp_hidden': 256,
    'lp_dropout': 0.1,
    'lp_epochs': 150,
    'lp_patience': 20,
    'lp_batch_size': 512,
    'lp_lr': 1e-3,
    'lp_weight_decay': 1e-4,
    # Stage 2: Fine-tune backbone + head
    # ft_batch_size=64: efficient on an A100 (~25 steps/epoch, enough for the
    #                   warmup schedule to work properly)
    # ft_lr_head=1e-4:  same as the V121 linear head (100x the backbone LR of 1e-6)
    # num_workers=4:    enough loader workers to keep an A100 fed
    'ft_epochs': 12,
    'ft_patience': 5,
    'ft_batch_size': 64,
    'ft_lr_backbone': 1e-6,
    'ft_lr_head': 1e-4,
    'ft_weight_decay': 0.01,
    'ft_warmup_epochs': 1,
    'ft_num_workers': 4,
}

# Persist the configuration next to the results for reproducibility.
with open(f'{SAVE_DIR}/config.json', 'w') as f:
    json.dump(CONFIG, f, indent=2)
print('Config:', CONFIG)

In [None]:
# Cell 5: Model definitions

def build_mlp_head(in_dim=768, hidden=256, dropout=0.1, out_dim=3):
    """Build the regression head used in both training stages.

    Layout: Linear(in_dim, hidden) → BatchNorm1d → ReLU → Dropout → Linear(hidden, out_dim).

    Args:
        in_dim: size of the incoming feature vector.
        hidden: width of the single hidden layer.
        dropout: drop probability applied after the activation.
        out_dim: number of regression outputs.

    Returns:
        An ``nn.Sequential`` whose sub-modules sit at indices 0-4, so the
        state-dict keys are ``0.*``, ``1.*`` and ``4.*``.
    """
    # The two Linear layers are created in network order so random weight
    # initialisation draws from the RNG in the same sequence as before.
    projection = nn.Linear(in_dim, hidden)
    regressor = nn.Linear(hidden, out_dim)
    return nn.Sequential(
        projection,
        nn.BatchNorm1d(hidden),
        nn.ReLU(),
        nn.Dropout(dropout),
        regressor,
    )


class BioCLIP_MLP_LPFT(nn.Module):
    """BioCLIP-2 image encoder followed by the MLP regression head.

    Used in Stage 2, where backbone and head are fine-tuned together.

    Args:
        hidden: hidden width of the MLP head.
        dropout: dropout probability of the MLP head.
        head_state: optional state dict loaded into the head (the Stage 1
            weights); when ``None`` the head keeps its random initialisation.
    """

    def __init__(self, hidden=256, dropout=0.1, head_state=None):
        super().__init__()
        # Only the model (first element) of the returned triple is needed.
        created = open_clip.create_model_and_transforms(
            'hf-hub:imageomics/bioclip-2', output_dict=True, require_pretrained=True)
        self.bioclip = created[0]

        # Discover the embedding width by pushing one dummy image through
        # the encoder instead of hard-coding it.
        probe = torch.randn(1, 3, 224, 224)
        with torch.no_grad():
            embedding = self.bioclip(probe)['image_features']
        out_dim = embedding.shape[-1]

        self.head = build_mlp_head(out_dim, hidden, dropout)

        if head_state is None:
            print(f'  Random head init: {out_dim} → {hidden} → 3')
        else:
            self.head.load_state_dict(head_state)
            print(f'  Stage 1 MLP weights loaded into head.')

    def forward(self, x):
        """Map a batch of preprocessed images to the head's regression outputs."""
        return self.head(self.bioclip(x)['image_features'])


class BeetleImageDataset(torch.utils.data.Dataset):
    """Serve (image, label) pairs from the in-RAM uint8 image cache.

    Reading from the cache removes disk I/O during Stage 2. The conversion
    uint8 → float32 → (augmentation) → normalisation happens per item in
    ``__getitem__``, so the cache itself stays uint8; per the original note
    this is done to save RAM compared with caching normalised images.
    """
    # Per-channel statistics applied after scaling pixels to [0, 1].
    _normalize = transforms.Normalize(
        mean=(0.48145466, 0.4578275, 0.40821073),
        std=(0.26862954, 0.26130258, 0.27577711))
    # Training-only augmentation: random horizontal flip + mild colour jitter.
    _aug = transforms.Compose([
        transforms.RandomHorizontalFlip(p=0.5),
        transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.1),
    ])

    def __init__(self, image_cache, labels, indices, is_train=False):
        # image_cache: (N, 3, 224, 224) uint8 CPU tensor
        # labels:      (N, 3) numpy array or similar
        # indices:     positions into image_cache / labels that this split uses
        # is_train:    enables the random augmentation
        self.cache, self.labels = image_cache, labels
        self.indices, self.is_train = indices, is_train

    def __len__(self):
        return len(self.indices)

    def __getitem__(self, idx):
        sample_id = int(self.indices[idx])
        # .float() makes a fresh float32 copy, so the in-place division
        # rescales [0, 255] → [0, 1] without touching the shared cache.
        pixels = self.cache[sample_id].float().div_(255.0)
        pixels = self._aug(pixels) if self.is_train else pixels
        pixels = self._normalize(pixels)
        target = torch.tensor(self.labels[sample_id], dtype=torch.float32)
        return pixels, target


print('Model classes ready')

In [None]:
# Cell 6: Stage 1 — Train MLP head on frozen features (fast, GPU)

def stage1_train_mlp(train_idx, val_idx, config):
    """Train the MLP head on cached frozen features for one fold (Stage 1).

    No images are loaded: inputs are rows of the module-level ``all_features``
    and targets are rows of ``all_labels``. A RidgeCV model is fitted on the
    same split purely as a reference score.

    Args:
        train_idx: indices of the training samples for this fold.
        val_idx: indices of the validation samples for this fold.
        config: dict providing ``lp_hidden``, ``lp_dropout``, ``lp_epochs``,
            ``lp_patience``, ``lp_batch_size``, ``lp_lr``, ``lp_weight_decay``.

    Returns:
        Tuple ``(best_state, ridge_r2, best_val_r2)``: the CPU state dict of
        the head at its best validation R2 (``None`` if no epoch ran), the
        Ridge baseline R2, and the best MLP validation R2.
    """
    X_tr = torch.tensor(all_features[train_idx], dtype=torch.float32)
    y_tr = torch.tensor(all_labels[train_idx],   dtype=torch.float32)
    X_va = torch.tensor(all_features[val_idx],   dtype=torch.float32)
    y_va = torch.tensor(all_labels[val_idx],     dtype=torch.float32)

    batch_size = config['lp_batch_size']
    # BatchNorm1d cannot compute batch statistics from a single sample and
    # raises in training mode. Drop the trailing batch only in the one case
    # where it would contain exactly one sample; otherwise keep every sample.
    singleton_tail = len(X_tr) > 1 and len(X_tr) % batch_size == 1
    tr_loader = DataLoader(TensorDataset(X_tr, y_tr),
                           batch_size=batch_size, shuffle=True, drop_last=singleton_tail)

    head = build_mlp_head(X_tr.shape[1], config['lp_hidden'], config['lp_dropout']).to(device)
    opt  = optim.AdamW(head.parameters(), lr=config['lp_lr'], weight_decay=config['lp_weight_decay'])
    sched = optim.lr_scheduler.CosineAnnealingLR(opt, T_max=config['lp_epochs'], eta_min=1e-6)
    criterion = nn.MSELoss()

    best_val_r2 = -float('inf')
    best_state  = None
    patience_counter = 0

    # Ridge baseline for comparison
    ridge = RidgeCV(alphas=[0.1, 1.0, 10.0, 100.0])
    ridge.fit(all_features[train_idx], all_labels[train_idx])
    ridge_r2 = r2_score(all_labels[val_idx], ridge.predict(all_features[val_idx]))
    print(f'  Ridge baseline R2={ridge_r2:.4f}')

    # The validation set is evaluated in one shot every epoch, so move it to
    # the device (and convert the targets) once instead of once per epoch.
    X_va_dev = X_va.to(device)
    y_va_np  = y_va.numpy()

    for epoch in range(config['lp_epochs']):
        head.train()
        for xb, yb in tr_loader:
            xb, yb = xb.to(device), yb.to(device)
            opt.zero_grad()
            loss = criterion(head(xb), yb)
            loss.backward()
            opt.step()
        sched.step()  # cosine schedule is stepped once per epoch

        head.eval()
        with torch.no_grad():
            val_preds = head(X_va_dev).cpu().numpy()
        val_r2 = r2_score(y_va_np, val_preds)

        if val_r2 > best_val_r2:
            best_val_r2 = val_r2
            # Snapshot on CPU so later updates cannot alter the best weights.
            best_state  = {k: v.cpu().clone() for k, v in head.state_dict().items()}
            patience_counter = 0
        else:
            patience_counter += 1

        if (epoch + 1) % 30 == 0 or epoch == 0:
            print(f'  LP Ep {epoch+1:3d}/{config["lp_epochs"]} | val R2={val_r2:.4f} | best={best_val_r2:.4f}')

        if patience_counter >= config['lp_patience']:
            print(f'  LP early stop at epoch {epoch+1}')
            break

    print(f'  Stage 1 done: Ridge R2={ridge_r2:.4f} → MLP R2={best_val_r2:.4f} (delta={best_val_r2-ridge_r2:+.4f})')
    del head, X_va_dev; torch.cuda.empty_cache()
    return best_state, ridge_r2, best_val_r2


print('Stage 1 function ready')

In [None]:
# Cell 7: Stage 2 — Fine-tune backbone + MLP head

def stage2_finetune(fold_idx, train_idx, val_idx, head_state, lp_r2, config):
    """Fine-tune the BioCLIP backbone and the MLP head together for one fold (Stage 2).

    Images come from the module-level ``image_cache`` and targets from
    ``all_labels``. Training uses mixed precision, gradient clipping, a
    per-step linear-warmup + cosine schedule, and early stopping on the
    validation MSE.

    Args:
        fold_idx: zero-based fold number; used for output folder/file names.
        train_idx: sample indices used for training.
        val_idx: sample indices used for validation and OOF prediction.
        head_state: state dict loaded into the head (Stage 1 weights), or
            ``None`` to start from a randomly initialised head.
        lp_r2: Stage 1 validation R2; only the initial value of the reported
            best R2 (it is returned unchanged only if no epoch runs).
        config: dict providing the ``ft_*`` keys plus ``lp_hidden`` and
            ``lp_dropout`` (the head architecture).

    Returns:
        Tuple ``(best_val_r2, best_val_rmse, oof_preds)``. ``oof_preds`` has
        one row per entry of ``val_idx``, in the same order, and is computed
        from the FP32 weights of the best checkpoint.

    Side effects:
        Writes ``fold_{k}/model.pth`` under ``SAVE_DIR`` and tries to export
        a traced FP16 model next to it (an export failure is only reported).
    """
    print(f'\n--- Stage 2: Fine-Tune (backbone + MLP head) ---')

    nw = config.get('ft_num_workers', 4)
    # Read from image_cache (RAM) rather than full_ds (disk) so the GPU is
    # not left idle waiting on image I/O.
    train_ds_obj = BeetleImageDataset(image_cache, all_labels, train_idx, is_train=True)
    val_ds_obj   = BeetleImageDataset(image_cache, all_labels, val_idx,   is_train=False)
    train_loader = DataLoader(train_ds_obj, batch_size=config['ft_batch_size'],
                              shuffle=True,  num_workers=nw, pin_memory=True, drop_last=True,
                              persistent_workers=(nw > 0), prefetch_factor=(4 if nw > 0 else None))
    # shuffle=False and no drop_last: predictions line up one-to-one with val_idx.
    val_loader   = DataLoader(val_ds_obj,   batch_size=config['ft_batch_size'],
                              shuffle=False, num_workers=nw, pin_memory=True,
                              persistent_workers=(nw > 0), prefetch_factor=(4 if nw > 0 else None))

    model = BioCLIP_MLP_LPFT(
        hidden=config['lp_hidden'],
        dropout=config['lp_dropout'],
        head_state=head_state
    ).to(device)

    # Separate parameter groups: a very small LR for the pretrained backbone,
    # a larger one for the head.
    optimizer = torch.optim.AdamW([
        {'params': model.bioclip.parameters(), 'lr': config['ft_lr_backbone']},
        {'params': model.head.parameters(),    'lr': config['ft_lr_head']},
    ], weight_decay=config['ft_weight_decay'])

    total_steps  = len(train_loader) * config['ft_epochs']
    warmup_steps = len(train_loader) * config['ft_warmup_epochs']
    def lr_lambda(step):
        # Linear warmup from 0 to 1, then cosine decay from 1 to 0.
        if step < warmup_steps:
            return step / max(warmup_steps, 1)
        progress = (step - warmup_steps) / max(total_steps - warmup_steps, 1)
        return 0.5 * (1 + math.cos(math.pi * progress))
    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)

    criterion = nn.MSELoss()
    scaler = GradScaler()

    best_val_loss = float('inf')
    best_val_r2   = lp_r2  # start from Stage 1 baseline
    patience_counter = 0
    best_state = None

    steps_per_epoch = len(train_loader)
    print(f'  steps/epoch={steps_per_epoch}, batch={config["ft_batch_size"]}, '
          f'backbone_lr={config["ft_lr_backbone"]:.0e}, head_lr={config["ft_lr_head"]:.0e}')

    for epoch in range(config['ft_epochs']):
        model.train()
        train_losses = []
        t0 = time.time()

        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            with autocast():
                loss = criterion(model(images), labels)
            scaler.scale(loss).backward()
            # Unscale first so clipping sees the true gradient magnitudes.
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()  # the schedule is defined per optimisation step
            train_losses.append(loss.item())

        model.eval()
        vp, vl = [], []
        with torch.no_grad():
            for images, labels in val_loader:
                with autocast():
                    preds = model(images.to(device))
                vp.append(preds.float().cpu())
                vl.append(labels)

        val_preds  = torch.cat(vp).numpy()
        val_labels = torch.cat(vl).numpy()
        val_loss   = np.mean((val_preds - val_labels) ** 2)
        val_r2     = r2_score(val_labels, val_preds)

        marker = ''
        # Model selection is on validation MSE; the R2 recorded alongside is
        # the R2 of that same epoch.
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_val_r2   = val_r2
            patience_counter = 0
            best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}
            marker = f' <<< best'
        else:
            patience_counter += 1

        print(f'  Ep {epoch+1:2d}/{config["ft_epochs"]} | '
              f'Train RMSE={np.sqrt(np.mean(train_losses)):.4f} | '
              f'Val RMSE={np.sqrt(val_loss):.4f} R2={val_r2:.4f} | '
              f'{time.time()-t0:.0f}s{marker}')

        if patience_counter >= config['ft_patience']:
            print(f'  Early stopping at epoch {epoch+1}')
            break

    if best_state is None:
        # Reached only when no epoch recorded a best (e.g. ft_epochs == 0);
        # the model then still holds its initial weights.
        print('  WARNING: FT did not improve Stage 1. Using Stage 1 state.')
        best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}

    # ---- Save ----
    fold_dir = f'{SAVE_DIR}/fold_{fold_idx + 1}'
    os.makedirs(fold_dir, exist_ok=True)

    model.load_state_dict(best_state)
    model = model.to(device).eval()
    torch.save(best_state, f'{fold_dir}/model.pth')

    # OOF predictions — computed BEFORE the FP16 export. Module.half() casts
    # the weights in place and a later .float() cannot restore the rounded
    # values, so predicting afterwards would no longer reflect the FP32
    # checkpoint saved above.
    oof_preds = []
    with torch.no_grad():
        for images, _ in val_loader:
            with autocast():
                preds = model(images.to(device))
            oof_preds.append(preds.float().cpu().numpy())
    oof_preds = np.concatenate(oof_preds)

    # JIT FP16 export (best effort). This is the last use of `model`, so the
    # in-place cast to half precision is harmless here.
    try:
        dummy = torch.randn(1, 3, 224, 224).to(device)
        with torch.no_grad():
            traced = torch.jit.trace(model.half(), dummy.half())
            jit_path = f'{fold_dir}/mlp_lpft_fold{fold_idx+1}_fp16.pt'
            traced.save(jit_path)
            print(f'  JIT FP16: {os.path.getsize(jit_path)/1e6:.0f} MB → {jit_path}')
    except Exception as e:
        print(f'  JIT failed: {e}')

    del model, optimizer, scaler
    torch.cuda.empty_cache()

    return best_val_r2, np.sqrt(best_val_loss), oof_preds


print('Stage 2 function ready')

In [None]:
# Cell 8: Run all 5 folds
# For each fold: Stage 1 (MLP on cached features) → Stage 2 (fine-tuning),
# then store that fold's out-of-fold predictions and a resumable checkpoint.
np.random.seed(CONFIG['seed'])
torch.manual_seed(CONFIG['seed'])

# The split is drawn over positions 0..N-1, i.e. the shared row order of
# all_features, all_labels and image_cache.
kfold       = KFold(n_splits=CONFIG['n_folds'], shuffle=True, random_state=CONFIG['seed'])
all_indices = np.arange(N)

fold_results  = []
all_oof_preds = np.zeros((N, 3))  # rows are filled fold by fold

for fold_idx, (train_idx, val_idx) in enumerate(kfold.split(all_indices)):
    print(f'\n{"="*60}')
    print(f'FOLD {fold_idx+1}/{CONFIG["n_folds"]}')
    print(f'Train: {len(train_idx)}, Val: {len(val_idx)}')
    print(f'{"="*60}')

    # ---- Stage 1: MLP on frozen features ----
    print('\n--- Stage 1: MLP Linear Probe (frozen features) ---')
    head_state, ridge_r2, lp_r2 = stage1_train_mlp(train_idx, val_idx, CONFIG)

    # ---- Stage 2: Fine-tune backbone ----
    ft_r2, ft_rmse, oof_preds = stage2_finetune(
        fold_idx, train_idx, val_idx, head_state, lp_r2, CONFIG)

    # stage2_finetune predicts with an unshuffled validation loader, so the
    # rows of oof_preds follow the order of val_idx.
    all_oof_preds[val_idx] = oof_preds

    result = {
        'fold': fold_idx + 1,
        'ridge_r2': float(ridge_r2),
        'lp_r2':    float(lp_r2),
        'ft_r2':    float(ft_r2),
        'ft_rmse':  float(ft_rmse),
        'delta_lp_vs_ridge': float(lp_r2 - ridge_r2),
        'delta_ft_vs_lp':    float(ft_r2 - lp_r2),
    }
    fold_results.append(result)
    print(f'\n  Fold {fold_idx+1} summary: Ridge={ridge_r2:.4f} → MLP-LP={lp_r2:.4f} → FT={ft_r2:.4f}')

    # Checkpoint
    # Written after every fold so an interrupted session keeps the finished
    # folds; rows of folds not yet run are still zero in the partial array.
    np.save(f'{SAVE_DIR}/oof_preds_partial.npy', all_oof_preds)
    with open(f'{SAVE_DIR}/partial_summary.json', 'w') as f:
        json.dump({'completed_folds': fold_idx+1, 'results': fold_results}, f, indent=2)
    print(f'  Checkpoint saved')

# Per-fold table followed by the mean of the three R2 columns.
print(f'\n{"="*60}')
print('ALL FOLDS COMPLETE')
print(f'{"="*60}')
print(f'{"Fold":>6} {"Ridge R2":>10} {"MLP-LP R2":>10} {"FT R2":>10} {"Δ(LP-Ridge)":>12} {"Δ(FT-LP)":>10}')
for r in fold_results:
    print(f'{r["fold"]:>6} {r["ridge_r2"]:>10.4f} {r["lp_r2"]:>10.4f} {r["ft_r2"]:>10.4f} '
          f'{r["delta_lp_vs_ridge"]:>+12.4f} {r["delta_ft_vs_lp"]:>+10.4f}')
print(f'{"Mean":>6} '
      f'{np.mean([r["ridge_r2"] for r in fold_results]):>10.4f} '
      f'{np.mean([r["lp_r2"] for r in fold_results]):>10.4f} '
      f'{np.mean([r["ft_r2"] for r in fold_results]):>10.4f}')

In [None]:
# Cell 9: Save OOF + compare vs V121
# Persists the full out-of-fold predictions and the matching ground truth,
# reports overall and per-target metrics, and compares against the OOF
# predictions of earlier models when those files are available.
np.save(f'{SAVE_DIR}/mlp_lpft_oof_preds.npy', all_oof_preds)
np.save(f'{SAVE_DIR}/true_vals.npy', all_labels)

oof_r2   = r2_score(all_labels, all_oof_preds)
oof_rmse = np.sqrt(np.mean((all_labels - all_oof_preds) ** 2))

print(f'\n=== V123 MLP LP-FT OOF Results ===')
print(f'  Overall R2:   {oof_r2:.4f}')
print(f'  Overall RMSE: {oof_rmse:.4f}')
for t, name in enumerate(TARGETS):
    r2_t   = r2_score(all_labels[:, t], all_oof_preds[:, t])
    rmse_t = np.sqrt(np.mean((all_labels[:, t] - all_oof_preds[:, t]) ** 2))
    print(f'  {name}: R2={r2_t:.4f}, RMSE={rmse_t:.4f}')

print(f'\n=== Comparison ===')
baselines = [
    ('Frozen BioCLIP (V116)',   f'{PROJECT}/4_models/oof_predictions/bioclip_oof_preds.npy'),
    ('LP-FT Linear (V121)',     f'{PROJECT}/4_models/v121_lpft_kfold/lpft_oof_preds.npy'),
]
for name, path in baselines:
    try:
        preds = np.load(path)
        r2 = r2_score(all_labels, preds)
        print(f'  {name}: R2={r2:.4f}')
    except FileNotFoundError:
        # The only case that really means the baseline has not been produced.
        print(f'  {name}: not found')
    except Exception as err:
        # Still best effort, but a shape mismatch or unreadable file is
        # reported as such instead of being mislabelled "not found".
        # KeyboardInterrupt is deliberately not caught.
        print(f'  {name}: could not be evaluated ({type(err).__name__}: {err})')
print(f'  MLP LP-FT (V123):       R2={oof_r2:.4f}  ← this')

summary = {
    'model': 'MLP LP-FT BioCLIP-2 (5-fold KFold)',
    'config': CONFIG,
    'overall_r2': float(oof_r2),
    'overall_rmse': float(oof_rmse),
    'fold_results': fold_results,
}
with open(f'{SAVE_DIR}/summary.json', 'w') as f:
    json.dump(summary, f, indent=2)
print(f'\nAll saved to {SAVE_DIR}')

# Rough guidance on what to do next, based on the overall OOF R2.
if oof_r2 > 0.46:
    print('\n>>> R2 > 0.46! 加入 V124 stacking 有望提升分數。')
elif oof_r2 > 0.42:
    print('\n>>> R2 與 V121 相近，MLP 帶來多樣性仍有價值。')
else:
    print('\n>>> R2 偏低，考慮增加 hidden dim 或更多 FT epochs。')