In [None]:

# Cell 1: imports + paths/seeds/config
import os
import sys
import glob
import time
import random
import subprocess
from typing import List, Tuple

import numpy as np
import pandas as pd
from PIL import Image
import torch
import torch.nn as nn
from torch.cuda.amp import GradScaler, autocast
from torch.optim import AdamW
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as T
from sklearn.metrics import r2_score
from tqdm import tqdm

# Filesystem layout: competition inputs plus a per-run output tree.
DATA_ROOT = "/kaggle/input/csiro-biomass"
OUTPUT_ROOT = "/kaggle/working/outputs"
# Without a RUN_NAME override the run is named after the current timestamp.
RUN_NAME = os.environ.get("RUN_NAME", time.strftime("%Y%m%d-%H%M%S"))
RUN_DIR = os.path.join(OUTPUT_ROOT, RUN_NAME)
SUBMISSION_PATH = os.path.join(RUN_DIR, "submission", "submission.csv")
WORKING_SUBMISSION = "/kaggle/working/submission.csv"

# Lower-cased environment strings that switch a boolean option on.
_TRUTHY = {"1", "true", "yes", "y"}


def _env_int(name, default):
    """Read environment variable `name` as an int, using `default` when unset."""
    return int(os.environ.get(name, default))


def _env_float(name, default):
    """Read environment variable `name` as a float, using `default` when unset."""
    return float(os.environ.get(name, default))


def _env_flag(name, default):
    """Read environment variable `name` as a bool; `default` is the string used when unset."""
    return os.environ.get(name, default).lower() in _TRUTHY


# Every hyper-parameter can be overridden through an environment variable.
CONFIG = {
    "backbone": os.environ.get("BACKBONE", "efficientnet_b2"),
    "image_size": _env_int("IMAGE_SIZE", 456),
    "batch_size": _env_int("BATCH_SIZE", 32),
    "num_workers": _env_int("NUM_WORKERS", 2),
    "epochs": _env_int("EPOCHS", 20),
    "lr": _env_float("LR", 1e-3),
    "weight_decay": _env_float("WEIGHT_DECAY", 1e-4),
    "patience": _env_int("PATIENCE", 3),
    "seed": _env_int("SEED", 42),
    "folds": _env_int("FOLDS", 5),
    "debug": _env_flag("DEBUG", "0"),
    "accumulate_steps": _env_int("ACCUM_STEPS", 1),
    "amp": _env_flag("AMP", "1"),
    "device": "cuda" if torch.cuda.is_available() else "cpu",
}

# Targets the network predicts directly.
TARGET_COLUMNS = ["Dry_Green_g", "Dry_Clover_g", "Dry_Dead_g"]
# Predicted targets followed by the two derived sums.
ALL_TARGET_COLUMNS = TARGET_COLUMNS + ["GDM_g", "Dry_Total_g"]
# Columns that identify one row of the wide (one-row-per-image) table.
AGGREGATION_COLUMNS = ["sample_id_prefix", "image_path"]

# Set once ensure_timm_installed() has tried a pip install, so it is not retried.
_INSTALL_ATTEMPTED = False


def ensure_dirs() -> None:
    """Create RUN_DIR and its standard sub-folders; existing folders are left alone."""
    wanted = [RUN_DIR]
    wanted.extend(
        os.path.join(RUN_DIR, name)
        for name in ("checkpoints", "preds", "submission", "logs")
    )
    for folder in wanted:
        os.makedirs(folder, exist_ok=True)


def set_seed(seed: int) -> None:
    """Seed Python, NumPy and PyTorch (CPU and all CUDA devices) and pin cuDNN settings.

    Args:
        seed: Value passed unchanged to every random number generator.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Deterministic kernels on, autotuner off.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


def ensure_timm_installed() -> None:
    """Make sure `timm` can be imported, attempting a single pip install if it cannot.

    The install is attempted at most once per process (tracked by the module-level
    `_INSTALL_ATTEMPTED` flag). After a successful install the import caches are
    refreshed and the import is retried, so a broken install is reported here
    rather than at the first later `import timm`.

    Raises:
        SystemExit: If the install fails, if a previous attempt already happened
            and timm is still missing, or if timm is still not importable after
            the install.
    """
    global _INSTALL_ATTEMPTED
    import importlib

    try:
        import timm  # noqa: F401
        return
    except ImportError:
        pass

    if _INSTALL_ATTEMPTED:
        raise SystemExit("timm is required but could not be installed.")
    _INSTALL_ATTEMPTED = True
    print("timm not found; attempting installation once...")
    try:
        subprocess.check_call([sys.executable, "-m", "pip", "install", "timm>=0.9.0"])
    except Exception as exc:  # pragma: no cover - Kaggle offline safety
        print("timm installation failed. Please attach internet or add timm to the dataset.")
        raise SystemExit(exc) from exc

    # Packages installed while the interpreter is running may not be seen by the
    # import system until its finder caches are invalidated.
    importlib.invalidate_caches()
    try:
        import timm  # noqa: F401
    except ImportError as exc:
        raise SystemExit("timm was installed but still cannot be imported.") from exc


# Create the output folders and fix the RNG state before anything else runs.
ensure_dirs()
set_seed(CONFIG["seed"])
print(f"Using device: {CONFIG['device']}")
print(f"Run directory: {RUN_DIR}")


In [None]:

# Cell 2: data existence check + train/test columns
train_csv = os.path.join(DATA_ROOT, "train.csv")
test_csv = os.path.join(DATA_ROOT, "test.csv")
sample_sub_csv = os.path.join(DATA_ROOT, "sample_submission.csv")

# Report which of the expected CSV files are present; nothing is raised here.
for _label, _csv in (
    ("Train CSV", train_csv),
    ("Test CSV", test_csv),
    ("Sample submission", sample_sub_csv),
):
    print(f"{_label} exists:", os.path.exists(_csv))

# Only *.jpg files directly inside the train/ and test/ folders are counted.
train_images = glob.glob(os.path.join(DATA_ROOT, "train", "*.jpg"))
test_images = glob.glob(os.path.join(DATA_ROOT, "test", "*.jpg"))
print("Detected train images:", len(train_images))
print("Detected test images:", len(test_images))

# Three rows are enough to display the column names.
train_head = pd.read_csv(train_csv, nrows=3)
test_head = pd.read_csv(test_csv, nrows=3)
print(f"Train columns: {train_head.columns.tolist()}")
print(f"Test columns: {test_head.columns.tolist()}")


In [None]:

# Cell 3: data loading + long→wide aggregation

def load_long_dataframe(csv_path: str, include_targets: bool = True) -> pd.DataFrame:
    """Read a long-format CSV and add a `sample_id_prefix` column.

    Args:
        csv_path: Path of the CSV file to read.
        include_targets: When True, also require `target_name` and `target` columns.

    Returns:
        The loaded frame with `sample_id_prefix` set to the part of `sample_id`
        before the first "__".

    Raises:
        ValueError: If `sample_id` / `image_path` are missing, or if targets are
            requested and `target_name` / `target` are missing.
    """
    frame = pd.read_csv(csv_path)
    if not {"sample_id", "image_path"}.issubset(frame.columns):
        raise ValueError("CSV must contain sample_id and image_path columns")

    id_parts = frame["sample_id"].astype(str).str.split("__")
    frame["sample_id_prefix"] = id_parts.str[0]

    if include_targets:
        absent = [name for name in ("target_name", "target") if name not in frame.columns]
        if absent:
            raise ValueError(f"Missing target columns in training data: {absent}")
    return frame


def to_wide(df: pd.DataFrame, include_targets: bool = True) -> pd.DataFrame:
    """Collapse a long-format frame to one row per AGGREGATION_COLUMNS combination.

    Args:
        df: Long-format frame containing every column in AGGREGATION_COLUMNS.
        include_targets: When True, pivot `target_name` / `target` into one column
            per target; when False, return only the de-duplicated key columns.

    Returns:
        With targets: the key columns followed by TARGET_COLUMNS. Duplicated
        (key, target_name) pairs keep the first value. Without targets: the
        unique key rows with a fresh index.

    Raises:
        ValueError: If a key column is missing from `df`, or a column from
            TARGET_COLUMNS is absent after the pivot.
    """
    missing_idx = [col for col in AGGREGATION_COLUMNS if col not in df.columns]
    if missing_idx:
        raise ValueError(f"Missing required aggregation columns: {missing_idx}")
    index_cols = list(AGGREGATION_COLUMNS)
    print("Aggregation columns:", index_cols)

    if include_targets:
        wide = df.pivot_table(
            index=index_cols,
            columns="target_name",
            values="target",
            aggfunc="first",
        ).reset_index()
        missing_targets = [col for col in TARGET_COLUMNS if col not in wide.columns]
        if missing_targets:
            raise ValueError(f"Missing target columns after pivot: {missing_targets}")
        return wide[index_cols + TARGET_COLUMNS]

    return df[index_cols].drop_duplicates().reset_index(drop=True)


# Long tables straight from the CSVs; only the training file must carry targets.
train_long = load_long_dataframe(train_csv, include_targets=True)
test_long = load_long_dataframe(test_csv, include_targets=False)

# One row per image for modelling.
train_wide = to_wide(train_long, include_targets=True)
test_wide = to_wide(test_long, include_targets=False)

if CONFIG["debug"]:
    # Shrink both tables so a smoke-test run finishes quickly.
    train_wide = train_wide.sample(
        n=min(len(train_wide), 32),
        random_state=CONFIG["seed"],
    )
    test_wide = test_wide.head(8)
    print("DEBUG mode: subsampled train and test data")

print(f"Train wide shape: {train_wide.shape}")
print(f"Test wide shape: {test_wide.shape}")


In [None]:

# Cell 4: Dataset and transforms

def build_transforms(image_size: int, augment: bool) -> T.Compose:
    """Build the torchvision preprocessing pipeline.

    Args:
        image_size: Side length; images are resized to (image_size, image_size).
        augment: When True, insert random horizontal/vertical flips and colour
            jitter between the resize and the tensor conversion.

    Returns:
        A `T.Compose` that resizes, optionally augments, converts to a tensor and
        normalises with the mean/std constants given below.
    """
    steps = [T.Resize((image_size, image_size))]
    if augment:
        steps += [
            T.RandomHorizontalFlip(),
            T.RandomVerticalFlip(),
            T.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.02),
        ]
    steps += [
        T.ToTensor(),
        T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
    return T.Compose(steps)


class RegressionDataset(Dataset):
    """Image-regression dataset backed by a frame with one row per image.

    Each item is `(image, targets, sample_id_prefix)`:
      * `image` is the file at DATA_ROOT / row["image_path"], converted to RGB
        and passed through the transform from `build_transforms`.
      * `targets` holds the row's TARGET_COLUMNS values as float32, or zeros of
        the same length when `use_targets` is False.
      * `sample_id_prefix` is the row's value, or "" if the column is absent.
    """

    def __init__(self, df: pd.DataFrame, image_size: int, augment: bool = False, use_targets: bool = True):
        """Store the frame and build the transform.

        Args:
            df: Frame with an `image_path` column (and TARGET_COLUMNS when
                `use_targets` is True).
            image_size: Side length handed to `build_transforms`.
            augment: Whether the transform includes random augmentation.
            use_targets: Whether to read targets from the frame or emit zeros.
        """
        # Reset to a 0..n-1 index so positional lookup matches DataLoader indices.
        self.df = df.reset_index(drop=True)
        self.transform = build_transforms(image_size, augment)
        self.use_targets = use_targets

    def __len__(self):
        """Return the number of rows (images)."""
        return len(self.df)

    def __getitem__(self, idx: int):
        """Load, transform and return the sample at position `idx`."""
        row = self.df.iloc[idx]
        img_path = os.path.normpath(os.path.join(DATA_ROOT, row["image_path"]))
        # convert() yields a new in-memory image, so the file can be closed right away.
        with Image.open(img_path) as img:
            image = img.convert("RGB")
        image = self.transform(image)

        if self.use_targets:
            targets = torch.tensor(row[TARGET_COLUMNS].values.astype("float32"))
        else:
            targets = torch.zeros(len(TARGET_COLUMNS), dtype=torch.float32)

        # Fall back to "" rather than None: the default DataLoader collate
        # function cannot batch None values and would raise a TypeError.
        sample_id = row.get("sample_id_prefix", "")
        return image, targets, sample_id


def create_dataloaders(df: pd.DataFrame, train_idx: List[int], val_idx: List[int]) -> Tuple[DataLoader, DataLoader]:
    """Build the training and validation loaders for one fold.

    Args:
        df: Wide frame containing every sample.
        train_idx: Positional row indices used for training (augmented, shuffled).
        val_idx: Positional row indices used for validation (no augmentation, in order).

    Returns:
        `(train_loader, val_loader)`, both using the batch size and worker count
        from CONFIG.
    """
    side = CONFIG["image_size"]
    loader_kwargs = dict(
        batch_size=CONFIG["batch_size"],
        num_workers=CONFIG["num_workers"],
        pin_memory=True,
    )

    train_ds = RegressionDataset(df.iloc[train_idx], side, augment=True, use_targets=True)
    val_ds = RegressionDataset(df.iloc[val_idx], side, augment=False, use_targets=True)

    train_loader = DataLoader(train_ds, shuffle=True, **loader_kwargs)
    val_loader = DataLoader(val_ds, shuffle=False, **loader_kwargs)
    return train_loader, val_loader


In [None]:

# Cell 5: Model definition (timm efficientnet_b2)

def build_model(pretrained: bool = True) -> nn.Module:
    """Create the timm backbone named in CONFIG with one output per target column.

    Args:
        pretrained: Passed through to `timm.create_model`.

    Returns:
        A model whose head has `len(TARGET_COLUMNS)` outputs.
    """
    ensure_timm_installed()
    import timm

    backbone_name = CONFIG["backbone"]
    num_outputs = len(TARGET_COLUMNS)
    return timm.create_model(backbone_name, pretrained=pretrained, num_classes=num_outputs)


In [None]:

# Cell 6: Metrics (weighted R2)

def compute_weighted_r2(y_true: np.ndarray, y_pred: np.ndarray, weights=None) -> float:
    """Return the weighted mean of per-column R² scores.

    Args:
        y_true: Ground-truth values, shape (n_samples, n_targets).
        y_pred: Predictions with the same shape as `y_true`.
        weights: Optional sequence of `n_targets` per-column weights. When
            omitted every column counts equally, which is the previous
            hard-coded behaviour.

    Returns:
        sum(score_i * weight_i) / sum(weight_i). A column whose true values are
        all (numerically) equal contributes a score of 0.0 instead of being
        passed to `r2_score`.

    Raises:
        ValueError: If the inputs are not 2-D arrays of identical shape, contain
            no rows, or if `weights` has the wrong length or a non-positive sum.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.ndim != 2 or y_true.shape != y_pred.shape:
        raise ValueError(
            "y_true and y_pred must be 2-D arrays of the same shape; "
            f"got {y_true.shape} and {y_pred.shape}"
        )
    num_rows, num_targets = y_true.shape
    if num_rows == 0:
        # Without this guard the y_true[0, i] lookup below fails with IndexError.
        raise ValueError("compute_weighted_r2 needs at least one sample")

    if weights is None:
        weights = np.ones(num_targets, dtype=np.float32)
    else:
        weights = np.asarray(weights, dtype=np.float64)
        if weights.shape != (num_targets,):
            raise ValueError(
                f"weights must have length {num_targets}; got shape {weights.shape}"
            )
        if not np.sum(weights) > 0:
            raise ValueError("weights must sum to a positive value")

    scores = []
    for i in range(num_targets):
        if np.all(np.isclose(y_true[:, i], y_true[0, i])):
            # Constant target column: score it as 0 rather than calling r2_score.
            scores.append(0.0)
        else:
            scores.append(r2_score(y_true[:, i], y_pred[:, i]))
    scores = np.array(scores)
    return float(np.sum(scores * weights) / np.sum(weights))


def expand_targets(primary: np.ndarray) -> np.ndarray:
    """Append the two derived sums to the three predicted targets.

    Args:
        primary: 2-D array whose columns 0, 1, 2 are green, clover and dead.

    Returns:
        Array with five columns ordered green, dead, clover, green + clover,
        green + clover + dead. Note that dead and clover swap places relative
        to the input order.
    """
    green, clover, dead = primary[:, 0], primary[:, 1], primary[:, 2]
    green_plus_clover = green + clover
    everything = green_plus_clover + dead
    return np.stack((green, dead, clover, green_plus_clover, everything), axis=1)


In [None]:

# Cell 7: Train loop + 5-fold CV + checkpoint saving

def evaluate(model: nn.Module, loader: DataLoader, device: torch.device) -> float:
    """Score `model` on every batch of `loader`.

    Args:
        model: Network to evaluate; it is switched to eval mode.
        loader: Yields `(images, targets, ids)` batches.
        device: Device the batches are moved to for the forward pass.

    Returns:
        `compute_weighted_r2` of the expanded targets versus the expanded
        predictions (both passed through `expand_targets`).
    """
    model.eval()
    pred_chunks, target_chunks = [], []
    with torch.no_grad():
        for batch_images, batch_targets, _ids in loader:
            batch_images = batch_images.to(device)
            batch_targets = batch_targets.to(device)
            batch_out = model(batch_images)
            pred_chunks.append(batch_out.cpu().numpy())
            target_chunks.append(batch_targets.cpu().numpy())

    all_preds = expand_targets(np.concatenate(pred_chunks))
    all_targets = expand_targets(np.concatenate(target_chunks))
    return compute_weighted_r2(all_targets, all_preds)


def train_and_validate(df: pd.DataFrame) -> float:
    """Train one model per fold and return the mean of the best validation scores.

    For each of CONFIG["folds"] folds a fresh pretrained model is trained for
    CONFIG["epochs"] epochs with SmoothL1 loss, AdamW and a ReduceLROnPlateau
    scheduler driven by the validation score. After every epoch the weights are
    saved to `fold{k}_last.pth`; whenever the validation score improves they are
    also saved to `fold{k}_best.pth` (both under RUN_DIR/checkpoints).

    Args:
        df: Wide training frame with one row per image.

    Returns:
        Mean over folds of each fold's best validation score.

    Raises:
        ValueError: If fewer than 2 folds are configured or there are fewer
            samples than folds (some fold would have no validation rows).
    """
    device = torch.device(CONFIG["device"] if torch.cuda.is_available() else "cpu")
    print("Training on device:", device)

    num_samples = len(df)
    num_folds = CONFIG["folds"]
    if num_folds < 2 or num_samples < num_folds:
        raise ValueError(
            f"Cannot split {num_samples} samples into {num_folds} folds; "
            "need at least 2 folds and at least one sample per fold."
        )

    # Guard against a zero/negative ACCUM_STEPS, which would break the modulo below.
    accum_steps = max(1, CONFIG["accumulate_steps"])
    # Mixed precision and gradient scaling only apply on CUDA; keep them off on CPU.
    use_amp = bool(CONFIG["amp"]) and device.type == "cuda"

    # NOTE: folds are contiguous slices of the current row order (no shuffling);
    # the last fold absorbs the remainder rows.
    fold_size = num_samples // num_folds
    indices = np.arange(num_samples)
    best_scores: List[float] = []

    for fold in range(num_folds):
        print(f"=== Fold {fold + 1} / {num_folds} ===")
        val_start = fold * fold_size
        val_end = (fold + 1) * fold_size if fold < num_folds - 1 else num_samples
        val_idx = indices[val_start:val_end]
        train_idx = np.concatenate([indices[:val_start], indices[val_end:]])

        train_loader, val_loader = create_dataloaders(df, train_idx.tolist(), val_idx.tolist())
        num_batches = len(train_loader)

        model = build_model(pretrained=True).to(device)
        criterion = nn.SmoothL1Loss()
        optimizer = AdamW(model.parameters(), lr=CONFIG["lr"], weight_decay=CONFIG["weight_decay"])
        # mode="max" because the monitored quantity is a score, not a loss.
        scheduler = ReduceLROnPlateau(optimizer, mode="max", factor=0.5, patience=CONFIG["patience"])
        scaler = GradScaler(enabled=use_amp)

        best_fold_score = -np.inf
        best_path = os.path.join(RUN_DIR, "checkpoints", f"fold{fold}_best.pth")
        last_path = os.path.join(RUN_DIR, "checkpoints", f"fold{fold}_last.pth")

        for epoch in range(CONFIG["epochs"]):
            model.train()
            running_loss = 0.0
            pbar = tqdm(train_loader, desc=f"Fold {fold} Epoch {epoch+1}/{CONFIG['epochs']}")
            optimizer.zero_grad()
            for step, (images, targets, _) in enumerate(pbar):
                images = images.to(device)
                targets = targets.to(device)

                with autocast(enabled=use_amp):
                    preds = model(images)
                    # Divide so the accumulated gradient is the mean over the window.
                    loss = criterion(preds, targets) / accum_steps

                scaler.scale(loss).backward()

                # Step at the end of every accumulation window AND on the final
                # batch, so gradients from a trailing partial window are applied
                # instead of being wiped by the next epoch's zero_grad().
                window_full = (step + 1) % accum_steps == 0
                last_batch = (step + 1) == num_batches
                if window_full or last_batch:
                    scaler.step(optimizer)
                    scaler.update()
                    optimizer.zero_grad()

                # Undo the division so the logged value is the per-batch loss.
                running_loss += loss.item() * accum_steps
                pbar.set_postfix({"loss": running_loss / (step + 1)})

            val_score = evaluate(model, val_loader, device)
            print(f"Epoch {epoch + 1} loss {running_loss / len(train_loader):.4f} val_r2 {val_score:.4f}")
            scheduler.step(val_score)

            torch.save(model.state_dict(), last_path)
            if val_score > best_fold_score:
                best_fold_score = val_score
                torch.save(model.state_dict(), best_path)
        best_scores.append(best_fold_score)

        # Release this fold's model/optimizer before the next fold builds its own.
        del model, optimizer, scheduler, scaler
        if device.type == "cuda":
            torch.cuda.empty_cache()

    mean_score = float(np.mean(best_scores))
    print(f"CV mean R2: {mean_score:.4f}")
    return mean_score


# Kick off cross-validated training on the wide training table.
cv_score = train_and_validate(train_wide)
print(f"Training complete. CV score: {cv_score}")


In [None]:

# Cell 8: Inference + submission generation

def predict_wide(test_df: pd.DataFrame, device: torch.device) -> np.ndarray:
    """Average the predictions of every `*_best.pth` checkpoint over the test rows.

    Args:
        test_df: Wide test frame with one row per image.
        device: Device used for loading weights and running the forward pass.

    Returns:
        Array of mean predictions, one row per row of `test_df` and one column
        per model output.

    Raises:
        FileNotFoundError: If RUN_DIR/checkpoints holds no `*_best.pth` file.
    """
    dataset = RegressionDataset(test_df, CONFIG["image_size"], augment=False, use_targets=False)
    loader = DataLoader(
        dataset,
        batch_size=CONFIG["batch_size"],
        shuffle=False,
        num_workers=CONFIG["num_workers"],
        pin_memory=True,
    )

    ckpt_dir = os.path.join(RUN_DIR, "checkpoints")
    ckpts = sorted(
        os.path.join(ckpt_dir, name)
        for name in os.listdir(ckpt_dir)
        if name.endswith("_best.pth")
    )
    if not ckpts:
        raise FileNotFoundError("No checkpoints found for inference")

    per_checkpoint: List[np.ndarray] = []
    for ckpt_path in ckpts:
        # Weights come from the checkpoint, so pretrained weights are not fetched.
        model = build_model(pretrained=False)
        model.load_state_dict(torch.load(ckpt_path, map_location=device))
        model.to(device)
        model.eval()

        progress = tqdm(loader, desc=f"Infer {os.path.basename(ckpt_path)}")
        with torch.no_grad():
            batches = [model(images.to(device)).cpu().numpy() for images, _, _ in progress]
        per_checkpoint.append(np.concatenate(batches))

    return np.mean(per_checkpoint, axis=0)


def build_submission(test_long_df: pd.DataFrame, test_wide_df: pd.DataFrame, preds: np.ndarray) -> pd.DataFrame:
    """Turn wide predictions into the long `(sample_id, target)` submission table.

    Args:
        test_long_df: Long test frame with `sample_id`, `sample_id_prefix` and
            `target_name` columns. Any existing `target` column is ignored.
        test_wide_df: Wide test frame whose row order matches `preds`.
        preds: Model outputs, one row per row of `test_wide_df`.

    Returns:
        Frame with columns `sample_id` and `target`, in the row order of
        `test_long_df`. Rows with no matching prediction get NaN in `target`.

    Raises:
        ValueError: If `test_long_df` has no `target_name` column.
    """
    if "target_name" not in test_long_df.columns:
        raise ValueError("test_long_df must contain a target_name column to build the submission")

    full_preds = expand_targets(preds)
    # Column names follow the column order produced by expand_targets.
    pred_df = pd.DataFrame(full_preds, columns=["Dry_Green_g", "Dry_Dead_g", "Dry_Clover_g", "GDM_g", "Dry_Total_g"])
    pred_df["sample_id_prefix"] = test_wide_df["sample_id_prefix"].values

    pred_long = pred_df.melt(id_vars="sample_id_prefix", var_name="target_name", value_name="target")

    # Drop any placeholder `target` column first; otherwise the merge would emit
    # target_x / target_y and the final column selection would fail.
    base = test_long_df.drop(columns=["target"], errors="ignore")
    merged = base.merge(
        pred_long[["sample_id_prefix", "target_name", "target"]],
        on=["sample_id_prefix", "target_name"],
        how="left",
        sort=False,
    )
    return merged[["sample_id", "target"]].copy()


def run_inference_and_save():
    """Predict on the test set and write the submission CSV to both output paths.

    Reads the module-level `test_wide` and `test_long` frames, writes the result
    to SUBMISSION_PATH and WORKING_SUBMISSION, and returns the submission frame.
    """
    device_name = CONFIG["device"] if torch.cuda.is_available() else "cpu"
    device = torch.device(device_name)

    mean_preds = predict_wide(test_wide, device)
    submission = build_submission(test_long, test_wide, mean_preds)

    # Archived copy inside the run directory.
    submission.to_csv(SUBMISSION_PATH, index=False)

    # Second copy at the fixed working path.
    working_dir = os.path.dirname(WORKING_SUBMISSION)
    os.makedirs(working_dir, exist_ok=True)
    submission.to_csv(WORKING_SUBMISSION, index=False)

    print(f"Submission saved to: {SUBMISSION_PATH}")
    print(f"Working copy: {WORKING_SUBMISSION}")
    return submission


# Run checkpoint-ensemble inference, write both submission CSVs, and keep the
# resulting frame in `submission_df`.
submission_df = run_inference_and_save()


In [None]:

# Cell 9: Submission validation + file locations
print("Submission preview:")
print(submission_df.head())
print(f"Submission shape: {submission_df.shape}")
print(f"Submission columns: {submission_df.columns.tolist()}")
print(f"NaN present: {submission_df['target'].isna().any()}")

# Compare the produced IDs with the reference file when it is available.
if not os.path.exists(sample_sub_csv):
    print("sample_submission.csv not found; skipping ID comparison")
else:
    sample_sub = pd.read_csv(sample_sub_csv)
    sample_ids = set(sample_sub["sample_id"])
    submission_ids = set(submission_df["sample_id"])
    missing_ids = sample_ids.difference(submission_ids)
    extra_ids = submission_ids.difference(sample_ids)
    print(f"Missing IDs vs sample_submission: {len(missing_ids)}")
    print(f"Extra IDs vs sample_submission: {len(extra_ids)}")

print(f"Artifacts saved under: {RUN_DIR}")
print(f"Primary submission file: {SUBMISSION_PATH}")
print(f"Working submission copy: {WORKING_SUBMISSION}")
