# Cassava Leaf Disease - Plan

Objectives:
- Win a medal (target accuracy ≥ 0.898 on LB).

Initial Plan:
1) Environment check (a GPU is a must).
2) Data audit: train.csv, label map, image counts; verify classes.
3) Validation: StratifiedKFold (5 folds), fixed seed, deterministic, save OOF.
4) Baseline model (fast): timm pretrained CNN (e.g., tf_efficientnet_b3_ns or convnext_small), img_size≈384, CE loss with label smoothing, basic augs, cosine LR, 1–2 epochs smoke, then full.
5) Improve: higher resolutions (448/512), stronger augs (RandAugment/AutoAugment + CutMix/MixUp), EMA, AMP, balanced sampler.
6) Ensembling: 2–3 diverse backbones + seeds; weighted average of logits.
7) Error analysis from OOF by class; tweak augs/resize.

Milestones (request expert review at each):
- After plan (now), after data audit/EDA, after baseline OOF, after improved model, before long trainings, before blending.

Submission:
- Predict test_images, save submission.csv with columns: image_id, label.

Timing:
- Start with smoke (1 epoch, 1 fold) to validate pipeline; then 5-fold full.

Notes:
- Use AMP and cudnn.benchmark for speed, with a fixed seed (note: cudnn.benchmark autotuning can make runs not bit-for-bit reproducible).
- Log times and per-epoch metrics; save best per fold.
- Cache test preds/logits for blends.

In [1]:
# Environment check, GPU, and data audit
import os, sys, json, time, shutil, subprocess
from pathlib import Path
import pandas as pd

def run(cmd):
    """Echo a command, execute it, and return everything it printed.

    Args:
        cmd: Sequence of strings forming the argv of the process to start.

    Returns:
        The captured text output, with stderr merged into stdout.
    """
    echoed = " ".join(cmd)
    print("$", echoed, flush=True)
    completed = subprocess.run(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,  # fold stderr into the captured stdout stream
        text=True,
    )
    return completed.stdout

start_ts = time.time()
print("=== NVIDIA SMI ===", flush=True)
# `|| true` keeps the shell command from failing when nvidia-smi is unavailable.
print(run(['bash','-lc','nvidia-smi || true']))

# Try import torch; if missing or wrong CUDA build, install the cu121 stack
need_install = False
try:
    import torch, torchvision, torchaudio  # noqa
    from importlib.metadata import version
    torch_cuda = getattr(torch.version, 'cuda', None)
    has_cuda = torch.cuda.is_available()
    print(f"torch: {torch.__version__}, built CUDA: {torch_cuda}, CUDA available: {has_cuda}")
    if not (torch_cuda and str(torch_cuda).startswith('12.1') and has_cuda):
        need_install = True
except Exception as e:
    print("Torch import failed:", e)
    need_install = True

# If a torch module is already loaded in this process, `import torch` after a
# reinstall returns that same (stale) module object; remember this so the
# post-install checks below can point at the real cause.
torch_preloaded = 'torch' in sys.modules

if need_install:
    print("Installing PyTorch cu121 stack...", flush=True)
    # Uninstall any existing
    subprocess.run([sys.executable, '-m', 'pip', 'uninstall', '-y', 'torch', 'torchvision', 'torchaudio'], check=False)
    # Clean potential shadow dirs
    for d in (
        "/app/.pip-target/torch",
        "/app/.pip-target/torchvision",
        "/app/.pip-target/torchaudio",
        "/app/.pip-target/torchgen",
        "/app/.pip-target/functorch",
    ):
        if os.path.exists(d):
            print("Removing", d); shutil.rmtree(d, ignore_errors=True)
    # Install exact versions for CUDA 12.1
    subprocess.run([sys.executable, '-m', 'pip', 'install',
                    '--index-url', 'https://download.pytorch.org/whl/cu121',
                    '--extra-index-url', 'https://pypi.org/simple',
                    'torch==2.4.1', 'torchvision==0.19.1', 'torchaudio==2.4.1'], check=True)
    # Pin file for later pip invocations that want to keep this torch stack.
    Path('constraints.txt').write_text("torch==2.4.1\ntorchvision==0.19.1\ntorchaudio==2.4.1\n")
    # Packages installed while the interpreter is running may be hidden by the
    # import system's cached directory listings; drop those caches first.
    import importlib
    importlib.invalidate_caches()
    import torch
    print("torch:", torch.__version__, "built CUDA:", getattr(torch.version, 'cuda', None))
    print("CUDA available:", torch.cuda.is_available())
    restart_hint = (
        " (torch was already imported in this kernel before the reinstall; "
        "restart the kernel so the new build is loaded)"
    ) if torch_preloaded else ""
    assert str(getattr(torch.version,'cuda','')).startswith('12.1'), f"Wrong CUDA build: {torch.version.cuda}{restart_hint}"
    assert torch.cuda.is_available(), "CUDA not available" + restart_hint
    print("GPU:", torch.cuda.get_device_name(0))
else:
    import torch
    if torch.cuda.is_available():
        print("GPU:", torch.cuda.get_device_name(0))

print("=== Paths ===")
CWD = Path.cwd()
DATA_DIR = CWD  # the data files are looked up directly in the working directory
train_csv_path, sample_sub_path, label_map_path = (
    DATA_DIR / fname
    for fname in ('train.csv', 'sample_submission.csv', 'label_num_to_disease_map.json')
)
train_img_dir, test_img_dir = DATA_DIR / 'train_images', DATA_DIR / 'test_images'

print("CWD:", CWD)
for caption, target in (
    ("Train CSV exists:", train_csv_path),
    ("Sample submission exists:", sample_sub_path),
    ("Label map exists:", label_map_path),
):
    print(caption, target.exists())
for caption, img_dir in (
    ("Train images dir:", train_img_dir),
    ("Test images dir:", test_img_dir),
):
    print(caption, img_dir.exists(), "count:", sum(1 for _ in img_dir.glob('*.jpg')))

# Training table: shape, a peek at the first rows, and the class balance.
df = pd.read_csv(train_csv_path)
print("Train shape:", df.shape)
print(df.head(3))
print("Label value counts:\n", df['label'].value_counts())

label_map = json.loads(label_map_path.read_text())
print("Label map:", label_map)

ss = pd.read_csv(sample_sub_path)
print("Sample submission shape:", ss.shape)
print(ss.head(3))

# Sanity: every image_id listed in train.csv must exist on disk.
missing_train = [name for name in df['image_id'] if not (train_img_dir / name).exists()]
print("Missing train images:", len(missing_train))
if missing_train:
    print("Example missing:", missing_train[:5])

# Sanity: the test directory and the sample submission must list the same files.
test_ids = {entry.name for entry in test_img_dir.glob('*.jpg')}
ss_ids = set(ss['image_id'].astype(str))
missing_in_fs = [name for name in ss_ids if name not in test_ids]
extra_in_fs = [name for name in test_ids if name not in ss_ids]
print(f"SampleSub ids not in test dir: {len(missing_in_fs)} | test files not in sampleSub: {len(extra_in_fs)}")

elapsed = time.time() - start_ts
print(f"Setup and audit done in {elapsed:.1f}s")

=== NVIDIA SMI ===


$ bash -lc nvidia-smi || true


Sun Sep 28 19:10:10 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.144.06             Driver Version: 550.144.06     CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A10-24Q                 On  |   00000002:00:00.0 Off |                    0 |
| N/A   N/A    P0             N/A /  N/A  |     182MiB /  24512MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                



Looking in indexes: https://download.pytorch.org/whl/cu121, https://pypi.org/simple


Collecting torch==2.4.1
  Downloading https://download.pytorch.org/whl/cu121/torch-2.4.1%2Bcu121-cp311-cp311-linux_x86_64.whl (799.0 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 799.0/799.0 MB 553.3 MB/s eta 0:00:00


Collecting torchaudio==2.4.1
  Downloading https://download.pytorch.org/whl/cu121/torchaudio-2.4.1%2Bcu121-cp311-cp311-linux_x86_64.whl (3.4 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3.4/3.4 MB 450.6 MB/s eta 0:00:00


Collecting nvidia-cudnn-cu12==9.1.0.70
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl (664.8 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 664.8/664.8 MB 266.1 MB/s eta 0:00:00


Collecting nvidia-cublas-cu12==12.1.3.1
  Downloading nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 410.6/410.6 MB 197.0 MB/s eta 0:00:00


Collecting triton==3.0.0
  Downloading triton-3.0.0-1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (209.4 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 209.4/209.4 MB 161.5 MB/s eta 0:00:00


Collecting nvidia-cusolver-cu12==11.4.5.107
  Downloading nvidia_cusolver_cu12-11.4.5.107-py3-none-manylinux1_x86_64.whl (124.2 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 124.2/124.2 MB 257.4 MB/s eta 0:00:00


Collecting nvidia-cuda-cupti-cu12==12.1.105
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 14.1/14.1 MB 237.5 MB/s eta 0:00:00


Collecting filelock
  Downloading filelock-3.19.1-py3-none-any.whl (15 kB)
Collecting sympy
  Downloading sympy-1.14.0-py3-none-any.whl (6.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 6.3/6.3 MB 301.0 MB/s eta 0:00:00


Collecting typing-extensions>=4.8.0
  Downloading typing_extensions-4.15.0-py3-none-any.whl (44 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 44.6/44.6 KB 334.9 MB/s eta 0:00:00


Collecting jinja2
  Downloading jinja2-3.1.6-py3-none-any.whl (134 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 134.9/134.9 KB 509.3 MB/s eta 0:00:00


Collecting nvidia-curand-cu12==10.3.2.106
  Downloading nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 56.5/56.5 MB 231.1 MB/s eta 0:00:00


Collecting nvidia-cusparse-cu12==12.1.0.106
  Downloading nvidia_cusparse_cu12-12.1.0.106-py3-none-manylinux1_x86_64.whl (196.0 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 196.0/196.0 MB 267.7 MB/s eta 0:00:00


Collecting nvidia-cuda-nvrtc-cu12==12.1.105
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 23.7/23.7 MB 191.9 MB/s eta 0:00:00


Collecting networkx
  Downloading networkx-3.5-py3-none-any.whl (2.0 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.0/2.0 MB 498.9 MB/s eta 0:00:00
Collecting fsspec
  Downloading fsspec-2025.9.0-py3-none-any.whl (199 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 199.3/199.3 KB 525.9 MB/s eta 0:00:00


Collecting nvidia-nccl-cu12==2.20.5
  Downloading nvidia_nccl_cu12-2.20.5-py3-none-manylinux2014_x86_64.whl (176.2 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 176.2/176.2 MB 93.0 MB/s eta 0:00:00


Collecting nvidia-cufft-cu12==11.0.2.54
  Downloading nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 121.6/121.6 MB 198.0 MB/s eta 0:00:00


Collecting nvidia-nvtx-cu12==12.1.105
  Downloading nvidia_nvtx_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (99 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 99.1/99.1 KB 454.8 MB/s eta 0:00:00
Collecting nvidia-cuda-runtime-cu12==12.1.105
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 823.6/823.6 KB 509.7 MB/s eta 0:00:00


Collecting pillow!=8.3.*,>=5.3.0
  Downloading pillow-11.3.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (6.6 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 6.6/6.6 MB 122.8 MB/s eta 0:00:00


Collecting numpy
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 18.3/18.3 MB 205.7 MB/s eta 0:00:00


Collecting nvidia-nvjitlink-cu12
  Downloading nvidia_nvjitlink_cu12-12.9.86-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl (39.7 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 39.7/39.7 MB 289.9 MB/s eta 0:00:00


Collecting MarkupSafe>=2.0
  Downloading markupsafe-3.0.3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (22 kB)


Collecting mpmath<1.4,>=1.1.0
  Downloading mpmath-1.3.0-py3-none-any.whl (536 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 536.2/536.2 KB 202.7 MB/s eta 0:00:00


Installing collected packages: mpmath, typing-extensions, sympy, pillow, nvidia-nvtx-cu12, nvidia-nvjitlink-cu12, nvidia-nccl-cu12, nvidia-curand-cu12, nvidia-cufft-cu12, nvidia-cuda-runtime-cu12, nvidia-cuda-nvrtc-cu12, nvidia-cuda-cupti-cu12, nvidia-cublas-cu12, numpy, networkx, MarkupSafe, fsspec, filelock, triton, nvidia-cusparse-cu12, nvidia-cudnn-cu12, jinja2, nvidia-cusolver-cu12, torch, torchvision, torchaudio


Successfully installed MarkupSafe-3.0.3 filelock-3.19.1 fsspec-2025.9.0 jinja2-3.1.6 mpmath-1.3.0 networkx-3.5 numpy-1.26.4 nvidia-cublas-cu12-12.1.3.1 nvidia-cuda-cupti-cu12-12.1.105 nvidia-cuda-nvrtc-cu12-12.1.105 nvidia-cuda-runtime-cu12-12.1.105 nvidia-cudnn-cu12-9.1.0.70 nvidia-cufft-cu12-11.0.2.54 nvidia-curand-cu12-10.3.2.106 nvidia-cusolver-cu12-11.4.5.107 nvidia-cusparse-cu12-12.1.0.106 nvidia-nccl-cu12-2.20.5 nvidia-nvjitlink-cu12-12.9.86 nvidia-nvtx-cu12-12.1.105 pillow-11.3.0 sympy-1.14.0 torch-2.4.1+cu121 torchaudio-2.4.1+cu121 torchvision-0.19.1+cu121 triton-3.0.0 typing-extensions-4.15.0


torch: 2.4.1+cu121 built CUDA: 12.1
CUDA available: True
GPU: NVIDIA A10-24Q
=== Paths ===
CWD: /var/lib/simon/agent_run_states/cassava-leaf-disease-classification-20250928-185924
Train CSV exists: True
Sample submission exists: True
Label map exists: True
Train images dir: True count: 18721
Test images dir: True count: 2676
Train shape: (18721, 2)
         image_id  label
0  1000015157.jpg      0
1  1000201771.jpg      3
2   100042118.jpg      1
Label value counts:
 label
3    11523
4     2267
2     2091
1     1901
0      939
Name: count, dtype: int64
Label map: {'0': 'Cassava Bacterial Blight (CBB)', '1': 'Cassava Brown Streak Disease (CBSD)', '2': 'Cassava Green Mottle (CGM)', '3': 'Cassava Mosaic Disease (CMD)', '4': 'Healthy'}
Sample submission shape: (2676, 2)
         image_id  label
0  1234294272.jpg      4
1  1234332763.jpg      4
2  1234375577.jpg      4


Missing train images: 0
SampleSub ids not in test dir: 0 | test files not in sampleSub: 0
Setup and audit done in 66.3s


In [3]:
# Dependencies, folds, and dataset setup (baseline)
import sys, subprocess, os, random
import numpy as np
import torch
from pathlib import Path
import pandas as pd

# Install missing deps: timm, albumentations, albucore, opencv (without pulling torch deps)
def pip_install_pkgs():
    """Install whichever of timm / albumentations / albucore / OpenCV fail to import.

    Each module is probed with an import attempt; any exception counts as
    "missing". All missing requirements are installed with a single
    `pip install --no-deps` call (so pip does not touch torch). Prints
    'All deps present.' and does nothing else when every probe succeeds.

    Raises:
        subprocess.CalledProcessError: if the pip command exits non-zero.
    """
    # (importable module name, pip requirement), in check/install order.
    probes = (
        ('timm', 'timm==0.9.16'),
        ('albumentations', 'albumentations==1.4.14'),
        ('albucore', 'albucore==0.0.33'),  # ensure albucore for recent albumentations
        ('cv2', 'opencv-python-headless'),
    )
    need = []
    for module_name, requirement in probes:
        try:
            __import__(module_name)
        except Exception:
            need.append(requirement)

    if not need:
        print('All deps present.')
        return

    print('Installing (no-deps to avoid torch re-install):', need, flush=True)
    cmd = [sys.executable, '-m', 'pip', 'install', '--no-deps']
    cmd.extend(need)
    cmd += ['--upgrade-strategy', 'only-if-needed']
    subprocess.run(cmd, check=True)

# Install any missing packages first: the imports below need them to be present.
pip_install_pkgs()

# Third-party imports that may only exist after the install step above.
import cv2
import timm
import albumentations as A
from albumentations.pytorch import ToTensorV2
from sklearn.model_selection import StratifiedKFold
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.backends.cudnn as cudnn

# Log the resolved versions; getattr falls back to 'n/a' if a package has no __version__.
print('Versions -> torch:', torch.__version__, 'timm:', getattr(timm, '__version__', 'n/a'), 'albumentations:', getattr(A, '__version__', 'n/a'), 'cv2:', cv2.__version__)

# Seed and perf setup
SEED = 42
def seed_everything(seed=SEED):
    """Seed the Python, NumPy and PyTorch (CPU and all CUDA devices) RNGs.

    Also sets the cuDNN flags used by this notebook: `benchmark` is switched
    on and `deterministic` is switched off.

    Args:
        seed: Integer seed applied to every generator.
    """
    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all)
    for apply_seed in seeders:
        apply_seed(seed)
    cudnn.benchmark = True
    cudnn.deterministic = False
seed_everything(SEED)

# Paths
CWD = Path.cwd()
train_csv_path = CWD / 'train.csv'
train_img_dir, test_img_dir = (CWD / sub for sub in ('train_images', 'test_images'))

# Create stratified folds (skipped when train.csv already carries a 'fold' column)
df = pd.read_csv(train_csv_path)
if 'fold' in df.columns:
    fold_msg = 'Found existing fold column; saved folds.csv'
else:
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
    df['fold'] = -1
    # Each row is tagged with the index of the split in which it is a validation row.
    for fold_id, (_, val_idx) in enumerate(skf.split(df, df['label'])):
        df.loc[val_idx, 'fold'] = fold_id
    fold_msg = 'Saved folds.csv with 5 folds'
# Both branches persist the frame before reporting.
df.to_csv('folds.csv', index=False)
print(fold_msg)

# Transforms
IMG_SIZE = 384
# Per-channel normalisation statistics.
# NOTE(review): presumably the ImageNet statistics expected by the pretrained
# backbone; confirm against the timm model's default config.
mean = (0.485, 0.456, 0.406)
std = (0.229, 0.224, 0.225)
# Training: random near-square crop resized to IMG_SIZE, light flips and colour jitter.
train_tfms = A.Compose([
    A.RandomResizedCrop(IMG_SIZE, IMG_SIZE, scale=(0.8, 1.0), ratio=(0.9, 1.1), p=1.0),
    A.HorizontalFlip(p=0.5),
    A.VerticalFlip(p=0.2),
    A.ColorJitter(brightness=0.15, contrast=0.15, saturation=0.15, hue=0.02, p=0.5),
    A.Normalize(mean=mean, std=std),
    ToTensorV2(),
])
# Validation/test: resize the shorter side to IMG_SIZE (aspect ratio preserved)
# and take the central square. A direct square Resize would squash non-square
# images and make the CenterCrop that follows a no-op.
valid_tfms = A.Compose([
    A.SmallestMaxSize(max_size=IMG_SIZE),
    A.CenterCrop(IMG_SIZE, IMG_SIZE),
    A.Normalize(mean=mean, std=std),
    ToTensorV2(),
])

class CassavaDS(Dataset):
    """Image dataset backed by a DataFrame with an `image_id` column.

    Images are read with OpenCV, converted from BGR to RGB and passed to an
    albumentations-style transform (called as `transform(image=...)`).

    Args:
        df: Frame with an `image_id` column and, optionally, a `label` column.
        img_dir: Directory that contains the image files.
        transform: Callable returning a dict with an 'image' key, or a falsy
            value to return the raw RGB array.

    Items are `(image, int_label)` when `df` has a `label` column, otherwise
    `(image, image_id)`.
    """
    def __init__(self, df, img_dir, transform):
        self.df = df.reset_index(drop=True)
        self.img_dir = Path(img_dir)
        self.transform = transform
        self.has_labels = 'label' in df.columns
    def __len__(self):
        return len(self.df)
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        img_path = self.img_dir / row['image_id']
        img = cv2.imread(str(img_path))
        # cv2.imread signals an unreadable or missing file by returning None
        # instead of raising; fail here with the offending path.
        if img is None:
            raise FileNotFoundError(f"Could not read image: {img_path}")
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        if self.transform:
            img = self.transform(image=img)['image']
        if self.has_labels:
            label = int(row['label'])
            return img, label
        else:
            return img, row['image_id']

def get_model(num_classes=5):
    """Create the timm `tf_efficientnet_b3_ns` model with pretrained weights.

    Args:
        num_classes: Forwarded to `timm.create_model` as `num_classes`.

    Returns:
        The model object returned by timm.
    """
    return timm.create_model(
        'tf_efficientnet_b3_ns',
        pretrained=True,
        num_classes=num_classes,
    )

print('Setup complete. Ready to train a smoke 1-fold run next.')

KernelDied: Kernel died unexpectedly.

In [7]:
# Smoke training loop (self-contained, uses torchvision transforms to avoid albumentations deps) + submission
import time
import torch
import pandas as pd
from pathlib import Path
from PIL import Image
from sklearn.model_selection import StratifiedKFold
from tqdm.auto import tqdm
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR
import torch.nn as nn
from torch.utils.data import DataLoader
from torchvision import transforms as T
import timm

# Ensure df, paths, and transforms exist
CWD = Path.cwd()
# A `df` left behind by an earlier cell may be the raw train.csv frame without
# a 'fold' column, which the fold-based loaders cannot use. Check for the
# column itself, not just for the name.
if 'df' not in globals() or 'fold' not in df.columns:
    if (CWD / 'folds.csv').exists():
        df = pd.read_csv(CWD / 'folds.csv')
    else:
        df = pd.read_csv(CWD / 'train.csv')
        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        df['fold'] = -1
        # Tag each row with the index of the split in which it is a validation row.
        for i, (_, val_idx) in enumerate(skf.split(df, df['label'])):
            df.loc[val_idx, 'fold'] = i
        df.to_csv('folds.csv', index=False)

# Reuse image directories defined by an earlier cell, else default to the CWD layout.
if 'train_img_dir' not in globals():
    train_img_dir = CWD / 'train_images'
if 'test_img_dir' not in globals():
    test_img_dir = CWD / 'test_images'

IMG_SIZE = 384
mean, std = (0.485, 0.456, 0.406), (0.229, 0.224, 0.225)

def _to_normalized_tensor():
    """Return a fresh [ToTensor, Normalize] tail used at the end of both pipelines."""
    return [T.ToTensor(), T.Normalize(mean=mean, std=std)]

# Training pipeline: random resized crop, flips and colour jitter on the PIL image.
train_tfms = T.Compose([
    T.RandomResizedCrop(IMG_SIZE, scale=(0.8, 1.0), ratio=(0.9, 1.1)),
    T.RandomHorizontalFlip(p=0.5),
    T.RandomVerticalFlip(p=0.2),
    T.ColorJitter(brightness=0.15, contrast=0.15, saturation=0.15, hue=0.02),
    *_to_normalized_tensor(),
])
# Validation/test pipeline: resize, then centre crop to IMG_SIZE.
valid_tfms = T.Compose([
    T.Resize(IMG_SIZE),
    T.CenterCrop(IMG_SIZE),
    *_to_normalized_tensor(),
])

class CassavaDS(torch.utils.data.Dataset):
    """Dataset over a DataFrame of `image_id` rows, loading images with PIL.

    Args:
        df: Frame with an `image_id` column and, optionally, a `label` column.
        img_dir: Directory that contains the image files.
        transform: Callable applied to the RGB PIL image; skipped when falsy.

    Items are `(image, int_label)` when `df` has a `label` column, otherwise
    `(image, image_id)`.
    """

    def __init__(self, df, img_dir, transform):
        self.img_dir = Path(img_dir)
        self.transform = transform
        self.has_labels = 'label' in df.columns
        self.df = df.reset_index(drop=True)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        record = self.df.iloc[idx]
        image = Image.open(self.img_dir / record['image_id']).convert('RGB')
        if self.transform:
            image = self.transform(image)
        # Second element is the class id when labels exist, else the file name.
        target = int(record['label']) if self.has_labels else record['image_id']
        return image, target

def get_model(num_classes=5):
    """Create the timm `tf_efficientnet_b3_ns` model with pretrained weights.

    Args:
        num_classes: Forwarded to `timm.create_model` as `num_classes`.
    """
    backbone = 'tf_efficientnet_b3_ns'
    net = timm.create_model(backbone, pretrained=True, num_classes=num_classes)
    return net

def get_loaders(df, fold, batch_size=16, num_workers=2):
    """Split `df` on its `fold` column and wrap both parts in DataLoaders.

    Rows with `fold == fold` form the validation set and all other rows the
    training set. Uses the module-level `train_img_dir`, `train_tfms` and
    `valid_tfms`.

    Returns:
        (train_loader, valid_loader, train_frame, valid_frame). The training
        loader shuffles and drops the last incomplete batch; the validation
        loader does neither.
    """
    in_val = df.fold == fold
    trn_df = df[~in_val].reset_index(drop=True)
    val_df = df[in_val].reset_index(drop=True)

    # Options shared by both loaders.
    common = dict(batch_size=batch_size, num_workers=num_workers, pin_memory=True)
    trn_loader = DataLoader(
        CassavaDS(trn_df, train_img_dir, train_tfms),
        shuffle=True,
        drop_last=True,
        **common,
    )
    val_loader = DataLoader(
        CassavaDS(val_df, train_img_dir, valid_tfms),
        shuffle=False,
        **common,
    )
    return trn_loader, val_loader, trn_df, val_df

def accuracy(outputs, targets):
    """Return the fraction of rows whose arg-max over dim 1 equals the target.

    Args:
        outputs: Score tensor; predictions are taken with `argmax(dim=1)`.
        targets: Tensor of class indices compared element-wise to the predictions.

    Returns:
        The mean match rate as a Python float.
    """
    hits = torch.eq(outputs.argmax(dim=1), targets)
    return hits.float().mean().item()

def train_one_fold(fold=0, epochs=2, lr=2e-4, wd=1e-5):
    """Train one cross-validation fold and checkpoint the best epoch.

    Uses the module-level `df` (which must have a `fold` column) through
    `get_loaders`. Trains with AdamW, per-epoch cosine annealing,
    label-smoothed cross-entropy, and mixed precision when CUDA is available.

    Args:
        fold: Fold id held out for validation.
        epochs: Number of training epochs.
        lr: Initial learning rate for AdamW.
        wd: Weight decay for AdamW.

    Returns:
        (best_path, best_acc): checkpoint file name `model_fold{fold}.pt` and
        the best validation accuracy seen. The file is only written once an
        epoch achieves a validation accuracy above 0.
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    use_amp = device == 'cuda'
    trn_loader, val_loader, _, _ = get_loaders(df, fold)
    model = get_model(num_classes=5).to(device)
    scaler = torch.amp.GradScaler(device, enabled=use_amp)
    criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
    optimizer = AdamW(model.parameters(), lr=lr, weight_decay=wd)
    scheduler = CosineAnnealingLR(optimizer, T_max=epochs, eta_min=1e-6)
    best_acc = 0.0
    best_path = f'model_fold{fold}.pt'
    t0 = time.time()
    for epoch in range(epochs):
        model.train()
        # Sums are sample-weighted so a batch of a different size cannot skew the mean.
        trn_loss_sum = 0.0; trn_correct = 0; trn_seen = 0
        ep_start = time.time()
        for xb, yb in tqdm(trn_loader, desc=f'Fold {fold} Epoch {epoch+1}/{epochs} train'):
            xb = xb.to(device, non_blocking=True)
            # The collated labels are normally already a tensor; as_tensor
            # avoids the copy (and warning) that torch.tensor() would cause.
            yb = torch.as_tensor(yb).to(device, non_blocking=True)
            optimizer.zero_grad(set_to_none=True)
            with torch.autocast(device_type=device, enabled=use_amp):
                out = model(xb)
                loss = criterion(out, yb)
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            bs = yb.size(0)
            trn_loss_sum += loss.item() * bs
            trn_correct += (out.detach().argmax(dim=1) == yb).sum().item()
            trn_seen += bs
        scheduler.step()
        trn_loss = trn_loss_sum / max(1, trn_seen); trn_acc = trn_correct / max(1, trn_seen)

        model.eval()
        val_loss_sum = 0.0; val_correct = 0; val_seen = 0
        with torch.no_grad():
            for xb, yb in tqdm(val_loader, desc=f'Fold {fold} Epoch {epoch+1}/{epochs} valid'):
                xb = xb.to(device, non_blocking=True)
                yb = torch.as_tensor(yb).to(device, non_blocking=True)
                with torch.autocast(device_type=device, enabled=use_amp):
                    out = model(xb)
                    loss = criterion(out, yb)
                bs = yb.size(0)
                # The last validation batch is usually smaller; weight by its size.
                val_loss_sum += loss.item() * bs
                val_correct += (out.argmax(dim=1) == yb).sum().item()
                val_seen += bs
        val_loss = val_loss_sum / max(1, val_seen); val_acc = val_correct / max(1, val_seen)
        print(f"Fold {fold} Epoch {epoch+1}: trn_loss {trn_loss:.4f} trn_acc {trn_acc:.4f} | val_loss {val_loss:.4f} val_acc {val_acc:.4f} | epoch_time {time.time()-ep_start:.1f}s", flush=True)
        if val_acc > best_acc:
            best_acc = val_acc
            torch.save({'model': model.state_dict(), 'acc': best_acc}, best_path)
            print(f"Saved best to {best_path} (val_acc={best_acc:.4f})", flush=True)
    print(f"Fold {fold} done in {time.time()-t0:.1f}s, best_acc={best_acc:.4f}")
    return best_path, best_acc

def infer_test(model_path, tta_hflip=True, batch_size=64):
    """Predict the test set with a saved checkpoint and write `submission.csv`.

    Reads the image ids from `sample_submission.csv`, loads the images from
    the module-level `test_img_dir` with `valid_tfms`, and writes the arg-max
    class per image to `submission.csv` in the working directory.

    Args:
        model_path: Checkpoint holding the state dict under the 'model' key.
        tta_hflip: If True, average the logits of each batch with the logits
            of its horizontally flipped copy.
        batch_size: Inference batch size.
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    use_amp = device == 'cuda'
    ss = pd.read_csv('sample_submission.csv')
    test_df = ss[['image_id']].copy()
    test_ds = CassavaDS(test_df, test_img_dir, valid_tfms)
    test_loader = DataLoader(test_ds, batch_size=batch_size, shuffle=False, num_workers=2, pin_memory=True)
    model = get_model(num_classes=5).to(device)
    # The checkpoint holds only tensors and a float, so the restricted
    # unpickler (weights_only=True) is sufficient and safer.
    ckpt = torch.load(model_path, map_location=device, weights_only=True)
    model.load_state_dict(ckpt['model'], strict=True)
    model.eval()
    logits_chunks = []
    progress_desc = 'Test infer (with hflip TTA)' if tta_hflip else 'Test infer pass 1'
    with torch.no_grad():
        # Both views are evaluated in one pass so each image is read and
        # decoded only once.
        for xb, _ in tqdm(test_loader, desc=progress_desc):
            xb = xb.to(device, non_blocking=True)
            with torch.autocast(device_type=device, enabled=use_amp):
                out = model(xb)
                # dims=[3] is the last axis of the image batch (horizontal flip).
                out_flip = model(torch.flip(xb, dims=[3])) if tta_hflip else None
            out = out.float()
            if out_flip is not None:
                out = (out + out_flip.float()) / 2.0
            logits_chunks.append(out.cpu())
    logits = torch.cat(logits_chunks, dim=0)
    preds = logits.argmax(dim=1).numpy()
    sub = pd.DataFrame({'image_id': test_df['image_id'], 'label': preds})
    sub.to_csv('submission.csv', index=False)
    print('Saved submission.csv with shape', sub.shape)

# Run smoke: train fold 0 for 2 epochs and infer
smoke_kwargs = dict(fold=0, epochs=2, lr=2e-4, wd=1e-5)
best_path, best_acc = train_one_fold(**smoke_kwargs)
# Predict the test set from the best checkpoint of that fold (with hflip TTA).
infer_test(best_path, tta_hflip=True)
print('Smoke run complete. Best val_acc:', best_acc)

KernelDied: Kernel died unexpectedly.

In [8]:
# Simple baseline: majority-class submission (sanity check, no GPU needed)
import pandas as pd
from collections import Counter

train_df = pd.read_csv('train.csv')
ss = pd.read_csv('sample_submission.csv')

# Most frequent training label; most_common(1) yields a single (label, count) pair.
label_counts = Counter(train_df['label'])
(majority_label, _majority_count), = label_counts.most_common(1)
print('Majority label:', majority_label)

# Same rows as the sample submission, with every label set to the majority class.
sub = ss.assign(label=majority_label)
sub.to_csv('submission.csv', index=False)
label_dist = sub['label'].value_counts().to_dict()
print('Wrote submission.csv shape:', sub.shape, 'label distribution:', label_dist)

Majority label: 3
Wrote submission.csv shape: (2676, 2) label distribution: {3: 2676}


In [9]:
# Copy fine-tuned submission to canonical filename for grading
import shutil, os, time
# NOTE(review): no cell visible here writes submission_ft512.csv; presumably it
# comes from a run outside this view. Confirm before relying on this cell.
src, dst = 'submission_ft512.csv', 'submission.csv'
assert os.path.exists(src), f"Missing {src}"
shutil.copyfile(src, dst)
copied_at = time.strftime('%Y-%m-%d %H:%M:%S')
print('Copied', src, '->', dst, 'at', copied_at)

Copied submission_ft512.csv -> submission.csv at 2025-09-28 23:09:24
