# iNaturalist 2019 FGVC6 - Plan

Goal: Ship a strong, GPU-accelerated baseline fast; iterate to medal.

Plan:
- Environment check: verify GPU and install correct CUDA 12.1 PyTorch stack.
- Data audit: inspect train/val/test JSONs; confirm classes and image paths.
- Data extraction: untar images into a structured directory if needed.
- Validation: Stratified K-Fold on training (or train+val) respecting categories.
- Baseline model: timm pretrained ConvNeXt/ResNet at 224px; mixed precision; strong aug.
- Training loop: proper logging, early stopping, save OOF and test logits.
- Submission: top-1 predictions for test; format: image_id, category_id.
- Iterate: improve resolution/architectures, aug, label-smoothing, EMA; ensemble if time.

Checkpoints for expert review:
- After environment + data audit
- After baseline CV setup
- After first trained baseline + LB result
- Before heavy training runs / ensembling

In [1]:
# Environment check, Torch install (cu121), and data audit
import os, sys, json, time, shutil, subprocess, tarfile
from pathlib import Path

def run(cmd):
    return subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True).stdout

print('== nvidia-smi ==', flush=True)
print(run(['bash','-lc','nvidia-smi || true']))

# Install exact cu121 torch stack if missing or wrong build
def ensure_torch_cu121():
    try:
        import torch
        ok = str(getattr(torch.version,'cuda','')).startswith('12.1')
        if not ok:
            raise ImportError('Wrong CUDA build')
        print('Torch present:', torch.__version__, 'CUDA build:', torch.version.cuda, 'CUDA avail:', torch.cuda.is_available(), flush=True)
        if not torch.cuda.is_available():
            raise ImportError('CUDA not available at runtime')
        return
    except Exception as e:
        print('Reinstalling torch stack due to:', e, flush=True)
        # Uninstall possibly wrong stacks
        for pkg in ('torch','torchvision','torchaudio'):
            subprocess.run([sys.executable,'-m','pip','uninstall','-y',pkg], check=False)
        # Clean stray site dirs that can shadow wheels
        for d in (
            '/app/.pip-target/torch','/app/.pip-target/torchvision','/app/.pip-target/torchaudio',
            '/app/.pip-target/torch-2.8.0.dist-info','/app/.pip-target/torch-2.4.1.dist-info',
            '/app/.pip-target/torchvision-0.23.0.dist-info','/app/.pip-target/torchvision-0.19.1.dist-info',
            '/app/.pip-target/torchaudio-2.8.0.dist-info','/app/.pip-target/torchaudio-2.4.1.dist-info',
            '/app/.pip-target/torchgen','/app/.pip-target/functorch',
        ):
            if os.path.exists(d):
                shutil.rmtree(d, ignore_errors=True)
        # Install
        cmd = [sys.executable,'-m','pip','install','--index-url','https://download.pytorch.org/whl/cu121','--extra-index-url','https://pypi.org/simple','torch==2.4.1','torchvision==0.19.1','torchaudio==2.4.1']
        print('>', ' '.join(cmd), flush=True)
        subprocess.run(cmd, check=True)
        import torch
        print('torch:', torch.__version__, 'built CUDA:', getattr(torch.version,'cuda',None), flush=True)
        print('CUDA available:', torch.cuda.is_available(), flush=True)
        assert str(getattr(torch.version,'cuda','')).startswith('12.1')
        assert torch.cuda.is_available(), 'CUDA not available after install'
        print('GPU:', torch.cuda.get_device_name(0), flush=True)

ensure_torch_cu121()

print('== Repo listing ==', flush=True)
for p in sorted(Path('.').glob('*')):
    sz = p.stat().st_size
    print(f'{p.name}\t{sz/1e6:.2f} MB')

# Load JSONs
def load_json(fp):
    with open(fp,'r') as f:
        return json.load(f)

train_js = load_json('train2019.json')
val_js = load_json('val2019.json')
test_js = load_json('test2019.json')

def summarize(js, name):
    imgs = js.get('images', [])
    anns = js.get('annotations', [])
    cats = js.get('categories', [])
    print(f'-- {name} -- images: {len(imgs)}, annotations: {len(anns)}, categories: {len(cats)}', flush=True)
    if imgs:
        print('sample image:', imgs[0])
    if anns:
        print('sample ann:', anns[0])
    if cats:
        print('sample cat:', cats[0])

summarize(train_js, 'train')
summarize(val_js, 'val')
summarize(test_js, 'test')

# Peek into tar files
def peek_tar(fp, n=5):
    print(f'-- Peek {fp} --', flush=True)
    with tarfile.open(fp, 'r:gz') as tf:
        names = [m.name for m in tf.getmembers() if m.isfile()]
        print('files:', len(names))
        for x in names[:n]:
            print(' ', x)

peek_tar('train_val2019.tar.gz', 5)
peek_tar('test2019.tar.gz', 5)

print('ENV & data audit complete.', flush=True)

== nvidia-smi ==


Sun Sep 28 22:59:24 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.144.06             Driver Version: 550.144.06     CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A10-24Q                 On  |   00000002:00:00.0 Off |                    0 |
| N/A   N/A    P0             N/A /  N/A  |     182MiB /  24512MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                





> /usr/bin/python3.11 -m pip install --index-url https://download.pytorch.org/whl/cu121 --extra-index-url https://pypi.org/simple torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1




Looking in indexes: https://download.pytorch.org/whl/cu121, https://pypi.org/simple


Collecting torch==2.4.1
  Downloading https://download.pytorch.org/whl/cu121/torch-2.4.1%2Bcu121-cp311-cp311-linux_x86_64.whl (799.0 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 799.0/799.0 MB 516.0 MB/s eta 0:00:00


Collecting torchvision==0.19.1
  Downloading https://download.pytorch.org/whl/cu121/torchvision-0.19.1%2Bcu121-cp311-cp311-linux_x86_64.whl (7.1 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 7.1/7.1 MB 237.6 MB/s eta 0:00:00


Collecting torchaudio==2.4.1
  Downloading https://download.pytorch.org/whl/cu121/torchaudio-2.4.1%2Bcu121-cp311-cp311-linux_x86_64.whl (3.4 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3.4/3.4 MB 352.9 MB/s eta 0:00:00


Collecting fsspec
  Downloading fsspec-2025.9.0-py3-none-any.whl (199 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 199.3/199.3 KB 7.5 MB/s eta 0:00:00


Collecting nvidia-cusparse-cu12==12.1.0.106
  Downloading nvidia_cusparse_cu12-12.1.0.106-py3-none-manylinux1_x86_64.whl (196.0 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 196.0/196.0 MB 227.4 MB/s eta 0:00:00


Collecting nvidia-cuda-runtime-cu12==12.1.105
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 823.6/823.6 KB 233.1 MB/s eta 0:00:00


Collecting nvidia-cusolver-cu12==11.4.5.107
  Downloading nvidia_cusolver_cu12-11.4.5.107-py3-none-manylinux1_x86_64.whl (124.2 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 124.2/124.2 MB 132.0 MB/s eta 0:00:00


Collecting nvidia-nvtx-cu12==12.1.105
  Downloading nvidia_nvtx_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (99 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 99.1/99.1 KB 407.0 MB/s eta 0:00:00


Collecting nvidia-cuda-cupti-cu12==12.1.105
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 14.1/14.1 MB 214.5 MB/s eta 0:00:00


Collecting sympy
  Downloading sympy-1.14.0-py3-none-any.whl (6.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 6.3/6.3 MB 155.1 MB/s eta 0:00:00


Collecting filelock
  Downloading filelock-3.19.1-py3-none-any.whl (15 kB)
Collecting nvidia-curand-cu12==10.3.2.106
  Downloading nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 56.5/56.5 MB 203.0 MB/s eta 0:00:00


Collecting jinja2
  Downloading jinja2-3.1.6-py3-none-any.whl (134 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 134.9/134.9 KB 501.8 MB/s eta 0:00:00
Collecting nvidia-cudnn-cu12==9.1.0.70
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl (664.8 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 664.8/664.8 MB 183.7 MB/s eta 0:00:00


Collecting nvidia-nccl-cu12==2.20.5
  Downloading nvidia_nccl_cu12-2.20.5-py3-none-manylinux2014_x86_64.whl (176.2 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 176.2/176.2 MB 92.2 MB/s eta 0:00:00


Collecting nvidia-cuda-nvrtc-cu12==12.1.105
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 23.7/23.7 MB 178.2 MB/s eta 0:00:00


Collecting nvidia-cufft-cu12==11.0.2.54
  Downloading nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 121.6/121.6 MB 229.1 MB/s eta 0:00:00


Collecting nvidia-cublas-cu12==12.1.3.1
  Downloading nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 410.6/410.6 MB 222.3 MB/s eta 0:00:00


Collecting triton==3.0.0
  Downloading triton-3.0.0-1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (209.4 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 209.4/209.4 MB 174.3 MB/s eta 0:00:00


Collecting typing-extensions>=4.8.0
  Downloading typing_extensions-4.15.0-py3-none-any.whl (44 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 44.6/44.6 KB 425.0 MB/s eta 0:00:00


Collecting networkx
  Downloading networkx-3.5-py3-none-any.whl (2.0 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.0/2.0 MB 563.6 MB/s eta 0:00:00


Collecting numpy
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 18.3/18.3 MB 87.5 MB/s eta 0:00:00


Collecting pillow!=8.3.*,>=5.3.0
  Downloading pillow-11.3.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (6.6 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 6.6/6.6 MB 279.9 MB/s eta 0:00:00


Collecting nvidia-nvjitlink-cu12
  Downloading nvidia_nvjitlink_cu12-12.9.86-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl (39.7 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 39.7/39.7 MB 233.6 MB/s eta 0:00:00


Collecting MarkupSafe>=2.0
  Downloading markupsafe-3.0.3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (22 kB)


Collecting mpmath<1.4,>=1.1.0
  Downloading mpmath-1.3.0-py3-none-any.whl (536 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 536.2/536.2 KB 475.2 MB/s eta 0:00:00


Installing collected packages: mpmath, typing-extensions, sympy, pillow, nvidia-nvtx-cu12, nvidia-nvjitlink-cu12, nvidia-nccl-cu12, nvidia-curand-cu12, nvidia-cufft-cu12, nvidia-cuda-runtime-cu12, nvidia-cuda-nvrtc-cu12, nvidia-cuda-cupti-cu12, nvidia-cublas-cu12, numpy, networkx, MarkupSafe, fsspec, filelock, triton, nvidia-cusparse-cu12, nvidia-cudnn-cu12, jinja2, nvidia-cusolver-cu12, torch, torchvision, torchaudio


Successfully installed MarkupSafe-3.0.3 filelock-3.19.1 fsspec-2025.9.0 jinja2-3.1.6 mpmath-1.3.0 networkx-3.5 numpy-1.26.4 nvidia-cublas-cu12-12.1.3.1 nvidia-cuda-cupti-cu12-12.1.105 nvidia-cuda-nvrtc-cu12-12.1.105 nvidia-cuda-runtime-cu12-12.1.105 nvidia-cudnn-cu12-9.1.0.70 nvidia-cufft-cu12-11.0.2.54 nvidia-curand-cu12-10.3.2.106 nvidia-cusolver-cu12-11.4.5.107 nvidia-cusparse-cu12-12.1.0.106 nvidia-nccl-cu12-2.20.5 nvidia-nvjitlink-cu12-12.9.86 nvidia-nvtx-cu12-12.1.105 pillow-11.3.0 sympy-1.14.0 torch-2.4.1+cu121 torchaudio-2.4.1+cu121 torchvision-0.19.1+cu121 triton-3.0.0 typing-extensions-4.15.0


torch: 2.4.1+cu121 built CUDA: 12.1


CUDA available: True


GPU: NVIDIA A10-24Q


== Repo listing ==


.00_eda_and_planning_kernel_state.json	0.00 MB
00_eda_and_planning.ipynb	0.01 MB
agent_metadata	0.00 MB
description.md	0.01 MB
docker_run.log	0.04 MB
kaggle_sample_submission.csv	0.34 MB
requirements.txt	0.00 MB
submission.csv	0.34 MB
task.txt	0.00 MB
test2019.json	7.86 MB
test2019.tar.gz	9501.32 MB
train2019.json	86.20 MB
train_val2019.tar.gz	68612.51 MB
val2019.json	0.84 MB


-- train -- images: 232999, annotations: 232999, categories: 1010


sample image: {'file_name': 'train_val2019/Plants/400/5a8f865ac7a3b5f7694e3116198c7564.jpg', 'height': 800, 'id': 36800, 'license': 3, 'rights_holder': 'kerriohara', 'width': 600}
sample ann: {'category_id': 400, 'id': 36800, 'image_id': 36800}
sample cat: {'class': 'KSMQKH', 'family': 'RPVDIT', 'genus': 'AIEVWT', 'id': 0, 'kingdom': 'SMHLVG', 'name': 'UYRGAX', 'order': 'GWFTGO', 'phylum': 'QVXHMU'}
-- val -- images: 3030, annotations: 3030, categories: 1010


sample image: {'license': 3, 'file_name': 'train_val2019/Plants/644/716a69838526f3ada3b2fe2e099cfcb6.jpg', 'rights_holder': 'Adrian Stewart', 'height': 618, 'width': 800, 'id': 265213}
sample ann: {'image_id': 265213, 'category_id': 644, 'id': 265213}
sample cat: {'kingdom': 'SMHLVG', 'phylum': 'QVXHMU', 'name': 'UYRGAX', 'family': 'RPVDIT', 'genus': 'AIEVWT', 'order': 'GWFTGO', 'id': 0, 'class': 'KSMQKH'}
-- test -- images: 32214, annotations: 0, categories: 0


sample image: {'file_name': 'test2019/2882396373c6e0f89f755fd5e0e810e5.jpg', 'height': 533, 'id': 177388, 'license': 3, 'rights_holder': 'Mike Hannisian', 'width': 800}
-- Peek train_val2019.tar.gz --


ReadError: not a gzip file

In [2]:
# Extract tar archives (auto-detect compression) and verify file paths
import tarfile, os, time
from pathlib import Path

def extract_tar(fp: str, dest: str = '.'):
    t0 = time.time()
    print(f'Extracting {fp} -> {dest}', flush=True)
    assert Path(fp).exists(), f'Missing archive: {fp}'
    assert tarfile.is_tarfile(fp), f'Not a tar archive: {fp}'
    with tarfile.open(fp, mode='r:*') as tf:
        members = tf.getmembers()
        print(f'Members: {len(members)}', flush=True)
        tf.extractall(path=dest)
    print(f'Done {fp} in {time.time()-t0:.1f}s', flush=True)

# Only extract if top-level dirs don't already exist
need_train_val = not Path('train_val2019').exists()
need_test = not Path('test2019').exists()
if need_train_val:
    extract_tar('train_val2019.tar.gz', '.')
else:
    print('train_val2019/ already exists, skip extraction')
if need_test:
    extract_tar('test2019.tar.gz', '.')
else:
    print('test2019/ already exists, skip extraction')

# Verify JSON file paths exist
def check_paths(js, name, n=10):
    miss = 0
    imgs = js.get('images', [])[:n]
    for im in imgs:
        fp = im['file_name']
        if not Path(fp).exists():
            print('MISSING:', fp)
            miss += 1
    print(f'{name}: checked {len(imgs)} paths, missing {miss}')

check_paths(train_js, 'train', 20)
check_paths(val_js, 'val', 20)
check_paths(test_js, 'test', 20)
print('Extraction & path verification complete.', flush=True)

Extracting train_val2019.tar.gz -> .


Members: 232999


Done train_val2019.tar.gz in 74.6s


Extracting test2019.tar.gz -> .


Members: 32214


Done test2019.tar.gz in 10.7s


MISSING: train_val2019/Plants/400/5a8f865ac7a3b5f7694e3116198c7564.jpg
MISSING: train_val2019/Plants/400/b29ce08f0f5e68cd489ee5e1f1469fcc.jpg
MISSING: train_val2019/Plants/400/545645ddeadacac64926b3bf012916b1.jpg
MISSING: train_val2019/Plants/400/cb06f47ac10823ee9c051d1027177561.jpg
MISSING: train_val2019/Plants/400/fedf0f512e9450c32f34f2f0a6788a92.jpg
MISSING: train_val2019/Plants/400/de45225566c43c17a5c02d1d26c992ec.jpg
MISSING: train_val2019/Plants/400/af7fdd89518238215b309bfe56e1b3f6.jpg
MISSING: train_val2019/Plants/400/d61ede7003f14fee1180385e8e3cb654.jpg
MISSING: train_val2019/Plants/400/6192a801c4bf7f03fa53db9665135c0a.jpg
MISSING: train_val2019/Plants/400/fa30268fd7f5dff1c97e0ada9896cb94.jpg
MISSING: train_val2019/Plants/400/07ee66a72874f551069395e4c20bda47.jpg
MISSING: train_val2019/Plants/400/2648d193906b70c822e373857c339616.jpg
MISSING: train_val2019/Plants/400/c411577f4f150e6cd771edf1a9c33a50.jpg
MISSING: train_val2019/Plants/400/f9a35f4afbd6b20d0587ee48c5b3aef7.jpg
MISSIN

In [3]:
# Fix paths: create symlinks so JSON file_name paths resolve
from pathlib import Path
import os

def ensure_symlink(link: str, target: str):
    lp = Path(link)
    tp = Path(target)
    if lp.exists() or lp.is_symlink():
        try:
            # if it's a wrong symlink, remove and recreate
            if lp.is_symlink() and os.readlink(lp) != str(tp):
                lp.unlink()
        except Exception:
            pass
    if not lp.exists():
        print(f'Creating symlink: {link} -> {target}', flush=True)
        lp.symlink_to(tp, target_is_directory=True)
    else:
        print(f'Symlink/dir already present: {link}', flush=True)

# The extracted archives placed category folders and test jpgs at repository root.
# JSON expects 'train_val2019/... and test2019/...'. Point both to '.' via symlinks.
if not Path('train_val2019').exists():
    ensure_symlink('train_val2019', '.')
else:
    print('train_val2019 exists')
if not Path('test2019').exists():
    ensure_symlink('test2019', '.')
else:
    print('test2019 exists')

# Re-check a handful of paths after symlink fix
check_paths(train_js, 'train', 20)
check_paths(val_js, 'val', 20)
check_paths(test_js, 'test', 20)
print('Symlink path fix complete.', flush=True)

Creating symlink: train_val2019 -> .


Creating symlink: test2019 -> .


train: checked 20 paths, missing 0
MISSING: train_val2019/Plants/644/716a69838526f3ada3b2fe2e099cfcb6.jpg
MISSING: train_val2019/Plants/597/0942cc64d2e759c5ee05970d8170942c.jpg
MISSING: train_val2019/Plants/883/acfdbfd9fa675f1c84558e3b9239db90.jpg
MISSING: train_val2019/Birds/300/5f3194ff536c7dd31d80b78ef809bc23.jpg
MISSING: train_val2019/Plants/881/76acaf0b2841f91982d2197cff825014.jpg
MISSING: train_val2019/Plants/771/5d190fd90da893988a3c9043b607fd24.jpg
MISSING: train_val2019/Plants/607/8fb0ddfe92dadbf9c575305387f4795d.jpg
MISSING: train_val2019/Plants/698/6d71521a64d1e2fe8bb34a94dee3d656.jpg
MISSING: train_val2019/Plants/714/09508f2bf937d21d63297f40b6abc731.jpg
MISSING: train_val2019/Birds/316/21c700bc90523485af67308cecdf4cd4.jpg
MISSING: train_val2019/Plants/726/847b2ebf8efc1a528c2d31ac9be2d6ed.jpg
MISSING: train_val2019/Insects/27/e5d141185a3a50b544d153d8be82c4b7.jpg
MISSING: train_val2019/Plants/640/cbe5ff1a159b614ae8677bd8bfefdfe2.jpg
MISSING: train_val2019/Plants/756/a99c33a5f9