# iNaturalist 2019 (FGVC6) – Plan

Objectives:
- Establish a fast, correct pipeline using official splits (train2019/val2019) and test2019.
- Start with a small baseline to validate data loading and submission format, then scale up.
- Use strong pretrained vision backbones (timm), AMP, cosine LR, label smoothing, balanced sampling, TTA, EMA.
- Train on train -> validate on val; final run on train+val -> predict test.

Milestones:
1) Env + GPU check; data inspection from JSONs; extract archives.
2) Minimal baseline: tf_efficientnet_b0@224 with light aug, single seed; sanity-check val acc and submission format.
3) Scale model/resolution: convnext_base@384 or tf_efficientnet_b4@380; stronger aug (RandAug/TrivialAug), cosine, warmup, EMA; class-balanced sampler.
4) TTA (e.g., flips, resize-crop variants). Save logits.
5) Train on train+val and generate test predictions; save submission.csv.
6) Error analysis on val: per-class, confidence bins; tweak aug/sampler if needed.

Validation & Tracking:
- Use official val2019.json as validation for baseline/tuning.
- Log per-epoch metrics and elapsed time; checkpoint best.
- Cache predictions/logits for quick re-use.

Ask Experts after plan and after baseline run for strategy validation and scaling advice.

In [2]:
import os, sys, json, tarfile, time, shutil, subprocess, math, random
from pathlib import Path
from collections import Counter, defaultdict

def run(cmd):
    print("$", " ".join(cmd), flush=True)
    try:
        out = subprocess.run(cmd, capture_output=True, text=True, check=False)
        print(out.stdout, flush=True)
        if out.stderr:
            print(out.stderr, file=sys.stderr, flush=True)
        return out.returncode
    except Exception as e:
        print("Command failed:", e, flush=True)
        return 1

print("Env check:")
run(['bash','-lc','nvidia-smi || true'])
print(f"Python: {sys.version}")
print("CWD:", os.getcwd())
print("Dir listing:")
for p in sorted(Path('.').glob('*')):
    try:
        sz = p.stat().st_size
    except Exception:
        sz = -1
    print(f" - {p.name} ({sz/1e6:.2f} MB)")

def load_json(fp):
    with open(fp,'r') as f:
        return json.load(f)

train_json = load_json('train2019.json')
val_json = load_json('val2019.json')
test_json = load_json('test2019.json')

# Categories mapping
categories = train_json.get('categories', [])
assert categories, 'No categories found in train2019.json'
cat_ids = [c['id'] for c in categories]
cat_ids_sorted = sorted(cat_ids)
catid2idx = {cid:i for i,cid in enumerate(cat_ids_sorted)}
idx2catid = {i:cid for cid,i in catid2idx.items()}
print(f"Classes: {len(categories)}; example cat ids: {cat_ids_sorted[:5]} -> idx {list(range(5))}")
Path('artifacts').mkdir(exist_ok=True)
Path('artifacts/catid2idx.json').write_text(json.dumps(catid2idx))
Path('artifacts/idx2catid.json').write_text(json.dumps(idx2catid))

# Image-id to filename maps
def build_img_map(j):
    imgs = j.get('images', [])
    return {img['id']: img['file_name'] for img in imgs}
train_img_map = build_img_map(train_json)
val_img_map = build_img_map(val_json)
test_img_map = build_img_map(test_json)
print(f"Train images: {len(train_img_map)}, Val images: {len(val_img_map)}, Test images: {len(test_img_map)}")

# Peek example file_name patterns
train_fns = list(train_img_map.values())[:3]
val_fns = list(val_img_map.values())[:3]
test_fns = list(test_img_map.values())[:3]
print("Sample train file_names:", train_fns)
print("Sample val file_names:", val_fns)
print("Sample test file_names:", test_fns)

# Class counts (train/val)
def class_counts(j):
    cnt = Counter()
    for ann in j.get('annotations', []):
        cnt[ann['category_id']] += 1
    return cnt
train_cls_cnt = class_counts(train_json)
val_cls_cnt = class_counts(val_json)
print(f"Train annotations: {sum(train_cls_cnt.values())}; unique classes in train: {len(train_cls_cnt)}")
print(f"Val annotations: {sum(val_cls_cnt.values())}; unique classes in val: {len(val_cls_cnt)}")
print("Top-5 frequent train classes:", train_cls_cnt.most_common(5))

# Verify mapping consistency on a few samples
sample_items = list(train_cls_cnt.items())[:5]
for cid,_ in sample_items:
    print(f"cat_id {cid} -> idx {catid2idx[cid]}")

# Extract archives using system tar (auto-detects compression).
# Marker placed next to archive to avoid dir assumptions.
def extract_with_tar(archive_path: str):
    marker = Path(archive_path + '.extracted_ok')
    if marker.exists():
        print(f"Already extracted (marker present): {archive_path}")
        return
    print(f"Extracting via tar: {archive_path}")
    t0 = time.time()
    # List a few entries first to validate archive
    rc = run(['bash','-lc', f"tar -tf {archive_path} | head -n 5"])
    if rc != 0:
        raise RuntimeError(f"Cannot list archive {archive_path}")
    # Extract
    rc = run(['bash','-lc', f"tar -xf {archive_path}"])
    if rc != 0:
        raise RuntimeError(f"Extraction failed for {archive_path}")
    marker.write_text('ok')
    print(f"Done in {(time.time()-t0)/60:.1f} min", flush=True)

extract_with_tar('train_val2019.tar.gz')
extract_with_tar('test2019.tar.gz')

# Sanity check a few paths exist (file_name is relative to CWD after extraction) 
def check_files(img_map, root='.'):
    root = Path(root)
    miss = 0
    keys = list(img_map.keys())
    if not keys:
        return 0
    for k in random.sample(keys, min(5, len(keys))):
        fn = img_map[k]
        fp = root / fn
        ok = fp.exists()
        print(f"Check: {fp} -> {ok}")
        if not ok: miss += 1
    return miss

print("Checking a few extracted files exist:")
miss_tv = check_files({**train_img_map, **val_img_map}, '.')
miss_t = check_files(test_img_map, '.')
print(f"Missing counts -> train+val: {miss_tv}, test: {miss_t}")

print("Setup complete. Next: install torch/timm and implement dataset + baseline model.")

Env check:
$ bash -lc nvidia-smi || true


Sun Sep 28 20:01:13 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.144.06             Driver Version: 550.144.06     CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A10-24Q                 On  |   00000002:00:00.0 Off |                    0 |
| N/A   N/A    P0             N/A /  N/A  |     182MiB /  24512MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

Python: 3.11.0rc1 (main, Aug 12 2022, 10:02:14) [GCC 11.2.0]
CWD: /var/lib/simon/agent_run_states/inaturalist-2019-fgvc6-20250928-191506
Dir listing:
 - .00_eda_and_planning_kernel_state.json (0.00 MB)
 - 00_eda_and_planning.ipynb (0.03 MB)
 - agent_metadata (0.00 MB)
 - artifacts (0.00 MB)
 - description.md (0.01 MB)
 - docker_run.log (0.07 MB)
 - kaggle_sample_submission.csv (0.34 MB)
 - requirements.txt (0.00 MB)
 - submission.csv (0.34 MB)
 - task.txt (0.00 MB)
 - test2019.json (7.86 MB)
 - test2019.tar.gz (9501.32 MB)
 - train2019.json (86.20 MB)
 - train_val (0.00 MB)
 - train_val2019.tar.gz (68612.51 MB)
 - val2019.json (0.84 MB)


Classes: 1010; example cat ids: [0, 1, 2, 3, 4] -> idx [0, 1, 2, 3, 4]
Train images: 232999, Val images: 3030, Test images: 32214
Sample train file_names: ['train_val2019/Plants/400/5a8f865ac7a3b5f7694e3116198c7564.jpg', 'train_val2019/Plants/400/b29ce08f0f5e68cd489ee5e1f1469fcc.jpg', 'train_val2019/Plants/400/545645ddeadacac64926b3bf012916b1.jpg']
Sample val file_names: ['train_val2019/Plants/644/716a69838526f3ada3b2fe2e099cfcb6.jpg', 'train_val2019/Plants/597/0942cc64d2e759c5ee05970d8170942c.jpg', 'train_val2019/Plants/883/acfdbfd9fa675f1c84558e3b9239db90.jpg']
Sample test file_names: ['test2019/2882396373c6e0f89f755fd5e0e810e5.jpg', 'test2019/437b7310fe7d060a8a09a50cd8758d66.jpg', 'test2019/baf96733bdc1eaf9f3ec6eaaea279eb0.jpg']
Train annotations: 232999; unique classes in train: 1010
Val annotations: 3030; unique classes in val: 1010
Top-5 frequent train classes: [(400, 440), (570, 440), (167, 440), (254, 440), (246, 440)]
cat_id 400 -> idx 400
cat_id 570 -> idx 570
cat_id 167 -> i

Insects/24/83162221c209fd4ec0b8a1bf54f9ab4b.jpg
Insects/24/fb04a7b17940580e68d250e30fbd3862.jpg
Insects/24/c0a092b7087f15115776ad6a4ea80a00.jpg
Insects/24/be7505ba1ba1b0081053bde9170938da.jpg
Insects/24/9222de3c41ac78c6224d438749f8f658.jpg



$ bash -lc tar -xf train_val2019.tar.gz





Done in 1.2 min


Extracting via tar: test2019.tar.gz
$ bash -lc tar -tf test2019.tar.gz | head -n 5


d61dd9d613f7f233584b9a27c503c539.jpg
3161dc4615fe65f1f3ef4aae9b1baacc.jpg
14eea3a6ed04d6c9acfa4a24e508d758.jpg
fa991414d054638b85568142f34968ae.jpg
7e2f14dea7faf00d5dc6cf47bdda0b8a.jpg



$ bash -lc tar -xf test2019.tar.gz





Done in 0.1 min


Checking a few extracted files exist:
Check: train_val2019/Birds/287/b0daafa14d9519bc79d59a40d8628bb2.jpg -> False
Check: train_val2019/Birds/311/5b94cd1749534674a8fc40600b73e880.jpg -> False
Check: train_val2019/Plants/607/e884c86c079c9a80c311f8f21ce08524.jpg -> False
Check: train_val2019/Reptiles/186/b960c9a1b30fe0bb12e92271b29587ea.jpg -> False
Check: train_val2019/Plants/869/2103c419e33bec7199cd01e36690f72b.jpg -> False
Check: test2019/dbd7a4216d5a6db8eb9ed124badf1369.jpg -> False
Check: test2019/6ff2837bf651fa51f6e3710f996d57b9.jpg -> False
Check: test2019/4f7ad6e8b05e1e87caf8072da47cf99f.jpg -> False
Check: test2019/66aa4a19f66392c8917d918f8960bfd2.jpg -> False
Check: test2019/f576b4ce1dc034301093ce6797706ebf.jpg -> False
Missing counts -> train+val: 5, test: 5
Setup complete. Next: install torch/timm and implement dataset + baseline model.


In [3]:
# Fix path prefixes by creating symlinks so JSON file_names resolve
import os
from pathlib import Path

def ensure_prefix_symlink(prefix: str):
    p = Path(prefix)
    if p.exists():
        print(f"Prefix already exists: {prefix}")
        return
    os.symlink('.', prefix)
    print(f"Created symlink: {prefix} -> .")

ensure_prefix_symlink('train_val2019')
ensure_prefix_symlink('test2019')

# Re-check a few paths now
def check_files_again(img_map, root='.'):
    import random
    root = Path(root)
    miss = 0
    keys = list(img_map.keys())
    for k in random.sample(keys, min(5, len(keys))):
        fn = img_map[k]
        fp = root / fn
        ok = fp.exists()
        print(f"Check: {fp} -> {ok}")
        if not ok: miss += 1
    return miss

print("Re-checking after symlinks:")
miss_tv2 = check_files_again({**train_img_map, **val_img_map}, '.')
miss_t2 = check_files_again(test_img_map, '.')
print(f"Missing counts (post-fix) -> train+val: {miss_tv2}, test: {miss_t2}")

Created symlink: train_val2019 -> .
Created symlink: test2019 -> .
Re-checking after symlinks:
Check: train_val2019/Plants/533/e846ebdb3d42e4e7ae34d71f67e18c03.jpg -> True
Check: train_val2019/Birds/261/6bc894a7a2961de674fcadbe4043e28f.jpg -> True
Check: train_val2019/Insects/36/adb367b3b7273dc38c63832a8885af4b.jpg -> True
Check: train_val2019/Plants/835/ebff56ab243284b6ee5e81746c923c42.jpg -> False
Check: train_val2019/Plants/673/b196fda7a666c0a4b1272acc81c42865.jpg -> True
Check: test2019/899d782727cfccfa3936619efe680d67.jpg -> True
Check: test2019/49a4845ff330e901495708b6e5c7a8d2.jpg -> True
Check: test2019/e97add79298145b96bec8eea536ae022.jpg -> True
Check: test2019/058ce0ef598f27e224a950ce56651111.jpg -> True
Check: test2019/707f2a2c659c1e3428b736ad657142ce.jpg -> True
Missing counts (post-fix) -> train+val: 1, test: 0


In [4]:
# Install CUDA 12.1 torch stack and core deps
import os, sys, subprocess, shutil
from pathlib import Path

def pip(*args):
    print('> pip', ' '.join(args), flush=True)
    subprocess.run([sys.executable, '-m', 'pip', *args], check=True)

# Uninstall any preexisting torch stack to avoid conflicts
for pkg in ('torch','torchvision','torchaudio'):
    try:
        subprocess.run([sys.executable, '-m', 'pip', 'uninstall', '-y', pkg], check=False)
    except Exception as e:
        print('uninstall error:', e)

# Clean stray site dirs that can shadow correct wheels (idempotent)
for d in (
    '/app/.pip-target/torch',
    '/app/.pip-target/torchvision',
    '/app/.pip-target/torchaudio',
    '/app/.pip-target/torch-2.8.0.dist-info',
    '/app/.pip-target/torch-2.4.1.dist-info',
    '/app/.pip-target/torchvision-0.23.0.dist-info',
    '/app/.pip-target/torchvision-0.19.1.dist-info',
    '/app/.pip-target/torchaudio-2.8.0.dist-info',
    '/app/.pip-target/torchaudio-2.4.1.dist-info',
    '/app/.pip-target/torchgen',
    '/app/.pip-target/functorch',
):
    if os.path.exists(d):
        print('Removing', d)
        shutil.rmtree(d, ignore_errors=True)

# Install EXACT cu121 torch stack
pip('install',
    '--index-url', 'https://download.pytorch.org/whl/cu121',
    '--extra-index-url', 'https://pypi.org/simple',
    'torch==2.4.1', 'torchvision==0.19.1', 'torchaudio==2.4.1')

# Freeze versions for later installs
Path('constraints.txt').write_text(
    'torch==2.4.1\n'
    'torchvision==0.19.1\n'
    'torchaudio==2.4.1\n'
)

# Install non-torch deps while honoring constraints
pip('install', '-c', 'constraints.txt',
    'timm==1.0.9',
    'albumentations==1.4.14',
    'opencv-python-headless==4.10.0.84',
    'scikit-learn==1.5.2',
    'numpy', 'pandas',
    '--upgrade-strategy', 'only-if-needed')

# Sanity check GPU
import torch
print('torch:', torch.__version__, 'CUDA build:', getattr(torch.version, 'cuda', None), flush=True)
print('CUDA available:', torch.cuda.is_available(), flush=True)
if torch.cuda.is_available():
    print('GPU:', torch.cuda.get_device_name(0), flush=True)
assert str(getattr(torch.version,'cuda','')).startswith('12.1'), f'Wrong CUDA build: {torch.version.cuda}'
assert torch.cuda.is_available(), 'CUDA not available'





> pip install --index-url https://download.pytorch.org/whl/cu121 --extra-index-url https://pypi.org/simple torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1




Looking in indexes: https://download.pytorch.org/whl/cu121, https://pypi.org/simple


Collecting torch==2.4.1
  Downloading https://download.pytorch.org/whl/cu121/torch-2.4.1%2Bcu121-cp311-cp311-linux_x86_64.whl (799.0 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 799.0/799.0 MB 567.9 MB/s eta 0:00:00


Collecting torchvision==0.19.1
  Downloading https://download.pytorch.org/whl/cu121/torchvision-0.19.1%2Bcu121-cp311-cp311-linux_x86_64.whl (7.1 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 7.1/7.1 MB 468.2 MB/s eta 0:00:00


Collecting torchaudio==2.4.1
  Downloading https://download.pytorch.org/whl/cu121/torchaudio-2.4.1%2Bcu121-cp311-cp311-linux_x86_64.whl (3.4 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3.4/3.4 MB 539.9 MB/s eta 0:00:00


Collecting filelock
  Downloading filelock-3.19.1-py3-none-any.whl (15 kB)


Collecting nvidia-nvtx-cu12==12.1.105
  Downloading nvidia_nvtx_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (99 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 99.1/99.1 KB 6.9 MB/s eta 0:00:00


Collecting triton==3.0.0
  Downloading triton-3.0.0-1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (209.4 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 209.4/209.4 MB 236.3 MB/s eta 0:00:00


Collecting nvidia-nccl-cu12==2.20.5
  Downloading nvidia_nccl_cu12-2.20.5-py3-none-manylinux2014_x86_64.whl (176.2 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 176.2/176.2 MB 245.7 MB/s eta 0:00:00


Collecting sympy
  Downloading sympy-1.14.0-py3-none-any.whl (6.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 6.3/6.3 MB 177.8 MB/s eta 0:00:00


Collecting nvidia-cusparse-cu12==12.1.0.106
  Downloading nvidia_cusparse_cu12-12.1.0.106-py3-none-manylinux1_x86_64.whl (196.0 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 196.0/196.0 MB 206.5 MB/s eta 0:00:00


Collecting nvidia-cufft-cu12==11.0.2.54
  Downloading nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 121.6/121.6 MB 177.1 MB/s eta 0:00:00


Collecting nvidia-cusolver-cu12==11.4.5.107
  Downloading nvidia_cusolver_cu12-11.4.5.107-py3-none-manylinux1_x86_64.whl (124.2 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 124.2/124.2 MB 177.4 MB/s eta 0:00:00


Collecting jinja2
  Downloading jinja2-3.1.6-py3-none-any.whl (134 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 134.9/134.9 KB 518.1 MB/s eta 0:00:00


Collecting nvidia-cuda-runtime-cu12==12.1.105
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 823.6/823.6 KB 181.5 MB/s eta 0:00:00


Collecting nvidia-cuda-nvrtc-cu12==12.1.105
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 23.7/23.7 MB 152.5 MB/s eta 0:00:00


Collecting nvidia-cudnn-cu12==9.1.0.70
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl (664.8 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 664.8/664.8 MB 131.5 MB/s eta 0:00:00


Collecting fsspec
  Downloading fsspec-2025.9.0-py3-none-any.whl (199 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 199.3/199.3 KB 300.0 MB/s eta 0:00:00


Collecting nvidia-cuda-cupti-cu12==12.1.105
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 14.1/14.1 MB 136.5 MB/s eta 0:00:00


Collecting nvidia-curand-cu12==10.3.2.106
  Downloading nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 56.5/56.5 MB 83.9 MB/s eta 0:00:00


Collecting typing-extensions>=4.8.0
  Downloading typing_extensions-4.15.0-py3-none-any.whl (44 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 44.6/44.6 KB 420.4 MB/s eta 0:00:00


Collecting nvidia-cublas-cu12==12.1.3.1
  Downloading nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 410.6/410.6 MB 171.3 MB/s eta 0:00:00


Collecting networkx
  Downloading networkx-3.5-py3-none-any.whl (2.0 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.0/2.0 MB 527.5 MB/s eta 0:00:00


Collecting pillow!=8.3.*,>=5.3.0
  Downloading pillow-11.3.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (6.6 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 6.6/6.6 MB 273.2 MB/s eta 0:00:00


Collecting numpy
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 18.3/18.3 MB 119.9 MB/s eta 0:00:00


Collecting nvidia-nvjitlink-cu12
  Downloading nvidia_nvjitlink_cu12-12.9.86-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl (39.7 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 39.7/39.7 MB 141.6 MB/s eta 0:00:00


Collecting MarkupSafe>=2.0
  Downloading markupsafe-3.0.3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (22 kB)


Collecting mpmath<1.4,>=1.1.0
  Downloading mpmath-1.3.0-py3-none-any.whl (536 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 536.2/536.2 KB 574.0 MB/s eta 0:00:00


Installing collected packages: mpmath, typing-extensions, sympy, pillow, nvidia-nvtx-cu12, nvidia-nvjitlink-cu12, nvidia-nccl-cu12, nvidia-curand-cu12, nvidia-cufft-cu12, nvidia-cuda-runtime-cu12, nvidia-cuda-nvrtc-cu12, nvidia-cuda-cupti-cu12, nvidia-cublas-cu12, numpy, networkx, MarkupSafe, fsspec, filelock, triton, nvidia-cusparse-cu12, nvidia-cudnn-cu12, jinja2, nvidia-cusolver-cu12, torch, torchvision, torchaudio


Successfully installed MarkupSafe-3.0.3 filelock-3.19.1 fsspec-2025.9.0 jinja2-3.1.6 mpmath-1.3.0 networkx-3.5 numpy-1.26.4 nvidia-cublas-cu12-12.1.3.1 nvidia-cuda-cupti-cu12-12.1.105 nvidia-cuda-nvrtc-cu12-12.1.105 nvidia-cuda-runtime-cu12-12.1.105 nvidia-cudnn-cu12-9.1.0.70 nvidia-cufft-cu12-11.0.2.54 nvidia-curand-cu12-10.3.2.106 nvidia-cusolver-cu12-11.4.5.107 nvidia-cusparse-cu12-12.1.0.106 nvidia-nccl-cu12-2.20.5 nvidia-nvjitlink-cu12-12.9.86 nvidia-nvtx-cu12-12.1.105 pillow-11.3.0 sympy-1.14.0 torch-2.4.1+cu121 torchaudio-2.4.1+cu121 torchvision-0.19.1+cu121 triton-3.0.0 typing-extensions-4.15.0


> pip install -c constraints.txt timm==1.0.9 albumentations==1.4.14 opencv-python-headless==4.10.0.84 scikit-learn==1.5.2 numpy pandas --upgrade-strategy only-if-needed


Collecting timm==1.0.9
  Downloading timm-1.0.9-py3-none-any.whl (2.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.3/2.3 MB 66.0 MB/s eta 0:00:00
Collecting albumentations==1.4.14
  Downloading albumentations-1.4.14-py3-none-any.whl (177 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 178.0/178.0 KB 494.4 MB/s eta 0:00:00


Collecting opencv-python-headless==4.10.0.84
  Downloading opencv_python_headless-4.10.0.84-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (49.9 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 49.9/49.9 MB 114.5 MB/s eta 0:00:00
Collecting scikit-learn==1.5.2


  Downloading scikit_learn-1.5.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 13.3/13.3 MB 125.7 MB/s eta 0:00:00


Collecting numpy
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 18.3/18.3 MB 219.8 MB/s eta 0:00:00


Collecting pandas
  Downloading pandas-2.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.4 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.4/12.4 MB 277.5 MB/s eta 0:00:00
Collecting huggingface_hub
  Downloading huggingface_hub-0.35.1-py3-none-any.whl (563 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 563.3/563.3 KB 295.6 MB/s eta 0:00:00


Collecting safetensors
  Downloading safetensors-0.6.2-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (485 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 485.8/485.8 KB 319.1 MB/s eta 0:00:00
Collecting torch
  Downloading torch-2.4.1-cp311-cp311-manylinux1_x86_64.whl (797.1 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 797.1/797.1 MB 217.1 MB/s eta 0:00:00


Collecting pyyaml
  Downloading pyyaml-6.0.3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (806 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 806.6/806.6 KB 456.9 MB/s eta 0:00:00
Collecting torchvision
  Downloading torchvision-0.19.1-cp311-cp311-manylinux1_x86_64.whl (7.0 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 7.0/7.0 MB 366.9 MB/s eta 0:00:00


Collecting scipy>=1.10.0
  Downloading scipy-1.16.2-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (35.9 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 35.9/35.9 MB 207.0 MB/s eta 0:00:00
Collecting eval-type-backport
  Downloading eval_type_backport-0.2.2-py3-none-any.whl (5.8 kB)
Collecting albucore>=0.0.13
  Downloading albucore-0.0.33-py3-none-any.whl (18 kB)


Collecting pydantic>=2.7.0
  Downloading pydantic-2.11.9-py3-none-any.whl (444 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 444.9/444.9 KB 548.4 MB/s eta 0:00:00
Collecting typing-extensions>=4.9.0
  Downloading typing_extensions-4.15.0-py3-none-any.whl (44 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 44.6/44.6 KB 427.3 MB/s eta 0:00:00
Collecting scikit-image>=0.21.0
  Downloading scikit_image-0.25.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (14.8 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 14.8/14.8 MB 232.6 MB/s eta 0:00:00
Collecting threadpoolctl>=3.1.0
  Downloading threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Collecting joblib>=1.2.0
  Downloading joblib-1.5.2-py3-none-any.whl (308 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 308.4/308.4 KB 423.2 MB/s eta 0:00:00


Collecting pytz>=2020.1
  Downloading pytz-2025.2-py2.py3-none-any.whl (509 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 509.2/509.2 KB 533.7 MB/s eta 0:00:00
Collecting tzdata>=2022.7
  Downloading tzdata-2025.2-py2.py3-none-any.whl (347 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 347.8/347.8 KB 500.2 MB/s eta 0:00:00
Collecting python-dateutil>=2.8.2
  Downloading python_dateutil-2.9.0.post0-py2.py3-none-any.whl (229 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 229.9/229.9 KB 537.2 MB/s eta 0:00:00


Collecting stringzilla>=3.10.4
  Downloading stringzilla-4.0.14-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl (496 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 496.5/496.5 KB 328.7 MB/s eta 0:00:00


Collecting simsimd>=5.9.2
  Downloading simsimd-6.5.3-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (1.1 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.1/1.1 MB 279.9 MB/s eta 0:00:00
Collecting typing-inspection>=0.4.0
  Downloading typing_inspection-0.4.1-py3-none-any.whl (14 kB)
Collecting annotated-types>=0.6.0
  Downloading annotated_types-0.7.0-py3-none-any.whl (13 kB)


Collecting pydantic-core==2.33.2
  Downloading pydantic_core-2.33.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.0 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.0/2.0 MB 428.5 MB/s eta 0:00:00
Collecting six>=1.5
  Downloading six-1.17.0-py2.py3-none-any.whl (11 kB)
Collecting imageio!=2.35.0,>=2.33
  Downloading imageio-2.37.0-py3-none-any.whl (315 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 315.8/315.8 KB 458.8 MB/s eta 0:00:00
Collecting networkx>=3.0
  Downloading networkx-3.5-py3-none-any.whl (2.0 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.0/2.0 MB 328.4 MB/s eta 0:00:00


Collecting packaging>=21
  Downloading packaging-25.0-py3-none-any.whl (66 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 66.5/66.5 KB 406.1 MB/s eta 0:00:00


Collecting pillow>=10.1
  Downloading pillow-11.3.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (6.6 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 6.6/6.6 MB 357.5 MB/s eta 0:00:00
Collecting tifffile>=2022.8.12
  Downloading tifffile-2025.9.20-py3-none-any.whl (230 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 230.1/230.1 KB 516.2 MB/s eta 0:00:00
Collecting lazy-loader>=0.4
  Downloading lazy_loader-0.4-py3-none-any.whl (12 kB)


Collecting fsspec>=2023.5.0
  Downloading fsspec-2025.9.0-py3-none-any.whl (199 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 199.3/199.3 KB 547.1 MB/s eta 0:00:00
Collecting requests
  Downloading requests-2.32.5-py3-none-any.whl (64 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 64.7/64.7 KB 347.0 MB/s eta 0:00:00
Collecting tqdm>=4.42.1
  Downloading tqdm-4.67.1-py3-none-any.whl (78 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 78.5/78.5 KB 382.1 MB/s eta 0:00:00
Collecting filelock
  Downloading filelock-3.19.1-py3-none-any.whl (15 kB)


Collecting hf-xet<2.0.0,>=1.1.3
  Downloading hf_xet-1.1.10-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.2 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3.2/3.2 MB 363.4 MB/s eta 0:00:00
Collecting nvidia-nvtx-cu12==12.1.105
  Downloading nvidia_nvtx_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (99 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 99.1/99.1 KB 498.4 MB/s eta 0:00:00
Collecting nvidia-curand-cu12==10.3.2.106
  Downloading nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 56.5/56.5 MB 225.2 MB/s eta 0:00:00
Collecting nvidia-cuda-nvrtc-cu12==12.1.105
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 23.7/23.7 MB 257.0 MB/s eta 0:00:00


Collecting nvidia-cusolver-cu12==11.4.5.107
  Downloading nvidia_cusolver_cu12-11.4.5.107-py3-none-manylinux1_x86_64.whl (124.2 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 124.2/124.2 MB 88.7 MB/s eta 0:00:00
Collecting nvidia-cuda-runtime-cu12==12.1.105
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 823.6/823.6 KB 401.4 MB/s eta 0:00:00
Collecting triton==3.0.0
  Downloading triton-3.0.0-1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (209.4 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 209.4/209.4 MB 31.7 MB/s eta 0:00:00
Collecting nvidia-cublas-cu12==12.1.3.1
  Downloading nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 410.6/410.6 MB 232.6 MB/s eta 0:00:00


Collecting jinja2
  Downloading jinja2-3.1.6-py3-none-any.whl (134 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 134.9/134.9 KB 453.5 MB/s eta 0:00:00
Collecting nvidia-cufft-cu12==11.0.2.54
  Downloading nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 121.6/121.6 MB 292.6 MB/s eta 0:00:00
Collecting nvidia-nccl-cu12==2.20.5
  Downloading nvidia_nccl_cu12-2.20.5-py3-none-manylinux2014_x86_64.whl (176.2 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 176.2/176.2 MB 281.3 MB/s eta 0:00:00
Collecting nvidia-cudnn-cu12==9.1.0.70
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl (664.8 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 664.8/664.8 MB 303.2 MB/s eta 0:00:00


Collecting nvidia-cuda-cupti-cu12==12.1.105
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 14.1/14.1 MB 181.4 MB/s eta 0:00:00
Collecting sympy
  Downloading sympy-1.14.0-py3-none-any.whl (6.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 6.3/6.3 MB 241.5 MB/s eta 0:00:00
Collecting nvidia-cusparse-cu12==12.1.0.106
  Downloading nvidia_cusparse_cu12-12.1.0.106-py3-none-manylinux1_x86_64.whl (196.0 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 196.0/196.0 MB 261.7 MB/s eta 0:00:00
Collecting nvidia-nvjitlink-cu12
  Downloading nvidia_nvjitlink_cu12-12.9.86-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl (39.7 MB)


     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 39.7/39.7 MB 80.5 MB/s eta 0:00:00


Collecting MarkupSafe>=2.0
  Downloading markupsafe-3.0.3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (22 kB)
Collecting charset_normalizer<4,>=2
  Downloading charset_normalizer-3.4.3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (150 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 150.3/150.3 KB 524.4 MB/s eta 0:00:00


Collecting urllib3<3,>=1.21.1
  Downloading urllib3-2.5.0-py3-none-any.whl (129 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 129.8/129.8 KB 491.4 MB/s eta 0:00:00
Collecting certifi>=2017.4.17
  Downloading certifi-2025.8.3-py3-none-any.whl (161 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 161.2/161.2 KB 498.3 MB/s eta 0:00:00
Collecting idna<4,>=2.5
  Downloading idna-3.10-py3-none-any.whl (70 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 70.4/70.4 KB 459.3 MB/s eta 0:00:00
Collecting mpmath<1.4,>=1.1.0
  Downloading mpmath-1.3.0-py3-none-any.whl (536 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 536.2/536.2 KB 548.7 MB/s eta 0:00:00


Installing collected packages: simsimd, pytz, mpmath, urllib3, tzdata, typing-extensions, tqdm, threadpoolctl, sympy, stringzilla, six, safetensors, pyyaml, pillow, packaging, nvidia-nvtx-cu12, nvidia-nvjitlink-cu12, nvidia-nccl-cu12, nvidia-curand-cu12, nvidia-cufft-cu12, nvidia-cuda-runtime-cu12, nvidia-cuda-nvrtc-cu12, nvidia-cuda-cupti-cu12, nvidia-cublas-cu12, numpy, networkx, MarkupSafe, joblib, idna, hf-xet, fsspec, filelock, eval-type-backport, charset_normalizer, certifi, annotated-types, typing-inspection, triton, tifffile, scipy, requests, python-dateutil, pydantic-core, opencv-python-headless, nvidia-cusparse-cu12, nvidia-cudnn-cu12, lazy-loader, jinja2, imageio, scikit-learn, scikit-image, pydantic, pandas, nvidia-cusolver-cu12, huggingface_hub, albucore, torch, albumentations, torchvision, timm


Successfully installed MarkupSafe-3.0.3 albucore-0.0.33 albumentations-1.4.14 annotated-types-0.7.0 certifi-2025.8.3 charset_normalizer-3.4.3 eval-type-backport-0.2.2 filelock-3.19.1 fsspec-2025.9.0 hf-xet-1.1.10 huggingface_hub-0.35.1 idna-3.10 imageio-2.37.0 jinja2-3.1.6 joblib-1.5.2 lazy-loader-0.4 mpmath-1.3.0 networkx-3.5 numpy-1.26.4 nvidia-cublas-cu12-12.1.3.1 nvidia-cuda-cupti-cu12-12.1.105 nvidia-cuda-nvrtc-cu12-12.1.105 nvidia-cuda-runtime-cu12-12.1.105 nvidia-cudnn-cu12-9.1.0.70 nvidia-cufft-cu12-11.0.2.54 nvidia-curand-cu12-10.3.2.106 nvidia-cusolver-cu12-11.4.5.107 nvidia-cusparse-cu12-12.1.0.106 nvidia-nccl-cu12-2.20.5 nvidia-nvjitlink-cu12-12.9.86 nvidia-nvtx-cu12-12.1.105 opencv-python-headless-4.10.0.84 packaging-25.0 pandas-2.3.2 pillow-11.3.0 pydantic-2.11.9 pydantic-core-2.33.2 python-dateutil-2.9.0.post0 pytz-2025.2 pyyaml-6.0.3 requests-2.32.5 safetensors-0.6.2 scikit-image-0.25.2 scikit-learn-1.5.2 scipy-1.16.2 simsimd-6.5.3 six-1.17.0 stringzilla-4.0.14 sympy-1.







torch: 2.4.1+cu121 CUDA build: 12.1


CUDA available: True


GPU: NVIDIA A10-24Q


In [7]:
# Dataset/Dataloader smoke test with a tiny forward pass (torchvision transforms)
import os, json, math, random, time
from pathlib import Path
from PIL import Image
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as T
import timm

with open('artifacts/catid2idx.json','r') as f:
    catid2idx = json.load(f)
idx2catid = {int(v): int(k) for k,v in {k:v for k,v in catid2idx.items()}.items()}

def build_records(j):
    anns_by_img = {}
    for ann in j['annotations']:
        anns_by_img[ann['image_id']] = ann['category_id']
    recs = []
    dropped = 0
    for img in j['images']:
        img_id = img['id']
        fn = img['file_name']
        if img_id in anns_by_img:
            cid = anns_by_img[img_id]
            # map cat id -> idx
            y = catid2idx[str(cid)] if isinstance(catid2idx, dict) and str(cid) in catid2idx else catid2idx[cid]
            # ensure file exists; handle prefix/symlink oddities
            fp = Path(fn)
            if not fp.exists():
                alt = Path(fp.name)
                if alt.exists():
                    fn = str(alt)
                else:
                    dropped += 1
                    continue
            recs.append((img_id, fn, y))
    if dropped:
        print(f"Dropped {dropped} missing files from {len(j['images'])} images.")
    return recs

train_recs = build_records(train_json)
val_recs = build_records(val_json)
print(f"Records -> train: {len(train_recs)}, val: {len(val_recs)}")

IM_SIZE = 224  # sanity run size
IM_MEAN = (0.485, 0.456, 0.406)
IM_STD  = (0.229, 0.224, 0.225)

train_tfms = T.Compose([
    T.RandomResizedCrop(IM_SIZE, scale=(0.4, 1.0), ratio=(0.75, 1.33)),
    T.RandomHorizontalFlip(p=0.5),
    T.ColorJitter(0.2,0.2,0.2,0.1),
    T.ToTensor(),
    T.Normalize(mean=IM_MEAN, std=IM_STD),
])
val_tfms = T.Compose([
    T.Resize(IM_SIZE, interpolation=T.InterpolationMode.BICUBIC),
    T.CenterCrop(IM_SIZE),
    T.ToTensor(),
    T.Normalize(mean=IM_MEAN, std=IM_STD),
])

class INatDataset(Dataset):
    def __init__(self, records, transforms=None):
        self.records = records
        self.transforms = transforms
    def __len__(self):
        return len(self.records)
    def __getitem__(self, idx):
        img_id, fn, y = self.records[idx]
        fp = Path(fn)
        img = Image.open(fp).convert('RGB')
        if self.transforms:
            img = self.transforms(img)
        return img, int(y), int(img_id)

def make_loader(records, transforms, batch_size, shuffle, workers=8):
    ds = INatDataset(records, transforms)
    dl = DataLoader(ds, batch_size=batch_size, shuffle=shuffle, num_workers=workers, pin_memory=True, drop_last=False, persistent_workers=(workers>0))
    return ds, dl

# Subsample for smoke test
random.seed(42)
subset_train = random.sample(train_recs, min(128, len(train_recs)))
subset_val = random.sample(val_recs, min(128, len(val_recs)))

train_ds, train_dl = make_loader(subset_train, train_tfms, batch_size=16, shuffle=True, workers=8)
val_ds, val_dl = make_loader(subset_val, val_tfms, batch_size=16, shuffle=False, workers=8)
print("Dataloaders ready. Batches (train/val):", len(train_dl), len(val_dl))

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = timm.create_model('resnet18', pretrained=True, num_classes=len(catid2idx))
model.to(device)
model.eval()
criterion = torch.nn.CrossEntropyLoss()

@torch.inference_mode()
def smoke_pass(dl):
    t0 = time.time()
    for i, (x,y,ids) in enumerate(dl):
        x = x.to(device, non_blocking=True)
        y = y.to(device, non_blocking=True)
        with torch.autocast(device_type='cuda', dtype=torch.float16, enabled=device.type=='cuda'):
            logits = model(x)
            loss = criterion(logits, y)
        print(f"batch {i}: x={tuple(x.shape)}, logits={tuple(logits.shape)}, loss={loss.item():.4f}")
        break
    print(f"Smoke pass done in {time.time()-t0:.2f}s")

smoke_pass(train_dl)
smoke_pass(val_dl)
print("Smoke test complete. Next: implement full training loop (convnext_base@384) with class-balanced sampler.")

Dropped 3030 missing files from 3030 images.
Records -> train: 232999, val: 0
Dataloaders ready. Batches (train/val): 8 0


batch 0: x=(16, 3, 224, 224), logits=(16, 1010), loss=7.0293
Smoke pass done in 0.27s


Smoke pass done in 0.09s
Smoke test complete. Next: implement full training loop (convnext_base@384) with class-balanced sampler.


In [12]:
# Train convnext_base@384 with stratified holdout (official val imgs unavailable on disk)
import math, time, random, json
from collections import defaultdict, Counter
from pathlib import Path
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset, WeightedRandomSampler
import torchvision.transforms as T
import timm
from timm.data.mixup import Mixup
from timm.loss import SoftTargetCrossEntropy
from timm.scheduler.cosine_lr import CosineLRScheduler
from timm.utils import ModelEmaV2

torch.backends.cudnn.benchmark = True
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
try:
    torch.set_float32_matmul_precision('medium')
except Exception:
    pass
random.seed(42); np.random.seed(42); torch.manual_seed(42)

# Reuse simple dataset that reads PIL and applies torchvision transforms
class INatDatasetSimple(Dataset):
    def __init__(self, records, transforms=None):
        self.records = records
        self.transforms = transforms
    def __len__(self):
        return len(self.records)
    def __getitem__(self, idx):
        img_id, fn, y = self.records[idx]
        from PIL import Image
        img = Image.open(fn).convert('RGB')
        if self.transforms: img = self.transforms(img)
        return img, int(y), int(img_id)

# Build 5% stratified holdout (min 3 per class)
labels_to_items = defaultdict(list)
for r in train_recs:
    labels_to_items[r[2]].append(r)
val_frac = 0.05
train_split, valid_split = [], []
random.seed(42)
for y, items in labels_to_items.items():
    if not items: continue
    random.shuffle(items)
    k = max(3, int(len(items)*val_frac))
    valid_split.extend(items[:k])
    train_split.extend(items[k:])
print(f"Holdout sizes -> train: {len(train_split)}, valid: {len(valid_split)} (val_frac={val_frac})")

IM_SIZE = 384
IM_MEAN = (0.485, 0.456, 0.406)
IM_STD  = (0.229, 0.224, 0.225)
train_tfms = T.Compose([
    T.RandomResizedCrop(IM_SIZE, scale=(0.4,1.0), ratio=(0.75,1.33)),
    T.RandomHorizontalFlip(p=0.5),
    T.TrivialAugmentWide(num_magnitude_bins=31),
    T.ToTensor(),
    T.RandomErasing(p=0.25, value='random'),
    T.Normalize(IM_MEAN, IM_STD),
])
val_tfms = T.Compose([
    T.Resize(int(IM_SIZE/0.875), interpolation=T.InterpolationMode.BICUBIC),
    T.CenterCrop(IM_SIZE),
    T.ToTensor(),
    T.Normalize(IM_MEAN, IM_STD),
])

# Datasets
train_ds = INatDatasetSimple(train_split, train_tfms)
valid_ds = INatDatasetSimple(valid_split, val_tfms)

# Class-balanced sampler ~ 1/sqrt(freq)
cls_counts = Counter([y for _,_,y in train_split])
weights = [1.0/np.sqrt(cls_counts[y]) for _,_,y in train_split]
sampler = WeightedRandomSampler(weights, num_samples=len(train_split), replacement=True)

BATCH_SIZE = 48  # adjust if OOM; fallback 32
train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE, sampler=sampler, shuffle=False, num_workers=12, pin_memory=True, persistent_workers=True, prefetch_factor=6)
valid_dl = DataLoader(valid_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=12, pin_memory=True, persistent_workers=True, prefetch_factor=6)
print("DL ready:", len(train_dl), len(valid_dl))

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
num_classes = len(catid2idx)
# Prefer IN-21k finetuned head
model = timm.create_model('convnext_base.fb_in22k_ft_in1k', pretrained=True, num_classes=num_classes)
model.to(device)
model = model.to(memory_format=torch.channels_last)

# Mixup/CutMix + SoftTarget loss
mixup_fn = Mixup(mixup_alpha=0.3, cutmix_alpha=0.5, prob=1.0, switch_prob=0.5, label_smoothing=0.0, num_classes=num_classes)
criterion_train = SoftTargetCrossEntropy()
criterion_eval = nn.CrossEntropyLoss()

fused_ok = (device.type=='cuda')
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=0.05, betas=(0.9,0.999), fused=fused_ok)
EPOCHS = 15  # trimmed to fit within time budget
steps_per_epoch = max(1, len(train_dl))
total_updates = EPOCHS * steps_per_epoch
warmup_t = 1000  # updates-based warmup
sched = CosineLRScheduler(
    optimizer,
    t_initial=total_updates,
    lr_min=1e-6,
    warmup_t=warmup_t,
    warmup_lr_init=1e-6,
    k_decay=1.0,
    t_in_epochs=False
)

scaler = torch.amp.GradScaler('cuda', enabled=(device.type=='cuda'))
ema = ModelEmaV2(model, decay=0.9998, device=device if device.type=='cuda' else None)

def evaluate(use_ema=True):
    m = ema.module if use_ema and ema is not None else model
    m.eval()
    correct = 0; total = 0; loss_sum = 0.0
    t0 = time.time()
    with torch.no_grad():
        for bx, (x,y,ids) in enumerate(valid_dl):
            x = x.to(device, non_blocking=True).to(memory_format=torch.channels_last)
            y = y.to(device, non_blocking=True)
            with torch.autocast(device_type='cuda', dtype=torch.float16, enabled=device.type=='cuda'):
                logits = m(x)
                loss = criterion_eval(logits, y)
            loss_sum += loss.item() * y.size(0)
            pred = logits.argmax(dim=1)
            correct += (pred==y).sum().item()
            total += y.size(0)
            if (bx+1)%50==0:
                print(f"  Eval batch {bx+1}/{len(valid_dl)}", flush=True)
    acc = correct/max(1,total)
    print(f"Eval done in {time.time()-t0:.1f}s; acc={acc:.4f}; loss={loss_sum/max(1,total):.4f}")
    return acc, loss_sum/max(1,total)

best_acc = -1.0
t_start = time.time()
num_updates = 0
for ep in range(EPOCHS):
    # Taper mixup late
    if ep >= EPOCHS - 2:
        mixup_fn.prob = 0.5
    model.train()
    t0 = time.time()
    running = 0.0; n_seen = 0
    for bi, (x,y,ids) in enumerate(train_dl):
        x = x.to(device, non_blocking=True).to(memory_format=torch.channels_last)
        y = y.to(device, non_blocking=True)
        # apply mixup
        x, y_soft = mixup_fn(x, y)
        optimizer.zero_grad(set_to_none=True)
        with torch.autocast(device_type='cuda', dtype=torch.float16, enabled=device.type=='cuda'):
            logits = model(x)
            loss = criterion_train(logits, y_soft)
        scaler.scale(loss).backward()
        # unscale and clip before stepping
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        scaler.step(optimizer)
        scaler.update()
        num_updates += 1
        sched.step_update(num_updates)
        if num_updates > warmup_t:
            ema.update(model)
        running += loss.item()*x.size(0); n_seen += x.size(0)
        if (bi+1)%100==0:
            cur_lr = optimizer.param_groups[0]['lr']
            print(f"ep {ep+1}/{EPOCHS} batch {bi+1}/{len(train_dl)} lr={cur_lr:.2e} loss={running/max(1,n_seen):.4f} elapsed={time.time()-t0:.1f}s", flush=True)
    print(f"Epoch {ep+1} train_loss={running/max(1,n_seen):.4f} epoch_time={time.time()-t0:.1f}s total_elapsed={(time.time()-t_start)/60:.1f}m")
    acc, vloss = evaluate(use_ema=True)
    if acc>best_acc:
        best_acc = acc
        torch.save({'model': ema.module.state_dict(), 'acc': acc}, 'best_convnext_baseline.pt')
        print(f"Saved new best (EMA) acc={acc:.4f}")

print("Training finished. Best holdout acc:", best_acc)
print("Next: add TTA inference on test and write submission.")

Holdout sizes -> train: 221594, valid: 11405 (val_frac=0.05)


DL ready: 4617 238


ep 1/15 batch 100/4617 lr=1.01e-04 loss=6.9130 elapsed=56.4s


ep 1/15 batch 200/4617 lr=2.01e-04 loss=6.4803 elapsed=112.8s


ep 1/15 batch 300/4617 lr=3.01e-04 loss=5.9256 elapsed=169.8s


ep 1/15 batch 400/4617 lr=4.01e-04 loss=5.4922 elapsed=227.5s


ep 1/15 batch 500/4617 lr=5.01e-04 loss=5.1785 elapsed=285.6s


ep 1/15 batch 600/4617 lr=6.00e-04 loss=4.9424 elapsed=344.0s


ep 1/15 batch 700/4617 lr=7.00e-04 loss=4.8172 elapsed=402.7s


ep 1/15 batch 800/4617 lr=8.00e-04 loss=4.7275 elapsed=461.5s


ep 1/15 batch 900/4617 lr=9.00e-04 loss=4.6433 elapsed=520.3s


ep 1/15 batch 1000/4617 lr=9.99e-04 loss=4.5816 elapsed=579.1s


ep 1/15 batch 1100/4617 lr=9.99e-04 loss=4.5409 elapsed=638.9s


ep 1/15 batch 1200/4617 lr=9.99e-04 loss=4.5033 elapsed=698.8s


ep 1/15 batch 1300/4617 lr=9.99e-04 loss=4.4610 elapsed=758.7s


ep 1/15 batch 1400/4617 lr=9.99e-04 loss=4.4288 elapsed=818.3s


ep 1/15 batch 1500/4617 lr=9.99e-04 loss=4.3948 elapsed=878.1s


ep 1/15 batch 1600/4617 lr=9.99e-04 loss=4.3598 elapsed=937.9s


ep 1/15 batch 1700/4617 lr=9.99e-04 loss=4.3378 elapsed=997.8s


ep 1/15 batch 1800/4617 lr=9.98e-04 loss=4.3198 elapsed=1057.7s


ep 1/15 batch 1900/4617 lr=9.98e-04 loss=4.2938 elapsed=1117.7s


ep 1/15 batch 2000/4617 lr=9.98e-04 loss=4.2682 elapsed=1177.6s


ep 1/15 batch 2100/4617 lr=9.98e-04 loss=4.2522 elapsed=1237.6s


ep 1/15 batch 2200/4617 lr=9.98e-04 loss=4.2254 elapsed=1297.5s


ep 1/15 batch 2300/4617 lr=9.97e-04 loss=4.1979 elapsed=1357.4s


ep 1/15 batch 2400/4617 lr=9.97e-04 loss=4.1763 elapsed=1417.4s


ep 1/15 batch 2500/4617 lr=9.97e-04 loss=4.1570 elapsed=1477.3s


ep 1/15 batch 2600/4617 lr=9.97e-04 loss=4.1411 elapsed=1537.2s


ep 1/15 batch 2700/4617 lr=9.96e-04 loss=4.1153 elapsed=1597.2s


ep 1/15 batch 2800/4617 lr=9.96e-04 loss=4.1025 elapsed=1657.1s


ep 1/15 batch 2900/4617 lr=9.96e-04 loss=4.0861 elapsed=1717.1s


ep 1/15 batch 3000/4617 lr=9.95e-04 loss=4.0736 elapsed=1777.0s


ep 1/15 batch 3100/4617 lr=9.95e-04 loss=4.0559 elapsed=1837.0s


ep 1/15 batch 3200/4617 lr=9.95e-04 loss=4.0444 elapsed=1896.9s


ep 1/15 batch 3300/4617 lr=9.94e-04 loss=4.0297 elapsed=1956.9s


ep 1/15 batch 3400/4617 lr=9.94e-04 loss=4.0138 elapsed=2016.8s


ep 1/15 batch 3500/4617 lr=9.94e-04 loss=4.0038 elapsed=2076.8s


ep 1/15 batch 3600/4617 lr=9.93e-04 loss=3.9879 elapsed=2136.6s


ep 1/15 batch 3700/4617 lr=9.93e-04 loss=3.9723 elapsed=2196.3s


ep 1/15 batch 3800/4617 lr=9.93e-04 loss=3.9586 elapsed=2256.2s


ep 1/15 batch 3900/4617 lr=9.92e-04 loss=3.9467 elapsed=2316.0s


ep 1/15 batch 4000/4617 lr=9.92e-04 loss=3.9358 elapsed=2375.9s


ep 1/15 batch 4100/4617 lr=9.91e-04 loss=3.9224 elapsed=2435.8s


ep 1/15 batch 4200/4617 lr=9.91e-04 loss=3.9058 elapsed=2495.8s


ep 1/15 batch 4300/4617 lr=9.91e-04 loss=3.8920 elapsed=2555.7s


ep 1/15 batch 4400/4617 lr=9.90e-04 loss=3.8828 elapsed=2615.7s


ep 1/15 batch 4500/4617 lr=9.90e-04 loss=3.8693 elapsed=2675.7s


ep 1/15 batch 4600/4617 lr=9.89e-04 loss=3.8546 elapsed=2735.6s


Epoch 1 train_loss=3.8522 epoch_time=2745.5s total_elapsed=45.8m


  Eval batch 50/238


  Eval batch 100/238


  Eval batch 150/238


  Eval batch 200/238


Eval done in 48.6s; acc=0.6687; loss=2.0256


Saved new best (EMA) acc=0.6687


ep 2/15 batch 100/4617 lr=9.89e-04 loss=3.3758 elapsed=60.2s


ep 2/15 batch 200/4617 lr=9.88e-04 loss=3.3277 elapsed=119.9s


ep 2/15 batch 300/4617 lr=9.88e-04 loss=3.3128 elapsed=180.0s


ep 2/15 batch 400/4617 lr=9.87e-04 loss=3.2921 elapsed=239.8s


ep 2/15 batch 500/4617 lr=9.87e-04 loss=3.3019 elapsed=299.8s


ep 2/15 batch 600/4617 lr=9.86e-04 loss=3.2773 elapsed=359.8s


ep 2/15 batch 700/4617 lr=9.86e-04 loss=3.2659 elapsed=419.7s


ep 2/15 batch 800/4617 lr=9.85e-04 loss=3.2534 elapsed=479.6s


ep 2/15 batch 900/4617 lr=9.84e-04 loss=3.2620 elapsed=539.4s


ep 2/15 batch 1000/4617 lr=9.84e-04 loss=3.2540 elapsed=599.3s


ep 2/15 batch 1100/4617 lr=9.83e-04 loss=3.2455 elapsed=659.1s


ep 2/15 batch 1200/4617 lr=9.83e-04 loss=3.2540 elapsed=718.9s


ep 2/15 batch 1300/4617 lr=9.82e-04 loss=3.2669 elapsed=778.7s


ep 2/15 batch 1400/4617 lr=9.82e-04 loss=3.2715 elapsed=838.6s


ep 2/15 batch 1500/4617 lr=9.81e-04 loss=3.2761 elapsed=898.5s


ep 2/15 batch 1600/4617 lr=9.80e-04 loss=3.2622 elapsed=958.4s


ep 2/15 batch 1700/4617 lr=9.80e-04 loss=3.2684 elapsed=1018.3s


ep 2/15 batch 1800/4617 lr=9.79e-04 loss=3.2657 elapsed=1078.2s


ep 2/15 batch 1900/4617 lr=9.78e-04 loss=3.2629 elapsed=1138.0s


ep 2/15 batch 2000/4617 lr=9.78e-04 loss=3.2573 elapsed=1197.9s


ep 2/15 batch 2100/4617 lr=9.77e-04 loss=3.2539 elapsed=1257.9s


ep 2/15 batch 2200/4617 lr=9.76e-04 loss=3.2459 elapsed=1317.9s


ep 2/15 batch 2300/4617 lr=9.76e-04 loss=3.2448 elapsed=1377.9s


ep 2/15 batch 2400/4617 lr=9.75e-04 loss=3.2511 elapsed=1437.8s


ep 2/15 batch 2500/4617 lr=9.74e-04 loss=3.2470 elapsed=1497.5s


ep 2/15 batch 2600/4617 lr=9.73e-04 loss=3.2460 elapsed=1557.3s


ep 2/15 batch 2700/4617 lr=9.73e-04 loss=3.2429 elapsed=1617.2s


ep 2/15 batch 2800/4617 lr=9.72e-04 loss=3.2340 elapsed=1677.1s


ep 2/15 batch 2900/4617 lr=9.71e-04 loss=3.2281 elapsed=1737.0s


ep 2/15 batch 3000/4617 lr=9.70e-04 loss=3.2224 elapsed=1796.7s


ep 2/15 batch 3100/4617 lr=9.70e-04 loss=3.2195 elapsed=1856.7s


ep 2/15 batch 3200/4617 lr=9.69e-04 loss=3.2127 elapsed=1916.7s


ep 2/15 batch 3300/4617 lr=9.68e-04 loss=3.2079 elapsed=1976.6s


ep 2/15 batch 3400/4617 lr=9.67e-04 loss=3.2039 elapsed=2036.5s


ep 2/15 batch 3500/4617 lr=9.67e-04 loss=3.2037 elapsed=2096.3s


ep 2/15 batch 3600/4617 lr=9.66e-04 loss=3.2014 elapsed=2156.2s


ep 2/15 batch 3700/4617 lr=9.65e-04 loss=3.1951 elapsed=2216.1s


ep 2/15 batch 3800/4617 lr=9.64e-04 loss=3.1897 elapsed=2275.9s


ep 2/15 batch 3900/4617 lr=9.63e-04 loss=3.1868 elapsed=2335.8s


ep 2/15 batch 4000/4617 lr=9.62e-04 loss=3.1842 elapsed=2395.7s


ep 2/15 batch 4100/4617 lr=9.61e-04 loss=3.1772 elapsed=2455.5s


ep 2/15 batch 4200/4617 lr=9.61e-04 loss=3.1754 elapsed=2515.4s


ep 2/15 batch 4300/4617 lr=9.60e-04 loss=3.1735 elapsed=2575.3s


ep 2/15 batch 4400/4617 lr=9.59e-04 loss=3.1714 elapsed=2635.2s


ep 2/15 batch 4500/4617 lr=9.58e-04 loss=3.1686 elapsed=2695.1s


ep 2/15 batch 4600/4617 lr=9.57e-04 loss=3.1665 elapsed=2755.0s


Epoch 2 train_loss=3.1651 epoch_time=2764.9s total_elapsed=92.7m


  Eval batch 50/238


  Eval batch 100/238


  Eval batch 150/238


  Eval batch 200/238


Eval done in 46.8s; acc=0.7180; loss=1.0515


Saved new best (EMA) acc=0.7180


ep 3/15 batch 100/4617 lr=9.56e-04 loss=2.9193 elapsed=59.9s


ep 3/15 batch 200/4617 lr=9.55e-04 loss=2.9207 elapsed=119.7s


ep 3/15 batch 300/4617 lr=9.54e-04 loss=2.9752 elapsed=179.4s


ep 3/15 batch 400/4617 lr=9.53e-04 loss=2.9518 elapsed=239.2s


ep 3/15 batch 500/4617 lr=9.52e-04 loss=2.9982 elapsed=299.0s


ep 3/15 batch 600/4617 lr=9.51e-04 loss=2.9875 elapsed=358.9s


ep 3/15 batch 700/4617 lr=9.50e-04 loss=3.0183 elapsed=418.8s


ep 3/15 batch 800/4617 lr=9.49e-04 loss=3.0012 elapsed=478.6s


ep 3/15 batch 900/4617 lr=9.48e-04 loss=2.9858 elapsed=538.4s


ep 3/15 batch 1000/4617 lr=9.47e-04 loss=2.9654 elapsed=598.2s


ep 3/15 batch 1100/4617 lr=9.46e-04 loss=2.9694 elapsed=658.1s


ep 3/15 batch 1200/4617 lr=9.45e-04 loss=2.9673 elapsed=717.9s


ep 3/15 batch 1300/4617 lr=9.44e-04 loss=2.9730 elapsed=777.8s


ep 3/15 batch 1400/4617 lr=9.43e-04 loss=2.9695 elapsed=837.7s


ep 3/15 batch 1500/4617 lr=9.42e-04 loss=2.9731 elapsed=897.6s


ep 3/15 batch 1600/4617 lr=9.41e-04 loss=2.9826 elapsed=957.5s


ep 3/15 batch 1700/4617 lr=9.40e-04 loss=2.9810 elapsed=1017.2s


ep 3/15 batch 1800/4617 lr=9.39e-04 loss=2.9803 elapsed=1077.1s


ep 3/15 batch 1900/4617 lr=9.38e-04 loss=2.9806 elapsed=1137.1s


ep 3/15 batch 2000/4617 lr=9.37e-04 loss=2.9694 elapsed=1196.9s


ep 3/15 batch 2100/4617 lr=9.35e-04 loss=2.9676 elapsed=1256.8s


ep 3/15 batch 2200/4617 lr=9.34e-04 loss=2.9703 elapsed=1316.6s


ep 3/15 batch 2300/4617 lr=9.33e-04 loss=2.9667 elapsed=1376.5s


ep 3/15 batch 2400/4617 lr=9.32e-04 loss=2.9659 elapsed=1436.5s


ep 3/15 batch 2500/4617 lr=9.31e-04 loss=2.9632 elapsed=1496.1s


ep 3/15 batch 2600/4617 lr=9.30e-04 loss=2.9671 elapsed=1555.8s


ep 3/15 batch 2700/4617 lr=9.29e-04 loss=2.9668 elapsed=1615.6s


ep 3/15 batch 2800/4617 lr=9.27e-04 loss=2.9656 elapsed=1675.4s


ep 3/15 batch 2900/4617 lr=9.26e-04 loss=2.9613 elapsed=1735.2s


ep 3/15 batch 3000/4617 lr=9.25e-04 loss=2.9569 elapsed=1795.0s


ep 3/15 batch 3100/4617 lr=9.24e-04 loss=2.9534 elapsed=1854.9s


ep 3/15 batch 3200/4617 lr=9.23e-04 loss=2.9475 elapsed=1914.7s


ep 3/15 batch 3300/4617 lr=9.21e-04 loss=2.9477 elapsed=1974.5s


ep 3/15 batch 3400/4617 lr=9.20e-04 loss=2.9463 elapsed=2034.4s


ep 3/15 batch 3500/4617 lr=9.19e-04 loss=2.9411 elapsed=2094.4s


ep 3/15 batch 3600/4617 lr=9.18e-04 loss=2.9319 elapsed=2154.4s


ep 3/15 batch 3700/4617 lr=9.16e-04 loss=2.9332 elapsed=2214.2s


ep 3/15 batch 3800/4617 lr=9.15e-04 loss=2.9324 elapsed=2273.9s


ep 3/15 batch 3900/4617 lr=9.14e-04 loss=2.9260 elapsed=2333.7s


ep 3/15 batch 4000/4617 lr=9.13e-04 loss=2.9228 elapsed=2393.6s


ep 3/15 batch 4100/4617 lr=9.11e-04 loss=2.9188 elapsed=2453.4s


ep 3/15 batch 4200/4617 lr=9.10e-04 loss=2.9161 elapsed=2513.1s


ep 3/15 batch 4300/4617 lr=9.09e-04 loss=2.9161 elapsed=2572.8s


ep 3/15 batch 4400/4617 lr=9.07e-04 loss=2.9134 elapsed=2632.6s


ep 3/15 batch 4500/4617 lr=9.06e-04 loss=2.9107 elapsed=2692.3s


ep 3/15 batch 4600/4617 lr=9.05e-04 loss=2.9070 elapsed=2752.3s


Epoch 3 train_loss=2.9069 epoch_time=2762.3s total_elapsed=139.5m


  Eval batch 50/238


  Eval batch 100/238


  Eval batch 150/238


  Eval batch 200/238


Eval done in 46.8s; acc=0.7415; loss=0.9101


Saved new best (EMA) acc=0.7415


ep 4/15 batch 100/4617 lr=9.03e-04 loss=2.7670 elapsed=60.1s


ep 4/15 batch 200/4617 lr=9.02e-04 loss=2.8117 elapsed=120.1s


ep 4/15 batch 300/4617 lr=9.01e-04 loss=2.8226 elapsed=180.0s


ep 4/15 batch 400/4617 lr=8.99e-04 loss=2.8188 elapsed=239.6s


ep 4/15 batch 500/4617 lr=8.98e-04 loss=2.8204 elapsed=299.3s


ep 4/15 batch 600/4617 lr=8.96e-04 loss=2.8230 elapsed=359.0s


ep 4/15 batch 700/4617 lr=8.95e-04 loss=2.8178 elapsed=418.8s


ep 4/15 batch 800/4617 lr=8.94e-04 loss=2.8071 elapsed=478.7s


ep 4/15 batch 900/4617 lr=8.92e-04 loss=2.8023 elapsed=538.5s


ep 4/15 batch 1000/4617 lr=8.91e-04 loss=2.8115 elapsed=598.3s


ep 4/15 batch 1100/4617 lr=8.89e-04 loss=2.8126 elapsed=657.9s


ep 4/15 batch 1200/4617 lr=8.88e-04 loss=2.8149 elapsed=717.6s


ep 4/15 batch 1300/4617 lr=8.87e-04 loss=2.8137 elapsed=777.3s


ep 4/15 batch 1400/4617 lr=8.85e-04 loss=2.8074 elapsed=837.2s


ep 4/15 batch 1500/4617 lr=8.84e-04 loss=2.8115 elapsed=896.8s


ep 4/15 batch 1600/4617 lr=8.82e-04 loss=2.8165 elapsed=956.7s


ep 4/15 batch 1700/4617 lr=8.81e-04 loss=2.8073 elapsed=1016.6s


ep 4/15 batch 1800/4617 lr=8.79e-04 loss=2.8096 elapsed=1076.4s


ep 4/15 batch 1900/4617 lr=8.78e-04 loss=2.8080 elapsed=1136.3s


ep 4/15 batch 2000/4617 lr=8.76e-04 loss=2.8067 elapsed=1196.1s


ep 4/15 batch 2100/4617 lr=8.75e-04 loss=2.8157 elapsed=1255.8s


ep 4/15 batch 2200/4617 lr=8.73e-04 loss=2.8119 elapsed=1315.5s


ep 4/15 batch 2300/4617 lr=8.72e-04 loss=2.8024 elapsed=1375.4s


ep 4/15 batch 2400/4617 lr=8.70e-04 loss=2.8045 elapsed=1435.1s


ep 4/15 batch 2500/4617 lr=8.69e-04 loss=2.8031 elapsed=1494.8s


ep 4/15 batch 2600/4617 lr=8.67e-04 loss=2.8020 elapsed=1554.5s


ep 4/15 batch 2700/4617 lr=8.66e-04 loss=2.7965 elapsed=1614.3s


ep 4/15 batch 2800/4617 lr=8.64e-04 loss=2.7945 elapsed=1674.1s


ep 4/15 batch 2900/4617 lr=8.63e-04 loss=2.7979 elapsed=1734.0s


ep 4/15 batch 3000/4617 lr=8.61e-04 loss=2.7978 elapsed=1793.6s


ep 4/15 batch 3100/4617 lr=8.59e-04 loss=2.7952 elapsed=1853.3s


ep 4/15 batch 3200/4617 lr=8.58e-04 loss=2.7898 elapsed=1913.1s


ep 4/15 batch 3300/4617 lr=8.56e-04 loss=2.7904 elapsed=1972.9s


ep 4/15 batch 3400/4617 lr=8.55e-04 loss=2.7861 elapsed=2032.7s


ep 4/15 batch 3500/4617 lr=8.53e-04 loss=2.7835 elapsed=2092.5s


ep 4/15 batch 3600/4617 lr=8.51e-04 loss=2.7796 elapsed=2152.3s


ep 4/15 batch 3700/4617 lr=8.50e-04 loss=2.7751 elapsed=2212.1s


ep 4/15 batch 3800/4617 lr=8.48e-04 loss=2.7717 elapsed=2272.0s


ep 4/15 batch 3900/4617 lr=8.47e-04 loss=2.7741 elapsed=2331.7s


ep 4/15 batch 4000/4617 lr=8.45e-04 loss=2.7715 elapsed=2391.6s


ep 4/15 batch 4100/4617 lr=8.43e-04 loss=2.7644 elapsed=2451.3s


ep 4/15 batch 4200/4617 lr=8.42e-04 loss=2.7608 elapsed=2510.9s


ep 4/15 batch 4300/4617 lr=8.40e-04 loss=2.7541 elapsed=2570.6s


ep 4/15 batch 4400/4617 lr=8.38e-04 loss=2.7518 elapsed=2630.3s


ep 4/15 batch 4500/4617 lr=8.37e-04 loss=2.7496 elapsed=2690.1s


ep 4/15 batch 4600/4617 lr=8.35e-04 loss=2.7465 elapsed=2749.9s


Epoch 4 train_loss=2.7464 epoch_time=2759.8s total_elapsed=186.3m


  Eval batch 50/238


  Eval batch 100/238


  Eval batch 150/238


  Eval batch 200/238


Eval done in 46.7s; acc=0.7564; loss=0.8443


Saved new best (EMA) acc=0.7564


ep 5/15 batch 100/4617 lr=8.33e-04 loss=2.6709 elapsed=60.0s


ep 5/15 batch 200/4617 lr=8.31e-04 loss=2.6992 elapsed=119.9s


ep 5/15 batch 300/4617 lr=8.30e-04 loss=2.6479 elapsed=179.8s


ep 5/15 batch 400/4617 lr=8.28e-04 loss=2.6515 elapsed=239.5s


ep 5/15 batch 500/4617 lr=8.26e-04 loss=2.6158 elapsed=299.2s


ep 5/15 batch 600/4617 lr=8.25e-04 loss=2.6087 elapsed=358.9s


ep 5/15 batch 700/4617 lr=8.23e-04 loss=2.6164 elapsed=418.7s


ep 5/15 batch 800/4617 lr=8.21e-04 loss=2.6510 elapsed=478.5s


ep 5/15 batch 900/4617 lr=8.19e-04 loss=2.6592 elapsed=538.2s


ep 5/15 batch 1000/4617 lr=8.18e-04 loss=2.6736 elapsed=598.0s


ep 5/15 batch 1100/4617 lr=8.16e-04 loss=2.6683 elapsed=657.8s


ep 5/15 batch 1200/4617 lr=8.14e-04 loss=2.6730 elapsed=717.6s


ep 5/15 batch 1300/4617 lr=8.12e-04 loss=2.6768 elapsed=777.3s


ep 5/15 batch 1400/4617 lr=8.10e-04 loss=2.6807 elapsed=837.1s


ep 5/15 batch 1500/4617 lr=8.09e-04 loss=2.6895 elapsed=896.9s


ep 5/15 batch 1600/4617 lr=8.07e-04 loss=2.6748 elapsed=956.6s


ep 5/15 batch 1700/4617 lr=8.05e-04 loss=2.6672 elapsed=1016.4s


ep 5/15 batch 1800/4617 lr=8.03e-04 loss=2.6664 elapsed=1076.1s


ep 5/15 batch 1900/4617 lr=8.02e-04 loss=2.6682 elapsed=1135.8s


ep 5/15 batch 2000/4617 lr=8.00e-04 loss=2.6711 elapsed=1195.5s


ep 5/15 batch 2100/4617 lr=7.98e-04 loss=2.6721 elapsed=1255.2s


ep 5/15 batch 2200/4617 lr=7.96e-04 loss=2.6747 elapsed=1315.1s


ep 5/15 batch 2300/4617 lr=7.94e-04 loss=2.6739 elapsed=1374.8s


ep 5/15 batch 2400/4617 lr=7.92e-04 loss=2.6681 elapsed=1434.5s


ep 5/15 batch 2500/4617 lr=7.91e-04 loss=2.6763 elapsed=1494.3s


ep 5/15 batch 2600/4617 lr=7.89e-04 loss=2.6771 elapsed=1554.0s


ep 5/15 batch 2700/4617 lr=7.87e-04 loss=2.6753 elapsed=1613.5s


ep 5/15 batch 2800/4617 lr=7.85e-04 loss=2.6758 elapsed=1673.2s


ep 5/15 batch 2900/4617 lr=7.83e-04 loss=2.6739 elapsed=1732.9s


ep 5/15 batch 3000/4617 lr=7.81e-04 loss=2.6734 elapsed=1792.7s


ep 5/15 batch 3100/4617 lr=7.79e-04 loss=2.6739 elapsed=1852.4s


ep 5/15 batch 3200/4617 lr=7.78e-04 loss=2.6696 elapsed=1912.2s


ep 5/15 batch 3300/4617 lr=7.76e-04 loss=2.6672 elapsed=1971.9s


ep 5/15 batch 3400/4617 lr=7.74e-04 loss=2.6604 elapsed=2031.6s


ep 5/15 batch 3500/4617 lr=7.72e-04 loss=2.6570 elapsed=2091.6s


ep 5/15 batch 3600/4617 lr=7.70e-04 loss=2.6509 elapsed=2151.4s


ep 5/15 batch 3700/4617 lr=7.68e-04 loss=2.6501 elapsed=2211.3s


ep 5/15 batch 3800/4617 lr=7.66e-04 loss=2.6486 elapsed=2271.1s


ep 5/15 batch 3900/4617 lr=7.64e-04 loss=2.6438 elapsed=2330.6s


ep 5/15 batch 4000/4617 lr=7.62e-04 loss=2.6413 elapsed=2390.6s


ep 5/15 batch 4100/4617 lr=7.60e-04 loss=2.6374 elapsed=2450.4s


ep 5/15 batch 4200/4617 lr=7.58e-04 loss=2.6340 elapsed=2510.2s


ep 5/15 batch 4300/4617 lr=7.56e-04 loss=2.6323 elapsed=2570.0s


ep 5/15 batch 4400/4617 lr=7.54e-04 loss=2.6326 elapsed=2629.6s


ep 5/15 batch 4500/4617 lr=7.53e-04 loss=2.6298 elapsed=2689.1s


ep 5/15 batch 4600/4617 lr=7.51e-04 loss=2.6262 elapsed=2748.9s


Epoch 5 train_loss=2.6267 epoch_time=2758.8s total_elapsed=233.1m


  Eval batch 50/238


  Eval batch 100/238


  Eval batch 150/238


  Eval batch 200/238


Eval done in 46.6s; acc=0.7712; loss=0.7954


Saved new best (EMA) acc=0.7712


ep 6/15 batch 100/4617 lr=7.48e-04 loss=2.4719 elapsed=59.8s


ep 6/15 batch 200/4617 lr=7.46e-04 loss=2.4635 elapsed=119.5s


ep 6/15 batch 300/4617 lr=7.44e-04 loss=2.4943 elapsed=179.4s


ep 6/15 batch 400/4617 lr=7.42e-04 loss=2.5018 elapsed=239.2s


ep 6/15 batch 500/4617 lr=7.40e-04 loss=2.5103 elapsed=299.0s


ep 6/15 batch 600/4617 lr=7.38e-04 loss=2.5026 elapsed=358.6s


ep 6/15 batch 700/4617 lr=7.36e-04 loss=2.5040 elapsed=418.3s


ep 6/15 batch 800/4617 lr=7.34e-04 loss=2.5103 elapsed=478.0s


ep 6/15 batch 900/4617 lr=7.32e-04 loss=2.5101 elapsed=537.8s


ep 6/15 batch 1000/4617 lr=7.30e-04 loss=2.5038 elapsed=597.6s


ep 6/15 batch 1100/4617 lr=7.28e-04 loss=2.5066 elapsed=657.3s


ep 6/15 batch 1200/4617 lr=7.26e-04 loss=2.4922 elapsed=716.9s


ep 6/15 batch 1300/4617 lr=7.24e-04 loss=2.5043 elapsed=776.6s


ep 6/15 batch 1400/4617 lr=7.22e-04 loss=2.5059 elapsed=836.3s


ep 6/15 batch 1500/4617 lr=7.20e-04 loss=2.4892 elapsed=896.0s


ep 6/15 batch 1600/4617 lr=7.18e-04 loss=2.4929 elapsed=955.7s


ep 6/15 batch 1700/4617 lr=7.16e-04 loss=2.4935 elapsed=1015.4s


ep 6/15 batch 1800/4617 lr=7.14e-04 loss=2.4932 elapsed=1075.2s


ep 6/15 batch 1900/4617 lr=7.12e-04 loss=2.4997 elapsed=1135.0s


ep 6/15 batch 2000/4617 lr=7.10e-04 loss=2.5032 elapsed=1194.8s


ep 6/15 batch 2100/4617 lr=7.08e-04 loss=2.5022 elapsed=1254.6s


ep 6/15 batch 2200/4617 lr=7.06e-04 loss=2.5096 elapsed=1314.3s


ep 6/15 batch 2300/4617 lr=7.04e-04 loss=2.5066 elapsed=1373.9s


ep 6/15 batch 2400/4617 lr=7.02e-04 loss=2.5072 elapsed=1433.8s


ep 6/15 batch 2500/4617 lr=7.00e-04 loss=2.5096 elapsed=1493.7s


ep 6/15 batch 2600/4617 lr=6.98e-04 loss=2.5066 elapsed=1553.5s


ep 6/15 batch 2700/4617 lr=6.96e-04 loss=2.5072 elapsed=1613.2s


ep 6/15 batch 2800/4617 lr=6.93e-04 loss=2.5037 elapsed=1672.8s


ep 6/15 batch 2900/4617 lr=6.91e-04 loss=2.5039 elapsed=1732.6s


ep 6/15 batch 3000/4617 lr=6.89e-04 loss=2.5023 elapsed=1792.3s


ep 6/15 batch 3100/4617 lr=6.87e-04 loss=2.5046 elapsed=1852.1s


ep 6/15 batch 3200/4617 lr=6.85e-04 loss=2.5030 elapsed=1911.7s


ep 6/15 batch 3300/4617 lr=6.83e-04 loss=2.4992 elapsed=1971.3s


ep 6/15 batch 3400/4617 lr=6.81e-04 loss=2.4983 elapsed=2031.1s


ep 6/15 batch 3500/4617 lr=6.79e-04 loss=2.4953 elapsed=2090.8s


ep 6/15 batch 3600/4617 lr=6.77e-04 loss=2.4955 elapsed=2150.6s


ep 6/15 batch 3700/4617 lr=6.74e-04 loss=2.4939 elapsed=2210.4s


ep 6/15 batch 3800/4617 lr=6.72e-04 loss=2.4935 elapsed=2270.2s


ep 6/15 batch 3900/4617 lr=6.70e-04 loss=2.4893 elapsed=2329.9s


ep 6/15 batch 4000/4617 lr=6.68e-04 loss=2.4895 elapsed=2389.7s


ep 6/15 batch 4100/4617 lr=6.66e-04 loss=2.4927 elapsed=2449.4s


ep 6/15 batch 4200/4617 lr=6.64e-04 loss=2.4963 elapsed=2509.2s


ep 6/15 batch 4300/4617 lr=6.62e-04 loss=2.4958 elapsed=2568.9s


ep 6/15 batch 4400/4617 lr=6.60e-04 loss=2.4958 elapsed=2628.7s


ep 6/15 batch 4500/4617 lr=6.57e-04 loss=2.4931 elapsed=2688.5s


ep 6/15 batch 4600/4617 lr=6.55e-04 loss=2.4909 elapsed=2748.3s


Epoch 6 train_loss=2.4905 epoch_time=2758.2s total_elapsed=279.8m


  Eval batch 50/238


  Eval batch 100/238


  Eval batch 150/238


  Eval batch 200/238


Eval done in 46.5s; acc=0.7853; loss=0.7572


Saved new best (EMA) acc=0.7853


ep 7/15 batch 100/4617 lr=6.53e-04 loss=2.4672 elapsed=59.8s


ep 7/15 batch 200/4617 lr=6.51e-04 loss=2.3729 elapsed=119.5s


ep 7/15 batch 300/4617 lr=6.48e-04 loss=2.3812 elapsed=179.4s


ep 7/15 batch 400/4617 lr=6.46e-04 loss=2.4049 elapsed=239.2s


ep 7/15 batch 500/4617 lr=6.44e-04 loss=2.3732 elapsed=298.9s


ep 7/15 batch 600/4617 lr=6.42e-04 loss=2.3769 elapsed=358.8s


ep 7/15 batch 700/4617 lr=6.40e-04 loss=2.3890 elapsed=418.8s


ep 7/15 batch 800/4617 lr=6.38e-04 loss=2.3771 elapsed=478.6s


ep 7/15 batch 900/4617 lr=6.35e-04 loss=2.3819 elapsed=538.4s


ep 7/15 batch 1000/4617 lr=6.33e-04 loss=2.3901 elapsed=598.1s


ep 7/15 batch 1100/4617 lr=6.31e-04 loss=2.3956 elapsed=657.7s


ep 7/15 batch 1200/4617 lr=6.29e-04 loss=2.3909 elapsed=717.7s


ep 7/15 batch 1300/4617 lr=6.27e-04 loss=2.3935 elapsed=777.5s


ep 7/15 batch 1400/4617 lr=6.24e-04 loss=2.3983 elapsed=837.3s


ep 7/15 batch 1500/4617 lr=6.22e-04 loss=2.3945 elapsed=896.9s


ep 7/15 batch 1600/4617 lr=6.20e-04 loss=2.3836 elapsed=956.6s


ep 7/15 batch 1700/4617 lr=6.18e-04 loss=2.3878 elapsed=1016.3s


ep 7/15 batch 1800/4617 lr=6.16e-04 loss=2.3895 elapsed=1076.1s


ep 7/15 batch 1900/4617 lr=6.13e-04 loss=2.3995 elapsed=1135.9s


ep 7/15 batch 2000/4617 lr=6.11e-04 loss=2.4019 elapsed=1195.5s


ep 7/15 batch 2100/4617 lr=6.09e-04 loss=2.3953 elapsed=1255.2s


ep 7/15 batch 2200/4617 lr=6.07e-04 loss=2.3974 elapsed=1314.9s


ep 7/15 batch 2300/4617 lr=6.05e-04 loss=2.3967 elapsed=1374.7s


ep 7/15 batch 2400/4617 lr=6.02e-04 loss=2.3905 elapsed=1434.4s


ep 7/15 batch 2500/4617 lr=6.00e-04 loss=2.3789 elapsed=1494.0s


ep 7/15 batch 2600/4617 lr=5.98e-04 loss=2.3733 elapsed=1553.8s


ep 7/15 batch 2700/4617 lr=5.96e-04 loss=2.3708 elapsed=1613.5s


ep 7/15 batch 2800/4617 lr=5.93e-04 loss=2.3712 elapsed=1673.2s


ep 7/15 batch 2900/4617 lr=5.91e-04 loss=2.3703 elapsed=1733.1s


ep 7/15 batch 3000/4617 lr=5.89e-04 loss=2.3709 elapsed=1792.9s


ep 7/15 batch 3100/4617 lr=5.87e-04 loss=2.3681 elapsed=1852.6s


ep 7/15 batch 3200/4617 lr=5.85e-04 loss=2.3641 elapsed=1912.4s


ep 7/15 batch 3300/4617 lr=5.82e-04 loss=2.3621 elapsed=1972.1s


ep 7/15 batch 3400/4617 lr=5.80e-04 loss=2.3644 elapsed=2032.0s


ep 7/15 batch 3500/4617 lr=5.78e-04 loss=2.3590 elapsed=2091.8s


ep 7/15 batch 3600/4617 lr=5.76e-04 loss=2.3601 elapsed=2151.4s


ep 7/15 batch 3700/4617 lr=5.73e-04 loss=2.3564 elapsed=2211.2s


ep 7/15 batch 3800/4617 lr=5.71e-04 loss=2.3576 elapsed=2271.1s


ep 7/15 batch 3900/4617 lr=5.69e-04 loss=2.3581 elapsed=2330.9s


ep 7/15 batch 4000/4617 lr=5.67e-04 loss=2.3618 elapsed=2390.7s


ep 7/15 batch 4100/4617 lr=5.64e-04 loss=2.3596 elapsed=2450.3s


ep 7/15 batch 4200/4617 lr=5.62e-04 loss=2.3526 elapsed=2510.0s


ep 7/15 batch 4300/4617 lr=5.60e-04 loss=2.3503 elapsed=2569.7s


ep 7/15 batch 4400/4617 lr=5.58e-04 loss=2.3499 elapsed=2629.6s


ep 7/15 batch 4500/4617 lr=5.55e-04 loss=2.3469 elapsed=2689.1s


ep 7/15 batch 4600/4617 lr=5.53e-04 loss=2.3448 elapsed=2748.8s


Epoch 7 train_loss=2.3431 epoch_time=2758.7s total_elapsed=326.6m


  Eval batch 50/238


  Eval batch 100/238


  Eval batch 150/238


  Eval batch 200/238


Eval done in 46.6s; acc=0.7930; loss=0.7232


Saved new best (EMA) acc=0.7930


ep 8/15 batch 100/4617 lr=5.50e-04 loss=2.4371 elapsed=60.2s


ep 8/15 batch 200/4617 lr=5.48e-04 loss=2.4384 elapsed=120.1s


ep 8/15 batch 300/4617 lr=5.46e-04 loss=2.3714 elapsed=179.9s


ep 8/15 batch 400/4617 lr=5.44e-04 loss=2.3476 elapsed=239.7s


ep 8/15 batch 500/4617 lr=5.41e-04 loss=2.3558 elapsed=299.5s


ep 8/15 batch 600/4617 lr=5.39e-04 loss=2.3517 elapsed=359.3s


ep 8/15 batch 700/4617 lr=5.37e-04 loss=2.3447 elapsed=419.0s


ep 8/15 batch 800/4617 lr=5.35e-04 loss=2.3467 elapsed=478.8s


ep 8/15 batch 900/4617 lr=5.32e-04 loss=2.3454 elapsed=538.6s


ep 8/15 batch 1000/4617 lr=5.30e-04 loss=2.3578 elapsed=598.4s


ep 8/15 batch 1100/4617 lr=5.28e-04 loss=2.3553 elapsed=658.2s


ep 8/15 batch 1200/4617 lr=5.26e-04 loss=2.3580 elapsed=718.0s


ep 8/15 batch 1300/4617 lr=5.23e-04 loss=2.3509 elapsed=777.8s


ep 8/15 batch 1400/4617 lr=5.21e-04 loss=2.3439 elapsed=837.5s


ep 8/15 batch 1500/4617 lr=5.19e-04 loss=2.3386 elapsed=897.3s


ep 8/15 batch 1600/4617 lr=5.17e-04 loss=2.3310 elapsed=957.1s


ep 8/15 batch 1700/4617 lr=5.14e-04 loss=2.3266 elapsed=1016.9s


ep 8/15 batch 1800/4617 lr=5.12e-04 loss=2.3185 elapsed=1076.7s


ep 8/15 batch 1900/4617 lr=5.10e-04 loss=2.3127 elapsed=1136.4s


ep 8/15 batch 2000/4617 lr=5.07e-04 loss=2.3141 elapsed=1196.2s


ep 8/15 batch 2100/4617 lr=5.05e-04 loss=2.3114 elapsed=1256.0s


ep 8/15 batch 2200/4617 lr=5.03e-04 loss=2.3087 elapsed=1315.9s


ep 8/15 batch 2300/4617 lr=5.01e-04 loss=2.3098 elapsed=1375.7s


ep 8/15 batch 2400/4617 lr=4.98e-04 loss=2.3036 elapsed=1435.4s


ep 8/15 batch 2500/4617 lr=4.96e-04 loss=2.3034 elapsed=1495.2s


ep 8/15 batch 2600/4617 lr=4.94e-04 loss=2.2990 elapsed=1555.0s


ep 8/15 batch 2700/4617 lr=4.92e-04 loss=2.2984 elapsed=1614.8s


ep 8/15 batch 2800/4617 lr=4.89e-04 loss=2.2944 elapsed=1674.6s


ep 8/15 batch 2900/4617 lr=4.87e-04 loss=2.2920 elapsed=1734.3s


ep 8/15 batch 3000/4617 lr=4.85e-04 loss=2.2899 elapsed=1794.0s


ep 8/15 batch 3100/4617 lr=4.83e-04 loss=2.2884 elapsed=1853.8s


ep 8/15 batch 3200/4617 lr=4.80e-04 loss=2.2841 elapsed=1913.6s


ep 8/15 batch 3300/4617 lr=4.78e-04 loss=2.2814 elapsed=1973.2s


ep 8/15 batch 3400/4617 lr=4.76e-04 loss=2.2755 elapsed=2032.9s


ep 8/15 batch 3500/4617 lr=4.74e-04 loss=2.2738 elapsed=2092.7s


ep 8/15 batch 3600/4617 lr=4.71e-04 loss=2.2741 elapsed=2152.4s


ep 8/15 batch 3700/4617 lr=4.69e-04 loss=2.2692 elapsed=2212.2s


ep 8/15 batch 3800/4617 lr=4.67e-04 loss=2.2653 elapsed=2271.9s


ep 8/15 batch 3900/4617 lr=4.64e-04 loss=2.2620 elapsed=2331.7s


ep 8/15 batch 4000/4617 lr=4.62e-04 loss=2.2582 elapsed=2391.5s


ep 8/15 batch 4100/4617 lr=4.60e-04 loss=2.2570 elapsed=2451.3s


ep 8/15 batch 4200/4617 lr=4.58e-04 loss=2.2521 elapsed=2511.0s


ep 8/15 batch 4300/4617 lr=4.55e-04 loss=2.2528 elapsed=2570.7s


ep 8/15 batch 4400/4617 lr=4.53e-04 loss=2.2488 elapsed=2630.5s


ep 8/15 batch 4500/4617 lr=4.51e-04 loss=2.2485 elapsed=2690.3s


ep 8/15 batch 4600/4617 lr=4.49e-04 loss=2.2474 elapsed=2750.2s


Epoch 8 train_loss=2.2472 epoch_time=2760.1s total_elapsed=373.4m


  Eval batch 50/238


  Eval batch 100/238


  Eval batch 150/238


  Eval batch 200/238


Eval done in 46.5s; acc=0.7994; loss=0.6911


Saved new best (EMA) acc=0.7994


ep 9/15 batch 100/4617 lr=4.46e-04 loss=2.2430 elapsed=59.8s


ep 9/15 batch 200/4617 lr=4.44e-04 loss=2.0687 elapsed=119.7s


ep 9/15 batch 300/4617 lr=4.42e-04 loss=2.1294 elapsed=179.4s


ep 9/15 batch 400/4617 lr=4.39e-04 loss=2.1584 elapsed=239.1s


ep 9/15 batch 500/4617 lr=4.37e-04 loss=2.1659 elapsed=298.9s


ep 9/15 batch 600/4617 lr=4.35e-04 loss=2.1754 elapsed=358.7s


ep 9/15 batch 700/4617 lr=4.33e-04 loss=2.1691 elapsed=418.4s


ep 9/15 batch 800/4617 lr=4.30e-04 loss=2.1625 elapsed=478.3s


ep 9/15 batch 900/4617 lr=4.28e-04 loss=2.1515 elapsed=538.1s


ep 9/15 batch 1000/4617 lr=4.26e-04 loss=2.1538 elapsed=597.9s


ep 9/15 batch 1100/4617 lr=4.24e-04 loss=2.1499 elapsed=657.6s


ep 9/15 batch 1200/4617 lr=4.21e-04 loss=2.1484 elapsed=717.1s


ep 9/15 batch 1300/4617 lr=4.19e-04 loss=2.1353 elapsed=777.0s


ep 9/15 batch 1400/4617 lr=4.17e-04 loss=2.1300 elapsed=836.9s


ep 9/15 batch 1500/4617 lr=4.15e-04 loss=2.1350 elapsed=896.8s


ep 9/15 batch 1600/4617 lr=4.12e-04 loss=2.1304 elapsed=956.6s


ep 9/15 batch 1700/4617 lr=4.10e-04 loss=2.1176 elapsed=1016.1s


ep 9/15 batch 1800/4617 lr=4.08e-04 loss=2.1035 elapsed=1075.7s


ep 9/15 batch 1900/4617 lr=4.06e-04 loss=2.1002 elapsed=1135.3s


ep 9/15 batch 2000/4617 lr=4.03e-04 loss=2.0971 elapsed=1195.0s


ep 9/15 batch 2100/4617 lr=4.01e-04 loss=2.0980 elapsed=1254.5s


ep 9/15 batch 2200/4617 lr=3.99e-04 loss=2.0978 elapsed=1314.1s


ep 9/15 batch 2300/4617 lr=3.97e-04 loss=2.0990 elapsed=1373.8s


ep 9/15 batch 2400/4617 lr=3.95e-04 loss=2.0935 elapsed=1433.5s


ep 9/15 batch 2500/4617 lr=3.92e-04 loss=2.0965 elapsed=1493.2s


ep 9/15 batch 2600/4617 lr=3.90e-04 loss=2.0982 elapsed=1553.0s


ep 9/15 batch 2700/4617 lr=3.88e-04 loss=2.0935 elapsed=1612.7s


ep 9/15 batch 2800/4617 lr=3.86e-04 loss=2.0904 elapsed=1672.6s


ep 9/15 batch 2900/4617 lr=3.84e-04 loss=2.0895 elapsed=1732.3s


ep 9/15 batch 3000/4617 lr=3.81e-04 loss=2.0887 elapsed=1791.8s


ep 9/15 batch 3100/4617 lr=3.79e-04 loss=2.0830 elapsed=1851.5s


ep 9/15 batch 3200/4617 lr=3.77e-04 loss=2.0807 elapsed=1911.2s


ep 9/15 batch 3300/4617 lr=3.75e-04 loss=2.0787 elapsed=1970.9s


ep 9/15 batch 3400/4617 lr=3.73e-04 loss=2.0752 elapsed=2030.6s


ep 9/15 batch 3500/4617 lr=3.70e-04 loss=2.0778 elapsed=2090.4s


ep 9/15 batch 3600/4617 lr=3.68e-04 loss=2.0754 elapsed=2150.1s


ep 9/15 batch 3700/4617 lr=3.66e-04 loss=2.0699 elapsed=2209.8s


ep 9/15 batch 3800/4617 lr=3.64e-04 loss=2.0647 elapsed=2269.6s


ep 9/15 batch 3900/4617 lr=3.62e-04 loss=2.0619 elapsed=2329.3s


ep 9/15 batch 4000/4617 lr=3.60e-04 loss=2.0611 elapsed=2389.2s


ep 9/15 batch 4100/4617 lr=3.57e-04 loss=2.0534 elapsed=2449.1s


ep 9/15 batch 4200/4617 lr=3.55e-04 loss=2.0508 elapsed=2508.7s


ep 9/15 batch 4300/4617 lr=3.53e-04 loss=2.0474 elapsed=2568.3s


ep 9/15 batch 4400/4617 lr=3.51e-04 loss=2.0466 elapsed=2628.1s


ep 9/15 batch 4500/4617 lr=3.49e-04 loss=2.0457 elapsed=2687.9s


ep 9/15 batch 4600/4617 lr=3.47e-04 loss=2.0487 elapsed=2747.7s


Epoch 9 train_loss=2.0490 epoch_time=2757.6s total_elapsed=420.1m


  Eval batch 50/238


  Eval batch 100/238


  Eval batch 150/238


  Eval batch 200/238


Eval done in 46.4s; acc=0.8107; loss=0.6609


Saved new best (EMA) acc=0.8107


ep 10/15 batch 100/4617 lr=3.44e-04 loss=2.0100 elapsed=59.9s


ep 10/15 batch 200/4617 lr=3.42e-04 loss=1.9413 elapsed=119.8s


ep 10/15 batch 300/4617 lr=3.40e-04 loss=1.9614 elapsed=179.6s


ep 10/15 batch 400/4617 lr=3.38e-04 loss=1.9570 elapsed=239.3s


ep 10/15 batch 500/4617 lr=3.35e-04 loss=1.9742 elapsed=298.9s


ep 10/15 batch 600/4617 lr=3.33e-04 loss=1.9390 elapsed=358.6s


ep 10/15 batch 700/4617 lr=3.31e-04 loss=1.9464 elapsed=418.4s


ep 10/15 batch 800/4617 lr=3.29e-04 loss=1.9417 elapsed=478.1s


ep 10/15 batch 900/4617 lr=3.27e-04 loss=1.9555 elapsed=537.9s


ep 10/15 batch 1000/4617 lr=3.25e-04 loss=1.9552 elapsed=597.6s


ep 10/15 batch 1100/4617 lr=3.23e-04 loss=1.9770 elapsed=657.3s


ep 10/15 batch 1200/4617 lr=3.21e-04 loss=1.9763 elapsed=717.0s


ep 10/15 batch 1300/4617 lr=3.18e-04 loss=1.9623 elapsed=776.8s


ep 10/15 batch 1400/4617 lr=3.16e-04 loss=1.9659 elapsed=836.6s


ep 10/15 batch 1500/4617 lr=3.14e-04 loss=1.9547 elapsed=896.3s


ep 10/15 batch 1600/4617 lr=3.12e-04 loss=1.9459 elapsed=955.8s


ep 10/15 batch 1700/4617 lr=3.10e-04 loss=1.9496 elapsed=1015.5s


ep 10/15 batch 1800/4617 lr=3.08e-04 loss=1.9486 elapsed=1075.3s


ep 10/15 batch 1900/4617 lr=3.06e-04 loss=1.9475 elapsed=1135.1s


ep 10/15 batch 2000/4617 lr=3.04e-04 loss=1.9496 elapsed=1194.8s


ep 10/15 batch 2100/4617 lr=3.02e-04 loss=1.9531 elapsed=1254.5s


ep 10/15 batch 2200/4617 lr=3.00e-04 loss=1.9515 elapsed=1314.2s


ep 10/15 batch 2300/4617 lr=2.98e-04 loss=1.9555 elapsed=1373.9s


ep 10/15 batch 2400/4617 lr=2.95e-04 loss=1.9578 elapsed=1433.5s


ep 10/15 batch 2500/4617 lr=2.93e-04 loss=1.9608 elapsed=1493.2s


ep 10/15 batch 2600/4617 lr=2.91e-04 loss=1.9586 elapsed=1553.0s


ep 10/15 batch 2700/4617 lr=2.89e-04 loss=1.9607 elapsed=1612.7s


ep 10/15 batch 2800/4617 lr=2.87e-04 loss=1.9606 elapsed=1672.4s


ep 10/15 batch 2900/4617 lr=2.85e-04 loss=1.9561 elapsed=1732.2s


ep 10/15 batch 3000/4617 lr=2.83e-04 loss=1.9556 elapsed=1791.8s


ep 10/15 batch 3100/4617 lr=2.81e-04 loss=1.9521 elapsed=1851.5s


ep 10/15 batch 3200/4617 lr=2.79e-04 loss=1.9450 elapsed=1911.2s


ep 10/15 batch 3300/4617 lr=2.77e-04 loss=1.9447 elapsed=1971.0s


ep 10/15 batch 3400/4617 lr=2.75e-04 loss=1.9463 elapsed=2030.8s


ep 10/15 batch 3500/4617 lr=2.73e-04 loss=1.9452 elapsed=2090.3s


ep 10/15 batch 3600/4617 lr=2.71e-04 loss=1.9456 elapsed=2150.0s


ep 10/15 batch 3700/4617 lr=2.69e-04 loss=1.9425 elapsed=2209.6s


ep 10/15 batch 3800/4617 lr=2.67e-04 loss=1.9391 elapsed=2269.3s


ep 10/15 batch 3900/4617 lr=2.65e-04 loss=1.9389 elapsed=2329.1s


ep 10/15 batch 4000/4617 lr=2.63e-04 loss=1.9326 elapsed=2388.8s


ep 10/15 batch 4100/4617 lr=2.61e-04 loss=1.9316 elapsed=2448.5s


ep 10/15 batch 4200/4617 lr=2.59e-04 loss=1.9254 elapsed=2508.2s


ep 10/15 batch 4300/4617 lr=2.57e-04 loss=1.9252 elapsed=2567.8s


ep 10/15 batch 4400/4617 lr=2.55e-04 loss=1.9219 elapsed=2627.8s


ep 10/15 batch 4500/4617 lr=2.53e-04 loss=1.9209 elapsed=2687.6s


ep 10/15 batch 4600/4617 lr=2.51e-04 loss=1.9210 elapsed=2747.3s


Epoch 10 train_loss=1.9201 epoch_time=2757.2s total_elapsed=466.9m


  Eval batch 50/238


  Eval batch 100/238


  Eval batch 150/238


  Eval batch 200/238


Eval done in 46.7s; acc=0.8164; loss=0.6335


Saved new best (EMA) acc=0.8164


ep 11/15 batch 100/4617 lr=2.49e-04 loss=1.8287 elapsed=60.1s


ep 11/15 batch 200/4617 lr=2.47e-04 loss=1.7943 elapsed=120.0s


ep 11/15 batch 300/4617 lr=2.45e-04 loss=1.7591 elapsed=179.7s


ep 11/15 batch 400/4617 lr=2.43e-04 loss=1.7951 elapsed=239.3s


ep 11/15 batch 500/4617 lr=2.41e-04 loss=1.8053 elapsed=299.0s


ep 11/15 batch 600/4617 lr=2.39e-04 loss=1.8235 elapsed=359.0s


ep 11/15 batch 700/4617 lr=2.37e-04 loss=1.8105 elapsed=418.9s


ep 11/15 batch 800/4617 lr=2.35e-04 loss=1.8222 elapsed=478.6s


ep 11/15 batch 900/4617 lr=2.33e-04 loss=1.8237 elapsed=538.2s


ep 11/15 batch 1000/4617 lr=2.31e-04 loss=1.8107 elapsed=598.0s


ep 11/15 batch 1100/4617 lr=2.29e-04 loss=1.8134 elapsed=657.8s


ep 11/15 batch 1200/4617 lr=2.28e-04 loss=1.8093 elapsed=717.5s


ep 11/15 batch 1300/4617 lr=2.26e-04 loss=1.8057 elapsed=777.1s


ep 11/15 batch 1400/4617 lr=2.24e-04 loss=1.8019 elapsed=836.6s


ep 11/15 batch 1500/4617 lr=2.22e-04 loss=1.8051 elapsed=896.3s


ep 11/15 batch 1600/4617 lr=2.20e-04 loss=1.8102 elapsed=956.0s


ep 11/15 batch 1700/4617 lr=2.18e-04 loss=1.8147 elapsed=1015.8s


ep 11/15 batch 1800/4617 lr=2.16e-04 loss=1.8178 elapsed=1075.6s


ep 11/15 batch 1900/4617 lr=2.14e-04 loss=1.8263 elapsed=1135.4s


ep 11/15 batch 2000/4617 lr=2.13e-04 loss=1.8270 elapsed=1195.1s


ep 11/15 batch 2100/4617 lr=2.11e-04 loss=1.8276 elapsed=1254.9s


ep 11/15 batch 2200/4617 lr=2.09e-04 loss=1.8253 elapsed=1314.6s


ep 11/15 batch 2300/4617 lr=2.07e-04 loss=1.8200 elapsed=1374.4s


ep 11/15 batch 2400/4617 lr=2.05e-04 loss=1.8218 elapsed=1434.3s


ep 11/15 batch 2500/4617 lr=2.03e-04 loss=1.8235 elapsed=1494.1s


ep 11/15 batch 2600/4617 lr=2.02e-04 loss=1.8253 elapsed=1553.8s


ep 11/15 batch 2700/4617 lr=2.00e-04 loss=1.8267 elapsed=1613.6s


ep 11/15 batch 2800/4617 lr=1.98e-04 loss=1.8249 elapsed=1673.4s


ep 11/15 batch 2900/4617 lr=1.96e-04 loss=1.8209 elapsed=1733.1s


ep 11/15 batch 3000/4617 lr=1.94e-04 loss=1.8203 elapsed=1792.8s


ep 11/15 batch 3100/4617 lr=1.93e-04 loss=1.8169 elapsed=1852.6s


ep 11/15 batch 3200/4617 lr=1.91e-04 loss=1.8133 elapsed=1912.5s


ep 11/15 batch 3300/4617 lr=1.89e-04 loss=1.8153 elapsed=1972.2s


ep 11/15 batch 3400/4617 lr=1.87e-04 loss=1.8138 elapsed=2031.9s


ep 11/15 batch 3500/4617 lr=1.85e-04 loss=1.8107 elapsed=2091.7s


ep 11/15 batch 3600/4617 lr=1.84e-04 loss=1.8085 elapsed=2151.5s


ep 11/15 batch 3700/4617 lr=1.82e-04 loss=1.8049 elapsed=2211.3s


ep 11/15 batch 3800/4617 lr=1.80e-04 loss=1.8033 elapsed=2271.1s


ep 11/15 batch 3900/4617 lr=1.79e-04 loss=1.8014 elapsed=2330.7s


ep 11/15 batch 4000/4617 lr=1.77e-04 loss=1.8021 elapsed=2390.5s


ep 11/15 batch 4100/4617 lr=1.75e-04 loss=1.8031 elapsed=2450.2s


ep 11/15 batch 4200/4617 lr=1.73e-04 loss=1.7991 elapsed=2510.0s


ep 11/15 batch 4300/4617 lr=1.72e-04 loss=1.7996 elapsed=2569.7s


ep 11/15 batch 4400/4617 lr=1.70e-04 loss=1.7975 elapsed=2629.4s


ep 11/15 batch 4500/4617 lr=1.68e-04 loss=1.7971 elapsed=2689.2s


ep 11/15 batch 4600/4617 lr=1.67e-04 loss=1.7961 elapsed=2749.1s


Epoch 11 train_loss=1.7953 epoch_time=2759.1s total_elapsed=513.6m


  Eval batch 50/238


  Eval batch 100/238


  Eval batch 150/238


  Eval batch 200/238


Eval done in 46.6s; acc=0.8247; loss=0.6138


Saved new best (EMA) acc=0.8247


ep 12/15 batch 100/4617 lr=1.65e-04 loss=1.7125 elapsed=60.0s


ep 12/15 batch 200/4617 lr=1.63e-04 loss=1.7458 elapsed=119.6s


ep 12/15 batch 300/4617 lr=1.61e-04 loss=1.7163 elapsed=179.3s


ep 12/15 batch 400/4617 lr=1.60e-04 loss=1.6595 elapsed=239.0s


ep 12/15 batch 500/4617 lr=1.58e-04 loss=1.6833 elapsed=298.7s


ep 12/15 batch 600/4617 lr=1.56e-04 loss=1.6870 elapsed=358.5s


ep 12/15 batch 700/4617 lr=1.55e-04 loss=1.6922 elapsed=418.5s


ep 12/15 batch 800/4617 lr=1.53e-04 loss=1.6761 elapsed=478.3s


ep 12/15 batch 900/4617 lr=1.51e-04 loss=1.6634 elapsed=538.1s


ep 12/15 batch 1000/4617 lr=1.50e-04 loss=1.6813 elapsed=597.9s


ep 12/15 batch 1100/4617 lr=1.48e-04 loss=1.6780 elapsed=657.6s


ep 12/15 batch 1200/4617 lr=1.47e-04 loss=1.6767 elapsed=717.4s


ep 12/15 batch 1300/4617 lr=1.45e-04 loss=1.6765 elapsed=777.2s


ep 12/15 batch 1400/4617 lr=1.43e-04 loss=1.6742 elapsed=836.9s


ep 12/15 batch 1500/4617 lr=1.42e-04 loss=1.6682 elapsed=896.7s


ep 12/15 batch 1600/4617 lr=1.40e-04 loss=1.6718 elapsed=956.5s


ep 12/15 batch 1700/4617 lr=1.39e-04 loss=1.6764 elapsed=1016.2s


ep 12/15 batch 1800/4617 lr=1.37e-04 loss=1.6725 elapsed=1076.0s


ep 12/15 batch 1900/4617 lr=1.36e-04 loss=1.6770 elapsed=1135.8s


ep 12/15 batch 2000/4617 lr=1.34e-04 loss=1.6682 elapsed=1195.4s


ep 12/15 batch 2100/4617 lr=1.32e-04 loss=1.6737 elapsed=1255.1s


ep 12/15 batch 2200/4617 lr=1.31e-04 loss=1.6728 elapsed=1314.7s


ep 12/15 batch 2300/4617 lr=1.29e-04 loss=1.6780 elapsed=1374.4s


ep 12/15 batch 2400/4617 lr=1.28e-04 loss=1.6781 elapsed=1434.2s


ep 12/15 batch 2500/4617 lr=1.26e-04 loss=1.6773 elapsed=1493.9s


ep 12/15 batch 2600/4617 lr=1.25e-04 loss=1.6739 elapsed=1553.7s


ep 12/15 batch 2700/4617 lr=1.23e-04 loss=1.6725 elapsed=1613.5s


ep 12/15 batch 2800/4617 lr=1.22e-04 loss=1.6723 elapsed=1673.3s


ep 12/15 batch 2900/4617 lr=1.20e-04 loss=1.6735 elapsed=1732.8s


ep 12/15 batch 3000/4617 lr=1.19e-04 loss=1.6751 elapsed=1792.5s


ep 12/15 batch 3100/4617 lr=1.18e-04 loss=1.6713 elapsed=1852.2s


ep 12/15 batch 3200/4617 lr=1.16e-04 loss=1.6671 elapsed=1912.0s


ep 12/15 batch 3300/4617 lr=1.15e-04 loss=1.6671 elapsed=1971.8s


ep 12/15 batch 3400/4617 lr=1.13e-04 loss=1.6657 elapsed=2031.6s


ep 12/15 batch 3500/4617 lr=1.12e-04 loss=1.6689 elapsed=2091.4s


ep 12/15 batch 3600/4617 lr=1.10e-04 loss=1.6726 elapsed=2151.1s


ep 12/15 batch 3700/4617 lr=1.09e-04 loss=1.6701 elapsed=2210.9s


ep 12/15 batch 3800/4617 lr=1.08e-04 loss=1.6679 elapsed=2270.6s


ep 12/15 batch 3900/4617 lr=1.06e-04 loss=1.6689 elapsed=2330.4s


ep 12/15 batch 4000/4617 lr=1.05e-04 loss=1.6671 elapsed=2390.1s


ep 12/15 batch 4100/4617 lr=1.03e-04 loss=1.6678 elapsed=2449.9s


ep 12/15 batch 4200/4617 lr=1.02e-04 loss=1.6684 elapsed=2509.6s


ep 12/15 batch 4300/4617 lr=1.01e-04 loss=1.6665 elapsed=2569.4s


ep 12/15 batch 4400/4617 lr=9.93e-05 loss=1.6624 elapsed=2629.2s


ep 12/15 batch 4500/4617 lr=9.80e-05 loss=1.6637 elapsed=2689.0s


ep 12/15 batch 4600/4617 lr=9.66e-05 loss=1.6597 elapsed=2748.7s


Epoch 12 train_loss=1.6605 epoch_time=2758.7s total_elapsed=560.4m


  Eval batch 50/238


  Eval batch 100/238


  Eval batch 150/238


  Eval batch 200/238


Eval done in 46.6s; acc=0.8331; loss=0.5988


Saved new best (EMA) acc=0.8331


ep 13/15 batch 100/4617 lr=9.51e-05 loss=1.5917 elapsed=60.0s


ep 13/15 batch 200/4617 lr=9.37e-05 loss=1.6548 elapsed=119.9s


ep 13/15 batch 300/4617 lr=9.24e-05 loss=1.6597 elapsed=179.8s


ep 13/15 batch 400/4617 lr=9.11e-05 loss=1.6310 elapsed=239.6s


ep 13/15 batch 500/4617 lr=8.98e-05 loss=1.6336 elapsed=299.3s


ep 13/15 batch 600/4617 lr=8.86e-05 loss=1.6566 elapsed=359.3s


ep 13/15 batch 700/4617 lr=8.73e-05 loss=1.6766 elapsed=419.1s


ep 13/15 batch 800/4617 lr=8.60e-05 loss=1.6708 elapsed=478.7s


ep 13/15 batch 900/4617 lr=8.47e-05 loss=1.6667 elapsed=538.4s


ep 13/15 batch 1000/4617 lr=8.35e-05 loss=1.6535 elapsed=598.2s


ep 13/15 batch 1100/4617 lr=8.23e-05 loss=1.6483 elapsed=658.0s


ep 13/15 batch 1200/4617 lr=8.10e-05 loss=1.6545 elapsed=717.8s


ep 13/15 batch 1300/4617 lr=7.98e-05 loss=1.6382 elapsed=777.6s


ep 13/15 batch 1400/4617 lr=7.86e-05 loss=1.6338 elapsed=837.5s


ep 13/15 batch 1500/4617 lr=7.74e-05 loss=1.6408 elapsed=897.4s


ep 13/15 batch 1600/4617 lr=7.62e-05 loss=1.6364 elapsed=957.1s


ep 13/15 batch 1700/4617 lr=7.50e-05 loss=1.6325 elapsed=1016.8s


ep 13/15 batch 1800/4617 lr=7.38e-05 loss=1.6360 elapsed=1076.5s


ep 13/15 batch 1900/4617 lr=7.26e-05 loss=1.6265 elapsed=1136.4s


ep 13/15 batch 2000/4617 lr=7.15e-05 loss=1.6201 elapsed=1196.2s


ep 13/15 batch 2100/4617 lr=7.03e-05 loss=1.6210 elapsed=1256.1s


ep 13/15 batch 2200/4617 lr=6.92e-05 loss=1.6185 elapsed=1315.9s


ep 13/15 batch 2300/4617 lr=6.80e-05 loss=1.6198 elapsed=1375.7s


ep 13/15 batch 2400/4617 lr=6.69e-05 loss=1.6223 elapsed=1435.6s


ep 13/15 batch 2500/4617 lr=6.58e-05 loss=1.6201 elapsed=1495.3s


ep 13/15 batch 2600/4617 lr=6.47e-05 loss=1.6191 elapsed=1555.3s


ep 13/15 batch 2700/4617 lr=6.36e-05 loss=1.6111 elapsed=1615.2s


ep 13/15 batch 2800/4617 lr=6.25e-05 loss=1.6051 elapsed=1675.1s


ep 13/15 batch 2900/4617 lr=6.14e-05 loss=1.6044 elapsed=1734.9s


ep 13/15 batch 3000/4617 lr=6.03e-05 loss=1.6032 elapsed=1794.7s


ep 13/15 batch 3100/4617 lr=5.92e-05 loss=1.6009 elapsed=1854.6s


ep 13/15 batch 3200/4617 lr=5.82e-05 loss=1.5964 elapsed=1914.4s


ep 13/15 batch 3300/4617 lr=5.71e-05 loss=1.5942 elapsed=1974.2s


ep 13/15 batch 3400/4617 lr=5.61e-05 loss=1.5961 elapsed=2034.0s


ep 13/15 batch 3500/4617 lr=5.51e-05 loss=1.5969 elapsed=2093.8s


ep 13/15 batch 3600/4617 lr=5.40e-05 loss=1.5975 elapsed=2153.6s


ep 13/15 batch 3700/4617 lr=5.30e-05 loss=1.5935 elapsed=2213.5s


ep 13/15 batch 3800/4617 lr=5.20e-05 loss=1.5864 elapsed=2273.1s


ep 13/15 batch 3900/4617 lr=5.10e-05 loss=1.5808 elapsed=2332.9s


ep 13/15 batch 4000/4617 lr=5.00e-05 loss=1.5801 elapsed=2392.8s


ep 13/15 batch 4100/4617 lr=4.91e-05 loss=1.5758 elapsed=2452.7s


ep 13/15 batch 4200/4617 lr=4.81e-05 loss=1.5749 elapsed=2512.3s


ep 13/15 batch 4300/4617 lr=4.72e-05 loss=1.5748 elapsed=2572.0s


ep 13/15 batch 4400/4617 lr=4.62e-05 loss=1.5722 elapsed=2631.8s


ep 13/15 batch 4500/4617 lr=4.53e-05 loss=1.5715 elapsed=2691.6s


ep 13/15 batch 4600/4617 lr=4.43e-05 loss=1.5701 elapsed=2751.6s


Epoch 13 train_loss=1.5698 epoch_time=2761.5s total_elapsed=607.2m


  Eval batch 50/238


  Eval batch 100/238


  Eval batch 150/238


  Eval batch 200/238


Eval done in 46.7s; acc=0.8381; loss=0.5909


Saved new best (EMA) acc=0.8381


ep 14/15 batch 100/4617 lr=4.33e-05 loss=1.3857 elapsed=59.9s


ep 14/15 batch 200/4617 lr=4.24e-05 loss=1.4248 elapsed=119.9s


ep 14/15 batch 300/4617 lr=4.15e-05 loss=1.4621 elapsed=179.7s


ep 14/15 batch 400/4617 lr=4.06e-05 loss=1.4587 elapsed=239.4s


ep 14/15 batch 500/4617 lr=3.97e-05 loss=1.4686 elapsed=299.1s


ep 14/15 batch 600/4617 lr=3.88e-05 loss=1.4783 elapsed=359.0s


ep 14/15 batch 700/4617 lr=3.80e-05 loss=1.4791 elapsed=418.8s


ep 14/15 batch 800/4617 lr=3.71e-05 loss=1.4587 elapsed=478.5s


ep 14/15 batch 900/4617 lr=3.63e-05 loss=1.4695 elapsed=538.2s


ep 14/15 batch 1000/4617 lr=3.54e-05 loss=1.4854 elapsed=598.0s


ep 14/15 batch 1100/4617 lr=3.46e-05 loss=1.4811 elapsed=657.9s


ep 14/15 batch 1200/4617 lr=3.38e-05 loss=1.4862 elapsed=717.7s


ep 14/15 batch 1300/4617 lr=3.30e-05 loss=1.4837 elapsed=777.3s


ep 14/15 batch 1400/4617 lr=3.22e-05 loss=1.4966 elapsed=837.2s


ep 14/15 batch 1500/4617 lr=3.14e-05 loss=1.4940 elapsed=897.0s


ep 14/15 batch 1600/4617 lr=3.07e-05 loss=1.4876 elapsed=956.8s


ep 14/15 batch 1700/4617 lr=2.99e-05 loss=1.4857 elapsed=1016.6s


ep 14/15 batch 1800/4617 lr=2.91e-05 loss=1.4933 elapsed=1076.4s


ep 14/15 batch 1900/4617 lr=2.84e-05 loss=1.4995 elapsed=1136.3s


ep 14/15 batch 2000/4617 lr=2.77e-05 loss=1.4988 elapsed=1196.3s


ep 14/15 batch 2100/4617 lr=2.69e-05 loss=1.4980 elapsed=1256.0s


ep 14/15 batch 2200/4617 lr=2.62e-05 loss=1.5024 elapsed=1315.8s


ep 14/15 batch 2300/4617 lr=2.55e-05 loss=1.4963 elapsed=1375.6s


ep 14/15 batch 2400/4617 lr=2.48e-05 loss=1.4966 elapsed=1435.4s


ep 14/15 batch 2500/4617 lr=2.41e-05 loss=1.4999 elapsed=1495.0s


ep 14/15 batch 2600/4617 lr=2.34e-05 loss=1.5061 elapsed=1554.8s


ep 14/15 batch 2700/4617 lr=2.28e-05 loss=1.5077 elapsed=1614.6s


ep 14/15 batch 2800/4617 lr=2.21e-05 loss=1.5051 elapsed=1674.3s


ep 14/15 batch 2900/4617 lr=2.15e-05 loss=1.5082 elapsed=1734.1s


ep 14/15 batch 3000/4617 lr=2.08e-05 loss=1.5117 elapsed=1793.9s


ep 14/15 batch 3100/4617 lr=2.02e-05 loss=1.5160 elapsed=1853.7s


ep 14/15 batch 3200/4617 lr=1.96e-05 loss=1.5195 elapsed=1913.5s


ep 14/15 batch 3300/4617 lr=1.90e-05 loss=1.5175 elapsed=1973.4s


ep 14/15 batch 3400/4617 lr=1.84e-05 loss=1.5157 elapsed=2033.2s


ep 14/15 batch 3500/4617 lr=1.78e-05 loss=1.5154 elapsed=2093.1s


ep 14/15 batch 3600/4617 lr=1.72e-05 loss=1.5168 elapsed=2153.0s


ep 14/15 batch 3700/4617 lr=1.67e-05 loss=1.5182 elapsed=2212.8s


ep 14/15 batch 3800/4617 lr=1.61e-05 loss=1.5181 elapsed=2272.5s


ep 14/15 batch 3900/4617 lr=1.56e-05 loss=1.5168 elapsed=2332.1s


ep 14/15 batch 4000/4617 lr=1.50e-05 loss=1.5153 elapsed=2392.1s


ep 14/15 batch 4100/4617 lr=1.45e-05 loss=1.5123 elapsed=2452.1s


ep 14/15 batch 4200/4617 lr=1.40e-05 loss=1.5120 elapsed=2511.9s


ep 14/15 batch 4300/4617 lr=1.35e-05 loss=1.5113 elapsed=2571.6s


ep 14/15 batch 4400/4617 lr=1.30e-05 loss=1.5089 elapsed=2631.4s


ep 14/15 batch 4500/4617 lr=1.25e-05 loss=1.5078 elapsed=2691.2s


ep 14/15 batch 4600/4617 lr=1.20e-05 loss=1.5092 elapsed=2751.1s


Epoch 14 train_loss=1.5080 epoch_time=2761.0s total_elapsed=654.0m


  Eval batch 50/238


  Eval batch 100/238


  Eval batch 150/238


  Eval batch 200/238


Eval done in 46.6s; acc=0.8404; loss=0.5881


Saved new best (EMA) acc=0.8404


ep 15/15 batch 100/4617 lr=1.14e-05 loss=1.7300 elapsed=60.1s


ep 15/15 batch 200/4617 lr=1.10e-05 loss=1.5469 elapsed=120.0s


ep 15/15 batch 300/4617 lr=1.05e-05 loss=1.4784 elapsed=179.9s


ep 15/15 batch 400/4617 lr=1.01e-05 loss=1.4614 elapsed=239.7s


ep 15/15 batch 500/4617 lr=9.69e-06 loss=1.4615 elapsed=299.5s


ep 15/15 batch 600/4617 lr=9.27e-06 loss=1.4663 elapsed=359.4s


ep 15/15 batch 700/4617 lr=8.86e-06 loss=1.4647 elapsed=419.3s


ep 15/15 batch 800/4617 lr=8.47e-06 loss=1.4694 elapsed=479.2s


ep 15/15 batch 900/4617 lr=8.08e-06 loss=1.4713 elapsed=539.1s


ep 15/15 batch 1000/4617 lr=7.71e-06 loss=1.4651 elapsed=598.9s


ep 15/15 batch 1100/4617 lr=7.34e-06 loss=1.4780 elapsed=658.5s


ep 15/15 batch 1200/4617 lr=6.99e-06 loss=1.4884 elapsed=718.3s


ep 15/15 batch 1300/4617 lr=6.64e-06 loss=1.4991 elapsed=778.2s


ep 15/15 batch 1400/4617 lr=6.31e-06 loss=1.4941 elapsed=837.9s


ep 15/15 batch 1500/4617 lr=5.98e-06 loss=1.4822 elapsed=897.7s


ep 15/15 batch 1600/4617 lr=5.67e-06 loss=1.4771 elapsed=957.4s


ep 15/15 batch 1700/4617 lr=5.37e-06 loss=1.4613 elapsed=1017.2s


ep 15/15 batch 1800/4617 lr=5.07e-06 loss=1.4541 elapsed=1077.0s


ep 15/15 batch 1900/4617 lr=4.79e-06 loss=1.4540 elapsed=1136.7s


ep 15/15 batch 2000/4617 lr=4.52e-06 loss=1.4517 elapsed=1196.7s


ep 15/15 batch 2100/4617 lr=4.25e-06 loss=1.4586 elapsed=1256.5s


ep 15/15 batch 2200/4617 lr=4.00e-06 loss=1.4542 elapsed=1316.3s


ep 15/15 batch 2300/4617 lr=3.76e-06 loss=1.4606 elapsed=1376.1s


ep 15/15 batch 2400/4617 lr=3.52e-06 loss=1.4622 elapsed=1435.9s


ep 15/15 batch 2500/4617 lr=3.30e-06 loss=1.4588 elapsed=1495.5s


ep 15/15 batch 2600/4617 lr=3.09e-06 loss=1.4569 elapsed=1555.3s


ep 15/15 batch 2700/4617 lr=2.89e-06 loss=1.4558 elapsed=1615.2s


ep 15/15 batch 2800/4617 lr=2.70e-06 loss=1.4583 elapsed=1675.1s


ep 15/15 batch 2900/4617 lr=2.51e-06 loss=1.4560 elapsed=1735.0s


ep 15/15 batch 3000/4617 lr=2.34e-06 loss=1.4551 elapsed=1794.9s


ep 15/15 batch 3100/4617 lr=2.18e-06 loss=1.4545 elapsed=1854.8s


ep 15/15 batch 3200/4617 lr=2.03e-06 loss=1.4532 elapsed=1914.7s


ep 15/15 batch 3300/4617 lr=1.89e-06 loss=1.4541 elapsed=1974.6s


ep 15/15 batch 3400/4617 lr=1.76e-06 loss=1.4531 elapsed=2034.5s


ep 15/15 batch 3500/4617 lr=1.64e-06 loss=1.4533 elapsed=2094.3s


ep 15/15 batch 3600/4617 lr=1.53e-06 loss=1.4545 elapsed=2154.0s


ep 15/15 batch 3700/4617 lr=1.43e-06 loss=1.4562 elapsed=2213.8s


ep 15/15 batch 3800/4617 lr=1.34e-06 loss=1.4584 elapsed=2273.6s


ep 15/15 batch 3900/4617 lr=1.26e-06 loss=1.4565 elapsed=2333.5s


ep 15/15 batch 4000/4617 lr=1.20e-06 loss=1.4532 elapsed=2393.3s


ep 15/15 batch 4100/4617 lr=1.14e-06 loss=1.4541 elapsed=2453.1s


ep 15/15 batch 4200/4617 lr=1.09e-06 loss=1.4515 elapsed=2512.9s


ep 15/15 batch 4300/4617 lr=1.05e-06 loss=1.4521 elapsed=2572.8s


ep 15/15 batch 4400/4617 lr=1.02e-06 loss=1.4505 elapsed=2632.6s


ep 15/15 batch 4500/4617 lr=1.01e-06 loss=1.4504 elapsed=2692.3s


ep 15/15 batch 4600/4617 lr=1.00e-06 loss=1.4494 elapsed=2752.1s


Epoch 15 train_loss=1.4497 epoch_time=2762.1s total_elapsed=700.9m


  Eval batch 50/238


  Eval batch 100/238


  Eval batch 150/238


  Eval batch 200/238


Eval done in 46.5s; acc=0.8401; loss=0.5888
Training finished. Best holdout acc: 0.8404208680403332
Next: add TTA inference on test and write submission.


In [None]:
# Inference & Submission: TTA on test, write submission.csv
import json, time, math, torch, gc
from pathlib import Path
from PIL import Image
import torchvision.transforms as T
import torchvision.transforms.functional as TF
from torch.utils.data import Dataset, DataLoader
import timm

torch.backends.cudnn.benchmark = True
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
try:
    torch.set_float32_matmul_precision('high')
except Exception:
    pass

# Clean GPU memory before inference
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Load mappings
with open('artifacts/idx2catid.json','r') as f:
    idx2catid = {int(k): int(v) for k,v in json.load(f).items()}

# Build test records (filter missing)
def build_test_records(j):
    recs = []
    miss = 0
    for img in j['images']:
        img_id = img['id']
        fn = img['file_name']
        fp = Path(fn)
        if not fp.exists():
            alt = Path(fp.name)
            if alt.exists():
                fn = str(alt)
            else:
                miss += 1
                continue
        recs.append((img_id, fn))
    if miss:
        print(f"[WARN] Dropped {miss} missing test files")
    return recs

test_recs = build_test_records(test_json)
print(f"Test records: {len(test_recs)}")

class TestDS(Dataset):
    def __init__(self, records, tfm):
        self.records = records
        self.tfm = tfm
    def __len__(self): return len(self.records)
    def __getitem__(self, i):
        img_id, fn = self.records[i]
        img = Image.open(fn).convert('RGB')
        x = self.tfm(img)
        return x, int(img_id)

def make_eval_tfms(sz):
    return T.Compose([
        T.Resize(int(sz/0.875), interpolation=T.InterpolationMode.BICUBIC),
        T.CenterCrop(sz),
        T.ToTensor(),
        T.Normalize((0.485,0.456,0.406),(0.229,0.224,0.225)),
    ])

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
NUM_CLASSES = len(idx2catid)

# Select best available checkpoint (prioritize averaged checkpoints)
ckpt_candidates = [
    Path('avg_ema_last_448.pt'),
    Path('avg_ema_last.pt'),
    Path('best_convnext_baseline_cont.pt'),
    Path('best_convnext_baseline.pt'),
]
ckpt_path = next((p for p in ckpt_candidates if p.exists()), None)
assert ckpt_path is not None, 'No checkpoint found. Train first.'
print('Using checkpoint:', ckpt_path)

# Use the exact same architecture variant as training
model = timm.create_model('convnext_base.fb_in22k_ft_in1k', pretrained=False, num_classes=NUM_CLASSES)
sd = torch.load(ckpt_path, map_location='cpu')
state = sd['model'] if isinstance(sd, dict) and 'model' in sd else sd
missing, unexpected = model.load_state_dict(state, strict=False)
if missing or unexpected:
    print('[WARN] load_state_dict mismatch -> missing:', len(missing), 'unexpected:', len(unexpected))
model.to(DEVICE).eval()
model = model.to(memory_format=torch.channels_last)

# TTA settings per plan: multi-scale with hflip
scales = [384, 416]
use_hflip = True
bs = 32  # OOM-safe on A10 for 384/416

all_ids = [rid for rid,_ in test_recs]
all_logits = torch.zeros((len(test_recs), NUM_CLASSES), dtype=torch.float32, device='cpu')

@torch.no_grad()
def infer_one_scale(sz, flip=False):
    tfm = make_eval_tfms(sz)
    ds = TestDS(test_recs, tfm)
    dl = DataLoader(ds, batch_size=bs, shuffle=False, num_workers=8, pin_memory=True, persistent_workers=True, prefetch_factor=4)
    idx = 0
    t0 = time.time()
    for b, (x, ids) in enumerate(dl):
        x = x.to(DEVICE, non_blocking=True).to(memory_format=torch.channels_last)
        if flip:
            x = TF.hflip(x)
        with torch.autocast(device_type='cuda', dtype=torch.float16, enabled=DEVICE.type=='cuda'):
            logits = model(x)
        n = logits.shape[0]
        all_logits[idx:idx+n] += logits.detach().to('cpu')
        idx += n
        if (b+1)%100==0:
            print(f"  INF sz={sz} flip={flip} batch {b+1}/{len(dl)}", flush=True)
    print(f"  Done sz={sz} flip={flip} in {time.time()-t0:.1f}s")

t_start = time.time()
print('Inference scales:', scales, 'hflip:', use_hflip, 'bs:', bs)
for sz in scales:
    infer_one_scale(sz, flip=False)
    if use_hflip:
        infer_one_scale(sz, flip=True)
num_passes = len(scales) * (2 if use_hflip else 1)
print(f"Inference complete in {(time.time()-t_start)/60:.1f}m; passes={num_passes}")

# Average logits by number of passes
all_logits /= max(1, num_passes)
pred_idx = all_logits.argmax(dim=1).tolist()

# Map to original category_id and write submission
id_to_pred = {}
for i, (img_id, _) in enumerate(test_recs):
    id_to_pred[img_id] = idx2catid[int(pred_idx[i])]

# Ensure all test ids present; if any missing due to file issues, fill with most frequent train class id (idx 0 as fallback)
all_test_ids = [img['id'] for img in test_json['images']]
fallback_cid = idx2catid[0]
rows = []
for img_id in all_test_ids:
    cid = id_to_pred.get(img_id, fallback_cid)
    rows.append((img_id, cid))
rows.sort(key=lambda x: x[0])

with open('submission.csv','w') as f:
    f.write('id,predicted\n')
    for img_id, cid in rows:
        f.write(f"{img_id},{cid}\n")
print('Wrote submission.csv with', len(rows), 'rows')
print('Head:')
print('\n'.join(open('submission.csv').read().splitlines()[:5]))

Test records: 32214
Using checkpoint: avg_ema_last.pt


Inference scales: [384, 416] hflip: True bs: 32


  sd = torch.load(ckpt_path, map_location='cpu')


In [15]:
# Continuation: extend training to 25 epochs and prep averaged EMA weights
import os, time, json, math, torch, glob, gc
from pathlib import Path
from torch import nn
from torch.utils.data import DataLoader, WeightedRandomSampler
import torchvision.transforms as T
import timm
from timm.data.mixup import Mixup
from timm.loss import SoftTargetCrossEntropy
from timm.scheduler.cosine_lr import CosineLRScheduler
from timm.utils import ModelEmaV2

# Enable TF32 for throughput on A10
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
try:
    torch.set_float32_matmul_precision('high')
except Exception:
    pass

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
num_classes = len(catid2idx)

def build_loaders_for_continuation(im_size=384, batch_size=48, workers=12, erase_p=0.25):
    train_tf = T.Compose([
        T.RandomResizedCrop(im_size, scale=(0.4,1.0), ratio=(0.75,1.33)),
        T.RandomHorizontalFlip(p=0.5),
        T.TrivialAugmentWide(num_magnitude_bins=31),
        T.ToTensor(),
        T.RandomErasing(p=erase_p, value='random'),
        T.Normalize((0.485,0.456,0.406),(0.229,0.224,0.225)),
    ])
    val_tf = T.Compose([
        T.Resize(int(im_size/0.875), interpolation=T.InterpolationMode.BICUBIC),
        T.CenterCrop(im_size),
        T.ToTensor(),
        T.Normalize((0.485,0.456,0.406),(0.229,0.224,0.225)),
    ])
    train_ds = INatDatasetSimple(train_split, train_tf)
    val_ds = INatDatasetSimple(valid_split, val_tf)
    from collections import Counter
    cls_counts = Counter([y for _,_,y in train_split])
    weights = [1.0/np.sqrt(cls_counts[y]) for _,_,y in train_split]
    sampler = WeightedRandomSampler(weights, num_samples=len(train_split), replacement=True)
    train_dl = DataLoader(train_ds, batch_size=batch_size, sampler=sampler, shuffle=False, num_workers=workers, pin_memory=True, persistent_workers=True, prefetch_factor=4)
    val_dl = DataLoader(val_ds, batch_size=batch_size, shuffle=False, num_workers=workers, pin_memory=True, persistent_workers=True, prefetch_factor=4)
    return train_dl, val_dl

def evaluate_model(m, dl):
    m.eval(); correct=0; total=0; loss_sum=0.0
    ce = nn.CrossEntropyLoss()
    t0=time.time()
    with torch.no_grad():
        for bx,(x,y,ids) in enumerate(dl):
            x=x.to(device, non_blocking=True).to(memory_format=torch.channels_last); y=y.to(device, non_blocking=True)
            with torch.autocast(device_type='cuda', dtype=torch.float16, enabled=device.type=='cuda'):
                logits=m(x); loss=ce(logits,y)
            loss_sum+=loss.item()*y.size(0); pred=logits.argmax(1); correct+=(pred==y).sum().item(); total+=y.size(0)
    acc=correct/max(1,total); vloss=loss_sum/max(1,total)
    print(f"[CONT] Eval acc={acc:.4f} loss={vloss:.4f} time={time.time()-t0:.1f}s")
    return acc, vloss

def average_state_dicts(paths):
    assert paths, 'No checkpoint paths for averaging'
    avg=None; n=0
    for p in paths:
        sd=torch.load(p, map_location='cpu')
        state=sd['model'] if isinstance(sd, dict) and 'model' in sd else sd
        if avg is None:
            avg={k: v.clone().float() for k,v in state.items()}
        else:
            for k in avg.keys():
                avg[k] += state[k].float()
        n+=1
    for k in avg.keys():
        avg[k] /= n
    return avg

def free_cuda_memory():
    for name in ('model','ema','optimizer','scaler','train_dl','valid_dl','train_ds','valid_ds'):
        if name in globals():
            try: del globals()[name]
            except: pass
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

def continue_training(extra_epochs=10, peak_lr=1e-3, ema_decay=0.9999, warmup_updates=300, save_last_n=8, batch_size=40, workers=12):
    free_cuda_memory()
    print(f"[CONT] Loading best checkpoint and continuing for {extra_epochs} epochs")
    ckpt = torch.load('best_convnext_baseline.pt', map_location='cpu')
    model = timm.create_model('convnext_base.fb_in22k_ft_in1k', pretrained=False, num_classes=num_classes)
    model.load_state_dict(ckpt['model'], strict=True)
    model.to(device).train()
    model = model.to(memory_format=torch.channels_last)
    ema = ModelEmaV2(model, decay=ema_decay, device=device if device.type=='cuda' else None)
    # Initialize EMA with current weights
    for ema_v, model_v in zip(ema.module.state_dict().values(), model.state_dict().values()):
        ema_v.copy_(model_v)

    # Build loaders (RandomErasing on initially; will disable in final epoch below)
    train_dl, val_dl = build_loaders_for_continuation(im_size=384, batch_size=batch_size, workers=workers, erase_p=0.25)

    optimizer = torch.optim.AdamW(model.parameters(), lr=peak_lr, weight_decay=0.05, betas=(0.9,0.999), fused=(device.type=='cuda'))
    steps_per_epoch = len(train_dl)
    total_updates = extra_epochs * steps_per_epoch
    sched = CosineLRScheduler(optimizer, t_initial=total_updates, lr_min=1e-6, warmup_t=warmup_updates, warmup_lr_init=1e-6, k_decay=1.0, t_in_epochs=False)
    scaler = torch.amp.GradScaler('cuda', enabled=(device.type=='cuda'))

    mixup_fn = Mixup(mixup_alpha=0.3, cutmix_alpha=0.5, prob=1.0, switch_prob=0.5, label_smoothing=0.0, num_classes=num_classes)
    criterion_soft = SoftTargetCrossEntropy()
    criterion_hard = nn.CrossEntropyLoss(label_smoothing=0.1)

    best_acc = -1.0
    num_updates = 0
    last_ckpts = []
    t_start = time.time()
    for ep in range(extra_epochs):
        is_last = (ep == extra_epochs-1)
        # Taper mixup: last 3 epochs prob=0.5; final epoch off
        if is_last:
            mixup_fn.prob = 0.0
        elif ep >= extra_epochs - 3:
            mixup_fn.prob = 0.5
        else:
            mixup_fn.prob = 1.0
        # Disable RandomErasing in final epoch by rebuilding train loader with erase_p=0
        if is_last:
            train_dl, _ = build_loaders_for_continuation(im_size=384, batch_size=batch_size, workers=workers, erase_p=0.0)
        model.train(); t0 = time.time(); running=0.0; n_seen=0
        for bi, (x,y,ids) in enumerate(train_dl):
            x = x.to(device, non_blocking=True).to(memory_format=torch.channels_last)
            y = y.to(device, non_blocking=True)
            optimizer.zero_grad(set_to_none=True)
            if mixup_fn.prob > 0:
                x, y_soft = mixup_fn(x, y)
                with torch.autocast(device_type='cuda', dtype=torch.float16, enabled=device.type=='cuda'):
                    logits = model(x)
                    loss = criterion_soft(logits, y_soft)
            else:
                with torch.autocast(device_type='cuda', dtype=torch.float16, enabled=device.type=='cuda'):
                    logits = model(x)
                    loss = criterion_hard(logits, y)
            scaler.scale(loss).backward()
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            scaler.step(optimizer); scaler.update()
            num_updates += 1; sched.step_update(num_updates)
            if num_updates > warmup_updates:
                ema.update(model)
            running += loss.item()*x.size(0); n_seen += x.size(0)
            if (bi+1)%100==0:
                cur_lr = optimizer.param_groups[0]['lr']
                print(f"[CONT] ep {ep+1}/{extra_epochs} batch {bi+1}/{len(train_dl)} lr={cur_lr:.2e} loss={running/max(1,n_seen):.4f} elapsed={time.time()-t0:.1f}s", flush=True)
        print(f"[CONT] Epoch {ep+1} train_loss={running/max(1,n_seen):.4f} time={time.time()-t0:.1f}s total={(time.time()-t_start)/60:.1f}m")
        acc, vloss = evaluate_model(ema.module, val_dl)
        # Save rolling last-N EMA checkpoints
        outp = f"ema_cont_ep{ep+1}.pt"
        torch.save({'model': ema.module.state_dict(), 'acc': acc}, outp)
        last_ckpts.append(outp)
        if len(last_ckpts) > save_last_n:
            rm = last_ckpts.pop(0)
            try: os.remove(rm)
            except: pass
        if acc > best_acc:
            best_acc = acc
            torch.save({'model': ema.module.state_dict(), 'acc': acc}, 'best_convnext_baseline_cont.pt')
            print(f"[CONT] New best acc={acc:.4f}")
    print('[CONT] Done. Best acc:', best_acc)
    print('[CONT] Averaging last', len(last_ckpts), 'EMA checkpoints...')
    avg_sd = average_state_dicts(last_ckpts) if last_ckpts else ema.module.state_dict()
    torch.save({'model': avg_sd, 'acc': best_acc}, 'avg_ema_last.pt')
    print('[CONT] Saved avg_ema_last.pt')

print('Continuation cell ready. After the initial 15-epoch run finishes, execute this cell to extend to 25 epochs and produce avg_ema_last.pt for TTA.')

Continuation cell ready. After the initial 15-epoch run finishes, execute this cell to extend to 25 epochs and produce avg_ema_last.pt for TTA.


In [17]:
# Execute continuation (extend training) after initial 15-epoch run completes
# This will load best_convnext_baseline.pt and run extra epochs with EMA=0.9999,
# taper mixup, final-epoch CE+LS and RandomErasing off, and save/average last-N EMA checkpoints.
try:
    import os
    os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
    # Adjust extra_epochs if needed to hit ~22-25 total
    # Reduce batch_size and workers to avoid CUDA OOM
    continue_training(extra_epochs=10, peak_lr=1e-3, ema_decay=0.9999, warmup_updates=300, save_last_n=8, batch_size=32, workers=8)
except NameError:
    print("Continuation functions not defined yet. Run cell 7 first.")

[CONT] Loading best checkpoint and continuing for 10 epochs


  ckpt = torch.load('best_convnext_baseline.pt', map_location='cpu')


[CONT] ep 1/10 batch 100/6925 lr=3.34e-04 loss=1.3814 elapsed=49.4s


[CONT] ep 1/10 batch 200/6925 lr=6.67e-04 loss=1.5789 elapsed=87.9s


[CONT] ep 1/10 batch 300/6925 lr=1.00e-03 loss=1.9061 elapsed=126.9s


[CONT] ep 1/10 batch 400/6925 lr=1.00e-03 loss=2.1384 elapsed=167.2s


[CONT] ep 1/10 batch 500/6925 lr=1.00e-03 loss=2.2669 elapsed=207.8s


[CONT] ep 1/10 batch 600/6925 lr=1.00e-03 loss=2.3607 elapsed=248.6s


[CONT] ep 1/10 batch 700/6925 lr=1.00e-03 loss=2.4514 elapsed=289.5s


[CONT] ep 1/10 batch 800/6925 lr=1.00e-03 loss=2.4873 elapsed=330.4s


[CONT] ep 1/10 batch 900/6925 lr=1.00e-03 loss=2.5442 elapsed=371.6s


[CONT] ep 1/10 batch 1000/6925 lr=9.99e-04 loss=2.6054 elapsed=412.8s


[CONT] ep 1/10 batch 1100/6925 lr=9.99e-04 loss=2.6521 elapsed=454.0s


[CONT] ep 1/10 batch 1200/6925 lr=9.99e-04 loss=2.6917 elapsed=495.2s


[CONT] ep 1/10 batch 1300/6925 lr=9.99e-04 loss=2.7237 elapsed=536.4s


[CONT] ep 1/10 batch 1400/6925 lr=9.99e-04 loss=2.7437 elapsed=577.6s


[CONT] ep 1/10 batch 1500/6925 lr=9.99e-04 loss=2.7700 elapsed=618.7s


[CONT] ep 1/10 batch 1600/6925 lr=9.99e-04 loss=2.7867 elapsed=659.7s


[CONT] ep 1/10 batch 1700/6925 lr=9.99e-04 loss=2.8024 elapsed=701.0s


[CONT] ep 1/10 batch 1800/6925 lr=9.98e-04 loss=2.8239 elapsed=742.3s


[CONT] ep 1/10 batch 1900/6925 lr=9.98e-04 loss=2.8411 elapsed=783.6s


[CONT] ep 1/10 batch 2000/6925 lr=9.98e-04 loss=2.8500 elapsed=824.9s


[CONT] ep 1/10 batch 2100/6925 lr=9.98e-04 loss=2.8680 elapsed=866.1s


[CONT] ep 1/10 batch 2200/6925 lr=9.98e-04 loss=2.8700 elapsed=907.3s


[CONT] ep 1/10 batch 2300/6925 lr=9.97e-04 loss=2.8813 elapsed=948.6s


[CONT] ep 1/10 batch 2400/6925 lr=9.97e-04 loss=2.8871 elapsed=989.8s


[CONT] ep 1/10 batch 2500/6925 lr=9.97e-04 loss=2.8988 elapsed=1031.1s


[CONT] ep 1/10 batch 2600/6925 lr=9.97e-04 loss=2.9083 elapsed=1072.3s


[CONT] ep 1/10 batch 2700/6925 lr=9.96e-04 loss=2.9199 elapsed=1113.5s


[CONT] ep 1/10 batch 2800/6925 lr=9.96e-04 loss=2.9281 elapsed=1154.7s


[CONT] ep 1/10 batch 2900/6925 lr=9.96e-04 loss=2.9299 elapsed=1195.9s


[CONT] ep 1/10 batch 3000/6925 lr=9.95e-04 loss=2.9388 elapsed=1237.2s


[CONT] ep 1/10 batch 3100/6925 lr=9.95e-04 loss=2.9513 elapsed=1278.5s


[CONT] ep 1/10 batch 3200/6925 lr=9.95e-04 loss=2.9532 elapsed=1319.8s


[CONT] ep 1/10 batch 3300/6925 lr=9.94e-04 loss=2.9570 elapsed=1361.0s


[CONT] ep 1/10 batch 3400/6925 lr=9.94e-04 loss=2.9587 elapsed=1402.3s


[CONT] ep 1/10 batch 3500/6925 lr=9.94e-04 loss=2.9645 elapsed=1443.5s


[CONT] ep 1/10 batch 3600/6925 lr=9.93e-04 loss=2.9668 elapsed=1484.8s


[CONT] ep 1/10 batch 3700/6925 lr=9.93e-04 loss=2.9640 elapsed=1526.0s


[CONT] ep 1/10 batch 3800/6925 lr=9.93e-04 loss=2.9687 elapsed=1567.3s


[CONT] ep 1/10 batch 3900/6925 lr=9.92e-04 loss=2.9689 elapsed=1608.5s


[CONT] ep 1/10 batch 4000/6925 lr=9.92e-04 loss=2.9707 elapsed=1649.8s


[CONT] ep 1/10 batch 4100/6925 lr=9.91e-04 loss=2.9741 elapsed=1690.9s


[CONT] ep 1/10 batch 4200/6925 lr=9.91e-04 loss=2.9760 elapsed=1732.1s


[CONT] ep 1/10 batch 4300/6925 lr=9.91e-04 loss=2.9782 elapsed=1773.2s


[CONT] ep 1/10 batch 4400/6925 lr=9.90e-04 loss=2.9820 elapsed=1814.4s


[CONT] ep 1/10 batch 4500/6925 lr=9.90e-04 loss=2.9855 elapsed=1855.7s


[CONT] ep 1/10 batch 4600/6925 lr=9.89e-04 loss=2.9885 elapsed=1897.0s


[CONT] ep 1/10 batch 4700/6925 lr=9.89e-04 loss=2.9906 elapsed=1938.3s


[CONT] ep 1/10 batch 4800/6925 lr=9.88e-04 loss=2.9908 elapsed=1979.5s


[CONT] ep 1/10 batch 4900/6925 lr=9.88e-04 loss=2.9893 elapsed=2020.8s


[CONT] ep 1/10 batch 5000/6925 lr=9.87e-04 loss=2.9924 elapsed=2061.9s


[CONT] ep 1/10 batch 5100/6925 lr=9.87e-04 loss=2.9910 elapsed=2103.1s


[CONT] ep 1/10 batch 5200/6925 lr=9.86e-04 loss=2.9934 elapsed=2144.5s


[CONT] ep 1/10 batch 5300/6925 lr=9.86e-04 loss=2.9975 elapsed=2185.7s


[CONT] ep 1/10 batch 5400/6925 lr=9.85e-04 loss=2.9967 elapsed=2226.9s


[CONT] ep 1/10 batch 5500/6925 lr=9.85e-04 loss=2.9999 elapsed=2268.2s


[CONT] ep 1/10 batch 5600/6925 lr=9.84e-04 loss=3.0011 elapsed=2309.5s


[CONT] ep 1/10 batch 5700/6925 lr=9.83e-04 loss=3.0019 elapsed=2350.7s


[CONT] ep 1/10 batch 5800/6925 lr=9.83e-04 loss=3.0018 elapsed=2391.9s


[CONT] ep 1/10 batch 5900/6925 lr=9.82e-04 loss=3.0033 elapsed=2433.0s


[CONT] ep 1/10 batch 6000/6925 lr=9.82e-04 loss=3.0013 elapsed=2474.3s


[CONT] ep 1/10 batch 6100/6925 lr=9.81e-04 loss=3.0005 elapsed=2515.6s


[CONT] ep 1/10 batch 6200/6925 lr=9.80e-04 loss=3.0003 elapsed=2556.9s


[CONT] ep 1/10 batch 6300/6925 lr=9.80e-04 loss=3.0021 elapsed=2598.1s


[CONT] ep 1/10 batch 6400/6925 lr=9.79e-04 loss=3.0001 elapsed=2639.4s


[CONT] ep 1/10 batch 6500/6925 lr=9.78e-04 loss=3.0027 elapsed=2680.5s


[CONT] ep 1/10 batch 6600/6925 lr=9.78e-04 loss=3.0010 elapsed=2721.6s


[CONT] ep 1/10 batch 6700/6925 lr=9.77e-04 loss=3.0008 elapsed=2762.8s


[CONT] ep 1/10 batch 6800/6925 lr=9.76e-04 loss=2.9999 elapsed=2804.1s


[CONT] ep 1/10 batch 6900/6925 lr=9.76e-04 loss=3.0000 elapsed=2845.2s


[CONT] Epoch 1 train_loss=3.0007 time=2855.5s total=47.6m


[CONT] Eval acc=0.8165 loss=0.6478 time=48.2s


[CONT] New best acc=0.8165


[CONT] ep 2/10 batch 100/6925 lr=9.75e-04 loss=2.9620 elapsed=41.2s


[CONT] ep 2/10 batch 200/6925 lr=9.74e-04 loss=2.9464 elapsed=82.2s


[CONT] ep 2/10 batch 300/6925 lr=9.73e-04 loss=2.9751 elapsed=123.3s


[CONT] ep 2/10 batch 400/6925 lr=9.73e-04 loss=3.0058 elapsed=164.5s


[CONT] ep 2/10 batch 500/6925 lr=9.72e-04 loss=2.9713 elapsed=205.7s


[CONT] ep 2/10 batch 600/6925 lr=9.71e-04 loss=2.9647 elapsed=246.8s


[CONT] ep 2/10 batch 700/6925 lr=9.70e-04 loss=2.9542 elapsed=288.0s


[CONT] ep 2/10 batch 800/6925 lr=9.70e-04 loss=2.9525 elapsed=329.2s


[CONT] ep 2/10 batch 900/6925 lr=9.69e-04 loss=2.9797 elapsed=370.4s


[CONT] ep 2/10 batch 1000/6925 lr=9.68e-04 loss=2.9704 elapsed=411.7s


[CONT] ep 2/10 batch 1100/6925 lr=9.67e-04 loss=2.9989 elapsed=453.0s


[CONT] ep 2/10 batch 1200/6925 lr=9.66e-04 loss=2.9952 elapsed=494.3s


[CONT] ep 2/10 batch 1300/6925 lr=9.66e-04 loss=2.9960 elapsed=535.6s


[CONT] ep 2/10 batch 1400/6925 lr=9.65e-04 loss=3.0140 elapsed=576.9s


[CONT] ep 2/10 batch 1500/6925 lr=9.64e-04 loss=3.0174 elapsed=618.1s


[CONT] ep 2/10 batch 1600/6925 lr=9.63e-04 loss=3.0169 elapsed=659.2s


[CONT] ep 2/10 batch 1700/6925 lr=9.62e-04 loss=3.0072 elapsed=700.3s


[CONT] ep 2/10 batch 1800/6925 lr=9.61e-04 loss=3.0106 elapsed=741.5s


[CONT] ep 2/10 batch 1900/6925 lr=9.61e-04 loss=3.0062 elapsed=782.7s


[CONT] ep 2/10 batch 2000/6925 lr=9.60e-04 loss=3.0016 elapsed=823.9s


[CONT] ep 2/10 batch 2100/6925 lr=9.59e-04 loss=2.9966 elapsed=865.0s


[CONT] ep 2/10 batch 2200/6925 lr=9.58e-04 loss=2.9973 elapsed=906.2s


[CONT] ep 2/10 batch 2300/6925 lr=9.57e-04 loss=3.0012 elapsed=947.4s


[CONT] ep 2/10 batch 2400/6925 lr=9.56e-04 loss=2.9998 elapsed=988.6s


[CONT] ep 2/10 batch 2500/6925 lr=9.55e-04 loss=3.0069 elapsed=1029.8s


[CONT] ep 2/10 batch 2600/6925 lr=9.54e-04 loss=3.0077 elapsed=1071.0s


[CONT] ep 2/10 batch 2700/6925 lr=9.53e-04 loss=3.0045 elapsed=1112.3s


[CONT] ep 2/10 batch 2800/6925 lr=9.52e-04 loss=3.0121 elapsed=1153.5s


[CONT] ep 2/10 batch 2900/6925 lr=9.51e-04 loss=3.0098 elapsed=1194.8s


[CONT] ep 2/10 batch 3000/6925 lr=9.50e-04 loss=3.0070 elapsed=1236.0s


[CONT] ep 2/10 batch 3100/6925 lr=9.49e-04 loss=3.0062 elapsed=1277.2s


[CONT] ep 2/10 batch 3200/6925 lr=9.48e-04 loss=3.0072 elapsed=1318.4s


[CONT] ep 2/10 batch 3300/6925 lr=9.47e-04 loss=3.0041 elapsed=1359.5s


[CONT] ep 2/10 batch 3400/6925 lr=9.46e-04 loss=3.0038 elapsed=1400.7s


[CONT] ep 2/10 batch 3500/6925 lr=9.45e-04 loss=3.0025 elapsed=1442.0s


[CONT] ep 2/10 batch 3600/6925 lr=9.44e-04 loss=3.0047 elapsed=1483.2s


[CONT] ep 2/10 batch 3700/6925 lr=9.43e-04 loss=3.0049 elapsed=1524.5s


[CONT] ep 2/10 batch 3800/6925 lr=9.42e-04 loss=3.0048 elapsed=1565.7s


[CONT] ep 2/10 batch 3900/6925 lr=9.41e-04 loss=3.0070 elapsed=1607.0s


[CONT] ep 2/10 batch 4000/6925 lr=9.40e-04 loss=3.0080 elapsed=1648.3s


[CONT] ep 2/10 batch 4100/6925 lr=9.39e-04 loss=3.0071 elapsed=1689.6s


[CONT] ep 2/10 batch 4200/6925 lr=9.38e-04 loss=3.0071 elapsed=1730.8s


[CONT] ep 2/10 batch 4300/6925 lr=9.37e-04 loss=3.0088 elapsed=1772.1s


[CONT] ep 2/10 batch 4400/6925 lr=9.36e-04 loss=3.0102 elapsed=1813.4s


[CONT] ep 2/10 batch 4500/6925 lr=9.34e-04 loss=3.0088 elapsed=1854.5s


[CONT] ep 2/10 batch 4600/6925 lr=9.33e-04 loss=3.0064 elapsed=1895.6s


[CONT] ep 2/10 batch 4700/6925 lr=9.32e-04 loss=3.0036 elapsed=1936.8s


[CONT] ep 2/10 batch 4800/6925 lr=9.31e-04 loss=3.0019 elapsed=1977.9s


[CONT] ep 2/10 batch 4900/6925 lr=9.30e-04 loss=3.0012 elapsed=2019.1s


[CONT] ep 2/10 batch 5000/6925 lr=9.29e-04 loss=3.0030 elapsed=2060.3s


[CONT] ep 2/10 batch 5100/6925 lr=9.27e-04 loss=3.0022 elapsed=2101.5s


[CONT] ep 2/10 batch 5200/6925 lr=9.26e-04 loss=3.0018 elapsed=2142.8s


[CONT] ep 2/10 batch 5300/6925 lr=9.25e-04 loss=3.0020 elapsed=2183.9s


[CONT] ep 2/10 batch 5400/6925 lr=9.24e-04 loss=3.0008 elapsed=2225.0s


[CONT] ep 2/10 batch 5500/6925 lr=9.23e-04 loss=3.0020 elapsed=2266.3s


[CONT] ep 2/10 batch 5600/6925 lr=9.22e-04 loss=3.0035 elapsed=2307.7s


[CONT] ep 2/10 batch 5700/6925 lr=9.20e-04 loss=3.0032 elapsed=2348.9s


[CONT] ep 2/10 batch 5800/6925 lr=9.19e-04 loss=3.0019 elapsed=2390.0s


[CONT] ep 2/10 batch 5900/6925 lr=9.18e-04 loss=3.0023 elapsed=2431.2s


[CONT] ep 2/10 batch 6000/6925 lr=9.17e-04 loss=2.9998 elapsed=2472.5s


[CONT] ep 2/10 batch 6100/6925 lr=9.15e-04 loss=3.0018 elapsed=2513.7s


[CONT] ep 2/10 batch 6200/6925 lr=9.14e-04 loss=2.9988 elapsed=2554.8s


[CONT] ep 2/10 batch 6300/6925 lr=9.13e-04 loss=2.9959 elapsed=2596.0s


[CONT] ep 2/10 batch 6400/6925 lr=9.11e-04 loss=2.9961 elapsed=2637.3s


[CONT] ep 2/10 batch 6500/6925 lr=9.10e-04 loss=2.9936 elapsed=2678.5s


[CONT] ep 2/10 batch 6600/6925 lr=9.09e-04 loss=2.9931 elapsed=2719.6s


[CONT] ep 2/10 batch 6700/6925 lr=9.08e-04 loss=2.9933 elapsed=2760.8s


[CONT] ep 2/10 batch 6800/6925 lr=9.06e-04 loss=2.9927 elapsed=2802.0s


[CONT] ep 2/10 batch 6900/6925 lr=9.05e-04 loss=2.9943 elapsed=2843.4s


[CONT] Epoch 2 train_loss=2.9942 time=2853.7s total=96.0m


[CONT] Eval acc=0.7873 loss=0.7502 time=47.2s


[CONT] ep 3/10 batch 100/6925 lr=9.03e-04 loss=2.9959 elapsed=41.3s


[CONT] ep 3/10 batch 200/6925 lr=9.02e-04 loss=2.9686 elapsed=82.4s


[CONT] ep 3/10 batch 300/6925 lr=9.01e-04 loss=2.9302 elapsed=123.8s


[CONT] ep 3/10 batch 400/6925 lr=8.99e-04 loss=2.9284 elapsed=165.0s


[CONT] ep 3/10 batch 500/6925 lr=8.98e-04 loss=2.9477 elapsed=206.2s


[CONT] ep 3/10 batch 600/6925 lr=8.96e-04 loss=2.9416 elapsed=247.3s


[CONT] ep 3/10 batch 700/6925 lr=8.95e-04 loss=2.9576 elapsed=288.5s


[CONT] ep 3/10 batch 800/6925 lr=8.94e-04 loss=2.9443 elapsed=329.8s


[CONT] ep 3/10 batch 900/6925 lr=8.92e-04 loss=2.9451 elapsed=371.0s


[CONT] ep 3/10 batch 1000/6925 lr=8.91e-04 loss=2.9356 elapsed=412.2s


[CONT] ep 3/10 batch 1100/6925 lr=8.89e-04 loss=2.9333 elapsed=453.5s


[CONT] ep 3/10 batch 1200/6925 lr=8.88e-04 loss=2.9441 elapsed=494.7s


[CONT] ep 3/10 batch 1300/6925 lr=8.87e-04 loss=2.9404 elapsed=535.9s


[CONT] ep 3/10 batch 1400/6925 lr=8.85e-04 loss=2.9355 elapsed=577.2s


[CONT] ep 3/10 batch 1500/6925 lr=8.84e-04 loss=2.9306 elapsed=618.4s


[CONT] ep 3/10 batch 1600/6925 lr=8.82e-04 loss=2.9238 elapsed=659.6s


[CONT] ep 3/10 batch 1700/6925 lr=8.81e-04 loss=2.9124 elapsed=700.9s


[CONT] ep 3/10 batch 1800/6925 lr=8.79e-04 loss=2.8998 elapsed=742.0s


[CONT] ep 3/10 batch 1900/6925 lr=8.78e-04 loss=2.9005 elapsed=783.1s


[CONT] ep 3/10 batch 2000/6925 lr=8.76e-04 loss=2.9023 elapsed=824.3s


[CONT] ep 3/10 batch 2100/6925 lr=8.75e-04 loss=2.9088 elapsed=865.6s


[CONT] ep 3/10 batch 2200/6925 lr=8.73e-04 loss=2.9071 elapsed=906.8s


[CONT] ep 3/10 batch 2300/6925 lr=8.72e-04 loss=2.8974 elapsed=948.0s


[CONT] ep 3/10 batch 2400/6925 lr=8.70e-04 loss=2.8923 elapsed=989.3s


[CONT] ep 3/10 batch 2500/6925 lr=8.69e-04 loss=2.8843 elapsed=1030.4s


[CONT] ep 3/10 batch 2600/6925 lr=8.67e-04 loss=2.8836 elapsed=1071.5s


[CONT] ep 3/10 batch 2700/6925 lr=8.66e-04 loss=2.8865 elapsed=1112.6s


[CONT] ep 3/10 batch 2800/6925 lr=8.64e-04 loss=2.8912 elapsed=1153.8s


[CONT] ep 3/10 batch 2900/6925 lr=8.63e-04 loss=2.8998 elapsed=1195.0s


[CONT] ep 3/10 batch 3000/6925 lr=8.61e-04 loss=2.9032 elapsed=1236.2s


[CONT] ep 3/10 batch 3100/6925 lr=8.59e-04 loss=2.8982 elapsed=1277.5s


[CONT] ep 3/10 batch 3200/6925 lr=8.58e-04 loss=2.8997 elapsed=1318.7s


[CONT] ep 3/10 batch 3300/6925 lr=8.56e-04 loss=2.9012 elapsed=1360.0s


[CONT] ep 3/10 batch 3400/6925 lr=8.55e-04 loss=2.9006 elapsed=1401.2s


[CONT] ep 3/10 batch 3500/6925 lr=8.53e-04 loss=2.9018 elapsed=1442.5s


[CONT] ep 3/10 batch 3600/6925 lr=8.51e-04 loss=2.8988 elapsed=1483.7s


[CONT] ep 3/10 batch 3700/6925 lr=8.50e-04 loss=2.8962 elapsed=1524.9s


[CONT] ep 3/10 batch 3800/6925 lr=8.48e-04 loss=2.8984 elapsed=1566.1s


[CONT] ep 3/10 batch 3900/6925 lr=8.47e-04 loss=2.9023 elapsed=1607.4s


[CONT] ep 3/10 batch 4000/6925 lr=8.45e-04 loss=2.9014 elapsed=1648.7s


[CONT] ep 3/10 batch 4100/6925 lr=8.43e-04 loss=2.9005 elapsed=1689.8s


[CONT] ep 3/10 batch 4200/6925 lr=8.42e-04 loss=2.8984 elapsed=1731.0s


[CONT] ep 3/10 batch 4300/6925 lr=8.40e-04 loss=2.8972 elapsed=1772.2s


[CONT] ep 3/10 batch 4400/6925 lr=8.38e-04 loss=2.8981 elapsed=1813.4s


[CONT] ep 3/10 batch 4500/6925 lr=8.37e-04 loss=2.8995 elapsed=1854.7s


[CONT] ep 3/10 batch 4600/6925 lr=8.35e-04 loss=2.8963 elapsed=1895.9s


[CONT] ep 3/10 batch 4700/6925 lr=8.33e-04 loss=2.8936 elapsed=1937.1s


[CONT] ep 3/10 batch 4800/6925 lr=8.32e-04 loss=2.8940 elapsed=1978.3s


[CONT] ep 3/10 batch 4900/6925 lr=8.30e-04 loss=2.8923 elapsed=2019.5s


[CONT] ep 3/10 batch 5000/6925 lr=8.28e-04 loss=2.8909 elapsed=2060.7s


[CONT] ep 3/10 batch 5100/6925 lr=8.27e-04 loss=2.8908 elapsed=2101.9s


[CONT] ep 3/10 batch 5200/6925 lr=8.25e-04 loss=2.8900 elapsed=2143.2s


[CONT] ep 3/10 batch 5300/6925 lr=8.23e-04 loss=2.8909 elapsed=2184.4s


[CONT] ep 3/10 batch 5400/6925 lr=8.21e-04 loss=2.8914 elapsed=2225.6s


[CONT] ep 3/10 batch 5500/6925 lr=8.20e-04 loss=2.8925 elapsed=2266.8s


[CONT] ep 3/10 batch 5600/6925 lr=8.18e-04 loss=2.8918 elapsed=2308.0s


[CONT] ep 3/10 batch 5700/6925 lr=8.16e-04 loss=2.8924 elapsed=2349.2s


[CONT] ep 3/10 batch 5800/6925 lr=8.14e-04 loss=2.8952 elapsed=2390.6s


[CONT] ep 3/10 batch 5900/6925 lr=8.13e-04 loss=2.8902 elapsed=2431.9s


[CONT] ep 3/10 batch 6000/6925 lr=8.11e-04 loss=2.8922 elapsed=2473.1s


[CONT] ep 3/10 batch 6100/6925 lr=8.09e-04 loss=2.8897 elapsed=2514.3s


[CONT] ep 3/10 batch 6200/6925 lr=8.07e-04 loss=2.8891 elapsed=2555.5s


[CONT] ep 3/10 batch 6300/6925 lr=8.05e-04 loss=2.8855 elapsed=2596.6s


[CONT] ep 3/10 batch 6400/6925 lr=8.04e-04 loss=2.8838 elapsed=2637.7s


[CONT] ep 3/10 batch 6500/6925 lr=8.02e-04 loss=2.8844 elapsed=2678.9s


[CONT] ep 3/10 batch 6600/6925 lr=8.00e-04 loss=2.8851 elapsed=2720.1s


[CONT] ep 3/10 batch 6700/6925 lr=7.98e-04 loss=2.8838 elapsed=2761.4s


[CONT] ep 3/10 batch 6800/6925 lr=7.96e-04 loss=2.8840 elapsed=2802.6s


[CONT] ep 3/10 batch 6900/6925 lr=7.95e-04 loss=2.8811 elapsed=2843.8s


[CONT] Epoch 3 train_loss=2.8810 time=2854.1s total=144.3m


[CONT] Eval acc=0.7800 loss=0.7903 time=47.1s


[CONT] ep 4/10 batch 100/6925 lr=7.92e-04 loss=2.7437 elapsed=41.3s


[CONT] ep 4/10 batch 200/6925 lr=7.90e-04 loss=2.7745 elapsed=82.6s


[CONT] ep 4/10 batch 300/6925 lr=7.89e-04 loss=2.7677 elapsed=123.9s


[CONT] ep 4/10 batch 400/6925 lr=7.87e-04 loss=2.7641 elapsed=165.1s


[CONT] ep 4/10 batch 500/6925 lr=7.85e-04 loss=2.7521 elapsed=206.3s


[CONT] ep 4/10 batch 600/6925 lr=7.83e-04 loss=2.7835 elapsed=247.5s


[CONT] ep 4/10 batch 700/6925 lr=7.81e-04 loss=2.7911 elapsed=288.7s


[CONT] ep 4/10 batch 800/6925 lr=7.79e-04 loss=2.7973 elapsed=329.9s


[CONT] ep 4/10 batch 900/6925 lr=7.77e-04 loss=2.8028 elapsed=371.1s


[CONT] ep 4/10 batch 1000/6925 lr=7.75e-04 loss=2.7872 elapsed=412.3s


[CONT] ep 4/10 batch 1100/6925 lr=7.74e-04 loss=2.7986 elapsed=453.5s


[CONT] ep 4/10 batch 1200/6925 lr=7.72e-04 loss=2.8017 elapsed=494.7s


[CONT] ep 4/10 batch 1300/6925 lr=7.70e-04 loss=2.7972 elapsed=536.0s


[CONT] ep 4/10 batch 1400/6925 lr=7.68e-04 loss=2.7887 elapsed=577.2s


[CONT] ep 4/10 batch 1500/6925 lr=7.66e-04 loss=2.8000 elapsed=618.4s


[CONT] ep 4/10 batch 1600/6925 lr=7.64e-04 loss=2.8006 elapsed=659.7s


[CONT] ep 4/10 batch 1700/6925 lr=7.62e-04 loss=2.7937 elapsed=701.0s


[CONT] ep 4/10 batch 1800/6925 lr=7.60e-04 loss=2.7898 elapsed=742.3s


[CONT] ep 4/10 batch 1900/6925 lr=7.58e-04 loss=2.7919 elapsed=783.4s


[CONT] ep 4/10 batch 2000/6925 lr=7.56e-04 loss=2.7912 elapsed=824.5s


[CONT] ep 4/10 batch 2100/6925 lr=7.54e-04 loss=2.7882 elapsed=865.7s


[CONT] ep 4/10 batch 2200/6925 lr=7.52e-04 loss=2.7869 elapsed=906.9s


[CONT] ep 4/10 batch 2300/6925 lr=7.50e-04 loss=2.7825 elapsed=948.1s


[CONT] ep 4/10 batch 2400/6925 lr=7.48e-04 loss=2.7791 elapsed=989.3s


[CONT] ep 4/10 batch 2500/6925 lr=7.46e-04 loss=2.7802 elapsed=1030.6s


[CONT] ep 4/10 batch 2600/6925 lr=7.45e-04 loss=2.7734 elapsed=1071.8s


[CONT] ep 4/10 batch 2700/6925 lr=7.43e-04 loss=2.7698 elapsed=1113.1s


[CONT] ep 4/10 batch 2800/6925 lr=7.41e-04 loss=2.7671 elapsed=1154.3s


[CONT] ep 4/10 batch 2900/6925 lr=7.39e-04 loss=2.7634 elapsed=1195.5s


[CONT] ep 4/10 batch 3000/6925 lr=7.37e-04 loss=2.7598 elapsed=1236.6s


[CONT] ep 4/10 batch 3100/6925 lr=7.35e-04 loss=2.7571 elapsed=1277.7s


[CONT] ep 4/10 batch 3200/6925 lr=7.33e-04 loss=2.7553 elapsed=1318.8s


[CONT] ep 4/10 batch 3300/6925 lr=7.31e-04 loss=2.7518 elapsed=1360.1s


[CONT] ep 4/10 batch 3400/6925 lr=7.29e-04 loss=2.7500 elapsed=1401.4s


[CONT] ep 4/10 batch 3500/6925 lr=7.27e-04 loss=2.7532 elapsed=1442.6s


[CONT] ep 4/10 batch 3600/6925 lr=7.24e-04 loss=2.7531 elapsed=1483.7s


[CONT] ep 4/10 batch 3700/6925 lr=7.22e-04 loss=2.7543 elapsed=1524.8s


[CONT] ep 4/10 batch 3800/6925 lr=7.20e-04 loss=2.7525 elapsed=1566.1s


[CONT] ep 4/10 batch 3900/6925 lr=7.18e-04 loss=2.7543 elapsed=1607.3s


[CONT] ep 4/10 batch 4000/6925 lr=7.16e-04 loss=2.7532 elapsed=1648.3s


[CONT] ep 4/10 batch 4100/6925 lr=7.14e-04 loss=2.7535 elapsed=1689.5s


[CONT] ep 4/10 batch 4200/6925 lr=7.12e-04 loss=2.7536 elapsed=1730.7s


[CONT] ep 4/10 batch 4300/6925 lr=7.10e-04 loss=2.7466 elapsed=1772.0s


[CONT] ep 4/10 batch 4400/6925 lr=7.08e-04 loss=2.7492 elapsed=1813.1s


[CONT] ep 4/10 batch 4500/6925 lr=7.06e-04 loss=2.7454 elapsed=1854.3s


[CONT] ep 4/10 batch 4600/6925 lr=7.04e-04 loss=2.7453 elapsed=1895.5s


[CONT] ep 4/10 batch 4700/6925 lr=7.02e-04 loss=2.7457 elapsed=1936.8s


[CONT] ep 4/10 batch 4800/6925 lr=7.00e-04 loss=2.7419 elapsed=1978.0s


[CONT] ep 4/10 batch 4900/6925 lr=6.98e-04 loss=2.7399 elapsed=2019.3s


[CONT] ep 4/10 batch 5000/6925 lr=6.96e-04 loss=2.7392 elapsed=2060.4s


[CONT] ep 4/10 batch 5100/6925 lr=6.94e-04 loss=2.7392 elapsed=2101.5s


[CONT] ep 4/10 batch 5200/6925 lr=6.92e-04 loss=2.7349 elapsed=2142.6s


[CONT] ep 4/10 batch 5300/6925 lr=6.89e-04 loss=2.7331 elapsed=2183.8s


[CONT] ep 4/10 batch 5400/6925 lr=6.87e-04 loss=2.7336 elapsed=2225.0s


[CONT] ep 4/10 batch 5500/6925 lr=6.85e-04 loss=2.7333 elapsed=2266.2s


[CONT] ep 4/10 batch 5600/6925 lr=6.83e-04 loss=2.7311 elapsed=2307.4s


[CONT] ep 4/10 batch 5700/6925 lr=6.81e-04 loss=2.7292 elapsed=2348.5s


[CONT] ep 4/10 batch 5800/6925 lr=6.79e-04 loss=2.7283 elapsed=2389.7s


[CONT] ep 4/10 batch 5900/6925 lr=6.77e-04 loss=2.7273 elapsed=2431.0s


[CONT] ep 4/10 batch 6000/6925 lr=6.75e-04 loss=2.7275 elapsed=2472.2s


[CONT] ep 4/10 batch 6100/6925 lr=6.73e-04 loss=2.7291 elapsed=2513.5s


[CONT] ep 4/10 batch 6200/6925 lr=6.70e-04 loss=2.7293 elapsed=2554.7s


[CONT] ep 4/10 batch 6300/6925 lr=6.68e-04 loss=2.7259 elapsed=2596.0s


[CONT] ep 4/10 batch 6400/6925 lr=6.66e-04 loss=2.7255 elapsed=2637.3s


[CONT] ep 4/10 batch 6500/6925 lr=6.64e-04 loss=2.7267 elapsed=2678.5s


[CONT] ep 4/10 batch 6600/6925 lr=6.62e-04 loss=2.7244 elapsed=2719.6s


[CONT] ep 4/10 batch 6700/6925 lr=6.60e-04 loss=2.7267 elapsed=2761.0s


[CONT] ep 4/10 batch 6800/6925 lr=6.58e-04 loss=2.7276 elapsed=2802.2s


[CONT] ep 4/10 batch 6900/6925 lr=6.55e-04 loss=2.7249 elapsed=2843.3s


[CONT] Epoch 4 train_loss=2.7245 time=2853.5s total=192.7m


[CONT] Eval acc=0.7814 loss=0.7870 time=47.2s


[CONT] ep 5/10 batch 100/6925 lr=6.53e-04 loss=2.7800 elapsed=41.3s


[CONT] ep 5/10 batch 200/6925 lr=6.51e-04 loss=2.6203 elapsed=82.5s


[CONT] ep 5/10 batch 300/6925 lr=6.48e-04 loss=2.6682 elapsed=123.8s


[CONT] ep 5/10 batch 400/6925 lr=6.46e-04 loss=2.6566 elapsed=165.1s


[CONT] ep 5/10 batch 500/6925 lr=6.44e-04 loss=2.6411 elapsed=206.3s


[CONT] ep 5/10 batch 600/6925 lr=6.42e-04 loss=2.5962 elapsed=247.6s


[CONT] ep 5/10 batch 700/6925 lr=6.40e-04 loss=2.5899 elapsed=288.8s


[CONT] ep 5/10 batch 800/6925 lr=6.38e-04 loss=2.5804 elapsed=330.0s


[CONT] ep 5/10 batch 900/6925 lr=6.35e-04 loss=2.5781 elapsed=371.2s


[CONT] ep 5/10 batch 1000/6925 lr=6.33e-04 loss=2.5688 elapsed=412.3s


[CONT] ep 5/10 batch 1100/6925 lr=6.31e-04 loss=2.5689 elapsed=453.6s


[CONT] ep 5/10 batch 1200/6925 lr=6.29e-04 loss=2.5649 elapsed=494.7s


[CONT] ep 5/10 batch 1300/6925 lr=6.27e-04 loss=2.5667 elapsed=535.9s


[CONT] ep 5/10 batch 1400/6925 lr=6.24e-04 loss=2.5695 elapsed=577.2s


[CONT] ep 5/10 batch 1500/6925 lr=6.22e-04 loss=2.5686 elapsed=618.4s


[CONT] ep 5/10 batch 1600/6925 lr=6.20e-04 loss=2.5630 elapsed=659.7s


[CONT] ep 5/10 batch 1700/6925 lr=6.18e-04 loss=2.5610 elapsed=700.8s


[CONT] ep 5/10 batch 1800/6925 lr=6.16e-04 loss=2.5536 elapsed=742.0s


[CONT] ep 5/10 batch 1900/6925 lr=6.13e-04 loss=2.5514 elapsed=783.3s


[CONT] ep 5/10 batch 2000/6925 lr=6.11e-04 loss=2.5556 elapsed=824.6s


[CONT] ep 5/10 batch 2100/6925 lr=6.09e-04 loss=2.5594 elapsed=865.9s


[CONT] ep 5/10 batch 2200/6925 lr=6.07e-04 loss=2.5562 elapsed=907.2s


[CONT] ep 5/10 batch 2300/6925 lr=6.05e-04 loss=2.5629 elapsed=948.4s


[CONT] ep 5/10 batch 2400/6925 lr=6.02e-04 loss=2.5674 elapsed=989.5s


[CONT] ep 5/10 batch 2500/6925 lr=6.00e-04 loss=2.5633 elapsed=1030.6s


[CONT] ep 5/10 batch 2600/6925 lr=5.98e-04 loss=2.5649 elapsed=1071.8s


[CONT] ep 5/10 batch 2700/6925 lr=5.96e-04 loss=2.5636 elapsed=1113.0s


[CONT] ep 5/10 batch 2800/6925 lr=5.93e-04 loss=2.5626 elapsed=1154.2s


[CONT] ep 5/10 batch 2900/6925 lr=5.91e-04 loss=2.5595 elapsed=1195.4s


[CONT] ep 5/10 batch 3000/6925 lr=5.89e-04 loss=2.5601 elapsed=1236.6s


[CONT] ep 5/10 batch 3100/6925 lr=5.87e-04 loss=2.5514 elapsed=1277.9s


[CONT] ep 5/10 batch 3200/6925 lr=5.85e-04 loss=2.5509 elapsed=1319.1s


[CONT] ep 5/10 batch 3300/6925 lr=5.82e-04 loss=2.5488 elapsed=1360.4s


[CONT] ep 5/10 batch 3400/6925 lr=5.80e-04 loss=2.5492 elapsed=1401.6s


[CONT] ep 5/10 batch 3500/6925 lr=5.78e-04 loss=2.5486 elapsed=1442.9s


[CONT] ep 5/10 batch 3600/6925 lr=5.76e-04 loss=2.5478 elapsed=1484.2s


[CONT] ep 5/10 batch 3700/6925 lr=5.73e-04 loss=2.5525 elapsed=1525.4s


[CONT] ep 5/10 batch 3800/6925 lr=5.71e-04 loss=2.5513 elapsed=1566.6s


[CONT] ep 5/10 batch 3900/6925 lr=5.69e-04 loss=2.5461 elapsed=1607.8s


[CONT] ep 5/10 batch 4000/6925 lr=5.67e-04 loss=2.5423 elapsed=1649.0s


[CONT] ep 5/10 batch 4100/6925 lr=5.64e-04 loss=2.5427 elapsed=1690.2s


[CONT] ep 5/10 batch 4200/6925 lr=5.62e-04 loss=2.5377 elapsed=1731.5s


[CONT] ep 5/10 batch 4300/6925 lr=5.60e-04 loss=2.5346 elapsed=1772.7s


[CONT] ep 5/10 batch 4400/6925 lr=5.58e-04 loss=2.5348 elapsed=1814.0s


[CONT] ep 5/10 batch 4500/6925 lr=5.55e-04 loss=2.5369 elapsed=1855.3s


[CONT] ep 5/10 batch 4600/6925 lr=5.53e-04 loss=2.5374 elapsed=1896.5s


[CONT] ep 5/10 batch 4700/6925 lr=5.51e-04 loss=2.5362 elapsed=1937.8s


[CONT] ep 5/10 batch 4800/6925 lr=5.49e-04 loss=2.5356 elapsed=1979.1s


[CONT] ep 5/10 batch 4900/6925 lr=5.46e-04 loss=2.5372 elapsed=2020.4s


[CONT] ep 5/10 batch 5000/6925 lr=5.44e-04 loss=2.5344 elapsed=2061.5s


[CONT] ep 5/10 batch 5100/6925 lr=5.42e-04 loss=2.5327 elapsed=2102.8s


[CONT] ep 5/10 batch 5200/6925 lr=5.40e-04 loss=2.5303 elapsed=2143.9s


[CONT] ep 5/10 batch 5300/6925 lr=5.37e-04 loss=2.5279 elapsed=2185.2s


[CONT] ep 5/10 batch 5400/6925 lr=5.35e-04 loss=2.5253 elapsed=2226.4s


[CONT] ep 5/10 batch 5500/6925 lr=5.33e-04 loss=2.5216 elapsed=2267.7s


[CONT] ep 5/10 batch 5600/6925 lr=5.31e-04 loss=2.5211 elapsed=2309.0s


[CONT] ep 5/10 batch 5700/6925 lr=5.28e-04 loss=2.5226 elapsed=2350.2s


[CONT] ep 5/10 batch 5800/6925 lr=5.26e-04 loss=2.5215 elapsed=2391.5s


[CONT] ep 5/10 batch 5900/6925 lr=5.24e-04 loss=2.5206 elapsed=2432.7s


[CONT] ep 5/10 batch 6000/6925 lr=5.21e-04 loss=2.5236 elapsed=2474.0s


[CONT] ep 5/10 batch 6100/6925 lr=5.19e-04 loss=2.5209 elapsed=2515.0s


[CONT] ep 5/10 batch 6200/6925 lr=5.17e-04 loss=2.5169 elapsed=2556.2s


[CONT] ep 5/10 batch 6300/6925 lr=5.15e-04 loss=2.5155 elapsed=2597.4s


[CONT] ep 5/10 batch 6400/6925 lr=5.12e-04 loss=2.5171 elapsed=2638.6s


[CONT] ep 5/10 batch 6500/6925 lr=5.10e-04 loss=2.5158 elapsed=2679.8s


[CONT] ep 5/10 batch 6600/6925 lr=5.08e-04 loss=2.5145 elapsed=2721.0s


[CONT] ep 5/10 batch 6700/6925 lr=5.06e-04 loss=2.5129 elapsed=2762.3s


[CONT] ep 5/10 batch 6800/6925 lr=5.03e-04 loss=2.5119 elapsed=2803.5s


[CONT] ep 5/10 batch 6900/6925 lr=5.01e-04 loss=2.5123 elapsed=2844.8s


[CONT] Epoch 5 train_loss=2.5116 time=2855.0s total=241.1m


[CONT] Eval acc=0.7878 loss=0.7610 time=47.1s


[CONT] ep 6/10 batch 100/6925 lr=4.98e-04 loss=2.4650 elapsed=41.2s


[CONT] ep 6/10 batch 200/6925 lr=4.96e-04 loss=2.3829 elapsed=82.5s


[CONT] ep 6/10 batch 300/6925 lr=4.94e-04 loss=2.3558 elapsed=123.7s


[CONT] ep 6/10 batch 400/6925 lr=4.91e-04 loss=2.3685 elapsed=164.8s


[CONT] ep 6/10 batch 500/6925 lr=4.89e-04 loss=2.3642 elapsed=205.9s


[CONT] ep 6/10 batch 600/6925 lr=4.87e-04 loss=2.3771 elapsed=247.0s


[CONT] ep 6/10 batch 700/6925 lr=4.85e-04 loss=2.4017 elapsed=288.2s


[CONT] ep 6/10 batch 800/6925 lr=4.82e-04 loss=2.4221 elapsed=329.4s


[CONT] ep 6/10 batch 900/6925 lr=4.80e-04 loss=2.4107 elapsed=370.6s


[CONT] ep 6/10 batch 1000/6925 lr=4.78e-04 loss=2.4262 elapsed=411.9s


[CONT] ep 6/10 batch 1100/6925 lr=4.76e-04 loss=2.4253 elapsed=453.1s


[CONT] ep 6/10 batch 1200/6925 lr=4.73e-04 loss=2.4111 elapsed=494.4s


[CONT] ep 6/10 batch 1300/6925 lr=4.71e-04 loss=2.4046 elapsed=535.7s


[CONT] ep 6/10 batch 1400/6925 lr=4.69e-04 loss=2.4046 elapsed=576.9s


[CONT] ep 6/10 batch 1500/6925 lr=4.67e-04 loss=2.3940 elapsed=618.2s


[CONT] ep 6/10 batch 1600/6925 lr=4.64e-04 loss=2.3956 elapsed=659.4s


[CONT] ep 6/10 batch 1700/6925 lr=4.62e-04 loss=2.3832 elapsed=700.6s


[CONT] ep 6/10 batch 1800/6925 lr=4.60e-04 loss=2.3844 elapsed=742.0s


[CONT] ep 6/10 batch 1900/6925 lr=4.57e-04 loss=2.3809 elapsed=783.2s


[CONT] ep 6/10 batch 2000/6925 lr=4.55e-04 loss=2.3753 elapsed=824.5s


[CONT] ep 6/10 batch 2100/6925 lr=4.53e-04 loss=2.3718 elapsed=865.7s


[CONT] ep 6/10 batch 2200/6925 lr=4.51e-04 loss=2.3789 elapsed=907.0s


[CONT] ep 6/10 batch 2300/6925 lr=4.48e-04 loss=2.3763 elapsed=948.4s


[CONT] ep 6/10 batch 2400/6925 lr=4.46e-04 loss=2.3725 elapsed=989.7s


[CONT] ep 6/10 batch 2500/6925 lr=4.44e-04 loss=2.3674 elapsed=1030.9s


[CONT] ep 6/10 batch 2600/6925 lr=4.42e-04 loss=2.3582 elapsed=1072.1s


[CONT] ep 6/10 batch 2700/6925 lr=4.39e-04 loss=2.3558 elapsed=1113.3s


[CONT] ep 6/10 batch 2800/6925 lr=4.37e-04 loss=2.3572 elapsed=1154.5s


[CONT] ep 6/10 batch 2900/6925 lr=4.35e-04 loss=2.3548 elapsed=1195.7s


[CONT] ep 6/10 batch 3000/6925 lr=4.33e-04 loss=2.3557 elapsed=1237.0s


[CONT] ep 6/10 batch 3100/6925 lr=4.30e-04 loss=2.3564 elapsed=1278.3s


[CONT] ep 6/10 batch 3200/6925 lr=4.28e-04 loss=2.3611 elapsed=1319.5s


[CONT] ep 6/10 batch 3300/6925 lr=4.26e-04 loss=2.3578 elapsed=1360.8s


[CONT] ep 6/10 batch 3400/6925 lr=4.24e-04 loss=2.3581 elapsed=1402.0s


[CONT] ep 6/10 batch 3500/6925 lr=4.22e-04 loss=2.3632 elapsed=1443.1s


[CONT] ep 6/10 batch 3600/6925 lr=4.19e-04 loss=2.3615 elapsed=1484.2s


[CONT] ep 6/10 batch 3700/6925 lr=4.17e-04 loss=2.3651 elapsed=1525.4s


[CONT] ep 6/10 batch 3800/6925 lr=4.15e-04 loss=2.3630 elapsed=1566.6s


[CONT] ep 6/10 batch 3900/6925 lr=4.13e-04 loss=2.3581 elapsed=1607.9s


[CONT] ep 6/10 batch 4000/6925 lr=4.10e-04 loss=2.3550 elapsed=1649.1s


[CONT] ep 6/10 batch 4100/6925 lr=4.08e-04 loss=2.3529 elapsed=1690.3s


[CONT] ep 6/10 batch 4200/6925 lr=4.06e-04 loss=2.3480 elapsed=1731.5s


[CONT] ep 6/10 batch 4300/6925 lr=4.04e-04 loss=2.3521 elapsed=1772.6s


[CONT] ep 6/10 batch 4400/6925 lr=4.01e-04 loss=2.3499 elapsed=1813.9s


[CONT] ep 6/10 batch 4500/6925 lr=3.99e-04 loss=2.3500 elapsed=1855.3s


[CONT] ep 6/10 batch 4600/6925 lr=3.97e-04 loss=2.3466 elapsed=1896.6s


[CONT] ep 6/10 batch 4700/6925 lr=3.95e-04 loss=2.3425 elapsed=1937.9s


[CONT] ep 6/10 batch 4800/6925 lr=3.93e-04 loss=2.3436 elapsed=1979.1s


[CONT] ep 6/10 batch 4900/6925 lr=3.90e-04 loss=2.3401 elapsed=2020.2s


[CONT] ep 6/10 batch 5000/6925 lr=3.88e-04 loss=2.3411 elapsed=2061.4s


[CONT] ep 6/10 batch 5100/6925 lr=3.86e-04 loss=2.3389 elapsed=2102.8s


[CONT] ep 6/10 batch 5200/6925 lr=3.84e-04 loss=2.3344 elapsed=2144.1s


[CONT] ep 6/10 batch 5300/6925 lr=3.82e-04 loss=2.3302 elapsed=2185.3s


[CONT] ep 6/10 batch 5400/6925 lr=3.79e-04 loss=2.3278 elapsed=2226.4s


[CONT] ep 6/10 batch 5500/6925 lr=3.77e-04 loss=2.3251 elapsed=2267.4s


[CONT] ep 6/10 batch 5600/6925 lr=3.75e-04 loss=2.3222 elapsed=2308.6s


[CONT] ep 6/10 batch 5700/6925 lr=3.73e-04 loss=2.3216 elapsed=2349.7s


[CONT] ep 6/10 batch 5800/6925 lr=3.71e-04 loss=2.3213 elapsed=2390.9s


[CONT] ep 6/10 batch 5900/6925 lr=3.68e-04 loss=2.3195 elapsed=2432.2s


[CONT] ep 6/10 batch 6000/6925 lr=3.66e-04 loss=2.3172 elapsed=2473.4s


[CONT] ep 6/10 batch 6100/6925 lr=3.64e-04 loss=2.3166 elapsed=2514.6s


[CONT] ep 6/10 batch 6200/6925 lr=3.62e-04 loss=2.3133 elapsed=2555.8s


[CONT] ep 6/10 batch 6300/6925 lr=3.60e-04 loss=2.3120 elapsed=2597.1s


[CONT] ep 6/10 batch 6400/6925 lr=3.58e-04 loss=2.3094 elapsed=2638.3s


[CONT] ep 6/10 batch 6500/6925 lr=3.55e-04 loss=2.3100 elapsed=2679.6s


[CONT] ep 6/10 batch 6600/6925 lr=3.53e-04 loss=2.3089 elapsed=2720.9s


[CONT] ep 6/10 batch 6700/6925 lr=3.51e-04 loss=2.3054 elapsed=2762.1s


[CONT] ep 6/10 batch 6800/6925 lr=3.49e-04 loss=2.3033 elapsed=2803.4s


[CONT] ep 6/10 batch 6900/6925 lr=3.47e-04 loss=2.3009 elapsed=2844.5s


[CONT] Epoch 6 train_loss=2.3003 time=2854.8s total=289.4m


[CONT] Eval acc=0.7946 loss=0.7226 time=47.2s


[CONT] ep 7/10 batch 100/6925 lr=3.44e-04 loss=2.1347 elapsed=41.1s


[CONT] ep 7/10 batch 200/6925 lr=3.42e-04 loss=2.1619 elapsed=82.1s


[CONT] ep 7/10 batch 300/6925 lr=3.40e-04 loss=2.1030 elapsed=123.2s


[CONT] ep 7/10 batch 400/6925 lr=3.38e-04 loss=2.1651 elapsed=164.3s


[CONT] ep 7/10 batch 500/6925 lr=3.35e-04 loss=2.1625 elapsed=205.5s


[CONT] ep 7/10 batch 600/6925 lr=3.33e-04 loss=2.1624 elapsed=246.7s


[CONT] ep 7/10 batch 700/6925 lr=3.31e-04 loss=2.1665 elapsed=287.9s


[CONT] ep 7/10 batch 800/6925 lr=3.29e-04 loss=2.1622 elapsed=329.1s


[CONT] ep 7/10 batch 900/6925 lr=3.27e-04 loss=2.1583 elapsed=370.3s


[CONT] ep 7/10 batch 1000/6925 lr=3.25e-04 loss=2.1617 elapsed=411.4s


[CONT] ep 7/10 batch 1100/6925 lr=3.23e-04 loss=2.1550 elapsed=452.6s


[CONT] ep 7/10 batch 1200/6925 lr=3.21e-04 loss=2.1657 elapsed=494.0s


[CONT] ep 7/10 batch 1300/6925 lr=3.18e-04 loss=2.1601 elapsed=535.3s


[CONT] ep 7/10 batch 1400/6925 lr=3.16e-04 loss=2.1554 elapsed=576.5s


[CONT] ep 7/10 batch 1500/6925 lr=3.14e-04 loss=2.1576 elapsed=617.6s


[CONT] ep 7/10 batch 1600/6925 lr=3.12e-04 loss=2.1570 elapsed=658.8s


[CONT] ep 7/10 batch 1700/6925 lr=3.10e-04 loss=2.1559 elapsed=700.0s


[CONT] ep 7/10 batch 1800/6925 lr=3.08e-04 loss=2.1492 elapsed=741.3s


[CONT] ep 7/10 batch 1900/6925 lr=3.06e-04 loss=2.1533 elapsed=782.6s


[CONT] ep 7/10 batch 2000/6925 lr=3.04e-04 loss=2.1488 elapsed=823.9s


[CONT] ep 7/10 batch 2100/6925 lr=3.02e-04 loss=2.1446 elapsed=865.1s


[CONT] ep 7/10 batch 2200/6925 lr=3.00e-04 loss=2.1490 elapsed=906.4s


[CONT] ep 7/10 batch 2300/6925 lr=2.98e-04 loss=2.1511 elapsed=947.6s


[CONT] ep 7/10 batch 2400/6925 lr=2.95e-04 loss=2.1480 elapsed=988.8s


[CONT] ep 7/10 batch 2500/6925 lr=2.93e-04 loss=2.1487 elapsed=1030.1s


[CONT] ep 7/10 batch 2600/6925 lr=2.91e-04 loss=2.1429 elapsed=1071.3s


[CONT] ep 7/10 batch 2700/6925 lr=2.89e-04 loss=2.1418 elapsed=1112.6s


[CONT] ep 7/10 batch 2800/6925 lr=2.87e-04 loss=2.1388 elapsed=1153.9s


[CONT] ep 7/10 batch 2900/6925 lr=2.85e-04 loss=2.1380 elapsed=1195.0s


[CONT] ep 7/10 batch 3000/6925 lr=2.83e-04 loss=2.1383 elapsed=1236.3s


[CONT] ep 7/10 batch 3100/6925 lr=2.81e-04 loss=2.1341 elapsed=1277.5s


[CONT] ep 7/10 batch 3200/6925 lr=2.79e-04 loss=2.1352 elapsed=1318.7s


[CONT] ep 7/10 batch 3300/6925 lr=2.77e-04 loss=2.1343 elapsed=1360.0s


[CONT] ep 7/10 batch 3400/6925 lr=2.75e-04 loss=2.1333 elapsed=1401.2s


[CONT] ep 7/10 batch 3500/6925 lr=2.73e-04 loss=2.1287 elapsed=1442.5s


[CONT] ep 7/10 batch 3600/6925 lr=2.71e-04 loss=2.1237 elapsed=1483.6s


[CONT] ep 7/10 batch 3700/6925 lr=2.69e-04 loss=2.1249 elapsed=1524.8s


[CONT] ep 7/10 batch 3800/6925 lr=2.67e-04 loss=2.1216 elapsed=1566.0s


[CONT] ep 7/10 batch 3900/6925 lr=2.65e-04 loss=2.1180 elapsed=1607.2s


[CONT] ep 7/10 batch 4000/6925 lr=2.63e-04 loss=2.1130 elapsed=1648.4s


[CONT] ep 7/10 batch 4100/6925 lr=2.61e-04 loss=2.1123 elapsed=1689.7s


[CONT] ep 7/10 batch 4200/6925 lr=2.59e-04 loss=2.1072 elapsed=1731.0s


[CONT] ep 7/10 batch 4300/6925 lr=2.57e-04 loss=2.1049 elapsed=1772.2s


[CONT] ep 7/10 batch 4400/6925 lr=2.55e-04 loss=2.1039 elapsed=1813.3s


[CONT] ep 7/10 batch 4500/6925 lr=2.53e-04 loss=2.1071 elapsed=1854.6s


[CONT] ep 7/10 batch 4600/6925 lr=2.51e-04 loss=2.1044 elapsed=1895.8s


[CONT] ep 7/10 batch 4700/6925 lr=2.49e-04 loss=2.1029 elapsed=1937.1s


[CONT] ep 7/10 batch 4800/6925 lr=2.47e-04 loss=2.1012 elapsed=1978.4s


[CONT] ep 7/10 batch 4900/6925 lr=2.45e-04 loss=2.0979 elapsed=2019.6s


[CONT] ep 7/10 batch 5000/6925 lr=2.43e-04 loss=2.0963 elapsed=2060.9s


[CONT] ep 7/10 batch 5100/6925 lr=2.41e-04 loss=2.0965 elapsed=2102.1s


[CONT] ep 7/10 batch 5200/6925 lr=2.39e-04 loss=2.0938 elapsed=2143.3s


[CONT] ep 7/10 batch 5300/6925 lr=2.37e-04 loss=2.0916 elapsed=2184.4s


[CONT] ep 7/10 batch 5400/6925 lr=2.36e-04 loss=2.0899 elapsed=2225.6s


[CONT] ep 7/10 batch 5500/6925 lr=2.34e-04 loss=2.0890 elapsed=2266.9s


[CONT] ep 7/10 batch 5600/6925 lr=2.32e-04 loss=2.0870 elapsed=2308.1s


[CONT] ep 7/10 batch 5700/6925 lr=2.30e-04 loss=2.0850 elapsed=2349.3s


[CONT] ep 7/10 batch 5800/6925 lr=2.28e-04 loss=2.0818 elapsed=2390.5s


[CONT] ep 7/10 batch 5900/6925 lr=2.26e-04 loss=2.0782 elapsed=2431.7s


[CONT] ep 7/10 batch 6000/6925 lr=2.24e-04 loss=2.0770 elapsed=2473.0s


[CONT] ep 7/10 batch 6100/6925 lr=2.22e-04 loss=2.0736 elapsed=2514.2s


[CONT] ep 7/10 batch 6200/6925 lr=2.20e-04 loss=2.0702 elapsed=2555.4s


[CONT] ep 7/10 batch 6300/6925 lr=2.18e-04 loss=2.0666 elapsed=2596.7s


[CONT] ep 7/10 batch 6400/6925 lr=2.17e-04 loss=2.0633 elapsed=2637.9s


[CONT] ep 7/10 batch 6500/6925 lr=2.15e-04 loss=2.0633 elapsed=2679.1s


[CONT] ep 7/10 batch 6600/6925 lr=2.13e-04 loss=2.0608 elapsed=2720.3s


[CONT] ep 7/10 batch 6700/6925 lr=2.11e-04 loss=2.0590 elapsed=2761.6s


[CONT] ep 7/10 batch 6800/6925 lr=2.09e-04 loss=2.0605 elapsed=2802.8s


[CONT] ep 7/10 batch 6900/6925 lr=2.07e-04 loss=2.0581 elapsed=2843.9s


[CONT] Epoch 7 train_loss=2.0584 time=2854.2s total=337.8m


[CONT] Eval acc=0.8063 loss=0.6831 time=47.2s


[CONT] ep 8/10 batch 100/6925 lr=2.05e-04 loss=2.0055 elapsed=41.3s


[CONT] ep 8/10 batch 200/6925 lr=2.03e-04 loss=2.0729 elapsed=82.6s


[CONT] ep 8/10 batch 300/6925 lr=2.01e-04 loss=2.0002 elapsed=124.0s


[CONT] ep 8/10 batch 400/6925 lr=2.00e-04 loss=2.0188 elapsed=165.3s


[CONT] ep 8/10 batch 500/6925 lr=1.98e-04 loss=2.0447 elapsed=206.7s


[CONT] ep 8/10 batch 600/6925 lr=1.96e-04 loss=2.0156 elapsed=248.0s


[CONT] ep 8/10 batch 700/6925 lr=1.94e-04 loss=2.0090 elapsed=289.3s


[CONT] ep 8/10 batch 800/6925 lr=1.92e-04 loss=2.0037 elapsed=330.6s


[CONT] ep 8/10 batch 900/6925 lr=1.91e-04 loss=1.9959 elapsed=371.9s


[CONT] ep 8/10 batch 1000/6925 lr=1.89e-04 loss=1.9769 elapsed=413.1s


[CONT] ep 8/10 batch 1100/6925 lr=1.87e-04 loss=1.9721 elapsed=454.4s


[CONT] ep 8/10 batch 1200/6925 lr=1.85e-04 loss=1.9597 elapsed=495.7s


[CONT] ep 8/10 batch 1300/6925 lr=1.84e-04 loss=1.9586 elapsed=537.0s


[CONT] ep 8/10 batch 1400/6925 lr=1.82e-04 loss=1.9606 elapsed=578.3s


[CONT] ep 8/10 batch 1500/6925 lr=1.80e-04 loss=1.9544 elapsed=619.6s


[CONT] ep 8/10 batch 1600/6925 lr=1.78e-04 loss=1.9515 elapsed=660.7s


[CONT] ep 8/10 batch 1700/6925 lr=1.77e-04 loss=1.9475 elapsed=701.8s


[CONT] ep 8/10 batch 1800/6925 lr=1.75e-04 loss=1.9387 elapsed=743.1s


[CONT] ep 8/10 batch 1900/6925 lr=1.73e-04 loss=1.9321 elapsed=784.4s


[CONT] ep 8/10 batch 2000/6925 lr=1.71e-04 loss=1.9304 elapsed=825.6s


[CONT] ep 8/10 batch 2100/6925 lr=1.70e-04 loss=1.9318 elapsed=867.0s


[CONT] ep 8/10 batch 2200/6925 lr=1.68e-04 loss=1.9243 elapsed=908.3s


[CONT] ep 8/10 batch 2300/6925 lr=1.66e-04 loss=1.9223 elapsed=949.5s


[CONT] ep 8/10 batch 2400/6925 lr=1.65e-04 loss=1.9183 elapsed=990.8s


[CONT] ep 8/10 batch 2500/6925 lr=1.63e-04 loss=1.9149 elapsed=1032.1s


[CONT] ep 8/10 batch 2600/6925 lr=1.61e-04 loss=1.9155 elapsed=1073.3s


[CONT] ep 8/10 batch 2700/6925 lr=1.60e-04 loss=1.9147 elapsed=1114.5s


[CONT] ep 8/10 batch 2800/6925 lr=1.58e-04 loss=1.9121 elapsed=1155.9s


[CONT] ep 8/10 batch 2900/6925 lr=1.56e-04 loss=1.9153 elapsed=1197.2s


[CONT] ep 8/10 batch 3000/6925 lr=1.55e-04 loss=1.9163 elapsed=1238.4s


[CONT] ep 8/10 batch 3100/6925 lr=1.53e-04 loss=1.9104 elapsed=1279.7s


[CONT] ep 8/10 batch 3200/6925 lr=1.52e-04 loss=1.9100 elapsed=1321.0s


[CONT] ep 8/10 batch 3300/6925 lr=1.50e-04 loss=1.9065 elapsed=1362.2s


[CONT] ep 8/10 batch 3400/6925 lr=1.48e-04 loss=1.9058 elapsed=1403.5s


[CONT] ep 8/10 batch 3500/6925 lr=1.47e-04 loss=1.9070 elapsed=1444.7s


[CONT] ep 8/10 batch 3600/6925 lr=1.45e-04 loss=1.9064 elapsed=1486.0s


[CONT] ep 8/10 batch 3700/6925 lr=1.44e-04 loss=1.9027 elapsed=1527.2s


[CONT] ep 8/10 batch 3800/6925 lr=1.42e-04 loss=1.8995 elapsed=1568.4s


[CONT] ep 8/10 batch 3900/6925 lr=1.40e-04 loss=1.8959 elapsed=1609.7s


[CONT] ep 8/10 batch 4000/6925 lr=1.39e-04 loss=1.8943 elapsed=1650.9s


[CONT] ep 8/10 batch 4100/6925 lr=1.37e-04 loss=1.8894 elapsed=1692.1s


[CONT] ep 8/10 batch 4200/6925 lr=1.36e-04 loss=1.8852 elapsed=1733.3s


[CONT] ep 8/10 batch 4300/6925 lr=1.34e-04 loss=1.8769 elapsed=1774.7s


[CONT] ep 8/10 batch 4400/6925 lr=1.33e-04 loss=1.8729 elapsed=1815.9s


[CONT] ep 8/10 batch 4500/6925 lr=1.31e-04 loss=1.8694 elapsed=1857.2s


[CONT] ep 8/10 batch 4600/6925 lr=1.30e-04 loss=1.8668 elapsed=1898.4s


[CONT] ep 8/10 batch 4700/6925 lr=1.28e-04 loss=1.8672 elapsed=1939.6s


[CONT] ep 8/10 batch 4800/6925 lr=1.27e-04 loss=1.8655 elapsed=1980.8s


[CONT] ep 8/10 batch 4900/6925 lr=1.25e-04 loss=1.8666 elapsed=2022.1s


[CONT] ep 8/10 batch 5000/6925 lr=1.24e-04 loss=1.8647 elapsed=2063.3s


[CONT] ep 8/10 batch 5100/6925 lr=1.22e-04 loss=1.8641 elapsed=2104.6s


[CONT] ep 8/10 batch 5200/6925 lr=1.21e-04 loss=1.8614 elapsed=2145.9s


[CONT] ep 8/10 batch 5300/6925 lr=1.19e-04 loss=1.8601 elapsed=2187.1s


[CONT] ep 8/10 batch 5400/6925 lr=1.18e-04 loss=1.8574 elapsed=2228.2s


[CONT] ep 8/10 batch 5500/6925 lr=1.16e-04 loss=1.8567 elapsed=2269.5s


[CONT] ep 8/10 batch 5600/6925 lr=1.15e-04 loss=1.8575 elapsed=2310.8s


[CONT] ep 8/10 batch 5700/6925 lr=1.13e-04 loss=1.8566 elapsed=2352.1s


[CONT] ep 8/10 batch 5800/6925 lr=1.12e-04 loss=1.8551 elapsed=2393.4s


[CONT] ep 8/10 batch 5900/6925 lr=1.10e-04 loss=1.8522 elapsed=2434.6s


[CONT] ep 8/10 batch 6000/6925 lr=1.09e-04 loss=1.8485 elapsed=2475.8s


[CONT] ep 8/10 batch 6100/6925 lr=1.08e-04 loss=1.8467 elapsed=2517.0s


[CONT] ep 8/10 batch 6200/6925 lr=1.06e-04 loss=1.8440 elapsed=2558.2s


[CONT] ep 8/10 batch 6300/6925 lr=1.05e-04 loss=1.8420 elapsed=2599.5s


[CONT] ep 8/10 batch 6400/6925 lr=1.04e-04 loss=1.8418 elapsed=2640.8s


[CONT] ep 8/10 batch 6500/6925 lr=1.02e-04 loss=1.8404 elapsed=2682.1s


[CONT] ep 8/10 batch 6600/6925 lr=1.01e-04 loss=1.8398 elapsed=2723.4s


[CONT] ep 8/10 batch 6700/6925 lr=9.94e-05 loss=1.8378 elapsed=2764.5s


[CONT] ep 8/10 batch 6800/6925 lr=9.81e-05 loss=1.8367 elapsed=2805.8s


[CONT] ep 8/10 batch 6900/6925 lr=9.67e-05 loss=1.8357 elapsed=2847.1s


[CONT] Epoch 8 train_loss=1.8341 time=2857.4s total=386.2m


[CONT] Eval acc=0.8150 loss=0.6502 time=47.1s


KeyboardInterrupt: 

In [None]:
# 448 fine-tune: load averaged EMA, 2 epochs, eval-style augs, small LR, save avg EMA
import time, torch
from pathlib import Path
from torch import nn
from torch.utils.data import DataLoader, WeightedRandomSampler
import torchvision.transforms as T
import timm
from timm.scheduler.cosine_lr import CosineLRScheduler
from timm.utils import ModelEmaV2

def build_loaders_448(batch_size=24, workers=8):
    sz = 448
    train_tf = T.Compose([
        T.Resize(int(sz/0.875), interpolation=T.InterpolationMode.BICUBIC),
        T.CenterCrop(sz),
        T.RandomHorizontalFlip(p=0.5),
        T.ToTensor(),
        T.Normalize((0.485,0.456,0.406),(0.229,0.224,0.225)),
    ])
    val_tf = T.Compose([
        T.Resize(int(sz/0.875), interpolation=T.InterpolationMode.BICUBIC),
        T.CenterCrop(sz),
        T.ToTensor(),
        T.Normalize((0.485,0.456,0.406),(0.229,0.224,0.225)),
    ])
    train_ds = INatDatasetSimple(train_split, train_tf)
    val_ds = INatDatasetSimple(valid_split, val_tf)
    from collections import Counter
    cls_counts = Counter([y for _,_,y in train_split])
    weights = [1.0/np.sqrt(cls_counts[y]) for _,_,y in train_split]
    sampler = WeightedRandomSampler(weights, num_samples=len(train_split), replacement=True)
    train_dl = DataLoader(train_ds, batch_size=batch_size, sampler=sampler, shuffle=False, num_workers=workers, pin_memory=True, persistent_workers=True, prefetch_factor=4)
    val_dl = DataLoader(val_ds, batch_size=batch_size, shuffle=False, num_workers=workers, pin_memory=True, persistent_workers=True, prefetch_factor=4)
    return train_dl, val_dl

def evaluate_448(m, dl):
    m.eval(); correct=0; total=0; loss_sum=0.0
    ce = nn.CrossEntropyLoss()
    t0=time.time()
    with torch.no_grad():
        for bx,(x,y,ids) in enumerate(dl):
            x=x.to(device, non_blocking=True).to(memory_format=torch.channels_last); y=y.to(device, non_blocking=True)
            with torch.autocast(device_type='cuda', dtype=torch.float16, enabled=device.type=='cuda'):
                logits=m(x); loss=ce(logits,y)
            loss_sum+=loss.item()*y.size(0); pred=logits.argmax(1); correct+=(pred==y).sum().item(); total+=y.size(0)
    acc=correct/max(1,total); vloss=loss_sum/max(1,total)
    print(f"[FT448] Eval acc={acc:.4f} loss={vloss:.4f}")
    return acc, vloss

def average_sd(paths):
    avg=None; n=0
    for p in paths:
        sd=torch.load(p, map_location='cpu')
        state=sd['model'] if isinstance(sd, dict) and 'model' in sd else sd
        if avg is None:
            avg={k: v.clone().float() for k,v in state.items()}
        else:
            for k in avg.keys():
                avg[k] += state[k].float()
        n+=1
    for k in avg.keys(): avg[k]/=max(1,n)
    return avg

def fine_tune_448(epochs=1, lr=2e-5, wd=0.05, ema_decay=0.9999, warmup_updates=50):
    print('[FT448] Loading avg_ema_last.pt for 448 fine-tune...')
    ckpt_path = Path('avg_ema_last.pt')
    assert ckpt_path.exists(), 'avg_ema_last.pt not found; run continuation first.'
    sd = torch.load(ckpt_path, map_location='cpu')
    model = timm.create_model('convnext_base.fb_in22k_ft_in1k', pretrained=False, num_classes=len(catid2idx))
    missing, unexpected = model.load_state_dict(sd['model'], strict=False)
    if missing or unexpected:
        print('[FT448][WARN] load_state_dict mismatch -> missing:', len(missing), 'unexpected:', len(unexpected))
    model.to(device).train()
    model = model.to(memory_format=torch.channels_last)
    ema = ModelEmaV2(model, decay=ema_decay, device=device if device.type=='cuda' else None)
    for ev, mv in zip(ema.module.state_dict().values(), model.state_dict().values()): ev.copy_(mv)

    train_dl, val_dl = build_loaders_448(batch_size=24, workers=8)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=wd, betas=(0.9,0.999), fused=(device.type=='cuda'))
    steps_per_epoch = len(train_dl)
    total_updates = epochs * steps_per_epoch
    sched = CosineLRScheduler(optimizer, t_initial=total_updates, lr_min=1e-6, warmup_t=warmup_updates, warmup_lr_init=1e-6, k_decay=1.0, t_in_epochs=False)
    scaler = torch.amp.GradScaler('cuda', enabled=(device.type=='cuda'))
    criterion = nn.CrossEntropyLoss(label_smoothing=0.1)

    best = -1.0; num_updates=0; saved=[]
    for ep in range(epochs):
        t0=time.time(); run=0.0; n=0
        for bi,(x,y,ids) in enumerate(train_dl):
            x=x.to(device, non_blocking=True).to(memory_format=torch.channels_last); y=y.to(device, non_blocking=True)
            optimizer.zero_grad(set_to_none=True)
            with torch.autocast(device_type='cuda', dtype=torch.float16, enabled=device.type=='cuda'):
                logits=model(x); loss=criterion(logits,y)
            scaler.scale(loss).backward()
            scaler.unscale_(optimizer); torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            scaler.step(optimizer); scaler.update()
            num_updates+=1; sched.step_update(num_updates); ema.update(model)
            run+=loss.item()*x.size(0); n+=x.size(0)
            if (bi+1)%100==0:
                print(f"[FT448] ep {ep+1}/{epochs} batch {bi+1}/{len(train_dl)} lr={optimizer.param_groups[0]['lr']:.2e} loss={run/max(1,n):.4f}", flush=True)
        print(f"[FT448] Epoch {ep+1} train_loss={run/max(1,n):.4f} time={time.time()-t0:.1f}s")
        acc,_ = evaluate_448(ema.module, val_dl)
        outp=f"ema_ft448_ep{ep+1}.pt"
        torch.save({'model': ema.module.state_dict(), 'acc': acc}, outp); saved.append(outp)
        if acc>best: best=acc; torch.save({'model': ema.module.state_dict(), 'acc': acc}, 'best_ft448.pt')
        # Optional: add a 2nd epoch only if improved
        if ep==0 and epochs>=2 and best<0.0001:
            print('[FT448] No improvement after 1 epoch; stopping early.')
            break
    print('[FT448] Averaging', len(saved), 'EMA checkpoints...')
    avg = average_sd(saved)
    torch.save({'model': avg, 'acc': best}, 'avg_ema_last_448.pt')
    print('[FT448] Saved avg_ema_last_448.pt (prioritize this in inference).')

print('FT-448 cell ready. After continuation produces avg_ema_last.pt, run fine_tune_448() then inference (update TTA to include 448).')

In [18]:
# Utility: Average saved continuation EMA checkpoints into avg_ema_last.pt
import glob, re, torch
from pathlib import Path

def average_saved_ema(last_n=8, pattern='ema_cont_ep*.pt', out_path='avg_ema_last.pt'):
    paths = sorted(glob.glob(pattern), key=lambda p: int(re.search(r'ep(\d+)', p).group(1)) if re.search(r'ep(\d+)', p) else -1)
    if not paths:
        print('[AVG] No ema_cont_ep*.pt files found.')
        return
    if last_n is not None and last_n > 0:
        paths = paths[-last_n:]
    print('[AVG] Averaging the following checkpoints:', paths)
    avg=None; n=0
    for p in paths:
        sd=torch.load(p, map_location='cpu')
        state=sd['model'] if isinstance(sd, dict) and 'model' in sd else sd
        if avg is None:
            avg={k: v.clone().float() for k,v in state.items()}
        else:
            for k in avg.keys():
                avg[k] += state[k].float()
        n+=1
    for k in avg.keys():
        avg[k] /= max(1,n)
    torch.save({'model': avg, 'acc': None, 'averaged_n': n}, out_path)
    print(f"[AVG] Saved {out_path} from {n} checkpoints")

print('Averaging utility ready. If you stop continuation early (e.g., after 8-9 epochs), run average_saved_ema(last_n=6 or 8).')

Averaging utility ready. If you stop continuation early (e.g., after 8-9 epochs), run average_saved_ema(last_n=6 or 8).


In [20]:
# Average last 8 EMA checkpoints now (fix glob shadowing)
import importlib
import glob as glob_module
globals()['glob'] = glob_module  # ensure average_saved_ema sees the module, not a function
import os
print("Found EMA ckpts:", sorted(glob_module.glob('ema_cont_ep*.pt')))
try:
    average_saved_ema(last_n=8, pattern='ema_cont_ep*.pt', out_path='avg_ema_last.pt')
except Exception as e:
    import traceback; traceback.print_exc()
print("avg_ema_last.pt exists:", os.path.exists('avg_ema_last.pt'))

Found EMA ckpts: ['ema_cont_ep1.pt', 'ema_cont_ep2.pt', 'ema_cont_ep3.pt', 'ema_cont_ep4.pt', 'ema_cont_ep5.pt', 'ema_cont_ep6.pt', 'ema_cont_ep7.pt', 'ema_cont_ep8.pt']
[AVG] Averaging the following checkpoints: ['ema_cont_ep1.pt', 'ema_cont_ep2.pt', 'ema_cont_ep3.pt', 'ema_cont_ep4.pt', 'ema_cont_ep5.pt', 'ema_cont_ep6.pt', 'ema_cont_ep7.pt', 'ema_cont_ep8.pt']


  sd=torch.load(p, map_location='cpu')


[AVG] Saved avg_ema_last.pt from 8 checkpoints
avg_ema_last.pt exists: True
