# Plan: iNaturalist 2019 (FGVC6) — Medal-Oriented Workflow

Objectives:
- Ship a strong, working baseline fast; iterate to medal.

Data & Metric:
- Train/Val/Test artifacts present: train_val2019.tar.gz, train2019.json, val2019.json, test2019.tar.gz, test2019.json.
- Metric: top-1 classification error (minimize). 1010 classes.

Validation:
- Use official val2019 as holdout for fast feedback. Avoid leakage.
- Later add K-fold on train+val via stratified split if beneficial, but keep official val for LB sanity.

Baseline Model:
- Single-GPU PyTorch with timm.
- Pretrained strong backbone (e.g., convnext_base, tf_efficientnet_b3_ns).
- Input size 224→320 later; AMP; label smoothing; cosine schedule; warmup.
- Augs: RandAugment/AutoAugment, Mixup/CutMix moderate, RandomResizedCrop, flips.
- Loss: CE with label smoothing (CE+LS) by default; BCE as an alternative.
- Save best on val top-1.

Pipeline:
1) Environment & GPU check.
2) Extract tars to /mnt if space allows; build PyTorch datasets from JSONs.
3) Train baseline (few epochs on 224 to validate pipeline).
4) Full train with tuned epochs (e.g., 25-40) with AMP, EMA.
5) Inference: TTA (e.g., 5-10 crops/flips).
6) Generate submission.csv matching sample format.

Risk/Time Controls:
- Log progress and elapsed per epoch; early stop if divergence.
- Cache checkpoints and logits.
- While training, consult experts for architecture/aug choices and whether to scale input/epochs.

Next:
- Run environment checks, parse JSONs, inspect class counts, then implement training script (train.py) and inference script (predict.py).
- Request expert review on plan before heavy training.

In [None]:
# Environment & GPU check; repo contents overview
import os, sys, json, time, shutil, subprocess, platform, psutil

def run(cmd):
    """Echo and execute *cmd*, returning ``(exit_code, combined_output)``.

    *cmd* is an argument list passed straight to ``subprocess.run``. stderr is
    merged into stdout and the captured text is printed. If the command cannot
    be launched (or anything else raises), the error is printed and
    ``(1, str(error))`` is returned instead of propagating the exception.
    """
    command_line = " ".join(cmd)
    print("$", command_line, flush=True)
    try:
        proc = subprocess.run(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,  # fold stderr into the captured stdout
            text=True,
        )
        captured = proc.stdout
        print(captured, flush=True)
    except Exception as exc:
        print("ERR:", exc, flush=True)
        return 1, str(exc)
    return proc.returncode, captured

# Report interpreter, platform, and working directory for the run log.
print(f"Python: {platform.python_version()} | Platform: {platform.platform()}")
print(f"CWD: {os.getcwd()}")
print("Checking GPU via nvidia-smi...")
# `|| true` keeps the shell exit status at 0 even if nvidia-smi fails or is missing.
run(['bash','-lc','nvidia-smi || true'])

print("Disk usage (df -h):")
run(['bash','-lc','df -h'])

print("Listing repo files with sizes and mtimes:")
run(['bash','-lc','ls -alh --time-style=+%Y-%m-%dT%H:%M:%S'])

print("CPU/Memory info:")
print(f"CPU count: {psutil.cpu_count(logical=True)} | RAM: {psutil.virtual_memory().total/1e9:.1f} GB")

print("If torch not installed, we will install cu121 stack next.")
# Best-effort probe: an import failure is reported rather than raised, so this
# cell still completes when torch is absent.
try:
    import torch, torchvision
    print("torch:", torch.__version__, "CUDA avail:", torch.cuda.is_available(), "CUDA build:", getattr(torch.version,'cuda',None))
    if torch.cuda.is_available():
        print("GPU:", torch.cuda.get_device_name(0))
except Exception as e:
    print("torch not importable yet:", e)

In [None]:
# Install CUDA 12.1-compatible PyTorch stack and verify GPU
import os, sys, subprocess, shutil, time, textwrap

def pip(*args):
    """Run ``python -m pip <args>`` using the current interpreter.

    The arguments are echoed first, prefixed with ``>``. Because the command is
    run with ``check=True``, a non-zero pip exit status raises
    ``subprocess.CalledProcessError``.
    """
    print(">", *args, flush=True)
    pip_cmd = [sys.executable, "-m", "pip"]
    pip_cmd.extend(args)
    subprocess.run(pip_cmd, check=True)

# 0) Uninstall any preexisting torch stack
for pkg in ("torch","torchvision","torchaudio"):
    subprocess.run([sys.executable, "-m", "pip", "uninstall", "-y", pkg], check=False)

# Clean stray site dirs that can shadow correct wheels (idempotent)
for d in (
    "/app/.pip-target/torch",
    "/app/.pip-target/torch-2.8.0.dist-info",
    "/app/.pip-target/torch-2.4.1.dist-info",
    "/app/.pip-target/torchvision",
    "/app/.pip-target/torchvision-0.23.0.dist-info",
    "/app/.pip-target/torchvision-0.19.1.dist-info",
    "/app/.pip-target/torchaudio",
    "/app/.pip-target/torchaudio-2.8.0.dist-info",
    "/app/.pip-target/torchaudio-2.4.1.dist-info",
    "/app/.pip-target/torchgen",
    "/app/.pip-target/functorch",
):
    if os.path.exists(d):
        print("Removing", d, flush=True)
        shutil.rmtree(d, ignore_errors=True)

# 1) Install EXACT cu121 torch stack
# The PyTorch cu121 wheel index is the primary index; PyPI is added as an extra
# index so the remaining dependencies can still be resolved.
pip("install",
    "--index-url", "https://download.pytorch.org/whl/cu121",
    "--extra-index-url", "https://pypi.org/simple",
    "torch==2.4.1", "torchvision==0.19.1", "torchaudio==2.4.1")

# 2) Freeze versions
# constraints.txt is passed with `-c` below so later installs cannot move the torch stack.
from pathlib import Path
Path("constraints.txt").write_text("\n".join([
    "torch==2.4.1",
    "torchvision==0.19.1",
    "torchaudio==2.4.1",
]))

# 3) Install non-torch deps
# NOTE(review): a later cell repeats this install without pillow-simd/jpeg4py,
# which suggests those two packages did not install cleanly here — confirm.
pip("install", "-c", "constraints.txt",
    "timm==1.0.9",
    "albumentations==1.4.14",
    "opencv-python-headless",
    "scikit-learn",
    "pillow-simd",
    "jpeg4py",
    "pyyaml",
    "einops",
    "wandb",
    "--upgrade-strategy", "only-if-needed")

# 4) Sanity check
import torch
print("torch:", torch.__version__, "built CUDA:", getattr(torch.version, "cuda", None))
print("CUDA available:", torch.cuda.is_available())
assert str(getattr(torch.version, "cuda", "")).startswith("12.1"), f"Wrong CUDA build: {torch.version.cuda}"
assert torch.cuda.is_available(), "CUDA not available"
print("GPU:", torch.cuda.get_device_name(0))

print("Environment ready.")

In [None]:
# Install remaining deps without pillow-simd/jpeg4py (use standard Pillow) and verify torch GPU
import sys, subprocess

def pip(*args):
    """Invoke pip as a module of the running interpreter with *args*.

    Echoes the arguments (prefixed with ``>``) before running. A failing pip
    command raises ``subprocess.CalledProcessError`` because ``check=True`` is
    used.
    """
    print(">", *args, flush=True)
    base_cmd = [sys.executable, "-m", "pip"]
    subprocess.run(base_cmd + list(args), check=True)

pip("install", "-c", "constraints.txt",
    "timm==1.0.9",
    "albumentations==1.4.14",
    "opencv-python-headless",
    "scikit-learn",
    "pyyaml",
    "einops",
    "wandb",
    "--upgrade-strategy", "only-if-needed")

import torch
print("torch:", torch.__version__, "built CUDA:", getattr(torch.version, "cuda", None))
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))
print("Deps ready.")

In [None]:
# Extract datasets to project dir (./inat2019) using Python tarfile (auto-detect compression) and verify a few files
import os, json, time, random, tarfile, pathlib

root = os.path.abspath("./inat2019")
train_root = os.path.join(root, "train_val2019")
test_root = os.path.join(root, "test2019")
os.makedirs(root, exist_ok=True)

def extract_tar_auto(tar_path, dest_dir):
    """Extract every member of *tar_path* into *dest_dir*, logging progress.

    The archive is opened with mode ``"r:*"`` so tarfile auto-detects the
    compression. Members are extracted one at a time so that progress can be
    printed every 5000 members and after the last one.

    Path safety: each member's resolved target must stay inside *dest_dir*;
    otherwise ``ValueError`` is raised before anything is written for that
    member. On Python versions that provide tarfile extraction filters, the
    ``"data"`` filter is applied as well.

    Args:
        tar_path: Path of the (optionally compressed) tar archive.
        dest_dir: Directory the members are extracted into.

    Raises:
        ValueError: If a member would be written outside *dest_dir*.
        tarfile.TarError: If the archive is unreadable or a member is rejected
            by the extraction filter.
    """
    print(f"Extracting {tar_path} -> {dest_dir}", flush=True)
    t0 = time.time()
    # Extraction filters only exist on newer / security-patched Pythons, so
    # feature-detect instead of assuming the keyword is accepted.
    extract_kwargs = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}
    dest_real = os.path.realpath(dest_dir)
    with tarfile.open(tar_path, mode="r:*") as tf:
        members = tf.getmembers()
        total = len(members)
        print(f"Members: {total}")
        for i, m in enumerate(members, 1):
            # Reject "../" and absolute member names that would escape dest_dir.
            target = os.path.realpath(os.path.join(dest_real, m.name))
            if os.path.commonpath([dest_real, target]) != dest_real:
                raise ValueError(f"Unsafe path in archive {tar_path}: {m.name!r}")
            tf.extract(m, path=dest_dir, **extract_kwargs)
            if i % 5000 == 0 or i == total:
                print(f"  {i}/{total} extracted ({(time.time()-t0)/60:.1f} min)", flush=True)
    print(f"Done {tar_path} in {(time.time()-t0)/60:.1f} min")

start=time.time()
# Extract only when the target directory is absent or empty, so re-running the cell is cheap.
# NOTE(review): the archives are extracted into `root`, while the guard inspects
# `train_root` / `test_root`. If an archive's top-level directory has a different
# name, the guard directory stays empty and extraction repeats on every run.
if not os.path.exists(train_root) or (os.path.isdir(train_root) and not any(os.scandir(train_root))):
    os.makedirs(train_root, exist_ok=True)
    extract_tar_auto("train_val2019.tar.gz", root)
else:
    print("Train/Val already extracted:", train_root)

if not os.path.exists(test_root) or (os.path.isdir(test_root) and not any(os.scandir(test_root))):
    os.makedirs(test_root, exist_ok=True)
    extract_tar_auto("test2019.tar.gz", root)
else:
    print("Test already extracted:", test_root)

print(f"Extraction stage took {(time.time()-start)/60:.1f} min total")

# Quick JSON and file existence sanity checks
with open("train2019.json","r") as f: train_json=json.load(f)
with open("val2019.json","r") as f: val_json=json.load(f)
with open("test2019.json","r") as f: test_json=json.load(f)

def rel_paths_from_json(js):
    """Return the ``file_name`` of every image entry in *js*.

    Args:
        js: Either a COCO-style dict holding an ``"images"`` list, or a bare
            list of image dicts. A dict without ``"images"`` yields ``[]``.

    Returns:
        List of ``file_name`` values, in input order.
    """
    # A bare list has no .get(); branch on the container type explicitly.
    images = js.get("images", []) if isinstance(js, dict) else js
    return [img["file_name"] for img in images]

train_files = rel_paths_from_json(train_json)
val_files = rel_paths_from_json(val_json)
test_files = rel_paths_from_json(test_json)

def check_some(files, base):
    """Spot-check that a random sample of *files* exists under *base*.

    Up to 10 relative paths are drawn with ``random.sample``. Each missing path
    is printed, followed by a summary line with the number found. Prints a
    notice and returns early when *files* is empty. Returns ``None``.
    """
    if not files:
        print("No files listed in JSON")
        return
    sample_size = min(10, len(files))
    found = 0
    for rel in random.sample(files, sample_size):
        candidate = os.path.join(base, rel)
        if not os.path.exists(candidate):
            print("Missing:", candidate)
            continue
        found += 1
    print(f"Existence check: {found}/{sample_size} ok")

print("Example train file:", train_files[0] if train_files else None)
print("Example test file:", test_files[0] if test_files else None)

# Check existence relative to extracted root
check_some(train_files, base=root)
check_some(test_files, base=root)

print("Ready for dataset building.")

In [None]:
# Inspect extracted directory structure and locate sample files
import os, json, random, glob

root = os.path.abspath("./inat2019")
print("Root:", root)
print("Top-level entries:", os.listdir(root))
for d in os.listdir(root):
    p = os.path.join(root, d)
    print(d, "-> dir:", os.path.isdir(p), "files:", sum(len(files) for _,_,files in os.walk(p)))

with open("train2019.json","r") as f: train_json=json.load(f)
with open("test2019.json","r") as f: test_json=json.load(f)

train_files = [img["file_name"] for img in train_json["images"]]
test_files = [img["file_name"] for img in test_json["images"]]
ex_train = train_files[0]
ex_test = test_files[0]
print("Example train JSON path:", ex_train)
print("Example test JSON path:", ex_test)

def exists_at_root(rel):
    """Return True when *rel*, joined onto the module-level ``root``, exists on disk."""
    candidate = os.path.join(root, rel)
    return os.path.exists(candidate)

print("Exists at root (train ex):", exists_at_root(ex_train))
print("Exists at root (test ex):", exists_at_root(ex_test))

# If not found, try to locate by basename anywhere under root
def locate_by_basename(rel):
    """Find files under the module-level ``root`` whose name matches *rel*'s basename.

    The directory part of *rel* is ignored; the search is a recursive glob.
    Returns at most the first five matching paths (an empty list if none).
    """
    pattern = os.path.join(root, "**", os.path.basename(rel))
    matches = glob.glob(pattern, recursive=True)
    return matches[:5]

if not exists_at_root(ex_train):
    print("Searching for train basename...")
    print(locate_by_basename(ex_train))
if not exists_at_root(ex_test):
    print("Searching for test basename...")
    print(locate_by_basename(ex_test))

print("Done structure inspection.")

In [1]:
# Build datasets with robust path resolution + basename index (filters missing files)
import os, json, time, math, random
from pathlib import Path
from typing import Dict, Tuple, List
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image, ImageFile
import torchvision.transforms as T

ImageFile.LOAD_TRUNCATED_IMAGES = True
torch.backends.cudnn.benchmark = True

root = os.path.abspath("./inat2019")
print("Root:", root)

# Load JSONs
with open("train2019.json","r") as f: train_json = json.load(f)
with open("val2019.json","r") as f: val_json = json.load(f)

# Build category id mapping (sparse category_id -> dense [0..C-1])
# Sorting the ids makes the dense index assignment deterministic.
cat_ids = sorted({c["id"] for c in train_json["categories"]})
catid_to_idx: Dict[int,int] = {cid:i for i, cid in enumerate(cat_ids)}
# Inverse mapping: dense index -> original category id.
idx_to_catid: Dict[int,int] = {i:cid for cid,i in catid_to_idx.items()}
num_classes = len(catid_to_idx)
print("Classes:", num_classes)

# COCO-style join images and anns (1 ann per image expected)
def build_records(js: dict) -> List[Tuple[str,int]]:
    """Join COCO-style images with their annotations into ``(file_name, class_idx)`` pairs.

    Images without an annotation, or whose ``category_id`` is not a key of the
    module-level ``catid_to_idx``, are skipped. When several annotations share
    an ``image_id``, the last one in the list is used.
    """
    ann_for_image = {}
    for ann in js["annotations"]:
        ann_for_image[ann["image_id"]] = ann
    records = []
    for image in js["images"]:
        image_id, file_name = image["id"], image["file_name"]
        annotation = ann_for_image.get(image_id)
        if annotation is not None:
            category = annotation["category_id"]
            if category in catid_to_idx:
                records.append((file_name, catid_to_idx[category]))
    return records

# Build a basename -> full path index (handles any layout differences robustly)
def build_basename_index(root_dir: str):
    """Walk *root_dir* and map each image basename to its full path.

    Only names ending in .jpg/.jpeg/.png (case-insensitive) are indexed. When a
    basename occurs more than once, the first path encountered is kept and the
    repeat is counted as a duplicate. A one-line summary is printed.

    Returns:
        Dict of basename -> full path.
    """
    started = time.time()
    image_exts = ('.jpg', '.jpeg', '.png')
    index: Dict[str, str] = {}
    n_images = 0
    n_dups = 0
    for folder, _subdirs, names in os.walk(root_dir):
        for name in names:
            if name.lower().endswith(image_exts):
                n_images += 1
                if name not in index:
                    index[name] = os.path.join(folder, name)
                else:
                    n_dups += 1
    print(f"Basename index: {n_images} files, {n_dups} dups, built in {time.time()-started:.1f}s")
    return index

BASENAME_INDEX = build_basename_index(root)

# Robust path resolver because extracted layout differs from JSON top-level dirs
def resolve_path(rel: str) -> str:
    """Map a JSON-relative image path to a file on disk, trying several layouts.

    Candidates are tried in order and the first that exists is returned:
      1. ``root/rel`` as given;
      2. ``root/rel`` with the first path component dropped;
      3. the module-level ``BASENAME_INDEX`` entry for the basename;
      4. ``root/<basename>``.
    If none exists, candidate 1 is returned anyway so callers can detect the
    miss with their own existence check.
    """
    primary = os.path.join(root, rel)

    def _candidates():
        yield primary
        _first, slash, remainder = rel.partition('/')
        if slash:
            yield os.path.join(root, remainder)
        name = os.path.basename(rel)
        indexed = BASENAME_INDEX.get(name, None)
        if indexed is not None:
            yield indexed
        yield os.path.join(root, name)

    for candidate in _candidates():
        if os.path.exists(candidate):
            return candidate
    return primary  # may not exist; will be filtered out

train_recs_raw = build_records(train_json)
val_recs_raw = build_records(val_json)
print("Train records (raw):", len(train_recs_raw), "Val records (raw):", len(val_recs_raw))

# Filter out missing files to avoid DataLoader errors
def filter_existing(recs: List[Tuple[str,int]]):
    """Split *recs* into those whose image resolves to an existing file and a miss count.

    Each record is ``(relative_path, label)``; the path is resolved with
    ``resolve_path``. Kept records retain their original relative path.

    Returns:
        ``(kept_records, number_missing)``.
    """
    present = [
        (rel, y)
        for rel, y in recs
        if os.path.exists(resolve_path(rel))
    ]
    return present, len(recs) - len(present)

t0=time.time()
train_recs, miss_tr = filter_existing(train_recs_raw)
val_recs, miss_va = filter_existing(val_recs_raw)
print(f"Filtered: train {len(train_recs)} kept, {miss_tr} missing; val {len(val_recs)} kept, {miss_va} missing; took {time.time()-t0:.1f}s")

class INatDataset(Dataset):
    """Labelled image dataset over ``(relative_path, class_index)`` records.

    With ``train=True`` each image gets a random resized crop to 224 and a
    random horizontal flip; otherwise it is resized to 256 (bicubic) and
    centre-cropped to 224. Both pipelines end with ``ToTensor`` and the same
    mean/std normalisation. Paths are resolved with ``resolve_path`` on access.
    """
    def __init__(self, recs, train: bool):
        self.recs = recs
        self.train = train
        # Tail shared by both pipelines: tensor conversion + normalisation.
        to_model_input = [
            T.ToTensor(),
            T.Normalize(mean=[0.485,0.456,0.406], std=[0.229,0.224,0.225]),
        ]
        if train:
            spatial = [
                T.RandomResizedCrop(224, scale=(0.5, 1.0), ratio=(0.75, 1.33)),
                T.RandomHorizontalFlip(),
            ]
        else:
            spatial = [
                T.Resize(256, interpolation=T.InterpolationMode.BICUBIC),
                T.CenterCrop(224),
            ]
        self.tf = T.Compose(spatial + to_model_input)

    def __len__(self):
        return len(self.recs)

    def __getitem__(self, idx):
        rel, y = self.recs[idx]
        with Image.open(resolve_path(rel)) as handle:
            tensor = self.tf(handle.convert('RGB'))
        return tensor, y

# Instantiate datasets and loaders
train_ds = INatDataset(train_recs, train=True)
val_ds = INatDataset(val_recs, train=False)

batch_size = 64
num_workers = min(12, os.cpu_count() or 8)
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=num_workers, pin_memory=True, persistent_workers=True)
val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False, num_workers=num_workers, pin_memory=True, persistent_workers=True)

# Quick smoke: iterate a batch
t0=time.time()
xb, yb = next(iter(train_loader))
print("Train batch:", xb.shape, yb.shape, "time:", f"{time.time()-t0:.2f}s")
if len(val_ds) > 0:
    t0=time.time()
    xb2, yb2 = next(iter(val_loader))
    print("Val batch:", xb2.shape, yb2.shape, "time:", f"{time.time()-t0:.2f}s")
else:
    print("Val dataset is empty after filtering; will adjust resolver if needed.")

# Torch/CUDA sanity
print("Torch:", torch.__version__, "CUDA avail:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))
print("Dataset smoke OK.")

Root: /var/lib/simon/agent_run_states/inaturalist-2019-fgvc6-20250927-163935/inat2019


Classes: 1010


Basename index: 265213 files, 0 dups, built in 0.4s
Train records (raw): 232999 Val records (raw): 3030


Filtered: train 232999 kept, 0 missing; val 0 kept, 3030 missing; took 3.0s


Train batch: torch.Size([64, 3, 224, 224]) torch.Size([64]) time: 0.98s
Val dataset is empty after filtering; will adjust resolver if needed.
Torch: 2.4.1+cu121 CUDA avail: True
GPU: NVIDIA A10-24Q
Dataset smoke OK.


In [None]:
# Baseline training (stratified holdout from train) + inference to submission
import os, time, math, random, json
from collections import defaultdict
import torch
import torch.nn as nn
from torch.utils.data import Subset, DataLoader
from torchvision import models
import torchvision.transforms as T

# Reuse train_ds, resolve_path, idx_to_catid from previous cell
assert 'train_ds' in globals(), 'Run previous cell to build datasets/mappings.'
assert 'resolve_path' in globals() and 'idx_to_catid' in globals(), 'Mappings/resolver missing.'

# Build a small stratified validation split from train (since val images are not present here)
labels = [y for _, y in train_ds.recs]
# Group record indices by class label so the holdout can be drawn per class.
by_cls = defaultdict(list)
for i, y in enumerate(labels):
    by_cls[y].append(i)
val_idx = []
train_idx = []
random.seed(42)  # fixed seed so the split is reproducible
per_cls_val = 3  # small, fast sanity; later increase
for c, idxs in by_cls.items():
    random.shuffle(idxs)
    # The first `per_cls_val` shuffled indices go to validation, the rest to training.
    v = idxs[:per_cls_val]
    t = idxs[per_cls_val:]
    val_idx.extend(v)
    train_idx.extend(t)
print(f"Holdout sizes -> train: {len(train_idx)}, val: {len(val_idx)}")

train_sub = Subset(train_ds, train_idx)
# Validation records are wrapped in a fresh INatDataset(train=False) so they use
# the eval transforms instead of train_ds's training transforms.
val_sub = Subset(INatDataset([(train_ds.recs[i][0], train_ds.recs[i][1]) for i in val_idx], train=False), list(range(len(val_idx))))

batch_size = 64
num_workers = min(12, os.cpu_count() or 8)
train_loader = DataLoader(train_sub, batch_size=batch_size, shuffle=True, num_workers=num_workers, pin_memory=True, persistent_workers=True)
val_loader = DataLoader(val_sub, batch_size=batch_size, shuffle=False, num_workers=num_workers, pin_memory=True, persistent_workers=True)

# Model: torchvision ConvNeXt-Base pretrained, replace head to 1010 classes
num_classes = len(idx_to_catid)
print('Building model...')
model = models.convnext_base(weights=models.ConvNeXt_Base_Weights.DEFAULT)
in_features = model.classifier[2].in_features
model.classifier[2] = nn.Linear(in_features, num_classes)
model = model.to('cuda' if torch.cuda.is_available() else 'cpu', memory_format=torch.channels_last)

optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4, weight_decay=0.05)
criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
scaler = torch.cuda.amp.GradScaler(enabled=torch.cuda.is_available())

def evaluate(model, loader):
    """Return top-1 accuracy of *model* over *loader*.

    The model is switched to eval mode (and left there). Batches are moved to
    the device of the model's first parameter and converted to channels_last.
    Autocast is enabled only when CUDA is available.

    Args:
        model: Module mapping an image batch to per-class logits.
        loader: Iterable of ``(images, labels)`` batches.

    Returns:
        Fraction of correct predictions as a float; ``0.0`` for an empty loader.
    """
    model.eval()
    correct = 0
    total = 0
    device = next(model.parameters()).device
    with torch.no_grad():
        for xb, yb in loader:
            xb = xb.to(device, non_blocking=True).to(memory_format=torch.channels_last)
            yb = yb.to(device, non_blocking=True)
            # torch.amp.autocast replaces the deprecated torch.cuda.amp.autocast.
            with torch.amp.autocast(device_type='cuda', enabled=torch.cuda.is_available()):
                logits = model(xb)
            preds = logits.argmax(1)
            # Accumulate as a tensor; converting once at the end avoids a
            # host/device sync on every batch.
            correct = correct + (preds == yb).sum()
            total += yb.size(0)
    acc = int(correct) / max(1,total)
    return acc

# Train a very short sanity run (1 epoch); extend later
epochs = 1
device = next(model.parameters()).device
best_acc = 0.0
best_path = 'baseline_convnext_base_224.pth'
t_start = time.time()
# Note: no LR scheduler is stepped in this loop; the optimizer's LR stays constant.
for epoch in range(epochs):
    # evaluate() leaves the model in eval mode, so training mode is re-enabled each epoch.
    model.train()
    epoch_start = time.time()
    seen = 0
    for it, (xb, yb) in enumerate(train_loader):
        xb = xb.to(device, non_blocking=True).to(memory_format=torch.channels_last)
        yb = yb.to(device, non_blocking=True)
        optimizer.zero_grad(set_to_none=True)
        # Forward pass and loss run under autocast (enabled only when CUDA is available).
        with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):
            logits = model(xb)
            loss = criterion(logits, yb)
        # GradScaler scales the loss before backward, then steps the optimizer and updates its scale.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        seen += yb.size(0)
        if (it+1) % 50 == 0:
            print(f"Epoch {epoch+1}/{epochs} it {it+1} seen {seen} loss {loss.item():.4f} elapsed {time.time()-epoch_start:.1f}s", flush=True)
    val_acc = evaluate(model, val_loader)
    print(f"Epoch {epoch+1} val_acc {val_acc:.4f} epoch_time {time.time()-epoch_start:.1f}s total_elapsed {time.time()-t_start:.1f}s", flush=True)
    # Checkpoint only on strict improvement of validation accuracy.
    if val_acc > best_acc:
        best_acc = val_acc
        torch.save({'model': model.state_dict(), 'num_classes': num_classes}, best_path)
        print(f"Saved best to {best_path}")

print(f"Training done. Best val_acc {best_acc:.4f}")

# Inference on test set and write submission.csv
print('Preparing test dataset...')
with open('test2019.json','r') as f: test_json = json.load(f)
test_images = test_json['images']
test_file_names = [im['file_name'] for im in test_images]
test_ids = [im['id'] for im in test_images]

class INatTestDataset(torch.utils.data.Dataset):
    """Unlabelled test images; each item is just the transformed image tensor.

    Images are resized to 256 (bicubic), centre-cropped to 224, converted to a
    tensor, and normalised. Paths are resolved with ``resolve_path`` on access.
    """
    def __init__(self, rel_paths):
        self.rel_paths = rel_paths
        resize_and_crop = [
            T.Resize(256, interpolation=T.InterpolationMode.BICUBIC),
            T.CenterCrop(224),
        ]
        to_model_input = [
            T.ToTensor(),
            T.Normalize(mean=[0.485,0.456,0.406], std=[0.229,0.224,0.225]),
        ]
        self.tf = T.Compose(resize_and_crop + to_model_input)

    def __len__(self):
        return len(self.rel_paths)

    def __getitem__(self, idx):
        from PIL import Image
        source = resolve_path(self.rel_paths[idx])
        with Image.open(source) as img:
            return self.tf(img.convert('RGB'))

test_ds = INatTestDataset(test_file_names)
test_loader = DataLoader(test_ds, batch_size=batch_size, shuffle=False, num_workers=num_workers, pin_memory=True, persistent_workers=True)

# Load best weights if saved
if os.path.exists(best_path):
    ckpt = torch.load(best_path, map_location=device)
    # NOTE(review): strict=False ignores missing/unexpected keys instead of raising —
    # confirm that leniency is intended for a checkpoint saved from this same model.
    model.load_state_dict(ckpt['model'], strict=False)
model.eval()

# Predictions are collected in loader order (shuffle=False), which the length
# check below relies on to line up with test_ids.
all_preds = []
with torch.no_grad():
    for it, xb in enumerate(test_loader):
        xb = xb.to(device, non_blocking=True).to(memory_format=torch.channels_last)
        with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):
            logits = model(xb)
        preds = logits.argmax(1).detach().cpu().tolist()
        all_preds.extend(preds)
        if (it+1) % 100 == 0:
            print(f"Infer it {it+1}/{math.ceil(len(test_ds)/batch_size)}", flush=True)

assert len(all_preds) == len(test_ids), 'Prediction length mismatch'

# Map model idx -> original category_id
pred_cat_ids = [idx_to_catid[p] for p in all_preds]

# Build submission matching sample format (image_id, category_id)
import pandas as pd
sub = pd.DataFrame({'image_id': test_ids, 'category_id': pred_cat_ids})
sub.to_csv('submission.csv', index=False)
print('Wrote submission.csv with', len(sub), 'rows')

In [2]:
# Train timm EfficientNetV2-M @320 with AMP + EMA on stratified holdout; then infer test and write submission.csv
import os, math, time, random, json, subprocess, sys, importlib
from collections import defaultdict
import torch
import torch.nn as nn
from torch.utils.data import Subset, DataLoader
import torchvision.transforms as T

# Improve CUDA memory handling
# setdefault keeps any value already present in the environment.
# NOTE(review): torch is imported above and earlier cells may already have used
# CUDA; presumably this only takes effect if set before the CUDA allocator
# initializes — confirm, or set it before the first torch import in a fresh kernel.
os.environ.setdefault('PYTORCH_CUDA_ALLOC_CONF', 'expandable_segments:True')

# Ensure huggingface_hub available BEFORE importing timm (timm checks at import-time)
try:
    import huggingface_hub  # noqa: F401
except Exception:
    subprocess.run([sys.executable, '-m', 'pip', 'install', 'huggingface_hub==0.24.6', '--no-deps'], check=True)
    import huggingface_hub  # noqa: F401

# Safe timm import without touching torch stack
try:
    import timm
except Exception:
    subprocess.run([sys.executable, '-m', 'pip', 'install', 'timm==1.0.9', '--no-deps'], check=True)
    import timm

from timm.loss import SoftTargetCrossEntropy
from timm.data.mixup import Mixup
from timm.utils import ModelEmaV2

device = 'cuda' if torch.cuda.is_available() else 'cpu'
torch.backends.cudnn.benchmark = True
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
if hasattr(torch, 'set_float32_matmul_precision'):
    torch.set_float32_matmul_precision('medium')

# Reuse train_ds, resolve_path, idx_to_catid from previous cells
assert 'train_ds' in globals() and 'resolve_path' in globals() and 'idx_to_catid' in globals()

# Build larger stratified holdout (10/class) from train records
labels = [y for _, y in train_ds.recs]
by_cls = defaultdict(list)
for i, y in enumerate(labels):
    by_cls[y].append(i)
val_idx, tr_idx = [], []
random.seed(42)
per_cls_val = 10
for c, idxs in by_cls.items():
    random.shuffle(idxs)
    v = idxs[:per_cls_val]
    t = idxs[per_cls_val:]
    val_idx.extend(v)
    tr_idx.extend(t)
print(f"Holdout sizes -> train: {len(tr_idx)}, val: {len(val_idx)}")

# 320px transforms
train_tf = T.Compose([
    T.RandomResizedCrop(320, scale=(0.2, 1.0), ratio=(0.75, 1.33), interpolation=T.InterpolationMode.BICUBIC),
    T.RandAugment(num_ops=2, magnitude=9),
    T.RandomHorizontalFlip(),
    T.ToTensor(),
    T.Normalize([0.485,0.456,0.406],[0.229,0.224,0.225]),
    T.RandomErasing(p=0.25, value='random')
])
val_tf = T.Compose([
    T.Resize(352, interpolation=T.InterpolationMode.BICUBIC),
    T.CenterCrop(320),
    T.ToTensor(),
    T.Normalize([0.485,0.456,0.406],[0.229,0.224,0.225]),
])

class WrappedDS(torch.utils.data.Dataset):
    """View over ``base.recs`` restricted to *indices*, with 320px transforms.

    Items are ``(tensor, label)``. The module-level ``train_tf`` is applied
    when ``train`` is truthy, otherwise ``val_tf``. Paths are resolved with
    ``resolve_path`` on access.
    """
    def __init__(self, base, indices, train):
        self.base, self.idxs, self.train = base, indices, train

    def __len__(self):
        return len(self.idxs)

    def __getitem__(self, k):
        from PIL import Image
        rel, y = self.base.recs[self.idxs[k]]
        transform = train_tf if self.train else val_tf
        with Image.open(resolve_path(rel)) as img:
            x = transform(img.convert('RGB'))
        return x, y

# Model FIRST to control CUDA memory; build loaders after
num_classes = len(idx_to_catid)
print('Building timm model...')
try:
    model = timm.create_model(
        'tf_efficientnetv2_m.in21k_ft_in1k',
        pretrained=True,
        num_classes=num_classes,
        drop_path_rate=0.3,
        pretrained_cfg_overlay={'hf_hub_id': None}
    )
except Exception as e:
    print('Pretrained load w/ URL fallback failed, retrying default...', e)
    try:
        model = timm.create_model('tf_efficientnetv2_m.in21k_ft_in1k', pretrained=True, num_classes=num_classes, drop_path_rate=0.3)
    except Exception as e2:
        print('Retry failed, falling back to pretrained=False (sanity only):', e2)
        model = timm.create_model('tf_efficientnetv2_m.in21k_ft_in1k', pretrained=False, num_classes=num_classes, drop_path_rate=0.3)

# Free any cached GPU memory before moving model
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    try:
        print('Before to():', torch.cuda.memory_summary(abbreviated=True))
    except Exception:
        pass

if hasattr(model, 'set_grad_checkpointing'):
    model.set_grad_checkpointing(True)
model = model.to(device)  # defer channels_last until after to() to reduce transient copies
model = model.to(memory_format=torch.channels_last)

# Optimizer & LR schedule
batch_size = 48  # further reduced; build loaders after model to minimize CUDA pressure
base_lr = 3e-4 * (batch_size/64)  # 3e-4 scaled linearly by batch_size relative to 64
optimizer = torch.optim.AdamW(model.parameters(), lr=base_lr, weight_decay=0.05, betas=(0.9,0.999))
epochs = 2  # quick sanity; extend to 35 later
warmup_epochs = 1  # use 5 when epochs=35
cosine_min_lr = 1e-6  # floor added to the cosine term in lr_at_epoch

def lr_at_epoch(ep):
    """Learning rate for zero-based epoch *ep*: linear warmup, then cosine decay.

    Reads the module-level ``base_lr``, ``warmup_epochs``, ``epochs`` and
    ``cosine_min_lr``. During warmup the rate is
    ``base_lr * (ep + 1) / warmup_epochs``, so the last warmup epoch already
    runs at ``base_lr`` (with ``warmup_epochs == 1`` epoch 0 is at full rate).
    Afterwards the rate follows a half cosine from ``base_lr`` towards
    ``cosine_min_lr`` across the remaining epochs.
    """
    in_warmup = ep < warmup_epochs
    if in_warmup:
        return base_lr * (ep+1)/max(1,warmup_epochs)
    decay_span = max(1,(epochs - warmup_epochs))
    progress = (ep - warmup_epochs)/decay_span
    cosine_factor = 1 + math.cos(math.pi*progress)
    return cosine_min_lr + 0.5*(base_lr - cosine_min_lr)*cosine_factor

# Dataloaders (built after the model), with conservative worker settings to avoid fragmentation
num_workers = min(8, os.cpu_count() or 8)
train_sub = WrappedDS(train_ds, tr_idx, train=True)
val_sub = WrappedDS(train_ds, val_idx, train=False)
train_loader = DataLoader(train_sub, batch_size=batch_size, shuffle=True, num_workers=num_workers, pin_memory=True, persistent_workers=False, drop_last=True, prefetch_factor=2)
val_loader = DataLoader(val_sub, batch_size=batch_size, shuffle=False, num_workers=num_workers, pin_memory=True, persistent_workers=False, prefetch_factor=2)

# Mixup/CutMix and loss (lighter, prob=0.8)
# Label smoothing is configured inside Mixup here, so the soft-target loss below
# is used without its own smoothing argument.
mixup_fn = Mixup(
    mixup_alpha=0.2,
    cutmix_alpha=0.3,
    prob=0.8,
    switch_prob=0.5,
    label_smoothing=0.1,
    num_classes=num_classes
)
use_mixup = True
# Soft-target CE when mixup is on; plain CE with label smoothing otherwise.
criterion = SoftTargetCrossEntropy() if use_mixup else nn.CrossEntropyLoss(label_smoothing=0.1)
scaler = torch.amp.GradScaler('cuda' if torch.cuda.is_available() else 'cpu')

# EMA with timm
# NOTE(review): decay=0.9998 averages over a long horizon; with epochs=2 the EMA
# weights may still lag the raw model noticeably — confirm for short sanity runs.
ema = ModelEmaV2(model, decay=0.9998)

def evaluate(m, loader):
    """Return top-1 accuracy of *m* over *loader* (``0.0`` if it yields nothing).

    Puts *m* in eval mode and leaves it there. Batches are moved to the
    module-level ``device`` and converted to channels_last. Runs under
    ``no_grad`` with autocast enabled only when CUDA is available.
    """
    m.eval()
    n_correct, n_seen = 0, 0
    amp_ctx = torch.amp.autocast(device_type='cuda', enabled=torch.cuda.is_available())
    with torch.no_grad(), amp_ctx:
        for images, targets in loader:
            images = images.to(device, non_blocking=True).to(memory_format=torch.channels_last)
            targets = targets.to(device, non_blocking=True)
            hits = m(images).argmax(1) == targets
            n_correct += hits.sum().item()
            n_seen += targets.size(0)
    return n_correct / max(1,n_seen)

best_acc = 0.0
best_path = 'effv2m_320_ema.pth'
t0 = time.time()
for epoch in range(epochs):
    # The LR is set once per epoch from lr_at_epoch; it does not change within an epoch.
    for pg in optimizer.param_groups:
        pg['lr'] = lr_at_epoch(epoch)
    model.train()
    ep_start = time.time()
    seen = 0
    for it, (xb, yb) in enumerate(train_loader):
        xb = xb.to(device, non_blocking=True).to(memory_format=torch.channels_last)
        yb = yb.to(device, non_blocking=True)
        optimizer.zero_grad(set_to_none=True)
        with torch.amp.autocast(device_type='cuda', enabled=torch.cuda.is_available()):
            if use_mixup:
                # mixup_fn returns mixed images and soft targets for the soft-target loss.
                mx, my = mixup_fn(xb, yb)
                logits = model(mx)
                loss = criterion(logits, my)
            else:
                logits = model(xb)
                loss = criterion(logits, yb)
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        # The EMA copy is updated after every optimizer step.
        ema.update(model)
        seen += yb.size(0)
        if (it+1) % 100 == 0:
            print(f"Ep {epoch+1}/{epochs} it {it+1} seen {seen} loss {loss.item():.4f} lr {optimizer.param_groups[0]['lr']:.2e} elapsed {time.time()-ep_start:.1f}s", flush=True)
    # Evaluate EMA model
    val_acc = evaluate(ema.module, val_loader)
    print(f"Epoch {epoch+1} val_acc(EMA) {val_acc:.4f} ep_time {time.time()-ep_start:.1f}s total {time.time()-t0:.1f}s", flush=True)
    # Only the EMA weights are checkpointed, and only on strict improvement.
    if val_acc > best_acc:
        best_acc = val_acc
        torch.save({'model': ema.module.state_dict(), 'num_classes': num_classes}, best_path)
        print(f"Saved EMA best to {best_path}")

print(f"Training done. Best EMA val_acc {best_acc:.4f}")

# Inference with 2x TTA (orig + hflip) using EMA weights
print('Preparing test loader...')
with open('test2019.json','r') as f: test_json = json.load(f)
test_images = test_json['images']
test_rel = [im['file_name'] for im in test_images]
test_ids = [im['id'] for im in test_images]

class TestDS(torch.utils.data.Dataset):
    """Unlabelled test images transformed with the module-level ``val_tf``.

    Each item is the transformed tensor only. Paths are resolved with
    ``resolve_path`` on access.
    """
    def __init__(self, rels):
        self.rels = rels
        self.tf = val_tf

    def __len__(self):
        return len(self.rels)

    def __getitem__(self, i):
        from PIL import Image
        with Image.open(resolve_path(self.rels[i])) as im:
            return self.tf(im.convert('RGB'))

test_loader = DataLoader(TestDS(test_rel), batch_size=batch_size, shuffle=False, num_workers=num_workers, pin_memory=True, persistent_workers=False, prefetch_factor=2)

# Load EMA weights
# The checkpoint holds ema.module's state dict; it is loaded into `model`, which
# is then used for inference below.
if os.path.exists(best_path):
    ckpt = torch.load(best_path, map_location=device)
    # NOTE(review): strict=False ignores key mismatches instead of raising — confirm intended.
    model.load_state_dict(ckpt['model'], strict=False)
else:
    print('Warning: best EMA checkpoint not found; using current weights')

model.eval()
preds = []
with torch.no_grad(), torch.amp.autocast(device_type='cuda', enabled=torch.cuda.is_available()):
    for it, xb in enumerate(test_loader):
        xb = xb.to(device, non_blocking=True).to(memory_format=torch.channels_last)
        logits = model(xb)
        # 2x TTA: average the logits of the original and the horizontally flipped batch.
        logits_flip = model(torch.flip(xb, dims=[-1]))
        logits = (logits + logits_flip) * 0.5
        preds.extend(logits.argmax(1).cpu().tolist())
        if (it+1) % 100 == 0:
            total_it = math.ceil(len(test_rel)/batch_size)
            print(f"Infer it {it+1}/{total_it}", flush=True)

assert len(preds) == len(test_ids)
pred_cat_ids = [idx_to_catid[i] for i in preds]
import pandas as pd
sub = pd.DataFrame({'image_id': test_ids, 'category_id': pred_cat_ids})
sub.to_csv('submission.csv', index=False)
print('Wrote submission.csv with', len(sub), 'rows')

# Notes:
# - For full run: set epochs=35, warmup_epochs=5; keep same recipe.
# - If OOM persists at batch=48, try 40/32; only then reduce image size.

  from .autonotebook import tqdm as notebook_tqdm


Holdout sizes -> train: 222899, val: 10100
Building timm model...


|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |    512 B   |    512 B   |    512 B   |      0 B   |
|---------------------------------------------------------------------------|
| Active memory         |    512 B   |    512 B   |    512 B   |      0 B   |
|---------------------------------------------------------------------------|
| Requested memory      |      4 B   |      4 B   |      4 B   |      0 B   |
|---------------------------------------------------------------------------|
| GPU reserved memory   |   2048 KiB |   2048 KiB |   2048 KiB |      0 B   |
|---------------------------------------------------------------

  return fn(*args, **kwargs)


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Ep 1/2 it 100 seen 4800 loss 6.3330 lr 2.25e-04 elapsed 45.3s


Ep 1/2 it 200 seen 9600 loss 5.5159 lr 2.25e-04 elapsed 85.4s


Ep 1/2 it 300 seen 14400 loss 5.3506 lr 2.25e-04 elapsed 125.8s


Ep 1/2 it 400 seen 19200 loss 6.0161 lr 2.25e-04 elapsed 166.3s


Ep 1/2 it 500 seen 24000 loss 4.9669 lr 2.25e-04 elapsed 206.9s


Ep 1/2 it 600 seen 28800 loss 5.3659 lr 2.25e-04 elapsed 247.7s


Ep 1/2 it 700 seen 33600 loss 4.5594 lr 2.25e-04 elapsed 288.4s


Ep 1/2 it 800 seen 38400 loss 3.3625 lr 2.25e-04 elapsed 329.2s


Ep 1/2 it 900 seen 43200 loss 3.7671 lr 2.25e-04 elapsed 369.8s


Ep 1/2 it 1000 seen 48000 loss 5.0295 lr 2.25e-04 elapsed 410.5s


Ep 1/2 it 1100 seen 52800 loss 5.4533 lr 2.25e-04 elapsed 451.2s


Ep 1/2 it 1200 seen 57600 loss 5.1888 lr 2.25e-04 elapsed 491.8s


Ep 1/2 it 1300 seen 62400 loss 2.9670 lr 2.25e-04 elapsed 532.5s


Ep 1/2 it 1400 seen 67200 loss 3.2395 lr 2.25e-04 elapsed 573.2s


Ep 1/2 it 1500 seen 72000 loss 5.1117 lr 2.25e-04 elapsed 614.3s


Ep 1/2 it 1600 seen 76800 loss 2.9796 lr 2.25e-04 elapsed 655.5s


Ep 1/2 it 1700 seen 81600 loss 3.1759 lr 2.25e-04 elapsed 696.5s


Ep 1/2 it 1800 seen 86400 loss 5.4103 lr 2.25e-04 elapsed 737.4s


Ep 1/2 it 1900 seen 91200 loss 2.8411 lr 2.25e-04 elapsed 778.4s


Ep 1/2 it 2000 seen 96000 loss 3.1248 lr 2.25e-04 elapsed 819.4s


Ep 1/2 it 2100 seen 100800 loss 3.9237 lr 2.25e-04 elapsed 860.2s


Ep 1/2 it 2200 seen 105600 loss 3.6367 lr 2.25e-04 elapsed 901.2s


Ep 1/2 it 2300 seen 110400 loss 3.0370 lr 2.25e-04 elapsed 942.0s


Ep 1/2 it 2400 seen 115200 loss 3.3330 lr 2.25e-04 elapsed 983.1s


Ep 1/2 it 2500 seen 120000 loss 4.0690 lr 2.25e-04 elapsed 1024.1s


Ep 1/2 it 2600 seen 124800 loss 5.4907 lr 2.25e-04 elapsed 1065.2s


Ep 1/2 it 2700 seen 129600 loss 5.1362 lr 2.25e-04 elapsed 1106.2s


Ep 1/2 it 2800 seen 134400 loss 4.7662 lr 2.25e-04 elapsed 1147.3s


Ep 1/2 it 2900 seen 139200 loss 2.8764 lr 2.25e-04 elapsed 1188.4s


Ep 1/2 it 3000 seen 144000 loss 2.8157 lr 2.25e-04 elapsed 1229.2s


Ep 1/2 it 3100 seen 148800 loss 4.9404 lr 2.25e-04 elapsed 1270.2s


Ep 1/2 it 3200 seen 153600 loss 2.9053 lr 2.25e-04 elapsed 1311.0s


Ep 1/2 it 3300 seen 158400 loss 4.4846 lr 2.25e-04 elapsed 1351.8s


Ep 1/2 it 3400 seen 163200 loss 2.7080 lr 2.25e-04 elapsed 1392.6s


Ep 1/2 it 3500 seen 168000 loss 3.0966 lr 2.25e-04 elapsed 1433.5s


Ep 1/2 it 3600 seen 172800 loss 3.8890 lr 2.25e-04 elapsed 1474.4s


Ep 1/2 it 3700 seen 177600 loss 4.9617 lr 2.25e-04 elapsed 1515.4s


Ep 1/2 it 3800 seen 182400 loss 5.0171 lr 2.25e-04 elapsed 1556.5s


Ep 1/2 it 3900 seen 187200 loss 2.7811 lr 2.25e-04 elapsed 1597.4s


Ep 1/2 it 4000 seen 192000 loss 2.6402 lr 2.25e-04 elapsed 1638.3s


Ep 1/2 it 4100 seen 196800 loss 2.1949 lr 2.25e-04 elapsed 1679.2s


Ep 1/2 it 4200 seen 201600 loss 5.2894 lr 2.25e-04 elapsed 1720.1s


Ep 1/2 it 4300 seen 206400 loss 4.6924 lr 2.25e-04 elapsed 1760.9s


Ep 1/2 it 4400 seen 211200 loss 2.8227 lr 2.25e-04 elapsed 1801.9s


Ep 1/2 it 4500 seen 216000 loss 3.5423 lr 2.25e-04 elapsed 1842.8s


Ep 1/2 it 4600 seen 220800 loss 2.1446 lr 2.25e-04 elapsed 1883.7s




Epoch 1 val_acc(EMA) 0.4904 ep_time 1919.9s total 1919.9s


Saved EMA best to effv2m_320_ema.pth


Ep 2/2 it 100 seen 4800 loss 4.6906 lr 2.25e-04 elapsed 41.5s


Ep 2/2 it 200 seen 9600 loss 2.5339 lr 2.25e-04 elapsed 82.6s


Ep 2/2 it 300 seen 14400 loss 3.3515 lr 2.25e-04 elapsed 123.6s


Ep 2/2 it 400 seen 19200 loss 2.5314 lr 2.25e-04 elapsed 164.6s


Ep 2/2 it 500 seen 24000 loss 4.1245 lr 2.25e-04 elapsed 205.6s


Ep 2/2 it 600 seen 28800 loss 2.5058 lr 2.25e-04 elapsed 246.4s


Ep 2/2 it 700 seen 33600 loss 2.5979 lr 2.25e-04 elapsed 287.3s


Ep 2/2 it 800 seen 38400 loss 1.9202 lr 2.25e-04 elapsed 328.1s


Ep 2/2 it 900 seen 43200 loss 2.7805 lr 2.25e-04 elapsed 369.1s


Ep 2/2 it 1000 seen 48000 loss 3.4676 lr 2.25e-04 elapsed 409.9s


Ep 2/2 it 1100 seen 52800 loss 2.4305 lr 2.25e-04 elapsed 450.9s


Ep 2/2 it 1200 seen 57600 loss 2.4085 lr 2.25e-04 elapsed 491.9s


Ep 2/2 it 1300 seen 62400 loss 2.5230 lr 2.25e-04 elapsed 532.7s


Ep 2/2 it 1400 seen 67200 loss 3.4217 lr 2.25e-04 elapsed 573.5s


Ep 2/2 it 1500 seen 72000 loss 2.8199 lr 2.25e-04 elapsed 614.4s


Ep 2/2 it 1600 seen 76800 loss 2.5724 lr 2.25e-04 elapsed 655.2s


Ep 2/2 it 1700 seen 81600 loss 3.3558 lr 2.25e-04 elapsed 695.8s


Ep 2/2 it 1800 seen 86400 loss 5.1512 lr 2.25e-04 elapsed 736.8s


Ep 2/2 it 1900 seen 91200 loss 4.8229 lr 2.25e-04 elapsed 777.7s


Ep 2/2 it 2000 seen 96000 loss 4.1234 lr 2.25e-04 elapsed 818.6s


Ep 2/2 it 2100 seen 100800 loss 2.9692 lr 2.25e-04 elapsed 859.5s


Ep 2/2 it 2200 seen 105600 loss 2.6118 lr 2.25e-04 elapsed 900.3s


Ep 2/2 it 2300 seen 110400 loss 2.1596 lr 2.25e-04 elapsed 941.2s


Ep 2/2 it 2400 seen 115200 loss 2.6629 lr 2.25e-04 elapsed 982.1s


Ep 2/2 it 2500 seen 120000 loss 4.5040 lr 2.25e-04 elapsed 1023.0s


Ep 2/2 it 2600 seen 124800 loss 2.5083 lr 2.25e-04 elapsed 1063.9s


Ep 2/2 it 2700 seen 129600 loss 2.6579 lr 2.25e-04 elapsed 1104.9s


Ep 2/2 it 2800 seen 134400 loss 2.3411 lr 2.25e-04 elapsed 1145.9s


Ep 2/2 it 2900 seen 139200 loss 4.3179 lr 2.25e-04 elapsed 1186.8s


Ep 2/2 it 3000 seen 144000 loss 4.3257 lr 2.25e-04 elapsed 1227.6s


Ep 2/2 it 3100 seen 148800 loss 4.3465 lr 2.25e-04 elapsed 1268.6s


Ep 2/2 it 3200 seen 153600 loss 2.1517 lr 2.25e-04 elapsed 1309.7s


Ep 2/2 it 3300 seen 158400 loss 4.9708 lr 2.25e-04 elapsed 1350.6s


Ep 2/2 it 3400 seen 163200 loss 2.5253 lr 2.25e-04 elapsed 1391.7s


Ep 2/2 it 3500 seen 168000 loss 2.3319 lr 2.25e-04 elapsed 1432.6s


Ep 2/2 it 3600 seen 172800 loss 4.7614 lr 2.25e-04 elapsed 1473.6s


Ep 2/2 it 3700 seen 177600 loss 4.8354 lr 2.25e-04 elapsed 1514.5s


Ep 2/2 it 3800 seen 182400 loss 4.7978 lr 2.25e-04 elapsed 1555.4s


Ep 2/2 it 3900 seen 187200 loss 4.4130 lr 2.25e-04 elapsed 1596.3s


Ep 2/2 it 4000 seen 192000 loss 2.4038 lr 2.25e-04 elapsed 1637.3s


Ep 2/2 it 4100 seen 196800 loss 2.0450 lr 2.25e-04 elapsed 1678.3s


Ep 2/2 it 4200 seen 201600 loss 2.3812 lr 2.25e-04 elapsed 1719.2s


Ep 2/2 it 4300 seen 206400 loss 5.1900 lr 2.25e-04 elapsed 1760.2s


Ep 2/2 it 4400 seen 211200 loss 2.4810 lr 2.25e-04 elapsed 1801.1s


Ep 2/2 it 4500 seen 216000 loss 4.5549 lr 2.25e-04 elapsed 1842.1s


Ep 2/2 it 4600 seen 220800 loss 2.4127 lr 2.25e-04 elapsed 1883.0s


Epoch 2 val_acc(EMA) 0.6508 ep_time 1918.5s total 3838.7s


Saved EMA best to effv2m_320_ema.pth
Training done. Best EMA val_acc 0.6508
Preparing test loader...


  ckpt = torch.load(best_path, map_location=device)


Infer it 100/672


Infer it 200/672


Infer it 300/672


Infer it 400/672


Infer it 500/672


Infer it 600/672


Wrote submission.csv with 32214 rows


In [4]:
# Full 35-epoch training: tf_efficientnetv2_m.in21k_ft_in1k @320 with EMA, Mixup/CutMix, GradClip, and 6x TTA
import os, math, time, random, json, subprocess, sys
from collections import defaultdict
import torch, torch.nn as nn
from torch.utils.data import DataLoader
import torchvision.transforms as T

# Allocator hint; setdefault leaves any value already present in the environment untouched.
# NOTE(review): this runs after `import torch` and after earlier cells; whether the
# allocator still honours it at this point should be confirmed.
os.environ.setdefault('PYTORCH_CUDA_ALLOC_CONF', 'expandable_segments:True')
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Speed-oriented backend flags: cuDNN benchmark mode plus TF32 for matmul and cuDNN.
torch.backends.cudnn.benchmark = True
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
# Guarded because set_float32_matmul_precision is not present in every torch version.
if hasattr(torch, 'set_float32_matmul_precision'):
    torch.set_float32_matmul_precision('medium')

# AMP dtype selection (BF16 for A10); GradScaler only if FP16
# `cap` is the major compute capability of GPU 0 (0 when CUDA is unavailable);
# bfloat16 is chosen for major >= 8, float16 otherwise.
cap = torch.cuda.get_device_capability(0)[0] if torch.cuda.is_available() else 0
autocast_dtype = torch.bfloat16 if (torch.cuda.is_available() and cap >= 8) else torch.float16
# The scaler is constructed either way but only enabled on the float16 path.
scaler = torch.amp.GradScaler(enabled=(autocast_dtype == torch.float16))

# Pre-reqs from earlier cells
# Fail fast if the dataset, path resolver or index->category map are missing.
assert 'train_ds' in globals() and 'resolve_path' in globals() and 'idx_to_catid' in globals()

# Stratified holdout carved out of train_ds: per class, about 10% of the samples go
# to validation, clamped to the range 5..20 and never taking a class's last sample.
labels = [label for _rel, label in train_ds.recs]
by_cls = defaultdict(list)
for pos, label in enumerate(labels):
    by_cls[label].append(pos)
val_idx = []
tr_idx = []
random.seed(42)  # fixed seed: the same record order yields the same split
for members in by_cls.values():
    random.shuffle(members)
    # n - 1 as the last bound keeps at least one sample of the class for training.
    n_val = min(20, max(5, len(members) // 10), len(members) - 1)
    val_idx += members[:n_val]
    tr_idx += members[n_val:]
print(f"Holdout sizes -> train: {len(tr_idx)}, val: {len(val_idx)}", flush=True)

# Transforms
# Training pipeline: random-resized crop to 320, RandAugment, horizontal flip,
# tensor conversion, normalisation with the commonly used ImageNet mean/std, and
# random erasing applied with probability 0.10.
train_tf = T.Compose([
    T.RandomResizedCrop(
        320,
        scale=(0.2, 1.0),
        ratio=(0.75, 1.33),
        interpolation=T.InterpolationMode.BICUBIC,
    ),
    T.RandAugment(num_ops=2, magnitude=9),
    T.RandomHorizontalFlip(),
    T.ToTensor(),
    T.Normalize(
        [0.485, 0.456, 0.406],
        [0.229, 0.224, 0.225],
    ),
    T.RandomErasing(p=0.10, value='random'),
])
# Evaluation pipeline: deterministic resize to 352 followed by a 320 centre crop,
# with the same tensor conversion and normalisation as training.
val_tf = T.Compose([
    T.Resize(
        352,
        interpolation=T.InterpolationMode.BICUBIC,
    ),
    T.CenterCrop(320),
    T.ToTensor(),
    T.Normalize(
        [0.485, 0.456, 0.406],
        [0.229, 0.224, 0.225],
    ),
])

class WrappedDS(torch.utils.data.Dataset):
    """Index-based view over ``base.recs`` that yields ``(image_tensor, label)``.

    ``indices`` selects which records of ``base`` belong to this view, and
    ``train`` chooses between ``train_tf`` (True) and ``val_tf`` (False).
    """

    def __init__(self, base, indices, train):
        self.base, self.idxs, self.train = base, indices, train

    def __len__(self):
        return len(self.idxs)

    def __getitem__(self, k):
        from PIL import Image
        # Each record is a (relative path, label) pair.
        rel, y = self.base.recs[self.idxs[k]]
        transform = train_tf if self.train else val_tf
        # The transform runs while the file is still open; the handle is closed
        # on leaving the `with`.
        with Image.open(resolve_path(rel)) as img:
            return transform(img.convert('RGB')), y

# Import timm and helpers
# If the import fails for any reason, install the pinned timm version (without its
# dependencies) into the running interpreter and import again.
try:
    import timm
except Exception:
    subprocess.run([sys.executable, '-m', 'pip', 'install', 'timm==1.0.9', '--no-deps'], check=True)
    import timm
from timm.loss import SoftTargetCrossEntropy
from timm.data.mixup import Mixup
from timm.utils import ModelEmaV2

# The classifier head is sized to the number of entries in idx_to_catid.
num_classes = len(idx_to_catid)
print('Building model...', flush=True)
model = None
# First attempt overrides hf_hub_id to None in the pretrained config.
# NOTE(review): presumably this steers timm to a non-hub weight source — confirm.
# On any failure the model is created again with the default pretrained config.
try:
    model = timm.create_model(
        'tf_efficientnetv2_m.in21k_ft_in1k',
        pretrained=True,
        num_classes=num_classes,
        drop_path_rate=0.3,
        pretrained_cfg_overlay={'hf_hub_id': None}
    )
except Exception as e:
    print('Pretrained URL fallback failed, retrying default:', e, flush=True)
    model = timm.create_model('tf_efficientnetv2_m.in21k_ft_in1k', pretrained=True, num_classes=num_classes, drop_path_rate=0.3)

# Enable gradient checkpointing when the model exposes the hook.
if hasattr(model, 'set_grad_checkpointing'):
    model.set_grad_checkpointing(True)
if torch.cuda.is_available():
    torch.cuda.empty_cache()
# Move to the target device, then switch to channels_last memory format; the
# training and inference loops convert their input batches the same way.
model = model.to(device)
model = model.to(memory_format=torch.channels_last)

# Disable torch.compile due to Triton requiring Python.h in this env; use eager for stability
# Note: the code below only sets torch._dynamo's suppress_errors flag (best effort);
# eager mode follows from torch.compile never being called in this cell.
try:
    import torch._dynamo
    torch._dynamo.config.suppress_errors = True
except Exception:
    pass
print("Skipping torch.compile: using eager mode.", flush=True)

# Optimizer, sched, EMA, loss, mixup
# Starting batch size; the probing loop further down may lower batch_size.
init_batch = 64
batch_size = init_batch
# Peak LR scales linearly with batch size relative to a reference batch of 64.
base_lr = 3e-4 * (batch_size/64)
optimizer = torch.optim.AdamW(model.parameters(), lr=base_lr, weight_decay=0.05, betas=(0.9,0.999))
# Schedule constants read by lr_at_epoch(): total epochs, linear warmup length,
# and the floor of the cosine decay.
epochs = 35
warmup_epochs = 5
cosine_min_lr = 1e-6

def lr_at_epoch(ep, *, peak_lr=None, warmup=None, total=None, min_lr=None):
    """Return the learning rate to use for zero-based epoch ``ep``.

    The schedule is a linear warmup followed by a half-cosine decay:

    * ``ep < warmup``: ``peak_lr * (ep + 1) / warmup``, so the last warmup epoch
      already runs at ``peak_lr``.
    * otherwise: cosine interpolation from ``peak_lr`` (at ``ep == warmup``)
      towards ``min_lr``. The decay phase is normalised by ``total - warmup``, so
      for ``ep <= total - 1`` the result stays slightly above ``min_lr``.

    Args:
        ep: Zero-based epoch index.
        peak_lr: Peak learning rate; defaults to the module-level ``base_lr``.
        warmup: Number of warmup epochs; defaults to ``warmup_epochs``.
        total: Total number of epochs; defaults to ``epochs``.
        min_lr: Floor of the cosine decay; defaults to ``cosine_min_lr``.

    Returns:
        The learning rate as a float.
    """
    # Defaults are resolved at call time (not definition time) because the script
    # reassigns base_lr after the batch-size probe.
    peak_lr = base_lr if peak_lr is None else peak_lr
    warmup = warmup_epochs if warmup is None else warmup
    total = epochs if total is None else total
    min_lr = cosine_min_lr if min_lr is None else min_lr

    if ep < warmup:
        return peak_lr * (ep+1)/max(1, warmup)
    # max(1, ...) guards against division by zero when total == warmup.
    t = (ep - warmup)/max(1, (total - warmup))
    return min_lr + 0.5*(peak_lr - min_lr)*(1 + math.cos(math.pi*t))

# Batch-level Mixup/CutMix augmentation from timm, including label smoothing 0.1.
mixup_fn = Mixup(
    mixup_alpha=0.2,
    cutmix_alpha=0.3,
    prob=0.7,
    switch_prob=0.5,
    label_smoothing=0.1,
    num_classes=num_classes
)
use_mixup = True
# With mixup on, the loss receives the targets returned by mixup_fn and therefore
# uses SoftTargetCrossEntropy; otherwise plain cross-entropy with label smoothing.
criterion = SoftTargetCrossEntropy() if use_mixup else nn.CrossEntropyLoss(label_smoothing=0.1)
# EMA copy of the model; its decay attribute is overwritten at the start of every
# epoch in the training loop.
ema = ModelEmaV2(model, decay=0.9995)  # ramp to 0.9998

# DataLoaders (after model for memory safety), tuned throughput
# Worker count is capped at 12; falls back to 12 if os.cpu_count() returns None.
num_workers = min(12, os.cpu_count() or 12)
# Train/validation views over the same base dataset, differing in indices and transform.
train_sub = WrappedDS(train_ds, tr_idx, train=True)
val_sub = WrappedDS(train_ds, val_idx, train=False)

def make_loaders(bs):
    """Build the ``(train_loader, val_loader)`` pair for batch size ``bs``.

    Both loaders share worker and pinning settings. The training loader shuffles
    and drops the last incomplete batch; the validation loader keeps dataset
    order and every sample.
    """
    shared = dict(
        batch_size=bs,
        num_workers=num_workers,
        pin_memory=True,
        persistent_workers=False,
        prefetch_factor=4,
    )
    train_dl = DataLoader(train_sub, shuffle=True, drop_last=True, **shared)
    val_dl = DataLoader(val_sub, shuffle=False, **shared)
    return train_dl, val_dl

# Try larger batch, fallback on OOM automatically with quick forward sanity.
# The probe is a single forward pass under no_grad on one real batch.
# NOTE(review): it does not exercise the backward pass, so a size accepted here
# may still run out of memory during training.
for bs_try in (64, 56, 48, 40, 32):
    try:
        torch.cuda.empty_cache()
        train_loader, val_loader = make_loaders(bs_try)
        xb, yb = next(iter(train_loader))
        with torch.no_grad(), torch.amp.autocast(device_type='cuda', dtype=autocast_dtype, enabled=torch.cuda.is_available()):
            _ = model(xb.to(device, non_blocking=True).to(memory_format=torch.channels_last))
        batch_size = bs_try
        print(f"Using batch_size={batch_size}", flush=True)
        break
    except RuntimeError as e:
        # Only out-of-memory errors trigger the fallback; anything else is a real failure.
        if 'out of memory' not in str(e).lower():
            raise
        print(f"OOM with batch_size={bs_try}, reducing...", flush=True)
        try:
            del train_loader, val_loader
        except NameError:
            # The loaders were never bound (failure happened before assignment).
            pass
        torch.cuda.empty_cache()
else:
    # Every candidate ran out of memory. Stop here with a clear message instead of
    # continuing with deleted loaders and failing later with a NameError.
    raise RuntimeError('Out of memory for every candidate batch size (64, 56, 48, 40, 32)')

# Update LR to reflect final batch size
for pg in optimizer.param_groups:
    pg['lr'] = 3e-4 * (batch_size/64)
# base_lr is what lr_at_epoch() reads as the peak of the schedule.
base_lr = optimizer.param_groups[0]['lr']
print(f"Base LR set to {base_lr:.2e}", flush=True)

def evaluate(m, loader):
    """Return the top-1 accuracy of ``m`` over all batches of ``loader``.

    ``m`` is put in eval mode (and left there). Batches are moved to ``device``
    in channels_last format and evaluated under no_grad with autocast. An empty
    loader yields 0.0 instead of dividing by zero.
    """
    m.eval()
    hits, count = 0, 0
    amp_ctx = torch.amp.autocast(device_type='cuda', dtype=autocast_dtype, enabled=torch.cuda.is_available())
    with torch.no_grad(), amp_ctx:
        for images, targets in loader:
            images = images.to(device, non_blocking=True).to(memory_format=torch.channels_last)
            targets = targets.to(device, non_blocking=True)
            predicted = m(images).argmax(1)
            hits += (predicted == targets).sum().item()
            count += targets.size(0)
    return hits / max(1, count)

# Main training loop. Per epoch: set the LR and EMA decay, train one pass over
# train_loader, validate the EMA weights on val_loader, then write checkpoints.
best_acc = 0.0
best_path = 'effv2m_320_ema.pth'
t0 = time.time()
# Counts optimizer steps across all epochs; it is not read anywhere else in this cell.
update_count = 0
for epoch in range(epochs):
    # Set LR for epoch
    # The LR is constant within an epoch and applied to every param group.
    cur_lr = lr_at_epoch(epoch)
    for pg in optimizer.param_groups:
        pg['lr'] = cur_lr
    # Ramp EMA decay from 0.9995 -> 0.9998 across training
    # Linear in the epoch index; max(1, ...) avoids division by zero when epochs == 1.
    ema.decay = 0.9995 + (0.9998 - 0.9995) * (epoch / max(1, epochs-1))
    model.train()
    ep_start = time.time()
    seen = 0
    for it, (xb, yb) in enumerate(train_loader):
        xb = xb.to(device, non_blocking=True).to(memory_format=torch.channels_last)
        yb = yb.to(device, non_blocking=True)
        optimizer.zero_grad(set_to_none=True)
        with torch.amp.autocast(device_type='cuda', dtype=autocast_dtype, enabled=torch.cuda.is_available()):
            if use_mixup:
                # mixup_fn returns the mixed images and the matching targets for the loss.
                mx, my = mixup_fn(xb, yb)
                logits = model(mx)
                loss = criterion(logits, my)
            else:
                logits = model(xb)
                loss = criterion(logits, yb)
        scaler.scale(loss).backward()
        # Grad clipping
        # Gradients are unscaled first so that max_norm applies to their true magnitude.
        # NOTE(review): on the bf16 path the scaler is constructed with enabled=False;
        # its calls are expected to pass through unchanged — confirm.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        scaler.step(optimizer)
        scaler.update()
        # Fold the freshly updated weights into the EMA copy after every optimizer step.
        ema.update(model)
        update_count += 1
        seen += yb.size(0)
        if (it+1) % 200 == 0:
            # The logged loss is that of the current batch only, not a running average.
            print(f"Ep {epoch+1}/{epochs} it {it+1} seen {seen} loss {loss.item():.4f} lr {cur_lr:.2e} ema {ema.decay:.6f} elapsed {time.time()-ep_start:.1f}s", flush=True)
    # Validation and model selection use the EMA weights, not the raw model.
    val_acc = evaluate(ema.module, val_loader)
    print(f"Epoch {epoch+1} val_acc(EMA) {val_acc:.4f} ep_time {time.time()-ep_start:.1f}s total {time.time()-t0:.1f}s", flush=True)
    if val_acc > best_acc:
        best_acc = val_acc
        # best_path is overwritten whenever validation accuracy improves.
        torch.save({'model': ema.module.state_dict(), 'num_classes': num_classes}, best_path)
        print(f"Saved EMA best to {best_path}", flush=True)
    # Save 'last' checkpoint every epoch for resume
    # Holds raw model, optimizer, EMA and scaler state; no code in this cell reads it back.
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'ema_state_dict': ema.state_dict(),
        # Falls back to an empty dict if the scaler object has no state_dict method.
        'scaler_state_dict': getattr(scaler, 'state_dict', lambda: {})(),
        'best_acc': best_acc,
        'batch_size': batch_size,
    }, 'checkpoint-last.pth')
    # Periodic EMA snapshot every 5 epochs
    # Written regardless of whether this epoch was the best one.
    if (epoch + 1) % 5 == 0:
        p = f'effv2m_320_ema_ep{epoch+1}.pth'
        torch.save({'model': ema.module.state_dict(), 'num_classes': num_classes}, p)
        print(f"Saved periodic EMA to {p}", flush=True)

print(f"Training done. Best EMA val_acc {best_acc:.4f}", flush=True)

# Test-time inference with 6x TTA: scales [288, 320, 352] x {original, hflip};
# logits are accumulated on the CPU and the 320 scale counts twice.
with open('test2019.json', 'r') as f:
    test_json = json.load(f)
test_images = test_json['images']
# (file name, image id) pairs in JSON order, split into two parallel lists.
pairs = [(entry['file_name'], entry['id']) for entry in test_images]
test_rel = [rel for rel, _ in pairs]
test_ids = [img_id for _, img_id in pairs]

def make_val_tf(scale):
    """Build the deterministic evaluation transform for crop size ``scale``.

    The pipeline resizes with bicubic interpolation to ``scale + 32``, takes a
    centre crop of ``scale``, converts to a tensor and normalises with the same
    mean/std as the training transform.
    """
    steps = [
        T.Resize(scale + 32, interpolation=T.InterpolationMode.BICUBIC),
        T.CenterCrop(scale),
        T.ToTensor(),
        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ]
    return T.Compose(steps)

class TestDS(torch.utils.data.Dataset):
    """Unlabelled image dataset: one transformed sample per entry of ``rels``.

    ``tf`` is the transform applied to each RGB-converted image, which lets the
    same file list be reused at several TTA scales.
    """

    def __init__(self, rels, tf):
        self.rels, self.tf = rels, tf

    def __len__(self):
        return len(self.rels)

    def __getitem__(self, i):
        from PIL import Image
        path = resolve_path(self.rels[i])
        # The transform runs while the file is still open; the handle is closed
        # on leaving the `with`.
        with Image.open(path) as handle:
            rgb = handle.convert('RGB')
            return self.tf(rgb)

# Load EMA weights
# The checkpoint holds only a state_dict and an int, so weights_only=True is
# sufficient. It restricts unpickling to tensors and plain containers and avoids
# the warning newer PyTorch emits when the flag is left unset.
ckpt = torch.load(best_path, map_location=device, weights_only=True)
load_result = model.load_state_dict(ckpt['model'], strict=False)
# strict=False skips mismatched keys silently; report them so that a partially
# loaded model is never used for the submission unnoticed.
if load_result.missing_keys or load_result.unexpected_keys:
    print(f"Warning: checkpoint key mismatch; missing={load_result.missing_keys} unexpected={load_result.unexpected_keys}", flush=True)
model.eval()

scales = [288, 320, 352]
# Weighted sum of logits per test image, kept on the CPU so it does not occupy GPU memory.
tta_logits = torch.zeros((len(test_rel), num_classes), dtype=torch.float32)  # CPU accumulation
tta_weight = 0.0
num_workers = min(12, os.cpu_count() or 12)

with torch.no_grad():
    for s in scales:
        tf_s = make_val_tf(s)
        ds_s = TestDS(test_rel, tf_s)
        dl_s = DataLoader(ds_s, batch_size=batch_size, shuffle=False, num_workers=num_workers, pin_memory=True, persistent_workers=False, prefetch_factor=4)
        # The training resolution (320) is weighted twice as much as the other scales.
        w = 2.0 if s == 320 else 1.0
        # Number of batches, used only for the progress message (computed once per scale).
        total_it = math.ceil(len(ds_s)/batch_size)
        # Write cursor into tta_logits; valid because the loader preserves dataset order.
        idx = 0
        for it, xb in enumerate(dl_s):
            xb = xb.to(device, non_blocking=True).to(memory_format=torch.channels_last)
            with torch.amp.autocast(device_type='cuda', dtype=autocast_dtype, enabled=torch.cuda.is_available()):
                # Average the logits of the image and its horizontal mirror.
                lg = model(xb)
                lg_flip = model(torch.flip(xb, dims=[-1]))
                lg = (lg + lg_flip) * 0.5
            # Accumulate in float32 on the CPU regardless of the autocast dtype.
            lg = lg.float().cpu()
            bsz = lg.shape[0]
            tta_logits[idx:idx+bsz] += lg * w
            idx += bsz
            if (it+1) % 100 == 0:
                print(f"TTA scale {s} it {it+1}/{total_it}", flush=True)
        # Each scale must contribute exactly one row per test image; otherwise the
        # accumulated logits would be misaligned with test_ids.
        if idx != len(test_rel):
            raise RuntimeError(f"TTA scale {s} produced {idx} rows, expected {len(test_rel)}")
        tta_weight += w

# Normalise by the total weight (this does not change the argmax).
tta_logits /= max(1.0, tta_weight)
preds = tta_logits.argmax(1).tolist()
# Map model output indices back to dataset category ids.
pred_cat_ids = [idx_to_catid[i] for i in preds]
import pandas as pd
# NOTE(review): confirm these column names against the competition's sample submission.
sub = pd.DataFrame({'image_id': test_ids, 'category_id': pred_cat_ids})
sub.to_csv('submission.csv', index=False)
print('Wrote submission.csv with', len(sub), 'rows')
print('Done.')

Holdout sizes -> train: 217930, val: 15069


Building model...


Skipping torch.compile: using eager mode.


  return fn(*args, **kwargs)


Using batch_size=64


Base LR set to 3.00e-04


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Ep 1/35 it 200 seen 12800 loss 6.7616 lr 6.00e-05 ema 0.999500 elapsed 123.9s


Ep 1/35 it 400 seen 25600 loss 5.5811 lr 6.00e-05 ema 0.999500 elapsed 235.4s


Ep 1/35 it 600 seen 38400 loss 6.0355 lr 6.00e-05 ema 0.999500 elapsed 348.5s


Ep 1/35 it 800 seen 51200 loss 4.3742 lr 6.00e-05 ema 0.999500 elapsed 462.1s


Ep 1/35 it 1000 seen 64000 loss 4.0707 lr 6.00e-05 ema 0.999500 elapsed 575.1s


Ep 1/35 it 1200 seen 76800 loss 5.3470 lr 6.00e-05 ema 0.999500 elapsed 688.5s


Ep 1/35 it 1400 seen 89600 loss 3.5551 lr 6.00e-05 ema 0.999500 elapsed 801.4s


Ep 1/35 it 1600 seen 102400 loss 3.3467 lr 6.00e-05 ema 0.999500 elapsed 915.1s


Ep 1/35 it 1800 seen 115200 loss 3.6219 lr 6.00e-05 ema 0.999500 elapsed 1028.5s


Ep 1/35 it 2000 seen 128000 loss 3.5069 lr 6.00e-05 ema 0.999500 elapsed 1141.2s


Ep 1/35 it 2200 seen 140800 loss 5.0420 lr 6.00e-05 ema 0.999500 elapsed 1254.8s


Ep 1/35 it 2400 seen 153600 loss 3.0317 lr 6.00e-05 ema 0.999500 elapsed 1367.5s


Ep 1/35 it 2600 seen 166400 loss 5.0835 lr 6.00e-05 ema 0.999500 elapsed 1481.2s


Ep 1/35 it 2800 seen 179200 loss 4.0170 lr 6.00e-05 ema 0.999500 elapsed 1594.6s


Ep 1/35 it 3000 seen 192000 loss 2.7126 lr 6.00e-05 ema 0.999500 elapsed 1707.6s


Ep 1/35 it 3200 seen 204800 loss 2.9315 lr 6.00e-05 ema 0.999500 elapsed 1821.3s


Ep 1/35 it 3400 seen 217600 loss 3.0413 lr 6.00e-05 ema 0.999500 elapsed 1933.8s


Epoch 1 val_acc(EMA) 0.4848 ep_time 1963.4s total 1963.4s


Saved EMA best to effv2m_320_ema.pth


Ep 2/35 it 200 seen 12800 loss 4.1699 lr 1.20e-04 ema 0.999509 elapsed 113.8s


Ep 2/35 it 400 seen 25600 loss 2.6717 lr 1.20e-04 ema 0.999509 elapsed 228.1s


Ep 2/35 it 600 seen 38400 loss 2.9924 lr 1.20e-04 ema 0.999509 elapsed 341.0s


Ep 2/35 it 800 seen 51200 loss 2.7077 lr 1.20e-04 ema 0.999509 elapsed 454.4s


Ep 2/35 it 1000 seen 64000 loss 2.4574 lr 1.20e-04 ema 0.999509 elapsed 568.3s


Ep 2/35 it 1200 seen 76800 loss 5.3453 lr 1.20e-04 ema 0.999509 elapsed 681.5s


Ep 2/35 it 1400 seen 89600 loss 4.5054 lr 1.20e-04 ema 0.999509 elapsed 794.9s


Ep 2/35 it 1600 seen 102400 loss 2.2664 lr 1.20e-04 ema 0.999509 elapsed 907.6s


Ep 2/35 it 1800 seen 115200 loss 3.4976 lr 1.20e-04 ema 0.999509 elapsed 1021.4s


Ep 2/35 it 2000 seen 128000 loss 2.3797 lr 1.20e-04 ema 0.999509 elapsed 1135.2s


Ep 2/35 it 2200 seen 140800 loss 3.8406 lr 1.20e-04 ema 0.999509 elapsed 1247.8s


Ep 2/35 it 2400 seen 153600 loss 3.6691 lr 1.20e-04 ema 0.999509 elapsed 1361.1s


Ep 2/35 it 2600 seen 166400 loss 2.2451 lr 1.20e-04 ema 0.999509 elapsed 1474.1s


Ep 2/35 it 2800 seen 179200 loss 5.1646 lr 1.20e-04 ema 0.999509 elapsed 1588.1s


Ep 2/35 it 3000 seen 192000 loss 2.4083 lr 1.20e-04 ema 0.999509 elapsed 1701.7s


Ep 2/35 it 3200 seen 204800 loss 4.2392 lr 1.20e-04 ema 0.999509 elapsed 1814.4s


Ep 2/35 it 3400 seen 217600 loss 2.5025 lr 1.20e-04 ema 0.999509 elapsed 1928.0s


Epoch 2 val_acc(EMA) 0.6834 ep_time 1957.1s total 3923.2s


Saved EMA best to effv2m_320_ema.pth


Ep 3/35 it 200 seen 12800 loss 4.5424 lr 1.80e-04 ema 0.999518 elapsed 113.8s


Ep 3/35 it 400 seen 25600 loss 2.4367 lr 1.80e-04 ema 0.999518 elapsed 227.5s


Ep 3/35 it 600 seen 38400 loss 2.3912 lr 1.80e-04 ema 0.999518 elapsed 340.3s


Ep 3/35 it 800 seen 51200 loss 2.5148 lr 1.80e-04 ema 0.999518 elapsed 454.3s


Ep 3/35 it 1000 seen 64000 loss 3.5920 lr 1.80e-04 ema 0.999518 elapsed 566.9s


Ep 3/35 it 1200 seen 76800 loss 3.4121 lr 1.80e-04 ema 0.999518 elapsed 680.4s


Ep 3/35 it 1400 seen 89600 loss 2.4619 lr 1.80e-04 ema 0.999518 elapsed 794.2s


Ep 3/35 it 1600 seen 102400 loss 2.4570 lr 1.80e-04 ema 0.999518 elapsed 906.9s


Ep 3/35 it 1800 seen 115200 loss 2.7913 lr 1.80e-04 ema 0.999518 elapsed 1020.5s


Ep 3/35 it 2000 seen 128000 loss 4.5104 lr 1.80e-04 ema 0.999518 elapsed 1134.2s


Ep 3/35 it 2200 seen 140800 loss 2.4381 lr 1.80e-04 ema 0.999518 elapsed 1246.9s


Ep 3/35 it 2400 seen 153600 loss 2.5261 lr 1.80e-04 ema 0.999518 elapsed 1360.8s


Ep 3/35 it 2600 seen 166400 loss 2.4166 lr 1.80e-04 ema 0.999518 elapsed 1474.2s


Ep 3/35 it 2800 seen 179200 loss 2.4168 lr 1.80e-04 ema 0.999518 elapsed 1586.6s


Ep 3/35 it 3000 seen 192000 loss 2.7311 lr 1.80e-04 ema 0.999518 elapsed 1700.3s


Ep 3/35 it 3200 seen 204800 loss 2.5516 lr 1.80e-04 ema 0.999518 elapsed 1813.0s


Ep 3/35 it 3400 seen 217600 loss 2.2187 lr 1.80e-04 ema 0.999518 elapsed 1926.3s


Epoch 3 val_acc(EMA) 0.7486 ep_time 1955.1s total 5880.0s


Saved EMA best to effv2m_320_ema.pth


Ep 4/35 it 200 seen 12800 loss 3.4591 lr 2.40e-04 ema 0.999526 elapsed 113.6s


Ep 4/35 it 400 seen 25600 loss 2.4368 lr 2.40e-04 ema 0.999526 elapsed 227.8s


Ep 4/35 it 600 seen 38400 loss 4.0749 lr 2.40e-04 ema 0.999526 elapsed 340.5s


Ep 4/35 it 800 seen 51200 loss 2.2733 lr 2.40e-04 ema 0.999526 elapsed 453.9s


Ep 4/35 it 1000 seen 64000 loss 3.9126 lr 2.40e-04 ema 0.999526 elapsed 567.4s


Ep 4/35 it 1200 seen 76800 loss 2.6018 lr 2.40e-04 ema 0.999526 elapsed 680.2s


Ep 4/35 it 1400 seen 89600 loss 2.7218 lr 2.40e-04 ema 0.999526 elapsed 793.7s


Ep 4/35 it 1600 seen 102400 loss 2.2073 lr 2.40e-04 ema 0.999526 elapsed 906.4s


Ep 4/35 it 1800 seen 115200 loss 2.0807 lr 2.40e-04 ema 0.999526 elapsed 1019.8s


Ep 4/35 it 2000 seen 128000 loss 4.4117 lr 2.40e-04 ema 0.999526 elapsed 1133.5s


Ep 4/35 it 2200 seen 140800 loss 3.4837 lr 2.40e-04 ema 0.999526 elapsed 1246.3s


Ep 4/35 it 2400 seen 153600 loss 2.3853 lr 2.40e-04 ema 0.999526 elapsed 1359.7s


Ep 4/35 it 2600 seen 166400 loss 2.2889 lr 2.40e-04 ema 0.999526 elapsed 1473.3s


Ep 4/35 it 2800 seen 179200 loss 4.4853 lr 2.40e-04 ema 0.999526 elapsed 1586.1s


Ep 4/35 it 3000 seen 192000 loss 3.4448 lr 2.40e-04 ema 0.999526 elapsed 1699.4s


Ep 4/35 it 3200 seen 204800 loss 2.2480 lr 2.40e-04 ema 0.999526 elapsed 1811.9s


Ep 4/35 it 3400 seen 217600 loss 2.3372 lr 2.40e-04 ema 0.999526 elapsed 1925.4s


Epoch 4 val_acc(EMA) 0.7660 ep_time 1954.4s total 7836.5s


Saved EMA best to effv2m_320_ema.pth


Ep 5/35 it 200 seen 12800 loss 4.3506 lr 3.00e-04 ema 0.999535 elapsed 113.9s


Ep 5/35 it 400 seen 25600 loss 5.0326 lr 3.00e-04 ema 0.999535 elapsed 227.6s


Ep 5/35 it 600 seen 38400 loss 2.4043 lr 3.00e-04 ema 0.999535 elapsed 340.0s


Ep 5/35 it 800 seen 51200 loss 4.9031 lr 3.00e-04 ema 0.999535 elapsed 453.6s


Ep 5/35 it 1000 seen 64000 loss 3.4125 lr 3.00e-04 ema 0.999535 elapsed 567.2s


Ep 5/35 it 1200 seen 76800 loss 2.5950 lr 3.00e-04 ema 0.999535 elapsed 679.8s


Ep 5/35 it 1400 seen 89600 loss 2.6903 lr 3.00e-04 ema 0.999535 elapsed 793.4s


Ep 5/35 it 1600 seen 102400 loss 4.2878 lr 3.00e-04 ema 0.999535 elapsed 906.2s


Ep 5/35 it 1800 seen 115200 loss 2.0800 lr 3.00e-04 ema 0.999535 elapsed 1019.8s


Ep 5/35 it 2000 seen 128000 loss 2.7634 lr 3.00e-04 ema 0.999535 elapsed 1133.0s


Ep 5/35 it 2200 seen 140800 loss 5.0463 lr 3.00e-04 ema 0.999535 elapsed 1245.5s


Ep 5/35 it 2400 seen 153600 loss 2.4686 lr 3.00e-04 ema 0.999535 elapsed 1359.2s


Ep 5/35 it 2600 seen 166400 loss 2.2289 lr 3.00e-04 ema 0.999535 elapsed 1471.8s


Ep 5/35 it 2800 seen 179200 loss 4.3326 lr 3.00e-04 ema 0.999535 elapsed 1585.3s


Ep 5/35 it 3000 seen 192000 loss 4.2855 lr 3.00e-04 ema 0.999535 elapsed 1697.7s


Ep 5/35 it 3200 seen 204800 loss 2.2040 lr 3.00e-04 ema 0.999535 elapsed 1811.4s


Ep 5/35 it 3400 seen 217600 loss 3.4666 lr 3.00e-04 ema 0.999535 elapsed 1925.0s


Epoch 5 val_acc(EMA) 0.7717 ep_time 1953.9s total 9792.0s


Saved EMA best to effv2m_320_ema.pth


Saved periodic EMA to effv2m_320_ema_ep5.pth


Ep 6/35 it 200 seen 12800 loss 2.0199 lr 3.00e-04 ema 0.999544 elapsed 114.6s


Ep 6/35 it 400 seen 25600 loss 2.1971 lr 3.00e-04 ema 0.999544 elapsed 227.6s


Ep 6/35 it 600 seen 38400 loss 4.0562 lr 3.00e-04 ema 0.999544 elapsed 341.8s


Ep 6/35 it 800 seen 51200 loss 4.2924 lr 3.00e-04 ema 0.999544 elapsed 454.6s


Ep 6/35 it 1000 seen 64000 loss 5.0904 lr 3.00e-04 ema 0.999544 elapsed 568.2s


Ep 6/35 it 1200 seen 76800 loss 2.1793 lr 3.00e-04 ema 0.999544 elapsed 681.0s


Ep 6/35 it 1400 seen 89600 loss 4.4109 lr 3.00e-04 ema 0.999544 elapsed 794.7s


Ep 6/35 it 1600 seen 102400 loss 4.1336 lr 3.00e-04 ema 0.999544 elapsed 907.4s


Ep 6/35 it 1800 seen 115200 loss 2.4608 lr 3.00e-04 ema 0.999544 elapsed 1021.1s


Ep 6/35 it 2000 seen 128000 loss 2.3146 lr 3.00e-04 ema 0.999544 elapsed 1134.9s


Ep 6/35 it 2200 seen 140800 loss 2.6958 lr 3.00e-04 ema 0.999544 elapsed 1247.8s


Ep 6/35 it 2400 seen 153600 loss 4.2520 lr 3.00e-04 ema 0.999544 elapsed 1361.5s


Ep 6/35 it 2600 seen 166400 loss 2.5628 lr 3.00e-04 ema 0.999544 elapsed 1474.2s


Ep 6/35 it 2800 seen 179200 loss 2.8046 lr 3.00e-04 ema 0.999544 elapsed 1588.1s


Ep 6/35 it 3000 seen 192000 loss 2.4306 lr 3.00e-04 ema 0.999544 elapsed 1701.9s


Ep 6/35 it 3200 seen 204800 loss 3.6446 lr 3.00e-04 ema 0.999544 elapsed 1814.6s


Ep 6/35 it 3400 seen 217600 loss 2.3992 lr 3.00e-04 ema 0.999544 elapsed 1928.3s


Epoch 6 val_acc(EMA) 0.7809 ep_time 1957.2s total 11751.2s


Saved EMA best to effv2m_320_ema.pth


Ep 7/35 it 200 seen 12800 loss 1.9602 lr 2.99e-04 ema 0.999553 elapsed 113.8s


Ep 7/35 it 400 seen 25600 loss 3.8764 lr 2.99e-04 ema 0.999553 elapsed 227.5s


Ep 7/35 it 600 seen 38400 loss 3.9796 lr 2.99e-04 ema 0.999553 elapsed 340.9s


Ep 7/35 it 800 seen 51200 loss 2.1194 lr 2.99e-04 ema 0.999553 elapsed 453.6s


Ep 7/35 it 1000 seen 64000 loss 2.1527 lr 2.99e-04 ema 0.999553 elapsed 567.1s


Ep 7/35 it 1200 seen 76800 loss 2.0395 lr 2.99e-04 ema 0.999553 elapsed 679.5s


Ep 7/35 it 1400 seen 89600 loss 2.8882 lr 2.99e-04 ema 0.999553 elapsed 793.0s


Ep 7/35 it 1600 seen 102400 loss 3.8699 lr 2.99e-04 ema 0.999553 elapsed 906.7s


Ep 7/35 it 1800 seen 115200 loss 2.3285 lr 2.99e-04 ema 0.999553 elapsed 1019.3s


Ep 7/35 it 2000 seen 128000 loss 2.3312 lr 2.99e-04 ema 0.999553 elapsed 1132.8s


Ep 7/35 it 2200 seen 140800 loss 1.9248 lr 2.99e-04 ema 0.999553 elapsed 1245.1s


Ep 7/35 it 2400 seen 153600 loss 3.3169 lr 2.99e-04 ema 0.999553 elapsed 1358.3s


Ep 7/35 it 2600 seen 166400 loss 2.5833 lr 2.99e-04 ema 0.999553 elapsed 1471.9s


Ep 7/35 it 2800 seen 179200 loss 1.9232 lr 2.99e-04 ema 0.999553 elapsed 1584.2s


Ep 7/35 it 3000 seen 192000 loss 4.1844 lr 2.99e-04 ema 0.999553 elapsed 1697.8s


Ep 7/35 it 3200 seen 204800 loss 4.0085 lr 2.99e-04 ema 0.999553 elapsed 1811.3s


Ep 7/35 it 3400 seen 217600 loss 1.9568 lr 2.99e-04 ema 0.999553 elapsed 1923.6s


Epoch 7 val_acc(EMA) 0.7866 ep_time 1952.5s total 13705.5s


Saved EMA best to effv2m_320_ema.pth


Ep 8/35 it 200 seen 12800 loss 1.8278 lr 2.97e-04 ema 0.999562 elapsed 114.8s


Ep 8/35 it 400 seen 25600 loss 2.2141 lr 2.97e-04 ema 0.999562 elapsed 227.7s


Ep 8/35 it 600 seen 38400 loss 3.7407 lr 2.97e-04 ema 0.999562 elapsed 341.6s


Ep 8/35 it 800 seen 51200 loss 2.0901 lr 2.97e-04 ema 0.999562 elapsed 455.0s


Ep 8/35 it 1000 seen 64000 loss 3.6507 lr 2.97e-04 ema 0.999562 elapsed 567.5s


Ep 8/35 it 1200 seen 76800 loss 1.9323 lr 2.97e-04 ema 0.999562 elapsed 681.1s


Ep 8/35 it 1400 seen 89600 loss 2.4760 lr 2.97e-04 ema 0.999562 elapsed 794.8s


Ep 8/35 it 1600 seen 102400 loss 3.3395 lr 2.97e-04 ema 0.999562 elapsed 907.3s


Ep 8/35 it 1800 seen 115200 loss 3.7181 lr 2.97e-04 ema 0.999562 elapsed 1020.7s


Ep 8/35 it 2000 seen 128000 loss 1.9927 lr 2.97e-04 ema 0.999562 elapsed 1133.4s


Ep 8/35 it 2200 seen 140800 loss 1.9777 lr 2.97e-04 ema 0.999562 elapsed 1247.1s


Ep 8/35 it 2400 seen 153600 loss 2.2467 lr 2.97e-04 ema 0.999562 elapsed 1360.4s


Ep 8/35 it 2600 seen 166400 loss 2.1807 lr 2.97e-04 ema 0.999562 elapsed 1472.9s


Ep 8/35 it 2800 seen 179200 loss 2.8052 lr 2.97e-04 ema 0.999562 elapsed 1586.3s


Ep 8/35 it 3000 seen 192000 loss 1.9633 lr 2.97e-04 ema 0.999562 elapsed 1699.1s


Ep 8/35 it 3200 seen 204800 loss 3.7729 lr 2.97e-04 ema 0.999562 elapsed 1812.9s


Ep 8/35 it 3400 seen 217600 loss 4.7345 lr 2.97e-04 ema 0.999562 elapsed 1925.5s


Epoch 8 val_acc(EMA) 0.7958 ep_time 1954.4s total 15661.7s


Saved EMA best to effv2m_320_ema.pth


Ep 9/35 it 200 seen 12800 loss 1.9945 lr 2.93e-04 ema 0.999571 elapsed 113.8s


Ep 9/35 it 400 seen 25600 loss 1.9753 lr 2.93e-04 ema 0.999571 elapsed 227.6s


Ep 9/35 it 600 seen 38400 loss 3.9980 lr 2.93e-04 ema 0.999571 elapsed 340.7s


Ep 9/35 it 800 seen 51200 loss 2.2013 lr 2.93e-04 ema 0.999571 elapsed 453.4s


Ep 9/35 it 1000 seen 64000 loss 2.1461 lr 2.93e-04 ema 0.999571 elapsed 566.6s


Ep 9/35 it 1200 seen 76800 loss 2.1419 lr 2.93e-04 ema 0.999571 elapsed 679.1s


Ep 9/35 it 1400 seen 89600 loss 2.0889 lr 2.93e-04 ema 0.999571 elapsed 792.5s


Ep 9/35 it 1600 seen 102400 loss 2.0216 lr 2.93e-04 ema 0.999571 elapsed 904.9s


Ep 9/35 it 1800 seen 115200 loss 2.2653 lr 2.93e-04 ema 0.999571 elapsed 1018.3s


Ep 9/35 it 2000 seen 128000 loss 3.1770 lr 2.93e-04 ema 0.999571 elapsed 1131.8s


Ep 9/35 it 2200 seen 140800 loss 3.5830 lr 2.93e-04 ema 0.999571 elapsed 1244.1s


Ep 9/35 it 2400 seen 153600 loss 2.1972 lr 2.93e-04 ema 0.999571 elapsed 1357.5s


Ep 9/35 it 2600 seen 166400 loss 2.0750 lr 2.93e-04 ema 0.999571 elapsed 1470.0s


Ep 9/35 it 2800 seen 179200 loss 2.7491 lr 2.93e-04 ema 0.999571 elapsed 1583.4s


Ep 9/35 it 3000 seen 192000 loss 2.0540 lr 2.93e-04 ema 0.999571 elapsed 1696.0s


Ep 9/35 it 3200 seen 204800 loss 3.7552 lr 2.93e-04 ema 0.999571 elapsed 1809.4s


Ep 9/35 it 3400 seen 217600 loss 2.6245 lr 2.93e-04 ema 0.999571 elapsed 1922.6s


Epoch 9 val_acc(EMA) 0.7991 ep_time 1951.6s total 17616.3s


Saved EMA best to effv2m_320_ema.pth


Ep 10/35 it 200 seen 12800 loss 1.9914 lr 2.87e-04 ema 0.999579 elapsed 113.5s


Ep 10/35 it 400 seen 25600 loss 2.0108 lr 2.87e-04 ema 0.999579 elapsed 227.4s


Ep 10/35 it 600 seen 38400 loss 1.7529 lr 2.87e-04 ema 0.999579 elapsed 340.1s


Ep 10/35 it 800 seen 51200 loss 2.3158 lr 2.87e-04 ema 0.999579 elapsed 453.7s


Ep 10/35 it 1000 seen 64000 loss 3.7305 lr 2.87e-04 ema 0.999579 elapsed 567.3s


Ep 10/35 it 1200 seen 76800 loss 1.9247 lr 2.87e-04 ema 0.999579 elapsed 679.9s


Ep 10/35 it 1400 seen 89600 loss 1.9293 lr 2.87e-04 ema 0.999579 elapsed 793.7s


Ep 10/35 it 1600 seen 102400 loss 2.3394 lr 2.87e-04 ema 0.999579 elapsed 906.1s


Ep 10/35 it 1800 seen 115200 loss 1.9342 lr 2.87e-04 ema 0.999579 elapsed 1019.5s


Ep 10/35 it 2000 seen 128000 loss 2.7111 lr 2.87e-04 ema 0.999579 elapsed 1133.4s


Ep 10/35 it 2200 seen 140800 loss 1.8178 lr 2.87e-04 ema 0.999579 elapsed 1246.0s


Ep 10/35 it 2400 seen 153600 loss 3.8475 lr 2.87e-04 ema 0.999579 elapsed 1359.2s


Ep 10/35 it 2600 seen 166400 loss 3.7364 lr 2.87e-04 ema 0.999579 elapsed 1472.0s


Ep 10/35 it 2800 seen 179200 loss 3.8618 lr 2.87e-04 ema 0.999579 elapsed 1585.8s


Ep 10/35 it 3000 seen 192000 loss 3.8206 lr 2.87e-04 ema 0.999579 elapsed 1699.3s


Ep 10/35 it 3200 seen 204800 loss 4.2853 lr 2.87e-04 ema 0.999579 elapsed 1812.0s


Ep 10/35 it 3400 seen 217600 loss 1.9747 lr 2.87e-04 ema 0.999579 elapsed 1925.6s


Epoch 10 val_acc(EMA) 0.8028 ep_time 1954.5s total 19572.8s


Saved EMA best to effv2m_320_ema.pth


Saved periodic EMA to effv2m_320_ema_ep10.pth


Ep 11/35 it 200 seen 12800 loss 2.2844 lr 2.80e-04 ema 0.999588 elapsed 113.6s


Ep 11/35 it 400 seen 25600 loss 2.1195 lr 2.80e-04 ema 0.999588 elapsed 227.1s


Ep 11/35 it 600 seen 38400 loss 2.1235 lr 2.80e-04 ema 0.999588 elapsed 339.3s


Ep 11/35 it 800 seen 51200 loss 3.4267 lr 2.80e-04 ema 0.999588 elapsed 453.0s


Ep 11/35 it 1000 seen 64000 loss 4.4211 lr 2.80e-04 ema 0.999588 elapsed 566.4s


Ep 11/35 it 1200 seen 76800 loss 2.6195 lr 2.80e-04 ema 0.999588 elapsed 678.7s


Ep 11/35 it 1400 seen 89600 loss 1.8978 lr 2.80e-04 ema 0.999588 elapsed 791.9s


Ep 11/35 it 1600 seen 102400 loss 1.5801 lr 2.80e-04 ema 0.999588 elapsed 904.7s


Ep 11/35 it 1800 seen 115200 loss 1.8795 lr 2.80e-04 ema 0.999588 elapsed 1018.2s


Ep 11/35 it 2000 seen 128000 loss 4.1125 lr 2.80e-04 ema 0.999588 elapsed 1131.4s


Ep 11/35 it 2200 seen 140800 loss 3.4675 lr 2.80e-04 ema 0.999588 elapsed 1243.8s


Ep 11/35 it 2400 seen 153600 loss 2.0959 lr 2.80e-04 ema 0.999588 elapsed 1357.5s


Ep 11/35 it 2600 seen 166400 loss 2.0595 lr 2.80e-04 ema 0.999588 elapsed 1470.8s


Ep 11/35 it 2800 seen 179200 loss 1.9688 lr 2.80e-04 ema 0.999588 elapsed 1583.2s


Ep 11/35 it 3000 seen 192000 loss 2.1600 lr 2.80e-04 ema 0.999588 elapsed 1696.6s


Ep 11/35 it 3200 seen 204800 loss 3.6945 lr 2.80e-04 ema 0.999588 elapsed 1809.3s


Ep 11/35 it 3400 seen 217600 loss 3.9803 lr 2.80e-04 ema 0.999588 elapsed 1922.8s


Epoch 11 val_acc(EMA) 0.8042 ep_time 1951.6s total 21526.5s


Saved EMA best to effv2m_320_ema.pth


Ep 12/35 it 200 seen 12800 loss 2.1531 lr 2.71e-04 ema 0.999597 elapsed 114.5s


Ep 12/35 it 400 seen 25600 loss 2.8195 lr 2.71e-04 ema 0.999597 elapsed 227.3s


Ep 12/35 it 600 seen 38400 loss 3.7191 lr 2.71e-04 ema 0.999597 elapsed 341.1s


Ep 12/35 it 800 seen 51200 loss 1.7685 lr 2.71e-04 ema 0.999597 elapsed 453.8s


Ep 12/35 it 1000 seen 64000 loss 1.9690 lr 2.71e-04 ema 0.999597 elapsed 567.0s


Ep 12/35 it 1200 seen 76800 loss 1.8982 lr 2.71e-04 ema 0.999597 elapsed 679.7s


Ep 12/35 it 1400 seen 89600 loss 1.9585 lr 2.71e-04 ema 0.999597 elapsed 793.5s


Ep 12/35 it 1600 seen 102400 loss 1.8985 lr 2.71e-04 ema 0.999597 elapsed 905.8s


Ep 12/35 it 1800 seen 115200 loss 2.9948 lr 2.71e-04 ema 0.999597 elapsed 1018.9s


Ep 12/35 it 2000 seen 128000 loss 2.1604 lr 2.71e-04 ema 0.999597 elapsed 1131.7s


Ep 12/35 it 2200 seen 140800 loss 1.8994 lr 2.71e-04 ema 0.999597 elapsed 1245.4s


Ep 12/35 it 2400 seen 153600 loss 2.2001 lr 2.71e-04 ema 0.999597 elapsed 1358.8s


Ep 12/35 it 2600 seen 166400 loss 3.9950 lr 2.71e-04 ema 0.999597 elapsed 1472.1s


Ep 12/35 it 2800 seen 179200 loss 2.0086 lr 2.71e-04 ema 0.999597 elapsed 1584.9s


Ep 12/35 it 3000 seen 192000 loss 1.9410 lr 2.71e-04 ema 0.999597 elapsed 1698.6s


Ep 12/35 it 3200 seen 204800 loss 3.7179 lr 2.71e-04 ema 0.999597 elapsed 1810.9s


Ep 12/35 it 3400 seen 217600 loss 1.8674 lr 2.71e-04 ema 0.999597 elapsed 1924.1s


Epoch 12 val_acc(EMA) 0.8093 ep_time 1953.1s total 23483.7s


Saved EMA best to effv2m_320_ema.pth


Ep 13/35 it 200 seen 12800 loss 1.7705 lr 2.62e-04 ema 0.999606 elapsed 114.2s


Ep 13/35 it 400 seen 25600 loss 3.9027 lr 2.62e-04 ema 0.999606 elapsed 228.5s


Ep 13/35 it 600 seen 38400 loss 3.7888 lr 2.62e-04 ema 0.999606 elapsed 341.9s


Ep 13/35 it 800 seen 51200 loss 2.1093 lr 2.62e-04 ema 0.999606 elapsed 454.5s


Ep 13/35 it 1000 seen 64000 loss 3.8567 lr 2.62e-04 ema 0.999606 elapsed 568.4s


Ep 13/35 it 1200 seen 76800 loss 3.2236 lr 2.62e-04 ema 0.999606 elapsed 681.1s


Ep 13/35 it 1400 seen 89600 loss 2.0888 lr 2.62e-04 ema 0.999606 elapsed 794.6s


Ep 13/35 it 1600 seen 102400 loss 3.3493 lr 2.62e-04 ema 0.999606 elapsed 908.1s


Ep 13/35 it 1800 seen 115200 loss 2.0063 lr 2.62e-04 ema 0.999606 elapsed 1021.2s


Ep 13/35 it 2000 seen 128000 loss 4.7517 lr 2.62e-04 ema 0.999606 elapsed 1134.5s


Ep 13/35 it 2200 seen 140800 loss 2.0427 lr 2.62e-04 ema 0.999606 elapsed 1247.0s


Ep 13/35 it 2400 seen 153600 loss 1.6880 lr 2.62e-04 ema 0.999606 elapsed 1360.7s


Ep 13/35 it 2600 seen 166400 loss 2.0772 lr 2.62e-04 ema 0.999606 elapsed 1473.7s


Ep 13/35 it 2800 seen 179200 loss 3.7020 lr 2.62e-04 ema 0.999606 elapsed 1587.1s


Ep 13/35 it 3000 seen 192000 loss 3.7981 lr 2.62e-04 ema 0.999606 elapsed 1700.7s


Ep 13/35 it 3200 seen 204800 loss 3.4432 lr 2.62e-04 ema 0.999606 elapsed 1813.5s


Ep 13/35 it 3400 seen 217600 loss 1.7283 lr 2.62e-04 ema 0.999606 elapsed 1927.2s


Epoch 13 val_acc(EMA) 0.8121 ep_time 1956.1s total 25441.8s


Saved EMA best to effv2m_320_ema.pth


Ep 14/35 it 200 seen 12800 loss 2.1159 lr 2.51e-04 ema 0.999615 elapsed 114.8s


Ep 14/35 it 400 seen 25600 loss 1.5453 lr 2.51e-04 ema 0.999615 elapsed 227.5s


Ep 14/35 it 600 seen 38400 loss 1.8760 lr 2.51e-04 ema 0.999615 elapsed 341.5s


Ep 14/35 it 800 seen 51200 loss 1.6982 lr 2.51e-04 ema 0.999615 elapsed 454.3s


Ep 14/35 it 1000 seen 64000 loss 3.7951 lr 2.51e-04 ema 0.999615 elapsed 567.8s


Ep 14/35 it 1200 seen 76800 loss 2.6142 lr 2.51e-04 ema 0.999615 elapsed 681.2s


Ep 14/35 it 1400 seen 89600 loss 2.7710 lr 2.51e-04 ema 0.999615 elapsed 794.2s


Ep 14/35 it 1600 seen 102400 loss 1.8945 lr 2.51e-04 ema 0.999615 elapsed 907.8s


Ep 14/35 it 1800 seen 115200 loss 1.8031 lr 2.51e-04 ema 0.999615 elapsed 1021.3s


Ep 14/35 it 2000 seen 128000 loss 2.0882 lr 2.51e-04 ema 0.999615 elapsed 1133.9s


Ep 14/35 it 2200 seen 140800 loss 2.4734 lr 2.51e-04 ema 0.999615 elapsed 1247.5s


Ep 14/35 it 2400 seen 153600 loss 1.8301 lr 2.51e-04 ema 0.999615 elapsed 1360.3s


Ep 14/35 it 2600 seen 166400 loss 1.7054 lr 2.51e-04 ema 0.999615 elapsed 1473.8s


Ep 14/35 it 2800 seen 179200 loss 1.8023 lr 2.51e-04 ema 0.999615 elapsed 1586.3s


Ep 14/35 it 3000 seen 192000 loss 2.7363 lr 2.51e-04 ema 0.999615 elapsed 1700.2s


Ep 14/35 it 3200 seen 204800 loss 1.8024 lr 2.51e-04 ema 0.999615 elapsed 1813.9s


Ep 14/35 it 3400 seen 217600 loss 1.7692 lr 2.51e-04 ema 0.999615 elapsed 1926.4s


Epoch 14 val_acc(EMA) 0.8164 ep_time 1955.4s total 27398.8s


Saved EMA best to effv2m_320_ema.pth


Ep 15/35 it 200 seen 12800 loss 3.7154 lr 2.38e-04 ema 0.999624 elapsed 115.3s


Ep 15/35 it 400 seen 25600 loss 1.9620 lr 2.38e-04 ema 0.999624 elapsed 229.3s


Ep 15/35 it 600 seen 38400 loss 2.3862 lr 2.38e-04 ema 0.999624 elapsed 341.8s


Ep 15/35 it 800 seen 51200 loss 3.5010 lr 2.38e-04 ema 0.999624 elapsed 455.3s


Ep 15/35 it 1000 seen 64000 loss 1.7360 lr 2.38e-04 ema 0.999624 elapsed 568.9s


Ep 15/35 it 1200 seen 76800 loss 2.4864 lr 2.38e-04 ema 0.999624 elapsed 681.6s


Ep 15/35 it 1400 seen 89600 loss 4.1559 lr 2.38e-04 ema 0.999624 elapsed 795.0s


Ep 15/35 it 1600 seen 102400 loss 2.9865 lr 2.38e-04 ema 0.999624 elapsed 907.6s


Ep 15/35 it 1800 seen 115200 loss 3.4023 lr 2.38e-04 ema 0.999624 elapsed 1021.4s


Ep 15/35 it 2000 seen 128000 loss 1.7250 lr 2.38e-04 ema 0.999624 elapsed 1135.1s


Ep 15/35 it 2200 seen 140800 loss 2.1245 lr 2.38e-04 ema 0.999624 elapsed 1247.8s


Ep 15/35 it 2400 seen 153600 loss 2.0426 lr 2.38e-04 ema 0.999624 elapsed 1361.4s


Ep 15/35 it 2600 seen 166400 loss 2.4513 lr 2.38e-04 ema 0.999624 elapsed 1474.2s


Ep 15/35 it 2800 seen 179200 loss 1.8625 lr 2.38e-04 ema 0.999624 elapsed 1588.0s


Ep 15/35 it 3000 seen 192000 loss 1.9238 lr 2.38e-04 ema 0.999624 elapsed 1701.6s


Ep 15/35 it 3200 seen 204800 loss 1.8882 lr 2.38e-04 ema 0.999624 elapsed 1814.2s


Ep 15/35 it 3400 seen 217600 loss 4.3440 lr 2.38e-04 ema 0.999624 elapsed 1928.0s


Epoch 15 val_acc(EMA) 0.8202 ep_time 1957.1s total 29358.0s


Saved EMA best to effv2m_320_ema.pth


Saved periodic EMA to effv2m_320_ema_ep15.pth


Ep 16/35 it 200 seen 12800 loss 1.7928 lr 2.25e-04 ema 0.999632 elapsed 113.7s


Ep 16/35 it 400 seen 25600 loss 1.6377 lr 2.25e-04 ema 0.999632 elapsed 227.7s


Ep 16/35 it 600 seen 38400 loss 3.7989 lr 2.25e-04 ema 0.999632 elapsed 341.2s


Ep 16/35 it 800 seen 51200 loss 2.3431 lr 2.25e-04 ema 0.999632 elapsed 453.8s


Ep 16/35 it 1000 seen 64000 loss 1.7669 lr 2.25e-04 ema 0.999632 elapsed 567.5s


Ep 16/35 it 1200 seen 76800 loss 1.7873 lr 2.25e-04 ema 0.999632 elapsed 679.9s


Ep 16/35 it 1400 seen 89600 loss 3.6389 lr 2.25e-04 ema 0.999632 elapsed 793.4s


Ep 16/35 it 1600 seen 102400 loss 2.3796 lr 2.25e-04 ema 0.999632 elapsed 906.3s


Ep 16/35 it 1800 seen 115200 loss 1.8883 lr 2.25e-04 ema 0.999632 elapsed 1019.8s


Ep 16/35 it 2000 seen 128000 loss 1.9354 lr 2.25e-04 ema 0.999632 elapsed 1133.4s


Ep 16/35 it 2200 seen 140800 loss 2.4794 lr 2.25e-04 ema 0.999632 elapsed 1246.1s


Ep 16/35 it 2400 seen 153600 loss 1.8596 lr 2.25e-04 ema 0.999632 elapsed 1359.8s


Ep 16/35 it 2600 seen 166400 loss 1.9171 lr 2.25e-04 ema 0.999632 elapsed 1472.2s


Ep 16/35 it 2800 seen 179200 loss 2.7669 lr 2.25e-04 ema 0.999632 elapsed 1585.8s


Ep 16/35 it 3000 seen 192000 loss 2.0258 lr 2.25e-04 ema 0.999632 elapsed 1699.3s


Ep 16/35 it 3200 seen 204800 loss 2.4222 lr 2.25e-04 ema 0.999632 elapsed 1812.1s


Ep 16/35 it 3400 seen 217600 loss 3.3081 lr 2.25e-04 ema 0.999632 elapsed 1925.4s


Epoch 16 val_acc(EMA) 0.8221 ep_time 1954.4s total 31314.6s


Saved EMA best to effv2m_320_ema.pth


Ep 17/35 it 200 seen 12800 loss 3.1646 lr 2.11e-04 ema 0.999641 elapsed 114.9s


Ep 17/35 it 400 seen 25600 loss 1.7421 lr 2.11e-04 ema 0.999641 elapsed 227.8s


Ep 17/35 it 600 seen 38400 loss 1.8406 lr 2.11e-04 ema 0.999641 elapsed 341.2s


Ep 17/35 it 800 seen 51200 loss 3.4851 lr 2.11e-04 ema 0.999641 elapsed 453.8s


Ep 17/35 it 1000 seen 64000 loss 1.6973 lr 2.11e-04 ema 0.999641 elapsed 567.2s


Ep 17/35 it 1200 seen 76800 loss 1.9235 lr 2.11e-04 ema 0.999641 elapsed 681.0s


Ep 17/35 it 1400 seen 89600 loss 3.2707 lr 2.11e-04 ema 0.999641 elapsed 793.7s


Ep 17/35 it 1600 seen 102400 loss 1.7985 lr 2.11e-04 ema 0.999641 elapsed 907.0s


Ep 17/35 it 1800 seen 115200 loss 3.7281 lr 2.11e-04 ema 0.999641 elapsed 1019.6s


Ep 17/35 it 2000 seen 128000 loss 3.4163 lr 2.11e-04 ema 0.999641 elapsed 1133.2s


Ep 17/35 it 2200 seen 140800 loss 3.5478 lr 2.11e-04 ema 0.999641 elapsed 1245.9s


Ep 17/35 it 2400 seen 153600 loss 2.4573 lr 2.11e-04 ema 0.999641 elapsed 1359.1s


Ep 17/35 it 2600 seen 166400 loss 2.7563 lr 2.11e-04 ema 0.999641 elapsed 1472.8s


Ep 17/35 it 2800 seen 179200 loss 3.4575 lr 2.11e-04 ema 0.999641 elapsed 1585.5s


Ep 17/35 it 3000 seen 192000 loss 2.1555 lr 2.11e-04 ema 0.999641 elapsed 1699.0s


Ep 17/35 it 3200 seen 204800 loss 2.1834 lr 2.11e-04 ema 0.999641 elapsed 1811.5s


Ep 17/35 it 3400 seen 217600 loss 1.6901 lr 2.11e-04 ema 0.999641 elapsed 1924.8s


Epoch 17 val_acc(EMA) 0.8214 ep_time 1953.8s total 33270.3s


Ep 18/35 it 200 seen 12800 loss 1.9851 lr 1.97e-04 ema 0.999650 elapsed 113.8s


Ep 18/35 it 400 seen 25600 loss 2.6755 lr 1.97e-04 ema 0.999650 elapsed 227.6s


Ep 18/35 it 600 seen 38400 loss 1.6569 lr 1.97e-04 ema 0.999650 elapsed 341.0s


Ep 18/35 it 800 seen 51200 loss 3.4843 lr 1.97e-04 ema 0.999650 elapsed 453.9s


Ep 18/35 it 1000 seen 64000 loss 3.7019 lr 1.97e-04 ema 0.999650 elapsed 567.2s


Ep 18/35 it 1200 seen 76800 loss 2.9957 lr 1.97e-04 ema 0.999650 elapsed 679.7s


Ep 18/35 it 1400 seen 89600 loss 3.3761 lr 1.97e-04 ema 0.999650 elapsed 793.3s


Ep 18/35 it 1600 seen 102400 loss 3.4619 lr 1.97e-04 ema 0.999650 elapsed 906.1s


Ep 18/35 it 1800 seen 115200 loss 3.7320 lr 1.97e-04 ema 0.999650 elapsed 1019.4s


Ep 18/35 it 2000 seen 128000 loss 1.8891 lr 1.97e-04 ema 0.999650 elapsed 1132.7s


Ep 18/35 it 2200 seen 140800 loss 1.7909 lr 1.97e-04 ema 0.999650 elapsed 1245.3s


Ep 18/35 it 2400 seen 153600 loss 3.4906 lr 1.97e-04 ema 0.999650 elapsed 1358.9s


Ep 18/35 it 2600 seen 166400 loss 1.9359 lr 1.97e-04 ema 0.999650 elapsed 1472.2s


Ep 18/35 it 2800 seen 179200 loss 1.4644 lr 1.97e-04 ema 0.999650 elapsed 1584.8s


Ep 18/35 it 3000 seen 192000 loss 3.5482 lr 1.97e-04 ema 0.999650 elapsed 1698.2s


Ep 18/35 it 3200 seen 204800 loss 1.9409 lr 1.97e-04 ema 0.999650 elapsed 1810.9s


Ep 18/35 it 3400 seen 217600 loss 1.7371 lr 1.97e-04 ema 0.999650 elapsed 1924.3s


Epoch 18 val_acc(EMA) 0.8263 ep_time 1953.3s total 35225.4s


Saved EMA best to effv2m_320_ema.pth


Ep 19/35 it 200 seen 12800 loss 1.6387 lr 1.82e-04 ema 0.999659 elapsed 114.8s


Ep 19/35 it 400 seen 25600 loss 1.7075 lr 1.82e-04 ema 0.999659 elapsed 227.8s


Ep 19/35 it 600 seen 38400 loss 3.6655 lr 1.82e-04 ema 0.999659 elapsed 341.2s


Ep 19/35 it 800 seen 51200 loss 2.4727 lr 1.82e-04 ema 0.999659 elapsed 453.8s


Ep 19/35 it 1000 seen 64000 loss 3.4182 lr 1.82e-04 ema 0.999659 elapsed 567.5s


Ep 19/35 it 1200 seen 76800 loss 2.1189 lr 1.82e-04 ema 0.999659 elapsed 680.1s


Ep 19/35 it 1400 seen 89600 loss 2.0583 lr 1.82e-04 ema 0.999659 elapsed 793.7s


Ep 19/35 it 1600 seen 102400 loss 1.8109 lr 1.82e-04 ema 0.999659 elapsed 907.2s


Ep 19/35 it 1800 seen 115200 loss 2.2743 lr 1.82e-04 ema 0.999659 elapsed 1019.8s


Ep 19/35 it 2000 seen 128000 loss 2.1465 lr 1.82e-04 ema 0.999659 elapsed 1133.4s


Ep 19/35 it 2200 seen 140800 loss 2.9512 lr 1.82e-04 ema 0.999659 elapsed 1246.2s


Ep 19/35 it 2400 seen 153600 loss 3.5792 lr 1.82e-04 ema 0.999659 elapsed 1359.8s


Ep 19/35 it 2600 seen 166400 loss 3.2093 lr 1.82e-04 ema 0.999659 elapsed 1473.6s


Ep 19/35 it 2800 seen 179200 loss 3.6112 lr 1.82e-04 ema 0.999659 elapsed 1586.3s


Ep 19/35 it 3000 seen 192000 loss 3.3841 lr 1.82e-04 ema 0.999659 elapsed 1699.9s


Ep 19/35 it 3200 seen 204800 loss 1.8289 lr 1.82e-04 ema 0.999659 elapsed 1813.4s


Ep 19/35 it 3400 seen 217600 loss 3.1599 lr 1.82e-04 ema 0.999659 elapsed 1926.2s


Epoch 19 val_acc(EMA) 0.8277 ep_time 1955.2s total 37184.7s


Saved EMA best to effv2m_320_ema.pth


Ep 20/35 it 200 seen 12800 loss 1.7888 lr 1.66e-04 ema 0.999668 elapsed 115.0s


Ep 20/35 it 400 seen 25600 loss 1.6857 lr 1.66e-04 ema 0.999668 elapsed 227.7s


Ep 20/35 it 600 seen 38400 loss 3.5817 lr 1.66e-04 ema 0.999668 elapsed 341.5s


Ep 20/35 it 800 seen 51200 loss 3.4971 lr 1.66e-04 ema 0.999668 elapsed 455.1s


Ep 20/35 it 1000 seen 64000 loss 3.1440 lr 1.66e-04 ema 0.999668 elapsed 568.0s


Ep 20/35 it 1200 seen 76800 loss 1.6909 lr 1.66e-04 ema 0.999668 elapsed 681.7s


Ep 20/35 it 1400 seen 89600 loss 1.5648 lr 1.66e-04 ema 0.999668 elapsed 795.3s


Ep 20/35 it 1600 seen 102400 loss 2.5638 lr 1.66e-04 ema 0.999668 elapsed 908.1s


Ep 20/35 it 1800 seen 115200 loss 2.0217 lr 1.66e-04 ema 0.999668 elapsed 1021.8s


Ep 20/35 it 2000 seen 128000 loss 4.7377 lr 1.66e-04 ema 0.999668 elapsed 1134.7s


Ep 20/35 it 2200 seen 140800 loss 3.3271 lr 1.66e-04 ema 0.999668 elapsed 1248.5s


Ep 20/35 it 2400 seen 153600 loss 3.3036 lr 1.66e-04 ema 0.999668 elapsed 1361.0s


Ep 20/35 it 2600 seen 166400 loss 1.6438 lr 1.66e-04 ema 0.999668 elapsed 1474.6s


Ep 20/35 it 2800 seen 179200 loss 2.9069 lr 1.66e-04 ema 0.999668 elapsed 1588.2s


Ep 20/35 it 3000 seen 192000 loss 1.6370 lr 1.66e-04 ema 0.999668 elapsed 1701.1s


Ep 20/35 it 3200 seen 204800 loss 3.5830 lr 1.66e-04 ema 0.999668 elapsed 1814.7s


Ep 20/35 it 3400 seen 217600 loss 1.5212 lr 1.66e-04 ema 0.999668 elapsed 1927.1s


Epoch 20 val_acc(EMA) 0.8295 ep_time 1956.1s total 39142.8s


Saved EMA best to effv2m_320_ema.pth


Saved periodic EMA to effv2m_320_ema_ep20.pth


Ep 21/35 it 200 seen 12800 loss 3.1852 lr 1.50e-04 ema 0.999676 elapsed 115.0s


Ep 21/35 it 400 seen 25600 loss 1.7841 lr 1.50e-04 ema 0.999676 elapsed 228.0s


Ep 21/35 it 600 seen 38400 loss 1.5528 lr 1.50e-04 ema 0.999676 elapsed 341.5s


Ep 21/35 it 800 seen 51200 loss 1.5762 lr 1.50e-04 ema 0.999676 elapsed 454.1s


Ep 21/35 it 1000 seen 64000 loss 1.5722 lr 1.50e-04 ema 0.999676 elapsed 567.9s


Ep 21/35 it 1200 seen 76800 loss 2.5107 lr 1.50e-04 ema 0.999676 elapsed 681.4s


Ep 21/35 it 1400 seen 89600 loss 1.4472 lr 1.50e-04 ema 0.999676 elapsed 793.7s


Ep 21/35 it 1600 seen 102400 loss 4.2933 lr 1.50e-04 ema 0.999676 elapsed 907.2s


Ep 21/35 it 1800 seen 115200 loss 1.6673 lr 1.50e-04 ema 0.999676 elapsed 1019.9s


Ep 21/35 it 2000 seen 128000 loss 1.6748 lr 1.50e-04 ema 0.999676 elapsed 1133.7s


Ep 21/35 it 2200 seen 140800 loss 3.5652 lr 1.50e-04 ema 0.999676 elapsed 1247.3s


Ep 21/35 it 2400 seen 153600 loss 2.0351 lr 1.50e-04 ema 0.999676 elapsed 1360.1s


Ep 21/35 it 2600 seen 166400 loss 1.5260 lr 1.50e-04 ema 0.999676 elapsed 1473.8s


Ep 21/35 it 2800 seen 179200 loss 2.1133 lr 1.50e-04 ema 0.999676 elapsed 1586.5s


Ep 21/35 it 3000 seen 192000 loss 2.1102 lr 1.50e-04 ema 0.999676 elapsed 1699.6s


Ep 21/35 it 3200 seen 204800 loss 3.4510 lr 1.50e-04 ema 0.999676 elapsed 1813.2s


Ep 21/35 it 3400 seen 217600 loss 2.9254 lr 1.50e-04 ema 0.999676 elapsed 1925.7s


Epoch 21 val_acc(EMA) 0.8306 ep_time 1954.6s total 41101.4s


Saved EMA best to effv2m_320_ema.pth


Ep 22/35 it 200 seen 12800 loss 2.5500 lr 1.35e-04 ema 0.999685 elapsed 114.8s


Ep 22/35 it 400 seen 25600 loss 1.7718 lr 1.35e-04 ema 0.999685 elapsed 227.5s


Ep 22/35 it 600 seen 38400 loss 4.1240 lr 1.35e-04 ema 0.999685 elapsed 341.6s


Ep 22/35 it 800 seen 51200 loss 1.5318 lr 1.35e-04 ema 0.999685 elapsed 454.5s


Ep 22/35 it 1000 seen 64000 loss 1.5888 lr 1.35e-04 ema 0.999685 elapsed 567.8s


Ep 22/35 it 1200 seen 76800 loss 1.5956 lr 1.35e-04 ema 0.999685 elapsed 680.2s


Ep 22/35 it 1400 seen 89600 loss 1.7276 lr 1.35e-04 ema 0.999685 elapsed 794.2s


Ep 22/35 it 1600 seen 102400 loss 2.7838 lr 1.35e-04 ema 0.999685 elapsed 907.9s


Ep 22/35 it 1800 seen 115200 loss 4.0614 lr 1.35e-04 ema 0.999685 elapsed 1020.3s


Ep 22/35 it 2000 seen 128000 loss 1.5210 lr 1.35e-04 ema 0.999685 elapsed 1133.9s


Ep 22/35 it 2200 seen 140800 loss 1.4935 lr 1.35e-04 ema 0.999685 elapsed 1247.7s


Ep 22/35 it 2400 seen 153600 loss 1.6186 lr 1.35e-04 ema 0.999685 elapsed 1360.4s


Ep 22/35 it 2600 seen 166400 loss 1.6308 lr 1.35e-04 ema 0.999685 elapsed 1473.7s


Ep 22/35 it 2800 seen 179200 loss 1.6225 lr 1.35e-04 ema 0.999685 elapsed 1586.1s


Ep 22/35 it 3000 seen 192000 loss 3.5487 lr 1.35e-04 ema 0.999685 elapsed 1699.8s


Ep 22/35 it 3200 seen 204800 loss 3.2331 lr 1.35e-04 ema 0.999685 elapsed 1813.6s


Ep 22/35 it 3400 seen 217600 loss 2.1693 lr 1.35e-04 ema 0.999685 elapsed 1926.1s


Epoch 22 val_acc(EMA) 0.8340 ep_time 1954.9s total 43058.1s


Saved EMA best to effv2m_320_ema.pth


Ep 23/35 it 200 seen 12800 loss 1.3607 lr 1.19e-04 ema 0.999694 elapsed 114.8s


Ep 23/35 it 400 seen 25600 loss 2.9662 lr 1.19e-04 ema 0.999694 elapsed 228.8s


Ep 23/35 it 600 seen 38400 loss 3.2277 lr 1.19e-04 ema 0.999694 elapsed 341.7s


Ep 23/35 it 800 seen 51200 loss 2.1880 lr 1.19e-04 ema 0.999694 elapsed 455.1s


Ep 23/35 it 1000 seen 64000 loss 1.4443 lr 1.19e-04 ema 0.999694 elapsed 568.0s


Ep 23/35 it 1200 seen 76800 loss 1.5785 lr 1.19e-04 ema 0.999694 elapsed 682.0s


Ep 23/35 it 1400 seen 89600 loss 1.5061 lr 1.19e-04 ema 0.999694 elapsed 795.6s


Ep 23/35 it 1600 seen 102400 loss 1.9141 lr 1.19e-04 ema 0.999694 elapsed 908.2s


Ep 23/35 it 1800 seen 115200 loss 1.5142 lr 1.19e-04 ema 0.999694 elapsed 1021.7s


Ep 23/35 it 2000 seen 128000 loss 2.0645 lr 1.19e-04 ema 0.999694 elapsed 1135.7s


Ep 23/35 it 2200 seen 140800 loss 3.8599 lr 1.19e-04 ema 0.999694 elapsed 1248.4s


Ep 23/35 it 2400 seen 153600 loss 3.5283 lr 1.19e-04 ema 0.999694 elapsed 1361.7s


Ep 23/35 it 2600 seen 166400 loss 1.5588 lr 1.19e-04 ema 0.999694 elapsed 1474.5s


Ep 23/35 it 2800 seen 179200 loss 1.6875 lr 1.19e-04 ema 0.999694 elapsed 1588.2s


Ep 23/35 it 3000 seen 192000 loss 1.4477 lr 1.19e-04 ema 0.999694 elapsed 1701.4s


Ep 23/35 it 3200 seen 204800 loss 3.3090 lr 1.19e-04 ema 0.999694 elapsed 1814.0s


Ep 23/35 it 3400 seen 217600 loss 1.7276 lr 1.19e-04 ema 0.999694 elapsed 1927.5s


Epoch 23 val_acc(EMA) 0.8389 ep_time 1956.4s total 45016.5s


Saved EMA best to effv2m_320_ema.pth


Ep 24/35 it 200 seen 12800 loss 3.1454 lr 1.04e-04 ema 0.999703 elapsed 113.9s


Ep 24/35 it 400 seen 25600 loss 1.5950 lr 1.04e-04 ema 0.999703 elapsed 227.5s


Ep 24/35 it 600 seen 38400 loss 2.0424 lr 1.04e-04 ema 0.999703 elapsed 340.2s


Ep 24/35 it 800 seen 51200 loss 3.3313 lr 1.04e-04 ema 0.999703 elapsed 454.1s


Ep 24/35 it 1000 seen 64000 loss 1.5579 lr 1.04e-04 ema 0.999703 elapsed 567.7s


Ep 24/35 it 1200 seen 76800 loss 1.5444 lr 1.04e-04 ema 0.999703 elapsed 680.3s


Ep 24/35 it 1400 seen 89600 loss 1.5521 lr 1.04e-04 ema 0.999703 elapsed 794.0s


Ep 24/35 it 1600 seen 102400 loss 1.5950 lr 1.04e-04 ema 0.999703 elapsed 907.1s


Ep 24/35 it 1800 seen 115200 loss 1.6964 lr 1.04e-04 ema 0.999703 elapsed 1020.8s


Ep 24/35 it 2000 seen 128000 loss 1.6451 lr 1.04e-04 ema 0.999703 elapsed 1133.4s


Ep 24/35 it 2200 seen 140800 loss 1.2724 lr 1.04e-04 ema 0.999703 elapsed 1247.3s


Ep 24/35 it 2400 seen 153600 loss 2.7573 lr 1.04e-04 ema 0.999703 elapsed 1361.1s


Ep 24/35 it 2600 seen 166400 loss 1.5496 lr 1.04e-04 ema 0.999703 elapsed 1473.8s


Ep 24/35 it 2800 seen 179200 loss 1.8920 lr 1.04e-04 ema 0.999703 elapsed 1587.3s


Ep 24/35 it 3000 seen 192000 loss 1.5026 lr 1.04e-04 ema 0.999703 elapsed 1700.1s


Ep 24/35 it 3200 seen 204800 loss 3.4564 lr 1.04e-04 ema 0.999703 elapsed 1814.1s


Ep 24/35 it 3400 seen 217600 loss 1.5738 lr 1.04e-04 ema 0.999703 elapsed 1927.5s


Epoch 24 val_acc(EMA) 0.8410 ep_time 1956.4s total 46976.4s


Saved EMA best to effv2m_320_ema.pth


Ep 25/35 it 200 seen 12800 loss 1.4416 lr 8.97e-05 ema 0.999712 elapsed 113.9s


Ep 25/35 it 400 seen 25600 loss 3.0289 lr 8.97e-05 ema 0.999712 elapsed 228.1s


Ep 25/35 it 600 seen 38400 loss 1.5816 lr 8.97e-05 ema 0.999712 elapsed 341.7s


Ep 25/35 it 800 seen 51200 loss 3.3126 lr 8.97e-05 ema 0.999712 elapsed 454.1s


Ep 25/35 it 1000 seen 64000 loss 3.2917 lr 8.97e-05 ema 0.999712 elapsed 567.8s


Ep 25/35 it 1200 seen 76800 loss 1.6987 lr 8.97e-05 ema 0.999712 elapsed 681.8s


Ep 25/35 it 1400 seen 89600 loss 1.8814 lr 8.97e-05 ema 0.999712 elapsed 794.2s


Ep 25/35 it 1600 seen 102400 loss 1.6261 lr 8.97e-05 ema 0.999712 elapsed 907.7s


Ep 25/35 it 1800 seen 115200 loss 2.6828 lr 8.97e-05 ema 0.999712 elapsed 1020.4s


Ep 25/35 it 2000 seen 128000 loss 1.7641 lr 8.97e-05 ema 0.999712 elapsed 1133.9s


Ep 25/35 it 2200 seen 140800 loss 1.6940 lr 8.97e-05 ema 0.999712 elapsed 1247.6s


Ep 25/35 it 2400 seen 153600 loss 3.0430 lr 8.97e-05 ema 0.999712 elapsed 1360.0s


Ep 25/35 it 2600 seen 166400 loss 1.6212 lr 8.97e-05 ema 0.999712 elapsed 1473.5s


Ep 25/35 it 2800 seen 179200 loss 1.4278 lr 8.97e-05 ema 0.999712 elapsed 1586.1s


Ep 25/35 it 3000 seen 192000 loss 4.1322 lr 8.97e-05 ema 0.999712 elapsed 1699.6s


Ep 25/35 it 3200 seen 204800 loss 1.6983 lr 8.97e-05 ema 0.999712 elapsed 1812.4s


Ep 25/35 it 3400 seen 217600 loss 1.5919 lr 8.97e-05 ema 0.999712 elapsed 1925.9s


Epoch 25 val_acc(EMA) 0.8415 ep_time 1954.7s total 48932.9s


Saved EMA best to effv2m_320_ema.pth


Saved periodic EMA to effv2m_320_ema_ep25.pth


Ep 26/35 it 200 seen 12800 loss 1.5083 lr 7.58e-05 ema 0.999721 elapsed 114.8s


Ep 26/35 it 400 seen 25600 loss 1.4921 lr 7.58e-05 ema 0.999721 elapsed 227.5s


Ep 26/35 it 600 seen 38400 loss 3.8491 lr 7.58e-05 ema 0.999721 elapsed 341.3s


Ep 26/35 it 800 seen 51200 loss 1.5575 lr 7.58e-05 ema 0.999721 elapsed 454.8s


Ep 26/35 it 1000 seen 64000 loss 2.7491 lr 7.58e-05 ema 0.999721 elapsed 567.3s


Ep 26/35 it 1200 seen 76800 loss 3.3743 lr 7.58e-05 ema 0.999721 elapsed 680.7s


Ep 26/35 it 1400 seen 89600 loss 1.4896 lr 7.58e-05 ema 0.999721 elapsed 793.6s


Ep 26/35 it 1600 seen 102400 loss 3.4119 lr 7.58e-05 ema 0.999721 elapsed 906.9s


Ep 26/35 it 1800 seen 115200 loss 2.9995 lr 7.58e-05 ema 0.999721 elapsed 1020.3s


Ep 26/35 it 2000 seen 128000 loss 1.7399 lr 7.58e-05 ema 0.999721 elapsed 1132.6s


Ep 26/35 it 2200 seen 140800 loss 3.5571 lr 7.58e-05 ema 0.999721 elapsed 1246.0s


Ep 26/35 it 2400 seen 153600 loss 3.0969 lr 7.58e-05 ema 0.999721 elapsed 1358.4s


Ep 26/35 it 2600 seen 166400 loss 1.4492 lr 7.58e-05 ema 0.999721 elapsed 1471.7s


Ep 26/35 it 2800 seen 179200 loss 1.5395 lr 7.58e-05 ema 0.999721 elapsed 1585.1s


Ep 26/35 it 3000 seen 192000 loss 1.5549 lr 7.58e-05 ema 0.999721 elapsed 1697.5s


Ep 26/35 it 3200 seen 204800 loss 3.2632 lr 7.58e-05 ema 0.999721 elapsed 1810.7s


Ep 26/35 it 3400 seen 217600 loss 2.0356 lr 7.58e-05 ema 0.999721 elapsed 1923.0s


Epoch 26 val_acc(EMA) 0.8418 ep_time 1951.8s total 50886.9s


Saved EMA best to effv2m_320_ema.pth


Ep 27/35 it 200 seen 12800 loss 1.5438 lr 6.26e-05 ema 0.999729 elapsed 115.1s


Ep 27/35 it 400 seen 25600 loss 1.6559 lr 6.26e-05 ema 0.999729 elapsed 228.7s


Ep 27/35 it 600 seen 38400 loss 3.2763 lr 6.26e-05 ema 0.999729 elapsed 341.4s


Ep 27/35 it 800 seen 51200 loss 3.1308 lr 6.26e-05 ema 0.999729 elapsed 455.0s


Ep 27/35 it 1000 seen 64000 loss 2.1644 lr 6.26e-05 ema 0.999729 elapsed 567.9s


Ep 27/35 it 1200 seen 76800 loss 1.3265 lr 6.26e-05 ema 0.999729 elapsed 681.6s


Ep 27/35 it 1400 seen 89600 loss 1.9561 lr 6.26e-05 ema 0.999729 elapsed 794.9s


Ep 27/35 it 1600 seen 102400 loss 1.9471 lr 6.26e-05 ema 0.999729 elapsed 907.6s


Ep 27/35 it 1800 seen 115200 loss 1.4816 lr 6.26e-05 ema 0.999729 elapsed 1021.4s


Ep 27/35 it 2000 seen 128000 loss 1.5987 lr 6.26e-05 ema 0.999729 elapsed 1133.9s


Ep 27/35 it 2200 seen 140800 loss 3.4236 lr 6.26e-05 ema 0.999729 elapsed 1247.3s


Ep 27/35 it 2400 seen 153600 loss 1.4714 lr 6.26e-05 ema 0.999729 elapsed 1361.1s


Ep 27/35 it 2600 seen 166400 loss 2.0113 lr 6.26e-05 ema 0.999729 elapsed 1474.0s


Ep 27/35 it 2800 seen 179200 loss 2.2616 lr 6.26e-05 ema 0.999729 elapsed 1587.6s


Ep 27/35 it 3000 seen 192000 loss 1.4871 lr 6.26e-05 ema 0.999729 elapsed 1700.1s


Ep 27/35 it 3200 seen 204800 loss 4.0478 lr 6.26e-05 ema 0.999729 elapsed 1813.5s


Ep 27/35 it 3400 seen 217600 loss 1.5918 lr 6.26e-05 ema 0.999729 elapsed 1927.1s


Epoch 27 val_acc(EMA) 0.8427 ep_time 1956.2s total 52844.8s


Saved EMA best to effv2m_320_ema.pth


Ep 28/35 it 200 seen 12800 loss 1.3721 lr 5.05e-05 ema 0.999738 elapsed 113.6s


Ep 28/35 it 400 seen 25600 loss 1.4470 lr 5.05e-05 ema 0.999738 elapsed 227.4s


Ep 28/35 it 600 seen 38400 loss 1.3838 lr 5.05e-05 ema 0.999738 elapsed 340.5s


Ep 28/35 it 800 seen 51200 loss 1.3906 lr 5.05e-05 ema 0.999738 elapsed 454.1s


Ep 28/35 it 1000 seen 64000 loss 1.4390 lr 5.05e-05 ema 0.999738 elapsed 567.6s


Ep 28/35 it 1200 seen 76800 loss 1.4825 lr 5.05e-05 ema 0.999738 elapsed 680.4s


Ep 28/35 it 1400 seen 89600 loss 1.4021 lr 5.05e-05 ema 0.999738 elapsed 794.1s


Ep 28/35 it 1600 seen 102400 loss 3.4461 lr 5.05e-05 ema 0.999738 elapsed 906.8s


Ep 28/35 it 1800 seen 115200 loss 3.5624 lr 5.05e-05 ema 0.999738 elapsed 1020.1s


Ep 28/35 it 2000 seen 128000 loss 3.1772 lr 5.05e-05 ema 0.999738 elapsed 1133.7s


Ep 28/35 it 2200 seen 140800 loss 3.7425 lr 5.05e-05 ema 0.999738 elapsed 1246.6s


Ep 28/35 it 2400 seen 153600 loss 2.9554 lr 5.05e-05 ema 0.999738 elapsed 1360.0s


Ep 28/35 it 2600 seen 166400 loss 1.4792 lr 5.05e-05 ema 0.999738 elapsed 1472.5s


Ep 28/35 it 2800 seen 179200 loss 1.3473 lr 5.05e-05 ema 0.999738 elapsed 1586.1s


Ep 28/35 it 3000 seen 192000 loss 2.8003 lr 5.05e-05 ema 0.999738 elapsed 1698.9s


Ep 28/35 it 3200 seen 204800 loss 1.5731 lr 5.05e-05 ema 0.999738 elapsed 1812.3s


Ep 28/35 it 3400 seen 217600 loss 1.3609 lr 5.05e-05 ema 0.999738 elapsed 1925.6s


Epoch 28 val_acc(EMA) 0.8443 ep_time 1954.5s total 54801.4s


Saved EMA best to effv2m_320_ema.pth


Ep 29/35 it 200 seen 12800 loss 1.2487 lr 3.94e-05 ema 0.999747 elapsed 113.8s


Ep 29/35 it 400 seen 25600 loss 1.3174 lr 3.94e-05 ema 0.999747 elapsed 228.3s


Ep 29/35 it 600 seen 38400 loss 1.8952 lr 3.94e-05 ema 0.999747 elapsed 341.9s


Ep 29/35 it 800 seen 51200 loss 1.5017 lr 3.94e-05 ema 0.999747 elapsed 454.5s


Ep 29/35 it 1000 seen 64000 loss 2.4761 lr 3.94e-05 ema 0.999747 elapsed 568.1s


Ep 29/35 it 1200 seen 76800 loss 1.4466 lr 3.94e-05 ema 0.999747 elapsed 681.0s


Ep 29/35 it 1400 seen 89600 loss 1.3637 lr 3.94e-05 ema 0.999747 elapsed 794.4s


Ep 29/35 it 1600 seen 102400 loss 1.6591 lr 3.94e-05 ema 0.999747 elapsed 908.0s


Ep 29/35 it 1800 seen 115200 loss 2.5932 lr 3.94e-05 ema 0.999747 elapsed 1020.9s


Ep 29/35 it 2000 seen 128000 loss 3.1851 lr 3.94e-05 ema 0.999747 elapsed 1134.7s


Ep 29/35 it 2200 seen 140800 loss 1.2808 lr 3.94e-05 ema 0.999747 elapsed 1247.2s


Ep 29/35 it 2400 seen 153600 loss 2.6585 lr 3.94e-05 ema 0.999747 elapsed 1360.6s


Ep 29/35 it 2600 seen 166400 loss 1.5099 lr 3.94e-05 ema 0.999747 elapsed 1473.4s


Ep 29/35 it 2800 seen 179200 loss 1.7738 lr 3.94e-05 ema 0.999747 elapsed 1587.2s


Ep 29/35 it 3000 seen 192000 loss 1.5695 lr 3.94e-05 ema 0.999747 elapsed 1700.8s


Ep 29/35 it 3200 seen 204800 loss 1.2295 lr 3.94e-05 ema 0.999747 elapsed 1813.4s


Ep 29/35 it 3400 seen 217600 loss 1.4840 lr 3.94e-05 ema 0.999747 elapsed 1926.9s


Epoch 29 val_acc(EMA) 0.8461 ep_time 1955.9s total 56759.3s


Saved EMA best to effv2m_320_ema.pth


Ep 30/35 it 200 seen 12800 loss 1.9267 lr 2.96e-05 ema 0.999756 elapsed 115.0s


Ep 30/35 it 400 seen 25600 loss 1.4031 lr 2.96e-05 ema 0.999756 elapsed 227.8s


Ep 30/35 it 600 seen 38400 loss 1.4236 lr 2.96e-05 ema 0.999756 elapsed 341.5s


Ep 30/35 it 800 seen 51200 loss 1.2349 lr 2.96e-05 ema 0.999756 elapsed 454.6s


Ep 30/35 it 1000 seen 64000 loss 1.2933 lr 2.96e-05 ema 0.999756 elapsed 568.2s


Ep 30/35 it 1200 seen 76800 loss 1.3176 lr 2.96e-05 ema 0.999756 elapsed 681.0s


Ep 30/35 it 1400 seen 89600 loss 1.3806 lr 2.96e-05 ema 0.999756 elapsed 795.0s


Ep 30/35 it 1600 seen 102400 loss 2.6018 lr 2.96e-05 ema 0.999756 elapsed 909.1s


Ep 30/35 it 1800 seen 115200 loss 1.2525 lr 2.96e-05 ema 0.999756 elapsed 1021.8s


Ep 30/35 it 2000 seen 128000 loss 1.4686 lr 2.96e-05 ema 0.999756 elapsed 1135.4s


Ep 30/35 it 2200 seen 140800 loss 2.7182 lr 2.96e-05 ema 0.999756 elapsed 1248.4s


Ep 30/35 it 2400 seen 153600 loss 1.3051 lr 2.96e-05 ema 0.999756 elapsed 1362.2s


Ep 30/35 it 2600 seen 166400 loss 1.4787 lr 2.96e-05 ema 0.999756 elapsed 1476.0s


Ep 30/35 it 2800 seen 179200 loss 1.4938 lr 2.96e-05 ema 0.999756 elapsed 1588.8s


Ep 30/35 it 3000 seen 192000 loss 1.3710 lr 2.96e-05 ema 0.999756 elapsed 1702.7s


Ep 30/35 it 3200 seen 204800 loss 1.3585 lr 2.96e-05 ema 0.999756 elapsed 1815.8s


Ep 30/35 it 3400 seen 217600 loss 1.5345 lr 2.96e-05 ema 0.999756 elapsed 1929.2s


Epoch 30 val_acc(EMA) 0.8462 ep_time 1958.3s total 58719.4s


Saved EMA best to effv2m_320_ema.pth


Saved periodic EMA to effv2m_320_ema_ep30.pth


Ep 31/35 it 200 seen 12800 loss 1.5063 lr 2.10e-05 ema 0.999765 elapsed 114.8s


Ep 31/35 it 400 seen 25600 loss 4.0072 lr 2.10e-05 ema 0.999765 elapsed 227.9s


Ep 31/35 it 600 seen 38400 loss 3.1874 lr 2.10e-05 ema 0.999765 elapsed 341.4s


Ep 31/35 it 800 seen 51200 loss 1.4251 lr 2.10e-05 ema 0.999765 elapsed 454.0s


Ep 31/35 it 1000 seen 64000 loss 1.4459 lr 2.10e-05 ema 0.999765 elapsed 567.9s


Ep 31/35 it 1200 seen 76800 loss 1.2902 lr 2.10e-05 ema 0.999765 elapsed 680.7s


Ep 31/35 it 1400 seen 89600 loss 1.3470 lr 2.10e-05 ema 0.999765 elapsed 794.1s


Ep 31/35 it 1600 seen 102400 loss 1.4440 lr 2.10e-05 ema 0.999765 elapsed 907.7s


Ep 31/35 it 1800 seen 115200 loss 1.3510 lr 2.10e-05 ema 0.999765 elapsed 1020.7s


Ep 31/35 it 2000 seen 128000 loss 1.2033 lr 2.10e-05 ema 0.999765 elapsed 1134.4s


Ep 31/35 it 2200 seen 140800 loss 2.2355 lr 2.10e-05 ema 0.999765 elapsed 1247.1s


Ep 31/35 it 2400 seen 153600 loss 1.4816 lr 2.10e-05 ema 0.999765 elapsed 1360.6s


Ep 31/35 it 2600 seen 166400 loss 1.5350 lr 2.10e-05 ema 0.999765 elapsed 1474.3s


Ep 31/35 it 2800 seen 179200 loss 3.6237 lr 2.10e-05 ema 0.999765 elapsed 1587.2s


Ep 31/35 it 3000 seen 192000 loss 2.9537 lr 2.10e-05 ema 0.999765 elapsed 1700.5s


Ep 31/35 it 3200 seen 204800 loss 1.3570 lr 2.10e-05 ema 0.999765 elapsed 1813.1s


Ep 31/35 it 3400 seen 217600 loss 2.9964 lr 2.10e-05 ema 0.999765 elapsed 1926.8s


Epoch 31 val_acc(EMA) 0.8454 ep_time 1955.6s total 60679.4s


Ep 32/35 it 200 seen 12800 loss 1.3283 lr 1.39e-05 ema 0.999774 elapsed 114.6s


Ep 32/35 it 400 seen 25600 loss 2.8047 lr 1.39e-05 ema 0.999774 elapsed 227.2s


Ep 32/35 it 600 seen 38400 loss 1.3937 lr 1.39e-05 ema 0.999774 elapsed 340.8s


Ep 32/35 it 800 seen 51200 loss 1.2531 lr 1.39e-05 ema 0.999774 elapsed 453.5s


Ep 32/35 it 1000 seen 64000 loss 1.6448 lr 1.39e-05 ema 0.999774 elapsed 567.1s


Ep 32/35 it 1200 seen 76800 loss 3.4045 lr 1.39e-05 ema 0.999774 elapsed 680.8s


Ep 32/35 it 1400 seen 89600 loss 3.3843 lr 1.39e-05 ema 0.999774 elapsed 793.6s


Ep 32/35 it 1600 seen 102400 loss 1.5223 lr 1.39e-05 ema 0.999774 elapsed 907.2s


Ep 32/35 it 1800 seen 115200 loss 1.4436 lr 1.39e-05 ema 0.999774 elapsed 1019.9s


Ep 32/35 it 2000 seen 128000 loss 1.4395 lr 1.39e-05 ema 0.999774 elapsed 1133.6s


Ep 32/35 it 2200 seen 140800 loss 1.6840 lr 1.39e-05 ema 0.999774 elapsed 1247.2s


Ep 32/35 it 2400 seen 153600 loss 1.2856 lr 1.39e-05 ema 0.999774 elapsed 1360.0s


Ep 32/35 it 2600 seen 166400 loss 1.4181 lr 1.39e-05 ema 0.999774 elapsed 1473.4s


Ep 32/35 it 2800 seen 179200 loss 1.3930 lr 1.39e-05 ema 0.999774 elapsed 1586.1s


Ep 32/35 it 3000 seen 192000 loss 2.9870 lr 1.39e-05 ema 0.999774 elapsed 1699.7s


Ep 32/35 it 3200 seen 204800 loss 2.9712 lr 1.39e-05 ema 0.999774 elapsed 1813.3s


Ep 32/35 it 3400 seen 217600 loss 2.6440 lr 1.39e-05 ema 0.999774 elapsed 1925.8s


Epoch 32 val_acc(EMA) 0.8472 ep_time 1954.7s total 62636.0s


Saved EMA best to effv2m_320_ema.pth


Ep 33/35 it 200 seen 12800 loss 1.4602 lr 8.32e-06 ema 0.999782 elapsed 115.1s


Ep 33/35 it 400 seen 25600 loss 3.7967 lr 8.32e-06 ema 0.999782 elapsed 229.2s


Ep 33/35 it 600 seen 38400 loss 1.3790 lr 8.32e-06 ema 0.999782 elapsed 341.7s


Ep 33/35 it 800 seen 51200 loss 3.0062 lr 8.32e-06 ema 0.999782 elapsed 455.3s


Ep 33/35 it 1000 seen 64000 loss 2.7951 lr 8.32e-06 ema 0.999782 elapsed 568.0s


Ep 33/35 it 1200 seen 76800 loss 1.2901 lr 8.32e-06 ema 0.999782 elapsed 681.6s


Ep 33/35 it 1400 seen 89600 loss 1.3732 lr 8.32e-06 ema 0.999782 elapsed 794.8s


Ep 33/35 it 1600 seen 102400 loss 1.4295 lr 8.32e-06 ema 0.999782 elapsed 907.5s


Ep 33/35 it 1800 seen 115200 loss 3.0617 lr 8.32e-06 ema 0.999782 elapsed 1021.1s


Ep 33/35 it 2000 seen 128000 loss 3.1248 lr 8.32e-06 ema 0.999782 elapsed 1133.6s


Ep 33/35 it 2200 seen 140800 loss 2.0241 lr 8.32e-06 ema 0.999782 elapsed 1246.9s


Ep 33/35 it 2400 seen 153600 loss 1.9152 lr 8.32e-06 ema 0.999782 elapsed 1359.6s


Ep 33/35 it 2600 seen 166400 loss 1.3144 lr 8.32e-06 ema 0.999782 elapsed 1472.9s


Ep 33/35 it 2800 seen 179200 loss 1.3416 lr 8.32e-06 ema 0.999782 elapsed 1586.2s


Ep 33/35 it 3000 seen 192000 loss 3.1082 lr 8.32e-06 ema 0.999782 elapsed 1698.8s


Ep 33/35 it 3200 seen 204800 loss 1.4629 lr 8.32e-06 ema 0.999782 elapsed 1812.6s


Ep 33/35 it 3400 seen 217600 loss 1.3510 lr 8.32e-06 ema 0.999782 elapsed 1926.2s


Epoch 33 val_acc(EMA) 0.8475 ep_time 1955.0s total 64592.8s


Saved EMA best to effv2m_320_ema.pth


Ep 34/35 it 200 seen 12800 loss 3.1430 lr 4.27e-06 ema 0.999791 elapsed 113.4s


Ep 34/35 it 400 seen 25600 loss 1.5441 lr 4.27e-06 ema 0.999791 elapsed 227.3s


Ep 34/35 it 600 seen 38400 loss 1.2814 lr 4.27e-06 ema 0.999791 elapsed 341.2s


Ep 34/35 it 800 seen 51200 loss 2.4170 lr 4.27e-06 ema 0.999791 elapsed 453.6s


Ep 34/35 it 1000 seen 64000 loss 1.3296 lr 4.27e-06 ema 0.999791 elapsed 566.8s


Ep 34/35 it 1200 seen 76800 loss 2.1521 lr 4.27e-06 ema 0.999791 elapsed 679.5s


Ep 34/35 it 1400 seen 89600 loss 1.4402 lr 4.27e-06 ema 0.999791 elapsed 793.2s


Ep 34/35 it 1600 seen 102400 loss 1.5941 lr 4.27e-06 ema 0.999791 elapsed 905.4s


Ep 34/35 it 1800 seen 115200 loss 1.2953 lr 4.27e-06 ema 0.999791 elapsed 1018.7s


Ep 34/35 it 2000 seen 128000 loss 1.4183 lr 4.27e-06 ema 0.999791 elapsed 1132.3s


Ep 34/35 it 2200 seen 140800 loss 1.3433 lr 4.27e-06 ema 0.999791 elapsed 1245.1s


Ep 34/35 it 2400 seen 153600 loss 3.2319 lr 4.27e-06 ema 0.999791 elapsed 1358.2s


Ep 34/35 it 2600 seen 166400 loss 1.8548 lr 4.27e-06 ema 0.999791 elapsed 1471.5s


Ep 34/35 it 2800 seen 179200 loss 1.4026 lr 4.27e-06 ema 0.999791 elapsed 1584.1s


Ep 34/35 it 3000 seen 192000 loss 3.0708 lr 4.27e-06 ema 0.999791 elapsed 1697.6s


Ep 34/35 it 3200 seen 204800 loss 2.1978 lr 4.27e-06 ema 0.999791 elapsed 1809.8s


Ep 34/35 it 3400 seen 217600 loss 2.9668 lr 4.27e-06 ema 0.999791 elapsed 1923.2s


Epoch 34 val_acc(EMA) 0.8472 ep_time 1952.2s total 66547.0s


Ep 35/35 it 200 seen 12800 loss 1.3576 lr 1.82e-06 ema 0.999800 elapsed 115.2s


Ep 35/35 it 400 seen 25600 loss 1.2822 lr 1.82e-06 ema 0.999800 elapsed 228.0s


Ep 35/35 it 600 seen 38400 loss 1.9796 lr 1.82e-06 ema 0.999800 elapsed 341.3s


Ep 35/35 it 800 seen 51200 loss 1.3511 lr 1.82e-06 ema 0.999800 elapsed 454.0s


Ep 35/35 it 1000 seen 64000 loss 2.6053 lr 1.82e-06 ema 0.999800 elapsed 567.6s


Ep 35/35 it 1200 seen 76800 loss 1.4531 lr 1.82e-06 ema 0.999800 elapsed 680.1s


Ep 35/35 it 1400 seen 89600 loss 3.8607 lr 1.82e-06 ema 0.999800 elapsed 793.4s


Ep 35/35 it 1600 seen 102400 loss 1.4004 lr 1.82e-06 ema 0.999800 elapsed 907.0s


Ep 35/35 it 1800 seen 115200 loss 2.6725 lr 1.82e-06 ema 0.999800 elapsed 1019.8s


Ep 35/35 it 2000 seen 128000 loss 2.6255 lr 1.82e-06 ema 0.999800 elapsed 1133.1s


Ep 35/35 it 2200 seen 140800 loss 2.9372 lr 1.82e-06 ema 0.999800 elapsed 1245.7s


Ep 35/35 it 2400 seen 153600 loss 3.5339 lr 1.82e-06 ema 0.999800 elapsed 1359.4s


Ep 35/35 it 2600 seen 166400 loss 1.2398 lr 1.82e-06 ema 0.999800 elapsed 1472.2s


Ep 35/35 it 2800 seen 179200 loss 2.7649 lr 1.82e-06 ema 0.999800 elapsed 1585.7s


Ep 35/35 it 3000 seen 192000 loss 1.3494 lr 1.82e-06 ema 0.999800 elapsed 1699.1s


Ep 35/35 it 3200 seen 204800 loss 2.4771 lr 1.82e-06 ema 0.999800 elapsed 1811.8s


Ep 35/35 it 3400 seen 217600 loss 2.4842 lr 1.82e-06 ema 0.999800 elapsed 1925.6s


Epoch 35 val_acc(EMA) 0.8484 ep_time 1954.5s total 68503.4s


Saved EMA best to effv2m_320_ema.pth


Saved periodic EMA to effv2m_320_ema_ep35.pth


Training done. Best EMA val_acc 0.8484


  ckpt = torch.load(best_path, map_location=device)


TTA scale 288 it 100/504


TTA scale 288 it 200/504


TTA scale 288 it 300/504


TTA scale 288 it 400/504


TTA scale 288 it 500/504


TTA scale 320 it 100/504


TTA scale 320 it 200/504


TTA scale 320 it 300/504


TTA scale 320 it 400/504


TTA scale 320 it 500/504


TTA scale 352 it 100/504


TTA scale 352 it 200/504


TTA scale 352 it 300/504


TTA scale 352 it 400/504


TTA scale 352 it 500/504


Wrote submission.csv with 32214 rows
Done.


In [6]:
# Diagnose and fix submission format to match Kaggle sample
#
# Reads the Kaggle sample submission and the generated submission.csv, renames
# known alternate column names to the sample's names, restricts/reorders the
# columns to the sample's layout, coerces columns to int64 where possible, and
# overwrites submission.csv in place.
import pandas as pd
import os

sample_path = 'kaggle_sample_submission.csv'
sub_path = 'submission.csv'
# Explicit raises instead of asserts so the checks still run under `python -O`.
if not os.path.exists(sample_path):
    raise FileNotFoundError('kaggle_sample_submission.csv not found')
if not os.path.exists(sub_path):
    raise FileNotFoundError('submission.csv not found')

sample = pd.read_csv(sample_path)
sub = pd.read_csv(sub_path)
print('Sample columns:', list(sample.columns))
print('Submission columns:', list(sub.columns))
print('Sample shape:', sample.shape, 'Submission shape:', sub.shape)

# Ensure columns and order match sample exactly
required_cols = list(sample.columns)  # expected e.g. ['id', 'predicted']
# Map common alt names to required ones. A rename is applied only when the
# sample expects the target name, the submission lacks it, and the alternate
# name is actually present.
alt_names = {'image_id': 'id', 'category_id': 'predicted'}
rename_map = {
    alt: target
    for alt, target in alt_names.items()
    if target in required_cols and target not in sub.columns and alt in sub.columns
}
if rename_map:
    sub = sub.rename(columns=rename_map)

# Reorder and filter to required columns only
missing = [c for c in required_cols if c not in sub.columns]
if missing:
    raise ValueError(f'Missing columns in submission: {missing}')
# .copy() gives an independent frame, so the per-column dtype assignments
# below cannot trigger SettingWithCopyWarning on the column-sliced selection.
sub = sub[required_cols].copy()

# Enforce integer dtypes for id/predicted
for c in required_cols:
    col = sub[c]
    if pd.api.types.is_integer_dtype(col):
        continue
    if pd.api.types.is_float_dtype(col):
        # NaN cannot be represented as int64; fail with the column name rather
        # than a generic cast error.
        if col.isna().any():
            raise ValueError(f'Column {c!r} contains NaN; cannot convert to int64')
        sub[c] = col.round().astype('int64')
    else:
        # Best-effort conversion (e.g. numeric strings). Keep the original
        # column on failure, but report it instead of swallowing it silently.
        try:
            sub[c] = col.astype('int64')
        except (ValueError, TypeError, OverflowError) as e:
            print(f'Warning: could not convert column {c!r} to int64 ({e}); '
                  f'leaving dtype {col.dtype}')

# Final sanity
if sub.shape[0] != sample.shape[0]:
    raise ValueError(f'Row count mismatch: {sub.shape[0]} vs sample {sample.shape[0]}')
if list(sub.columns) != required_cols:
    raise ValueError(f'Column order mismatch: {list(sub.columns)} vs {required_cols}')
# Warn (do not fail) if the set of ids differs from the sample's: equal row
# counts alone do not guarantee that every expected id is present.
if 'id' in required_cols and set(sub['id']) != set(sample['id']):
    print('Warning: submission ids do not match the ids in the sample submission')

sub.to_csv(sub_path, index=False)
print('Fixed submission saved. Columns:', list(sub.columns), 'shape:', sub.shape)

Sample columns: ['id', 'predicted']
Submission columns: ['image_id', 'category_id']
Sample shape: (32214, 2) Submission shape: (32214, 2)
Fixed submission saved. Columns: ['id', 'predicted'] shape: (32214, 2)
