In [1]:
import os
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
import gc, numpy as np, pandas as pd, torch, torch.nn as nn, timm
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.metrics import log_loss

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')
if torch.cuda.is_available():
    print(f'GPU: {torch.cuda.get_device_name(0)}')
    free, total = torch.cuda.mem_get_info()
    print(f'VRAM Free/Total: {free/1e9:.2f}/{total/1e9:.2f} GB')

torch.backends.cudnn.benchmark = True
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

SEED=793; N_FOLDS=5; EPOCHS=20; PATIENCE=5; BATCH=4; LR=1e-4
np.random.seed(SEED); torch.manual_seed(SEED)

label_cols = ['C1','C2','C3','C4','C5','C6','C7']
df = pd.read_csv('train.csv')

class Axial3ch(Dataset):
    def __init__(self, df, is_train=True, cache_dir='temp_3d_vols'):
        self.df=df.reset_index(drop=True); self.is_train=is_train; self.cache_dir=cache_dir
    def __len__(self): return len(self.df)
    def __getitem__(self,i):
        r=self.df.iloc[i]; uid=r.StudyInstanceUID
        vol=np.load(os.path.join(self.cache_dir,f'{uid}.npy')).astype(np.float32)  # (2,Z,H,W)
        bone=(vol[1]>0.2).astype(np.float32); zsum=bone.sum(axis=(1,2))+1e-6
        zc=int(np.average(np.arange(vol.shape[1]), weights=zsum))
        z0=np.clip(zc,0,vol.shape[1]-1); zb=np.clip(zc-1,0,vol.shape[1]-1); za=np.clip(zc+1,0,vol.shape[1]-1)
        soft_c=vol[0,z0]; bone_c=vol[1,z0]; bone_ctx=(vol[1,zb]+vol[1,za])/2.0
        img=np.stack([soft_c,bone_c,bone_ctx],0).astype(np.float32)
        if self.is_train:
            if np.random.rand()<0.5: img=img[:, :, ::-1].copy()
            if np.random.rand()<0.5: img=img[:, ::-1, :].copy()
            if np.random.rand()<0.8:
                a=1+np.random.uniform(-0.1,0.1); b=np.random.uniform(-0.05,0.05); img=np.clip(img*a+b,0,1)
        x=torch.from_numpy(img).float(); y7=torch.from_numpy(r[label_cols].values.astype(np.float32))
        return x,y7

def build_model():
    m=timm.create_model('efficientnet_b3', pretrained=True, in_chans=3, num_classes=7)
    m=m.to(device).to(memory_format=torch.channels_last)
    if hasattr(m,'set_grad_checkpointing'): m.set_grad_checkpointing(True)
    return m

df = df[df['StudyInstanceUID'].apply(lambda u: os.path.exists(os.path.join('temp_3d_vols', f'{u}.npy')))].reset_index(drop=True)
y = df[label_cols].values.astype(np.float32)
y_overall = y.max(1).astype(int); groups=df['StudyInstanceUID'].values

# Compute pos_weight
col_sums = y.sum(axis=0)
pos_weight = (len(df) - col_sums) / np.clip(col_sums, 1, None)
pos_weight = np.clip(pos_weight, 1, 4)
pos_weight = torch.tensor(pos_weight, dtype=torch.float32).to(device)

skf=StratifiedGroupKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)
oof=np.zeros((len(df),7),np.float32); scores=[]
for fold,(tr,va) in enumerate(skf.split(df,y_overall,groups),1):
    print(f'Starting Fold {fold}')
    tr_ds=Axial3ch(df.iloc[tr],True); va_ds=Axial3ch(df.iloc[va],False)
    tr_dl=DataLoader(tr_ds,batch_size=BATCH,shuffle=True,num_workers=0,pin_memory=True)
    va_dl=DataLoader(va_ds,batch_size=BATCH,shuffle=False,num_workers=0,pin_memory=True)
    model=build_model()
    opt=torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=1e-4)
    sch=torch.optim.lr_scheduler.CosineAnnealingLR(opt,T_max=EPOCHS)
    crit=nn.BCEWithLogitsLoss(pos_weight=pos_weight)
    best=1e9; bad=0
    for ep in range(EPOCHS):
        model.train(); tr_loss=0
        for x,yb in tr_dl:
            x = x.to(device, non_blocking=True).to(memory_format=torch.channels_last)
            yb = yb.to(device, non_blocking=True)
            with torch.amp.autocast('cuda', dtype=torch.bfloat16):
                logits=model(x); loss=crit(logits,yb)
            opt.zero_grad(set_to_none=True); loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(),1.0)
            opt.step(); tr_loss+=loss.item()
        tr_loss/=max(1,len(tr_dl))
        model.eval(); va_loss=0; preds=[]
        with torch.no_grad():
            for x,yb in va_dl:
                x = x.to(device, non_blocking=True).to(memory_format=torch.channels_last)
                yb = yb.to(device, non_blocking=True)
                with torch.amp.autocast('cuda', dtype=torch.bfloat16):
                    logits=model(x); loss=crit(logits,yb)
                va_loss+=loss.item(); preds.append(torch.sigmoid(logits).cpu().numpy())
        va_loss/=max(1,len(va_dl)); sch.step()
        print(f'Fold {fold} Ep{ep+1}: tr {tr_loss:.4f} va {va_loss:.4f}')
        if va_loss<best: best=va_loss; bad=0; best_state={k:v.detach().cpu() for k,v in model.state_dict().items()}
        else:
            bad+=1
            if bad>=PATIENCE: break
    model.load_state_dict(best_state, strict=True)
    model.eval(); pred=[]
    with torch.no_grad():
        for x,_ in va_dl:
            x = x.to(device, non_blocking=True).to(memory_format=torch.channels_last)
            with torch.amp.autocast('cuda', dtype=torch.bfloat16):
                pred.append(model(x).sigmoid().cpu().numpy())
    oof[va]=np.concatenate(pred)
    yv=y[va]; anyv=yv.max(1)
    vl=[log_loss(yv[:,i],oof[va][:,i],labels=[0,1]) for i in range(7)]
    ol=log_loss(anyv,oof[va].max(1),labels=[0,1])
    wll=np.average(vl+[ol],weights=[1]*7+[2]); scores.append(wll)
    torch.save(best_state, f'fold_{fold}_2p5d_efficientnetb3.pth')
    del model,tr_dl,va_dl; gc.collect(); torch.cuda.empty_cache()

np.save('oof_logits_2p5d_axial.npy', oof)
print('CV WLL:', np.mean(scores))

Using device: cuda
GPU: NVIDIA A10-24Q


RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
