In [1]:
# DeBERTa-v3-large multitask (30 targets) per expert plan
# - Weighted SmoothL1Loss: answer_*=1.5x, answer_helpful=2.0x
# - lr=1.5e-5, batch_size=6, grad_accum=6, epochs=4 (add 5th if improving)
# - WeightedLayerPooling + masked mean pooling
# - EMA (decay=0.99) with warmup delay; dual eval (plain vs EMA)
# - Eval/Test-time MC dropout T=5
# - Quota-based packing to protect Answer tokens
# - GroupKFold via precomputed folds.npy; robust logging

import os, time, math, gc, random, sys
import numpy as np, pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, get_cosine_schedule_with_warmup
from scipy.stats import spearmanr

# Mitigate CUDA fragmentation
os.environ.setdefault('PYTORCH_CUDA_ALLOC_CONF', 'expandable_segments:True')
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
assert torch.cuda.is_available(), 'CUDA is required for this run'

# Load data and schema
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
sample_sub = pd.read_csv('sample_submission.csv')
id_col = sample_sub.columns[0]
target_cols = [c for c in sample_sub.columns if c != id_col]
assert target_cols == list(sample_sub.columns[1:])
assert all(c in train.columns for c in target_cols)
folds = np.load('folds.npy')

# Text fields
TITLE, BODY, ANSWER = 'question_title','question_body','answer'
assert all(c in train.columns for c in [TITLE, BODY, ANSWER])

# Loss weights: boost all answer_* targets by 1.5x; answer_helpful 2.0x
loss_weights = np.ones(len(target_cols), dtype=np.float32)
for i, col in enumerate(target_cols):
    if col.startswith('answer_'):
        loss_weights[i] = 1.5
    if col == 'answer_helpful':
        loss_weights[i] = 2.0
print('Loss weights summary:', float(loss_weights.min()), float(loss_weights.max()), 'answer_* boosted, helpful=2.0x')

# Model/Tokenizer
model_name = 'microsoft/deberta-v3-large'
tokenizer = AutoTokenizer.from_pretrained(model_name)
MAX_LEN = 512
CLS_ID = tokenizer.cls_token_id
SEP_ID = tokenizer.sep_token_id
PAD_ID = tokenizer.pad_token_id
assert CLS_ID is not None and SEP_ID is not None and PAD_ID is not None, 'Tokenizer missing special tokens'

def _encode_no_specials(text: str):
    return tokenizer.encode(text, add_special_tokens=False)

def _trim_to(ids, lim):
    if len(ids) <= lim: return ids
    return ids[:max(0, lim)]

def pack_inputs(title, body, answer):
    # Quota-based manual packing: [CLS] title [SEP] body [SEP] answer [SEP]
    # Base quotas under 512 (reserve 4 specials): title 48, body 256, answer 196 (sum=500)
    q_title, q_body, q_answer = 48, 256, 196
    content_budget = MAX_LEN - 4
    # Tokenize without specials
    t_ids = _encode_no_specials(f"Title: {title}")
    b_ids = _encode_no_specials(f"Body: {body}")
    a_ids = _encode_no_specials(f"Answer: {answer}")
    # Initial trims
    t_used = min(q_title, len(t_ids))
    b_used = min(q_body, len(b_ids))
    a_used = min(q_answer, len(a_ids))
    used = t_used + b_used + a_used
    # Enforce Answer minimum cap (>=200) by borrowing from Body down to 100, then Title down to 50
    if a_used < 200:
        need = 200 - a_used
        take = min(need, max(0, b_used - 100))
        b_used -= take; a_used += take; need -= take
        if need > 0:
            take2 = min(need, max(0, t_used - 50))
            t_used -= take2; a_used += take2; need -= take2
    used = t_used + b_used + a_used
    # Redistribute leftover budget in Answer -> Body -> Title order
    if used < content_budget:
        leftover = content_budget - used
        t_room = max(0, len(t_ids) - t_used)
        b_room = max(0, len(b_ids) - b_used)
        a_room = max(0, len(a_ids) - a_used)
        while leftover > 0 and (t_room + b_room + a_room) > 0:
            if a_room > 0 and leftover > 0:
                add = min(8, min(leftover, a_room))
                a_used += add; leftover -= add; a_room -= add
            if b_room > 0 and leftover > 0:
                add = min(8, min(leftover, b_room))
                b_used += add; leftover -= add; b_room -= add
            if t_room > 0 and leftover > 0:
                add = min(4, min(leftover, t_room))
                t_used += add; leftover -= add; t_room -= add
    # Final trims
    t_ids = _trim_to(t_ids, t_used)
    b_ids = _trim_to(b_ids, b_used)
    a_ids = _trim_to(a_ids, a_used)
    # Assemble
    input_ids = [CLS_ID] + t_ids + [SEP_ID] + b_ids + [SEP_ID] + a_ids + [SEP_ID]
    if len(input_ids) > MAX_LEN:
        input_ids = input_ids[:MAX_LEN]
        input_ids[-1] = SEP_ID
    attn_mask = [1] * len(input_ids)
    # Pad
    pad_len = MAX_LEN - len(input_ids)
    if pad_len > 0:
        input_ids = input_ids + [PAD_ID] * pad_len
        attn_mask = attn_mask + [0] * pad_len
    return {
        'input_ids': torch.tensor(input_ids, dtype=torch.long).unsqueeze(0),
        'attention_mask': torch.tensor(attn_mask, dtype=torch.long).unsqueeze(0),
    }

class QADataset(Dataset):
    def __init__(self, df, targets=None):
        self.t = df[TITLE].fillna('').astype(str).values
        self.b = df[BODY].fillna('').astype(str).values
        self.a = df[ANSWER].fillna('').astype(str).values
        self.targets = None if targets is None else np.asarray(targets, dtype=np.float32)
    def __len__(self):
        return len(self.t)
    def __getitem__(self, idx):
        enc = pack_inputs(self.t[idx], self.b[idx], self.a[idx])
        item = {k: v.squeeze(0) for k, v in enc.items()}
        if self.targets is not None:
            item['labels'] = torch.tensor(self.targets[idx], dtype=torch.float32)
        return item

def spearman_cols(y_pred: np.ndarray, y_true: np.ndarray):
    rhos = []
    for i in range(y_pred.shape[1]):
        r = spearmanr(y_pred[:, i], y_true[:, i]).correlation
        rhos.append(0.0 if (r is None or np.isnan(r)) else float(r))
    return float(np.mean(rhos)), rhos

class EMA:
    def __init__(self, model, decay=0.99):
        self.decay = decay
        self.shadow = {n: p.detach().clone() for n,p in model.named_parameters() if p.requires_grad}
        self.backup = {}
    @torch.no_grad()
    def update(self, model):
        for n,p in model.named_parameters():
            if p.requires_grad:
                self.shadow[n].mul_((self.decay)).add_(p.detach(), alpha=1.0-self.decay)
    def apply_to(self, model):
        self.backup = {}
        for n,p in model.named_parameters():
            if p.requires_grad:
                self.backup[n] = p.detach().clone()
                p.data.copy_(self.shadow[n].data)
    def restore(self, model):
        for n,p in model.named_parameters():
            if p.requires_grad and n in self.backup:
                p.data.copy_(self.backup[n])
        self.backup = {}

def masked_mean_pooling(last_hidden_state, attention_mask):
    mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
    return (last_hidden_state * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-6)

class WeightedLayerPooling(nn.Module):
    def __init__(self, num_layers: int, layer_start: int = -4):
        super().__init__()
        self.layer_start = layer_start
        n = -layer_start
        self.weights = nn.Parameter(torch.ones(n) / n)
    def forward(self, all_hidden_states):
        selected = all_hidden_states[self.layer_start:]
        stacked = torch.stack(selected, dim=0)  # [n, bs, seq, hid]
        w = torch.softmax(self.weights, dim=0).view(-1,1,1,1)
        return (w * stacked).sum(dim=0)

class DebertaMT(nn.Module):
    def __init__(self, name, out_dim=30, dropout_p=0.2, msd_k=1, loss_weights=None):
        super().__init__()
        self.backbone = AutoModel.from_pretrained(name)
        # Optional AMP stability tweak
        if hasattr(self.backbone, 'config'):
            setattr(self.backbone.config, 'layer_norm_eps', 1e-5)
        if hasattr(self.backbone, 'gradient_checkpointing_enable'):
            self.backbone.gradient_checkpointing_enable()
        hidden = self.backbone.config.hidden_size
        self.layer_pool = WeightedLayerPooling(getattr(self.backbone.config, 'num_hidden_layers', 24), layer_start=-4)
        self.msd_k = msd_k
        self.dropouts = nn.ModuleList([nn.Dropout(dropout_p) for _ in range(msd_k)])
        self.head = nn.Linear(hidden, out_dim)
        self.register_buffer('loss_w', torch.tensor(loss_weights if loss_weights is not None else np.ones(out_dim, dtype=np.float32)))
        self.l1 = nn.SmoothL1Loss(reduction='none')
    def forward(self, input_ids=None, attention_mask=None, labels=None):
        out = self.backbone(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True)
        pooled_seq = self.layer_pool(out.hidden_states)
        feat = masked_mean_pooling(pooled_seq, attention_mask)
        logits_accum = 0
        for dp in self.dropouts:
            logits_accum = logits_accum + self.head(dp(feat))
        logits = logits_accum / self.msd_k
        loss = None
        if labels is not None:
            per_elem = self.l1(logits, labels)  # [bs, C]
            loss = (per_elem * self.loss_w).mean()
        return logits, loss

def predict_msd(model, loader, T=5, use_ema=False, ema_obj=None):
    # Temporarily disable gradient checkpointing for inference to avoid warnings/overhead
    gc_supported = hasattr(model.backbone, 'gradient_checkpointing_disable') and hasattr(model.backbone, 'gradient_checkpointing_enable')
    if gc_supported:
        model.backbone.gradient_checkpointing_disable()
    if use_ema and ema_obj is not None:
        ema_obj.apply_to(model)
    # Enable dropout but keep no_grad
    model.train()
    preds = []
    with torch.no_grad():
        for batch in loader:
            inputs = {k: v.to(device, non_blocking=True) for k,v in batch.items() if k not in ('labels','token_type_ids')}
            logits_sum = 0
            for _ in range(T):
                logits_sum = logits_sum + model(**inputs, labels=None)[0]
            preds.append((logits_sum / T).float().cpu().numpy())
    if use_ema and ema_obj is not None:
        ema_obj.restore(model)
    if gc_supported:
        model.backbone.gradient_checkpointing_enable()
    return np.concatenate(preds, axis=0)

def run_fold(fold, train_idx, val_idx):
    print(f'Fold {fold} start: tr={len(train_idx)} va={len(val_idx)}')
    df_tr = train.iloc[train_idx].reset_index(drop=True)
    df_va = train.iloc[val_idx].reset_index(drop=True)
    y_tr = df_tr[target_cols].astype(np.float32).values
    y_va = df_va[target_cols].astype(np.float32).values

    ds_tr = QADataset(df_tr, y_tr)
    ds_va = QADataset(df_va, y_va)
    ds_te = QADataset(test, None)

    train_loader = DataLoader(ds_tr, batch_size=6, shuffle=True, num_workers=4, pin_memory=True, persistent_workers=True)
    val_loader   = DataLoader(ds_va, batch_size=16, shuffle=False, num_workers=4, pin_memory=True, persistent_workers=True)
    test_loader  = DataLoader(ds_te, batch_size=16, shuffle=False, num_workers=4, pin_memory=True, persistent_workers=True)

    model = DebertaMT(model_name, out_dim=len(target_cols), dropout_p=0.2, msd_k=1, loss_weights=loss_weights).to(device)
    # Initialize head bias to target means for stability
    with torch.no_grad():
        if hasattr(model.head, 'bias') and model.head.bias is not None:
            means = train[target_cols].mean().values.astype(np.float32)
            model.head.bias.copy_(torch.tensor(means, device=device))

    # Optimizer with no_decay groups (no WD on bias/LayerNorm)
    no_decay = ['bias', 'LayerNorm.weight']
    decay_params = []
    nodecay_params = []
    for n, p in model.named_parameters():
        if not p.requires_grad: continue
        if any(nd in n for nd in no_decay):
            nodecay_params.append(p)
        else:
            decay_params.append(p)
    optimizer = torch.optim.AdamW([
        {'params': decay_params, 'weight_decay': 0.01},
        {'params': nodecay_params, 'weight_decay': 0.0},
    ], lr=1.5e-5, betas=(0.9,0.999), eps=1e-6)

    num_epochs = 4
    grad_accum = 6  # effective batch 36
    steps_per_epoch = math.ceil(len(train_loader) / grad_accum)
    num_training_steps = steps_per_epoch * num_epochs
    warmup_steps = max(10, int(0.1 * num_training_steps))
    scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=num_training_steps)

    scaler = torch.amp.GradScaler('cuda', enabled=True)
    ema = EMA(model, decay=0.99)

    best_score = -1.0
    best_val_preds = None

    global_step = 0
    t0 = time.time()
    for epoch in range(num_epochs):
        model.train()
        tr_loss = 0.0
        optimizer.zero_grad(set_to_none=True)
        for step, batch in enumerate(train_loader):
            inputs = {k: v.to(device, non_blocking=True) for k,v in batch.items() if k not in ('labels','token_type_ids')}
            labels = batch['labels'].to(device, non_blocking=True)
            with torch.amp.autocast('cuda', enabled=True):
                logits, loss = model(**inputs, labels=labels)
                loss = loss / grad_accum
            scaler.scale(loss).backward()
            if (step + 1) % grad_accum == 0:
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad(set_to_none=True)
                scheduler.step()
                if global_step >= warmup_steps:
                    ema.update(model)
                global_step += 1
            tr_loss += loss.item() * grad_accum
            if (step+1) % 100 == 0:
                print(f'  Epoch {epoch+1} step {step+1}/{len(train_loader)} loss={tr_loss/(step+1):.4f}', flush=True)

        # Validation: plain vs EMA with MC dropout T=5
        def evaluate():
            preds_plain = predict_msd(model, val_loader, T=5, use_ema=False, ema_obj=None)
            preds_ema   = predict_msd(model, val_loader, T=5, use_ema=True, ema_obj=ema)
            tgts = df_va[target_cols].to_numpy(dtype=np.float32)
            s_plain, per_plain = spearman_cols(preds_plain, tgts)
            s_ema,   per_ema   = spearman_cols(preds_ema, tgts)
            return (s_plain, per_plain, preds_plain), (s_ema, per_ema, preds_ema)

        (s_plain, per_plain, vp_plain), (s_ema, per_ema, vp_ema) = evaluate()
        print(f'  Epoch {epoch+1} mean-30 Spearman plain/EMA: {s_plain:.5f}/{s_ema:.5f} | time {(time.time()-t0):.1f}s')
        score = s_plain if s_plain >= s_ema else s_ema
        val_preds = vp_plain if s_plain >= s_ema else vp_ema
        if score > best_score:
            best_score = score
            best_val_preds = val_preds.copy()

    # Test inference with EMA weights + MC dropout
    test_preds = predict_msd(model, test_loader, T=5, use_ema=True, ema_obj=ema)

    del model, optimizer, scheduler, scaler, train_loader, val_loader, test_loader, ds_tr, ds_va, ds_te
    torch.cuda.empty_cache(); gc.collect()
    return best_val_preds, test_preds, best_score

# Run CV
unique_folds = np.unique(folds)
oof = np.zeros((len(train), len(target_cols)), dtype=np.float32)
test_accum = np.zeros((len(unique_folds), len(test), len(target_cols)), dtype=np.float32)
fold_scores = []

overall_t0 = time.time()
for i, fold in enumerate(unique_folds):
    tr_idx = np.where(folds != fold)[0]
    va_idx = np.where(folds == fold)[0]
    start = time.time()
    va_pred, te_pred, score = run_fold(fold, tr_idx, va_idx)
    oof[va_idx] = va_pred
    test_accum[i] = te_pred
    fold_scores.append(float(score))
    print(f'Fold {fold} best mean-30 Spearman: {score:.5f} | fold time {time.time()-start:.1f}s', flush=True)

oof_mean_score, _ = spearman_cols(oof, train[target_cols].astype(np.float32).values)
print('Fold Spearmans:', [round(s,5) for s in fold_scores])
print(f'OOF mean-30 Spearman (deberta-v3-large): {oof_mean_score:.5f}')

# Save OOF/test
np.save('oof_all_targets_deberta_large.npy', np.clip(oof, 0, 1).astype(np.float32))
test_pred = test_accum.mean(axis=0).astype(np.float32)
test_pred = np.clip(test_pred, 0.0, 1.0).astype(np.float32)
np.save('test_all_targets_deberta_large.npy', test_pred)

# Build submission (separate file to avoid clobbering base run)
sub = sample_sub.copy()
sub[id_col] = test[id_col].values
for i, col in enumerate(target_cols):
    sub[col] = test_pred[:, i]
sub.to_csv('submission_deberta_large.csv', index=False)
print('Saved submission_deberta_large.csv. Total time:', round(time.time()-overall_t0,1),'s')

print('Done.')

  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda
Loss weights summary: 1.0 2.0 answer_* boosted, helpful=2.0x




Fold 0 start: tr=4376 va=1095


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


  Epoch 1 step 100/730 loss=0.1273


  Epoch 1 step 200/730 loss=0.0927


  Epoch 1 step 300/730 loss=0.0768


  Epoch 1 step 400/730 loss=0.0669


  Epoch 1 step 500/730 loss=0.0605


  Epoch 1 step 600/730 loss=0.0560


  Epoch 1 step 700/730 loss=0.0522


  Epoch 1 mean-30 Spearman plain/EMA: 0.27724/0.12023 | time 1046.9s


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


  Epoch 2 step 100/730 loss=0.0294


  Epoch 2 step 200/730 loss=0.0292


  Epoch 2 step 300/730 loss=0.0283


  Epoch 2 step 400/730 loss=0.0279


  Epoch 2 step 500/730 loss=0.0275


  Epoch 2 step 600/730 loss=0.0272


  Epoch 2 step 700/730 loss=0.0270


  Epoch 2 mean-30 Spearman plain/EMA: 0.33093/0.28088 | time 2097.3s


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


  Epoch 3 step 100/730 loss=0.0230


  Epoch 3 step 200/730 loss=0.0234


  Epoch 3 step 300/730 loss=0.0236


  Epoch 3 step 400/730 loss=0.0237


  Epoch 3 step 500/730 loss=0.0234


  Epoch 3 step 600/730 loss=0.0233


  Epoch 3 step 700/730 loss=0.0233


  Epoch 3 mean-30 Spearman plain/EMA: 0.34667/0.33658 | time 3147.6s


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


  Epoch 4 step 100/730 loss=0.0223


  Epoch 4 step 200/730 loss=0.0218


  Epoch 4 step 300/730 loss=0.0219


  Epoch 4 step 400/730 loss=0.0219


  Epoch 4 step 500/730 loss=0.0220


  Epoch 4 step 600/730 loss=0.0220


  Epoch 4 step 700/730 loss=0.0218


  Epoch 4 mean-30 Spearman plain/EMA: 0.35203/0.34818 | time 4198.6s


Fold 0 best mean-30 Spearman: 0.35203 | fold time 4347.1s


Fold 1 start: tr=4377 va=1094


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


  Epoch 1 step 100/730 loss=0.1463


  Epoch 1 step 200/730 loss=0.1034


  Epoch 1 step 300/730 loss=0.0846


  Epoch 1 step 400/730 loss=0.0735


  Epoch 1 step 500/730 loss=0.0657


  Epoch 1 step 600/730 loss=0.0604


  Epoch 1 step 700/730 loss=0.0564


  Epoch 1 mean-30 Spearman plain/EMA: 0.25704/0.11161 | time 1049.6s


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


  Epoch 2 step 100/730 loss=0.0291


  Epoch 2 step 200/730 loss=0.0288


  Epoch 2 step 300/730 loss=0.0283


  Epoch 2 step 400/730 loss=0.0279


  Epoch 2 step 500/730 loss=0.0277


  Epoch 2 step 600/730 loss=0.0275


  Epoch 2 step 700/730 loss=0.0272


  Epoch 2 mean-30 Spearman plain/EMA: 0.31470/0.26542 | time 2100.3s


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


  Epoch 3 step 100/730 loss=0.0241


  Epoch 3 step 200/730 loss=0.0244


  Epoch 3 step 300/730 loss=0.0239


  Epoch 3 step 400/730 loss=0.0237


  Epoch 3 step 500/730 loss=0.0236


  Epoch 3 step 600/730 loss=0.0237


  Epoch 3 step 700/730 loss=0.0237


  Epoch 3 mean-30 Spearman plain/EMA: 0.33425/0.31958 | time 3150.2s


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


  Epoch 4 step 100/730 loss=0.0232


  Epoch 4 step 200/730 loss=0.0225


  Epoch 4 step 300/730 loss=0.0222


  Epoch 4 step 400/730 loss=0.0222


  Epoch 4 step 500/730 loss=0.0222


  Epoch 4 step 600/730 loss=0.0224


  Epoch 4 step 700/730 loss=0.0224


  Epoch 4 mean-30 Spearman plain/EMA: 0.33211/0.33107 | time 4201.1s


Fold 1 best mean-30 Spearman: 0.33425 | fold time 4349.2s


Fold 2 start: tr=4377 va=1094


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


  Epoch 1 step 100/730 loss=0.1156


  Epoch 1 step 200/730 loss=0.0867


  Epoch 1 step 300/730 loss=0.0731


  Epoch 1 step 400/730 loss=0.0646


  Epoch 1 step 500/730 loss=0.0586


  Epoch 1 step 600/730 loss=0.0544


  Epoch 1 step 700/730 loss=0.0511


  Epoch 1 mean-30 Spearman plain/EMA: 0.25433/0.12583 | time 1050.0s


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


  Epoch 2 step 100/730 loss=0.0287


  Epoch 2 step 200/730 loss=0.0288


  Epoch 2 step 300/730 loss=0.0285


  Epoch 2 step 400/730 loss=0.0279


  Epoch 2 step 500/730 loss=0.0275


  Epoch 2 step 600/730 loss=0.0272


  Epoch 2 step 700/730 loss=0.0270


  Epoch 2 mean-30 Spearman plain/EMA: 0.32238/0.26444 | time 2100.7s


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


  Epoch 3 step 100/730 loss=0.0239


  Epoch 3 step 200/730 loss=0.0241


  Epoch 3 step 300/730 loss=0.0241


  Epoch 3 step 400/730 loss=0.0240


  Epoch 3 step 500/730 loss=0.0238


  Epoch 3 step 600/730 loss=0.0237


  Epoch 3 step 700/730 loss=0.0236


  Epoch 3 mean-30 Spearman plain/EMA: 0.34019/0.32707 | time 3151.0s


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


  Epoch 4 step 100/730 loss=0.0219


  Epoch 4 step 200/730 loss=0.0220


  Epoch 4 step 300/730 loss=0.0221


  Epoch 4 step 400/730 loss=0.0221


  Epoch 4 step 500/730 loss=0.0220


  Epoch 4 step 600/730 loss=0.0220


  Epoch 4 step 700/730 loss=0.0219


  Epoch 4 mean-30 Spearman plain/EMA: 0.34624/0.33698 | time 4201.6s


Fold 2 best mean-30 Spearman: 0.34624 | fold time 4349.8s


Fold 3 start: tr=4377 va=1094


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


  Epoch 1 step 100/730 loss=0.1290


  Epoch 1 step 200/730 loss=0.0944


  Epoch 1 step 300/730 loss=0.0772


  Epoch 1 step 400/730 loss=0.0675


  Epoch 1 step 500/730 loss=0.0612


  Epoch 1 step 600/730 loss=0.0564


  Epoch 1 step 700/730 loss=0.0530


  Epoch 1 mean-30 Spearman plain/EMA: 0.26634/0.13801 | time 1049.7s


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


  Epoch 2 step 100/730 loss=0.0291


  Epoch 2 step 200/730 loss=0.0283


  Epoch 2 step 300/730 loss=0.0281


  Epoch 2 step 400/730 loss=0.0279


  Epoch 2 step 500/730 loss=0.0276


  Epoch 2 step 600/730 loss=0.0273


  Epoch 2 step 700/730 loss=0.0272


  Epoch 2 mean-30 Spearman plain/EMA: 0.32646/0.27800 | time 2100.4s


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


  Epoch 3 step 100/730 loss=0.0238


  Epoch 3 step 200/730 loss=0.0242


  Epoch 3 step 300/730 loss=0.0242


  Epoch 3 step 400/730 loss=0.0241


  Epoch 3 step 500/730 loss=0.0240


  Epoch 3 step 600/730 loss=0.0239


  Epoch 3 step 700/730 loss=0.0237


  Epoch 3 mean-30 Spearman plain/EMA: 0.34080/0.32305 | time 3150.9s


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


  Epoch 4 step 100/730 loss=0.0223


  Epoch 4 step 200/730 loss=0.0222


  Epoch 4 step 300/730 loss=0.0225


  Epoch 4 step 400/730 loss=0.0223


  Epoch 4 step 500/730 loss=0.0224


  Epoch 4 step 600/730 loss=0.0223


  Epoch 4 step 700/730 loss=0.0222


  Epoch 4 mean-30 Spearman plain/EMA: 0.34791/0.33907 | time 4201.2s


Fold 3 best mean-30 Spearman: 0.34791 | fold time 4349.7s


Fold 4 start: tr=4377 va=1094


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


  Epoch 1 step 100/730 loss=0.1093


  Epoch 1 step 200/730 loss=0.0822


  Epoch 1 step 300/730 loss=0.0691


  Epoch 1 step 400/730 loss=0.0611


  Epoch 1 step 500/730 loss=0.0557


  Epoch 1 step 600/730 loss=0.0517


  Epoch 1 step 700/730 loss=0.0485


  Epoch 1 mean-30 Spearman plain/EMA: 0.27393/0.14426 | time 1049.9s


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


  Epoch 2 step 100/730 loss=0.0289


  Epoch 2 step 200/730 loss=0.0281


  Epoch 2 step 300/730 loss=0.0277


  Epoch 2 step 400/730 loss=0.0274


  Epoch 2 step 500/730 loss=0.0272


  Epoch 2 step 600/730 loss=0.0269


  Epoch 2 step 700/730 loss=0.0267


  Epoch 2 mean-30 Spearman plain/EMA: 0.32082/0.28410 | time 2100.5s


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


  Epoch 3 step 100/730 loss=0.0237


  Epoch 3 step 200/730 loss=0.0235


  Epoch 3 step 300/730 loss=0.0239


  Epoch 3 step 400/730 loss=0.0237


  Epoch 3 step 500/730 loss=0.0235


  Epoch 3 step 600/730 loss=0.0234


  Epoch 3 step 700/730 loss=0.0234


  Epoch 3 mean-30 Spearman plain/EMA: 0.33726/0.32118 | time 3150.9s


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


  Epoch 4 step 100/730 loss=0.0226


  Epoch 4 step 200/730 loss=0.0223


  Epoch 4 step 300/730 loss=0.0225


  Epoch 4 step 400/730 loss=0.0222


  Epoch 4 step 500/730 loss=0.0221


  Epoch 4 step 600/730 loss=0.0220


  Epoch 4 step 700/730 loss=0.0219


  Epoch 4 mean-30 Spearman plain/EMA: 0.33786/0.33662 | time 4201.7s


Fold 4 best mean-30 Spearman: 0.33786 | fold time 4350.2s


Fold Spearmans: [0.35203, 0.33425, 0.34624, 0.34791, 0.33786]
OOF mean-30 Spearman (deberta-v3-large): 0.34303
Saved submission_deberta_large.csv. Total time: 21746.1 s
Done.
