In [1]:
# Multi-output DeBERTa-v3-base (30 targets) with 5-fold SGKF, Q+A packing, mean Spearman metric
import os, gc, time, json, numpy as np, pandas as pd, torch
from pathlib import Path
from sklearn.model_selection import StratifiedGroupKFold
from scipy.stats import spearmanr
from transformers import (AutoTokenizer, AutoConfig, AutoModelForSequenceClassification,
                          TrainingArguments, Trainer, EarlyStoppingCallback, DataCollatorWithPadding, set_seed)

torch.set_float32_matmul_precision('high')
SEED = 42
set_seed(SEED)

# Data
train = pd.read_csv('train.csv'); test = pd.read_csv('test.csv')
id_col = 'qa_id'
targets = [
    'question_asker_intent_understanding','question_body_critical','question_conversational','question_expect_short_answer',
    'question_fact_seeking','question_has_commonly_accepted_answer','question_interestingness_others','question_interestingness_self',
    'question_multi_intent','question_not_really_a_question','question_opinion_seeking','question_type_choice','question_type_compare',
    'question_type_consequence','question_type_definition','question_type_entity','question_type_instructions','question_type_procedure',
    'question_type_reason_explanation','question_type_spelling','question_well_written','answer_helpful','answer_level_of_information',
    'answer_plausible','answer_relevance','answer_satisfaction','answer_type_instructions','answer_type_procedure',
    'answer_type_reason_explanation','answer_well_written'
]
assert set(targets).issubset(train.columns), 'Missing QUEST targets in train.csv'
Y = train[targets].astype(float).values  # (N,30)

# Folds: StratifiedGroupKFold by group mean of main target (reuse established protocol)
main_target = 'question_asker_intent_understanding'
y_main = train[main_target].values.astype(float)
if Path('train_group_keys.csv').exists():
    groups = pd.read_csv('train_group_keys.csv')['group_key'].values
else:
    groups = pd.util.hash_pandas_object((train['question_title'].fillna('')+'||'+train['question_body'].fillna('')), index=False).astype('int64').values
df_groups = pd.DataFrame({'group': groups, 'y': y_main})
grp_mean = df_groups.groupby('group')['y'].mean()
bins = pd.qcut(grp_mean, q=10, labels=False, duplicates='drop')
grp_to_bin = dict(zip(grp_mean.index.values, bins.astype(int)))
row_bins = np.array([grp_to_bin[g] for g in groups], dtype=int)
sgkf = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)
splits = list(sgkf.split(np.zeros_like(y_main), y=row_bins, groups=groups))

# Tokenization: Q+A template with dynamic truncation
model_name = 'microsoft/deberta-v3-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)
MAX_LEN = 512; TITLE_MAX = 64

def pack_qa(title: str, body: str, answer: str, tokenizer, max_len=MAX_LEN, title_max=TITLE_MAX):
    ti = tokenizer(title if isinstance(title, str) else '', add_special_tokens=False, truncation=True, max_length=title_max)['input_ids']
    bi_full = tokenizer(body if isinstance(body, str) else '', add_special_tokens=False, truncation=False)['input_ids']
    ai_full = tokenizer(answer if isinstance(answer, str) else '', add_special_tokens=False, truncation=False)['input_ids']
    # CLS + 3*SEP
    rem = max_len - (1 + 1 + 1 + 1) - len(ti)
    rem = max(rem, 0)
    lb, la = len(bi_full), len(ai_full)
    if lb + la == 0:
        bi, ai = [], []
    else:
        if rem >= 96:
            qb = max(48, int(rem * (lb/(lb+la))))
            qa = rem - qb
        else:
            qb = rem // 2
            qa = rem - qb
        bi, ai = bi_full[:qb], ai_full[:qa]
    ids = [tokenizer.cls_token_id] + ti + [tokenizer.sep_token_id] + bi + [tokenizer.sep_token_id] + ai + [tokenizer.sep_token_id]
    ids = ids[:max_len]
    attn = [1]*len(ids)
    pad = max_len - len(ids)
    if pad>0:
        ids += [tokenizer.pad_token_id]*pad
        attn += [0]*pad
    return ids, attn

def build_inputs(df: pd.DataFrame):
    T = df['question_title'].fillna('').astype(str).tolist()
    B = df['question_body'].fillna('').astype(str).tolist()
    A = df['answer'].fillna('').astype(str).tolist() if 'answer' in df.columns else ['']*len(df)
    input_ids, attention_masks = [], []
    for t,b,a in zip(T,B,A):
        ids, attn = pack_qa(t,b,a, tokenizer)
        input_ids.append(ids); attention_masks.append(attn)
    return {'input_ids': np.array(input_ids, dtype=np.int64), 'attention_mask': np.array(attention_masks, dtype=np.int64)}

print('[TOK] Building inputs (Q+A) ...', flush=True)
t0_tok = time.time()
tr_inputs = build_inputs(train)
te_inputs = build_inputs(test)
print(f'[TOK] Done in {time.time()-t0_tok:.1f}s; shapes tr={tr_inputs["input_ids"].shape} te={te_inputs["input_ids"].shape}', flush=True)

class QADataset(torch.utils.data.Dataset):
    def __init__(self, ids, masks, labels=None):
        self.ids = ids; self.masks = masks; self.labels = labels
    def __len__(self): return len(self.ids)
    def __getitem__(self, idx):
        item = {'input_ids': torch.tensor(self.ids[idx]), 'attention_mask': torch.tensor(self.masks[idx])}
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float32)
        return item

def compute_metrics(eval_pred):
    preds, labels = eval_pred  # preds: (N,30), labels: (N,30)
    vals = []
    for j in range(labels.shape[1]):
        p = preds[:, j]; y = labels[:, j]
        if np.std(p)==0 or np.std(y)==0:
            vals.append(0.0)
        else:
            vals.append(float(spearmanr(y, p).correlation))
    return {'mean_spearman': float(np.mean(vals))}

# Training loop (1 seed, 5 folds)
config = AutoConfig.from_pretrained(model_name, num_labels=30, problem_type='regression')
oof = np.zeros((len(train), 30), dtype=np.float32)
test_fold_preds = []
folds_idx = np.full(len(train), -1, dtype=int)

for fold, (trn_idx, val_idx) in enumerate(splits):
    t0 = time.time()
    folds_idx[val_idx] = fold
    print(f'\n[MULTI FOLD {fold}] train={len(trn_idx)} val={len(val_idx)}', flush=True)
    tr_ds = QADataset(tr_inputs['input_ids'][trn_idx], tr_inputs['attention_mask'][trn_idx], Y[trn_idx])
    va_ds = QADataset(tr_inputs['input_ids'][val_idx], tr_inputs['attention_mask'][val_idx], Y[val_idx])
    te_ds = QADataset(te_inputs['input_ids'], te_inputs['attention_mask'], None)

    model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config)
    # Enable gradient checkpointing (also via args)
    try: model.gradient_checkpointing_enable()
    except Exception: pass

    args = TrainingArguments(
        output_dir=f'deberta_multi_fold{fold}',
        num_train_epochs=2,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=32,
        gradient_accumulation_steps=2,
        learning_rate=2e-5,
        weight_decay=0.01,
        warmup_ratio=0.1,
        lr_scheduler_type='linear',
        fp16=True,
        gradient_checkpointing=True,
        evaluation_strategy='epoch',
        save_strategy='epoch',
        load_best_model_at_end=True,
        metric_for_best_model='mean_spearman',
        greater_is_better=True,
        save_total_limit=1,
        logging_steps=50,
        seed=SEED,
        report_to=[]
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=tr_ds,
        eval_dataset=va_ds,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=1)],
        data_collator=DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)
    )

    trainer.train()
    # Validation preds
    val_pred = trainer.predict(va_ds).predictions  # (N_val,30)
    oof[val_idx] = val_pred.astype(np.float32)
    # Fold metric (mean Spearman)
    vals = []
    for j in range(val_pred.shape[1]):
        p = val_pred[:, j]; y = Y[val_idx, j]
        vals.append(0.0 if np.std(p)==0 or np.std(y)==0 else float(spearmanr(y, p).correlation))
    print(f'[MULTI FOLD {fold}] mean Spearman={np.mean(vals):.5f} time={time.time()-t0:.1f}s', flush=True)
    # Test preds
    te_pred = trainer.predict(te_ds).predictions.astype(np.float32)  # (608,30)
    test_fold_preds.append(te_pred)
    del trainer, model; gc.collect()
    if torch.cuda.is_available(): torch.cuda.empty_cache()

# Aggregate and save artifacts
oof_mean = []
for j in range(oof.shape[1]):
    yj = train[targets[j]].values.astype(float)
    pj = oof[:, j]
    sc = 0.0 if np.std(pj)==0 or np.std(yj)==0 else float(spearmanr(yj, pj).correlation)
    oof_mean.append(sc)
print('[MULTI] OOF per-target Spearman (first 5):', np.round(oof_mean[:5], 5))
print('[MULTI] OOF mean Spearman:', float(np.mean(oof_mean)))
np.save('oof_deberta_multi.npy', oof)
pd.DataFrame({'qa_id': train[id_col], 'fold': folds_idx}).assign(**{f't{j}': oof[:, j] for j in range(oof.shape[1])}).to_csv('oof_deberta_multi.csv', index=False)

test_mean = np.mean(np.stack(test_fold_preds, axis=0), axis=0).astype(np.float32)  # (608,30)
np.save('test_deberta_multi.npy', test_mean)

# Build 31-col submission: fill all 30 targets with model preds (clipped [0,1])
samp = pd.read_csv('sample_submission.csv')
assert 'qa_id' in samp.columns and len(samp.columns)==31, 'Unexpected sample_submission schema'
samp['qa_id'] = pd.to_numeric(test['qa_id'], errors='raise').astype('int64')
for i, col in enumerate(targets):
    samp[col] = np.clip(test_mean[:, i], 0.0, 1.0).astype(float)
samp.to_csv('submission_multi.csv', index=False, float_format='%.8f')
print('[SUB] submission_multi.csv written:', samp.shape)

# Optional: override main target with our single-target ensemble if available
if Path('test_ensemble.npy').exists():
    preds_main = np.clip(np.load('test_ensemble.npy').astype(float), 0.0, 1.0)
    samp_ovr = samp.copy()
    samp_ovr[main_target] = preds_main
    samp_ovr.to_csv('submission_multi_override.csv', index=False, float_format='%.8f')
    print('[SUB] submission_multi_override.csv written with main target overridden.')

print('[DONE] Multi-output pipeline complete. Artifacts: oof_deberta_multi.npy, test_deberta_multi.npy, submission_multi.csv')

  from .autonotebook import tqdm as notebook_tqdm




[TOK] Building inputs (Q+A) ...


[TOK] Done in 5.0s; shapes tr=(5471, 512) te=(608, 512)



[MULTI FOLD 0] train=4395 val=1076


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Epoch,Training Loss,Validation Loss


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


[MULTI FOLD 0] mean Spearman=0.32684 time=426.9s



[MULTI FOLD 1] train=4318 val=1153


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


  return fn(*args, **kwargs)


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Epoch,Training Loss,Validation Loss


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


[MULTI FOLD 1] mean Spearman=0.31641 time=425.9s



[MULTI FOLD 2] train=4389 val=1082


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


  return fn(*args, **kwargs)


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Epoch,Training Loss,Validation Loss


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


[MULTI FOLD 2] mean Spearman=0.29213 time=430.0s



[MULTI FOLD 3] train=4399 val=1072


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


  return fn(*args, **kwargs)


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Epoch,Training Loss,Validation Loss


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


[MULTI FOLD 3] mean Spearman=0.30830 time=431.6s



[MULTI FOLD 4] train=4383 val=1088


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Epoch,Training Loss,Validation Loss


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


[MULTI FOLD 4] mean Spearman=0.32581 time=430.2s


[MULTI] OOF per-target Spearman (first 5): [0.2753  0.4433  0.39041 0.23185 0.31649]
[MULTI] OOF mean Spearman: 0.3114201564172349
[SUB] submission_multi.csv written: (608, 31)
[SUB] submission_multi_override.csv written with main target overridden.
[DONE] Multi-output pipeline complete. Artifacts: oof_deberta_multi.npy, test_deberta_multi.npy, submission_multi.csv


In [2]:
# Promote multi-output submission to submission.csv
import pandas as pd, os
src = 'submission_multi_override.csv' if os.path.exists('submission_multi_override.csv') else 'submission_multi.csv'
df = pd.read_csv(src)
assert df.shape[1] == 31 and 'qa_id' in df.columns, f'Unexpected schema in {src}: {df.shape} columns={list(df.columns)[:5]} ...'
df.to_csv('submission.csv', index=False, float_format='%.8f')
print('[SUBMIT] Wrote submission.csv from', src, 'shape=', df.shape)

[SUBMIT] Wrote submission.csv from submission_multi_override.csv shape= (608, 31)


In [3]:
# MC Dropout TTA (8 passes) for multi-output DeBERTa; build new submission
import json, gc, time, numpy as np, torch
from pathlib import Path
from transformers import AutoConfig, AutoModelForSequenceClassification
from scipy.stats import spearmanr

def spearmanr_mean_30(oof_mat, Y_true):
    vals = []
    for j in range(oof_mat.shape[1]):
        p = oof_mat[:, j]; y = Y_true[:, j]
        if np.std(p)==0 or np.std(y)==0: vals.append(0.0)
        else: vals.append(float(spearmanr(y, p).correlation))
    return float(np.mean(vals))

def best_ckpt_path(out_dir: str):
    # Try trainer_state.json at root
    state_path = Path(out_dir)/'trainer_state.json'
    if state_path.exists():
        try:
            st = json.loads(state_path.read_text())
            best = st.get('best_model_checkpoint', None)
            if best and Path(best).exists():
                return best
        except Exception:
            pass
    # Fallback to most recent checkpoint-*
    p = Path(out_dir)
    if p.exists():
        cands = sorted([q for q in p.glob('checkpoint-*') if q.is_dir()])
        if cands:
            return str(cands[-1])
    return None

use_cuda = torch.cuda.is_available()
device = torch.device('cuda' if use_cuda else 'cpu')
torch.set_float32_matmul_precision('high')

@torch.inference_mode()
def mc_predict_multi(model, ids, masks, passes=8, batch_size=16):
    model.train()  # enable dropout
    N = len(ids); out = np.zeros((N, 30), dtype=np.float32)
    for rep in range(passes):
        t0 = time.time()
        preds = []
        for i in range(0, N, batch_size):
            s = slice(i, min(i+batch_size, N))
            input_ids = torch.tensor(ids[s], device=device)
            attention_mask = torch.tensor(masks[s], device=device)
            if use_cuda:
                with torch.autocast('cuda', dtype=torch.float16):
                    logits = model(input_ids=input_ids, attention_mask=attention_mask).logits  # (B,30)
            else:
                logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
            preds.append(logits.float().cpu().numpy())
        pass_preds = np.vstack(preds)
        out += pass_preds
        print(f"    [mc_pass {rep+1}/{passes}] N={N} elapsed={time.time()-t0:.1f}s", flush=True)
    return out / passes

print('[MC-MULTI] Starting MC TTA over 5 folds ...', flush=True)
oof_mc = np.zeros_like(Y, dtype=np.float32)
test_accum = []

for fold, (trn_idx, val_idx) in enumerate(splits):
    out_dir = f'deberta_multi_fold{fold}'
    ckpt = best_ckpt_path(out_dir)
    if ckpt is None or not Path(ckpt).exists():
        print(f"[MC-MULTI] WARNING: checkpoint not found for {out_dir}")
    assert ckpt is not None and Path(ckpt).exists(), f'Checkpoint for fold {fold} not found'
    print(f"[MC-MULTI] fold={fold} ckpt={ckpt}")

    # Cache paths
    val_cache = Path(f'val_multi_mc_fold{fold}_p8.npy')
    test_cache = Path(f'test_multi_mc_fold{fold}_p8.npy')

    if val_cache.exists() and test_cache.exists():
        print(f"[MC-MULTI] Loading cached mc preds for fold {fold}")
        pv = np.load(val_cache); pt = np.load(test_cache)
    else:
        t_load = time.time()
        config = AutoConfig.from_pretrained('microsoft/deberta-v3-base', num_labels=30, problem_type='regression')
        model = AutoModelForSequenceClassification.from_pretrained(ckpt, config=config).to(device)
        print(f"[MC-MULTI] Model loaded in {time.time()-t_load:.1f}s; running MC inference...", flush=True)
        pv = mc_predict_multi(model, tr_inputs['input_ids'][val_idx], tr_inputs['attention_mask'][val_idx], passes=8, batch_size=16)
        pt = mc_predict_multi(model, te_inputs['input_ids'], te_inputs['attention_mask'], passes=8, batch_size=16)
        np.save(val_cache, pv.astype(np.float32))
        np.save(test_cache, pt.astype(np.float32))
        del model; gc.collect()
        if use_cuda: torch.cuda.empty_cache()

    oof_mc[val_idx] = pv.astype(np.float32)
    test_accum.append(pt.astype(np.float32))
    fold_sc = spearmanr_mean_30(pv, Y[val_idx])
    print(f"[MC-MULTI] fold={fold} mean Spearman={fold_sc:.5f}", flush=True)

test_mc = np.mean(np.stack(test_accum, axis=0), axis=0).astype(np.float32)  # (608,30)
mc_oof_mean = spearmanr_mean_30(oof_mc, Y)
print(f"[MC-MULTI] OOF mean Spearman (MC): {mc_oof_mean:.5f}")
np.save('oof_deberta_multi_mc8.npy', oof_mc)
np.save('test_deberta_multi_mc8.npy', test_mc)

# Build submissions (raw MC and main-target override)
samp = pd.read_csv('sample_submission.csv')
samp['qa_id'] = pd.to_numeric(test['qa_id'], errors='raise').astype('int64')
for i, col in enumerate(targets):
    samp[col] = np.clip(test_mc[:, i], 0.0, 1.0).astype(float)
samp.to_csv('submission_multi_mc.csv', index=False, float_format='%.8f')
print('[SUB] submission_multi_mc.csv written:', samp.shape)

from pathlib import Path as _Path
if _Path('test_ensemble.npy').exists():
    samp_ovr = samp.copy()
    samp_ovr[main_target] = np.clip(np.load('test_ensemble.npy').astype(float), 0.0, 1.0)
    samp_ovr.to_csv('submission_multi_mc_override.csv', index=False, float_format='%.8f')
    print('[SUB] submission_multi_mc_override.csv written (main target overridden)')

print('[MC-MULTI] Done.')

[MC-MULTI] Starting MC TTA over 5 folds ...


[MC-MULTI] fold=0 ckpt=deberta_multi_fold0/checkpoint-550


[MC-MULTI] Model loaded in 0.3s; running MC inference...


    [mc_pass 1/8] N=1076 elapsed=13.3s


    [mc_pass 2/8] N=1076 elapsed=13.3s


    [mc_pass 3/8] N=1076 elapsed=13.3s


    [mc_pass 4/8] N=1076 elapsed=13.3s


    [mc_pass 5/8] N=1076 elapsed=13.3s


    [mc_pass 6/8] N=1076 elapsed=13.4s


    [mc_pass 7/8] N=1076 elapsed=13.4s


    [mc_pass 8/8] N=1076 elapsed=13.4s


    [mc_pass 1/8] N=608 elapsed=7.6s


    [mc_pass 2/8] N=608 elapsed=7.6s


    [mc_pass 3/8] N=608 elapsed=7.6s


    [mc_pass 4/8] N=608 elapsed=7.6s


    [mc_pass 5/8] N=608 elapsed=7.6s


    [mc_pass 6/8] N=608 elapsed=7.6s


    [mc_pass 7/8] N=608 elapsed=7.6s


    [mc_pass 8/8] N=608 elapsed=7.6s


[MC-MULTI] fold=0 mean Spearman=0.31161


[MC-MULTI] fold=1 ckpt=deberta_multi_fold1/checkpoint-540


[MC-MULTI] Model loaded in 0.2s; running MC inference...


    [mc_pass 1/8] N=1153 elapsed=14.4s


    [mc_pass 2/8] N=1153 elapsed=14.4s


    [mc_pass 3/8] N=1153 elapsed=14.4s


    [mc_pass 4/8] N=1153 elapsed=14.4s


    [mc_pass 5/8] N=1153 elapsed=14.5s


    [mc_pass 6/8] N=1153 elapsed=14.5s


    [mc_pass 7/8] N=1153 elapsed=14.5s


    [mc_pass 8/8] N=1153 elapsed=14.5s


    [mc_pass 1/8] N=608 elapsed=7.6s


    [mc_pass 2/8] N=608 elapsed=7.6s


    [mc_pass 3/8] N=608 elapsed=7.6s


    [mc_pass 4/8] N=608 elapsed=7.6s


    [mc_pass 5/8] N=608 elapsed=7.6s


    [mc_pass 6/8] N=608 elapsed=7.6s


    [mc_pass 7/8] N=608 elapsed=7.6s


    [mc_pass 8/8] N=608 elapsed=7.6s


[MC-MULTI] fold=1 mean Spearman=0.29563


[MC-MULTI] fold=2 ckpt=deberta_multi_fold2/checkpoint-548


[MC-MULTI] Model loaded in 0.2s; running MC inference...


    [mc_pass 1/8] N=1082 elapsed=13.6s


    [mc_pass 2/8] N=1082 elapsed=13.6s


    [mc_pass 3/8] N=1082 elapsed=13.6s


    [mc_pass 4/8] N=1082 elapsed=13.6s


    [mc_pass 5/8] N=1082 elapsed=13.6s


    [mc_pass 6/8] N=1082 elapsed=13.6s


    [mc_pass 7/8] N=1082 elapsed=13.6s


    [mc_pass 8/8] N=1082 elapsed=13.6s


    [mc_pass 1/8] N=608 elapsed=7.6s


    [mc_pass 2/8] N=608 elapsed=7.6s


    [mc_pass 3/8] N=608 elapsed=7.6s


    [mc_pass 4/8] N=608 elapsed=7.6s


    [mc_pass 5/8] N=608 elapsed=7.6s


    [mc_pass 6/8] N=608 elapsed=7.6s


    [mc_pass 7/8] N=608 elapsed=7.6s


    [mc_pass 8/8] N=608 elapsed=7.6s


[MC-MULTI] fold=2 mean Spearman=0.27305


[MC-MULTI] fold=3 ckpt=deberta_multi_fold3/checkpoint-550


[MC-MULTI] Model loaded in 0.2s; running MC inference...


    [mc_pass 1/8] N=1072 elapsed=13.4s


    [mc_pass 2/8] N=1072 elapsed=13.5s


    [mc_pass 3/8] N=1072 elapsed=13.4s


    [mc_pass 4/8] N=1072 elapsed=13.4s


    [mc_pass 5/8] N=1072 elapsed=13.4s


    [mc_pass 6/8] N=1072 elapsed=13.4s


    [mc_pass 7/8] N=1072 elapsed=13.4s


    [mc_pass 8/8] N=1072 elapsed=13.5s


    [mc_pass 1/8] N=608 elapsed=7.6s


    [mc_pass 2/8] N=608 elapsed=7.6s


    [mc_pass 3/8] N=608 elapsed=7.6s


    [mc_pass 4/8] N=608 elapsed=7.6s


    [mc_pass 5/8] N=608 elapsed=7.6s


    [mc_pass 6/8] N=608 elapsed=7.6s


    [mc_pass 7/8] N=608 elapsed=7.6s


    [mc_pass 8/8] N=608 elapsed=7.6s


[MC-MULTI] fold=3 mean Spearman=0.28468


[MC-MULTI] fold=4 ckpt=deberta_multi_fold4/checkpoint-548


[MC-MULTI] Model loaded in 0.2s; running MC inference...


    [mc_pass 1/8] N=1088 elapsed=13.7s


    [mc_pass 2/8] N=1088 elapsed=13.7s


    [mc_pass 3/8] N=1088 elapsed=13.7s


    [mc_pass 4/8] N=1088 elapsed=13.7s


    [mc_pass 5/8] N=1088 elapsed=13.7s


    [mc_pass 6/8] N=1088 elapsed=13.7s


    [mc_pass 7/8] N=1088 elapsed=13.7s


    [mc_pass 8/8] N=1088 elapsed=13.7s


    [mc_pass 1/8] N=608 elapsed=7.6s


    [mc_pass 2/8] N=608 elapsed=7.6s


    [mc_pass 3/8] N=608 elapsed=7.6s


    [mc_pass 4/8] N=608 elapsed=7.6s


    [mc_pass 5/8] N=608 elapsed=7.6s


    [mc_pass 6/8] N=608 elapsed=7.6s


    [mc_pass 7/8] N=608 elapsed=7.6s


    [mc_pass 8/8] N=608 elapsed=7.6s


[MC-MULTI] fold=4 mean Spearman=0.30130


[MC-MULTI] OOF mean Spearman (MC): 0.29173
[SUB] submission_multi_mc.csv written: (608, 31)
[SUB] submission_multi_mc_override.csv written (main target overridden)
[MC-MULTI] Done.


In [4]:
# Blend multi-output DeBERTa (test_deberta_multi.npy) with TFIDF+SVD+Ridge predictions per target; override main target with strong single-target ensemble
import numpy as np, pandas as pd
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import Ridge
from scipy import sparse

targets = [
    'question_asker_intent_understanding','question_body_critical','question_conversational','question_expect_short_answer',
    'question_fact_seeking','question_has_commonly_accepted_answer','question_interestingness_others','question_interestingness_self',
    'question_multi_intent','question_not_really_a_question','question_opinion_seeking','question_type_choice','question_type_compare',
    'question_type_consequence','question_type_definition','question_type_entity','question_type_instructions','question_type_procedure',
    'question_type_reason_explanation','question_type_spelling','question_well_written','answer_helpful','answer_level_of_information',
    'answer_plausible','answer_relevance','answer_satisfaction','answer_type_instructions','answer_type_procedure',
    'answer_type_reason_explanation','answer_well_written'
]
main_target = 'question_asker_intent_understanding'

# Load transformer test predictions (N_test,30)
tfm_test = np.load('test_deberta_multi.npy') if Path('test_deberta_multi.npy').exists() else None
if tfm_test is None:
    # fallback to MC if only that exists (even if slightly worse OOF, test may still help via blend)
    tfm_test = np.load('test_deberta_multi_mc8.npy')

# Build TFIDF+SVD features once and train multi-output Ridge on full train to get test preds
train = pd.read_csv('train.csv'); test = pd.read_csv('test.csv')
def combine(df):
    t = df.get('question_title', pd.Series(['']*len(df))).fillna('').astype(str)
    b = df.get('question_body', pd.Series(['']*len(df))).fillna('').astype(str)
    a = df.get('answer', pd.Series(['']*len(df))).fillna('').astype(str)
    return (t + ' [SEP] ' + b + ' [SEP] ' + a).values
txt_tr = combine(train); txt_te = combine(test)
cfg_word = dict(analyzer='word', ngram_range=(1,2), sublinear_tf=True, strip_accents='unicode', lowercase=True, min_df=2)
cfg_char = dict(analyzer='char_wb', ngram_range=(3,6), sublinear_tf=True, min_df=2)
vec_w = TfidfVectorizer(max_features=200_000, **cfg_word)
vec_c = TfidfVectorizer(max_features=200_000, **cfg_char)
Xw_tr = vec_w.fit_transform(txt_tr); Xw_te = vec_w.transform(txt_te)
Xc_tr = vec_c.fit_transform(txt_tr); Xc_te = vec_c.transform(txt_te)
X_tr = sparse.hstack([Xw_tr, Xc_tr], format='csr')
X_te = sparse.hstack([Xw_te, Xc_te], format='csr')
svd = TruncatedSVD(n_components=256, random_state=42)
Z_tr = svd.fit_transform(X_tr); Z_te = svd.transform(X_te)
Y = train[targets].astype(float).values
ridge = Ridge(alpha=10.0, random_state=42)
ridge.fit(Z_tr, Y)
ridge_test = np.clip(ridge.predict(Z_te).astype(float), 0.0, 1.0)

# Simple global blend per target: pred = (1-w)*tfm + w*ridge with small w for stability
w = 0.20
blend = np.clip((1.0 - w) * tfm_test + w * ridge_test, 0.0, 1.0).astype(float)

# Override main target with our best single-target ensemble if available
if Path('test_ensemble.npy').exists():
    main_pred = np.clip(np.load('test_ensemble.npy').astype(float), 0.0, 1.0)
    j = targets.index(main_target)
    blend[:, j] = main_pred

# Write final submission.csv (31 columns) from sample template
samp = pd.read_csv('sample_submission.csv')
samp['qa_id'] = pd.to_numeric(test['qa_id'], errors='raise').astype('int64')
for i, col in enumerate(targets):
    samp[col] = blend[:, i].astype(float)
samp.to_csv('submission.csv', index=False, float_format='%.8f')
print('[BLEND SUB] submission.csv written:', samp.shape)

[BLEND SUB] submission.csv written: (608, 31)


In [7]:
# DeBERTa-v3-base with mean-pooled head (30-dim regressor) - Question-only view (title + body); 5-fold, 4 epochs
import os, gc, time, json, numpy as np, pandas as pd, torch, torch.nn as nn
from pathlib import Path
from scipy.stats import spearmanr
from transformers import (AutoTokenizer, AutoConfig, AutoModel,
                          TrainingArguments, Trainer, EarlyStoppingCallback, DataCollatorWithPadding)

assert 'targets' in globals() and 'train' in globals() and 'test' in globals() and 'splits' in globals(), 'Run Cell 0 first.'
model_name_q = 'microsoft/deberta-v3-base'
if 'tokenizer' not in globals():
    tokenizer = AutoTokenizer.from_pretrained(model_name_q)

MAX_LEN_Q = 512
TITLE_MAX_Q = 64

def pack_question_only(title: str, body: str, tokenizer, max_len=MAX_LEN_Q, title_max=TITLE_MAX_Q):
    ti = tokenizer(title if isinstance(title, str) else '', add_special_tokens=False, truncation=True, max_length=title_max)['input_ids']
    bi_full = tokenizer(body if isinstance(body, str) else '', add_special_tokens=False, truncation=False)['input_ids']
    # CLS + 2*SEP
    rem = max_len - (1 + 1 + 1) - len(ti)
    rem = max(rem, 0)
    bi = bi_full[:rem]
    ids = [tokenizer.cls_token_id] + ti + [tokenizer.sep_token_id] + bi + [tokenizer.sep_token_id]
    ids = ids[:max_len]
    attn = [1]*len(ids)
    pad = max_len - len(ids)
    if pad>0:
        ids += [tokenizer.pad_token_id]*pad
        attn += [0]*pad
    return ids, attn

def build_inputs_q_only(df: pd.DataFrame):
    T = df['question_title'].fillna('').astype(str).tolist()
    B = df['question_body'].fillna('').astype(str).tolist()
    input_ids, attention_masks = [], []
    for t,b in zip(T,B):
        ids, attn = pack_question_only(t,b, tokenizer)
        input_ids.append(ids); attention_masks.append(attn)
    return {'input_ids': np.array(input_ids, dtype=np.int64), 'attention_mask': np.array(attention_masks, dtype=np.int64)}

print('[TOK-Q] Building inputs (Question-only) ...', flush=True)
t0_tok = time.time()
tr_q_inputs = build_inputs_q_only(train)
te_q_inputs = build_inputs_q_only(test)
print(f'[TOK-Q] Done in {time.time()-t0_tok:.1f}s; shapes tr={tr_q_inputs["input_ids"].shape} te={te_q_inputs["input_ids"].shape}', flush=True)

class QDataset(torch.utils.data.Dataset):
    def __init__(self, ids, masks, labels=None):
        self.ids = ids; self.masks = masks; self.labels = labels
    def __len__(self): return len(self.ids)
    def __getitem__(self, idx):
        item = {'input_ids': torch.tensor(self.ids[idx]), 'attention_mask': torch.tensor(self.masks[idx])}
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float32)
        return item

def masked_mean_pool(last_hidden_state, attention_mask):
    mask = attention_mask.unsqueeze(-1).type_as(last_hidden_state)  # (B,L,1)
    summ = (last_hidden_state * mask).sum(dim=1)
    denom = mask.sum(dim=1).clamp(min=1e-6)
    return summ / denom

class QuestMultiRegressor(nn.Module):
    def __init__(self, model_name: str, out_dim: int = 30, dropout: float = 0.2, msd: int = 0):
        super().__init__()
        self.backbone = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout)
        hidden = self.backbone.config.hidden_size
        self.head = nn.Linear(hidden, out_dim)
        self.msd = msd
        # enable gradient checkpointing if available for backbone
        try: self.backbone.gradient_checkpointing_enable()
        except Exception: pass
    # HF Trainer expects these on the model; proxy to backbone
    def gradient_checkpointing_enable(self, gradient_checkpointing_kwargs=None):
        try:
            if gradient_checkpointing_kwargs is None:
                self.backbone.gradient_checkpointing_enable()
            else:
                self.backbone.gradient_checkpointing_enable(**gradient_checkpointing_kwargs)
        except Exception:
            pass
    def gradient_checkpointing_disable(self):
        try: self.backbone.gradient_checkpointing_disable()
        except Exception: pass
    def forward(self, input_ids=None, attention_mask=None, labels=None):
        out = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
        hs = out.last_hidden_state  # (B,L,H)
        pooled = masked_mean_pool(hs, attention_mask)  # (B,H)
        if self.msd and self.training:
            logits_acc = 0.0
            for _ in range(self.msd):
                logits_acc = logits_acc + self.head(self.dropout(pooled))
            logits = logits_acc / float(self.msd)
        else:
            logits = self.head(self.dropout(pooled))
        loss = None
        if labels is not None:
            loss = nn.functional.mse_loss(logits, labels)
        return {'loss': loss, 'logits': logits}

def compute_metrics_30(eval_pred):
    # Supports both tuple and EvalPrediction
    preds = getattr(eval_pred, 'predictions', None)
    labels = getattr(eval_pred, 'label_ids', None)
    if preds is None:
        preds, labels = eval_pred
    vals = []
    for j in range(labels.shape[1]):
        p = preds[:, j]; y = labels[:, j]
        if np.std(p)==0 or np.std(y)==0:
            vals.append(0.0)
        else:
            vals.append(float(spearmanr(y, p).correlation))
    return {'mean_spearman': float(np.mean(vals))}

@torch.inference_mode()
def infer_logits(model, dataset, batch_size=32):
    device = next(model.parameters()).device
    model.eval()
    loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=False)
    outs = []
    use_cuda = device.type == 'cuda'
    for batch in loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        if use_cuda:
            with torch.autocast('cuda', dtype=torch.float16):
                out = model(input_ids=input_ids, attention_mask=attention_mask)
        else:
            out = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = out['logits'] if isinstance(out, dict) else out.logits
        outs.append(logits.float().cpu().numpy())
    return np.vstack(outs)

# Training loop - 5 folds, 4 epochs, MSE loss; saves OOF/test npy
Y30 = train[targets].astype(float).values
oof_q = np.zeros((len(train), 30), dtype=np.float32)
test_fold_preds_q = []
folds_idx_q = np.full(len(train), -1, dtype=int)

for fold, (trn_idx, val_idx) in enumerate(splits):
    t0 = time.time()
    folds_idx_q[val_idx] = fold
    print(f'\n[Q-ONLY FOLD {fold}] train={len(trn_idx)} val={len(val_idx)}', flush=True)
    tr_ds = QDataset(tr_q_inputs['input_ids'][trn_idx], tr_q_inputs['attention_mask'][trn_idx], Y30[trn_idx])
    va_ds = QDataset(tr_q_inputs['input_ids'][val_idx], tr_q_inputs['attention_mask'][val_idx], Y30[val_idx])
    te_ds = QDataset(te_q_inputs['input_ids'], te_q_inputs['attention_mask'], None)

    model = QuestMultiRegressor(model_name_q, out_dim=30, dropout=0.2, msd=0)

    args = TrainingArguments(
        output_dir=f'deberta_q_only_fold{fold}',
        num_train_epochs=4,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=32,
        gradient_accumulation_steps=2,
        learning_rate=2e-5,
        weight_decay=0.01,
        warmup_ratio=0.1,
        lr_scheduler_type='linear',
        fp16=True,
        gradient_checkpointing=True,
        evaluation_strategy='epoch',
        save_strategy='epoch',
        load_best_model_at_end=True,
        metric_for_best_model='mean_spearman',
        greater_is_better=True,
        save_total_limit=1,
        logging_steps=50,
        seed=42,
        report_to=[]
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=tr_ds,
        eval_dataset=va_ds,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics_30,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=1)],
        data_collator=DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)
    )

    trainer.train()
    val_pred = trainer.predict(va_ds).predictions.astype(np.float32)
    oof_q[val_idx] = val_pred
    # Fold metric
    vals = []
    for j in range(val_pred.shape[1]):
        p = val_pred[:, j]; y = Y30[val_idx, j]
        vals.append(0.0 if np.std(p)==0 or np.std(y)==0 else float(spearmanr(y, p).correlation))
    print(f'[Q-ONLY FOLD {fold}] mean Spearman={np.mean(vals):.5f} time={time.time()-t0:.1f}s', flush=True)
    # Test preds via manual inference to avoid Trainer.predict issues
    te_pred = infer_logits(trainer.model, te_ds, batch_size=32).astype(np.float32)
    test_fold_preds_q.append(te_pred)
    del trainer, model; gc.collect()
    if torch.cuda.is_available(): torch.cuda.empty_cache()

# Aggregate and save artifacts
oof_mean_q = []
for j in range(oof_q.shape[1]):
    yj = train[targets[j]].values.astype(float)
    pj = oof_q[:, j]
    sc = 0.0 if np.std(pj)==0 or np.std(yj)==0 else float(spearmanr(yj, pj).correlation)
    oof_mean_q.append(sc)
print('[Q-ONLY] OOF per-target Spearman (first 5):', np.round(oof_mean_q[:5], 5))
print('[Q-ONLY] OOF mean Spearman:', float(np.mean(oof_mean_q)))
np.save('oof_deberta_q.npy', oof_q)
pd.DataFrame({'qa_id': train['qa_id'], 'fold': folds_idx_q}).assign(**{f't{j}': oof_q[:, j] for j in range(oof_q.shape[1])}).to_csv('oof_deberta_q.csv', index=False)

test_q = np.mean(np.stack(test_fold_preds_q, axis=0), axis=0).astype(np.float32)  # (608,30)
np.save('test_deberta_q.npy', test_q)
print('[Q-ONLY] Saved: oof_deberta_q.npy, test_deberta_q.npy')

# Note: blending across views will be done in a later cell. Ensure folds align across runs.
print('[Q-ONLY] Done.')

[TOK-Q] Building inputs (Question-only) ...


[TOK-Q] Done in 2.6s; shapes tr=(5471, 512) te=(608, 512)



[Q-ONLY FOLD 0] train=4395 val=1076


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Epoch,Training Loss,Validation Loss


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


[Q-ONLY FOLD 0] mean Spearman=0.31187 time=834.7s



[Q-ONLY FOLD 1] train=4318 val=1153


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Epoch,Training Loss,Validation Loss


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


[Q-ONLY FOLD 1] mean Spearman=0.31011 time=825.9s



[Q-ONLY FOLD 2] train=4389 val=1082


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Epoch,Training Loss,Validation Loss


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


[Q-ONLY FOLD 2] mean Spearman=0.30532 time=833.5s



[Q-ONLY FOLD 3] train=4399 val=1072


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Epoch,Training Loss,Validation Loss


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


[Q-ONLY FOLD 3] mean Spearman=0.31799 time=832.2s



[Q-ONLY FOLD 4] train=4383 val=1088


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Epoch,Training Loss,Validation Loss


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


[Q-ONLY FOLD 4] mean Spearman=0.31992 time=835.1s


[Q-ONLY] OOF per-target Spearman (first 5): [0.27453 0.53029 0.38864 0.25553 0.32325]
[Q-ONLY] OOF mean Spearman: 0.3120294183857368
[Q-ONLY] Saved: oof_deberta_q.npy, test_deberta_q.npy
[Q-ONLY] Done.


In [8]:
# DeBERTa-v3-base with mean-pooled head (30-dim regressor) - Answer-only view (answer text); 5-fold, 4 epochs
import os, gc, time, numpy as np, pandas as pd, torch, torch.nn as nn
from pathlib import Path
from scipy.stats import spearmanr
from transformers import (AutoTokenizer, AutoModel, TrainingArguments, Trainer, EarlyStoppingCallback, DataCollatorWithPadding)

assert 'targets' in globals() and 'train' in globals() and 'test' in globals() and 'splits' in globals(), 'Run Cell 0 first.'
model_name_a = 'microsoft/deberta-v3-base'
if 'tokenizer' not in globals():
    tokenizer = AutoTokenizer.from_pretrained(model_name_a)

MAX_LEN_A = 512

def pack_answer_only(answer: str, tokenizer, max_len=MAX_LEN_A):
    ai = tokenizer(answer if isinstance(answer, str) else '', add_special_tokens=False, truncation=True, max_length=max_len-2)['input_ids']
    ids = [tokenizer.cls_token_id] + ai + [tokenizer.sep_token_id]
    ids = ids[:max_len]
    attn = [1]*len(ids)
    pad = max_len - len(ids)
    if pad>0:
        ids += [tokenizer.pad_token_id]*pad
        attn += [0]*pad
    return ids, attn

def build_inputs_a_only(df: pd.DataFrame):
    A = df['answer'].fillna('').astype(str).tolist() if 'answer' in df.columns else ['']*len(df)
    input_ids, attention_masks = [], []
    for a in A:
        ids, attn = pack_answer_only(a, tokenizer)
        input_ids.append(ids); attention_masks.append(attn)
    return {'input_ids': np.array(input_ids, dtype=np.int64), 'attention_mask': np.array(attention_masks, dtype=np.int64)}

print('[TOK-A] Building inputs (Answer-only) ...', flush=True)
t0_tok = time.time()
tr_a_inputs = build_inputs_a_only(train)
te_a_inputs = build_inputs_a_only(test)
print(f'[TOK-A] Done in {time.time()-t0_tok:.1f}s; shapes tr={tr_a_inputs["input_ids"].shape} te={te_a_inputs["input_ids"].shape}', flush=True)

class ADataset(torch.utils.data.Dataset):
    def __init__(self, ids, masks, labels=None):
        self.ids = ids; self.masks = masks; self.labels = labels
    def __len__(self): return len(self.ids)
    def __getitem__(self, idx):
        item = {'input_ids': torch.tensor(self.ids[idx]), 'attention_mask': torch.tensor(self.masks[idx])}
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float32)
        return item

def masked_mean_pool(last_hidden_state, attention_mask):
    mask = attention_mask.unsqueeze(-1).type_as(last_hidden_state)
    summ = (last_hidden_state * mask).sum(dim=1)
    denom = mask.sum(dim=1).clamp(min=1e-6)
    return summ / denom

class QuestMultiRegressorA(nn.Module):
    def __init__(self, model_name: str, out_dim: int = 30, dropout: float = 0.2, msd: int = 0):
        super().__init__()
        self.backbone = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout)
        hidden = self.backbone.config.hidden_size
        self.head = nn.Linear(hidden, out_dim)
        self.msd = msd
        try: self.backbone.gradient_checkpointing_enable()
        except Exception: pass
    def gradient_checkpointing_enable(self, gradient_checkpointing_kwargs=None):
        try:
            if gradient_checkpointing_kwargs is None:
                self.backbone.gradient_checkpointing_enable()
            else:
                self.backbone.gradient_checkpointing_enable(**gradient_checkpointing_kwargs)
        except Exception:
            pass
    def gradient_checkpointing_disable(self):
        try: self.backbone.gradient_checkpointing_disable()
        except Exception: pass
    def forward(self, input_ids=None, attention_mask=None, labels=None):
        out = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
        pooled = masked_mean_pool(out.last_hidden_state, attention_mask)
        if self.msd and self.training:
            logits_acc = 0.0
            for _ in range(self.msd):
                logits_acc = logits_acc + self.head(self.dropout(pooled))
            logits = logits_acc / float(self.msd)
        else:
            logits = self.head(self.dropout(pooled))
        loss = None
        if labels is not None:
            loss = nn.functional.mse_loss(logits, labels)
        return {'loss': loss, 'logits': logits}

def compute_metrics_30(eval_pred):
    preds = getattr(eval_pred, 'predictions', None)
    labels = getattr(eval_pred, 'label_ids', None)
    if preds is None:
        preds, labels = eval_pred
    vals = []
    for j in range(labels.shape[1]):
        p = preds[:, j]; y = labels[:, j]
        if np.std(p)==0 or np.std(y)==0:
            vals.append(0.0)
        else:
            vals.append(float(spearmanr(y, p).correlation))
    return {'mean_spearman': float(np.mean(vals))}

@torch.inference_mode()
def infer_logits(model, dataset, batch_size=32):
    device = next(model.parameters()).device
    model.eval()
    loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=False)
    outs = []
    use_cuda = device.type == 'cuda'
    for batch in loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        if use_cuda:
            with torch.autocast('cuda', dtype=torch.float16):
                out = model(input_ids=input_ids, attention_mask=attention_mask)
        else:
            out = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = out['logits'] if isinstance(out, dict) else out.logits
        outs.append(logits.float().cpu().numpy())
    return np.vstack(outs)

Y30 = train[targets].astype(float).values
oof_a = np.zeros((len(train), 30), dtype=np.float32)
test_fold_preds_a = []
folds_idx_a = np.full(len(train), -1, dtype=int)

for fold, (trn_idx, val_idx) in enumerate(splits):
    t0 = time.time()
    folds_idx_a[val_idx] = fold
    print(f'\n[A-ONLY FOLD {fold}] train={len(trn_idx)} val={len(val_idx)}', flush=True)
    tr_ds = ADataset(tr_a_inputs['input_ids'][trn_idx], tr_a_inputs['attention_mask'][trn_idx], Y30[trn_idx])
    va_ds = ADataset(tr_a_inputs['input_ids'][val_idx], tr_a_inputs['attention_mask'][val_idx], Y30[val_idx])
    te_ds = ADataset(te_a_inputs['input_ids'], te_a_inputs['attention_mask'], None)

    model = QuestMultiRegressorA(model_name_a, out_dim=30, dropout=0.2, msd=0)

    args = TrainingArguments(
        output_dir=f'deberta_a_only_fold{fold}',
        num_train_epochs=4,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=32,
        gradient_accumulation_steps=2,
        learning_rate=2e-5,
        weight_decay=0.01,
        warmup_ratio=0.1,
        lr_scheduler_type='linear',
        fp16=True,
        gradient_checkpointing=True,
        evaluation_strategy='epoch',
        save_strategy='epoch',
        load_best_model_at_end=True,
        metric_for_best_model='mean_spearman',
        greater_is_better=True,
        save_total_limit=1,
        logging_steps=50,
        seed=42,
        report_to=[]
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=tr_ds,
        eval_dataset=va_ds,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics_30,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=1)],
        data_collator=DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)
    )

    trainer.train()
    val_pred = trainer.predict(va_ds).predictions.astype(np.float32)
    oof_a[val_idx] = val_pred
    vals = []
    for j in range(val_pred.shape[1]):
        p = val_pred[:, j]; y = Y30[val_idx, j]
        vals.append(0.0 if np.std(p)==0 or np.std(y)==0 else float(spearmanr(y, p).correlation))
    print(f'[A-ONLY FOLD {fold}] mean Spearman={np.mean(vals):.5f} time={time.time()-t0:.1f}s', flush=True)
    # Test preds via manual inference to avoid Trainer.predict issues
    te_pred = infer_logits(trainer.model, te_ds, batch_size=32).astype(np.float32)
    test_fold_preds_a.append(te_pred)
    del trainer, model; gc.collect()
    if torch.cuda.is_available(): torch.cuda.empty_cache()

oof_mean_a = []
for j in range(oof_a.shape[1]):
    yj = train[targets[j]].values.astype(float)
    pj = oof_a[:, j]
    sc = 0.0 if np.std(pj)==0 or np.std(yj)==0 else float(spearmanr(yj, pj).correlation)
    oof_mean_a.append(sc)
print('[A-ONLY] OOF per-target Spearman (first 5):', np.round(oof_mean_a[:5], 5))
print('[A-ONLY] OOF mean Spearman:', float(np.mean(oof_mean_a)))
np.save('oof_deberta_a.npy', oof_a)
pd.DataFrame({'qa_id': train['qa_id'], 'fold': folds_idx_a}).assign(**{f't{j}': oof_a[:, j] for j in range(oof_a.shape[1])}).to_csv('oof_deberta_a.csv', index=False)

test_a = np.mean(np.stack(test_fold_preds_a, axis=0), axis=0).astype(np.float32)  # (608,30)
np.save('test_deberta_a.npy', test_a)
print('[A-ONLY] Saved: oof_deberta_a.npy, test_deberta_a.npy')
print('[A-ONLY] Done.')

[TOK-A] Building inputs (Answer-only) ...


[TOK-A] Done in 2.4s; shapes tr=(5471, 512) te=(608, 512)



[A-ONLY FOLD 0] train=4395 val=1076


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Epoch,Training Loss,Validation Loss


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


[A-ONLY FOLD 0] mean Spearman=0.26294 time=836.5s



[A-ONLY FOLD 1] train=4318 val=1153


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Epoch,Training Loss,Validation Loss


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


[A-ONLY FOLD 1] mean Spearman=0.24829 time=822.1s



[A-ONLY FOLD 2] train=4389 val=1082


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Epoch,Training Loss,Validation Loss


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


[A-ONLY FOLD 2] mean Spearman=0.24264 time=839.7s



[A-ONLY FOLD 3] train=4399 val=1072


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Epoch,Training Loss,Validation Loss


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


[A-ONLY FOLD 3] mean Spearman=0.26156 time=837.7s



[A-ONLY FOLD 4] train=4383 val=1088


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Epoch,Training Loss,Validation Loss


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


[A-ONLY FOLD 4] mean Spearman=0.27642 time=833.9s


[A-ONLY] OOF per-target Spearman (first 5): [0.15729 0.30378 0.3401  0.13804 0.25532]
[A-ONLY] OOF mean Spearman: 0.25770002422279903
[A-ONLY] Saved: oof_deberta_a.npy, test_deberta_a.npy
[A-ONLY] Done.


In [None]:
# Rank-standardized per-target blend: Q-only, A-only, Q+A, Ridge; override main target; write submission.csv
import os, time, numpy as np, pandas as pd
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import Ridge
from scipy import sparse

id_col = 'qa_id'
targets = [
    'question_asker_intent_understanding','question_body_critical','question_conversational','question_expect_short_answer',
    'question_fact_seeking','question_has_commonly_accepted_answer','question_interestingness_others','question_interestingness_self',
    'question_multi_intent','question_not_really_a_question','question_opinion_seeking','question_type_choice','question_type_compare',
    'question_type_consequence','question_type_definition','question_type_entity','question_type_instructions','question_type_procedure',
    'question_type_reason_explanation','question_type_spelling','question_well_written','answer_helpful','answer_level_of_information',
    'answer_plausible','answer_relevance','answer_satisfaction','answer_type_instructions','answer_type_procedure',
    'answer_type_reason_explanation','answer_well_written'
]
main_target = 'question_asker_intent_understanding'

def frac_rank_col(col):
    # fractional rank in [0,1]
    n = len(col)
    order = np.argsort(col, kind='mergesort')
    ranks = np.empty(n, dtype=np.float64); ranks[order] = np.arange(n, dtype=np.float64)
    return ranks / max(n-1, 1)

def rank_standardize(mat):
    # mat: (N,30) -> per-column fractional ranks in [0,1]
    out = np.zeros_like(mat, dtype=np.float64)
    for j in range(mat.shape[1]):
        out[:, j] = frac_rank_col(mat[:, j])
    return out

print('[BLEND] Loading model preds ...', flush=True)
need = []
paths = {
    'qa_oof': 'oof_deberta_multi.npy',
    'qa_test': 'test_deberta_multi.npy',
    'q_oof': 'oof_deberta_q.npy',
    'q_test': 'test_deberta_q.npy',
    'a_oof': 'oof_deberta_a.npy',
    'a_test': 'test_deberta_a.npy',
}
loaded = {}
for k,p in paths.items():
    if Path(p).exists():
        loaded[k] = np.load(p)
    else:
        need.append(k)
print('[BLEND] Missing keys:', need)

# Ensure base requirements: at least Q+A + one of Q-only/A-only
assert ('qa_test' in loaded) or Path('test_deberta_multi_mc8.npy').exists(), 'No Q+A test predictions found'
if 'qa_test' not in loaded:
    loaded['qa_test'] = np.load('test_deberta_multi_mc8.npy')

# Build Ridge test (and OOF optionally) if not cached
ridge_test_path = Path('test_ridge_svd256.npy')
ridge_oof_path = Path('oof_ridge_svd256.npy')
if not ridge_test_path.exists() or not ridge_oof_path.exists():
    print('[BLEND][RIDGE] Building TFIDF+SVD features and computing OOF/test ...', flush=True)
    t0 = time.time()
    train_df = pd.read_csv('train.csv'); test_df = pd.read_csv('test.csv')
    def combine(df):
        t = df.get('question_title', pd.Series(['']*len(df))).fillna('').astype(str)
        b = df.get('question_body', pd.Series(['']*len(df))).fillna('').astype(str)
        a = df.get('answer', pd.Series(['']*len(df))).fillna('').astype(str)
        return (t + ' [SEP] ' + b + ' [SEP] ' + a).values
    txt_tr = combine(train_df); txt_te = combine(test_df)
    cfg_word = dict(analyzer='word', ngram_range=(1,2), sublinear_tf=True, strip_accents='unicode', lowercase=True, min_df=2)
    cfg_char = dict(analyzer='char_wb', ngram_range=(3,6), sublinear_tf=True, min_df=2)
    vec_w = TfidfVectorizer(max_features=200_000, **cfg_word)
    vec_c = TfidfVectorizer(max_features=200_000, **cfg_char)
    Xw_tr = vec_w.fit_transform(txt_tr); Xw_te = vec_w.transform(txt_te)
    Xc_tr = vec_c.fit_transform(txt_tr); Xc_te = vec_c.transform(txt_te)
    X_tr = sparse.hstack([Xw_tr, Xc_tr], format='csr'); X_te = sparse.hstack([Xw_te, Xc_te], format='csr')
    svd = TruncatedSVD(n_components=256, random_state=42)
    Z_tr = svd.fit_transform(X_tr); Z_te = svd.transform(X_te)
    Y = pd.read_csv('train.csv')[targets].astype(float).values
    # 5-fold OOF for Ridge to align with CV (use the same splits from globals if available)
    if 'splits' in globals():
        oof_r = np.zeros_like(Y, dtype=np.float64)
        for f,(trn_idx, val_idx) in enumerate(splits):
            rg = Ridge(alpha=10.0, random_state=42)
            rg.fit(Z_tr[trn_idx], Y[trn_idx])
            oof_r[val_idx] = rg.predict(Z_tr[val_idx])
        ridge_oof = np.clip(oof_r, 0.0, 1.0).astype(np.float32)
    else:
        ridge_oof = None
    rg_full = Ridge(alpha=10.0, random_state=42).fit(Z_tr, Y)
    ridge_test = np.clip(rg_full.predict(Z_te), 0.0, 1.0).astype(np.float32)
    np.save(ridge_test_path, ridge_test)
    if ridge_oof is not None: np.save(ridge_oof_path, ridge_oof)
    print(f'[BLEND][RIDGE] Done in {time.time()-t0:.1f}s; ridge_test shape={ridge_test.shape}', flush=True)
else:
    ridge_test = np.load(ridge_test_path)
    ridge_oof = np.load(ridge_oof_path) if ridge_oof_path.exists() else None

# Prepare test matrices available
tests = []
names = []
if 'q_test' in loaded:
    tests.append(loaded['q_test'].astype(np.float64)); names.append('Q')
if 'a_test' in loaded:
    tests.append(loaded['a_test'].astype(np.float64)); names.append('A')
tests.append(loaded['qa_test'].astype(np.float64)); names.append('QA')
tests.append(ridge_test.astype(np.float64)); names.append('Ridge')
print('[BLEND] Models included:', names)

# Rank-standardize each model's test predictions per column
tests_ranked = [rank_standardize(x) for x in tests]  # each (N_test,30) in [0,1]
N_test = tests_ranked[0].shape[0]
blend_rank = np.zeros((N_test, 30), dtype=np.float64)

q_cols = [i for i,c in enumerate(targets) if c.startswith('question_')]
a_cols = [i for i,c in enumerate(targets) if c.startswith('answer_')]

idx_name = {i:n for i,n in enumerate(names)}
def get_model_by(name):
    return tests_ranked[names.index(name)] if name in names else None

# Default weights
for j in range(30):
    if j in q_cols:
        w = {}
        if 'Q' in names: w['Q'] = 0.5
        w['QA'] = 0.4 if 'Q' in names else 0.7
        w['Ridge'] = 0.1
        s = sum(w.values());
        for k in w: w[k] /= s
        tmp = np.zeros(N_test, dtype=np.float64)
        for k,wt in w.items():
            tmp += wt * get_model_by(k)[:, j]
        blend_rank[:, j] = tmp
    else:
        # answer_*
        w = {}
        if 'A' in names: w['A'] = 0.5
        w['QA'] = 0.4 if 'A' in names else 0.7
        w['Ridge'] = 0.1
        s = sum(w.values());
        for k in w: w[k] /= s
        tmp = np.zeros(N_test, dtype=np.float64)
        for k,wt in w.items():
            tmp += wt * get_model_by(k)[:, j]
        blend_rank[:, j] = tmp

# Convert blended ranks to [0,1] via identity (already [0,1]) and clip
blend = np.clip(blend_rank, 0.0, 1.0).astype(np.float32)

# Override main target with best single-target ensemble if present (raw scores, clip at end)
if Path('test_ensemble.npy').exists():
    main_pred = np.clip(np.load('test_ensemble.npy').astype(float), 0.0, 1.0)
    j = targets.index(main_target)
    blend[:, j] = main_pred

# Write submission
samp = pd.read_csv('sample_submission.csv')
test_df = pd.read_csv('test.csv')
samp['qa_id'] = pd.to_numeric(test_df['qa_id'], errors='raise').astype('int64')
for i, col in enumerate(targets):
    samp[col] = blend[:, i].astype(float)
samp.to_csv('submission.csv', index=False, float_format='%.8f')
print('[BLEND] submission.csv written:', samp.shape)

[BLEND] Loading model preds ...


[BLEND] Missing keys: []
[BLEND][RIDGE] Building TFIDF+SVD features and computing OOF/test ...
