In [None]:
# Install CUDA 12.1 PyTorch stack and NLP deps; verify GPU
import os, sys, subprocess, shutil, time
from pathlib import Path

def pip(*args):
    print(">", *args, flush=True)
    subprocess.run([sys.executable, "-m", "pip", *args], check=True)

# Show NVIDIA-SMI first
print("=== NVIDIA-SMI ===", flush=True)
subprocess.run(['bash','-lc','nvidia-smi || true'], check=False)

# Uninstall any preexisting torch stack to avoid duplicates
for pkg in ("torch","torchvision","torchaudio"):
    subprocess.run([sys.executable, "-m", "pip", "uninstall", "-y", pkg], check=False)

# Clean stray site dirs that can shadow correct wheels (idempotent)
for d in (
    "/app/.pip-target/torch",
    "/app/.pip-target/torchvision",
    "/app/.pip-target/torchaudio",
    "/app/.pip-target/torch-2.8.0.dist-info",
    "/app/.pip-target/torch-2.4.1.dist-info",
    "/app/.pip-target/torchvision-0.23.0.dist-info",
    "/app/.pip-target/torchvision-0.19.1.dist-info",
    "/app/.pip-target/torchaudio-2.8.0.dist-info",
    "/app/.pip-target/torchaudio-2.4.1.dist-info",
    "/app/.pip-target/torchgen",
    "/app/.pip-target/functorch",
):
    if os.path.exists(d):
        print("Removing", d, flush=True)
        shutil.rmtree(d, ignore_errors=True)

# 1) Install EXACT cu121 torch stack
pip("install",
    "--index-url", "https://download.pytorch.org/whl/cu121",
    "--extra-index-url", "https://pypi.org/simple",
    "torch==2.4.1", "torchvision==0.19.1", "torchaudio==2.4.1")

# 2) Freeze torch versions
Path("constraints.txt").write_text(
    "torch==2.4.1\n"
    "torchvision==0.19.1\n"
    "torchaudio==2.4.1\n"
)

# 3) Install transformer deps without touching torch
pip("install", "-c", "constraints.txt",
    "transformers==4.44.2", "accelerate==0.34.2",
    "datasets==2.21.0", "evaluate==0.4.2",
    "sentencepiece", "scikit-learn", "numpy", "pandas",
    "tqdm", "scipy",
    "--upgrade-strategy", "only-if-needed")

# 4) Sanity gate
import torch
print("torch:", torch.__version__, "built CUDA:", getattr(torch.version, "cuda", None))
print("CUDA available:", torch.cuda.is_available())
assert str(getattr(torch.version, "cuda", "")).startswith("12.1"), f"Wrong CUDA build: {torch.version.cuda}"
assert torch.cuda.is_available(), "CUDA not available"
print("GPU:", torch.cuda.get_device_name(0))
print("Environment ready.", flush=True)

In [2]:
# DeBERTa-v3-base utilities: data loading, tokenizer (head+tail), dataset, collator, model factory
import os, time, math, random, numpy as np, pandas as pd, torch
from datasets import Dataset as HFDataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding

SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED); torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.benchmark = True

MODEL_NAME = 'microsoft/deberta-v3-base'
MAX_LEN = 512  # model's native max length
HEAD_TOKENS = 200
TAIL_TOKENS = MAX_LEN - 2 - HEAD_TOKENS  # account for special tokens by tokenizer padding/truncation

train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
folds_df = pd.read_csv('folds.csv')
id_col, text_col, target_col = 'essay_id', 'full_text', 'score'
assert {id_col, text_col, target_col}.issubset(train_df.columns)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def head_tail_encode(texts):
    # Tokenize to ids first without truncation to slice head/tail by tokens
    enc = tokenizer(texts, add_special_tokens=False, return_attention_mask=False, return_token_type_ids=False)
    input_ids_list = []
    for ids in enc['input_ids']:
        if len(ids) <= MAX_LEN - 2:
            input_ids_list.append(ids)
        else:
            head = ids[:HEAD_TOKENS]
            tail = ids[-TAIL_TOKENS:] if TAIL_TOKENS > 0 else []
            input_ids_list.append(head + tail)
    # Now add special tokens and pad/truncate to MAX_LEN
    out = tokenizer.pad({'input_ids': [tokenizer.build_inputs_with_special_tokens(ids) for ids in input_ids_list]},
                        padding='max_length', max_length=MAX_LEN, return_tensors=None)
    return out  # dict with input_ids, attention_mask

class TextRegDataset(torch.utils.data.Dataset):
    def __init__(self, texts, targets=None):
        self.texts = texts
        self.targets = targets
    def __len__(self):
        return len(self.texts)
    def __getitem__(self, idx):
        text = str(self.texts[idx])
        tokenized = head_tail_encode([text])
        item = {
            'input_ids': torch.tensor(tokenized['input_ids'][0], dtype=torch.long),
            'attention_mask': torch.tensor(tokenized['attention_mask'][0], dtype=torch.long),
        }
        if self.targets is not None:
            item['labels'] = torch.tensor(float(self.targets[idx]), dtype=torch.float)
        return item

collator = DataCollatorWithPadding(tokenizer=tokenizer, pad_to_multiple_of=8)

def get_model():
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=1, problem_type='regression')
    return model

print('DeBERTa utilities ready. Next: add training loop with 5-fold CV, AMP, and OOF/test caching.', flush=True)



DeBERTa utilities ready. Next: add training loop with 5-fold CV, AMP, and OOF/test caching.


In [None]:
# DeBERTa-v3-base 5-fold with head-tail training + sliding-window (512, stride 128) eval/infer; QWK early stop
import numpy as np, torch, time, math, os, pandas as pd, random
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback, AutoModelForSequenceClassification
from sklearn.metrics import cohen_kappa_score
from torch.utils.data import Dataset

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
n_splits = int(folds_df['fold'].max()) + 1
y = train_df[target_col].astype(float).values
min_score, max_score = 1.0, 6.0

# Perf/stability flags
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

def seed_everything(seed=42):
    random.seed(seed); np.random.seed(seed); torch.manual_seed(seed); torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = False; torch.backends.cudnn.benchmark = True

def qwk_int(y_true, y_pred_int):
    return cohen_kappa_score(y_true, y_pred_int, weights='quadratic')

# Pre-tokenize ALL texts once to speed per-fold ops
print('[DeBERTa] Pre-tokenizing train/test to raw token ids...', flush=True)
tok_train = tokenizer(train_df[text_col].tolist(), add_special_tokens=False, return_attention_mask=False, return_token_type_ids=False)
tok_test  = tokenizer(test_df[text_col].tolist(),  add_special_tokens=False, return_attention_mask=False, return_token_type_ids=False)
train_ids_all = tok_train['input_ids']
test_ids_all  = tok_test['input_ids']
print('[DeBERTa] Pre-tokenization done.', flush=True)

# Helpers for head+tail pack to 512
def pack_head_tail(ids, max_len=512, head=200):
    tail = max_len - 2 - head
    if len(ids) <= max_len - 2:
        core = ids
    else:
        core = ids[:head] + (ids[-tail:] if tail>0 else [])
    built = tokenizer.build_inputs_with_special_tokens(core)
    out = tokenizer.pad({'input_ids':[built]}, padding='max_length', max_length=max_len, return_tensors='pt')
    return out['input_ids'][0], out['attention_mask'][0]

# Sliding-window chunking (for eval/infer) with stride 128
def chunkify_ids(ids, max_len=512, stride=128):
    usable = max_len - 2
    if len(ids) <= usable:
        chunks = [ids]
    else:
        chunks = []
        start = 0
        while start < len(ids):
            end = min(start + usable, len(ids))
            chunks.append(ids[start:end])
            if end == len(ids):
                break
            # FIX: advance by stride (not usable - stride)
            start += stride
    # build tensors and weights (token count per chunk)
    input_ids = []; attn = []; weights = []
    for ch in chunks:
        built = tokenizer.build_inputs_with_special_tokens(ch)
        padded = tokenizer.pad({'input_ids':[built]}, padding='max_length', max_length=max_len, return_tensors='pt')
        input_ids.append(padded['input_ids'][0]); attn.append(padded['attention_mask'][0]); weights.append(float(len(ch)))
    return input_ids, attn, weights

class HeadTailDataset(Dataset):  # for training (single segment per essay)
    def __init__(self, ids_list, targets=None):
        self.ids_list = ids_list
        self.targets = targets
    def __len__(self): return len(self.ids_list)
    def __getitem__(self, idx):
        ids = self.ids_list[idx]
        input_ids, attention_mask = pack_head_tail(ids, MAX_LEN, HEAD_TOKENS)
        item = {'input_ids': input_ids, 'attention_mask': attention_mask}
        if self.targets is not None:
            item['labels'] = torch.tensor(float(self.targets[idx]), dtype=torch.float)
        return item

class ChunkDataset(Dataset):  # flat chunks for eval/infer
    def __init__(self, ids_list):
        self.inputs = []; self.attns = []; self.essay_idx = []; self.weights = []
        for i, ids in enumerate(ids_list):
            inp, att, w = chunkify_ids(ids, MAX_LEN, stride=128)
            self.inputs.extend(inp); self.attns.extend(att);
            self.essay_idx.extend([i]*len(inp)); self.weights.extend(w)
    def __len__(self): return len(self.inputs)
    def __getitem__(self, idx):
        return {'input_ids': self.inputs[idx], 'attention_mask': self.attns[idx]}

def collate_fn(batch):
    return {k: torch.stack([x[k] for x in batch]) for k in batch[0].keys()}

def compute_metrics(eval_pred):
    preds = eval_pred.predictions.reshape(-1)
    labels = eval_pred.label_ids.reshape(-1)
    preds = np.clip(preds, min_score, max_score)
    base_th = np.array([1.5,2.5,3.5,4.5,5.5])
    bins = [-np.inf] + base_th.tolist() + [np.inf]
    pred_int = np.digitize(preds, bins)
    labels_int = labels.astype(int)
    return {'qwk': qwk_int(labels_int, pred_int)}

def length_weighted_aggregate(flat_preds, essay_idx, weights, n_items):
    agg = np.zeros(n_items, dtype=np.float32); wsum = np.zeros(n_items, dtype=np.float32)
    np.add.at(agg, essay_idx, flat_preds * weights); np.add.at(wsum, essay_idx, weights)
    return agg / np.clip(wsum, 1e-6, None)

# Build test chunks once
test_chunk_ds_global = ChunkDataset(test_ids_all)

oof = np.zeros(len(train_df), dtype=np.float32)
test_pred_f = np.zeros((len(test_df), n_splits), dtype=np.float32)

for f in range(n_splits):
    fold_t0 = time.time()
    tr_idx = folds_df.index[folds_df['fold']!=f].to_numpy()
    va_idx = folds_df.index[folds_df['fold']==f].to_numpy()
    print(f'[DeBERTa] Fold {f} start: tr={len(tr_idx)} va={len(va_idx)}', flush=True)

    seed_everything(42 + f)
    model = get_model()
    # mild dropout
    if hasattr(model.config, 'hidden_dropout_prob'): model.config.hidden_dropout_prob = 0.1
    if hasattr(model.config, 'attention_probs_dropout_prob'): model.config.attention_probs_dropout_prob = 0.1
    model.gradient_checkpointing_enable(); model.to(device)

    # Datasets
    train_ds = HeadTailDataset([train_ids_all[i] for i in tr_idx], train_df.loc[tr_idx, target_col].tolist())
    valid_ds_ht = HeadTailDataset([train_ids_all[i] for i in va_idx], train_df.loc[va_idx, target_col].tolist())  # for ES/QWK monitor

    args = TrainingArguments(
        output_dir=f'outputs_fold{f}',
        per_device_train_batch_size=16,
        per_device_eval_batch_size=64,
        gradient_accumulation_steps=1,
        evaluation_strategy='epoch',
        save_strategy='epoch',
        learning_rate=2e-5,
        num_train_epochs=5,
        weight_decay=0.02,
        lr_scheduler_type='cosine',
        warmup_ratio=0.1,
        bf16=True,
        bf16_full_eval=True,
        logging_steps=50,
        load_best_model_at_end=True,
        metric_for_best_model='qwk',
        greater_is_better=True,
        save_total_limit=5,
        dataloader_pin_memory=True,
        dataloader_num_workers=2,
        report_to=[]
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_ds,
        eval_dataset=valid_ds_ht,
        data_collator=collate_fn,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
    )

    trainer.train()

    # Prepare sliding-window validation chunks
    val_chunk_ds = ChunkDataset([train_ids_all[i] for i in va_idx])
    essay_idx_val = np.array(val_chunk_ds.essay_idx, dtype=np.int64)
    weights_val = np.array(val_chunk_ds.weights, dtype=np.float32)

    # Post-hoc checkpoint selection via sliding-window QWK
    chk_dir = args.output_dir
    ckpts = []
    if os.path.isdir(chk_dir):
        for d in os.listdir(chk_dir):
            if d.startswith('checkpoint-'):
                try:
                    step = int(d.split('-')[-1])
                except:
                    step = -1
                ckpts.append((step, os.path.join(chk_dir, d)))
    ckpts.sort()
    # evaluate last up to 3 checkpoints for speed; if none, fall back to current model
    candidates = [p for _, p in ckpts[-3:]] if ckpts else []
    best_q = -1.0; best_path = None; best_val_pred = None
    for path in candidates if candidates else [None]:
        if path is not None:
            cand_model = AutoModelForSequenceClassification.from_pretrained(path, num_labels=1, problem_type='regression').to(device)
        else:
            cand_model = trainer.model
        cand_trainer = Trainer(model=cand_model, args=args, data_collator=collate_fn)
        with torch.no_grad():
            preds_flat = cand_trainer.predict(val_chunk_ds).predictions.reshape(-1)
        preds_flat = np.clip(preds_flat, min_score, max_score)
        val_pred = length_weighted_aggregate(preds_flat, essay_idx_val, weights_val, len(va_idx))
        # compute QWK with base thresholds
        base_th = np.array([1.5,2.5,3.5,4.5,5.5])
        bins = [-np.inf] + base_th.tolist() + [np.inf]
        val_int = np.digitize(val_pred, bins)
        score = qwk_int(train_df.loc[va_idx, target_col].astype(int).values, val_int)
        if score > best_q:
            best_q = score; best_path = path; best_val_pred = val_pred.astype(np.float32)

    # Save OOF for this fold using the best checkpoint
    oof[va_idx] = np.clip(best_val_pred, min_score, max_score)

    # Load best checkpoint (if different) for test inference
    if best_path is not None:
        best_model = AutoModelForSequenceClassification.from_pretrained(best_path, num_labels=1, problem_type='regression').to(device)
    else:
        best_model = trainer.model
    best_trainer = Trainer(model=best_model, args=args, data_collator=collate_fn)
    with torch.no_grad():
        test_preds_flat = best_trainer.predict(test_chunk_ds_global).predictions.reshape(-1)
    test_preds_flat = np.clip(test_preds_flat, min_score, max_score)
    essay_idx_t = np.array(test_chunk_ds_global.essay_idx, dtype=np.int64); weights_t = np.array(test_chunk_ds_global.weights, dtype=np.float32)
    test_pred_f[:, f] = length_weighted_aggregate(test_preds_flat, essay_idx_t, weights_t, len(test_df)).astype(np.float32)

    del trainer, model, train_ds, valid_ds_ht, val_chunk_ds, best_trainer, best_model
    torch.cuda.empty_cache()
    print(f'[DeBERTa] Fold {f} done in {time.time()-fold_t0:.1f}s (best SW QWK={best_q:.5f})', flush=True)

# Save artifacts
pd.DataFrame({'essay_id': train_df[id_col], 'oof_deberta': oof, 'y': y}).to_csv('oof_deberta_base.csv', index=False)
np.save('test_deberta_base.npy', test_pred_f.mean(axis=1))
print('Saved oof_deberta_base.csv and test_deberta_base.npy', flush=True)

In [None]:
# Post-hoc sliding-window re-eval at stride=64 (no retrain); save new OOF/test artifacts
import os, time, numpy as np, pandas as pd, torch
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
from sklearn.metrics import cohen_kappa_score

assert 'tokenizer' in globals() and 'train_df' in globals() and 'test_df' in globals() and 'folds_df' in globals(), 'Run setup cells first.'

min_score, max_score = 1.0, 6.0
n_splits = int(folds_df['fold'].max()) + 1

def qwk_int(y_true, y_pred_int):
    return cohen_kappa_score(y_true, y_pred_int, weights='quadratic')

def chunkify_ids_stride(ids, max_len=512, stride=64):
    usable = max_len - 2
    if len(ids) <= usable:
        chunks = [ids]
    else:
        chunks = []
        start = 0
        while start < len(ids):
            end = min(start + usable, len(ids))
            chunks.append(ids[start:end])
            if end == len(ids):
                break
            start += stride
    input_ids = []; attn = []; weights = []
    for ch in chunks:
        built = tokenizer.build_inputs_with_special_tokens(ch)
        padded = tokenizer.pad({'input_ids':[built]}, padding='max_length', max_length=max_len, return_tensors='pt')
        input_ids.append(padded['input_ids'][0]); attn.append(padded['attention_mask'][0]); weights.append(float(len(ch)))
    return input_ids, attn, weights

class ChunkDataset64(Dataset):
    def __init__(self, ids_list):
        self.inputs=[]; self.attns=[]; self.essay_idx=[]; self.weights=[]
        for i, ids in enumerate(ids_list):
            inp, att, w = chunkify_ids_stride(ids, MAX_LEN, stride=64)
            self.inputs.extend(inp); self.attns.extend(att);
            self.essay_idx.extend([i]*len(inp)); self.weights.extend(w)
    def __len__(self): return len(self.inputs)
    def __getitem__(self, idx):
        return {'input_ids': self.inputs[idx], 'attention_mask': self.attns[idx]}

def collate_fn(batch):
    return {k: torch.stack([x[k] for x in batch]) for k in batch[0].keys()}

def length_weighted_aggregate(flat_preds, essay_idx, weights, n_items):
    agg = np.zeros(n_items, dtype=np.float32); wsum = np.zeros(n_items, dtype=np.float32)
    np.add.at(agg, essay_idx, flat_preds * weights); np.add.at(wsum, essay_idx, weights)
    return agg / np.clip(wsum, 1e-6, None)

# Build ids once (use same pretokenization as training cell if present)
if 'train_ids_all' not in globals() or 'test_ids_all' not in globals():
    tok_train = tokenizer(train_df[text_col].tolist(), add_special_tokens=False, return_attention_mask=False, return_token_type_ids=False)
    tok_test  = tokenizer(test_df[text_col].tolist(),  add_special_tokens=False, return_attention_mask=False, return_token_type_ids=False)
    train_ids_all = tok_train['input_ids']
    test_ids_all  = tok_test['input_ids']

test_chunk_ds64 = ChunkDataset64(test_ids_all)
essay_idx_t = np.array(test_chunk_ds64.essay_idx, dtype=np.int64)
weights_t = np.array(test_chunk_ds64.weights, dtype=np.float32)

oof = np.zeros(len(train_df), dtype=np.float32)
test_pred_f = np.zeros((len(test_df), n_splits), dtype=np.float32)
y_int = train_df[target_col].astype(int).values

t0 = time.time()
for f in range(n_splits):
    f_t = time.time()
    va_idx = folds_df.index[folds_df['fold']==f].to_numpy()
    chk_dir = f'outputs_fold{f}'
    assert os.path.isdir(chk_dir), f'Missing {chk_dir}; run training first.'
    ckpts = []
    for d in os.listdir(chk_dir):
        if d.startswith('checkpoint-'):
            try:
                step = int(d.split('-')[-1])
            except:
                step = -1
            ckpts.append((step, os.path.join(chk_dir, d)))
    ckpts.sort()
    candidates = [p for _, p in ckpts[-3:]] if ckpts else []
    val_chunk_ds64 = ChunkDataset64([train_ids_all[i] for i in va_idx])
    essay_idx_val = np.array(val_chunk_ds64.essay_idx, dtype=np.int64)
    weights_val = np.array(val_chunk_ds64.weights, dtype=np.float32)

    args = TrainingArguments(
        output_dir=chk_dir,
        per_device_eval_batch_size=64,
        dataloader_num_workers=2,
        bf16_full_eval=True,
        report_to=[]
    )

    best_q = -1.0; best_path = None; best_val_pred=None
    for path in candidates if candidates else [None]:
        if path is None:
            continue  # require explicit checkpoints
        model = AutoModelForSequenceClassification.from_pretrained(path, num_labels=1, problem_type='regression')
        trainer = Trainer(model=model, args=args, data_collator=collate_fn)
        with torch.no_grad():
            preds_flat = trainer.predict(val_chunk_ds64).predictions.reshape(-1)
        preds_flat = np.clip(preds_flat, min_score, max_score)
        val_pred = length_weighted_aggregate(preds_flat, essay_idx_val, weights_val, len(va_idx))
        base_th = np.array([1.5,2.5,3.5,4.5,5.5])
        bins = [-np.inf] + base_th.tolist() + [np.inf]
        val_int = np.digitize(val_pred, bins)
        score = qwk_int(y_int[va_idx], val_int)
        if score > best_q:
            best_q = score; best_path = path; best_val_pred = val_pred.astype(np.float32)

    assert best_val_pred is not None, 'No valid checkpoint selected.'
    oof[va_idx] = np.clip(best_val_pred, min_score, max_score)

    # Test inference with best checkpoint
    model = AutoModelForSequenceClassification.from_pretrained(best_path, num_labels=1, problem_type='regression')
    trainer = Trainer(model=model, args=args, data_collator=collate_fn)
    with torch.no_grad():
        test_flat = trainer.predict(test_chunk_ds64).predictions.reshape(-1)
    test_flat = np.clip(test_flat, min_score, max_score)
    test_pred_f[:, f] = length_weighted_aggregate(test_flat, essay_idx_t, weights_t, len(test_df)).astype(np.float32)
    print(f'[SW64] Fold {f} done in {time.time()-f_t:.1f}s (best QWK={best_q:.5f})', flush=True)

pd.DataFrame({'essay_id': train_df[id_col], 'oof_deberta': oof, 'y': y_int}).to_csv('oof_deberta_base_sw64.csv', index=False)
np.save('test_deberta_base_sw64.npy', test_pred_f.mean(axis=1))
print(f'[SW64] Saved oof_deberta_base_sw64.csv and test_deberta_base_sw64.npy in {time.time()-t0:.1f}s', flush=True)

In [None]:
# Seed 777 full 5-fold train with SW64 checkpoint selection and TTA (SW64, SW128, Head+Tail); save per-view and combined artifacts
import os, time, math, random, numpy as np, pandas as pd, torch
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback, AutoModelForSequenceClassification
from sklearn.metrics import cohen_kappa_score
from torch.utils.data import Dataset

assert 'train_df' in globals() and 'test_df' in globals() and 'folds_df' in globals() and 'tokenizer' in globals(), 'Run setup cells first.'

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
BASE_SEED = 777
n_splits = int(folds_df['fold'].max()) + 1
y = train_df[target_col].astype(float).values
min_score, max_score = 1.0, 6.0

torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

def seed_everything(seed=42):
    random.seed(seed); np.random.seed(seed); torch.manual_seed(seed); torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = False; torch.backends.cudnn.benchmark = True

def qwk_int(y_true, y_pred_int):
    return cohen_kappa_score(y_true, y_pred_int, weights='quadratic')

# Pre-tokenize if not present
if 'train_ids_all' not in globals() or 'test_ids_all' not in globals():
    print('[s777] Pre-tokenizing...', flush=True)
    tok_train = tokenizer(train_df[text_col].tolist(), add_special_tokens=False, return_attention_mask=False, return_token_type_ids=False)
    tok_test  = tokenizer(test_df[text_col].tolist(),  add_special_tokens=False, return_attention_mask=False, return_token_type_ids=False)
    train_ids_all = tok_train['input_ids']
    test_ids_all  = tok_test['input_ids']

def pack_head_tail(ids, max_len=512, head=HEAD_TOKENS):
    tail = max_len - 2 - head
    if len(ids) <= max_len - 2:
        core = ids
    else:
        core = ids[:head] + (ids[-tail:] if tail>0 else [])
    built = tokenizer.build_inputs_with_special_tokens(core)
    out = tokenizer.pad({'input_ids':[built]}, padding='max_length', max_length=max_len, return_tensors='pt')
    return out['input_ids'][0], out['attention_mask'][0]

def chunkify_stride(ids, max_len=512, stride=128):
    usable = max_len - 2
    if len(ids) <= usable:
        chunks = [ids]
    else:
        chunks = []
        start = 0
        while start < len(ids):
            end = min(start + usable, len(ids))
            chunks.append(ids[start:end])
            if end == len(ids):
                break
            start += stride
    input_ids = []; attn = []; weights = []
    for ch in chunks:
        built = tokenizer.build_inputs_with_special_tokens(ch)
        padded = tokenizer.pad({'input_ids':[built]}, padding='max_length', max_length=max_len, return_tensors='pt')
        input_ids.append(padded['input_ids'][0]); attn.append(padded['attention_mask'][0]); weights.append(float(len(ch)))
    return input_ids, attn, weights

class TrainHTDataset(Dataset):
    def __init__(self, ids_list, targets=None):
        self.ids_list = ids_list; self.targets = targets
    def __len__(self): return len(self.ids_list)
    def __getitem__(self, idx):
        ids = self.ids_list[idx]
        input_ids, attention_mask = pack_head_tail(ids, MAX_LEN, HEAD_TOKENS)
        item = {'input_ids': input_ids, 'attention_mask': attention_mask}
        if self.targets is not None:
            item['labels'] = torch.tensor(float(self.targets[idx]), dtype=torch.float)
        return item

class ChunkDataset(Dataset):
    def __init__(self, ids_list, stride):
        self.inputs=[]; self.attns=[]; self.essay_idx=[]; self.weights=[]
        for i, ids in enumerate(ids_list):
            inp, att, w = chunkify_stride(ids, MAX_LEN, stride=stride)
            self.inputs.extend(inp); self.attns.extend(att);
            self.essay_idx.extend([i]*len(inp)); self.weights.extend(w)
    def __len__(self): return len(self.inputs)
    def __getitem__(self, idx):
        return {'input_ids': self.inputs[idx], 'attention_mask': self.attns[idx]}

class HTInferDataset(Dataset):
    def __init__(self, ids_list):
        self.inputs=[]; self.attns=[]; self.essay_idx=[]
        for i, ids in enumerate(ids_list):
            inp, att = pack_head_tail(ids, MAX_LEN, HEAD_TOKENS)
            self.inputs.append(inp); self.attns.append(att); self.essay_idx.append(i)
    def __len__(self): return len(self.inputs)
    def __getitem__(self, idx):
        return {'input_ids': self.inputs[idx], 'attention_mask': self.attns[idx]}

def collate_fn(batch):
    return {k: torch.stack([x[k] for x in batch]) for k in batch[0].keys()}

def length_weighted_aggregate(flat_preds, essay_idx, weights, n_items):
    agg = np.zeros(n_items, dtype=np.float32); wsum = np.zeros(n_items, dtype=np.float32)
    np.add.at(agg, essay_idx, flat_preds * weights); np.add.at(wsum, essay_idx, weights)
    return agg / np.clip(wsum, 1e-6, None)

# Prebuild test datasets for TTA
test_chunks_64 = ChunkDataset(test_ids_all, stride=64)
test_chunks_128 = ChunkDataset(test_ids_all, stride=128)
test_ht = HTInferDataset(test_ids_all)
essay_idx_t64 = np.array(test_chunks_64.essay_idx, dtype=np.int64); weights_t64 = np.array(test_chunks_64.weights, dtype=np.float32)
essay_idx_t128 = np.array(test_chunks_128.essay_idx, dtype=np.int64); weights_t128 = np.array(test_chunks_128.weights, dtype=np.float32)

oof_64 = np.zeros(len(train_df), dtype=np.float32)
oof_128 = np.zeros(len(train_df), dtype=np.float32)
oof_ht = np.zeros(len(train_df), dtype=np.float32)
test_pred_f64 = np.zeros((len(test_df), n_splits), dtype=np.float32)
test_pred_f128 = np.zeros((len(test_df), n_splits), dtype=np.float32)
test_pred_fht = np.zeros((len(test_df), n_splits), dtype=np.float32)

for f in range(n_splits):
    fold_t0 = time.time()
    tr_idx = folds_df.index[folds_df['fold']!=f].to_numpy()
    va_idx = folds_df.index[folds_df['fold']==f].to_numpy()
    print(f'[s777] Fold {f} start: tr={len(tr_idx)} va={len(va_idx)}', flush=True)

    seed_everything(BASE_SEED + f)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=1, problem_type='regression')
    if hasattr(model.config, 'hidden_dropout_prob'): model.config.hidden_dropout_prob = 0.10
    if hasattr(model.config, 'attention_probs_dropout_prob'): model.config.attention_probs_dropout_prob = 0.10
    model.gradient_checkpointing_enable(); model.to(device)

    train_ds = TrainHTDataset([train_ids_all[i] for i in tr_idx], train_df.loc[tr_idx, target_col].tolist())
    valid_ds_ht = TrainHTDataset([train_ids_all[i] for i in va_idx], train_df.loc[va_idx, target_col].tolist())

    args = TrainingArguments(
        output_dir=f'outputs_fold{f}',
        per_device_train_batch_size=16,
        per_device_eval_batch_size=64,
        gradient_accumulation_steps=1,
        evaluation_strategy='epoch',
        save_strategy='epoch',
        learning_rate=2e-5,
        num_train_epochs=5,
        weight_decay=0.02,
        lr_scheduler_type='cosine',
        warmup_ratio=0.1,
        bf16=True,
        bf16_full_eval=True,
        logging_steps=50,
        load_best_model_at_end=True,
        metric_for_best_model='qwk',
        greater_is_better=True,
        save_total_limit=5,
        dataloader_pin_memory=True,
        dataloader_num_workers=2,
        report_to=[]
    )

    def compute_metrics(eval_pred):
        preds = eval_pred.predictions.reshape(-1)
        labels = eval_pred.label_ids.reshape(-1)
        preds = np.clip(preds, min_score, max_score)
        base_th = np.array([1.5,2.5,3.5,4.5,5.5])
        bins = [-np.inf] + base_th.tolist() + [np.inf]
        pred_int = np.digitize(preds, bins)
        labels_int = labels.astype(int)
        return {'qwk': qwk_int(labels_int, pred_int)}

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_ds,
        eval_dataset=valid_ds_ht,
        data_collator=collate_fn,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
    )
    trainer.train()

    # Build validation datasets for TTA
    val_chunks_64 = ChunkDataset([train_ids_all[i] for i in va_idx], stride=64)
    val_chunks_128 = ChunkDataset([train_ids_all[i] for i in va_idx], stride=128)
    val_ht = HTInferDataset([train_ids_all[i] for i in va_idx])
    essay_idx_v64 = np.array(val_chunks_64.essay_idx, dtype=np.int64); weights_v64 = np.array(val_chunks_64.weights, dtype=np.float32)
    essay_idx_v128 = np.array(val_chunks_128.essay_idx, dtype=np.int64); weights_v128 = np.array(val_chunks_128.weights, dtype=np.float32)

    # Post-hoc checkpoint selection using SW64 QWK
    chk_dir = args.output_dir
    ckpts = []
    if os.path.isdir(chk_dir):
        for d in os.listdir(chk_dir):
            if d.startswith('checkpoint-'):
                try: step = int(d.split('-')[-1])
                except: step = -1
                ckpts.append((step, os.path.join(chk_dir, d)))
    ckpts.sort()
    candidates = [p for _, p in ckpts[-3:]] if ckpts else []
    best_q = -1.0; best_path = None; best_val_pred64 = None; best_val_pred128 = None; best_val_pred_ht = None

    def eval_view(model_path, ds, aggregate, essay_idx, weights=None, n_items=None):
        m = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=1, problem_type='regression').to(device)
        t = Trainer(model=m, args=args, data_collator=collate_fn)
        with torch.no_grad():
            flat = t.predict(ds).predictions.reshape(-1)
        flat = np.clip(flat, min_score, max_score)
        if aggregate:
            return length_weighted_aggregate(flat, essay_idx, weights, n_items).astype(np.float32)
        else:
            return flat.astype(np.float32)

    for path in candidates if candidates else [None]:
        if path is None:
            continue
        # Evaluate SW64 for selection
        val_pred64 = eval_view(path, val_chunks_64, True, essay_idx_v64, weights_v64, len(va_idx))
        base_th = np.array([1.5,2.5,3.5,4.5,5.5])
        bins = [-np.inf] + base_th.tolist() + [np.inf]
        q = qwk_int(train_df.loc[va_idx, target_col].astype(int).values, np.digitize(val_pred64, bins))
        if q > best_q:
            best_q = q; best_path = path; best_val_pred64 = val_pred64
            # Also compute companion views for the same checkpoint
            best_val_pred128 = eval_view(path, val_chunks_128, True, essay_idx_v128, weights_v128, len(va_idx))
            # Head+Tail single view
            m_ht = AutoModelForSequenceClassification.from_pretrained(path, num_labels=1, problem_type='regression').to(device)
            t_ht = Trainer(model=m_ht, args=args, data_collator=collate_fn)
            with torch.no_grad():
                flat_ht = t_ht.predict(val_ht).predictions.reshape(-1)
            best_val_pred_ht = np.clip(flat_ht, min_score, max_score).astype(np.float32)

    assert best_path is not None and best_val_pred64 is not None, '[s777] No valid checkpoint found for fold %d' % f

    # Save OOF per view
    oof_64[va_idx] = np.clip(best_val_pred64, min_score, max_score)
    oof_128[va_idx] = np.clip(best_val_pred128, min_score, max_score)
    oof_ht[va_idx] = np.clip(best_val_pred_ht, min_score, max_score)

    # Test inference for all TTA views with best checkpoint
    best_model = AutoModelForSequenceClassification.from_pretrained(best_path, num_labels=1, problem_type='regression').to(device)
    t_common = Trainer(model=best_model, args=args, data_collator=collate_fn)
    with torch.no_grad():
        flat64 = t_common.predict(test_chunks_64).predictions.reshape(-1)
        flat128 = t_common.predict(test_chunks_128).predictions.reshape(-1)
        flat_ht = t_common.predict(test_ht).predictions.reshape(-1)
    flat64 = np.clip(flat64, min_score, max_score); flat128 = np.clip(flat128, min_score, max_score); flat_ht = np.clip(flat_ht, min_score, max_score)
    test_pred_f64[:, f] = length_weighted_aggregate(flat64, essay_idx_t64, weights_t64, len(test_df)).astype(np.float32)
    test_pred_f128[:, f] = length_weighted_aggregate(flat128, essay_idx_t128, weights_t128, len(test_df)).astype(np.float32)
    # HT is one per essay already
    test_pred_fht[:, f] = flat_ht.astype(np.float32)

    torch.cuda.empty_cache()
    print(f'[s777] Fold {f} done in {time.time()-fold_t0:.1f}s (best SW64 QWK={best_q:.5f})', flush=True)

# Save per-view OOF and test
pd.DataFrame({'essay_id': train_df[id_col], 'oof_deberta': oof_64, 'y': train_df[target_col].astype(int).values}).to_csv('oof_deberta_base_s777_sw64.csv', index=False)
pd.DataFrame({'essay_id': train_df[id_col], 'oof_deberta': oof_128, 'y': train_df[target_col].astype(int).values}).to_csv('oof_deberta_base_s777_sw128.csv', index=False)
pd.DataFrame({'essay_id': train_df[id_col], 'oof_deberta': oof_ht, 'y': train_df[target_col].astype(int).values}).to_csv('oof_deberta_base_s777_ht.csv', index=False)
np.save('test_deberta_base_s777_sw64.npy', test_pred_f64.mean(axis=1))
np.save('test_deberta_base_s777_sw128.npy', test_pred_f128.mean(axis=1))
np.save('test_deberta_base_s777_ht.npy', test_pred_fht.mean(axis=1))

# Also save the TTA-combined view (0.4*SW64 + 0.4*SW128 + 0.2*HT)
oof_tta = 0.4*oof_64 + 0.4*oof_128 + 0.2*oof_ht
test_tta = 0.4*test_pred_f64.mean(axis=1) + 0.4*test_pred_f128.mean(axis=1) + 0.2*test_pred_fht.mean(axis=1)
pd.DataFrame({'essay_id': train_df[id_col], 'oof_deberta': oof_tta.astype(np.float32), 'y': train_df[target_col].astype(int).values}).to_csv('oof_deberta_base_s777.csv', index=False)
np.save('test_deberta_base_s777.npy', test_tta.astype(np.float32))
print('[s777] Saved per-view and combined TTA artifacts for seed 777.', flush=True)

In [None]:
# Seed 2025 partial folds with diversity (lr=1.8e-5, HEAD_TOKENS=256, dropout=0.12) + TTA; configurable folds_to_run
import os, time, math, random, numpy as np, pandas as pd, torch
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback, AutoModelForSequenceClassification
from sklearn.metrics import cohen_kappa_score
from torch.utils.data import Dataset

assert 'train_df' in globals() and 'test_df' in globals() and 'folds_df' in globals() and 'tokenizer' in globals(), 'Run setup cells first.'

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
BASE_SEED = 2025
LOCAL_HEAD_TOKENS = 256
n_splits = int(folds_df['fold'].max()) + 1
y = train_df[target_col].astype(float).values
min_score, max_score = 1.0, 6.0

# Choose weakest folds first; adjust list as needed (run 2 first, add third if time permits)
FOLDS_TO_RUN = [3]  # run fold 3 now to complete 5/5 coverage for s2025

torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

def seed_everything(seed=42):
    random.seed(seed); np.random.seed(seed); torch.manual_seed(seed); torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = False; torch.backends.cudnn.benchmark = True

def qwk_int(y_true, y_pred_int):
    return cohen_kappa_score(y_true, y_pred_int, weights='quadratic')

# Reuse pretokenized ids if present; otherwise build now
if 'train_ids_all' not in globals() or 'test_ids_all' not in globals():
    print('[s2025] Pre-tokenizing...', flush=True)
    tok_train = tokenizer(train_df[text_col].tolist(), add_special_tokens=False, return_attention_mask=False, return_token_type_ids=False)
    tok_test  = tokenizer(test_df[text_col].tolist(),  add_special_tokens=False, return_attention_mask=False, return_token_type_ids=False)
    train_ids_all = tok_train['input_ids']
    test_ids_all  = tok_test['input_ids']

def pack_head_tail_local(ids, max_len=512, head=LOCAL_HEAD_TOKENS):
    tail = max_len - 2 - head
    if len(ids) <= max_len - 2:
        core = ids
    else:
        core = ids[:head] + (ids[-tail:] if tail>0 else [])
    built = tokenizer.build_inputs_with_special_tokens(core)
    out = tokenizer.pad({'input_ids':[built]}, padding='max_length', max_length=max_len, return_tensors='pt')
    return out['input_ids'][0], out['attention_mask'][0]

def chunkify_stride(ids, max_len=512, stride=128):
    usable = max_len - 2
    if len(ids) <= usable:
        chunks = [ids]
    else:
        chunks = []
        start = 0
        while start < len(ids):
            end = min(start + usable, len(ids))
            chunks.append(ids[start:end])
            if end == len(ids):
                break
            start += stride
    input_ids = []; attn = []; weights = []
    for ch in chunks:
        built = tokenizer.build_inputs_with_special_tokens(ch)
        padded = tokenizer.pad({'input_ids':[built]}, padding='max_length', max_length=max_len, return_tensors='pt')
        input_ids.append(padded['input_ids'][0]); attn.append(padded['attention_mask'][0]); weights.append(float(len(ch)))
    return input_ids, attn, weights

class TrainHTDatasetLocal(Dataset):
    def __init__(self, ids_list, targets=None):
        self.ids_list = ids_list; self.targets = targets
    def __len__(self): return len(self.ids_list)
    def __getitem__(self, idx):
        ids = self.ids_list[idx]
        input_ids, attention_mask = pack_head_tail_local(ids, MAX_LEN, LOCAL_HEAD_TOKENS)
        item = {'input_ids': input_ids, 'attention_mask': attention_mask}
        if self.targets is not None:
            item['labels'] = torch.tensor(float(self.targets[idx]), dtype=torch.float)
        return item

class ChunkDataset(Dataset):
    def __init__(self, ids_list, stride):
        self.inputs=[]; self.attns=[]; self.essay_idx=[]; self.weights=[]
        for i, ids in enumerate(ids_list):
            inp, att, w = chunkify_stride(ids, MAX_LEN, stride=stride)
            self.inputs.extend(inp); self.attns.extend(att);
            self.essay_idx.extend([i]*len(inp)); self.weights.extend(w)
    def __len__(self): return len(self.inputs)
    def __getitem__(self, idx):
        return {'input_ids': self.inputs[idx], 'attention_mask': self.attns[idx]}

class HTInferDatasetLocal(Dataset):
    def __init__(self, ids_list):
        self.inputs=[]; self.attns=[]; self.essay_idx=[]
        for i, ids in enumerate(ids_list):
            inp, att = pack_head_tail_local(ids, MAX_LEN, LOCAL_HEAD_TOKENS)
            self.inputs.append(inp); self.attns.append(att); self.essay_idx.append(i)
    def __len__(self): return len(self.inputs)
    def __getitem__(self, idx):
        return {'input_ids': self.inputs[idx], 'attention_mask': self.attns[idx]}

def collate_fn(batch):
    return {k: torch.stack([x[k] for x in batch]) for k in batch[0].keys()}

def length_weighted_aggregate(flat_preds, essay_idx, weights, n_items):
    agg = np.zeros(n_items, dtype=np.float32); wsum = np.zeros(n_items, dtype=np.float32)
    np.add.at(agg, essay_idx, flat_preds * weights); np.add.at(wsum, essay_idx, weights)
    return agg / np.clip(wsum, 1e-6, None)

# Prebuild test datasets for TTA
test_chunks_64 = ChunkDataset(test_ids_all, stride=64)
test_chunks_128 = ChunkDataset(test_ids_all, stride=128)
test_ht = HTInferDatasetLocal(test_ids_all)
essay_idx_t64 = np.array(test_chunks_64.essay_idx, dtype=np.int64); weights_t64 = np.array(test_chunks_64.weights, dtype=np.float32)
essay_idx_t128 = np.array(test_chunks_128.essay_idx, dtype=np.int64); weights_t128 = np.array(test_chunks_128.weights, dtype=np.float32)

# Allocate OOF/test holders only for folds we run; fill others with zeros (ignored in bagging by availability)
oof_64 = np.zeros(len(train_df), dtype=np.float32)
oof_128 = np.zeros(len(train_df), dtype=np.float32)
oof_ht = np.zeros(len(train_df), dtype=np.float32)
test_pred_f64 = np.zeros((len(test_df), n_splits), dtype=np.float32)
test_pred_f128 = np.zeros((len(test_df), n_splits), dtype=np.float32)
test_pred_fht = np.zeros((len(test_df), n_splits), dtype=np.float32)

for f in FOLDS_TO_RUN:
    fold_t0 = time.time()
    tr_idx = folds_df.index[folds_df['fold']!=f].to_numpy()
    va_idx = folds_df.index[folds_df['fold']==f].to_numpy()
    print(f'[s2025] Fold {f} start: tr={len(tr_idx)} va={len(va_idx)}', flush=True)

    seed_everything(BASE_SEED + f)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=1, problem_type='regression')
    if hasattr(model.config, 'hidden_dropout_prob'): model.config.hidden_dropout_prob = 0.12
    if hasattr(model.config, 'attention_probs_dropout_prob'): model.config.attention_probs_dropout_prob = 0.12
    model.gradient_checkpointing_enable(); model.to(device)

    train_ds = TrainHTDatasetLocal([train_ids_all[i] for i in tr_idx], train_df.loc[tr_idx, target_col].tolist())
    valid_ds_ht = TrainHTDatasetLocal([train_ids_all[i] for i in va_idx], train_df.loc[va_idx, target_col].tolist())

    args = TrainingArguments(
        output_dir=f'outputs_fold{f}',
        per_device_train_batch_size=16,
        per_device_eval_batch_size=64,
        gradient_accumulation_steps=1,
        evaluation_strategy='epoch',
        save_strategy='epoch',
        learning_rate=1.8e-5,
        num_train_epochs=5,
        weight_decay=0.02,
        lr_scheduler_type='cosine',
        warmup_ratio=0.1,
        bf16=True,
        bf16_full_eval=True,
        logging_steps=50,
        load_best_model_at_end=True,
        metric_for_best_model='qwk',
        greater_is_better=True,
        save_total_limit=5,
        dataloader_pin_memory=True,
        dataloader_num_workers=2,
        report_to=[]
    )

    def compute_metrics(eval_pred):
        preds = eval_pred.predictions.reshape(-1)
        labels = eval_pred.label_ids.reshape(-1)
        preds = np.clip(preds, min_score, max_score)
        base_th = np.array([1.5,2.5,3.5,4.5,5.5])
        bins = [-np.inf] + base_th.tolist() + [np.inf]
        pred_int = np.digitize(preds, bins)
        labels_int = labels.astype(int)
        return {'qwk': qwk_int(labels_int, pred_int)}

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_ds,
        eval_dataset=valid_ds_ht,
        data_collator=collate_fn,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
    )
    trainer.train()

    # Build validation datasets for TTA
    val_chunks_64 = ChunkDataset([train_ids_all[i] for i in va_idx], stride=64)
    val_chunks_128 = ChunkDataset([train_ids_all[i] for i in va_idx], stride=128)
    val_ht = HTInferDatasetLocal([train_ids_all[i] for i in va_idx])
    essay_idx_v64 = np.array(val_chunks_64.essay_idx, dtype=np.int64); weights_v64 = np.array(val_chunks_64.weights, dtype=np.float32)
    essay_idx_v128 = np.array(val_chunks_128.essay_idx, dtype=np.int64); weights_v128 = np.array(val_chunks_128.weights, dtype=np.float32)

    # Post-hoc checkpoint selection using SW64 QWK
    chk_dir = args.output_dir
    ckpts = []
    if os.path.isdir(chk_dir):
        for d in os.listdir(chk_dir):
            if d.startswith('checkpoint-'):
                try: step = int(d.split('-')[-1])
                except: step = -1
                ckpts.append((step, os.path.join(chk_dir, d)))
    ckpts.sort()
    candidates = [p for _, p in ckpts[-3:]] if ckpts else []
    best_q = -1.0; best_path = None; best_val_pred64 = None; best_val_pred128 = None; best_val_pred_ht = None

    def eval_view(model_path, ds, aggregate, essay_idx, weights=None, n_items=None):
        m = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=1, problem_type='regression').to(device)
        t = Trainer(model=m, args=args, data_collator=collate_fn)
        with torch.no_grad():
            flat = t.predict(ds).predictions.reshape(-1)
        flat = np.clip(flat, min_score, max_score)
        if aggregate:
            return length_weighted_aggregate(flat, essay_idx, weights, n_items).astype(np.float32)
        else:
            return flat.astype(np.float32)

    for path in candidates if candidates else [None]:
        if path is None:
            continue
        val_pred64 = eval_view(path, val_chunks_64, True, essay_idx_v64, weights_v64, len(va_idx))
        base_th = np.array([1.5,2.5,3.5,4.5,5.5])
        bins = [-np.inf] + base_th.tolist() + [np.inf]
        q = qwk_int(train_df.loc[va_idx, target_col].astype(int).values, np.digitize(val_pred64, bins))
        if q > best_q:
            best_q = q; best_path = path; best_val_pred64 = val_pred64
            best_val_pred128 = eval_view(path, val_chunks_128, True, essay_idx_v128, weights_v128, len(va_idx))
            m_ht = AutoModelForSequenceClassification.from_pretrained(path, num_labels=1, problem_type='regression').to(device)
            t_ht = Trainer(model=m_ht, args=args, data_collator=collate_fn)
            with torch.no_grad():
                flat_ht = t_ht.predict(val_ht).predictions.reshape(-1)
            best_val_pred_ht = np.clip(flat_ht, min_score, max_score).astype(np.float32)

    assert best_path is not None and best_val_pred64 is not None, '[s2025] No valid checkpoint found for fold %d' % f

    # Save OOF per view
    oof_64[va_idx] = np.clip(best_val_pred64, min_score, max_score)
    oof_128[va_idx] = np.clip(best_val_pred128, min_score, max_score)
    oof_ht[va_idx] = np.clip(best_val_pred_ht, min_score, max_score)

    # Test inference for all TTA views with best checkpoint
    best_model = AutoModelForSequenceClassification.from_pretrained(best_path, num_labels=1, problem_type='regression').to(device)
    t_common = Trainer(model=best_model, args=args, data_collator=collate_fn)
    with torch.no_grad():
        flat64 = t_common.predict(test_chunks_64).predictions.reshape(-1)
        flat128 = t_common.predict(test_chunks_128).predictions.reshape(-1)
        flat_ht = t_common.predict(test_ht).predictions.reshape(-1)
    flat64 = np.clip(flat64, min_score, max_score); flat128 = np.clip(flat128, min_score, max_score); flat_ht = np.clip(flat_ht, min_score, max_score)
    test_pred_f64[:, f] = length_weighted_aggregate(flat64, essay_idx_t64, weights_t64, len(test_df)).astype(np.float32)
    test_pred_f128[:, f] = length_weighted_aggregate(flat128, essay_idx_t128, weights_t128, len(test_df)).astype(np.float32)
    test_pred_fht[:, f] = flat_ht.astype(np.float32)

    torch.cuda.empty_cache()
    print(f'[s2025] Fold {f} done in {time.time()-fold_t0:.1f}s (best SW64 QWK={best_q:.5f})', flush=True)

# Save per-view OOF and test (note: only filled folds contain non-zero entries)
pd.DataFrame({'essay_id': train_df[id_col], 'oof_deberta': oof_64, 'y': train_df[target_col].astype(int).values}).to_csv('oof_deberta_base_s2025_sw64.csv', index=False)
pd.DataFrame({'essay_id': train_df[id_col], 'oof_deberta': oof_128, 'y': train_df[target_col].astype(int).values}).to_csv('oof_deberta_base_s2025_sw128.csv', index=False)
pd.DataFrame({'essay_id': train_df[id_col], 'oof_deberta': oof_ht, 'y': train_df[target_col].astype(int).values}).to_csv('oof_deberta_base_s2025_ht.csv', index=False)
np.save('test_deberta_base_s2025_sw64.npy', test_pred_f64.mean(axis=1))
np.save('test_deberta_base_s2025_sw128.npy', test_pred_f128.mean(axis=1))
np.save('test_deberta_base_s2025_ht.npy', test_pred_fht.mean(axis=1))

# Also save the TTA-combined view for available folds (0.4*SW64 + 0.4*SW128 + 0.2*HT)
oof_tta = 0.4*oof_64 + 0.4*oof_128 + 0.2*oof_ht
test_tta = 0.4*test_pred_f64.mean(axis=1) + 0.4*test_pred_f128.mean(axis=1) + 0.2*test_pred_fht.mean(axis=1)
pd.DataFrame({'essay_id': train_df[id_col], 'oof_deberta': oof_tta.astype(np.float32), 'y': train_df[target_col].astype(int).values}).to_csv('oof_deberta_base_s2025.csv', index=False)
np.save('test_deberta_base_s2025.npy', test_tta.astype(np.float32))
print('[s2025] Saved per-view and combined TTA artifacts for seed 2025 (partial folds).', flush=True)

In [None]:
# Rebuild s2025 per-view and combined artifacts from existing checkpoints (no training); folds where outputs_fold{f} exist
import os, time, numpy as np, pandas as pd, torch
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
from sklearn.metrics import cohen_kappa_score

assert 'train_df' in globals() and 'test_df' in globals() and 'folds_df' in globals() and 'tokenizer' in globals(), 'Run setup cells first.'

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
min_score, max_score = 1.0, 6.0
n_splits = int(folds_df['fold'].max()) + 1
y_int = train_df[target_col].astype(int).values

def qwk_int(y_true, y_pred_int):
    return cohen_kappa_score(y_true, y_pred_int, weights='quadratic')

# Ensure token ids are available
if 'train_ids_all' not in globals() or 'test_ids_all' not in globals():
    tok_train = tokenizer(train_df[text_col].tolist(), add_special_tokens=False, return_attention_mask=False, return_token_type_ids=False)
    tok_test  = tokenizer(test_df[text_col].tolist(),  add_special_tokens=False, return_attention_mask=False, return_token_type_ids=False)
    train_ids_all = tok_train['input_ids']
    test_ids_all  = tok_test['input_ids']

MAX_LEN_EVAL = MAX_LEN  # use same 512

def chunkify(ids, max_len=512, stride=64):
    usable = max_len - 2
    if len(ids) <= usable:
        chunks = [ids]
    else:
        chunks = []
        start = 0
        while start < len(ids):
            end = min(start + usable, len(ids))
            chunks.append(ids[start:end])
            if end == len(ids): break
            start += stride
    input_ids = []; attn = []; weights = []
    for ch in chunks:
        built = tokenizer.build_inputs_with_special_tokens(ch)
        padded = tokenizer.pad({'input_ids':[built]}, padding='max_length', max_length=max_len, return_tensors='pt')
        input_ids.append(padded['input_ids'][0]); attn.append(padded['attention_mask'][0]); weights.append(float(len(ch)))
    return input_ids, attn, weights

class ChunkDatasetStride(Dataset):
    def __init__(self, ids_list, stride):
        self.inputs=[]; self.attns=[]; self.essay_idx=[]; self.weights=[]
        for i, ids in enumerate(ids_list):
            inp, att, w = chunkify(ids, MAX_LEN_EVAL, stride=stride)
            self.inputs.extend(inp); self.attns.extend(att); self.essay_idx.extend([i]*len(inp)); self.weights.extend(w)
    def __len__(self): return len(self.inputs)
    def __getitem__(self, idx):
        return {'input_ids': self.inputs[idx], 'attention_mask': self.attns[idx]}

class HTInferDatasetLocal(Dataset):
    def __init__(self, ids_list, head_tokens=256):
        self.inputs=[]; self.attns=[]; self.essay_idx=[]
        tail = MAX_LEN_EVAL - 2 - head_tokens
        for i, ids in enumerate(ids_list):
            if len(ids) <= MAX_LEN_EVAL - 2:
                core = ids
            else:
                core = ids[:head_tokens] + (ids[-tail:] if tail>0 else [])
            built = tokenizer.build_inputs_with_special_tokens(core)
            padded = tokenizer.pad({'input_ids':[built]}, padding='max_length', max_length=MAX_LEN_EVAL, return_tensors='pt')
            self.inputs.append(padded['input_ids'][0]); self.attns.append(padded['attention_mask'][0]); self.essay_idx.append(i)
    def __len__(self): return len(self.inputs)
    def __getitem__(self, idx):
        return {'input_ids': self.inputs[idx], 'attention_mask': self.attns[idx]}

def collate_fn(batch):
    return {k: torch.stack([x[k] for x in batch]) for k in batch[0].keys()}

def length_weighted_aggregate(flat_preds, essay_idx, weights, n_items):
    agg = np.zeros(n_items, dtype=np.float32); wsum = np.zeros(n_items, dtype=np.float32)
    np.add.at(agg, essay_idx, flat_preds * weights); np.add.at(wsum, essay_idx, weights)
    return agg / np.clip(wsum, 1e-6, None)

# Build test datasets
test_sw64 = ChunkDatasetStride(test_ids_all, stride=64)
test_sw128 = ChunkDatasetStride(test_ids_all, stride=128)
test_ht = HTInferDatasetLocal(test_ids_all, head_tokens=256)
essay_idx_t64 = np.array(test_sw64.essay_idx, dtype=np.int64); weights_t64 = np.array(test_sw64.weights, dtype=np.float32)
essay_idx_t128 = np.array(test_sw128.essay_idx, dtype=np.int64); weights_t128 = np.array(test_sw128.weights, dtype=np.float32)

# Holders
oof_64 = np.zeros(len(train_df), dtype=np.float32)
oof_128 = np.zeros(len(train_df), dtype=np.float32)
oof_ht = np.zeros(len(train_df), dtype=np.float32)
test_pred_f64 = np.zeros((len(test_df), n_splits), dtype=np.float32)
test_pred_f128 = np.zeros((len(test_df), n_splits), dtype=np.float32)
test_pred_fht = np.zeros((len(test_df), n_splits), dtype=np.float32)

# Evaluate all folds that have outputs directories (dynamic, includes newly trained fold 3)
folds_to_eval = [f for f in range(n_splits) if os.path.isdir(f'outputs_fold{f}')]
print('[s2025-rebuild] Evaluating folds:', folds_to_eval, flush=True)

for f in folds_to_eval:
    va_idx = folds_df.index[folds_df['fold']==f].to_numpy()
    # Build validation datasets
    val_sw64 = ChunkDatasetStride([train_ids_all[i] for i in va_idx], stride=64)
    val_sw128 = ChunkDatasetStride([train_ids_all[i] for i in va_idx], stride=128)
    val_ht = HTInferDatasetLocal([train_ids_all[i] for i in va_idx], head_tokens=256)
    essay_idx_v64 = np.array(val_sw64.essay_idx, dtype=np.int64); weights_v64 = np.array(val_sw64.weights, dtype=np.float32)
    essay_idx_v128 = np.array(val_sw128.essay_idx, dtype=np.int64); weights_v128 = np.array(val_sw128.weights, dtype=np.float32)

    chk_dir = f'outputs_fold{f}'
    ckpts = []
    for d in os.listdir(chk_dir):
        if d.startswith('checkpoint-'):
            try: step = int(d.split('-')[-1])
            except: step = -1
            ckpts.append((step, os.path.join(chk_dir, d)))
    ckpts.sort()
    candidates = [p for _, p in ckpts[-3:]] if ckpts else []
    assert candidates, f'No checkpoints found for fold {f}'

    args = TrainingArguments(output_dir=chk_dir, per_device_eval_batch_size=64, dataloader_num_workers=2, bf16_full_eval=True, report_to=[])

    def eval_view(model_path, ds, aggregate, essay_idx, weights=None, n_items=None):
        model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=1, problem_type='regression').to(device)
        trainer = Trainer(model=model, args=args, data_collator=collate_fn)
        with torch.no_grad():
            flat = trainer.predict(ds).predictions.reshape(-1)
        flat = np.clip(flat, min_score, max_score)
        if aggregate:
            return length_weighted_aggregate(flat, essay_idx, weights, n_items).astype(np.float32)
        else:
            return flat.astype(np.float32)

    best_q = -1.0; best_path = None; best_val64=None; best_val128=None; best_valht=None
    for path in candidates:
        val64 = eval_view(path, val_sw64, True, essay_idx_v64, weights_v64, len(va_idx))
        base_th = np.array([1.5,2.5,3.5,4.5,5.5])
        bins = [-np.inf] + base_th.tolist() + [np.inf]
        q = qwk_int(y_int[va_idx], np.digitize(val64, bins))
        if q > best_q:
            best_q = q; best_path = path; best_val64 = val64
            best_val128 = eval_view(path, val_sw128, True, essay_idx_v128, weights_v128, len(va_idx))
            # HT single view
            model_ht = AutoModelForSequenceClassification.from_pretrained(path, num_labels=1, problem_type='regression').to(device)
            trainer_ht = Trainer(model=model_ht, args=args, data_collator=collate_fn)
            with torch.no_grad():
                flat_ht = trainer_ht.predict(val_ht).predictions.reshape(-1)
            best_valht = np.clip(flat_ht, min_score, max_score).astype(np.float32)

    # Assign OOF
    oof_64[va_idx] = np.clip(best_val64, min_score, max_score)
    oof_128[va_idx] = np.clip(best_val128, min_score, max_score)
    oof_ht[va_idx] = np.clip(best_valht, min_score, max_score)

    # Test inference from best checkpoint
    best_model = AutoModelForSequenceClassification.from_pretrained(best_path, num_labels=1, problem_type='regression').to(device)
    trainer = Trainer(model=best_model, args=args, data_collator=collate_fn)
    with torch.no_grad():
        flat64 = trainer.predict(test_sw64).predictions.reshape(-1)
        flat128 = trainer.predict(test_sw128).predictions.reshape(-1)
        flatht = trainer.predict(test_ht).predictions.reshape(-1)
    flat64 = np.clip(flat64, min_score, max_score); flat128 = np.clip(flat128, min_score, max_score); flatht = np.clip(flatht, min_score, max_score)
    test_pred_f64[:, f] = length_weighted_aggregate(flat64, essay_idx_t64, weights_t64, len(test_df)).astype(np.float32)
    test_pred_f128[:, f] = length_weighted_aggregate(flat128, essay_idx_t128, weights_t128, len(test_df)).astype(np.float32)
    test_pred_fht[:, f] = flatht.astype(np.float32)
    torch.cuda.empty_cache()
    print(f'[s2025-rebuild] Fold {f} best SW64 QWK={best_q:.5f}', flush=True)

# Save per-view and combined artifacts for s2025
pd.DataFrame({'essay_id': train_df[id_col], 'oof_deberta': oof_64, 'y': y_int}).to_csv('oof_deberta_base_s2025_sw64.csv', index=False)
pd.DataFrame({'essay_id': train_df[id_col], 'oof_deberta': oof_128, 'y': y_int}).to_csv('oof_deberta_base_s2025_sw128.csv', index=False)
pd.DataFrame({'essay_id': train_df[id_col], 'oof_deberta': oof_ht, 'y': y_int}).to_csv('oof_deberta_base_s2025_ht.csv', index=False)
np.save('test_deberta_base_s2025_sw64.npy', test_pred_f64.mean(axis=1))
np.save('test_deberta_base_s2025_sw128.npy', test_pred_f128.mean(axis=1))
np.save('test_deberta_base_s2025_ht.npy', test_pred_fht.mean(axis=1))

oof_tta = (0.4*oof_64 + 0.4*oof_128 + 0.2*oof_ht).astype(np.float32)
test_tta = (0.4*test_pred_f64.mean(axis=1) + 0.4*test_pred_f128.mean(axis=1) + 0.2*test_pred_fht.mean(axis=1)).astype(np.float32)
pd.DataFrame({'essay_id': train_df[id_col], 'oof_deberta': oof_tta, 'y': y_int}).to_csv('oof_deberta_base_s2025.csv', index=False)
np.save('test_deberta_base_s2025.npy', test_tta)
print('[s2025-rebuild] Saved per-view and combined artifacts from existing checkpoints.', flush=True)

In [None]:
# DeBERTa-v3-Large targeted folds (0 first): SW64 checkpoint selection + TTA (SW64, SW128, HT256); save per-view and combined artifacts
import os, time, math, random, numpy as np, pandas as pd, torch
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback, AutoModelForSequenceClassification, AutoTokenizer
from sklearn.metrics import cohen_kappa_score
from torch.utils.data import Dataset

assert 'train_df' in globals() and 'test_df' in globals() and 'folds_df' in globals(), 'Run setup cells first.'

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
BASE_SEED = 130013  # distinct seed for large model
MODEL_NAME_L = 'microsoft/deberta-v3-large'
MAX_LEN_L = 512
HEAD_TOKENS_L = 256
n_splits = int(folds_df['fold'].max()) + 1
y = train_df[target_col].astype(int).values
min_score, max_score = 1.0, 6.0

torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

def seed_everything(seed=42):
    random.seed(seed); np.random.seed(seed); torch.manual_seed(seed); torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = False; torch.backends.cudnn.benchmark = True

def qwk_int(y_true, y_pred_int):
    return cohen_kappa_score(y_true, y_pred_int, weights='quadratic')

# Prepare tokenizer for large
tok_l = AutoTokenizer.from_pretrained(MODEL_NAME_L)

# Pre-tokenize if not present (reuse if base already built, else with large tokenizer for consistency in special tokens)
if 'train_ids_all' not in globals() or 'test_ids_all' not in globals():
    print('[v3-large] Pre-tokenizing...', flush=True)
    tok_train = tok_l(train_df[text_col].tolist(), add_special_tokens=False, return_attention_mask=False, return_token_type_ids=False)
    tok_test  = tok_l(test_df[text_col].tolist(),  add_special_tokens=False, return_attention_mask=False, return_token_type_ids=False)
    train_ids_all = tok_train['input_ids']
    test_ids_all  = tok_test['input_ids']

def pack_head_tail_l(ids, max_len=MAX_LEN_L, head=HEAD_TOKENS_L):
    tail = max_len - 2 - head
    if len(ids) <= max_len - 2:
        core = ids
    else:
        core = ids[:head] + (ids[-tail:] if tail>0 else [])
    built = tok_l.build_inputs_with_special_tokens(core)
    out = tok_l.pad({'input_ids':[built]}, padding='max_length', max_length=max_len, return_tensors='pt')
    return out['input_ids'][0], out['attention_mask'][0]

def chunkify_stride_l(ids, max_len=MAX_LEN_L, stride=64):
    usable = max_len - 2
    if len(ids) <= usable:
        chunks = [ids]
    else:
        chunks = []
        start = 0
        while start < len(ids):
            end = min(start + usable, len(ids))
            chunks.append(ids[start:end])
            if end == len(ids):
                break
            start += stride
    input_ids = []; attn = []; weights = []
    for ch in chunks:
        built = tok_l.build_inputs_with_special_tokens(ch)
        padded = tok_l.pad({'input_ids':[built]}, padding='max_length', max_length=max_len, return_tensors='pt')
        input_ids.append(padded['input_ids'][0]); attn.append(padded['attention_mask'][0]); weights.append(float(len(ch)))
    return input_ids, attn, weights

class TrainHTDatasetL(Dataset):
    def __init__(self, ids_list, targets=None):
        self.ids_list = ids_list; self.targets = targets
    def __len__(self): return len(self.ids_list)
    def __getitem__(self, idx):
        ids = self.ids_list[idx]
        input_ids, attention_mask = pack_head_tail_l(ids)
        item = {'input_ids': input_ids, 'attention_mask': attention_mask}
        if self.targets is not None:
            item['labels'] = torch.tensor(float(self.targets[idx]), dtype=torch.float)
        return item

class ChunkDatasetL(Dataset):
    def __init__(self, ids_list, stride):
        self.inputs=[]; self.attns=[]; self.essay_idx=[]; self.weights=[]
        for i, ids in enumerate(ids_list):
            inp, att, w = chunkify_stride_l(ids, MAX_LEN_L, stride=stride)
            self.inputs.extend(inp); self.attns.extend(att);
            self.essay_idx.extend([i]*len(inp)); self.weights.extend(w)
    def __len__(self): return len(self.inputs)
    def __getitem__(self, idx):
        return {'input_ids': self.inputs[idx], 'attention_mask': self.attns[idx]}

class HTInferDatasetL(Dataset):
    def __init__(self, ids_list):
        self.inputs=[]; self.attns=[]; self.essay_idx=[]
        for i, ids in enumerate(ids_list):
            inp, att = pack_head_tail_l(ids)
            self.inputs.append(inp); self.attns.append(att); self.essay_idx.append(i)
    def __len__(self): return len(self.inputs)
    def __getitem__(self, idx):
        return {'input_ids': self.inputs[idx], 'attention_mask': self.attns[idx]}

def collate_fn_l(batch):
    return {k: torch.stack([x[k] for x in batch]) for k in batch[0].keys()}

def length_weighted_aggregate(flat_preds, essay_idx, weights, n_items):
    agg = np.zeros(n_items, dtype=np.float32); wsum = np.zeros(n_items, dtype=np.float32)
    np.add.at(agg, essay_idx, flat_preds * weights); np.add.at(wsum, essay_idx, weights)
    return agg / np.clip(wsum, 1e-6, None)

# Build test datasets for TTA
test_chunks_64_l = ChunkDatasetL(test_ids_all, stride=64)
test_chunks_128_l = ChunkDatasetL(test_ids_all, stride=128)
test_ht_l = HTInferDatasetL(test_ids_all)
essay_idx_t64_l = np.array(test_chunks_64_l.essay_idx, dtype=np.int64); weights_t64_l = np.array(test_chunks_64_l.weights, dtype=np.float32)
essay_idx_t128_l = np.array(test_chunks_128_l.essay_idx, dtype=np.int64); weights_t128_l = np.array(test_chunks_128_l.weights, dtype=np.float32)

# Holders (full length; we'll fill only target folds)
oof_64_l = np.zeros(len(train_df), dtype=np.float32)
oof_128_l = np.zeros(len(train_df), dtype=np.float32)
oof_ht_l = np.zeros(len(train_df), dtype=np.float32)
test_pred_f64_l = np.zeros((len(test_df), n_splits), dtype=np.float32)
test_pred_f128_l = np.zeros((len(test_df), n_splits), dtype=np.float32)
test_pred_fht_l = np.zeros((len(test_df), n_splits), dtype=np.float32)

# Targeted folds: now run fold 1; we'll rebuild combined sL artifacts afterward
FOLDS_TO_RUN_L = [1]

for f in FOLDS_TO_RUN_L:
    fold_t0 = time.time()
    tr_idx = folds_df.index[folds_df['fold']!=f].to_numpy()
    va_idx = folds_df.index[folds_df['fold']==f].to_numpy()
    print(f'[v3-large] Fold {f} start: tr={len(tr_idx)} va={len(va_idx)}', flush=True)

    seed_everything(BASE_SEED + f)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME_L, num_labels=1, problem_type='regression')
    if hasattr(model.config, 'hidden_dropout_prob'): model.config.hidden_dropout_prob = 0.15
    if hasattr(model.config, 'attention_probs_dropout_prob'): model.config.attention_probs_dropout_prob = 0.15
    model.gradient_checkpointing_enable(); model.to(device)

    train_ds = TrainHTDatasetL([train_ids_all[i] for i in tr_idx], train_df.loc[tr_idx, target_col].tolist())
    valid_ds_ht = TrainHTDatasetL([train_ids_all[i] for i in va_idx], train_df.loc[va_idx, target_col].tolist())

    args = TrainingArguments(
        output_dir=f'outputsL_fold{f}',
        per_device_train_batch_size=2,
        per_device_eval_batch_size=32,
        gradient_accumulation_steps=8,
        evaluation_strategy='epoch',
        save_strategy='epoch',
        learning_rate=1.1e-5,
        num_train_epochs=3,
        weight_decay=0.01,
        lr_scheduler_type='linear',
        warmup_ratio=0.10,
        bf16=True,
        bf16_full_eval=True,
        logging_steps=50,
        load_best_model_at_end=True,
        metric_for_best_model='qwk',
        greater_is_better=True,
        save_total_limit=4,
        dataloader_pin_memory=True,
        dataloader_num_workers=2,
        report_to=[]
    )

    def compute_metrics(eval_pred):
        preds = np.clip(eval_pred.predictions.reshape(-1), min_score, max_score)
        labels = eval_pred.label_ids.reshape(-1)
        base_th = np.array([1.5,2.5,3.5,4.5,5.5])
        bins = [-np.inf] + base_th.tolist() + [np.inf]
        pred_int = np.digitize(preds, bins)
        labels_int = labels.astype(int)
        return {'qwk': qwk_int(labels_int, pred_int)}

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_ds,
        eval_dataset=valid_ds_ht,
        data_collator=collate_fn_l,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
    )
    trainer.train()

    # Build validation datasets for views
    val_chunks_64 = ChunkDatasetL([train_ids_all[i] for i in va_idx], stride=64)
    val_chunks_128 = ChunkDatasetL([train_ids_all[i] for i in va_idx], stride=128)
    val_ht = HTInferDatasetL([train_ids_all[i] for i in va_idx])
    essay_idx_v64 = np.array(val_chunks_64.essay_idx, dtype=np.int64); weights_v64 = np.array(val_chunks_64.weights, dtype=np.float32)
    essay_idx_v128 = np.array(val_chunks_128.essay_idx, dtype=np.int64); weights_v128 = np.array(val_chunks_128.weights, dtype=np.float32)

    # Post-hoc checkpoint selection with SW64 QWK (last 3-4 checkpoints)
    chk_dir = args.output_dir
    ckpts = []
    if os.path.isdir(chk_dir):
        for d in os.listdir(chk_dir):
            if d.startswith('checkpoint-'):
                try: step = int(d.split('-')[-1])
                except: step = -1
                ckpts.append((step, os.path.join(chk_dir, d)))
    ckpts.sort()
    candidates = [p for _, p in ckpts[-4:]] if ckpts else []
    best_q = -1.0; best_path = None; best_val64=None; best_val128=None; best_valht=None

    def eval_view(model_path, ds, aggregate, essay_idx, weights=None, n_items=None):
        m = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=1, problem_type='regression').to(device)
        t = Trainer(model=m, args=args, data_collator=collate_fn_l)
        with torch.no_grad():
            flat = t.predict(ds).predictions.reshape(-1)
        flat = np.clip(flat, min_score, max_score)
        if aggregate:
            return length_weighted_aggregate(flat, essay_idx, weights, n_items).astype(np.float32)
        else:
            return flat.astype(np.float32)

    for path in candidates if candidates else [None]:
        if path is None:
            continue
        val64 = eval_view(path, val_chunks_64, True, essay_idx_v64, weights_v64, len(va_idx))
        base_th = np.array([1.5,2.5,3.5,4.5,5.5])
        bins = [-np.inf] + base_th.tolist() + [np.inf]
        q = qwk_int(y[va_idx], np.digitize(val64, bins))
        if q > best_q:
            best_q = q; best_path = path; best_val64 = val64
            best_val128 = eval_view(path, val_chunks_128, True, essay_idx_v128, weights_v128, len(va_idx))
            # HT single-view
            mht = AutoModelForSequenceClassification.from_pretrained(path, num_labels=1, problem_type='regression').to(device)
            tht = Trainer(model=mht, args=args, data_collator=collate_fn_l)
            with torch.no_grad():
                flat_ht = tht.predict(val_ht).predictions.reshape(-1)
            best_valht = np.clip(flat_ht, min_score, max_score).astype(np.float32)

    assert best_path is not None and best_val64 is not None, f'[v3-large] No valid checkpoint for fold {f}'

    # Assign OOF
    oof_64_l[va_idx] = np.clip(best_val64, min_score, max_score)
    oof_128_l[va_idx] = np.clip(best_val128, min_score, max_score)
    oof_ht_l[va_idx] = np.clip(best_valht, min_score, max_score)

    # Test inference with best checkpoint for all views
    best_model = AutoModelForSequenceClassification.from_pretrained(best_path, num_labels=1, problem_type='regression').to(device)
    t_common = Trainer(model=best_model, args=args, data_collator=collate_fn_l)
    with torch.no_grad():
        flat64 = t_common.predict(test_chunks_64_l).predictions.reshape(-1)
        flat128 = t_common.predict(test_chunks_128_l).predictions.reshape(-1)
        flatht = t_common.predict(test_ht_l).predictions.reshape(-1)
    flat64 = np.clip(flat64, min_score, max_score); flat128 = np.clip(flat128, min_score, max_score); flatht = np.clip(flatht, min_score, max_score)
    test_pred_f64_l[:, f] = length_weighted_aggregate(flat64, essay_idx_t64_l, weights_t64_l, len(test_df)).astype(np.float32)
    test_pred_f128_l[:, f] = length_weighted_aggregate(flat128, essay_idx_t128_l, weights_t128_l, len(test_df)).astype(np.float32)
    test_pred_fht_l[:, f] = flatht.astype(np.float32)

    torch.cuda.empty_cache()
    print(f'[v3-large] Fold {f} done in {time.time()-fold_t0:.1f}s (best SW64 QWK={best_q:.5f})', flush=True)

# Save per-view OOF and test for large seed prefix 'sL'
pd.DataFrame({'essay_id': train_df[id_col], 'oof_deberta': oof_64_l, 'y': y}).to_csv('oof_deberta_base_sL_sw64.csv', index=False)
pd.DataFrame({'essay_id': train_df[id_col], 'oof_deberta': oof_128_l, 'y': y}).to_csv('oof_deberta_base_sL_sw128.csv', index=False)
pd.DataFrame({'essay_id': train_df[id_col], 'oof_deberta': oof_ht_l, 'y': y}).to_csv('oof_deberta_base_sL_ht.csv', index=False)
np.save('test_deberta_base_sL_sw64.npy', test_pred_f64_l.mean(axis=1))
np.save('test_deberta_base_sL_sw128.npy', test_pred_f128_l.mean(axis=1))
np.save('test_deberta_base_sL_ht.npy', test_pred_fht_l.mean(axis=1))

# Also write a default combined with a conservative HT cap (0.55,0.30,0.15) for convenience (final bagger will re-search masked TTA)
oof_tta_l = (0.55*oof_64_l + 0.30*oof_128_l + 0.15*oof_ht_l).astype(np.float32)
test_tta_l = (0.55*test_pred_f64_l.mean(axis=1) + 0.30*test_pred_f128_l.mean(axis=1) + 0.15*test_pred_fht_l.mean(axis=1)).astype(np.float32)
pd.DataFrame({'essay_id': train_df[id_col], 'oof_deberta': oof_tta_l, 'y': y}).to_csv('oof_deberta_base_sL.csv', index=False)
np.save('test_deberta_base_sL.npy', test_tta_l)
print('[v3-large] Saved per-view and combined TTA artifacts for sL (partial folds).', flush=True)

In [3]:
# Rebuild DeBERTa-v3-Large (sL) per-view and combined artifacts from checkpoints across available folds (no retraining)
import os, time, numpy as np, pandas as pd, torch
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, AutoTokenizer
from torch.utils.data import Dataset
from sklearn.metrics import cohen_kappa_score

assert 'train_df' in globals() and 'test_df' in globals() and 'folds_df' in globals(), 'Run setup cells first.'

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
min_score, max_score = 1.0, 6.0
n_splits = int(folds_df['fold'].max()) + 1
y_int = train_df[target_col].astype(int).values

MODEL_NAME_L = 'microsoft/deberta-v3-large'
MAX_LEN_L = 512
HEAD_TOKENS_L = 256

def qwk_int(y_true, y_pred_int):
    return cohen_kappa_score(y_true, y_pred_int, weights='quadratic')

# Large tokenizer
if 'tok_l' not in globals():
    tok_l = AutoTokenizer.from_pretrained(MODEL_NAME_L)

# Ensure token ids are available (build with large tokenizer for consistency if missing)
if 'train_ids_all' not in globals() or 'test_ids_all' not in globals():
    tok_train = tok_l(train_df[text_col].tolist(), add_special_tokens=False, return_attention_mask=False, return_token_type_ids=False)
    tok_test  = tok_l(test_df[text_col].tolist(),  add_special_tokens=False, return_attention_mask=False, return_token_type_ids=False)
    train_ids_all = tok_train['input_ids']
    test_ids_all  = tok_test['input_ids']

def chunkify_l(ids, max_len=MAX_LEN_L, stride=64):
    usable = max_len - 2
    if len(ids) <= usable:
        chunks = [ids]
    else:
        chunks = []
        start = 0
        while start < len(ids):
            end = min(start + usable, len(ids))
            chunks.append(ids[start:end])
            if end == len(ids): break
            start += stride
    input_ids = []; attn = []; weights = []
    for ch in chunks:
        built = tok_l.build_inputs_with_special_tokens(ch)
        padded = tok_l.pad({'input_ids':[built]}, padding='max_length', max_length=max_len, return_tensors='pt')
        input_ids.append(padded['input_ids'][0]); attn.append(padded['attention_mask'][0]); weights.append(float(len(ch)))
    return input_ids, attn, weights

class ChunkDatasetL(Dataset):
    def __init__(self, ids_list, stride):
        self.inputs=[]; self.attns=[]; self.essay_idx=[]; self.weights=[]
        for i, ids in enumerate(ids_list):
            inp, att, w = chunkify_l(ids, MAX_LEN_L, stride=stride)
            self.inputs.extend(inp); self.attns.extend(att);
            self.essay_idx.extend([i]*len(inp)); self.weights.extend(w)
    def __len__(self): return len(self.inputs)
    def __getitem__(self, idx):
        return {'input_ids': self.inputs[idx], 'attention_mask': self.attns[idx]}

class HTInferDatasetL(Dataset):
    def __init__(self, ids_list, head_tokens=HEAD_TOKENS_L):
        self.inputs=[]; self.attns=[]; self.essay_idx=[]
        tail = MAX_LEN_L - 2 - head_tokens
        for i, ids in enumerate(ids_list):
            if len(ids) <= MAX_LEN_L - 2:
                core = ids
            else:
                core = ids[:head_tokens] + (ids[-tail:] if tail>0 else [])
            built = tok_l.build_inputs_with_special_tokens(core)
            padded = tok_l.pad({'input_ids':[built]}, padding='max_length', max_length=MAX_LEN_L, return_tensors='pt')
            self.inputs.append(padded['input_ids'][0]); self.attns.append(padded['attention_mask'][0]); self.essay_idx.append(i)
    def __len__(self): return len(self.inputs)
    def __getitem__(self, idx):
        return {'input_ids': self.inputs[idx], 'attention_mask': self.attns[idx]}

def collate_fn_l(batch):
    return {k: torch.stack([x[k] for x in batch]) for k in batch[0].keys()}

def length_weighted_aggregate(flat_preds, essay_idx, weights, n_items):
    agg = np.zeros(n_items, dtype=np.float32); wsum = np.zeros(n_items, dtype=np.float32)
    np.add.at(agg, essay_idx, flat_preds * weights); np.add.at(wsum, essay_idx, weights)
    return agg / np.clip(wsum, 1e-6, None)

# Build test datasets for three views
test_sw64 = ChunkDatasetL(test_ids_all, stride=64)
test_sw128 = ChunkDatasetL(test_ids_all, stride=128)
test_ht = HTInferDatasetL(test_ids_all, head_tokens=HEAD_TOKENS_L)
essay_idx_t64 = np.array(test_sw64.essay_idx, dtype=np.int64); weights_t64 = np.array(test_sw64.weights, dtype=np.float32)
essay_idx_t128 = np.array(test_sw128.essay_idx, dtype=np.int64); weights_t128 = np.array(test_sw128.weights, dtype=np.float32)

# Holders
oof_64 = np.zeros(len(train_df), dtype=np.float32)
oof_128 = np.zeros(len(train_df), dtype=np.float32)
oof_ht = np.zeros(len(train_df), dtype=np.float32)
test_pred_f64 = np.zeros((len(test_df), n_splits), dtype=np.float32)
test_pred_f128 = np.zeros((len(test_df), n_splits), dtype=np.float32)
test_pred_fht = np.zeros((len(test_df), n_splits), dtype=np.float32)

# Folds to evaluate: those with outputsL_fold{f} present
folds_to_eval = [f for f in range(n_splits) if os.path.isdir(f'outputsL_fold{f}')]
print('[sL-rebuild] Evaluating folds:', folds_to_eval, flush=True)

for f in folds_to_eval:
    va_idx = folds_df.index[folds_df['fold']==f].to_numpy()
    # Build validation datasets
    val_sw64 = ChunkDatasetL([train_ids_all[i] for i in va_idx], stride=64)
    val_sw128 = ChunkDatasetL([train_ids_all[i] for i in va_idx], stride=128)
    val_ht = HTInferDatasetL([train_ids_all[i] for i in va_idx], head_tokens=HEAD_TOKENS_L)
    essay_idx_v64 = np.array(val_sw64.essay_idx, dtype=np.int64); weights_v64 = np.array(val_sw64.weights, dtype=np.float32)
    essay_idx_v128 = np.array(val_sw128.essay_idx, dtype=np.int64); weights_v128 = np.array(val_sw128.weights, dtype=np.float32)

    chk_dir = f'outputsL_fold{f}'
    ckpts = []
    for d in os.listdir(chk_dir):
        if d.startswith('checkpoint-'):
            try: step = int(d.split('-')[-1])
            except: step = -1
            ckpts.append((step, os.path.join(chk_dir, d)))
    ckpts.sort()
    candidates = [p for _, p in ckpts[-4:]] if ckpts else []
    assert candidates, f'No checkpoints found for large fold {f}'

    args = TrainingArguments(output_dir=chk_dir, per_device_eval_batch_size=32, dataloader_num_workers=2, bf16_full_eval=True, report_to=[])

    def eval_view(model_path, ds, aggregate, essay_idx, weights=None, n_items=None):
        model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=1, problem_type='regression').to(device)
        trainer = Trainer(model=model, args=args, data_collator=collate_fn_l)
        with torch.no_grad():
            flat = trainer.predict(ds).predictions.reshape(-1)
        flat = np.clip(flat, min_score, max_score)
        if aggregate:
            return length_weighted_aggregate(flat, essay_idx, weights, n_items).astype(np.float32)
        else:
            return flat.astype(np.float32)

    best_q = -1.0; best_path = None; best_val64=None; best_val128=None; best_valht=None
    base_bins = [-np.inf,1.5,2.5,3.5,4.5,5.5,np.inf]
    for path in candidates:
        val64 = eval_view(path, val_sw64, True, essay_idx_v64, weights_v64, len(va_idx))
        q = qwk_int(y_int[va_idx], np.digitize(val64, base_bins))
        if q > best_q:
            best_q = q; best_path = path; best_val64 = val64
            best_val128 = eval_view(path, val_sw128, True, essay_idx_v128, weights_v128, len(va_idx))
            # HT single-view
            model_ht = AutoModelForSequenceClassification.from_pretrained(path, num_labels=1, problem_type='regression').to(device)
            trainer_ht = Trainer(model=model_ht, args=args, data_collator=collate_fn_l)
            with torch.no_grad():
                flat_ht = trainer_ht.predict(val_ht).predictions.reshape(-1)
            best_valht = np.clip(flat_ht, min_score, max_score).astype(np.float32)

    # Assign OOF for this fold
    oof_64[va_idx] = np.clip(best_val64, min_score, max_score)
    oof_128[va_idx] = np.clip(best_val128, min_score, max_score)
    oof_ht[va_idx] = np.clip(best_valht, min_score, max_score)

    # Test inference from best checkpoint
    best_model = AutoModelForSequenceClassification.from_pretrained(best_path, num_labels=1, problem_type='regression').to(device)
    trainer = Trainer(model=best_model, args=args, data_collator=collate_fn_l)
    with torch.no_grad():
        flat64 = trainer.predict(test_sw64).predictions.reshape(-1)
        flat128 = trainer.predict(test_sw128).predictions.reshape(-1)
        flatht = trainer.predict(test_ht).predictions.reshape(-1)
    flat64 = np.clip(flat64, min_score, max_score); flat128 = np.clip(flat128, min_score, max_score); flatht = np.clip(flatht, min_score, max_score)
    test_pred_f64[:, f] = length_weighted_aggregate(flat64, essay_idx_t64, weights_t64, len(test_df)).astype(np.float32)
    test_pred_f128[:, f] = length_weighted_aggregate(flat128, essay_idx_t128, weights_t128, len(test_df)).astype(np.float32)
    test_pred_fht[:, f] = flatht.astype(np.float32)
    torch.cuda.empty_cache()
    print(f'[sL-rebuild] Fold {f} best SW64 QWK={best_q:.5f}', flush=True)

# Save per-view and combined artifacts for sL (partial folds supported)
pd.DataFrame({'essay_id': train_df[id_col], 'oof_deberta': oof_64, 'y': y_int}).to_csv('oof_deberta_base_sL_sw64.csv', index=False)
pd.DataFrame({'essay_id': train_df[id_col], 'oof_deberta': oof_128, 'y': y_int}).to_csv('oof_deberta_base_sL_sw128.csv', index=False)
pd.DataFrame({'essay_id': train_df[id_col], 'oof_deberta': oof_ht, 'y': y_int}).to_csv('oof_deberta_base_sL_ht.csv', index=False)
np.save('test_deberta_base_sL_sw64.npy', test_pred_f64.mean(axis=1))
np.save('test_deberta_base_sL_sw128.npy', test_pred_f128.mean(axis=1))
np.save('test_deberta_base_sL_ht.npy', test_pred_fht.mean(axis=1))

# Conservative default TTA mix; bagging cell will re-opt with masks
oof_tta = (0.55*oof_64 + 0.30*oof_128 + 0.15*oof_ht).astype(np.float32)
test_tta = (0.55*test_pred_f64.mean(axis=1) + 0.30*test_pred_f128.mean(axis=1) + 0.15*test_pred_fht.mean(axis=1)).astype(np.float32)
pd.DataFrame({'essay_id': train_df[id_col], 'oof_deberta': oof_tta, 'y': y_int}).to_csv('oof_deberta_base_sL.csv', index=False)
np.save('test_deberta_base_sL.npy', test_tta)
print('[sL-rebuild] Saved per-view and combined artifacts for sL from existing checkpoints.', flush=True)



You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


[sL-rebuild] Evaluating folds: [0, 1, 4]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[sL-rebuild] Fold 0 best SW64 QWK=0.78923


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[sL-rebuild] Fold 1 best SW64 QWK=0.76851


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[sL-rebuild] Fold 4 best SW64 QWK=0.76655


[sL-rebuild] Saved per-view and combined artifacts for sL from existing checkpoints.
