In [1]:
# Force pure eager mode (disable Torch compile/Inductor/Triton) BEFORE importing torch/transformers.
# These are IPython `%env` line magics: each one sets the named environment variable
# for the kernel process and echoes the assignment in the cell output.
# The last flag silences the HuggingFace tokenizers parallelism setting.
%env TORCHDYNAMO_DISABLE=1
%env TORCH_COMPILE_DISABLE=1
%env TORCHINDUCTOR_DISABLE=1
%env TRITON_DISABLE=1
%env XFORMERS_FORCE_DISABLE_TRITON=1
%env TOKENIZERS_PARALLELISM=false

import os

# Mirror the %env magics in plain Python so the same flags are present in
# os.environ regardless of how the magics were processed.
os.environ.update({
    'TORCHDYNAMO_DISABLE': '1',
    'TORCH_COMPILE_DISABLE': '1',
    'TORCHINDUCTOR_DISABLE': '1',
    'TRITON_DISABLE': '1',
    'XFORMERS_FORCE_DISABLE_TRITON': '1',
    'TOKENIZERS_PARALLELISM': 'false',
})

import torch

# TorchDynamo is already disabled through the environment variables set above.
# As an extra safety net, make any remaining torch.compile path fall back to
# eager execution instead of raising, and clear cached compilation state.
try:
    import torch._dynamo as dynamo
    dynamo.config.suppress_errors = True
    torch._dynamo.reset()
except Exception as exc:
    # Best-effort: report the problem instead of hiding it, but keep going.
    print(f'[eager-setup] could not configure torch._dynamo: {exc!r}')

# Prefer plain math attention (avoid Triton-backed flash/sdpa kernels).
# `torch.backends.cuda.sdp_kernel(...)` is a context manager, so calling it
# without a `with` block changes nothing. The process-wide toggles below are
# what actually select the scaled-dot-product-attention backend.
try:
    torch.backends.cuda.enable_flash_sdp(False)
    torch.backends.cuda.enable_mem_efficient_sdp(False)
    torch.backends.cuda.enable_math_sdp(True)
except Exception as exc:
    # Best-effort: older/newer torch builds may not expose these toggles.
    print(f'[eager-setup] could not select the math SDPA backend: {exc!r}')

print('Eager mode enforced. Ready to import transformers/accelerate safely.')

env: TORCHDYNAMO_DISABLE=1
env: TORCH_COMPILE_DISABLE=1
env: TORCHINDUCTOR_DISABLE=1
env: TRITON_DISABLE=1
env: XFORMERS_FORCE_DISABLE_TRITON=1
env: TOKENIZERS_PARALLELISM=false


Eager mode enforced. Ready to import transformers/accelerate safely.


  self.gen = func(*args, **kwds)


# Plan: AES 2.0 Medal Strategy

Objectives:
- Establish fast, reliable CV and a working baseline ASAP.
- Leverage GPU for transformer fine-tuning; cache features and OOF.
- Optimize QWK via post-processing (threshold search) and robust CV.

Milestone 1: Environment & Data sanity
- Verify GPU (nvidia-smi).
- Load train/test, inspect schema, target distribution, text fields, lengths.
- Decide CV: stratified KFold by score and prompt_id (if present), group-aware if necessary.

Milestone 2: Baseline (fast)
- TF-IDF (char+word) + Ridge/LinearSVR/XGBoost (GPU) regression.
- Round-to-integers and optimized thresholds for QWK.
- Save OOF and test predictions; establish a CV QWK of ~0.80+ quickly.

Milestone 3: Transformer models
- Install torch cu121 stack + Transformers.
- Fine-tune DeBERTa-v3-base or RoBERTa-large (sequence regression).
- Use max_length ~1024 with Longformer/DeBERTa-v3-large if feasible; else chunking + mean/max pooling.
- CV with 5 folds, early stopping; log time per fold.
- Optimize prediction-to-label mapping (isotonic or threshold search).

Milestone 4: Ensembling
- Blend TF-IDF model with transformer OOF (weighted).
- Try multiple seeds/models; weight by OOF.

Milestone 5: Error Analysis & Refinements
- Bucket by prompt/length/score; address calibration.
- Feature augmentation: readability scores, basic counts, prompt_id embeddings.

Validation Discipline:
- Single, deterministic folds saved to disk and reused.
- All preprocessors fit inside folds.
- Multiple seeds; track ΔOOF per change.

Next: Run GPU check and basic EDA, then request expert review of plan and CV setup.

In [33]:
import os, sys, time, subprocess, textwrap
import pandas as pd
import numpy as np

def run(cmd):
    """Echo a command, execute it, and print its combined stdout/stderr.

    Args:
        cmd: Sequence of strings forming the argv passed to ``subprocess.run``.

    Returns:
        None. A non-zero exit status is not an error (``check=False``); any
        exception raised while launching or printing is caught and reported
        as ``Command failed: ...``.
    """
    command_line = ' '.join(cmd)
    print(f"$ {command_line}", flush=True)
    try:
        completed = subprocess.run(
            cmd,
            check=False,
            text=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,  # fold stderr into the captured stdout
        )
        print(completed.stdout)
    except Exception as err:
        print(f"Command failed: {err}")

print('=== Environment: GPU check (nvidia-smi) ===', flush=True)
# `|| true` keeps the shell's exit status at 0 even when nvidia-smi fails.
run(['bash', '-lc', 'nvidia-smi || true'])

t0 = time.time()
print('=== Loading data ===', flush=True)
train_path, test_path = 'train.csv', 'test.csv'
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)
print(f"train shape: {train.shape}")
print(f"test shape: {test.shape}")
print('train columns:', train.columns.tolist())
print('test columns:', test.columns.tolist())

# Identify key columns (text/target names are hardcoded; id/prompt are matched by substring)
target_col = None
if 'score' in train.columns:
    target_col = 'score'
id_cols = list(filter(lambda name: 'id' in name.lower(), train.columns))
text_col = None
if 'full_text' in train.columns:
    text_col = 'full_text'
prompt_candidates = list(filter(lambda name: 'prompt' in name.lower(), train.columns))
prompt_col = next(iter(prompt_candidates), None)  # first match, or None when there is none
print('Detected columns -> id:', id_cols, ' text:', text_col, ' prompt:', prompt_col, ' target:', target_col)

print('\n=== Head(train) ===')
print(train.head(3))
print('\n=== Head(test) ===')
print(test.head(3))

# Target distribution (only when a target column was found)
if target_col:
    print('\nTarget stats:')
    print(train[target_col].describe())
    vc = train[target_col].value_counts().sort_index()
    print('value_counts:', vc.to_dict())

# Character and whitespace-token length statistics for both splits
if text_col:
    print('\nText length stats (chars) on train:')
    lens = train[text_col].astype(str).map(len)
    print(lens.describe())
    print('Word count stats on train:')
    wcnt = train[text_col].astype(str).str.split().map(len)
    print(wcnt.describe())
    print('Test text length stats (chars):')
    lens_te = test[text_col].astype(str).map(len)
    print(lens_te.describe())
    print('Test word count stats:')
    wcnt_te = test[text_col].astype(str).str.split().map(len)
    print(wcnt_te.describe())

elapsed = time.time() - t0
print(f'=== EDA setup done in {elapsed:.2f}s ===', flush=True)

=== Environment: GPU check (nvidia-smi) ===


$ bash -lc nvidia-smi || true


Failed to initialize NVML: Unknown Error

=== Loading data ===


train shape: (15576, 3)
test shape: (1731, 2)
train columns: ['essay_id', 'full_text', 'score']
test columns: ['essay_id', 'full_text']
Detected columns -> id: ['essay_id']  text: full_text  prompt: None  target: score

=== Head(train) ===
  essay_id                                          full_text  score
0  663d2cf  Dear State Senator,\n\nI am arguing in favor o...      3
1  3a20bfb  In " The Challenge of Exploring Venus" The aut...      2
2  6adae64  Teachers can have a hard time telling if their...      3

=== Head(test) ===
  essay_id                                          full_text
0  d550b2d  The face was not created by aliens because the...
1  0c10954  Hello my name is Luke Bomberger and I was seag...
2  ef04816  The technology to read the emotional expressio...

Target stats:
count    15576.000000
mean         2.950116
std          1.044384
min          1.000000
25%          2.000000
50%          3.000000
75%          4.000000
max          6.000000
Name: score, dtype: float

count    15576.000000
mean       368.705252
std        150.747833
min        150.000000
25%        253.000000
50%        345.000000
75%        453.000000
max       1656.000000
Name: full_text, dtype: float64
Test text length stats (chars):
count     1731.000000
mean      2055.101098
std        887.263737
min        800.000000
25%       1402.500000
50%       1912.000000
75%       2497.500000
max      10309.000000
Name: full_text, dtype: float64
Test word count stats:
count    1731.000000
mean      365.135760
std       147.183553
min       152.000000
25%       255.000000
50%       340.000000
75%       443.500000
max      1367.000000
Name: full_text, dtype: float64
=== EDA setup done in 0.68s ===


In [None]:
import time, math, json, os
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import cohen_kappa_score
from scipy import sparse

np.random.seed(42)

def qwk(y_true, y_pred_int):
    """Return the quadratic-weighted Cohen's kappa between true and predicted integer labels."""
    kappa = cohen_kappa_score(y_true, y_pred_int, weights='quadratic')
    return kappa

class ThresholdOptimizer:
    """Greedy coordinate search for the cut points that turn continuous
    predictions into ordinal labels ``1..K+1`` while maximising QWK.

    The search works for any number ``K`` of thresholds. With the default five
    thresholds the labels are 1..6 and the search bounds are (0.5, 6.5).
    """

    def __init__(self, init_thresholds=None):
        """Store the starting thresholds (defaults to the half-integer cuts 1.5..5.5)."""
        if init_thresholds is None:
            init_thresholds = [1.5, 2.5, 3.5, 4.5, 5.5]
        self.thresholds = np.array(init_thresholds, dtype=float)

    def _apply(self, preds):
        """Map continuous predictions to integer labels using the current thresholds."""
        th = self.thresholds
        return np.digitize(preds, th) + 1  # bin index 0..K -> label 1..K+1

    def score(self, y_true, preds):
        """QWK of the labels produced by the current thresholds."""
        return qwk(y_true, self._apply(preds))

    def fit(self, y_true, preds, iters=200, step=0.02):
        """Optimise the thresholds on ``(y_true, preds)``.

        Each sweep nudges every threshold by ``-step`` and ``+step`` and keeps a
        move only when it strictly improves QWK. When a full sweep brings no
        improvement the step is halved; the search stops after ``iters`` sweeps
        or once the step drops below 1e-4.

        Returns:
            (best_thresholds, best_score); ``self.thresholds`` is updated too.
        """
        best = self.thresholds.copy()
        best_score = self.score(y_true, preds)
        n_thresholds = len(best)
        # Labels span 1..K+1, so valid cut points lie strictly inside (0.5, K+1.5).
        lower, upper = 0.5, n_thresholds + 1.5
        for _ in range(iters):
            improved = False
            for i in range(n_thresholds):
                for delta in (-step, step):
                    cand = best.copy()
                    cand[i] += delta
                    cand = np.sort(cand)
                    # Enforce bounds and strict ordering for any number of thresholds
                    in_bounds = lower < cand[0] and cand[-1] < upper
                    if not (in_bounds and np.all(np.diff(cand) > 0)):
                        continue
                    s = qwk(y_true, np.digitize(preds, cand) + 1)
                    if s > best_score:
                        best_score, best = s, cand
                        improved = True
            if not improved:
                step *= 0.5
                if step < 1e-4:
                    break
        self.thresholds = best
        return best, best_score

print('=== Building CV folds and TF-IDF baseline ===', flush=True)
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
text_col, target_col = 'full_text', 'score'

# Length bins to stabilize stratification: quintiles of the character length
char_lengths = train[text_col].astype(str).str.len()
len_bins = pd.qcut(char_lengths, q=5, duplicates='drop', labels=False)
# Stratification key is "<score>_<length bin>"
strat_labels = train[target_col].astype(int).astype(str) + '_' + len_bins.astype(int).astype(str)

n_folds = 5
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
folds = np.full(len(train), -1, dtype=int)  # -1 marks "not assigned yet"
for fold, (_, val_idx) in enumerate(skf.split(train, strat_labels)):
    folds[val_idx] = fold
fold_table = pd.DataFrame({'essay_id': train['essay_id'], 'fold': folds})
fold_table.to_csv('folds.csv', index=False)
print('Saved folds.csv')

# Out-of-fold predictions and per-fold test predictions
oof = np.zeros(len(train), dtype=float)
test_preds_folds = []

t_start = time.time()
for fold in range(n_folds):
    f0 = time.time()
    tr_idx = np.flatnonzero(folds != fold)
    va_idx = np.flatnonzero(folds == fold)
    X_tr_text = train.loc[tr_idx, text_col].astype(str)
    X_va_text = train.loc[va_idx, text_col].astype(str)
    y_tr = train.loc[tr_idx, target_col].values.astype(float)

    # Vectorizers are fit INSIDE the fold, on the training part only
    word_tfidf = TfidfVectorizer(ngram_range=(1,2), min_df=3, sublinear_tf=True, max_features=80000)
    char_tfidf = TfidfVectorizer(analyzer='char_wb', ngram_range=(3,5), min_df=3, sublinear_tf=True, max_features=120000)

    # Word features first, char features second; same column order everywhere
    Xw_tr = word_tfidf.fit_transform(X_tr_text)
    Xc_tr = char_tfidf.fit_transform(X_tr_text)
    X_tr = sparse.hstack([Xw_tr, Xc_tr]).tocsr()

    Xw_va = word_tfidf.transform(X_va_text)
    Xc_va = char_tfidf.transform(X_va_text)
    X_va = sparse.hstack([Xw_va, Xc_va]).tocsr()

    # Ridge regression on the stacked sparse features
    model = Ridge(alpha=4.0, random_state=42)
    model.fit(X_tr, y_tr)
    oof[va_idx] = model.predict(X_va)

    # Test predictions from this fold's vectorizers and model
    Xw_te = word_tfidf.transform(test[text_col].astype(str))
    Xc_te = char_tfidf.transform(test[text_col].astype(str))
    X_te = sparse.hstack([Xw_te, Xc_te]).tocsr()
    te_pred = model.predict(X_te)
    test_preds_folds.append(te_pred.astype(float))

    # Per-fold sanity check: QWK after plain rounding to 1..6
    va_true = train.loc[va_idx, target_col].values.astype(int)
    va_round = np.clip(np.rint(oof[va_idx]), 1, 6).astype(int)
    fold_qwk_round = qwk(va_true, va_round)
    fold_seconds = time.time() - f0
    print(f'Fold {fold}: n_tr={len(tr_idx)} n_va={len(va_idx)} round-QWK={fold_qwk_round:.4f} elapsed={fold_seconds:.1f}s', flush=True)

elapsed = time.time() - t_start
print(f'All folds done in {elapsed/60:.1f} min', flush=True)

# Threshold optimization on the pooled OOF predictions
y_all = train[target_col].values.astype(int)
oof_clipped = np.clip(oof, 0.5, 6.5)
opt = ThresholdOptimizer()
init_th = [1.5,2.5,3.5,4.5,5.5]
best_th, best_oof_qwk = opt.fit(y_all, oof_clipped, iters=200, step=0.05)
oof_labels = opt._apply(oof_clipped)
round_qwk = qwk(y_all, np.clip(np.rint(oof),1,6).astype(int))
print(f'OOF round-QWK={round_qwk:.5f}  OOF thresh-QWK={best_oof_qwk:.5f}  thresholds={best_th}', flush=True)

# Average the per-fold test predictions, then cut them with the tuned thresholds
test_pred_mean = np.clip(np.vstack(test_preds_folds).mean(axis=0), 0.5, 6.5)
test_labels = np.clip(np.digitize(test_pred_mean, best_th) + 1, 1, 6).astype(int)

# Persist OOF/test predictions, thresholds and the submission file
np.save('oof_tfidf.npy', oof)
np.save('test_tfidf.npy', test_pred_mean)
threshold_report = {
    'thresholds': best_th.tolist(),
    'oof_qwk': float(best_oof_qwk),
    'round_oof_qwk': float(round_qwk),
}
with open('thresholds_tfidf.json','w') as f:
    json.dump(threshold_report, f)

sub = pd.DataFrame({'essay_id': test['essay_id'], 'score': test_labels})
sub.to_csv('submission.csv', index=False)
print('Saved submission.csv, oof_tfidf.npy, test_tfidf.npy, thresholds_tfidf.json')

In [None]:
import os, sys, subprocess, shutil, time
from pathlib import Path

def pip(*args):
    """Run ``<current interpreter> -m pip <args...>`` after echoing the arguments.

    Raises:
        subprocess.CalledProcessError: if pip exits with a non-zero status
            (``check=True``).
    """
    print(">", *args, flush=True)
    command = [sys.executable, "-m", "pip"]
    command.extend(args)
    subprocess.run(command, check=True)

# Reinstall a pinned PyTorch cu121 build plus the HuggingFace stack, then verify
# that the imported torch reports a 12.1 CUDA build and sees a GPU.
# The steps are order-dependent: uninstall -> delete leftovers -> install torch
# -> write constraints -> install the NLP stack -> sanity asserts.
print("=== Install PyTorch cu121 + NLP stack ===", flush=True)
# Uninstall any stray stacks (best-effort)
for pkg in ("torch","torchvision","torchaudio"):
    # check=False: a package that is not installed must not abort the cell
    subprocess.run([sys.executable, "-m", "pip", "uninstall", "-y", pkg], check=False)

# Remove leftover package / dist-info directories from the pip target directory
# so they cannot shadow the fresh install. Paths are specific to this environment.
for d in (
    "/app/.pip-target/torch",
    "/app/.pip-target/torch-2.8.0.dist-info",
    "/app/.pip-target/torch-2.4.1.dist-info",
    "/app/.pip-target/torchvision",
    "/app/.pip-target/torchvision-0.23.0.dist-info",
    "/app/.pip-target/torchvision-0.19.1.dist-info",
    "/app/.pip-target/torchaudio",
    "/app/.pip-target/torchaudio-2.8.0.dist-info",
    "/app/.pip-target/torchaudio-2.4.1.dist-info",
    "/app/.pip-target/torchgen",
    "/app/.pip-target/functorch",
):
    if os.path.exists(d):
        print("Removing", d)
        shutil.rmtree(d, ignore_errors=True)

# 1) Install exact cu121 stack
pip("install",
    "--index-url", "https://download.pytorch.org/whl/cu121",
    "--extra-index-url", "https://pypi.org/simple",
    "torch==2.4.1", "torchvision==0.19.1", "torchaudio==2.4.1")

# 2) Freeze torch versions
Path("constraints.txt").write_text("torch==2.4.1\ntorchvision==0.19.1\ntorchaudio==2.4.1\n")

# 3) Install Transformers stack honoring constraints
pip("install", "-c", "constraints.txt",
    "transformers==4.44.2", "accelerate==0.34.2",
    "datasets==2.21.0", "evaluate==0.4.2",
    "sentencepiece", "scikit-learn", "torchmetrics",
    "--upgrade-strategy", "only-if-needed")

# NOTE(review): if torch was already imported earlier in this kernel, this import
# returns the previously loaded module rather than the fresh install - confirm
# the kernel is restarted after this cell.
import torch
print("torch:", torch.__version__, "built CUDA:", getattr(torch.version, "cuda", None))
print("CUDA available:", torch.cuda.is_available())
# Fail fast when the wrong wheel was picked up or no GPU is visible
assert str(getattr(torch.version, "cuda", "")).startswith("12.1"), f"Wrong CUDA build: {torch.version.cuda}"
assert torch.cuda.is_available(), "CUDA not available"
print("GPU:", torch.cuda.get_device_name(0))
print("=== Install & GPU sanity OK ===")

In [None]:
import time
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.model_selection import StratifiedGroupKFold

# Derive pseudo-prompt groups by clustering essays (TF-IDF -> SVD -> KMeans) and
# build 5 folds that are stratified by score and grouped by cluster id.
# The result is written to folds_grouped.csv.
print('=== Building prompt clusters and grouped folds ===', flush=True)
t0 = time.time()
train = pd.read_csv('train.csv')
text_col = 'full_text'
target_col = 'score'

# TF-IDF -> SVD on TRAIN only
tfidf = TfidfVectorizer(ngram_range=(1,2), min_df=5, max_features=100000, sublinear_tf=True)
X = tfidf.fit_transform(train[text_col].astype(str))
print(f'TFIDF shape: {X.shape}', flush=True)

# Reduce the sparse TF-IDF matrix to 50 dense components before clustering
svd = TruncatedSVD(n_components=50, random_state=42)
X_svd = svd.fit_transform(X)
print('SVD done.', flush=True)

# k is the number of clusters used as groups
# NOTE(review): presumably chosen to approximate the number of prompts - confirm
k = 12
kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
clusters = kmeans.fit_predict(X_svd)
print('KMeans done.', flush=True)

train['cluster'] = clusters.astype(int)

# StratifiedGroupKFold: stratify by score, group by cluster
sgkf = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)
folds_g = np.full(len(train), -1, dtype=int)  # -1 marks "not assigned yet"
for fold, (_, va_idx) in enumerate(sgkf.split(X_svd, train[target_col].astype(int), groups=train['cluster'])):
    folds_g[va_idx] = fold

# Every row must have landed in exactly one validation fold
assert (folds_g >= 0).all(), 'Some rows not assigned a fold'
fold_df = pd.DataFrame({'essay_id': train['essay_id'], 'fold_grouped': folds_g, 'cluster': train['cluster']})
fold_df.to_csv('folds_grouped.csv', index=False)
print('Saved folds_grouped.csv with grouped folds and clusters')
print('Cluster distribution:', train['cluster'].value_counts().sort_index().to_dict())
print('Fold sizes:', pd.Series(folds_g).value_counts().sort_index().to_dict())
print(f'=== Done in {(time.time()-t0):.1f}s ===', flush=True)

In [None]:
import os, time, json, math, random, gc
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, Trainer, TrainingArguments
from sklearn.metrics import cohen_kappa_score

# Allow TF32 in CUDA matmuls and request the 'high' float32 matmul precision mode.
torch.backends.cuda.matmul.allow_tf32 = True
torch.set_float32_matmul_precision('high')
SEED = 42

def seed_everything(seed=SEED):
    """Seed Python's `random`, NumPy, and PyTorch (CPU and all CUDA devices) with `seed`."""
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
        seeder(seed)

# Seed once at cell execution time with the default SEED
seed_everything()

def qwk(y_true, y_pred_int):
    """Quadratic-weighted Cohen's kappa between true labels and integer predictions."""
    score = cohen_kappa_score(y_true, y_pred_int, weights='quadratic')
    return score

class ThresholdOptimizer:
    """Greedy coordinate search for the cut points that turn continuous
    predictions into ordinal labels ``1..K+1`` while maximising QWK.

    Works for any number ``K`` of thresholds. With the default five thresholds
    the labels are 1..6 and the search bounds are (0.5, 6.5).
    """

    def __init__(self, init_thresholds=None):
        """Store the starting thresholds (defaults to the half-integer cuts 1.5..5.5)."""
        if init_thresholds is None:
            init_thresholds = [1.5, 2.5, 3.5, 4.5, 5.5]
        self.thresholds = np.array(init_thresholds, dtype=float)

    def _apply(self, preds):
        """Map continuous predictions to integer labels using the current thresholds."""
        return np.digitize(preds, self.thresholds) + 1

    def fit(self, y_true, preds, iters=200, step=0.05):
        """Optimise the thresholds on ``(y_true, preds)``.

        Each sweep nudges every threshold by ``-step`` and ``+step`` and keeps a
        move only when it strictly improves QWK. When a full sweep brings no
        improvement the step is halved; the search stops after ``iters`` sweeps
        or once the step drops below 1e-4.

        Returns:
            (best_thresholds, best_score); ``self.thresholds`` is updated too.
        """
        best = self.thresholds.copy()
        best_score = qwk(y_true, self._apply(preds))
        n_thresholds = len(best)
        # Labels span 1..K+1, so valid cut points lie strictly inside (0.5, K+1.5).
        lower, upper = 0.5, n_thresholds + 1.5
        for _ in range(iters):
            improved = False
            for i in range(n_thresholds):
                for d in (-step, step):
                    shifted = best.copy()
                    shifted[i] += d
                    cand = np.sort(np.clip(shifted, lower, upper))
                    # Reject candidates that touch a bound or collapse two cut points
                    in_bounds = lower < cand[0] and cand[-1] < upper
                    if not (in_bounds and np.all(np.diff(cand) > 0)):
                        continue
                    s = qwk(y_true, np.digitize(preds, cand) + 1)
                    if s > best_score:
                        best_score, best, improved = s, cand, True
            if not improved:
                step *= 0.5
                if step < 1e-4:
                    break
        self.thresholds = best
        return best, best_score

# Backbone checkpoint used by this cell's tokenizer and model.
MODEL_NAME = 'microsoft/deberta-v3-base'
# Maximum sequence length, special tokens included (see encode_head_tail).
MAX_LEN = 1024
# Share of the content-token budget given to the start of an over-long essay;
# the remainder is taken from its end.
HEAD_FRAC = 0.88  # dynamic head emphasis

# Module-level tokenizer shared by encode_head_tail and PadCollator.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

def encode_head_tail(text):
    """Tokenize `text` into at most MAX_LEN ids, keeping the head and tail of long inputs.

    Inputs that fit are encoded as ``[CLS] ids [SEP]``. Longer inputs become
    ``[CLS] head [SEP] tail [SEP]``, where the ``MAX_LEN - 3`` content slots are
    split by HEAD_FRAC between the first and the last tokens.

    Returns:
        dict with 'input_ids' and an all-ones 'attention_mask' of the same length.
    """
    token_ids = tokenizer(text, add_special_tokens=False)['input_ids']
    cls_id, sep_id = tokenizer.cls_token_id, tokenizer.sep_token_id
    if len(token_ids) > MAX_LEN - 2:
        budget = MAX_LEN - 3  # [CLS], mid [SEP], last [SEP]
        n_head = int(HEAD_FRAC * budget)
        n_tail = budget - n_head
        # Guard n_tail == 0: a `[-0:]` slice would return the whole list
        tail_ids = token_ids[-n_tail:] if n_tail > 0 else []
        sequence = [cls_id] + token_ids[:n_head] + [sep_id] + tail_ids + [sep_id]
    else:
        sequence = [cls_id] + token_ids + [sep_id]
    return {'input_ids': sequence, 'attention_mask': [1] * len(sequence)}

class EssayDataset(Dataset):
    """Map-style dataset that head+tail-encodes one essay per item.

    Args:
        df: DataFrame holding the essays.
        text_col: Name of the text column.
        targets: Optional array of scores; stored as float32 and returned
            under the 'labels' key as a scalar tensor.
    """

    def __init__(self, df, text_col='full_text', targets=None):
        self.texts = df[text_col].astype(str).tolist()
        if targets is None:
            self.targets = None
        else:
            self.targets = targets.astype(np.float32)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoded = encode_head_tail(self.texts[idx])
        sample = {}
        for key, values in encoded.items():
            sample[key] = torch.tensor(values, dtype=torch.long)
        if self.targets is not None:
            # 0-dim float tensor; the collator stacks these into shape [B]
            sample['labels'] = torch.tensor(self.targets[idx], dtype=torch.float32)
        return sample

class PadCollator:
    """Collate function: pads token features and re-attaches labels as a flat [B] tensor."""

    def __init__(self):
        self.pad = DataCollatorWithPadding(tokenizer=tokenizer, pad_to_multiple_of=8)

    def __call__(self, features):
        stacked = None
        if 'labels' in features[0]:
            # Pull labels out first so that only token fields reach the padder
            stacked = torch.stack([feat['labels'] for feat in features]).view(-1)  # shape [B]
            for feat in features:
                del feat['labels']
        batch = self.pad(features)
        if stacked is not None:
            batch['labels'] = stacked
        return batch

class SmoothL1Trainer(Trainer):
    """Trainer whose loss is Smooth-L1 (beta=1.0, mean-reduced) on the squeezed model logits."""

    def compute_loss(self, model, inputs, return_outputs=False):
        """Pop 'labels' from `inputs`, run the model, and return the Smooth-L1 loss.

        Returns ``(loss, outputs)`` when `return_outputs` is true, else ``loss``.
        """
        targets = inputs.pop('labels')
        outputs = model(**inputs)
        predictions = outputs.logits.squeeze(-1)  # drop the trailing logit dimension
        loss = torch.nn.functional.smooth_l1_loss(predictions, targets, beta=1.0, reduction='mean')
        if return_outputs:
            return loss, outputs
        return loss

def train_fold(fold, df, folds, out_dir='deberta_base_1024'):
    """Fine-tune the regression model on one CV fold and predict its validation rows.

    Args:
        fold: Fold id used as the validation split; all other folds are training data.
        df: DataFrame with the essay text and a 'score' column.
        folds: Array of per-row fold ids, positionally aligned with `df`.
        out_dir: Root directory for checkpoints (one `fold{fold}` subdirectory each).

    Returns:
        (va_idx, preds_val): positional indices of the validation rows and the
        model's raw (unrounded) predictions for them.
    """
    os.makedirs(out_dir, exist_ok=True)
    tr_idx = np.where(folds != fold)[0]
    va_idx = np.where(folds == fold)[0]
    dtrain = EssayDataset(df.iloc[tr_idx], targets=df.iloc[tr_idx]['score'].values.astype(np.float32))
    dvalid = EssayDataset(df.iloc[va_idx], targets=df.iloc[va_idx]['score'].values.astype(np.float32))
    # Single-output head: the model is trained as a regressor on the raw score
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=1, problem_type='regression')
    args = TrainingArguments(
        output_dir=f"{out_dir}/fold{fold}",
        learning_rate=2e-5,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        weight_decay=0.01,
        fp16=True,
        # Evaluate and checkpoint on the same schedule so the best checkpoint
        # (by rounded QWK) can be restored at the end of training.
        evaluation_strategy='steps',
        save_strategy='steps',
        eval_steps=800,
        save_steps=800,
        save_total_limit=1,
        logging_strategy='steps',
        logging_steps=200,
        load_best_model_at_end=True,
        metric_for_best_model='qwk_round',
        greater_is_better=True,
        report_to=[],
        dataloader_num_workers=6,
        dataloader_pin_memory=True,
        dataloader_persistent_workers=True,
        gradient_accumulation_steps=4,
        lr_scheduler_type='cosine',
        warmup_ratio=0.1,
        optim='adamw_torch_fused',
        eval_accumulation_steps=32,
        seed=SEED
    )

    def compute_metrics(eval_pred):
        """QWK after clipping predictions to [0.5, 6.5] and rounding to 1..6."""
        preds = eval_pred.predictions.squeeze()
        labels = eval_pred.label_ids.squeeze()
        preds_clip = np.clip(preds, 0.5, 6.5)
        q = qwk(labels.astype(int), np.clip(np.rint(preds_clip),1,6).astype(int))
        return {'qwk_round': q}

    trainer = SmoothL1Trainer(
        model=model,
        args=args,
        train_dataset=dtrain,
        eval_dataset=dvalid,
        tokenizer=tokenizer,
        data_collator=PadCollator(),
        compute_metrics=compute_metrics
    )
    t0 = time.time()
    trainer.train()
    print(f"Fold {fold} train done in {(time.time()-t0)/60:.1f} min", flush=True)
    preds_val = trainer.predict(dvalid).predictions.squeeze()
    # Release GPU memory before the next fold. The references must be dropped
    # and garbage collected first; only then can empty_cache() hand the freed
    # blocks back to the driver.
    del trainer, model
    gc.collect()
    torch.cuda.empty_cache()
    return va_idx, preds_val

print('=== DeBERTa-v3-base 1024 head+tail 5-fold training (grouped folds) ===', flush=True)
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
folds_g = pd.read_csv('folds_grouped.csv')
# essay_id -> grouped fold id, re-aligned to the row order of train_df
folds_map = dict(zip(folds_g['essay_id'], folds_g['fold_grouped']))
folds = train_df['essay_id'].map(folds_map).values.astype(int)

# Train every fold in turn and collect its out-of-fold predictions
oof = np.zeros(len(train_df), dtype=float)
for f in sorted(np.unique(folds)):
    f_start = time.time()
    va_idx, preds_val = train_fold(f, train_df, folds)
    oof[va_idx] = preds_val
    y_true = train_df['score'].values[va_idx].astype(int)
    preds_rounded = np.clip(np.rint(np.clip(preds_val, 0.5, 6.5)), 1, 6).astype(int)
    fold_qwk_round = qwk(y_true, preds_rounded)
    fold_minutes = (time.time() - f_start) / 60
    print(f"Fold {f} val round-QWK={fold_qwk_round:.4f} elapsed={fold_minutes:.1f} min", flush=True)

np.save('oof_deberta_base_1024.npy', oof)

# Compare plain rounding against tuned thresholds on the pooled OOF predictions
scores_int = train_df['score'].values.astype(int)
opt = ThresholdOptimizer()
best_th, best_oof_qwk = opt.fit(scores_int, np.clip(oof, 0.5, 6.5))
round_q = qwk(scores_int, np.clip(np.rint(oof), 1, 6).astype(int))
print(f'OOF round-QWK={round_q:.5f}  OOF thresh-QWK={best_oof_qwk:.5f}  thresholds={best_th}', flush=True)
threshold_report = {
    'thresholds': best_th.tolist(),
    'oof_qwk': float(best_oof_qwk),
    'round_oof_qwk': float(round_q),
}
with open('thresholds_deberta_base_1024.json','w') as f:
    json.dump(threshold_report, f)

print('=== Note === Next: add test-time inference and seeds; if OOF <0.835, pivot to 512 sliding windows. ===', flush=True)

In [None]:
import os, time, json, math, random, gc
from collections import defaultdict
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from transformers import (
    AutoTokenizer, AutoModel,
    DataCollatorWithPadding, Trainer, TrainingArguments
)
from sklearn.metrics import cohen_kappa_score

# Banner for this cell.
# NOTE(review): "512/384" presumably means window length / step between windows
# (MAX_LEN minus the STRIDE overlap defined below) - confirm.
print('=== DeBERTa-v3-base sliding windows + mean pooling (512/384) ===', flush=True)
# Allow TF32 in CUDA matmuls and request the 'high' float32 matmul precision mode.
torch.backends.cuda.matmul.allow_tf32 = True
torch.set_float32_matmul_precision('high')
# Device string: 'cuda' when a GPU is visible, otherwise 'cpu'.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
# Set the HuggingFace tokenizers parallelism flag to 'false' for this process.
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

SEED = 42

def seed_everything(seed=SEED):
    """Apply `seed` to Python's `random`, NumPy, and PyTorch (CPU plus every CUDA device)."""
    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all)
    for apply_seed in seeders:
        apply_seed(seed)

# Seed once at cell execution time with the default SEED
seed_everything()

def qwk(y_true, y_pred_int):
    """Cohen's kappa with quadratic weights for integer label predictions."""
    result = cohen_kappa_score(y_true, y_pred_int, weights='quadratic')
    return result

class ThresholdOptimizer:
    """Greedy coordinate search for the cut points that turn continuous
    predictions into ordinal labels ``1..K+1`` while maximising QWK.

    Works for any number ``K`` of thresholds. With the default five thresholds
    the labels are 1..6 and the search bounds are (0.5, 6.5).
    """

    def __init__(self, init_thresholds=None):
        """Store the starting thresholds (defaults to the half-integer cuts 1.5..5.5)."""
        if init_thresholds is None:
            init_thresholds = [1.5, 2.5, 3.5, 4.5, 5.5]
        self.thresholds = np.array(init_thresholds, dtype=float)

    def _apply(self, preds):
        """Map continuous predictions to integer labels using the current thresholds."""
        return np.digitize(preds, self.thresholds) + 1

    def fit(self, y_true, preds, iters=200, step=0.05):
        """Optimise the thresholds on ``(y_true, preds)``.

        Each sweep nudges every threshold by ``-step`` and ``+step`` and keeps a
        move only when it strictly improves QWK. When a full sweep brings no
        improvement the step is halved; the search stops after ``iters`` sweeps
        or once the step drops below 1e-4.

        Returns:
            (best_thresholds, best_score); ``self.thresholds`` is updated too.
        """
        best = self.thresholds.copy()
        best_score = qwk(y_true, self._apply(preds))
        n_thresholds = len(best)
        # Labels span 1..K+1, so valid cut points lie strictly inside (0.5, K+1.5).
        lower, upper = 0.5, n_thresholds + 1.5
        for _ in range(iters):
            improved = False
            for i in range(n_thresholds):
                for d in (-step, step):
                    shifted = best.copy()
                    shifted[i] += d
                    cand = np.sort(np.clip(shifted, lower, upper))
                    # Reject candidates that touch a bound or collapse two cut points
                    in_bounds = lower < cand[0] and cand[-1] < upper
                    if not (in_bounds and np.all(np.diff(cand) > 0)):
                        continue
                    s = qwk(y_true, np.digitize(preds, cand) + 1)
                    if s > best_score:
                        best_score, best, improved = s, cand, True
            if not improved:
                step *= 0.5
                if step < 1e-4:
                    break
        self.thresholds = best
        return best, best_score

# Backbone checkpoint used by this cell's tokenizer and model.
MODEL_NAME = 'microsoft/deberta-v3-base'
# Maximum number of tokens per window (passed as `max_length` to the tokenizer).
MAX_LEN = 512
STRIDE = 128  # overlap tokens between chunks
# Per-device batch sizes handed to TrainingArguments.
BATCH_TRAIN = 8
BATCH_EVAL = 32

# Module-level tokenizer shared by WindowDataset and PadCollator.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

class WindowDataset(Dataset):
    """Dataset of overlapping token windows; each window inherits its essay's id and label.

    Args:
        df: DataFrame with an 'essay_id' column and the text column.
        text_col: Name of the text column.
        labels: Optional per-essay scores; repeated for every window of the essay.
    """

    def __init__(self, df, text_col='full_text', labels=None):
        texts = df[text_col].astype(str).tolist()
        eids = df['essay_id'].tolist()
        lbls = labels.astype(np.float32).tolist() if labels is not None else None
        # One tokenizer call for all essays; over-long texts overflow into
        # extra windows that overlap by STRIDE tokens.
        enc = tokenizer(
            texts,
            max_length=MAX_LEN,
            truncation=True,
            padding=False,
            return_overflowing_tokens=True,
            stride=STRIDE,
            return_attention_mask=True,
        )
        # window index -> index of the essay (row of `df`) it was cut from
        window_to_sample = enc.pop('overflow_to_sample_mapping')
        window_ids = enc['input_ids']
        window_masks = enc['attention_mask']
        self.essay_ids = [eids[sample] for sample in window_to_sample]
        self.input_ids = [window_ids[w] for w, _ in enumerate(window_to_sample)]
        self.attn_masks = [window_masks[w] for w, _ in enumerate(window_to_sample)]
        if lbls is None:
            self.labels = None
        else:
            self.labels = np.array([lbls[sample] for sample in window_to_sample], dtype=np.float32)
        # Token lists stay ragged; the collator pads them per batch

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, i):
        item = {}
        item['input_ids'] = torch.tensor(self.input_ids[i], dtype=torch.long)
        item['attention_mask'] = torch.tensor(self.attn_masks[i], dtype=torch.long)
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[i], dtype=torch.float32)
        item['essay_id'] = self.essay_ids[i]
        return item

class PadCollator:
    """Collate function: drops 'essay_id', pads token fields, re-attaches labels as a flat [B] tensor."""

    def __init__(self):
        self.pad = DataCollatorWithPadding(tokenizer=tokenizer, pad_to_multiple_of=8)

    def __call__(self, features):
        # 'essay_id' is not a tensor field, so remove it before padding
        for feat in features:
            feat.pop('essay_id', None)
        stacked = None
        if 'labels' in features[0]:
            stacked = torch.stack([feat['labels'] for feat in features]).view(-1)
            for feat in features:
                del feat['labels']
        batch = self.pad(features)
        if stacked is not None:
            batch['labels'] = stacked
        return batch

class MeanPoolRegressor(nn.Module):
    """Transformer backbone + attention-masked mean pooling + linear regression head.

    Args:
        model_name: Checkpoint name passed to ``AutoModel.from_pretrained``.
        hidden_size: Width of the backbone's hidden states (input size of the head).
    """

    def __init__(self, model_name, hidden_size=768):
        super().__init__()
        self.backbone = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.1)
        self.head = nn.Linear(hidden_size, 1)

    def forward(self, input_ids=None, attention_mask=None, labels=None):
        """Return ``{'logits': [B, 1]}``, preceded by a Smooth-L1 ``'loss'`` when labels are given."""
        hidden = self.backbone(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state  # [B, T, H]
        weights = attention_mask.unsqueeze(-1).float()  # [B, T, 1]
        # Average over real tokens only; the clamp avoids division by zero for an all-padding row
        token_sum = (hidden * weights).sum(dim=1)
        token_count = weights.sum(dim=1).clamp(min=1e-6)
        pooled = self.dropout(token_sum / token_count)  # [B, H]
        logits = self.head(pooled).squeeze(-1)  # [B]
        output = {'logits': logits.unsqueeze(-1)}
        if labels is not None:
            loss = torch.nn.functional.smooth_l1_loss(logits, labels, beta=1.0, reduction='mean')
            # 'loss' stays the first key, ahead of 'logits'
            output = {'loss': loss, **output}
        return output

# Global var to let compute_metrics know current eval essay_ids order.
# It must list one essay id per evaluated window, in dataset order.
EVAL_ESSAY_IDS = None

def make_compute_metrics():
    """Build the `compute_metrics` callback that scores predictions per essay.

    The returned function averages the per-window predictions of each essay
    (windows are matched to essays through the module-level EVAL_ESSAY_IDS),
    clips the averages to [0.5, 6.5], rounds them to 1..6 and returns
    ``{'qwk_round': <QWK>}``.

    Raises (from the returned function):
        RuntimeError: if EVAL_ESSAY_IDS has not been set.
        ValueError: if predictions, labels and EVAL_ESSAY_IDS differ in length,
            which would otherwise silently misalign windows and essays.
    """
    def compute(eval_pred):
        if EVAL_ESSAY_IDS is None:
            raise RuntimeError('EVAL_ESSAY_IDS must be set before evaluation')
        # reshape(-1) keeps a 1-D array even for a single window
        # (squeeze() would collapse it to 0-d and break the iteration below)
        preds = np.asarray(eval_pred.predictions).reshape(-1)  # per-window
        labels = np.asarray(eval_pred.label_ids).reshape(-1)   # per-window
        ids = list(EVAL_ESSAY_IDS)
        if not (len(preds) == len(labels) == len(ids)):
            raise ValueError(
                f'Window count mismatch: {len(preds)} predictions, '
                f'{len(labels)} labels, {len(ids)} essay ids'
            )
        # Aggregate by essay
        by_id = defaultdict(list)
        by_id_true = {}
        for p, y, i in zip(preds, labels, ids):
            by_id[i].append(float(p))
            by_id_true[i] = int(y)
        agg_preds = []
        agg_true = []
        for i, vals in by_id.items():
            agg_preds.append(np.mean(vals))
            agg_true.append(by_id_true[i])
        agg_preds = np.clip(np.array(agg_preds), 0.5, 6.5)
        agg_labels = np.array(agg_true, dtype=int)
        q = qwk(agg_labels, np.clip(np.rint(agg_preds), 1, 6).astype(int))
        return {'qwk_round': q}
    return compute

def train_fold_windows(fold, df, folds, out_dir='deberta_v3_base_win512'):
    """Fine-tune the window regressor on one CV fold and return essay-level OOF predictions.

    Args:
        fold: fold label held out for validation.
        df: training frame; the code reads its 'essay_id' and 'score' columns and
            passes it to ``WindowDataset``.
        folds: array aligned with ``df`` rows holding each row's fold label.
        out_dir: root directory; checkpoints are written to ``{out_dir}/fold{fold}``.

    Returns:
        ``(va_idx, agg_vec)``: positional indices of the validation rows and the
        mean of the window predictions for each of those rows, in the same order.

    Side effects:
        Sets the module-level ``EVAL_ESSAY_IDS`` that ``make_compute_metrics`` reads.
    """
    os.makedirs(out_dir, exist_ok=True)
    tr_idx = np.where(folds != fold)[0]
    va_idx = np.where(folds == fold)[0]
    dtrain = WindowDataset(df.iloc[tr_idx], labels=df.iloc[tr_idx]['score'].values.astype(np.float32))
    dvalid = WindowDataset(df.iloc[va_idx], labels=df.iloc[va_idx]['score'].values.astype(np.float32))
    model = MeanPoolRegressor(MODEL_NAME, hidden_size=768)
    args = TrainingArguments(
        output_dir=f"{out_dir}/fold{fold}",
        learning_rate=2e-5,
        per_device_train_batch_size=BATCH_TRAIN,
        per_device_eval_batch_size=BATCH_EVAL,
        num_train_epochs=3,
        weight_decay=0.01,
        bf16=True,
        evaluation_strategy='steps',
        save_strategy='steps',
        eval_steps=1000,
        save_steps=1000,
        save_total_limit=1,
        logging_strategy='steps',
        logging_steps=200,
        load_best_model_at_end=True,
        metric_for_best_model='qwk_round',
        greater_is_better=True,
        report_to=[],
        dataloader_num_workers=6,
        dataloader_pin_memory=True,
        dataloader_persistent_workers=True,
        gradient_accumulation_steps=4,
        lr_scheduler_type='cosine',
        warmup_ratio=0.1,
        optim='adamw_torch_fused',
        eval_accumulation_steps=32,
        seed=SEED,
        remove_unused_columns=False
    )
    # compute_metrics has no access to the dataset, so the window -> essay mapping
    # is handed over through a module-level variable.
    global EVAL_ESSAY_IDS
    EVAL_ESSAY_IDS = dvalid.essay_ids
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=dtrain,
        eval_dataset=dvalid,
        tokenizer=tokenizer,
        data_collator=PadCollator(),
        compute_metrics=make_compute_metrics()
    )
    t0 = time.time()
    trainer.train()
    print(f"Fold {fold} train done in {(time.time()-t0)/60:.1f} min", flush=True)
    # Predict on valid windows and aggregate.
    # reshape(-1) instead of squeeze(): squeeze() yields a 0-d array when the
    # validation set has exactly one window, which breaks the zip below.
    preds_val = np.asarray(trainer.predict(dvalid).predictions).reshape(-1)
    ids = np.array(dvalid.essay_ids)
    by_id = defaultdict(list)
    for p, i in zip(preds_val, ids):
        by_id[i].append(float(p))
    agg = {i: float(np.mean(v)) for i, v in by_id.items()}
    # Map back to essay order
    va_eids = df.iloc[va_idx]['essay_id'].values.tolist()
    agg_vec = np.array([agg[e] for e in va_eids], dtype=float)
    # Drop the references first: emptying the CUDA cache while the model and
    # trainer are still alive cannot release their memory.
    del trainer, model
    gc.collect()
    torch.cuda.empty_cache()
    return va_idx, agg_vec

# Out-of-fold driver: train every fold found in folds_grouped.csv, collect the
# essay-level validation predictions, then fit QWK thresholds on the full OOF vector.
train_df = pd.read_csv('train.csv')
folds_g = pd.read_csv('folds_grouped.csv')
folds_map = dict(zip(folds_g['essay_id'], folds_g['fold_grouped']))
fold_series = train_df['essay_id'].map(folds_map)
# An essay_id absent from the fold file maps to NaN, and casting NaN to int gives a
# garbage fold label instead of an error, so fail loudly before training starts.
if fold_series.isna().any():
    raise ValueError(f"{int(fold_series.isna().sum())} essay_id(s) in train.csv have no fold in folds_grouped.csv")
folds = fold_series.values.astype(int)

oof = np.zeros(len(train_df), dtype=float)
for f in sorted(np.unique(folds)):
    f_start = time.time()
    va_idx, agg_preds = train_fold_windows(f, train_df, folds)
    oof[va_idx] = agg_preds
    y_true = train_df.iloc[va_idx]['score'].values.astype(int)
    # Per-fold sanity metric using plain rounding (no tuned thresholds).
    q = qwk(y_true, np.clip(np.rint(np.clip(agg_preds, 0.5, 6.5)), 1, 6).astype(int))
    print(f"Fold {f} val round-QWK={q:.4f} elapsed={(time.time()-f_start)/60:.1f} min", flush=True)

# Persist raw OOF predictions and the thresholds tuned on them.
np.save('oof_deberta_v3_base_win512.npy', oof)
opt = ThresholdOptimizer()
best_th, best_oof_qwk = opt.fit(train_df['score'].values.astype(int), np.clip(oof, 0.5, 6.5))
round_q = qwk(train_df['score'].values.astype(int), np.clip(np.rint(oof), 1, 6).astype(int))
print(f'OOF round-QWK={round_q:.5f}  OOF thresh-QWK={best_oof_qwk:.5f}  thresholds={best_th}', flush=True)
with open('thresholds_deberta_v3_base_win512.json','w') as f:
    json.dump({'thresholds': best_th.tolist(), 'oof_qwk': float(best_oof_qwk), 'round_oof_qwk': float(round_q)}, f)
print('=== Next: add test-time window inference + second seed; then consider v3-large ===', flush=True)

In [None]:
import time
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.model_selection import StratifiedGroupKFold

# Rebuild the cross-validation fold assignment and save it to 'folds_grouped_k16.csv'.
# Essays are clustered on TF-IDF/SVD features; the cluster id is then used as the
# group key for StratifiedGroupKFold, with the score as the stratification target.
print('=== Rebuilding grouped folds: TF-IDF -> SVD(100) -> KMeans(k=16) -> StratifiedGroupKFold ===', flush=True)
t0 = time.time()
train = pd.read_csv('train.csv')
text_col = 'full_text'
target_col = 'score'

# Fit TF-IDF on TRAIN only
tfidf = TfidfVectorizer(ngram_range=(1,2), min_df=5, max_features=120000, sublinear_tf=True)
X = tfidf.fit_transform(train[text_col].astype(str))
print(f'TFIDF shape: {X.shape}', flush=True)

# SVD to 100 components
svd = TruncatedSVD(n_components=100, random_state=42)
X_svd = svd.fit_transform(X)
print('SVD done.', flush=True)

# KMeans k=16
k = 16
kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
clusters = kmeans.fit_predict(X_svd)
# NOTE: this adds a column to `train` in place; re-running the cell overwrites it.
train['cluster_k16'] = clusters.astype(int)
print('KMeans done.', flush=True)

# StratifiedGroupKFold: stratify by score, group by new clusters
sgkf = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)
# -1 marks "not yet assigned" so the assert below can detect rows missed by the splitter.
folds_g = np.full(len(train), -1, dtype=int)
for fold, (_, va_idx) in enumerate(sgkf.split(X_svd, train[target_col].astype(int), groups=train['cluster_k16'])):
    folds_g[va_idx] = fold
assert (folds_g >= 0).all(), 'Some rows not assigned a fold'

# One row per essay: id, fold label and the cluster it was grouped by.
fold_df = pd.DataFrame({
    'essay_id': train['essay_id'],
    'fold_grouped_k16': folds_g,
    'cluster_k16': train['cluster_k16']
})
fold_df.to_csv('folds_grouped_k16.csv', index=False)
print('Saved folds_grouped_k16.csv')
# Quick diagnostics: cluster sizes and resulting fold sizes.
print('Cluster (k=16) distribution:', train['cluster_k16'].value_counts().sort_index().to_dict())
print('Fold sizes:', pd.Series(folds_g).value_counts().sort_index().to_dict())
print(f'=== Done in {(time.time()-t0):.1f}s ===', flush=True)

In [11]:
import os, time, json, math, random, gc
from collections import defaultdict
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from transformers import (
    AutoTokenizer, AutoModel,
    DataCollatorWithPadding, Trainer, TrainingArguments, PrinterCallback
)
from sklearn.metrics import cohen_kappa_score

# Runtime configuration for the seed-2 cell: matmul/cuDNN settings, device selection
# and environment flags that switch off torch.compile/inductor.
print('=== Seed 2 prep: DeBERTa-v3-base windows + mean pooling + Multi-Sample Dropout (512/384) ===', flush=True)
torch.backends.cuda.matmul.allow_tf32 = True
torch.set_float32_matmul_precision('high')
torch.backends.cudnn.benchmark = True
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
# Disable any implicit torch.compile/inductor paths to avoid Triton build (Python.h) issues
os.environ['TORCH_COMPILE_DISABLE'] = '1'
os.environ['TORCHINDUCTOR_DISABLE'] = '1'
# Best effort: if torch._dynamo cannot be imported or configured, continue without it.
try:
    import torch._dynamo as dynamo
    dynamo.config.suppress_errors = True
except Exception:
    pass

SEED2 = 2025


def seed_everything(seed=SEED2):
    """Seed Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices)."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)


# Apply the default seed as soon as the cell runs.
seed_everything()

def qwk(y_true, y_pred_int):
    """Quadratic weighted kappa between true labels and integer predictions."""
    kappa = cohen_kappa_score(y_true, y_pred_int, weights='quadratic')
    return kappa

class ThresholdOptimizer:
    """Coordinate-wise search for score cut points that maximise QWK.

    Continuous predictions are mapped to integer classes with
    ``np.digitize(preds, thresholds) + 1``, so ``k`` thresholds produce classes
    ``1..k+1``. Any number of thresholds is supported; the default is the five
    half-integer cut points 1.5 .. 5.5.
    """

    def __init__(self, init_thresholds=None):
        self.thresholds = np.array(init_thresholds if init_thresholds is not None else [1.5,2.5,3.5,4.5,5.5], dtype=float)

    def _apply(self, preds):
        """Convert continuous predictions to integer classes using the current thresholds."""
        return np.digitize(preds, self.thresholds) + 1

    def fit(self, y_true, preds, iters=200, step=0.05):
        """Greedily nudge each threshold by +/- ``step`` while QWK improves.

        When a full pass over all thresholds brings no improvement the step is
        halved; the search ends after ``iters`` passes or once the step drops
        below 1e-4. Candidates must stay strictly increasing and strictly inside
        (0.5, 6.5).

        Returns:
            ``(thresholds, score)``: the best thresholds found (also stored on
            ``self.thresholds``) and their QWK on the given data.
        """
        best = self.thresholds.copy()
        best_score = qwk(y_true, self._apply(preds))
        # Use the actual threshold count instead of a hard-coded 5 so that custom
        # ``init_thresholds`` of any length work.
        n_th = len(best)
        positions = np.arange(n_th)
        for _ in range(iters):
            improved = False
            for i in range(n_th):
                for d in (-step, step):
                    cand = np.sort(np.clip(best + (positions == i) * d, 0.5, 6.5))
                    ordered = bool(np.all(np.diff(cand) > 0))
                    if not (ordered and cand[0] > 0.5 and cand[-1] < 6.5):
                        continue
                    s = qwk(y_true, np.digitize(preds, cand) + 1)
                    if s > best_score:
                        best_score, best, improved = s, cand, True
            if not improved:
                step *= 0.5
                if step < 1e-4:
                    break
        self.thresholds = best
        return best, best_score

# Module-level configuration for the seed-2 window model; these names are read as
# globals by the dataset, collator and training function defined in this cell.
MODEL_NAME = 'microsoft/deberta-v3-base'
MAX_LEN = 512  # max_length passed to the tokenizer for every window
STRIDE = 128  # overlap tokens between chunks
BATCH_TRAIN = 4  # reduce to avoid OOM without grad ckpt
BATCH_EVAL = 32  # per-device evaluation batch size

# Fast tokenizer for MODEL_NAME, created once for the whole cell.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

class WindowDataset(Dataset):
    """Dataset of overlapping token windows, one item per window.

    Every text in ``df[text_col]`` is tokenized by the module-level ``tokenizer``
    with ``MAX_LEN``/``STRIDE`` and ``return_overflowing_tokens=True``; each
    resulting window remembers the essay id (and label, if given) of the text it
    came from.
    """

    def __init__(self, df, text_col='full_text', labels=None):
        texts = df[text_col].astype(str).tolist()
        essay_ids = df['essay_id'].tolist()
        label_list = labels.astype(np.float32).tolist() if labels is not None else None

        enc = tokenizer(texts,
                        max_length=MAX_LEN,
                        truncation=True,
                        padding=False,
                        return_overflowing_tokens=True,
                        stride=STRIDE,
                        return_attention_mask=True)
        # window index -> index of the source text in `texts`
        window_to_text = list(enc.pop('overflow_to_sample_mapping'))
        n_windows = len(window_to_text)

        self.essay_ids = [essay_ids[src] for src in window_to_text]
        self.input_ids = [enc['input_ids'][w] for w in range(n_windows)]
        self.attn_masks = [enc['attention_mask'][w] for w in range(n_windows)]
        # token counts per window
        self.lengths = [int(sum(mask)) for mask in self.attn_masks]
        if label_list is None:
            self.labels = None
        else:
            # Every window inherits the label of its essay.
            self.labels = np.array([label_list[src] for src in window_to_text], dtype=np.float32)

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, i):
        item = {}
        item['input_ids'] = torch.tensor(self.input_ids[i], dtype=torch.long)
        item['attention_mask'] = torch.tensor(self.attn_masks[i], dtype=torch.long)
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[i], dtype=torch.float32)
        item['essay_id'] = self.essay_ids[i]
        return item

class PadCollator:
    """Batch collator: drops 'essay_id', pads the token fields and re-attaches labels.

    Padding is delegated to ``DataCollatorWithPadding`` (multiple of 8). Labels
    are removed before padding and added back as a flat tensor. The feature
    dicts passed in are modified in place.
    """

    def __init__(self):
        self.pad = DataCollatorWithPadding(tokenizer=tokenizer, pad_to_multiple_of=8)

    def __call__(self, features):
        for feat in features:
            feat.pop('essay_id', None)
        if 'labels' not in features[0]:
            return self.pad(features)
        # Collect all labels first, then strip them so the padder only sees token fields.
        stacked = torch.stack([feat['labels'] for feat in features]).view(-1)
        for feat in features:
            feat.pop('labels')
        batch = self.pad(features)
        batch['labels'] = stacked
        return batch

class MSDMeanPoolRegressor(nn.Module):
    """Masked mean pooling regressor with multi-sample dropout.

    The pooled representation is passed through ``msd`` independent dropout
    layers; the shared linear head is applied to each and the resulting outputs
    are averaged.
    """

    def __init__(self, model_name, hidden_size=768, msd=5, p=0.2):
        super().__init__()
        self.backbone = AutoModel.from_pretrained(model_name)
        self.dropouts = nn.ModuleList([nn.Dropout(p) for _ in range(msd)])
        self.head = nn.Linear(hidden_size, 1)
        self.msd = msd

    def forward(self, input_ids=None, attention_mask=None, labels=None):
        """Return ``{'logits': [B, 1]}`` plus a Smooth-L1 ``'loss'`` when labels are given."""
        hidden = self.backbone(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state  # [B, T, H]
        keep = attention_mask.unsqueeze(-1).float()
        summed = (hidden * keep).sum(dim=1)
        pooled = summed / keep.sum(dim=1).clamp(min=1e-6)
        # One head evaluation per dropout layer, averaged over the samples.
        per_sample = [self.head(drop(pooled)).squeeze(-1) for drop in self.dropouts]
        scores = torch.stack(per_sample, dim=0).mean(dim=0)  # [B]
        if labels is None:
            return {'logits': scores.unsqueeze(-1)}
        loss = torch.nn.functional.smooth_l1_loss(scores, labels, beta=1.0, reduction='mean')
        return {'loss': loss, 'logits': scores.unsqueeze(-1)}

def compute_metrics_factory(eval_ids):
    """Build a ``compute_metrics`` callback bound to a fixed window -> essay mapping.

    Args:
        eval_ids: essay id of each evaluation window, in dataset order.

    Returns:
        A function taking an object with ``predictions`` and ``label_ids`` and
        returning ``{'qwk_round': ...}``: window predictions are averaged per
        essay (simple mean), clipped to [0.5, 6.5], rounded to 1..6 and scored
        with quadratic weighted kappa.

    Raises (from the returned callback):
        ValueError: if predictions, labels and ids differ in length; ``zip``
            would otherwise silently truncate and misalign them.
    """
    # Note: eval doesn't expose per-window lengths; use simple mean for metrics.
    def compute(eval_pred):
        # reshape(-1) rather than squeeze(): squeeze() turns a single-window
        # evaluation set into a 0-d array, which cannot be iterated.
        preds = np.asarray(eval_pred.predictions).reshape(-1)
        labels = np.asarray(eval_pred.label_ids).reshape(-1)
        ids = np.array(eval_ids)
        if not (len(preds) == len(labels) == len(ids)):
            raise ValueError(
                f'window count mismatch: preds={len(preds)} labels={len(labels)} ids={len(ids)}'
            )
        by_id = defaultdict(list)
        by_id_true = {}
        for p, y, i in zip(preds, labels, ids):
            by_id[i].append(float(p))
            by_id_true[i] = int(y)
        agg_preds = np.array([np.mean(v) for v in by_id.values()])
        agg_true = np.array([by_id_true[i] for i in by_id])
        agg_preds = np.clip(agg_preds, 0.5, 6.5)
        q = qwk(agg_true, np.clip(np.rint(agg_preds), 1, 6).astype(int))
        return {'qwk_round': q}
    return compute

def _log_windows_stats(name, ds):
    n_win = len(ds)
    uniq = len(set(ds.essay_ids))
    avg_w = n_win / max(uniq, 1)
    print(f'{name}: essays={uniq} windows={n_win} avg_windows_per_essay={avg_w:.2f}', flush=True)

def train_fold_seed2(fold, df, folds, out_dir='deberta_v3_base_win512_seed2025'):
    """Train the multi-sample-dropout window model on one fold; return essay-level OOF predictions.

    Args:
        fold: fold label held out for validation.
        df: training frame; the code reads its 'essay_id' and 'score' columns and
            passes it to ``WindowDataset``.
        folds: array aligned with ``df`` rows holding each row's fold label.
        out_dir: root directory; checkpoints are written to ``{out_dir}/fold{fold}``.

    Returns:
        ``(va_idx, agg_vec)``: positional indices of the validation rows and, for
        each of them in order, the token-count-weighted mean of its window
        predictions.

    Note:
        Checkpoint selection during training uses the simple per-essay mean from
        ``compute_metrics_factory``; only the final OOF aggregation is
        token-count weighted.
    """
    os.makedirs(out_dir, exist_ok=True)
    tr_idx = np.where(folds != fold)[0]
    va_idx = np.where(folds == fold)[0]
    print(f'[Fold {fold}] Building datasets...', flush=True)
    dtrain = WindowDataset(df.iloc[tr_idx], labels=df.iloc[tr_idx]['score'].values.astype(np.float32))
    dvalid = WindowDataset(df.iloc[va_idx], labels=df.iloc[va_idx]['score'].values.astype(np.float32))
    _log_windows_stats(f'[Fold {fold}] Train', dtrain)
    _log_windows_stats(f'[Fold {fold}] Valid', dvalid)
    model = MSDMeanPoolRegressor(MODEL_NAME, hidden_size=768, msd=5, p=0.2)
    # Disable gradient checkpointing to avoid potential stalls
    # Eager mode only (no torch.compile) to avoid Triton build issues
    args = TrainingArguments(
        output_dir=f"{out_dir}/fold{fold}",
        learning_rate=2e-5,
        per_device_train_batch_size=BATCH_TRAIN,
        per_device_eval_batch_size=BATCH_EVAL,
        num_train_epochs=3,
        weight_decay=0.01,
        fp16=True,
        bf16=False,
        evaluation_strategy='steps',
        save_strategy='steps',
        eval_steps=400,
        save_steps=400,
        save_total_limit=1,
        logging_strategy='steps',
        logging_steps=50,
        logging_first_step=True,
        load_best_model_at_end=True,
        metric_for_best_model='qwk_round',
        greater_is_better=True,
        report_to=[],
        disable_tqdm=False,
        dataloader_num_workers=0,
        dataloader_pin_memory=True,
        dataloader_persistent_workers=False,
        group_by_length=False,
        gradient_accumulation_steps=4,
        lr_scheduler_type='cosine',
        warmup_ratio=0.1,
        optim='adamw_torch_fused',
        eval_accumulation_steps=32,
        seed=SEED2,
        remove_unused_columns=False,
    )
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=dtrain,
        eval_dataset=dvalid,
        tokenizer=tokenizer,
        data_collator=PadCollator(),
        compute_metrics=compute_metrics_factory(dvalid.essay_ids),
        callbacks=[PrinterCallback()]
    )
    t0 = time.time()
    print(f'[Fold {fold}] Start training...', flush=True)
    trainer.train()
    print(f"[Fold {fold}] Train done in {(time.time()-t0)/60:.1f} min", flush=True)
    # Predict on valid windows and aggregate (token-count weighted mean).
    # reshape(-1) instead of squeeze(): squeeze() yields a 0-d array when the
    # validation set has exactly one window, which breaks the zip below.
    preds_val = np.asarray(trainer.predict(dvalid).predictions).reshape(-1)
    ids = np.array(dvalid.essay_ids)
    lens = np.array(dvalid.lengths, dtype=float)
    by_sum = defaultdict(float)
    by_w = defaultdict(float)
    for p, i, w in zip(preds_val, ids, lens):
        by_sum[i] += float(p) * float(w)
        by_w[i] += float(w)
    agg = {i: (by_sum[i] / max(by_w[i], 1e-6)) for i in by_sum.keys()}
    va_eids = df.iloc[va_idx]['essay_id'].values.tolist()
    agg_vec = np.array([agg[e] for e in va_eids], dtype=float)
    # Drop the references first: emptying the CUDA cache while the model and
    # trainer are still alive cannot release their memory.
    del trainer, model
    gc.collect()
    torch.cuda.empty_cache()
    return va_idx, agg_vec

# Driver for seed 2 (will execute after current run finishes):
# This cell only prints a ready-to-paste run snippet; it does not start training.
# The banner states eval_steps=400 to match the TrainingArguments in train_fold_seed2.
print('Prepared seed-2 training function with k16 folds, MSD, eval_steps=400. To run: set folds from folds_grouped_k16.csv and loop folds.', flush=True)
if os.path.exists('folds_grouped_k16.csv'):
    print('Found folds_grouped_k16.csv. Example run snippet (not executing now):', flush=True)
    print("""
train_df = pd.read_csv('train.csv')
folds_g2 = pd.read_csv('folds_grouped_k16.csv')
folds_map2 = dict(zip(folds_g2['essay_id'], folds_g2['fold_grouped_k16']))
folds2 = train_df['essay_id'].map(folds_map2).values.astype(int)
oof2 = np.zeros(len(train_df), dtype=float)
for f in sorted(np.unique(folds2)):
    f_start = time.time()
    va_idx, agg_preds = train_fold_seed2(f, train_df, folds2)
    oof2[va_idx] = agg_preds
    y_true = train_df.iloc[va_idx]['score'].values.astype(int)
    q = qwk(y_true, np.clip(np.rint(np.clip(agg_preds, 0.5, 6.5)), 1, 6).astype(int))
    print(f'Fold {f} val round-QWK={q:.4f} elapsed={(time.time()-f_start)/60:.1f} min', flush=True)
np.save('oof_deberta_v3_base_win512_seed2025.npy', oof2)
opt = ThresholdOptimizer()
best_th, best_oof_qwk = opt.fit(train_df['score'].values.astype(int), np.clip(oof2, 0.5, 6.5))
round_q = qwk(train_df['score'].values.astype(int), np.clip(np.rint(oof2), 1, 6).astype(int))
print(f'OOF round-QWK={round_q:.5f}  OOF thresh-QWK={best_oof_qwk:.5f}  thresholds={best_th}', flush=True)
with open('thresholds_deberta_v3_base_win512_seed2025.json','w') as f:
    json.dump({'thresholds': best_th.tolist(), 'oof_qwk': float(best_oof_qwk), 'round_oof_qwk': float(round_q)}, f)
    """)
else:
    print('folds_grouped_k16.csv not found yet. Build it with cell 7 first.', flush=True)

=== Seed 2 prep: DeBERTa-v3-base windows + mean pooling + Multi-Sample Dropout (512/384) ===




Prepared seed-2 training function with k16 folds, MSD, eval_steps=500. To run: set folds from folds_grouped_k16.csv and loop folds.


Found folds_grouped_k16.csv. Example run snippet (not executing now):



train_df = pd.read_csv('train.csv')
folds_g2 = pd.read_csv('folds_grouped_k16.csv')
folds_map2 = dict(zip(folds_g2['essay_id'], folds_g2['fold_grouped_k16']))
folds2 = train_df['essay_id'].map(folds_map2).values.astype(int)
oof2 = np.zeros(len(train_df), dtype=float)
for f in sorted(np.unique(folds2)):
    f_start = time.time()
    va_idx, agg_preds = train_fold_seed2(f, train_df, folds2)
    oof2[va_idx] = agg_preds
    y_true = train_df.iloc[va_idx]['score'].values.astype(int)
    q = qwk(y_true, np.clip(np.rint(np.clip(agg_preds, 0.5, 6.5)), 1, 6).astype(int))
    print(f'Fold {f} val round-QWK={q:.4f} elapsed={(time.time()-f_start)/60:.1f} min', flush=True)
np.save('oof_deberta_v3_base_win512_seed2025.npy', oof2)
opt = ThresholdOptimizer()
best_th, best_oof_qwk = opt.fit(train_df['score'].values.astype(int), np.clip(oof2, 0.5, 6.5))
round_q = qwk(train_df['score'].values.astype(int), np.clip(np.rint(oof2), 1, 6).astype(int))
print(f'OOF round-QWK={round_q:.5f}  OOF thresh-QWK={bes

In [37]:
import os, glob, json, time, gc
from collections import defaultdict
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModel, DataCollatorWithPadding, Trainer, TrainingArguments

# Configuration for test-time inference with the first (non-MSD) window model.
# NOTE: MODEL_NAME, MAX_LEN, STRIDE and BATCH_EVAL rebind names also assigned in the
# training cells; the values are the same here.
print('=== Test-time inference (windows mean) for deberta_v3_base_win512 ===', flush=True)

MODEL_NAME = 'microsoft/deberta-v3-base'
MAX_LEN = 512
STRIDE = 128
BATCH_EVAL = 32
# Root folder searched for per-fold sub-directories ('fold*') by run_test_inference.
OUT_DIR = 'deberta_v3_base_win512'

# Separate tokenizer instance used only by the test-time dataset and collator.
tokenizer_tt = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

class WindowDatasetTest(Dataset):
    """Label-free dataset of overlapping token windows for inference.

    Texts are tokenized by the module-level ``tokenizer_tt`` with
    ``MAX_LEN``/``STRIDE``; every window keeps the essay id of its source text.
    """

    def __init__(self, df, text_col='full_text'):
        texts = df[text_col].astype(str).tolist()
        essay_ids = df['essay_id'].tolist()
        enc = tokenizer_tt(texts,
                           max_length=MAX_LEN,
                           truncation=True,
                           padding=False,
                           return_overflowing_tokens=True,
                           stride=STRIDE,
                           return_attention_mask=True)
        # window index -> index of the source text in `texts`
        window_to_text = list(enc.pop('overflow_to_sample_mapping'))
        n_windows = len(window_to_text)
        self.essay_ids = [essay_ids[src] for src in window_to_text]
        self.input_ids = [enc['input_ids'][w] for w in range(n_windows)]
        self.attn_masks = [enc['attention_mask'][w] for w in range(n_windows)]

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, i):
        token_ids = torch.tensor(self.input_ids[i], dtype=torch.long)
        mask = torch.tensor(self.attn_masks[i], dtype=torch.long)
        return {'input_ids': token_ids, 'attention_mask': mask, 'essay_id': self.essay_ids[i]}

class PadCollatorTT:
    """Inference collator: strips 'essay_id' (in place) and pads the remaining fields."""

    def __init__(self):
        self.pad = DataCollatorWithPadding(tokenizer=tokenizer_tt, pad_to_multiple_of=8)

    def __call__(self, features):
        # The padder only accepts tensor-like fields, so the id is removed first.
        for feat in features:
            feat.pop('essay_id', None)
        return self.pad(features)

class MeanPoolRegressor(nn.Module):
    """Inference-only variant of the mean-pooling regressor.

    Same layers as the training model (backbone, dropout, linear head) so saved
    weights can be loaded, but ``forward`` never computes a loss; ``labels`` is
    accepted and ignored.
    """

    def __init__(self, model_name, hidden_size=768):
        super().__init__()
        self.backbone = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.1)
        self.head = nn.Linear(hidden_size, 1)

    def forward(self, input_ids=None, attention_mask=None, labels=None):
        """Return ``{'logits': [B, 1]}`` from masked mean pooling of the last hidden states."""
        hidden = self.backbone(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state
        keep = attention_mask.unsqueeze(-1).float()
        # Clamp so a row with an all-zero mask cannot divide by zero.
        token_count = keep.sum(dim=1).clamp(min=1e-6)
        pooled = self.dropout((hidden * keep).sum(dim=1) / token_count)
        scores = self.head(pooled).squeeze(-1)
        return {'logits': scores.unsqueeze(-1)}

def load_best_subdir(folder):
    """Return the checkpoint directory inside ``folder`` that should be used for inference.

    Resolution order:
      1. The checkpoint named by ``best_model_checkpoint`` in the newest
         readable ``trainer_state.json``. With ``load_best_model_at_end=True``
         the highest-numbered checkpoint is merely the latest one, not
         necessarily the best.
      2. The highest-numbered ``checkpoint-<step>`` directory.
      3. ``folder`` itself when it contains no numbered checkpoint directory.

    Entries matching ``checkpoint-*`` that are not directories or do not end in
    an integer step are ignored instead of raising.
    """
    def _step(path):
        tail = os.path.basename(os.path.normpath(path)).rsplit('-', 1)[-1]
        return int(tail) if tail.isdigit() else None

    cks = [
        p for p in glob.glob(os.path.join(folder, 'checkpoint-*'))
        if os.path.isdir(p) and _step(p) is not None
    ]
    if not cks:
        return folder
    cks.sort(key=_step)

    for ck in reversed(cks):  # newest first
        state_path = os.path.join(ck, 'trainer_state.json')
        if not os.path.isfile(state_path):
            continue
        try:
            with open(state_path, 'r') as fh:
                best = json.load(fh).get('best_model_checkpoint')
        except (OSError, ValueError, AttributeError):
            continue
        if best:
            # The recorded path is relative to the training-time working directory;
            # prefer the same-named directory under this fold, then the raw path.
            local = os.path.join(folder, os.path.basename(os.path.normpath(best)))
            for cand in (local, best):
                if os.path.isdir(cand):
                    return cand
        break  # the newest readable state is authoritative; fall back to latest
    return cks[-1]

def _find_weight_file(path_dir):
    cand = [
        os.path.join(path_dir, 'pytorch_model.bin'),
        os.path.join(path_dir, 'model.safetensors'),
        os.path.join(path_dir, 'pytorch_model.bin.index.json')
    ]
    for p in cand:
        if os.path.exists(p):
            return p
    return None

def predict_fold(folder, dtest):
    """Load one fold's trained weights and return mean window predictions per essay.

    Args:
        folder: fold directory containing checkpoints and/or saved weights.
        dtest: window dataset exposing ``essay_ids`` aligned with its items.

    Returns:
        dict mapping essay id -> mean of that essay's window predictions.

    Raises:
        FileNotFoundError: no weight file exists in the selected checkpoint or
            the fold root. The previous fallback called a ``from_pretrained``
            that ``nn.Module`` does not provide and swallowed the error, so
            inference silently ran with a randomly initialised head.
        RuntimeError: only a sharded-checkpoint index was found; it is a JSON
            manifest, not a tensor file, and cannot be given to ``torch.load``.
    """
    best_dir = load_best_subdir(folder)
    # If checkpoint dir chosen but missing weights, fallback to fold root
    wt = _find_weight_file(best_dir)
    if wt is None and 'checkpoint-' in best_dir:
        wt = _find_weight_file(folder)
        if wt is not None:
            best_dir = folder
    if wt is None:
        raise FileNotFoundError(f'No model weights found in {best_dir!r} or {folder!r}')
    if wt.endswith('.index.json'):
        raise RuntimeError(f'Sharded checkpoint index {wt!r} is not supported by this loader')

    # Init model and load weights
    model = MeanPoolRegressor(MODEL_NAME)
    if wt.endswith('.safetensors'):
        from safetensors.torch import load_file
        sd = load_file(wt)
    else:
        sd = torch.load(wt, map_location='cpu')
    # strict=False tolerates key differences, so report them instead of hiding them.
    incompatible = model.load_state_dict(sd, strict=False)
    if incompatible.missing_keys or incompatible.unexpected_keys:
        print(f'WARNING [{best_dir}]: missing_keys={list(incompatible.missing_keys)} '
              f'unexpected_keys={list(incompatible.unexpected_keys)}', flush=True)

    args = TrainingArguments(output_dir=os.path.join(folder, 'tmp_infer'), per_device_eval_batch_size=BATCH_EVAL,
                             dataloader_num_workers=0, dataloader_pin_memory=True, report_to=[], fp16=True, bf16=False)
    trainer = Trainer(model=model, args=args, tokenizer=tokenizer_tt, data_collator=PadCollatorTT())
    # reshape(-1) instead of squeeze(): squeeze() yields a 0-d array for a
    # single-window dataset, which breaks the zip below.
    preds = np.asarray(trainer.predict(dtest).predictions).reshape(-1)
    ids = np.array(dtest.essay_ids)
    by_id = defaultdict(list)
    for p, i in zip(preds, ids):
        by_id[i].append(float(p))
    # Aggregate per essay-id
    return {i: float(np.mean(v)) for i, v in by_id.items()}

def run_test_inference():
    """Predict the test set with every fold model, average, threshold and write the submission.

    Reads 'test.csv', runs ``predict_fold`` for each ``fold*`` directory under
    ``OUT_DIR``, averages the per-essay predictions across folds, clips them to
    [0.5, 6.5] and saves them to 'test_deberta_v3_base_win512.npy'. Integer
    scores are produced with the thresholds stored in
    'thresholds_deberta_v3_base_win512.json' (half-integer cut points if the
    file is absent) and written to 'submission.csv'.

    Raises:
        FileNotFoundError: if ``OUT_DIR`` contains no ``fold*`` directory. The
            mean over zero folds is NaN and would otherwise be written out as a
            constant-score submission.
    """
    # Fail fast, before tokenizing the test set, when there is nothing to predict with.
    fold_dirs = sorted([p for p in glob.glob(os.path.join(OUT_DIR, 'fold*')) if os.path.isdir(p)])
    if not fold_dirs:
        raise FileNotFoundError(f"No fold directories matching 'fold*' found under {OUT_DIR!r}")

    test_df = pd.read_csv('test.csv')
    dtest = WindowDatasetTest(test_df)
    all_fold_preds = []
    t0 = time.time()
    for fd in fold_dirs:
        t1 = time.time()
        agg = predict_fold(fd, dtest)
        all_fold_preds.append(agg)
        print(f'Predicted {fd} in {(time.time()-t1)/60:.1f} min', flush=True)
        # Collect garbage first so the fold's model is released before the cache is emptied.
        gc.collect(); torch.cuda.empty_cache()
    # Average across folds
    test_ids_order = test_df['essay_id'].tolist()
    preds_mat = [[agg[e] for e in test_ids_order] for agg in all_fold_preds]
    preds_mean = np.mean(np.array(preds_mat), axis=0)
    preds_mean = np.clip(preds_mean, 0.5, 6.5)
    np.save('test_deberta_v3_base_win512.npy', preds_mean)
    th_path = 'thresholds_deberta_v3_base_win512.json'
    if os.path.exists(th_path):
        with open(th_path, 'r') as f:
            th = np.array(json.load(f)['thresholds'], dtype=float)
    else:
        th = np.array([1.5,2.5,3.5,4.5,5.5], dtype=float)
    labels = np.digitize(preds_mean, th) + 1
    labels = np.clip(labels, 1, 6).astype(int)
    sub = pd.DataFrame({'essay_id': test_df['essay_id'], 'score': labels})
    sub.to_csv('submission.csv', index=False)
    print('Saved submission.csv and test_deberta_v3_base_win512.npy')
    print(f'=== Test inference done in {(time.time()-t0)/60:.1f} min ===', flush=True)

# Defining the cell does not run inference; call run_test_inference() explicitly.
print('Inference cell ready. Run after training artifacts exist in deberta_v3_base_win512/fold*/')

=== Test-time inference (windows mean) for deberta_v3_base_win512 ===




Inference cell ready. Run after training artifacts exist in deberta_v3_base_win512/fold*/


In [12]:
import time, json, numpy as np, pandas as pd
# Seed-2 out-of-fold driver: train every k16 fold with train_fold_seed2, collect the
# essay-level validation predictions, then fit QWK thresholds on the full OOF vector.
# The banner states eval_steps=400 to match the TrainingArguments in train_fold_seed2.
print('=== Running seed-2 training with k=16 grouped folds (MSD, eval_steps=400) ===', flush=True)
t0 = time.time()
train_df = pd.read_csv('train.csv')
folds_g2 = pd.read_csv('folds_grouped_k16.csv')
folds_map2 = dict(zip(folds_g2['essay_id'], folds_g2['fold_grouped_k16']))
fold_series2 = train_df['essay_id'].map(folds_map2)
# An essay_id absent from the fold file maps to NaN, and casting NaN to int gives a
# garbage fold label instead of an error, so fail loudly before training starts.
if fold_series2.isna().any():
    raise ValueError(f"{int(fold_series2.isna().sum())} essay_id(s) in train.csv have no fold in folds_grouped_k16.csv")
folds2 = fold_series2.values.astype(int)
oof2 = np.zeros(len(train_df), dtype=float)
for f in sorted(np.unique(folds2)):
    f_start = time.time()
    va_idx, agg_preds = train_fold_seed2(f, train_df, folds2)
    oof2[va_idx] = agg_preds
    y_true = train_df.iloc[va_idx]['score'].values.astype(int)
    # Per-fold sanity metric using plain rounding (no tuned thresholds).
    q = qwk(y_true, np.clip(np.rint(np.clip(agg_preds, 0.5, 6.5)), 1, 6).astype(int))
    print(f'Fold {f} val round-QWK={q:.4f} elapsed={(time.time()-f_start)/60:.1f} min', flush=True)
# Persist raw OOF predictions and the thresholds tuned on them.
np.save('oof_deberta_v3_base_win512_seed2025.npy', oof2)
opt = ThresholdOptimizer()
best_th, best_oof_qwk = opt.fit(train_df['score'].values.astype(int), np.clip(oof2, 0.5, 6.5))
round_q = qwk(train_df['score'].values.astype(int), np.clip(np.rint(oof2), 1, 6).astype(int))
print(f'OOF round-QWK={round_q:.5f}  OOF thresh-QWK={best_oof_qwk:.5f}  thresholds={best_th}', flush=True)
with open('thresholds_deberta_v3_base_win512_seed2025.json','w') as f:
    json.dump({'thresholds': best_th.tolist(), 'oof_qwk': float(best_oof_qwk), 'round_oof_qwk': float(round_q)}, f)
print(f'=== Seed-2 done in {(time.time()-t0)/60:.1f} min ===', flush=True)

=== Running seed-2 training with k=16 grouped folds (MSD, eval_steps=500) ===


[Fold 0] Building datasets...


[Fold 0] Train: essays=13166 windows=17322 avg_windows_per_essay=1.32


[Fold 0] Valid: essays=2410 windows=2961 avg_windows_per_essay=1.23


[Fold 0] Start training...




Step,Training Loss,Validation Loss


{'loss': 2.6829, 'grad_norm': 29.283714294433594, 'learning_rate': 6.153846153846154e-08, 'epoch': 0.0009235742322789194}


{'loss': 2.1801, 'grad_norm': 11.049778938293457, 'learning_rate': 3.0769230769230774e-06, 'epoch': 0.04617871161394597}


{'loss': 0.4359, 'grad_norm': 6.179534912109375, 'learning_rate': 6.153846153846155e-06, 'epoch': 0.09235742322789194}


{'loss': 0.3749, 'grad_norm': 2.8593878746032715, 'learning_rate': 9.230769230769232e-06, 'epoch': 0.1385361348418379}


{'loss': 0.3704, 'grad_norm': 5.08953332901001, 'learning_rate': 1.230769230769231e-05, 'epoch': 0.18471484645578387}


{'loss': 0.3555, 'grad_norm': 10.21146011352539, 'learning_rate': 1.5384615384615387e-05, 'epoch': 0.23089355806972986}


{'loss': 0.4006, 'grad_norm': 22.631258010864258, 'learning_rate': 1.8461538461538465e-05, 'epoch': 0.2770722696836758}


{'loss': 0.3442, 'grad_norm': 9.150673866271973, 'learning_rate': 1.999638539797114e-05, 'epoch': 0.3232509812976218}


{'loss': 0.3205, 'grad_norm': 2.66862154006958, 'learning_rate': 1.9967484258268576e-05, 'epoch': 0.36942969291156774}


{'eval_loss': 0.23827728629112244, 'eval_qwk_round': 0.6857860931176535, 'eval_runtime': 392.0781, 'eval_samples_per_second': 7.552, 'eval_steps_per_second': 0.237, 'epoch': 0.36942969291156774}


{'loss': 0.2365, 'grad_norm': 7.990867614746094, 'learning_rate': 1.990976553665388e-05, 'epoch': 0.41560840452551373}


{'loss': 0.2587, 'grad_norm': 2.2882888317108154, 'learning_rate': 1.9823396107129044e-05, 'epoch': 0.4617871161394597}


{'loss': 0.2407, 'grad_norm': 8.910479545593262, 'learning_rate': 1.9708625677448357e-05, 'epoch': 0.5079658277534057}


{'loss': 0.2606, 'grad_norm': 4.122722148895264, 'learning_rate': 1.9565786067173572e-05, 'epoch': 0.5541445393673516}


{'loss': 0.2462, 'grad_norm': 8.829679489135742, 'learning_rate': 1.9395290248330815e-05, 'epoch': 0.6003232509812976}


{'loss': 0.2661, 'grad_norm': 10.581090927124023, 'learning_rate': 1.9197631151442747e-05, 'epoch': 0.6465019625952436}


{'loss': 0.2184, 'grad_norm': 10.760598182678223, 'learning_rate': 1.8973380240388088e-05, 'epoch': 0.6926806742091896}


{'loss': 0.1962, 'grad_norm': 6.411953926086426, 'learning_rate': 1.8723185860208653e-05, 'epoch': 0.7388593858231355}


{'eval_loss': 0.21167442202568054, 'eval_qwk_round': 0.7572089675233813, 'eval_runtime': 391.8354, 'eval_samples_per_second': 7.557, 'eval_steps_per_second': 0.237, 'epoch': 0.7388593858231355}


{'loss': 0.2251, 'grad_norm': 3.406346082687378, 'learning_rate': 1.8447771362640735e-05, 'epoch': 0.7850380974370815}


{'loss': 0.226, 'grad_norm': 3.741476535797119, 'learning_rate': 1.8147933014790245e-05, 'epoch': 0.8312168090510275}


{'loss': 0.2338, 'grad_norm': 10.01356029510498, 'learning_rate': 1.7824537696997862e-05, 'epoch': 0.8773955206649734}


{'loss': 0.2017, 'grad_norm': 6.351576805114746, 'learning_rate': 1.747852039655015e-05, 'epoch': 0.9235742322789194}


{'loss': 0.2321, 'grad_norm': 3.014822483062744, 'learning_rate': 1.7110881504482632e-05, 'epoch': 0.9697529438928654}


{'loss': 0.2261, 'grad_norm': 3.3823678493499756, 'learning_rate': 1.6722683923290228e-05, 'epoch': 1.0159316555068114}


{'loss': 0.1977, 'grad_norm': 6.635529518127441, 'learning_rate': 1.6315049993907145e-05, 'epoch': 1.0621103671207572}


{'loss': 0.1813, 'grad_norm': 4.952932357788086, 'learning_rate': 1.588915825084077e-05, 'epoch': 1.1082890787347033}


{'eval_loss': 0.1851348578929901, 'eval_qwk_round': 0.7460734389033399, 'eval_runtime': 392.1869, 'eval_samples_per_second': 7.55, 'eval_steps_per_second': 0.237, 'epoch': 1.1082890787347033}


{'loss': 0.1968, 'grad_norm': 4.973823070526123, 'learning_rate': 1.5446240014840997e-05, 'epoch': 1.1544677903486493}


{'loss': 0.2151, 'grad_norm': 6.858154773712158, 'learning_rate': 1.4987575832956173e-05, 'epoch': 1.2006465019625951}


{'loss': 0.1963, 'grad_norm': 9.147194862365723, 'learning_rate': 1.4514491776267939e-05, 'epoch': 1.2468252135765412}


{'loss': 0.1882, 'grad_norm': 6.684695243835449, 'learning_rate': 1.4028355606008888e-05, 'epoch': 1.2930039251904872}


{'loss': 0.1631, 'grad_norm': 5.729094505310059, 'learning_rate': 1.3530572819147346e-05, 'epoch': 1.3391826368044333}


{'loss': 0.1772, 'grad_norm': 3.558868646621704, 'learning_rate': 1.302258258487217e-05, 'epoch': 1.385361348418379}


{'loss': 0.1815, 'grad_norm': 5.575219631195068, 'learning_rate': 1.2505853583725754e-05, 'epoch': 1.4315400600323251}


{'loss': 0.1777, 'grad_norm': 7.146465301513672, 'learning_rate': 1.198187976141507e-05, 'epoch': 1.4777187716462712}


{'eval_loss': 0.2381138652563095, 'eval_qwk_round': 0.7581001178991479, 'eval_runtime': 392.8363, 'eval_samples_per_second': 7.537, 'eval_steps_per_second': 0.237, 'epoch': 1.4777187716462712}


{'loss': 0.171, 'grad_norm': 2.7038567066192627, 'learning_rate': 1.1452176009577062e-05, 'epoch': 1.523897483260217}


{'loss': 0.1704, 'grad_norm': 10.400626182556152, 'learning_rate': 1.0918273785986052e-05, 'epoch': 1.570076194874163}


{'loss': 0.1994, 'grad_norm': 11.964150428771973, 'learning_rate': 1.03817166868658e-05, 'epoch': 1.616254906488109}


{'loss': 0.1754, 'grad_norm': 5.145857810974121, 'learning_rate': 9.844055984107359e-06, 'epoch': 1.662433618102055}


{'loss': 0.1844, 'grad_norm': 4.400597095489502, 'learning_rate': 9.306846140295395e-06, 'epoch': 1.708612329716001}


{'loss': 0.1806, 'grad_norm': 3.624797821044922, 'learning_rate': 8.77164031450966e-06, 'epoch': 1.754791041329947}


{'loss': 0.1665, 'grad_norm': 2.577631711959839, 'learning_rate': 8.239985871895144e-06, 'epoch': 1.8009697529438928}


KeyboardInterrupt: 

In [None]:
import subprocess, sys, time
# Install the CPython development headers (Python.h) and a C toolchain via apt.
# The command's combined stdout/stderr is captured and echoed after it finishes.
print('=== Installing python dev headers to fix Triton build (Python.h) ===', flush=True)
t0 = time.time()
cmd = "apt-get update -y && apt-get install -y python3-dev python3.11-dev build-essential"
print(cmd, flush=True)
ret = subprocess.run(['bash','-lc', cmd], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
print(ret.stdout)
# Surface failures explicitly: without this the cell prints "Done" even when
# apt-get exited with an error (e.g. no network, missing package, no root).
if ret.returncode != 0:
    print(f'!!! Install command failed with exit code {ret.returncode}; see output above !!!', flush=True)
print(f'=== Done in {(time.time()-t0)/60:.1f} min ===', flush=True)

In [18]:
import math, copy, os, time, gc
import numpy as np
from sklearn.metrics import cohen_kappa_score
from transformers import TrainerCallback, Trainer, TrainingArguments, PrinterCallback
import torch.nn as nn
import torch
from collections import defaultdict

# Banner so this cell's execution is visible in the notebook log.
print('=== Seed-3 prep: LLRD + EMA (READY) | fixes: token-weighted eval, dynamic head, group_by_length, unique class name ===', flush=True)

# Ensure base backbone is used regardless of previous globals.
# A dedicated name is used because another cell of this notebook sets
# MODEL_NAME to the large checkpoint.
MODEL_NAME_SEED3 = 'microsoft/deberta-v3-base'

def build_llrd_param_groups(model, base_lr=2e-5, head_lr_mult=2.0, decay=0.9, weight_decay=0.01):
    """Build optimizer parameter groups with layer-wise learning-rate decay (LLRD).

    Learning rates, with ``n`` = number of encoder layers found:
      * embeddings:            ``base_lr * decay ** (n + 1)``
      * encoder layer ``i``:   ``base_lr * decay ** (n - i)`` (0-based ``i``; the top layer gets
        ``base_lr * decay``)
      * pooler (if present):   ``base_lr``
      * head:                  ``base_lr * head_lr_mult``
      * every other parameter: ``base_lr`` (see note below)

    Parameters whose name contains one of the ``no_decay`` markers (biases and LayerNorm
    weights) are placed in a separate group with ``weight_decay=0.0``.

    Note: parameters that live outside embeddings / ``encoder.layer`` / pooler / head
    (for example modules attached directly to the encoder) used to be left out of every
    group and were therefore never updated by the optimizer. They are now collected in
    trailing groups at ``base_lr``.

    Args:
        model: module exposing ``.backbone`` (the transformer) and ``.head``.
        base_lr: reference learning rate.
        head_lr_mult: multiplier applied to ``base_lr`` for the head.
        decay: per-layer multiplicative decay factor.
        weight_decay: weight decay for parameters not matched by ``no_decay``.

    Returns:
        list[dict]: groups with keys ``'params'``, ``'lr'`` and ``'weight_decay'``;
        empty groups are omitted and no parameter appears twice.
    """
    backbone = model.backbone
    # Locate the module that owns `encoder.layer`; some wrappers nest it under `.deberta`.
    inner = backbone
    if not (hasattr(backbone, 'encoder') and hasattr(backbone.encoder, 'layer')):
        nested = getattr(backbone, 'deberta', None)
        if nested is not None and hasattr(nested, 'encoder') and hasattr(nested.encoder, 'layer'):
            inner = nested
    layers = []
    if hasattr(inner, 'encoder') and hasattr(inner.encoder, 'layer'):
        layers = list(inner.encoder.layer)
    n = len(layers)

    param_groups = []
    assigned = set()  # id() of every parameter already placed in a group
    no_decay = ('bias', 'LayerNorm.weight', 'layer_norm.weight', 'ln.weight')

    def add_named(named_params, lr):
        """Split (name, param) pairs into decay / no-decay groups at learning rate `lr`."""
        with_wd, without_wd = [], []
        for name, p in named_params:
            if id(p) in assigned:
                continue  # never hand the same tensor to the optimizer twice
            assigned.add(id(p))
            (without_wd if any(nd in name for nd in no_decay) else with_wd).append(p)
        if with_wd:
            param_groups.append({'params': with_wd, 'lr': lr, 'weight_decay': weight_decay})
        if without_wd:
            param_groups.append({'params': without_wd, 'lr': lr, 'weight_decay': 0.0})

    # Embeddings (deepest => smallest learning rate)
    emb_module = getattr(backbone, 'embeddings', None)
    if emb_module is None:
        emb_module = getattr(inner, 'embeddings', None)
    if emb_module is not None:
        add_named(emb_module.named_parameters(recurse=True), base_lr * (decay ** (n + 1)))

    # Encoder layers: learning rate grows towards the top of the stack
    for i, layer in enumerate(layers):
        depth = i + 1
        add_named(layer.named_parameters(recurse=True), base_lr * (decay ** (n - depth + 1)))

    # Pooler (may be absent or explicitly set to None)
    pooler = getattr(backbone, 'pooler', None)
    if pooler is not None:
        add_named(pooler.named_parameters(recurse=True), base_lr)

    # Head (higher LR)
    add_named(model.head.named_parameters(recurse=True), base_lr * head_lr_mult)

    # Anything not covered above: keep it trainable at the reference rate.
    add_named(model.named_parameters(), base_lr)
    return param_groups

class EMACallback(TrainerCallback):
    """Keep an exponential moving average (EMA) of the trainable weights and evaluate with it.

    The shadow copy is updated after every optimizer step. When the Trainer has flagged
    that an evaluation follows the current step/epoch (``control.should_evaluate``), the
    shadow weights are swapped into the model; they are swapped back out in
    ``on_evaluate``, which the Trainer fires *after* the evaluation loop has finished.

    The previous implementation swapped the shadow in from ``on_evaluate`` and relied on
    ``on_evaluate_end`` to restore, but ``on_evaluate_end`` is not an event the Trainer
    dispatches. The raw weights were therefore replaced by the EMA after the first
    evaluation and never restored.

    Args:
        ema_decay: EMA coefficient; shadow <- decay * shadow + (1 - decay) * weight.
    """
    def __init__(self, ema_decay=0.995):
        self.decay = ema_decay
        self.shadow = {}   # name -> EMA tensor
        self.backup = {}   # name -> raw tensor saved while the shadow is swapped in
    def on_train_begin(self, args, state, control, **kwargs):
        # Initialise the shadow from the starting weights.
        model = kwargs['model']
        self.shadow = {name: p.detach().clone() for name, p in model.named_parameters() if p.requires_grad}
    def on_step_end(self, args, state, control, **kwargs):
        model = kwargs['model']
        with torch.no_grad():
            for name, p in model.named_parameters():
                if p.requires_grad and name in self.shadow:
                    self.shadow[name].mul_(self.decay).add_(p.detach(), alpha=(1.0 - self.decay))
        # The flow callback runs before user callbacks, so the flag is already set here
        # when an evaluation is about to follow this step.
        if control.should_evaluate:
            self.apply_shadow(model)
    def on_epoch_end(self, args, state, control, **kwargs):
        # Same hook for epoch-based evaluation schedules.
        if control.should_evaluate:
            self.apply_shadow(kwargs['model'])
    def apply_shadow(self, model):
        """Copy the EMA weights into `model`, saving the raw weights for `restore`."""
        # If a backup already exists the shadow is currently applied; keep that backup so
        # the raw weights are not overwritten by EMA values on a repeated call.
        already_applied = bool(self.backup)
        for name, p in model.named_parameters():
            if p.requires_grad and name in self.shadow:
                if not already_applied:
                    self.backup[name] = p.detach().clone()
                p.data.copy_(self.shadow[name].data)
    def restore(self, model):
        """Put the raw weights saved by `apply_shadow` back into `model` (no-op if none)."""
        for name, p in model.named_parameters():
            if p.requires_grad and name in self.backup:
                p.data.copy_(self.backup[name].data)
        self.backup = {}
    def on_evaluate(self, args, state, control, **kwargs):
        # Fired after evaluation: swap the raw training weights back in.
        self.restore(kwargs['model'])
    def on_evaluate_end(self, args, state, control, **kwargs):
        # Not dispatched by the Trainer; kept so existing manual callers still work.
        self.restore(kwargs['model'])

# Unique class name to avoid clashes with earlier definitions
from transformers import AutoModel
class MSDMeanPoolRegressorSeed3(nn.Module):
    """Transformer regressor: masked mean pooling followed by a multi-sample-dropout head.

    The head width is read from the backbone config (`hidden_size`, falling back to 768).
    `forward` returns a dict with `'logits'` of shape [B, 1] and, when `labels` is given,
    a smooth-L1 `'loss'`.
    """
    def __init__(self, model_name, msd=5, p=0.2):
        super().__init__()
        self.backbone = AutoModel.from_pretrained(model_name)
        width = int(getattr(self.backbone.config, 'hidden_size', 768))
        dropout_branches = [nn.Dropout(p) for _ in range(msd)]
        self.dropouts = nn.ModuleList(dropout_branches)
        self.head = nn.Linear(width, 1)
        self.msd = msd

    def forward(self, input_ids=None, attention_mask=None, labels=None):
        encoded = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
        hidden_states = encoded.last_hidden_state  # [B, T, H]
        # Masked mean over the token axis; the clamp guards against an all-zero mask.
        keep = attention_mask.unsqueeze(-1).float()
        token_total = (hidden_states * keep).sum(dim=1)
        token_count = keep.sum(dim=1).clamp(min=1e-6)
        pooled = token_total / token_count
        # One head pass per dropout branch, averaged across branches.
        branch_logits = [self.head(branch(pooled)).squeeze(-1) for branch in self.dropouts]
        logits = torch.stack(branch_logits, dim=0).mean(dim=0)  # [B]
        if labels is None:
            return {'logits': logits.unsqueeze(-1)}
        loss = torch.nn.functional.smooth_l1_loss(logits, labels, beta=1.0, reduction='mean')
        return {'loss': loss, 'logits': logits.unsqueeze(-1)}

def compute_metrics_factory_token_weighted(eval_ids, eval_lengths):
    """Build a Trainer `compute_metrics` function that scores essays rather than windows.

    Window predictions are combined per essay as a mean weighted by each window's token
    count, clipped to [0.5, 6.5], rounded to an integer score in 1..6 and compared with
    the essay's label using quadratic weighted kappa.

    Args:
        eval_ids: essay id of every evaluation window, in dataset order.
        eval_lengths: token count of every evaluation window, in the same order.

    Returns:
        callable: ``compute(eval_pred) -> {'qwk_round': float}``.
    """
    def compute(eval_pred):
        # reshape(-1) rather than squeeze(): squeeze() turns a single-window evaluation
        # into a 0-d array, which cannot be iterated by zip() below.
        preds = np.asarray(eval_pred.predictions).reshape(-1)
        labels = np.asarray(eval_pred.label_ids).reshape(-1)
        ids = np.array(eval_ids)
        wts = np.array(eval_lengths, dtype=float)
        by_sum, by_w, by_true = defaultdict(float), defaultdict(float), {}
        for p, y, i, w in zip(preds, labels, ids, wts):
            by_sum[i] += float(p) * float(w)
            by_w[i] += float(w)
            by_true[i] = int(y)  # every window of an essay carries the same label
        agg_preds, agg_true = [], []
        for i in by_sum.keys():
            agg_preds.append(by_sum[i] / max(by_w[i], 1e-6))
            agg_true.append(by_true[i])
        agg_preds = np.clip(np.array(agg_preds), 0.5, 6.5)
        agg_true = np.array(agg_true, dtype=int)
        q = cohen_kappa_score(agg_true, np.clip(np.rint(agg_preds), 1, 6).astype(int), weights='quadratic')
        return {'qwk_round': q}
    return compute

def train_fold_seed3(fold, df, folds, out_dir='deberta_v3_base_win512_seed3_llrd_ema',
                     base_lr=2e-5, head_lr_mult=2.0, decay=0.9, ema_decay=0.995):
    """Train one CV fold of the seed-3 model (LLRD optimizer + EMA) and return essay-level OOF predictions.

    Relies on names defined in other notebook cells: ``WindowDataset``,
    ``_log_windows_stats``, ``PadCollator``, ``tokenizer``, ``BATCH_TRAIN``,
    ``BATCH_EVAL`` and ``SEED2``.

    Args:
        fold: fold id used for validation; all other folds are used for training.
        df: training frame with at least ``essay_id`` and ``score`` columns.
        folds: array of fold ids aligned with the rows of ``df``.
        out_dir: root directory for checkpoints (one ``fold{fold}`` subfolder per fold).
        base_lr, head_lr_mult, decay: forwarded to ``build_llrd_param_groups``.
        ema_decay: decay of the ``EMACallback`` shadow weights.

    Returns:
        tuple: ``(va_idx, agg_vec)`` where ``va_idx`` are the validation row positions and
        ``agg_vec`` the token-length-weighted essay predictions in the same order.
    """
    os.makedirs(out_dir, exist_ok=True)
    tr_idx = np.where(folds != fold)[0]
    va_idx = np.where(folds == fold)[0]
    print(f'[Seed3 Fold {fold}] Building datasets...', flush=True)
    dtrain = WindowDataset(df.iloc[tr_idx], labels=df.iloc[tr_idx]['score'].values.astype(np.float32))
    dvalid = WindowDataset(df.iloc[va_idx], labels=df.iloc[va_idx]['score'].values.astype(np.float32))
    _log_windows_stats(f'[Seed3 Fold {fold}] Train', dtrain)
    _log_windows_stats(f'[Seed3 Fold {fold}] Valid', dvalid)
    model = MSDMeanPoolRegressorSeed3(MODEL_NAME_SEED3, msd=5, p=0.2)
    args = TrainingArguments(
        output_dir=f"{out_dir}/fold{fold}",
        learning_rate=base_lr,
        per_device_train_batch_size=max(4, BATCH_TRAIN),
        per_device_eval_batch_size=BATCH_EVAL,
        num_train_epochs=3,
        weight_decay=0.01,
        fp16=True,
        bf16=False,
        evaluation_strategy='steps',
        save_strategy='steps',
        eval_steps=400,
        save_steps=400,
        save_total_limit=1,
        logging_strategy='steps',
        logging_steps=50,
        logging_first_step=True,
        load_best_model_at_end=True,
        metric_for_best_model='qwk_round',
        greater_is_better=True,
        report_to=[],
        disable_tqdm=False,
        dataloader_num_workers=0,
        dataloader_pin_memory=True,
        dataloader_persistent_workers=False,
        group_by_length=True,
        gradient_accumulation_steps=4,
        lr_scheduler_type='cosine',
        warmup_ratio=0.1,
        optim='adamw_torch_fused',
        eval_accumulation_steps=32,
        seed=SEED2+1,
        remove_unused_columns=False,
    )
    # Custom optimizer with LLRD
    pg = build_llrd_param_groups(model, base_lr=base_lr, head_lr_mult=head_lr_mult, decay=decay, weight_decay=0.01)
    class LLRDTrainer(Trainer):
        def create_optimizer(self):
            # Build the optimizer once. The param groups carry their own lr / weight_decay;
            # the keyword values below only act as defaults for groups that omit them.
            if self.optimizer is None:
                self.optimizer = torch.optim.AdamW(
                    pg, lr=base_lr, betas=(0.9, 0.999), eps=1e-8, weight_decay=0.01)
            # Return the optimizer like the base class does.
            return self.optimizer
    ema_cb = EMACallback(ema_decay=ema_decay)
    trainer = LLRDTrainer(
        model=model,
        args=args,
        train_dataset=dtrain,
        eval_dataset=dvalid,
        tokenizer=tokenizer,
        data_collator=PadCollator(),
        compute_metrics=compute_metrics_factory_token_weighted(dvalid.essay_ids, dvalid.lengths),
        callbacks=[PrinterCallback(), ema_cb]
    )
    print(f'[Seed3 Fold {fold}] Start training...', flush=True)
    t0 = time.time()
    trainer.train()
    print(f"[Seed3 Fold {fold}] Train done in {(time.time()-t0)/60:.1f} min", flush=True)
    # Final validation predictions are made with the EMA (shadow) weights swapped in.
    ema_cb.apply_shadow(trainer.model)
    try:
        # reshape(-1) keeps a 1-D array even when there is a single window.
        preds_val = np.asarray(trainer.predict(dvalid).predictions).reshape(-1)
    finally:
        # Always put the raw weights back, even if prediction fails.
        ema_cb.restore(trainer.model)
    ids = np.array(dvalid.essay_ids)
    lens = np.array(dvalid.lengths, dtype=float)
    # Token-length-weighted mean of window predictions per essay.
    by_sum = defaultdict(float); by_w = defaultdict(float)
    for p, i, w in zip(preds_val, ids, lens):
        by_sum[i] += float(p) * float(w); by_w[i] += float(w)
    agg = {i: (by_sum[i] / max(by_w[i], 1e-6)) for i in by_sum.keys()}
    # Re-order to the row order of the validation slice of `df`.
    va_eids = df.iloc[va_idx]['essay_id'].values.tolist()
    agg_vec = np.array([agg[e] for e in va_eids], dtype=float)
    torch.cuda.empty_cache(); gc.collect()
    return va_idx, agg_vec

print('Seed-3 training function ready (LLRD+EMA, token-weighted eval). Execute the driver cell next.', flush=True)

=== Seed-3 prep: LLRD + EMA (READY) | fixes: token-weighted eval, dynamic head, group_by_length, unique class name ===


Seed-3 training function ready (LLRD+EMA, token-weighted eval). Execute the driver cell next.


In [14]:
import os, time, json, math, random, gc, glob
from collections import defaultdict
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from transformers import (
    AutoTokenizer, AutoModel, Trainer, TrainingArguments, DataCollatorWithPadding, EarlyStoppingCallback, PrinterCallback
)
from sklearn.metrics import cohen_kappa_score

# Banner for the v3-large training cell.
print('=== Pivot: DeBERTa-v3-LARGE sliding windows (512/128) + MSD CLS+Mean, 3-fold, token-weighted eval ===', flush=True)
# Allow TF32 in CUDA matmuls and pick the 'high' float32 matmul precision setting.
torch.backends.cuda.matmul.allow_tf32 = True
torch.set_float32_matmul_precision('high')
# Device string for this cell: CUDA when available, otherwise CPU.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
# Re-assert the tokenizer setting that the first notebook cell also sets.
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

SEED_L = 1337
def seed_everything(seed=SEED_L):
    """Seed Python's `random`, NumPy, and torch (CPU and all CUDA devices) with `seed`."""
    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all)
    for apply_seed in seeders:
        apply_seed(seed)
# Seed once at cell execution using the default SEED_L.
seed_everything()

def qwk(y_true, y_pred_int):
    """Return Cohen's kappa with quadratic weights between true and predicted integer scores."""
    kappa = cohen_kappa_score(y_true, y_pred_int, weights='quadratic')
    return kappa

class ThresholdOptimizer:
    """Greedy coordinate search over the five cut points that map continuous predictions to scores 1..6.

    `fit` nudges one threshold at a time by +/- `step`, keeps any move that raises QWK,
    and halves `step` after a full pass without improvement.
    """
    def __init__(self, init_thresholds=None):
        if init_thresholds is None:
            init_thresholds = [1.5,2.5,3.5,4.5,5.5]
        self.thresholds = np.array(init_thresholds, dtype=float)

    def _apply(self, preds):
        # digitize yields bin indices 0..5; shift to the 1..6 score scale.
        return 1 + np.digitize(preds, self.thresholds)

    def fit(self, y_true, preds, iters=200, step=0.05):
        current = self.thresholds.copy()
        current_score = qwk(y_true, self._apply(preds))
        for _ in range(iters):
            moved = False
            for pos in range(5):
                for delta in (-step, step):
                    shifted = current + (np.arange(5) == pos) * delta
                    trial = np.sort(np.clip(shifted, 0.5, 6.5))
                    # Keep only strictly increasing cut points strictly inside (0.5, 6.5).
                    inside = (0.5 < trial[0]) and (trial[4] < 6.5)
                    ordered = (trial[0] < trial[1]) and (trial[1] < trial[2]) and (trial[2] < trial[3]) and (trial[3] < trial[4])
                    if not (inside and ordered):
                        continue
                    trial_score = qwk(y_true, np.digitize(preds, trial) + 1)
                    if trial_score > current_score:
                        current_score, current, moved = trial_score, trial, True
            if moved:
                continue
            # No move helped at this resolution: refine, and stop once the step is tiny.
            step *= 0.5
            if step < 1e-4:
                break
        self.thresholds = current
        return current, current_score

# Checkpoint used for both the tokenizer below and the regressor backbone.
MODEL_NAME = 'microsoft/deberta-v3-large'
# Sliding-window tokenization: passed to the tokenizer as max_length / stride.
MAX_LEN = 512
STRIDE = 128
# Per-device batch sizes and gradient accumulation used by train_fold_v3large
# (BATCH_TRAIN * GRAD_ACCUM = 16 windows per optimizer step per device).
BATCH_TRAIN = 2
GRAD_ACCUM = 8
BATCH_EVAL = 8

# Module-level tokenizer shared by WindowDatasetL and PadCollatorL.
tokenizer_large = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

class WindowDatasetL(Dataset):
    """Sliding-window dataset: each essay is tokenized into one or more overlapping windows.

    Every window keeps the id (and, if given, the label) of the essay it came from plus
    its count of attended tokens, so window predictions can later be aggregated per essay.
    """
    def __init__(self, df, text_col='full_text', labels=None):
        with_labels = labels is not None
        self.essay_ids, self.input_ids, self.attn_masks = [], [], []
        self.lengths = []  # valid token counts per window
        self.labels = [] if with_labels else None
        texts = df[text_col].astype(str).tolist()
        eids = df['essay_id'].tolist()
        lbls = labels.astype(np.float32).tolist() if with_labels else None
        enc = tokenizer_large(
            texts,
            max_length=MAX_LEN,
            stride=STRIDE,
            truncation=True,
            padding=False,
            return_overflowing_tokens=True,
            return_attention_mask=True,
        )
        # Maps every produced window back to the index of its source essay.
        window_to_essay = enc.pop('overflow_to_sample_mapping')
        for w, src in enumerate(window_to_essay):
            window_ids = enc['input_ids'][w]
            window_mask = enc['attention_mask'][w]
            self.essay_ids.append(eids[src])
            self.input_ids.append(window_ids)
            self.attn_masks.append(window_mask)
            self.lengths.append(int(sum(window_mask)))
            if with_labels:
                self.labels.append(lbls[src])
        if with_labels:
            self.labels = np.array(self.labels, dtype=np.float32)

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, i):
        item = {}
        item['input_ids'] = torch.tensor(self.input_ids[i], dtype=torch.long)
        item['attention_mask'] = torch.tensor(self.attn_masks[i], dtype=torch.long)
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[i], dtype=torch.float32)
        # Bookkeeping fields; the collator drops them before padding.
        item['essay_id'] = self.essay_ids[i]
        item['length'] = self.lengths[i]
        return item

class PadCollatorL:
    """Batch collator: drops bookkeeping fields, pads the rest, and re-attaches labels as a flat tensor."""
    def __init__(self):
        self.pad = DataCollatorWithPadding(tokenizer=tokenizer_large, pad_to_multiple_of=8)

    def __call__(self, features):
        # Bookkeeping fields are removed in place before the features reach the padder.
        for feat in features:
            for extra in ('essay_id', 'length'):
                feat.pop(extra, None)
        stacked = None
        if 'labels' in features[0]:
            # Labels are stacked separately and flattened to shape [B].
            stacked = torch.stack([feat['labels'] for feat in features]).view(-1)
            for feat in features:
                del feat['labels']
        batch = self.pad(features)
        if stacked is None:
            return batch
        batch['labels'] = stacked
        return batch

class MSDCLSMeanRegressor(nn.Module):
    """Transformer regressor on the concatenation [first-token state, masked mean] with multi-sample dropout.

    `hidden_size` must match the backbone's hidden width; the head consumes twice that.
    `forward` returns `'logits'` of shape [B, 1] and, when `labels` is given, a
    smooth-L1 `'loss'`.
    """
    def __init__(self, model_name, hidden_size=1024, msd=5, p=0.2):
        super().__init__()
        self.backbone = AutoModel.from_pretrained(model_name)
        dropout_branches = [nn.Dropout(p) for _ in range(msd)]
        self.dropouts = nn.ModuleList(dropout_branches)
        self.head = nn.Linear(2 * hidden_size, 1)  # CLS + mean concat
        self.msd = msd

    def forward(self, input_ids=None, attention_mask=None, labels=None):
        encoded = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
        hidden_states = encoded.last_hidden_state  # [B, T, H]
        first_token = hidden_states[:, 0, :]  # [B, H]
        keep = attention_mask.unsqueeze(-1).float()
        token_total = (hidden_states * keep).sum(dim=1)
        token_count = keep.sum(dim=1).clamp(min=1e-6)
        masked_mean = token_total / token_count  # [B, H]
        features = torch.cat([first_token, masked_mean], dim=-1)  # [B, 2H]
        # One head pass per dropout branch, averaged across branches.
        branch_logits = [self.head(branch(features)).squeeze(-1) for branch in self.dropouts]
        logits = torch.stack(branch_logits, dim=0).mean(dim=0)  # [B]
        if labels is None:
            return {'logits': logits.unsqueeze(-1)}
        loss = torch.nn.functional.smooth_l1_loss(logits, labels, beta=1.0, reduction='mean')
        return {'loss': loss, 'logits': logits.unsqueeze(-1)}

# Globals to align eval aggregation to inference: token-length weighted mean.
# They are assigned by train_fold_v3large before each fold is trained and are read
# lazily each time the metric function runs.
EVAL_IDS = None
EVAL_WTS = None

def make_compute_metrics_token_weighted():
    """Build a Trainer `compute_metrics` function that scores essays rather than windows.

    The returned function reads the module globals ``EVAL_IDS`` (essay id per window) and
    ``EVAL_WTS`` (token count per window) at call time, combines window predictions per
    essay as a token-count-weighted mean, clips to [0.5, 6.5], rounds to 1..6 and reports
    quadratic weighted kappa.

    Returns:
        callable: ``compute(eval_pred) -> {'qwk_round': float}``.
    """
    def compute(eval_pred):
        # reshape(-1) rather than squeeze(): squeeze() turns a single-window evaluation
        # into a 0-d array, which cannot be iterated by zip() below.
        preds = np.asarray(eval_pred.predictions).reshape(-1)
        labels = np.asarray(eval_pred.label_ids).reshape(-1)
        ids = np.array(EVAL_IDS)
        wts = np.array(EVAL_WTS, dtype=float)
        by_sum, by_w = defaultdict(float), defaultdict(float)
        by_true = {}
        for p, y, i, w in zip(preds, labels, ids, wts):
            by_sum[i] += float(p) * float(w)
            by_w[i] += float(w)
            by_true[i] = int(y)  # every window of an essay carries the same label
        agg_preds, agg_true = [], []
        for i in by_sum.keys():
            agg_preds.append(by_sum[i] / max(by_w[i], 1e-6))
            agg_true.append(by_true[i])
        agg_preds = np.clip(np.array(agg_preds), 0.5, 6.5)
        agg_true = np.array(agg_true, dtype=int)
        q = qwk(agg_true, np.clip(np.rint(agg_preds), 1, 6).astype(int))
        return {'qwk_round': q}
    return compute

def train_fold_v3large(fold, df, folds, out_dir='deberta_v3_large_win512'):
    """Train one CV fold of the DeBERTa-v3-large window model and return essay-level OOF predictions.

    Side effect: sets the module globals ``EVAL_IDS`` / ``EVAL_WTS`` to this fold's
    validation windows so the metric function can aggregate per essay.

    Args:
        fold: fold id used for validation; all other folds are used for training.
        df: training frame with ``essay_id``, ``score`` and the text column.
        folds: array of fold ids aligned with the rows of ``df``.
        out_dir: root directory for checkpoints (one ``fold{fold}`` subfolder per fold).

    Returns:
        tuple: ``(va_idx, agg_vec)`` where ``va_idx`` are the validation row positions and
        ``agg_vec`` the token-length-weighted essay predictions in the same order.
    """
    os.makedirs(out_dir, exist_ok=True)
    tr_idx = np.where(folds != fold)[0]
    va_idx = np.where(folds == fold)[0]
    print(f'[v3-large Fold {fold}] Build datasets...', flush=True)
    dtrain = WindowDatasetL(df.iloc[tr_idx], labels=df.iloc[tr_idx]['score'].values.astype(np.float32))
    dvalid = WindowDatasetL(df.iloc[va_idx], labels=df.iloc[va_idx]['score'].values.astype(np.float32))
    print(f'[v3-large Fold {fold}] Train: essays={len(set(dtrain.essay_ids))} windows={len(dtrain)} avg_w/E={len(dtrain)/max(1,len(set(dtrain.essay_ids))):.2f}', flush=True)
    print(f'[v3-large Fold {fold}] Valid: essays={len(set(dvalid.essay_ids))} windows={len(dvalid)} avg_w/E={len(dvalid)/max(1,len(set(dvalid.essay_ids))):.2f}', flush=True)
    model = MSDCLSMeanRegressor(MODEL_NAME, hidden_size=1024, msd=5, p=0.2)
    args = TrainingArguments(
        output_dir=f"{out_dir}/fold{fold}",
        learning_rate=1.5e-5,
        per_device_train_batch_size=BATCH_TRAIN,
        per_device_eval_batch_size=BATCH_EVAL,
        num_train_epochs=3,  # early stopping will cut to ~2-2.5
        weight_decay=0.05,
        fp16=True,
        bf16=False,
        evaluation_strategy='steps',
        save_strategy='steps',
        eval_steps=250,
        save_steps=250,
        save_total_limit=1,
        logging_strategy='steps',
        logging_steps=50,
        logging_first_step=True,
        load_best_model_at_end=True,
        metric_for_best_model='qwk_round',
        greater_is_better=True,
        report_to=[],
        disable_tqdm=False,
        dataloader_num_workers=0,
        dataloader_pin_memory=True,
        dataloader_persistent_workers=False,
        group_by_length=False,
        gradient_accumulation_steps=GRAD_ACCUM,
        lr_scheduler_type='cosine',
        warmup_ratio=0.08,
        optim='adamw_torch_fused',
        eval_accumulation_steps=16,
        seed=SEED_L,
        remove_unused_columns=False,
    )
    # Publish this fold's window->essay mapping for the metric function.
    global EVAL_IDS, EVAL_WTS
    EVAL_IDS = dvalid.essay_ids
    EVAL_WTS = dvalid.lengths
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=dtrain,
        eval_dataset=dvalid,
        tokenizer=tokenizer_large,
        data_collator=PadCollatorL(),
        compute_metrics=make_compute_metrics_token_weighted(),
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2, early_stopping_threshold=0.0), PrinterCallback()]
    )
    t0 = time.time()
    print(f'[v3-large Fold {fold}] Start training...', flush=True)
    trainer.train()
    print(f"[v3-large Fold {fold}] Train done in {(time.time()-t0)/60:.1f} min", flush=True)
    # Predict on valid windows and aggregate with token-length weights.
    # reshape(-1) keeps a 1-D array even when there is a single window, where
    # squeeze() would produce a 0-d array that zip() cannot iterate.
    preds_val = np.asarray(trainer.predict(dvalid).predictions).reshape(-1)
    ids = np.array(dvalid.essay_ids)
    lens = np.array(dvalid.lengths, dtype=float)
    by_sum, by_w = defaultdict(float), defaultdict(float)
    for p, i, w in zip(preds_val, ids, lens):
        by_sum[i] += float(p) * float(w)
        by_w[i] += float(w)
    agg = {i: (by_sum[i] / max(by_w[i], 1e-6)) for i in by_sum.keys()}
    # Re-order to the row order of the validation slice of `df`.
    va_eids = df.iloc[va_idx]['essay_id'].values.tolist()
    agg_vec = np.array([agg[e] for e in va_eids], dtype=float)
    torch.cuda.empty_cache(); gc.collect()
    return va_idx, agg_vec

# Driver: run only 3 folds to fit time budget
t0 = time.time()
train_df = pd.read_csv('train.csv')
# Prefer the k16 grouped folds; fall back to the generic grouped folds file.
folds_g2 = pd.read_csv('folds_grouped_k16.csv') if os.path.exists('folds_grouped_k16.csv') else pd.read_csv('folds_grouped.csv').rename(columns={'fold_grouped':'fold_grouped_k16'})
folds_map2 = dict(zip(folds_g2['essay_id'], folds_g2[[c for c in folds_g2.columns if 'fold_grouped' in c][0]]))
folds_arr = train_df['essay_id'].map(folds_map2).values.astype(int)

unique_folds = sorted(np.unique(folds_arr))[:3]
print('Using folds:', unique_folds, flush=True)
oof_l = np.zeros(len(train_df), dtype=float)
# Tracks which rows actually received an out-of-fold prediction. Only a subset of the
# folds is run, so the remaining rows stay at their 0.0 placeholder.
oof_filled = np.zeros(len(train_df), dtype=bool)
for f in unique_folds:
    f_start = time.time()
    va_idx, agg_preds = train_fold_v3large(f, train_df, folds_arr, out_dir='deberta_v3_large_win512')
    oof_l[va_idx] = agg_preds
    oof_filled[va_idx] = True
    y_true = train_df.iloc[va_idx]['score'].values.astype(int)
    q = qwk(y_true, np.clip(np.rint(np.clip(agg_preds, 0.5, 6.5)), 1, 6).astype(int))
    print(f"[v3-large Fold {f}] val round-QWK={q:.4f} elapsed={(time.time()-f_start)/60:.1f} min", flush=True)
    torch.cuda.empty_cache(); gc.collect()

# The saved array keeps its full length (0.0 where no fold was run).
np.save('oof_deberta_v3_large_win512.npy', oof_l)
# Score and fit thresholds ONLY on rows that were predicted; including the placeholder
# rows would treat them as score-1 predictions and distort both QWK and the thresholds.
y_scored = train_df['score'].values.astype(int)[oof_filled]
oof_scored = oof_l[oof_filled]
opt = ThresholdOptimizer()
best_th, best_oof_qwk = opt.fit(y_scored, np.clip(oof_scored, 0.5, 6.5))
round_q = qwk(y_scored, np.clip(np.rint(oof_scored), 1, 6).astype(int))
print(f'OOF (partial 3-fold) round-QWK={round_q:.5f}  thresh-QWK={best_oof_qwk:.5f} thresholds={best_th}', flush=True)
with open('thresholds_deberta_v3_large_win512.json','w') as f:
    json.dump({'thresholds': best_th.tolist(), 'oof_qwk': float(best_oof_qwk), 'round_oof_qwk': float(round_q)}, f)
print(f'=== v3-large 3-fold run done in {(time.time()-t0)/60:.1f} min ===', flush=True)
print('Next: run inference and blend with TF-IDF; then calibrate global thresholds.', flush=True)

=== Pivot: DeBERTa-v3-LARGE sliding windows (512/128) + MSD CLS+Mean, 3-fold, token-weighted eval ===




Using folds: [0, 1, 2]


[v3-large Fold 0] Build datasets...


[v3-large Fold 0] Train: essays=13166 windows=17322 avg_w/E=1.32


[v3-large Fold 0] Valid: essays=2410 windows=2961 avg_w/E=1.23


[v3-large Fold 0] Start training...




Step,Training Loss,Validation Loss


{'loss': 2.4715, 'grad_norm': 36.33293914794922, 'learning_rate': 5.7692307692307695e-08, 'epoch': 0.0009236808682600162}


{'loss': 1.9256, 'grad_norm': 6.323685646057129, 'learning_rate': 2.884615384615385e-06, 'epoch': 0.04618404341300081}


{'loss': 0.4093, 'grad_norm': 11.742685317993164, 'learning_rate': 5.76923076923077e-06, 'epoch': 0.09236808682600162}


{'loss': 0.3503, 'grad_norm': 5.3757452964782715, 'learning_rate': 8.653846153846153e-06, 'epoch': 0.13855213023900242}


{'loss': 0.3012, 'grad_norm': 5.670287609100342, 'learning_rate': 1.153846153846154e-05, 'epoch': 0.18473617365200323}


{'loss': 0.2853, 'grad_norm': 12.33273696899414, 'learning_rate': 1.4423076923076924e-05, 'epoch': 0.23092021706500404}


{'eval_loss': 0.24509067833423615, 'eval_qwk_round': 0.6841542448418836, 'eval_runtime': 1161.5732, 'eval_samples_per_second': 2.549, 'eval_steps_per_second': 0.319, 'epoch': 0.23092021706500404}


{'loss': 0.2538, 'grad_norm': 5.509453296661377, 'learning_rate': 1.4993359400471464e-05, 'epoch': 0.27710426047800485}


KeyboardInterrupt: 

In [None]:
import os, glob, json, gc, time
from collections import defaultdict
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModel, Trainer, TrainingArguments, DataCollatorWithPadding
from sklearn.metrics import cohen_kappa_score

print('=== Inference + Blending: v3-large + TF-IDF ===', flush=True)

def qwk(y_true, y_pred_int):
    """Return Cohen's kappa with quadratic weights between true and predicted integer scores."""
    kappa = cohen_kappa_score(y_true, y_pred_int, weights='quadratic')
    return kappa

class ThresholdOptimizer:
    """Greedy coordinate search over the five cut points that map continuous predictions to scores 1..6.

    `fit` nudges one threshold at a time by +/- `step`, keeps any move that raises QWK,
    and halves `step` after a full pass without improvement.
    """
    def __init__(self, init_thresholds=None):
        if init_thresholds is None:
            init_thresholds = [1.5,2.5,3.5,4.5,5.5]
        self.thresholds = np.array(init_thresholds, dtype=float)

    def _apply(self, preds):
        # digitize yields bin indices 0..5; shift to the 1..6 score scale.
        return 1 + np.digitize(preds, self.thresholds)

    def fit(self, y_true, preds, iters=200, step=0.05):
        current = self.thresholds.copy()
        current_score = qwk(y_true, self._apply(preds))
        for _ in range(iters):
            moved = False
            for pos in range(5):
                for delta in (-step, step):
                    shifted = current + (np.arange(5) == pos) * delta
                    trial = np.sort(np.clip(shifted, 0.5, 6.5))
                    # Keep only strictly increasing cut points strictly inside (0.5, 6.5).
                    inside = (0.5 < trial[0]) and (trial[4] < 6.5)
                    ordered = (trial[0] < trial[1]) and (trial[1] < trial[2]) and (trial[2] < trial[3]) and (trial[3] < trial[4])
                    if not (inside and ordered):
                        continue
                    trial_score = qwk(y_true, np.digitize(preds, trial) + 1)
                    if trial_score > current_score:
                        current_score, current, moved = trial_score, trial, True
            if moved:
                continue
            # No move helped at this resolution: refine, and stop once the step is tiny.
            step *= 0.5
            if step < 1e-4:
                break
        self.thresholds = current
        return current, current_score

# Test-time inference for v3-large CLS+Mean MSD model
# These mirror the values used in the v3-large training cell (same checkpoint name,
# window length, stride, eval batch size and checkpoint directory).
MODEL_NAME_L = 'microsoft/deberta-v3-large'
MAX_LEN_L = 512
STRIDE_L = 128
BATCH_EVAL_L = 8
# Root directory holding one `fold*` checkpoint folder per trained fold.
OUT_DIR_L = 'deberta_v3_large_win512'
# Tokenizer shared by WindowDatasetTestL and PadCollatorTTL.
tokenizer_inf_l = AutoTokenizer.from_pretrained(MODEL_NAME_L, use_fast=True)

class WindowDatasetTestL(Dataset):
    """Label-free sliding-window dataset for inference.

    Each essay becomes one or more overlapping windows; every window remembers its
    essay id and attended-token count for per-essay aggregation.
    """
    def __init__(self, df, text_col='full_text'):
        self.essay_ids, self.input_ids = [], []
        self.attn_masks, self.lengths = [], []
        texts = df[text_col].astype(str).tolist()
        eids = df['essay_id'].tolist()
        enc = tokenizer_inf_l(
            texts,
            max_length=MAX_LEN_L,
            stride=STRIDE_L,
            truncation=True,
            padding=False,
            return_overflowing_tokens=True,
            return_attention_mask=True,
        )
        # Maps every produced window back to the index of its source essay.
        window_to_essay = enc.pop('overflow_to_sample_mapping')
        for w, src in enumerate(window_to_essay):
            window_ids = enc['input_ids'][w]
            window_mask = enc['attention_mask'][w]
            self.essay_ids.append(eids[src])
            self.input_ids.append(window_ids)
            self.attn_masks.append(window_mask)
            self.lengths.append(int(sum(window_mask)))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, i):
        ids_tensor = torch.tensor(self.input_ids[i], dtype=torch.long)
        mask_tensor = torch.tensor(self.attn_masks[i], dtype=torch.long)
        # essay_id / length are bookkeeping fields; the collator drops them before padding.
        return {
            'input_ids': ids_tensor,
            'attention_mask': mask_tensor,
            'essay_id': self.essay_ids[i],
            'length': self.lengths[i],
        }

class PadCollatorTTL:
    """Inference collator: strips the bookkeeping fields, then pads the remaining tensors."""
    def __init__(self):
        self.pad = DataCollatorWithPadding(tokenizer=tokenizer_inf_l, pad_to_multiple_of=8)

    def __call__(self, features):
        # Bookkeeping fields are removed in place before the features reach the padder.
        for feat in features:
            for extra in ('essay_id', 'length'):
                feat.pop(extra, None)
        return self.pad(features)

class MSDCLSMeanRegressor(nn.Module):
    """Inference-only variant of the [first-token state, masked mean] multi-sample-dropout regressor.

    `forward` accepts `labels` for signature compatibility but ignores it and never
    returns a loss; the output is `{'logits': [B, 1]}`.
    """
    def __init__(self, model_name, hidden_size=1024, msd=5, p=0.2):
        super().__init__()
        self.backbone = AutoModel.from_pretrained(model_name)
        dropout_branches = [nn.Dropout(p) for _ in range(msd)]
        self.dropouts = nn.ModuleList(dropout_branches)
        self.head = nn.Linear(2 * hidden_size, 1)
        self.msd = msd

    def forward(self, input_ids=None, attention_mask=None, labels=None):
        encoded = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
        hidden_states = encoded.last_hidden_state
        first_token = hidden_states[:, 0, :]
        keep = attention_mask.unsqueeze(-1).float()
        token_total = (hidden_states * keep).sum(dim=1)
        token_count = keep.sum(dim=1).clamp(min=1e-6)
        features = torch.cat([first_token, token_total / token_count], dim=-1)
        # Running sum over the dropout branches, divided by the branch count at the end.
        running = 0.0
        for branch in self.dropouts:
            running = running + self.head(branch(features)).squeeze(-1)
        averaged = running / float(self.msd)
        return {'logits': averaged.unsqueeze(-1)}

def load_best_subdir(folder):
    """Return the checkpoint directory inside `folder` to load weights from.

    Resolution order:
      1. the checkpoint named by ``best_model_checkpoint`` in the newest checkpoint's
         ``trainer_state.json``, if that directory exists;
      2. otherwise the ``checkpoint-<step>`` directory with the highest step;
      3. otherwise `folder` itself (no checkpoints found).

    Previously the highest-step checkpoint was always returned. When the most recent
    checkpoint is kept alongside the best one, that is the last checkpoint rather than
    the best. Directories whose suffix is not a number also made the step sort raise
    ``ValueError``; they are now ignored.
    """
    numbered = []
    for path in glob.glob(os.path.join(folder, 'checkpoint-*')):
        suffix = path.rsplit('-', 1)[-1]
        if suffix.isdigit() and os.path.isdir(path):
            numbered.append((int(suffix), path))
    if not numbered:
        return folder
    numbered.sort()
    latest = numbered[-1][1]

    # The newest checkpoint's saved trainer state records which checkpoint scored best.
    best_recorded = None
    try:
        with open(os.path.join(latest, 'trainer_state.json')) as fh:
            state = json.load(fh)
        if isinstance(state, dict):
            best_recorded = state.get('best_model_checkpoint')
    except (OSError, ValueError):
        best_recorded = None  # missing or unreadable state: fall back to the latest

    if best_recorded:
        # Prefer the same-named directory under `folder` so a relocated run still resolves.
        relocated = os.path.join(folder, os.path.basename(os.path.normpath(best_recorded)))
        for candidate in (relocated, best_recorded):
            if os.path.isdir(candidate):
                return candidate
    return latest

def predict_fold_large(folder, dtest):
    """Predict every window of `dtest` with one fold's checkpoint and aggregate per essay.

    Args:
        folder: fold directory; the checkpoint inside it is chosen by ``load_best_subdir``.
        dtest: window dataset exposing ``essay_ids`` and ``lengths`` aligned with its items.

    Returns:
        dict: essay id -> token-length-weighted mean of its window predictions.
    """
    best_dir = load_best_subdir(folder)
    model = MSDCLSMeanRegressor(MODEL_NAME_L, hidden_size=1024, msd=5, p=0.2)
    sd_path = os.path.join(best_dir, 'pytorch_model.bin')
    model.load_state_dict(torch.load(sd_path, map_location='cpu'))
    args = TrainingArguments(output_dir=os.path.join(folder, 'tmp_infer'), per_device_eval_batch_size=BATCH_EVAL_L,
                             dataloader_num_workers=2, dataloader_pin_memory=True, report_to=[], fp16=True)
    trainer = Trainer(model=model, args=args, tokenizer=tokenizer_inf_l, data_collator=PadCollatorTTL())
    # reshape(-1) keeps a 1-D array even when the dataset holds a single window, where
    # squeeze() would produce a 0-d array that zip() cannot iterate.
    preds = np.asarray(trainer.predict(dtest).predictions).reshape(-1)
    ids = np.array(dtest.essay_ids)
    lens = np.array(dtest.lengths, dtype=float)
    by_sum, by_w = defaultdict(float), defaultdict(float)
    for p, i, w in zip(preds, ids, lens):
        by_sum[i] += float(p) * float(w)
        by_w[i] += float(w)
    agg = {i: (by_sum[i] / max(by_w[i], 1e-6)) for i in by_sum.keys()}
    return agg

def run_test_inference_large():
    """Run every trained v3-large fold on `test.csv`, average the folds and save the result.

    Writes ``test_deberta_v3_large_win512.npy`` with one prediction per test row,
    clipped to [0.5, 6.5], in the row order of ``test.csv``.

    Returns:
        np.ndarray: the saved fold-averaged predictions.

    Raises:
        FileNotFoundError: if no ``fold*`` directory exists under ``OUT_DIR_L``.
    """
    test_df = pd.read_csv('test.csv')
    dtest = WindowDatasetTestL(test_df)
    fold_dirs = sorted([p for p in glob.glob(os.path.join(OUT_DIR_L, 'fold*')) if os.path.isdir(p)])
    if not fold_dirs:
        # Averaging zero folds yields NaN, which would be saved and later binned as the
        # top score for every essay. Fail loudly instead.
        raise FileNotFoundError(f'No fold directories found under {OUT_DIR_L!r}; train the model first')
    all_fold_preds = []
    for fd in fold_dirs:
        t1 = time.time()
        agg = predict_fold_large(fd, dtest)
        all_fold_preds.append(agg)
        print(f'[v3-large inference] {fd} done in {(time.time()-t1)/60:.1f} min', flush=True)
        torch.cuda.empty_cache(); gc.collect()
    # Align every fold's essay->prediction mapping to the test row order, then average.
    test_ids_order = test_df['essay_id'].tolist()
    preds_mat = [[agg[e] for e in test_ids_order] for agg in all_fold_preds]
    preds_mean = np.mean(np.array(preds_mat), axis=0)
    preds_mean = np.clip(preds_mean, 0.5, 6.5)
    np.save('test_deberta_v3_large_win512.npy', preds_mean)
    print('[v3-large inference] Saved test_deberta_v3_large_win512.npy', flush=True)
    return preds_mean

def optimize_blend_and_submit():
    """Fit the v3-large / TF-IDF blend weight and score thresholds on OOF, then write `submission.csv`.

    Reads ``train.csv``, ``oof_deberta_v3_large_win512.npy``, ``oof_tfidf.npy``,
    ``test_tfidf.npy`` and ``test.csv``; uses ``test_deberta_v3_large_win512.npy`` when it
    exists, otherwise runs test inference. Writes ``blend_params_large_tfidf.json`` and
    ``submission.csv``.

    Raises:
        AssertionError: if the v3-large OOF file is missing.
        ValueError: if the v3-large OOF file contains no predicted rows.
    """
    train_df = pd.read_csv('train.csv')
    y_true = train_df['score'].values.astype(int)
    # Load OOFs
    oof_large = np.load('oof_deberta_v3_large_win512.npy') if os.path.exists('oof_deberta_v3_large_win512.npy') else None
    oof_tfidf = np.load('oof_tfidf.npy')
    assert oof_large is not None, 'oof_deberta_v3_large_win512.npy not found'
    # The v3-large OOF array is initialised with zeros and only filled for the folds that
    # were actually trained. Rows still at exactly 0.0 carry no prediction and must not
    # take part in fitting the blend weight or the thresholds.
    covered = np.isfinite(oof_large) & (oof_large != 0.0)
    if not covered.any():
        raise ValueError('oof_deberta_v3_large_win512.npy contains no predicted rows')
    print(f'[Blend] Fitting on {int(covered.sum())}/{len(covered)} rows with v3-large OOF predictions', flush=True)
    y_fit = y_true[covered]
    oof_large = np.clip(oof_large[covered], 0.5, 6.5)
    oof_tfidf = np.clip(oof_tfidf[covered], 0.5, 6.5)
    # Grid search blend weight
    best = (-1.0, 0.0, [1.5,2.5,3.5,4.5,5.5])  # (qwk, w, th)
    for w in np.linspace(0.6, 0.95, 15):
        blend = w * oof_large + (1.0 - w) * oof_tfidf
        opt = ThresholdOptimizer()
        th, q = opt.fit(y_fit, blend.copy(), iters=200, step=0.05)
        if q > best[0]:
            best = (q, float(w), th)
    best_q, best_w, best_th = best
    print(f'[Blend] Best OOF thresh-QWK={best_q:.5f} at w={best_w:.3f} thresholds={best_th}', flush=True)
    with open('blend_params_large_tfidf.json','w') as f:
        json.dump({'weight_large': best_w, 'thresholds': [float(x) for x in best_th], 'oof_qwk': float(best_q)}, f)
    # Test preds
    if os.path.exists('test_deberta_v3_large_win512.npy'):
        test_large = np.load('test_deberta_v3_large_win512.npy')
    else:
        test_large = run_test_inference_large()
    test_tfidf = np.load('test_tfidf.npy')
    test_blend = best_w * test_large + (1.0 - best_w) * test_tfidf
    test_blend = np.clip(test_blend, 0.5, 6.5)
    # Bin the blended score with the fitted thresholds into integer scores 1..6.
    th = np.array(best_th, dtype=float)
    labels = np.digitize(test_blend, th) + 1
    labels = np.clip(labels, 1, 6).astype(int)
    sub = pd.DataFrame({'essay_id': pd.read_csv('test.csv')['essay_id'], 'score': labels})
    sub.to_csv('submission.csv', index=False)
    print('Saved submission.csv (blended v3-large + TF-IDF)', flush=True)

print('=== Inference+Blend cell ready. After training finishes, run optimize_blend_and_submit() ===', flush=True)

In [24]:
import time, json, numpy as np, pandas as pd
from sklearn.metrics import cohen_kappa_score
from collections import defaultdict

# Banner for the cell log.
# NOTE(review): the text says "3 folds", but `use_folds` later in this cell is [2] —
# confirm which is intended before relying on the log.
print('=== Seed-3 RUN: DeBERTa-v3-base (LLRD+EMA), 3 folds (k16), sliding 512/128 ===', flush=True)

def qwk(y_true, y_pred_int):
    """Return Cohen's kappa with quadratic weights between true and predicted integer labels."""
    kappa = cohen_kappa_score(y_true, y_pred_int, weights='quadratic')
    return kappa

class ThresholdOptimizer:
    """Coordinate-wise search for five cut points that maximise QWK.

    Continuous predictions become labels 1..6 via
    ``np.digitize(preds, thresholds) + 1``.
    """

    def __init__(self, init_thresholds=None):
        # Without an explicit start, use the midpoints between adjacent integer scores.
        start = [1.5,2.5,3.5,4.5,5.5] if init_thresholds is None else init_thresholds
        self.thresholds = np.array(start, dtype=float)

    def _apply(self, preds):
        """Bucket ``preds`` with the thresholds currently stored on the instance."""
        buckets = np.digitize(preds, self.thresholds)
        return buckets + 1

    def fit(self, y_true, preds, iters=200, step=0.05):
        """Nudge one threshold at a time by ``-step`` / ``+step``, keeping moves that raise QWK.

        A sweep over all five thresholds that finds no improvement halves
        ``step``; the search stops after ``iters`` sweeps or once ``step`` drops
        below 1e-4. The result is stored on ``self.thresholds`` and returned as
        ``(thresholds, best_qwk)``.
        """
        current = self.thresholds.copy()
        current_score = qwk(y_true, self._apply(preds))
        for _sweep in range(iters):
            moved = False
            for pos in range(5):
                for delta in (-step, step):
                    shift = np.zeros(5)
                    shift[pos] = delta
                    trial = np.sort(np.clip(current + shift, 0.5, 6.5))
                    # Candidates must stay strictly increasing and strictly inside (0.5, 6.5).
                    strictly_inside = (
                        0.5 < trial[0]
                        and bool(np.all(trial[:-1] < trial[1:]))
                        and trial[4] < 6.5
                    )
                    if not strictly_inside:
                        continue
                    trial_score = qwk(y_true, np.digitize(preds, trial) + 1)
                    if trial_score > current_score:
                        current_score, current, moved = trial_score, trial, True
            if moved:
                continue
            step *= 0.5
            if step < 1e-4:
                break
        self.thresholds = current
        return current, current_score

# Local, unambiguous version of seed-3 fold trainer to avoid class collisions
def train_fold_seed3_local(fold, df, folds, out_dir='deberta_v3_base_win512_seed3_llrd_ema',
                           base_lr=2e-5, head_lr_mult=2.5, decay=0.9, ema_decay=0.995):
    """Train one CV fold of the seed-3 DeBERTa-v3-base regressor and return its validation predictions.

    Relies on names defined in other cells: ``WindowDataset``, ``_log_windows_stats``,
    ``MSDMeanPoolRegressorSeed3``, ``build_llrd_param_groups``, ``EMACallback``,
    ``PadCollator``, ``compute_metrics_factory_token_weighted``, ``tokenizer``,
    ``BATCH_TRAIN``, ``BATCH_EVAL`` and ``SEED2``.

    Args:
        fold: Fold id used for validation; all other rows are used for training.
        df: Training frame; the ``score`` and ``essay_id`` columns are read here.
        folds: Array of fold ids, positionally aligned with ``df``.
        out_dir: Root output directory; this fold writes to ``{out_dir}/fold{fold}``.
        base_lr: Base learning rate, passed to both the param-group builder and AdamW.
        head_lr_mult: Forwarded to ``build_llrd_param_groups``.
        decay: Forwarded to ``build_llrd_param_groups``.
        ema_decay: Forwarded to ``EMACallback``.

    Returns:
        ``(va_idx, agg_vec)``: positional indices of the validation rows in ``df``
        and one prediction per validation essay, in the same order. Each essay
        prediction is the mean of its window predictions weighted by
        ``dvalid.lengths``.
    """
    import os, time, gc, torch
    from transformers import Trainer, TrainingArguments, PrinterCallback, EarlyStoppingCallback
    # Build datasets from global WindowDataset
    os.makedirs(out_dir, exist_ok=True)
    # Positional split: rows whose fold id differs from `fold` train, the rest validate.
    tr_idx = np.where(folds != fold)[0]
    va_idx = np.where(folds == fold)[0]
    print(f'[Seed3 Fold {fold}] Building datasets...', flush=True)
    dtrain = WindowDataset(df.iloc[tr_idx], labels=df.iloc[tr_idx]['score'].values.astype(np.float32))
    dvalid = WindowDataset(df.iloc[va_idx], labels=df.iloc[va_idx]['score'].values.astype(np.float32))
    _log_windows_stats(f'[Seed3 Fold {fold}] Train', dtrain)
    _log_windows_stats(f'[Seed3 Fold {fold}] Valid', dvalid)
    # Build model using the unique Seed3 class defined in cell 13
    # NOTE(review): a class with the same name is also defined in the seed-3 inference cell;
    # whichever cell ran last decides which definition is used here — confirm before training.
    model = MSDMeanPoolRegressorSeed3('microsoft/deberta-v3-base', msd=5, p=0.2)
    # LLRD param groups from global function
    pg = build_llrd_param_groups(model, base_lr=base_lr, head_lr_mult=head_lr_mult, decay=decay, weight_decay=0.01)
    class LLRDTrainer(Trainer):
        # Overrides optimizer creation so AdamW is built from the param groups in `pg`
        # (captured from the enclosing function) instead of Trainer's default grouping.
        def create_optimizer(self):
            if self.optimizer is not None:
                return
            optim_kwargs = {'lr': base_lr, 'betas': (0.9, 0.999), 'eps': 1e-8, 'weight_decay': 0.01}
            # Both branches construct torch's AdamW; the fallback only differs in how the name is reached.
            try:
                from torch.optim import AdamW
                self.optimizer = AdamW(pg, **optim_kwargs)
            except Exception:
                self.optimizer = torch.optim.AdamW(pg, **optim_kwargs)
    args = TrainingArguments(
        output_dir=f"{out_dir}/fold{fold}",
        learning_rate=base_lr,
        per_device_train_batch_size=max(4, BATCH_TRAIN),
        per_device_eval_batch_size=BATCH_EVAL,
        num_train_epochs=3,
        weight_decay=0.01,
        fp16=True,
        bf16=False,
        evaluation_strategy='steps',
        save_strategy='steps',
        eval_steps=400,
        save_steps=400,
        save_total_limit=1,
        logging_strategy='steps',
        logging_steps=50,
        logging_first_step=True,
        load_best_model_at_end=True,
        metric_for_best_model='qwk_round',
        greater_is_better=True,
        report_to=[],
        disable_tqdm=False,
        dataloader_num_workers=0,
        dataloader_pin_memory=True,
        dataloader_persistent_workers=False,
        group_by_length=True,
        gradient_accumulation_steps=4,
        lr_scheduler_type='cosine',
        warmup_ratio=0.1,
        # NOTE(review): create_optimizer is overridden above, so this setting presumably
        # does not decide which optimizer is built — confirm.
        optim='adamw_torch_fused',
        eval_accumulation_steps=32,
        seed=SEED2+1,
        remove_unused_columns=False,
        max_grad_norm=1.0,
    )
    ema_cb = EMACallback(ema_decay=ema_decay)
    trainer = LLRDTrainer(
        model=model,
        args=args,
        train_dataset=dtrain,
        eval_dataset=dvalid,
        tokenizer=tokenizer,
        data_collator=PadCollator(),
        compute_metrics=compute_metrics_factory_token_weighted(dvalid.essay_ids, dvalid.lengths),
        callbacks=[PrinterCallback(), ema_cb, EarlyStoppingCallback(early_stopping_patience=1, early_stopping_threshold=0.0)]
    )
    print(f'[Seed3 Fold {fold}] Start training...', flush=True)
    t0 = time.time()
    trainer.train()
    print(f"[Seed3 Fold {fold}] Train done in {(time.time()-t0)/60:.1f} min", flush=True)
    # EMA shadow for final predict
    # The shadow weights are swapped in only for this predict call and swapped back right after.
    ema_cb.apply_shadow(trainer.model)
    # NOTE(review): .squeeze() yields a 0-d array if the validation set has exactly one window,
    # which would break the zip below.
    preds_val = trainer.predict(dvalid).predictions.squeeze()
    ema_cb.restore(trainer.model)
    ids = np.array(dvalid.essay_ids)
    lens = np.array(dvalid.lengths, dtype=float)
    # Length-weighted mean of window predictions per essay.
    by_sum = defaultdict(float); by_w = defaultdict(float)
    for p, i, w in zip(preds_val, ids, lens):
        by_sum[i] += float(p) * float(w); by_w[i] += float(w)
    agg = {i: (by_sum[i] / max(by_w[i], 1e-6)) for i in by_sum.keys()}
    # Re-order the per-essay predictions to match the row order of df.iloc[va_idx].
    va_idx_list = df.iloc[va_idx]['essay_id'].values.tolist()
    agg_vec = np.array([agg[e] for e in va_idx_list], dtype=float)
    torch.cuda.empty_cache(); gc.collect()
    return va_idx, agg_vec

# Use existing folds (k=16 grouped), run only fold 2 with higher LR after poor fold 1
t0 = time.time()
train_df = pd.read_csv('train.csv')
folds_g2 = pd.read_csv('folds_grouped_k16.csv')
folds_map2 = dict(zip(folds_g2['essay_id'], folds_g2['fold_grouped_k16']))
# An essay missing from the folds file maps to NaN; casting NaN to int yields a garbage
# fold id instead of an error, so fail loudly here.
fold_lookup = train_df['essay_id'].map(folds_map2)
if fold_lookup.isna().any():
    raise ValueError(f'{int(fold_lookup.isna().sum())} train essays have no fold in folds_grouped_k16.csv')
folds_arr = fold_lookup.values.astype(int)
# Run only fold 2 now, with base_lr bumped to 2.8e-5
use_folds = [2]
print('Using folds:', use_folds, '(base_lr=2.8e-5 for this run)', flush=True)

oof_seed3 = np.zeros(len(train_df), dtype=float)
# Tracks which rows actually received a prediction, so the partial-OOF metrics below
# are not computed on the zero placeholders of folds that were never run.
has_pred = np.zeros(len(train_df), dtype=bool)
for f in use_folds:
    f_start = time.time()
    va_idx, agg_preds = train_fold_seed3_local(f, train_df, folds_arr, out_dir='deberta_v3_base_win512_seed3_llrd_ema',
                                               base_lr=2.8e-5, head_lr_mult=2.5, decay=0.9, ema_decay=0.995)
    oof_seed3[va_idx] = agg_preds
    has_pred[va_idx] = True
    y_true = train_df.iloc[va_idx]['score'].values.astype(int)
    q = qwk(y_true, np.clip(np.rint(np.clip(agg_preds, 0.5, 6.5)), 1, 6).astype(int))
    print(f"[Seed3 Fold {f}] val round-QWK={q:.4f} elapsed={(time.time()-f_start)/60:.1f} min", flush=True)

# The saved array keeps zeros for rows without predictions (same file layout as before).
np.save('oof_deberta_v3_base_win512_seed3_llrd_ema.npy', oof_seed3)
# Fit thresholds and report QWK only on rows that were validated in this run.
y_scored = train_df['score'].values.astype(int)[has_pred]
oof_scored = oof_seed3[has_pred]
opt = ThresholdOptimizer()
best_th, best_oof_qwk = opt.fit(y_scored, np.clip(oof_scored, 0.5, 6.5))
round_q = qwk(y_scored, np.clip(np.rint(oof_scored), 1, 6).astype(int))
print(f'OOF (partial) round-QWK={round_q:.5f}  thresh-QWK={best_oof_qwk:.5f} thresholds={best_th}', flush=True)
with open('thresholds_deberta_v3_base_win512_seed3_llrd_ema.json','w') as f:
    json.dump({'thresholds': best_th.tolist(), 'oof_qwk': float(best_oof_qwk), 'round_oof_qwk': float(round_q)}, f)
print(f'=== Seed-3 partial run done in {(time.time()-t0)/60:.1f} min ===', flush=True)

=== Seed-3 RUN: DeBERTa-v3-base (LLRD+EMA), 3 folds (k16), sliding 512/128 ===


Using folds: [2] (base_lr=2.8e-5 for this run)


[Seed3 Fold 2] Building datasets...


[Seed3 Fold 2] Train: essays=13306 windows=17042 avg_windows_per_essay=1.28


[Seed3 Fold 2] Valid: essays=2270 windows=3241 avg_windows_per_essay=1.43


[Seed3 Fold 2] Start training...




Step,Training Loss,Validation Loss


{'loss': 4.4972, 'grad_norm': 28.20829200744629, 'learning_rate': 2.2241325997878757e-08, 'epoch': 0.0009387467730579676}


{'loss': 2.8793, 'grad_norm': 28.798147201538086, 'learning_rate': 1.1120662998939378e-06, 'epoch': 0.04693733865289838}


{'loss': 0.6369, 'grad_norm': 23.43297576904297, 'learning_rate': 2.2241325997878755e-06, 'epoch': 0.09387467730579677}


{'loss': 0.3431, 'grad_norm': 6.04536247253418, 'learning_rate': 3.336198899681813e-06, 'epoch': 0.14081201595869514}


{'loss': 0.361, 'grad_norm': 3.446148633956909, 'learning_rate': 4.448265199575751e-06, 'epoch': 0.18774935461159353}


{'loss': 0.2832, 'grad_norm': 4.831424713134766, 'learning_rate': 5.560331499469689e-06, 'epoch': 0.2346866932644919}


{'loss': 0.2854, 'grad_norm': 13.580657005310059, 'learning_rate': 6.672397799363626e-06, 'epoch': 0.28162403191739027}


{'loss': 0.3858, 'grad_norm': 17.343822479248047, 'learning_rate': 7.115312361416194e-06, 'epoch': 0.32856137057028867}


{'loss': 0.3865, 'grad_norm': 10.519705772399902, 'learning_rate': 7.1036356135517416e-06, 'epoch': 0.37549870922318707}


{'eval_loss': 0.4877200126647949, 'eval_qwk_round': 0.38282240052751304, 'eval_runtime': 402.4138, 'eval_samples_per_second': 8.054, 'eval_steps_per_second': 1.009, 'epoch': 0.37549870922318707}


KeyboardInterrupt: 

In [22]:
# Rebinds the module-level STRIDE; only datasets built after this cell runs can see the new value.
# NOTE(review): presumably WindowDataset reads this global when it tokenizes — confirm against its definition.
STRIDE = 128
print('Sliding-window STRIDE reset to 128 for subsequent dataset builds.', flush=True)

Sliding-window STRIDE reset to 128 for subsequent dataset builds.


In [None]:
import os, glob, json, time, gc
from collections import defaultdict
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModel, Trainer, TrainingArguments, DataCollatorWithPadding
from sklearn.metrics import cohen_kappa_score

# Banner for the cell log; this cell only defines the seed-3 inference and blend helpers.
print('=== Seed-3 Inference (token-weighted) + Blend with TF-IDF ===', flush=True)

def qwk(y_true, y_pred_int):
    """Return Cohen's kappa with quadratic weights between true and predicted integer labels."""
    kappa = cohen_kappa_score(y_true, y_pred_int, weights='quadratic')
    return kappa

class ThresholdOptimizer:
    """Coordinate-wise search for five cut points that maximise QWK.

    Continuous predictions become labels 1..6 via
    ``np.digitize(preds, thresholds) + 1``.
    """

    def __init__(self, init_thresholds=None):
        # Without an explicit start, use the midpoints between adjacent integer scores.
        start = [1.5,2.5,3.5,4.5,5.5] if init_thresholds is None else init_thresholds
        self.thresholds = np.array(start, dtype=float)

    def _apply(self, preds):
        """Bucket ``preds`` with the thresholds currently stored on the instance."""
        buckets = np.digitize(preds, self.thresholds)
        return buckets + 1

    def fit(self, y_true, preds, iters=200, step=0.05):
        """Nudge one threshold at a time by ``-step`` / ``+step``, keeping moves that raise QWK.

        A sweep over all five thresholds that finds no improvement halves
        ``step``; the search stops after ``iters`` sweeps or once ``step`` drops
        below 1e-4. The result is stored on ``self.thresholds`` and returned as
        ``(thresholds, best_qwk)``.
        """
        current = self.thresholds.copy()
        current_score = qwk(y_true, self._apply(preds))
        for _sweep in range(iters):
            moved = False
            for pos in range(5):
                for delta in (-step, step):
                    shift = np.zeros(5)
                    shift[pos] = delta
                    trial = np.sort(np.clip(current + shift, 0.5, 6.5))
                    # Candidates must stay strictly increasing and strictly inside (0.5, 6.5).
                    strictly_inside = (
                        0.5 < trial[0]
                        and bool(np.all(trial[:-1] < trial[1:]))
                        and trial[4] < 6.5
                    )
                    if not strictly_inside:
                        continue
                    trial_score = qwk(y_true, np.digitize(preds, trial) + 1)
                    if trial_score > current_score:
                        current_score, current, moved = trial_score, trial, True
            if moved:
                continue
            step *= 0.5
            if step < 1e-4:
                break
        self.thresholds = current
        return current, current_score

# Inference settings, intended to mirror the seed-3 training run.
# These are module-level names: running this cell rebinds MAX_LEN, STRIDE and BATCH_EVAL
# for every other cell that reads them (train_fold_seed3_local reads BATCH_EVAL, for example).
MODEL_NAME = 'microsoft/deberta-v3-base'
MAX_LEN = 512      # passed as the tokenizer's max_length per window
STRIDE = 128       # passed as the tokenizer's stride between overflowing windows
BATCH_EVAL = 32    # per-device eval batch size for test-time Trainer.predict
OUT_DIR = 'deberta_v3_base_win512_seed3_llrd_ema'  # searched for fold* subdirectories

# Dedicated tokenizer instance for this cell's dataset and collator.
tokenizer_seed3 = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

class WindowDatasetTestSeed3(Dataset):
    """Unlabelled sliding-window dataset: one item per tokenizer window.

    Texts are tokenized with the module-level ``tokenizer_seed3`` using
    ``MAX_LEN`` / ``STRIDE`` and ``return_overflowing_tokens=True``. Every window
    remembers the ``essay_id`` of the row it came from and its number of
    attended tokens.
    """

    def __init__(self, df, text_col='full_text'):
        raw_texts = df[text_col].astype(str).tolist()
        source_ids = df['essay_id'].tolist()
        encoded = tokenizer_seed3(
            raw_texts,
            max_length=MAX_LEN,
            truncation=True,
            padding=False,
            return_overflowing_tokens=True,
            stride=STRIDE,
            return_attention_mask=True,
        )
        # Maps each produced window back to the index of the text it was cut from.
        window_to_row = encoded.pop('overflow_to_sample_mapping')
        window_range = range(len(window_to_row))
        self.essay_ids = [source_ids[row] for row in window_to_row]
        self.input_ids = [encoded['input_ids'][w] for w in window_range]
        self.attn_masks = [encoded['attention_mask'][w] for w in window_range]
        # valid token counts per window
        self.lengths = [int(sum(mask)) for mask in self.attn_masks]

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, i):
        item = {}
        item['input_ids'] = torch.tensor(self.input_ids[i], dtype=torch.long)
        item['attention_mask'] = torch.tensor(self.attn_masks[i], dtype=torch.long)
        item['essay_id'] = self.essay_ids[i]
        item['length'] = self.lengths[i]
        return item

class PadCollatorTTSeed3:
    """Collator that drops the bookkeeping keys and then pads with ``DataCollatorWithPadding``."""

    # Keys the dataset attaches for aggregation; they are removed before padding.
    _EXTRA_KEYS = ('essay_id', 'length')

    def __init__(self):
        self.pad = DataCollatorWithPadding(tokenizer=tokenizer_seed3, pad_to_multiple_of=8)

    def __call__(self, features):
        # The feature dicts are modified in place.
        for feat in features:
            for key in self._EXTRA_KEYS:
                feat.pop(key, None)
        return self.pad(features)

class MSDMeanPoolRegressorSeed3(nn.Module):
    """Backbone + masked mean pooling + a linear head averaged over ``msd`` dropout passes.

    ``forward`` returns only ``{'logits': tensor of shape (batch, 1)}``; the
    ``labels`` argument is accepted but not used and no loss is computed.
    """

    def __init__(self, model_name, msd=5, p=0.2):
        super().__init__()
        self.backbone = AutoModel.from_pretrained(model_name)
        # Fall back to 768 when the backbone config exposes no hidden_size.
        hidden_size = getattr(self.backbone.config, 'hidden_size', 768)
        self.dropouts = nn.ModuleList([nn.Dropout(p) for _ in range(msd)])
        self.head = nn.Linear(int(hidden_size), 1)
        self.msd = msd

    def forward(self, input_ids=None, attention_mask=None, labels=None):
        hidden_states = self.backbone(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state
        # Mean over the sequence axis, counting only positions where the mask is 1.
        weights = attention_mask.unsqueeze(-1).float()
        pooled = (hidden_states * weights).sum(dim=1) / weights.sum(dim=1).clamp(min=1e-6)
        # One head pass per dropout module, accumulated in order and then averaged.
        total = sum(self.head(drop(pooled)).squeeze(-1) for drop in self.dropouts)
        averaged = total / float(self.msd)
        return {'logits': averaged.unsqueeze(-1)}

def load_best_subdir(folder):
    """Return the checkpoint directory inside ``folder`` that should be loaded for inference.

    Resolution order:
      1. If the highest-step ``checkpoint-<step>`` directory holds a
         ``trainer_state.json`` whose ``best_model_checkpoint`` names a checkpoint
         that still exists under ``folder``, return that one. With
         ``load_best_model_at_end=True`` the Trainer may keep both the best and
         the most recent checkpoint, and the most recent is not necessarily best.
      2. Otherwise return the highest-step ``checkpoint-<step>`` directory.
      3. If there is no such directory, return ``folder`` itself.

    Entries whose suffix after the last ``-`` is not an integer, or that are not
    directories, are ignored instead of raising ``ValueError``.
    """
    def _step_of(path):
        suffix = os.path.basename(os.path.normpath(path)).rsplit('-', 1)[-1]
        return int(suffix) if suffix.isdigit() else None

    cks = [p for p in glob.glob(os.path.join(folder, 'checkpoint-*'))
           if os.path.isdir(p) and _step_of(p) is not None]
    if not cks:
        return folder
    cks.sort(key=_step_of)  # numeric sort: checkpoint-1200 comes after checkpoint-800
    latest = cks[-1]

    best_ref = None
    try:
        with open(os.path.join(latest, 'trainer_state.json')) as fh:
            state = json.load(fh)
        if isinstance(state, dict):
            best_ref = state.get('best_model_checkpoint')
    except (OSError, ValueError):
        # Missing or unreadable state file: fall back to the latest checkpoint.
        best_ref = None

    if best_ref:
        # The stored path reflects the training-time working directory, so only its
        # final component is trusted and re-anchored under `folder`.
        candidate = os.path.join(folder, os.path.basename(os.path.normpath(str(best_ref))))
        if os.path.isdir(candidate):
            return candidate
    return latest

def predict_fold_seed3(folder, dtest):
    """Predict every window of ``dtest`` with one fold's checkpoint and aggregate per essay.

    Args:
        folder: Fold output directory; the checkpoint is picked by ``load_best_subdir``.
        dtest: Dataset exposing ``essay_ids`` and ``lengths`` (one entry per window).

    Returns:
        dict mapping essay id to the mean of its window predictions, weighted by
        ``dtest.lengths``.

    Raises:
        FileNotFoundError: if the checkpoint has no ``pytorch_model.bin``.
        ValueError: if the number of predictions differs from the number of windows.
    """
    best_dir = load_best_subdir(folder)
    sd_path = os.path.join(best_dir, 'pytorch_model.bin')
    # Fail before building the backbone when the weights file is absent.
    # NOTE(review): recent transformers releases may save `model.safetensors` instead of
    # `pytorch_model.bin` — confirm what the training run actually wrote.
    if not os.path.isfile(sd_path):
        raise FileNotFoundError(f'No pytorch_model.bin in {best_dir}')
    # Load model weights saved by Trainer
    model = MSDMeanPoolRegressorSeed3(MODEL_NAME, msd=5, p=0.2)
    model.load_state_dict(torch.load(sd_path, map_location='cpu'))
    args = TrainingArguments(output_dir=os.path.join(folder, 'tmp_infer'), per_device_eval_batch_size=BATCH_EVAL,
                             dataloader_num_workers=0, dataloader_pin_memory=True, report_to=[], fp16=True)
    trainer = Trainer(model=model, args=args, tokenizer=tokenizer_seed3, data_collator=PadCollatorTTSeed3())
    # reshape(-1) instead of squeeze(): squeeze() turns a single-window result into a
    # 0-d array, which cannot be iterated by the zip below.
    preds = np.asarray(trainer.predict(dtest).predictions).reshape(-1)
    ids = np.array(dtest.essay_ids)
    lens = np.array(dtest.lengths, dtype=float)
    if preds.shape[0] != ids.shape[0]:
        raise ValueError(f'Got {preds.shape[0]} predictions for {ids.shape[0]} windows')
    # Length-weighted mean of window predictions per essay.
    by_sum, by_w = defaultdict(float), defaultdict(float)
    for p, i, w in zip(preds, ids, lens):
        by_sum[i] += float(p) * float(w)
        by_w[i] += float(w)
    agg = {i: (by_sum[i] / max(by_w[i], 1e-6)) for i in by_sum.keys()}
    return agg

def run_test_inference_seed3():
    """Average per-essay test predictions over every ``fold*`` directory under ``OUT_DIR``.

    Builds the window dataset from ``test.csv`` once, calls ``predict_fold_seed3``
    for each fold directory, averages the per-essay results in ``test.csv`` row
    order, clips them to [0.5, 6.5], saves them to
    ``test_deberta_v3_base_win512_seed3_llrd_ema.npy`` and returns the array.
    """
    test_df = pd.read_csv('test.csv')
    dtest = WindowDatasetTestSeed3(test_df)
    fold_dirs = sorted(p for p in glob.glob(os.path.join(OUT_DIR, 'fold*')) if os.path.isdir(p))
    assert fold_dirs, f'No fold dirs found in {OUT_DIR}'

    t0 = time.time()
    per_fold_maps = []
    for fd in fold_dirs:
        t1 = time.time()
        per_fold_maps.append(predict_fold_seed3(fd, dtest))
        print(f'[seed3 inference] {fd} done in {(time.time()-t1)/60:.1f} min', flush=True)
        torch.cuda.empty_cache(); gc.collect()

    # One row per fold, columns in test.csv order.
    essay_order = test_df['essay_id'].tolist()
    fold_rows = [[fold_map[eid] for eid in essay_order] for fold_map in per_fold_maps]
    preds_mean = np.clip(np.mean(np.array(fold_rows), axis=0), 0.5, 6.5)
    np.save('test_deberta_v3_base_win512_seed3_llrd_ema.npy', preds_mean)
    print('[seed3 inference] Saved test_deberta_v3_base_win512_seed3_llrd_ema.npy', flush=True)
    return preds_mean

def optimize_blend_and_submit_seed3_tfidf():
    """Tune the seed-3 / TF-IDF blend on the available OOF rows and write ``submission.csv``.

    The seed-3 OOF file may be partial: rows of folds that were never run are
    stored as exactly 0.0. Those rows are excluded from the weight and threshold
    search. The best weight and thresholds are saved to
    ``blend_params_seed3_tfidf.json`` and applied to the test predictions, which
    are loaded from ``test_deberta_v3_base_win512_seed3_llrd_ema.npy`` when
    present and otherwise produced by ``run_test_inference_seed3()``.

    Raises:
        ValueError: if the OOF arrays and ``train.csv`` disagree in length.
        RuntimeError: if the seed-3 OOF file contains no predictions at all.
    """
    train_df = pd.read_csv('train.csv')
    y_true = train_df['score'].values.astype(int)
    # Load OOFs
    oof_seed3_raw = np.load('oof_deberta_v3_base_win512_seed3_llrd_ema.npy')
    oof_tfidf = np.load('oof_tfidf.npy')
    if not (len(oof_seed3_raw) == len(oof_tfidf) == len(y_true)):
        raise ValueError(f'OOF length mismatch: seed3={len(oof_seed3_raw)} tfidf={len(oof_tfidf)} train={len(y_true)}')
    # Handle partial OOF: optimize on entries with seed3 predictions present.
    # The mask must come from the raw array: after clipping to [0.5, 6.5] the 0.0
    # placeholders become 0.5 and a `!= 0.0` test would select every row.
    mask = oof_seed3_raw != 0.0
    if mask.sum() == 0:
        raise RuntimeError('No non-zero entries in oof_seed3; ensure at least one fold finished.')
    oof_seed3 = np.clip(oof_seed3_raw, 0.5, 6.5)
    oof_tfidf = np.clip(oof_tfidf, 0.5, 6.5)
    y_sub = y_true[mask]
    s3_sub = oof_seed3[mask]
    tf_sub = oof_tfidf[mask]
    print(f'[Blend seed3+tfidf] optimizing on {mask.sum()} / {len(mask)} train rows with seed3 OOF', flush=True)
    # Blend weight grid per expert advice
    best = (-1.0, 0.0, [1.5,2.5,3.5,4.5,5.5])
    for w in np.linspace(0.75, 0.92, 18):
        blend = w * s3_sub + (1.0 - w) * tf_sub
        opt = ThresholdOptimizer()
        th, q = opt.fit(y_sub, blend.copy(), iters=200, step=0.05)
        if q > best[0]:
            best = (q, float(w), th)
    best_q, best_w, best_th = best
    print(f'[Blend seed3+tfidf] Best OOF thresh-QWK={best_q:.5f} at w={best_w:.3f} thresholds={best_th}', flush=True)
    with open('blend_params_seed3_tfidf.json','w') as f:
        json.dump({'weight_seed3': best_w, 'thresholds': [float(x) for x in best_th], 'oof_qwk': float(best_q)}, f)
    # Test preds
    if os.path.exists('test_deberta_v3_base_win512_seed3_llrd_ema.npy'):
        test_seed3 = np.load('test_deberta_v3_base_win512_seed3_llrd_ema.npy')
    else:
        test_seed3 = run_test_inference_seed3()
    test_tfidf = np.load('test_tfidf.npy')
    test_blend = best_w * test_seed3 + (1.0 - best_w) * test_tfidf
    test_blend = np.clip(test_blend, 0.5, 6.5)
    # Bucket the blended scores into labels 1..6 with the fitted cut points.
    th = np.array(best_th, dtype=float)
    labels = np.digitize(test_blend, th) + 1
    labels = np.clip(labels, 1, 6).astype(int)
    sub = pd.DataFrame({'essay_id': pd.read_csv('test.csv')['essay_id'], 'score': labels})
    sub.to_csv('submission.csv', index=False)
    print('Saved submission.csv (blended seed3 + TF-IDF)', flush=True)

# Usage reminder printed when the cell is executed; neither function below is called here.
print('Inference+Blend (seed3+tfidf) cell ready. After training finishes, run:')
print(' - run_test_inference_seed3()')
print(' - optimize_blend_and_submit_seed3_tfidf()')

In [30]:
import os, glob, json, time, gc
from collections import defaultdict
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
from sklearn.metrics import cohen_kappa_score

# Banner for the cell log; the v3-base win512 source is only blended when both its
# OOF and test prediction files exist (see blend_existing_and_submit).
print('=== Blend existing OOFs (TF-IDF + DeBERTa base 1024 + optional v3-base win512 if test preds exist) ===', flush=True)

def qwk(y_true, y_pred_int):
    """Return Cohen's kappa with quadratic weights between true and predicted integer labels."""
    kappa = cohen_kappa_score(y_true, y_pred_int, weights='quadratic')
    return kappa

class ThresholdOptimizer:
    """Coordinate-wise search for five cut points that maximise QWK.

    Continuous predictions become labels 1..6 via
    ``np.digitize(preds, thresholds) + 1``.
    """

    def __init__(self, init_thresholds=None):
        # Without an explicit start, use the midpoints between adjacent integer scores.
        start = [1.5,2.5,3.5,4.5,5.5] if init_thresholds is None else init_thresholds
        self.thresholds = np.array(start, dtype=float)

    def _apply(self, preds):
        """Bucket ``preds`` with the thresholds currently stored on the instance."""
        buckets = np.digitize(preds, self.thresholds)
        return buckets + 1

    def fit(self, y_true, preds, iters=200, step=0.05):
        """Nudge one threshold at a time by ``-step`` / ``+step``, keeping moves that raise QWK.

        A sweep over all five thresholds that finds no improvement halves
        ``step``; the search stops after ``iters`` sweeps or once ``step`` drops
        below 1e-4. The result is stored on ``self.thresholds`` and returned as
        ``(thresholds, best_qwk)``.
        """
        current = self.thresholds.copy()
        current_score = qwk(y_true, self._apply(preds))
        for _sweep in range(iters):
            moved = False
            for pos in range(5):
                for delta in (-step, step):
                    shift = np.zeros(5)
                    shift[pos] = delta
                    trial = np.sort(np.clip(current + shift, 0.5, 6.5))
                    # Candidates must stay strictly increasing and strictly inside (0.5, 6.5).
                    strictly_inside = (
                        0.5 < trial[0]
                        and bool(np.all(trial[:-1] < trial[1:]))
                        and trial[4] < 6.5
                    )
                    if not strictly_inside:
                        continue
                    trial_score = qwk(y_true, np.digitize(preds, trial) + 1)
                    if trial_score > current_score:
                        current_score, current, moved = trial_score, trial, True
            if moved:
                continue
            step *= 0.5
            if step < 1e-4:
                break
        self.thresholds = current
        return current, current_score

# ---------- Inference helpers for DeBERTa-base 1024 head+tail ----------
MODEL_1024 = 'microsoft/deberta-v3-base'  # tokenizer source and fallback architecture for checkpoint loading
MAX_LEN_1024 = 1024                       # maximum encoded length, special tokens included
HEAD_FRAC = 0.88                          # share of the kept content tokens taken from the start of an over-long essay
BATCH_EVAL_1024 = 16                      # per-device eval batch size for Trainer.predict
OUT_DIR_1024 = 'deberta_base_1024'        # searched for fold* subdirectories
# NOTE(review): these presumably have to match the values used when the 1024 model was trained — confirm.
tok_1024 = AutoTokenizer.from_pretrained(MODEL_1024, use_fast=True)

def encode_head_tail(text, tokenizer=None, max_len=None, head_frac=None):
    """Encode ``text`` to at most ``max_len`` ids, keeping the head and tail of over-long inputs.

    Short inputs become ``[CLS] ids [SEP]``. Inputs that do not fit become
    ``[CLS] head [SEP] tail [SEP]`` with exactly ``max_len`` ids, where ``head``
    holds ``int(head_frac * (max_len - 3))`` tokens from the start and ``tail``
    the remaining budget from the end.

    Args:
        text: Raw essay text.
        tokenizer: Callable tokenizer exposing ``cls_token_id`` / ``sep_token_id``;
            defaults to the module-level ``tok_1024``.
        max_len: Maximum output length including special tokens; defaults to
            ``MAX_LEN_1024``.
        head_frac: Fraction in [0, 1] of the content budget given to the head;
            defaults to ``HEAD_FRAC``.

    Returns:
        dict with ``input_ids`` and an all-ones ``attention_mask`` of equal length.

    Raises:
        ValueError: if ``max_len`` is below 3 or ``head_frac`` is outside [0, 1].
    """
    tok = tok_1024 if tokenizer is None else tokenizer
    limit = MAX_LEN_1024 if max_len is None else int(max_len)
    frac = HEAD_FRAC if head_frac is None else float(head_frac)
    if limit < 3:
        raise ValueError(f'max_len must be >= 3 to hold the special tokens, got {limit}')
    if not 0.0 <= frac <= 1.0:
        raise ValueError(f'head_frac must be within [0, 1], got {frac}')

    ids = tok(text, add_special_tokens=False)['input_ids']
    keep_total = limit - 3  # content budget once [CLS] and two [SEP] are reserved
    if len(ids) <= limit - 2:
        # Fits as a single segment: only [CLS] and one [SEP] are needed.
        out = [tok.cls_token_id] + ids + [tok.sep_token_id]
    else:
        keep_head = int(frac * keep_total)
        keep_tail = keep_total - keep_head
        head = ids[:keep_head]
        # Guard the zero case: ids[-0:] would return the whole list.
        tail = ids[-keep_tail:] if keep_tail > 0 else []
        out = [tok.cls_token_id] + head + [tok.sep_token_id] + tail + [tok.sep_token_id]
    attn = [1]*len(out)
    return {'input_ids': out, 'attention_mask': attn}

class HeadTailTestDS(Dataset):
    """Unlabelled dataset with one ``encode_head_tail`` encoding per row of ``df``."""

    def __init__(self, df, text_col='full_text'):
        self.ids = df['essay_id'].tolist()
        # All texts are encoded up front; __getitem__ only converts to tensors.
        texts = df[text_col].astype(str).tolist()
        self.encs = [encode_head_tail(text) for text in texts]

    def __len__(self):
        return len(self.ids)

    def __getitem__(self, i):
        encoding = self.encs[i]
        item = {}
        item['input_ids'] = torch.tensor(encoding['input_ids'], dtype=torch.long)
        item['attention_mask'] = torch.tensor(encoding['attention_mask'], dtype=torch.long)
        item['essay_id'] = self.ids[i]
        return item

class PadCollator1024:
    """Collator that removes ``essay_id`` from every feature and pads with ``tok_1024``."""

    def __init__(self):
        self.pad = DataCollatorWithPadding(tokenizer=tok_1024, pad_to_multiple_of=8)

    def __call__(self, features):
        # The feature dicts are modified in place before padding.
        for feat in features:
            feat.pop('essay_id', None)
        return self.pad(features)

def load_best_subdir(path_dir):
    """Return the checkpoint directory inside ``path_dir`` that should be loaded for inference.

    Resolution order:
      1. If the highest-step ``checkpoint-<step>`` directory holds a
         ``trainer_state.json`` whose ``best_model_checkpoint`` names a checkpoint
         that still exists under ``path_dir``, return that one. With
         ``load_best_model_at_end=True`` the Trainer may keep both the best and
         the most recent checkpoint, and the most recent is not necessarily best.
      2. Otherwise return the highest-step ``checkpoint-<step>`` directory.
      3. If there is no such directory, return ``path_dir`` itself.

    Entries whose suffix after the last ``-`` is not an integer, or that are not
    directories, are ignored instead of raising ``ValueError``.
    """
    def _step_of(path):
        suffix = os.path.basename(os.path.normpath(path)).rsplit('-', 1)[-1]
        return int(suffix) if suffix.isdigit() else None

    cks = [p for p in glob.glob(os.path.join(path_dir, 'checkpoint-*'))
           if os.path.isdir(p) and _step_of(p) is not None]
    if not cks:
        return path_dir
    cks.sort(key=_step_of)  # numeric sort: checkpoint-1200 comes after checkpoint-800
    latest = cks[-1]

    best_ref = None
    try:
        with open(os.path.join(latest, 'trainer_state.json')) as fh:
            state = json.load(fh)
        if isinstance(state, dict):
            best_ref = state.get('best_model_checkpoint')
    except (OSError, ValueError):
        # Missing or unreadable state file: fall back to the latest checkpoint.
        best_ref = None

    if best_ref:
        # The stored path reflects the training-time working directory, so only its
        # final component is trusted and re-anchored under `path_dir`.
        candidate = os.path.join(path_dir, os.path.basename(os.path.normpath(str(best_ref))))
        if os.path.isdir(candidate):
            return candidate
    return latest

def infer_test_deberta_base_1024():
    """Average test predictions of every ``fold*`` checkpoint under ``OUT_DIR_1024``.

    Each fold's model is loaded with ``from_pretrained`` on the chosen checkpoint
    directory; if that raises, a fresh single-output regression model is built from
    ``MODEL_1024`` and the checkpoint's ``pytorch_model.bin`` is loaded into it.
    The fold predictions are averaged, clipped to [0.5, 6.5], saved to
    ``test_deberta_base_1024.npy`` and returned.
    """
    test_df = pd.read_csv('test.csv')
    dtest = HeadTailTestDS(test_df)
    fold_dirs = sorted(p for p in glob.glob(os.path.join(OUT_DIR_1024, 'fold*')) if os.path.isdir(p))
    assert fold_dirs, f'No fold dirs found in {OUT_DIR_1024}'

    per_fold = []
    for fd in fold_dirs:
        ckpt_dir = load_best_subdir(fd)
        try:
            model = AutoModelForSequenceClassification.from_pretrained(ckpt_dir)
        except Exception:
            # Fallback: rebuild the architecture and load the raw state dict.
            model = AutoModelForSequenceClassification.from_pretrained(MODEL_1024, num_labels=1, problem_type='regression')
            state_dict = torch.load(os.path.join(ckpt_dir, 'pytorch_model.bin'), map_location='cpu')
            model.load_state_dict(state_dict)
        infer_args = TrainingArguments(
            output_dir=os.path.join(fd, 'tmp_infer'),
            per_device_eval_batch_size=BATCH_EVAL_1024,
            dataloader_num_workers=0,
            dataloader_pin_memory=True,
            report_to=[],
            fp16=True,
        )
        trainer = Trainer(model=model, args=infer_args, tokenizer=tok_1024, data_collator=PadCollator1024())
        fold_preds = trainer.predict(dtest).predictions.squeeze()
        per_fold.append(fold_preds.astype(float))
        torch.cuda.empty_cache(); gc.collect()

    preds_mean = np.clip(np.mean(np.vstack(per_fold), axis=0), 0.5, 6.5)
    np.save('test_deberta_base_1024.npy', preds_mean)
    print('[1024 inference] Saved test_deberta_base_1024.npy', flush=True)
    return preds_mean

# ---------- Blending existing OOFs and producing submission ----------
def blend_existing_and_submit():
    """Blend the OOF sources that are available, fit thresholds and write ``submission.csv``.

    Sources, in fixed order: TF-IDF (always), DeBERTa-base 1024 (when its OOF file
    exists) and v3-base win512 (only when both its OOF and test files exist).
    Weights are grid-searched on OOF QWK with thresholds re-fitted per candidate;
    the winner is saved to ``blend_existing_params.json`` and applied to the
    matching test predictions.

    Test predictions are loaded only for sources that take part in the blend, so
    1024 inference is never triggered when ``d1024`` is not a source.

    Raises:
        AssertionError: if fewer than two OOF sources are available.
        RuntimeError: if the grid search yields no weights, or the number of
            weights does not match the number of test sources.
    """
    train_df = pd.read_csv('train.csv')
    y_true = train_df['score'].values.astype(int)
    # Load available OOFs
    oof_tfidf = np.load('oof_tfidf.npy')
    oof_1024 = np.load('oof_deberta_base_1024.npy') if os.path.exists('oof_deberta_base_1024.npy') else None
    # Only include v3 win512 if BOTH OOF and TEST preds exist to keep sources consistent
    include_v3 = os.path.exists('oof_deberta_v3_base_win512.npy') and os.path.exists('test_deberta_v3_base_win512.npy')
    oof_win512 = np.load('oof_deberta_v3_base_win512.npy') if include_v3 else None
    mats = []
    names = []
    mats.append(np.clip(oof_tfidf, 0.5, 6.5)); names.append('tfidf')
    if oof_1024 is not None:
        mats.append(np.clip(oof_1024, 0.5, 6.5)); names.append('d1024')
    if include_v3 and oof_win512 is not None:
        mats.append(np.clip(oof_win512, 0.5, 6.5)); names.append('v3w512')
    mats = [m.astype(float) for m in mats]
    k = len(mats)
    assert k >= 2, 'Need at least two OOF sources to blend'
    print('[Blend] sources:', names, flush=True)
    # Grid search weights
    best = (-1.0, None, [1.5,2.5,3.5,4.5,5.5])
    if k == 2:
        A, B = mats[0], mats[1]
        for w in np.linspace(0.6, 0.95, 36):
            blend = w*A + (1.0-w)*B
            opt = ThresholdOptimizer()
            th, q = opt.fit(y_true, blend.copy(), iters=300, step=0.05)
            if q > best[0]: best = (q, (float(w), 1.0-float(w)), th)
    else:
        A, B, C = mats[:3]
        grid = np.linspace(0.1, 0.9, 41)
        # Every 4th grid point for the first two weights; the third is whatever remains.
        for w1 in grid[::4]:
            for w2 in grid[::4]:
                w3 = 1.0 - w1 - w2
                if w3 <= 0 or w3 >= 0.9: continue
                blend = w1*A + w2*B + w3*C
                opt = ThresholdOptimizer()
                th, q = opt.fit(y_true, blend.copy(), iters=300, step=0.05)
                if q > best[0]: best = (q, (float(w1), float(w2), float(w3)), th)
    best_q, best_w, best_th = best
    if best_w is None:
        raise RuntimeError('Blend grid search produced no candidate weights')
    print(f"[Blend] Best OOF thresh-QWK={best_q:.5f} weights={best_w} thresholds={best_th}", flush=True)
    with open('blend_existing_params.json','w') as f:
        json.dump({'weights': best_w, 'thresholds': [float(x) for x in best_th], 'oof_qwk': float(best_q), 'sources': names}, f)

    # Test preds for each source, in the same order as `names` (and therefore `best_w`).
    test_preds = []
    for name in names:
        if name == 'tfidf':
            te = np.load('test_tfidf.npy')
        elif name == 'd1024':
            # Cached predictions if present, otherwise run the 1024 inference now.
            te = np.load('test_deberta_base_1024.npy') if os.path.exists('test_deberta_base_1024.npy') else infer_test_deberta_base_1024()
        else:
            te = np.load('test_deberta_v3_base_win512.npy')
        test_preds.append(np.clip(te, 0.5, 6.5).astype(float))
    if len(test_preds) != len(best_w):
        raise RuntimeError(f'{len(best_w)} blend weights but {len(test_preds)} test sources')

    # Combine with best weights
    test_blend = sum(w * p for w, p in zip(best_w, test_preds))
    test_blend = np.clip(test_blend, 0.5, 6.5)
    th = np.array(best_th, dtype=float)
    labels = np.digitize(test_blend, th) + 1
    labels = np.clip(labels, 1, 6).astype(int)
    sub = pd.DataFrame({'essay_id': pd.read_csv('test.csv')['essay_id'], 'score': labels})
    sub.to_csv('submission.csv', index=False)
    print('Saved submission.csv (blend existing models)', flush=True)

# Usage reminder printed when the cell is executed; blend_existing_and_submit() is not called here.
print('Blend cell ready. After training/inference, run: blend_existing_and_submit()', flush=True)

=== Blend existing OOFs (TF-IDF + DeBERTa base 1024 + optional v3-base win512 if test preds exist) ===




Blend cell ready. After training/inference, run: blend_existing_and_submit()


In [39]:
# Runs the blend defined in the cell above; it writes blend_existing_params.json and
# overwrites submission.csv.
print('=== Running blend_existing_and_submit() ===', flush=True)
blend_existing_and_submit()

=== Running blend_existing_and_submit() ===


[Blend] sources: ['tfidf', 'd1024', 'v3w512']


[Blend] Best OOF thresh-QWK=0.80996 weights=(0.5, 0.33999999999999997, 0.16000000000000003) thresholds=[1.84960938 2.68085937 3.4953125  4.21953125 4.9375    ]


Saved submission.csv (blend existing models)


In [38]:
# run_test_inference() is defined in a cell outside this section.
# NOTE(review): this cell's execution count (38) is lower than the blend cell's (39) even though
# it appears after it, so it presumably ran first — confirm the intended order for Run All.
print('=== Generating test_deberta_v3_base_win512.npy via run_test_inference() ===', flush=True)
run_test_inference()
print('Done. You can re-run blend (cell 20) to refresh submission with proper v3 win512 test preds.', flush=True)

=== Generating test_deberta_v3_base_win512.npy via run_test_inference() ===




Predicted deberta_v3_base_win512/fold0 in 4.9 min


Predicted deberta_v3_base_win512/fold1 in 4.9 min


Predicted deberta_v3_base_win512/fold2 in 4.9 min


Predicted deberta_v3_base_win512/fold3 in 4.9 min


Predicted deberta_v3_base_win512/fold4 in 4.9 min


Saved submission.csv and test_deberta_v3_base_win512.npy
=== Test inference done in 24.6 min ===


Done. You can re-run blend (cell 20) to refresh submission with proper v3 win512 test preds.


In [40]:
import time, json, numpy as np, pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.metrics import cohen_kappa_score

# Banner for the cell log; k=16 matches the default `k` of build_clusters below.
print('=== Cluster-wise thresholds for blended preds (k=16 on train TF-IDF->SVD) ===', flush=True)

def qwk(y_true, y_pred_int):
    """Return Cohen's kappa with quadratic weights between true and predicted integer labels."""
    kappa = cohen_kappa_score(y_true, y_pred_int, weights='quadratic')
    return kappa

class ThresholdOptimizer:
    """Coordinate-wise search for five cut points that maximise QWK.

    Unlike ``_apply``, ``fit`` clips the predictions to [0.5, 6.5] before they
    are bucketed with ``np.digitize(..., thresholds) + 1``.
    """

    def __init__(self, init_thresholds=None):
        # Without an explicit start, use the midpoints between adjacent integer scores.
        start = [1.5,2.5,3.5,4.5,5.5] if init_thresholds is None else init_thresholds
        self.thresholds = np.array(start, dtype=float)

    def _apply(self, preds):
        """Bucket ``preds`` (as given, without clipping) with the stored thresholds."""
        buckets = np.digitize(preds, self.thresholds)
        return buckets + 1

    def fit(self, y_true, preds, iters=200, step=0.05):
        """Nudge one threshold at a time by ``-step`` / ``+step``, keeping moves that raise QWK.

        A sweep over all five thresholds that finds no improvement halves
        ``step``; the search stops after ``iters`` sweeps or once ``step`` drops
        below 1e-4. The result is stored on ``self.thresholds`` and returned as
        ``(thresholds, best_qwk)``.
        """
        # The clipped predictions never change during the search, so compute them once.
        bounded = np.clip(preds, 0.5, 6.5)
        current = self.thresholds.copy()
        current_score = qwk(y_true, self._apply(bounded))
        for _sweep in range(iters):
            moved = False
            for pos in range(5):
                for delta in (-step, step):
                    shift = np.zeros(5)
                    shift[pos] = delta
                    trial = np.sort(np.clip(current + shift, 0.5, 6.5))
                    # Candidates must stay strictly increasing and strictly inside (0.5, 6.5).
                    strictly_inside = (
                        0.5 < trial[0]
                        and bool(np.all(trial[:-1] < trial[1:]))
                        and trial[4] < 6.5
                    )
                    if not strictly_inside:
                        continue
                    trial_score = qwk(y_true, np.digitize(bounded, trial) + 1)
                    if trial_score > current_score:
                        current_score, current, moved = trial_score, trial, True
            if moved:
                continue
            step *= 0.5
            if step < 1e-4:
                break
        self.thresholds = current
        return current, current_score

def build_clusters(train_texts, k=16):
    """Fit TF-IDF -> 100-component SVD -> KMeans on the training texts.

    Returns ``(tfidf, svd, kmeans, cl_train)``: the three fitted transformers and
    the integer cluster id of every training text.
    """
    vectorizer = TfidfVectorizer(
        ngram_range=(1,2),
        min_df=5,
        max_features=120000,
        sublinear_tf=True,
    )
    sparse_features = vectorizer.fit_transform(train_texts.astype(str))

    reducer = TruncatedSVD(n_components=100, random_state=42)
    dense_features = reducer.fit_transform(sparse_features)

    clusterer = KMeans(n_clusters=k, random_state=42, n_init=10)
    train_clusters = clusterer.fit_predict(dense_features).astype(int)
    return vectorizer, reducer, clusterer, train_clusters

def assign_clusters(tfidf, svd, kmeans, texts):
    """Project ``texts`` through the fitted TF-IDF and SVD and return integer KMeans cluster ids."""
    reduced = svd.transform(tfidf.transform(texts.astype(str)))
    cluster_ids = kmeans.predict(reduced)
    return cluster_ids.astype(int)

def blended_preds_and_weights(y_true, mats):
    """Grid-search blend weights on OOF predictions, re-fitting QWK thresholds per candidate.

    ``mats`` is a list of 1-D OOF arrays in fixed order [tfidf, d1024, v3w512?].
    With exactly two sources, 36 weights in [0.6, 0.95] are tried for the first
    one; otherwise the first three sources are blended over a coarse grid of the
    first two weights, the third being the remainder.

    Returns ``(oof_blend, weights, thresholds, qwk)`` where ``oof_blend`` is the
    best blend clipped to [0.5, 6.5].
    """
    mats = [np.clip(m.astype(float), 0.5, 6.5) for m in mats]
    k = len(mats)
    top_q, top_w, top_th = -1.0, None, np.array([1.5,2.5,3.5,4.5,5.5], dtype=float)

    def _fit_thresholds(candidate):
        # Fresh optimizer per candidate so every fit starts from the default cut points.
        return ThresholdOptimizer().fit(y_true, candidate.copy(), iters=300, step=0.05)

    if k == 2:
        A, B = mats
        for w in np.linspace(0.6, 0.95, 36):
            th, q = _fit_thresholds(w*A + (1.0-w)*B)
            if q > top_q:
                top_q, top_w, top_th = q, (float(w), 1.0-float(w)), th
        oof_blend = top_w[0]*A + top_w[1]*B
    else:
        A, B, C = mats[:3]
        coarse = np.linspace(0.1, 0.9, 41)[::4]
        for w1 in coarse:
            for w2 in coarse:
                w3 = 1.0 - w1 - w2
                if not (0 < w3 < 0.9):
                    continue
                th, q = _fit_thresholds(w1*A + w2*B + w3*C)
                if q > top_q:
                    top_q, top_w, top_th = q, (float(w1), float(w2), float(w3)), th
        oof_blend = top_w[0]*A + top_w[1]*B + top_w[2]*C
    return np.clip(oof_blend, 0.5, 6.5), top_w, np.array(top_th, dtype=float), float(top_q)

def run_cluster_threshold_blend():
    """Blend saved OOF/test predictions and discretise them with per-cluster thresholds.

    Steps:
      1. Load ``train.csv`` / ``test.csv`` and the saved prediction arrays
         (TF-IDF always; each DeBERTa source only when both its OOF and test
         files exist).
      2. Search global blend weights and thresholds on the OOF predictions.
      3. Cluster essays (fitted on train only) and refine the thresholds per
         cluster; clusters with fewer than 50 training essays keep the global
         thresholds.
      4. Print OOF QWK (per-cluster vs global), then write ``submission.csv``
         and ``blend_clusterwise_params.json``.

    Raises:
        AssertionError: if fewer than two prediction sources are available.
    """
    t0 = time.time()
    train_df = pd.read_csv('train.csv')
    test_df = pd.read_csv('test.csv')
    y_true = train_df['score'].values.astype(int)
    # Load OOF/test sources
    oof_tfidf = np.load('oof_tfidf.npy')
    te_tfidf = np.load('test_tfidf.npy')
    mats_oof = [oof_tfidf]
    mats_te = [te_tfidf]
    if os.path.exists('oof_deberta_base_1024.npy') and os.path.exists('test_deberta_base_1024.npy'):
        mats_oof.append(np.load('oof_deberta_base_1024.npy'))
        mats_te.append(np.load('test_deberta_base_1024.npy'))
    if os.path.exists('oof_deberta_v3_base_win512.npy') and os.path.exists('test_deberta_v3_base_win512.npy'):
        mats_oof.append(np.load('oof_deberta_v3_base_win512.npy'))
        mats_te.append(np.load('test_deberta_v3_base_win512.npy'))
    assert len(mats_oof) >= 2, 'Need at least two sources'
    # Optimize global weights on OOF
    oof_blend, weights, th_global, q_global = blended_preds_and_weights(y_true, mats_oof)
    print(f'[ClusterBlend] Global best OOF thresh-QWK={q_global:.5f}, weights={weights}, thr={th_global}', flush=True)
    # Build clusters on train only
    tfidf, svd, kmeans, cl_train = build_clusters(train_df['full_text'])
    cl_test = assign_clusters(tfidf, svd, kmeans, test_df['full_text'])
    # Fit per-cluster thresholds on blended OOF
    th_per = {}
    oof_lbls_cluster = np.zeros_like(y_true)
    for c in range(kmeans.n_clusters):
        idx = np.where(cl_train == c)[0]
        if len(idx) < 50:
            # Too few essays to fit thresholds reliably; reuse the global ones.
            th_per[c] = th_global.copy()
            continue
        opt = ThresholdOptimizer(th_global.copy())
        th_c, q_c = opt.fit(y_true[idx], oof_blend[idx], iters=200, step=0.05)
        th_per[c] = th_c
    # Evaluate OOF with per-cluster thresholds (sanity)
    for c in range(kmeans.n_clusters):
        idx = np.where(cl_train == c)[0]
        if len(idx) == 0: continue
        th = th_per[c]
        oof_lbls_cluster[idx] = np.digitize(oof_blend[idx], th) + 1
    oof_lbls_cluster = np.clip(oof_lbls_cluster, 1, 6).astype(int)
    q_cluster = qwk(y_true, oof_lbls_cluster)
    print(f'[ClusterBlend] OOF QWK with per-cluster thresholds={q_cluster:.5f} (vs global {q_global:.5f})', flush=True)
    # Build blended test preds using same weights
    mats_te = [np.clip(m.astype(float), 0.5, 6.5) for m in mats_te]
    if len(weights) == 2:
        te_blend = weights[0]*mats_te[0] + weights[1]*mats_te[1]
    else:
        te_blend = weights[0]*mats_te[0] + weights[1]*mats_te[1] + weights[2]*mats_te[2]
    te_blend = np.clip(te_blend, 0.5, 6.5)
    # Apply per-cluster thresholds to test
    labels = np.zeros(len(test_df), dtype=int)
    for c in range(kmeans.n_clusters):
        idx = np.where(cl_test == c)[0]
        if len(idx) == 0: continue
        th = th_per.get(c, th_global)
        labels[idx] = np.digitize(te_blend[idx], th) + 1
    labels = np.clip(labels, 1, 6).astype(int)
    sub = pd.DataFrame({'essay_id': test_df['essay_id'], 'score': labels})
    sub.to_csv('submission.csv', index=False)
    # The previous version opened 'blend_existing_params.json' here *before*
    # checking that it exists (FileNotFoundError when absent) and never used
    # the loaded value, so that read has been removed.
    meta = {
        'weights': weights,
        'global_thresholds': th_global.tolist(),
        # Persist the thresholds that actually produced the submission
        # (JSON object keys must be strings).
        'cluster_thresholds': {str(c): np.asarray(th, dtype=float).tolist() for c, th in th_per.items()},
        'cluster_qwk': float(q_cluster),
        'global_qwk': float(q_global),
    }
    with open('blend_clusterwise_params.json','w') as f:
        json.dump(meta, f)
    print('Saved submission.csv (cluster-wise thresholds) and blend_clusterwise_params.json', flush=True)
    print(f'=== Cluster-wise blend done in {(time.time()-t0)/60:.1f} min ===', flush=True)

# Execute the cluster-wise blend; this writes submission.csv in the working directory.
run_cluster_threshold_blend()

=== Cluster-wise thresholds for blended preds (k=16 on train TF-IDF->SVD) ===


[ClusterBlend] Global best OOF thresh-QWK=0.80996, weights=(0.5, 0.33999999999999997, 0.16000000000000003), thr=[1.84960938 2.68085937 3.4953125  4.21953125 4.9375    ]


In [41]:
import time, json, numpy as np, pandas as pd
from sklearn.metrics import cohen_kappa_score

# Banner marking the start of this cell's log output (flushed immediately).
print('=== Length-bin threshold calibration for blended preds ===', flush=True)

def qwk(y_true, y_pred_int):
    """Return Cohen's kappa with quadratic weights between true and predicted integer labels."""
    kappa = cohen_kappa_score(y_true, y_pred_int, weights='quadratic')
    return kappa

class ThresholdOptimizer:
    """Greedy coordinate search over the five cut points that turn continuous
    predictions into integer labels 1..6, scored with ``qwk``.
    """

    def __init__(self, init_thresholds=None):
        # Default cut points sit halfway between consecutive integer scores.
        if init_thresholds is None:
            init_thresholds = [1.5, 2.5, 3.5, 4.5, 5.5]
        self.thresholds = np.array(init_thresholds, dtype=float)

    def _apply(self, preds):
        """Label each prediction with its threshold-interval index, shifted to start at 1."""
        return 1 + np.digitize(preds, self.thresholds)

    def fit(self, y_true, preds, iters=200, step=0.05):
        """Search thresholds that maximise QWK on ``(y_true, preds)``.

        Each pass tries moving every threshold by ``-step`` and ``+step`` and
        keeps any move that strictly improves the score. A pass without
        improvement halves ``step``; the search ends after ``iters`` passes or
        once ``step`` drops below 1e-4.

        Returns:
            ``(thresholds, score)``; the thresholds are also stored on
            ``self.thresholds``.
        """
        scores = np.clip(preds, 0.5, 6.5)
        best = self.thresholds.copy()
        best_score = qwk(y_true, self._apply(scores))
        unit = np.eye(5)  # row k is 1.0 in slot k and 0.0 elsewhere
        for _ in range(iters):
            improved = False
            for slot in range(5):
                for delta in (-step, step):
                    cand = np.sort(np.clip(best + unit[slot] * delta, 0.5, 6.5))
                    # Candidates must stay strictly increasing and strictly inside (0.5, 6.5).
                    inside = cand[0] > 0.5 and cand[4] < 6.5
                    if not (inside and np.all(cand[:-1] < cand[1:])):
                        continue
                    trial = qwk(y_true, np.digitize(scores, cand) + 1)
                    if trial > best_score:
                        best_score = trial
                        best = cand
                        improved = True
            if improved:
                continue
            step *= 0.5
            if step < 1e-4:
                break
        self.thresholds = best
        return best, best_score

def load_blend_weights_and_sources():
    """Read blend weights and source names from ``blend_existing_params.json``.

    Returns:
        ``(weights, sources)``. ``weights`` is converted to a tuple when the
        JSON stores a list. ``sources`` is the stored list; when the file has
        no ``sources`` entry it falls back to the leading
        ``['tfidf', 'd1024', 'v3w512']`` names, one per weight.

    Raises:
        FileNotFoundError: if the params file is missing.
        KeyError: if the file has no ``weights`` entry.
    """
    with open('blend_existing_params.json', 'r') as f:
        meta = json.load(f)
    weights = meta['weights']
    if isinstance(weights, list):
        weights = tuple(weights)
    sources = meta.get('sources')
    if sources is None:
        # Previously the fallback was always three names, so a two-weight blend
        # forced the loader to read (and require on disk) a third model's
        # files that the blend never uses. Match the fallback to the weights.
        sources = ['tfidf', 'd1024', 'v3w512'][:len(weights)]
    return weights, sources

def load_oof_and_test_for_sources(sources):
    """Load the saved OOF and test prediction arrays for each named source.

    Args:
        sources: iterable of source names; recognised names are ``'tfidf'``,
            ``'d1024'`` and ``'v3w512'``.

    Returns:
        ``(oofs, tests)`` -- two lists in the order of ``sources``; every array
        is cast to float and clipped to [0.5, 6.5].

    Raises:
        ValueError: on an unrecognised source name.
    """
    known = (
        ('tfidf', 'oof_tfidf.npy', 'test_tfidf.npy'),
        ('d1024', 'oof_deberta_base_1024.npy', 'test_deberta_base_1024.npy'),
        ('v3w512', 'oof_deberta_v3_base_win512.npy', 'test_deberta_v3_base_win512.npy'),
    )

    def _bounded(arr):
        return np.clip(arr.astype(float), 0.5, 6.5)

    raw_oof, raw_test = [], []
    for s in sources:
        paths = next(((oof_p, test_p) for name, oof_p, test_p in known if s == name), None)
        if paths is None:
            raise ValueError(f'Unknown source {s}')
        raw_oof.append(np.load(paths[0]))
        raw_test.append(np.load(paths[1]))
    return [_bounded(a) for a in raw_oof], [_bounded(a) for a in raw_test]

def blend_with_weights(arrs, weights):
    """Weighted sum of the first ``len(weights)`` arrays, clipped to [0.5, 6.5].

    Args:
        arrs: list of prediction arrays (each cast to float).
        weights: sequence of 2 or 3 weights.

    Raises:
        ValueError: if ``weights`` does not have length 2 or 3.
    """
    as_float = [a.astype(float) for a in arrs]
    n_weights = len(weights)
    if n_weights not in (2, 3):
        raise ValueError('weights must be len 2 or 3')
    # Accumulate left to right so the floating-point sum order stays w1*a1 + w2*a2 (+ w3*a3).
    total = weights[0] * as_float[0]
    for i in range(1, n_weights):
        total = total + weights[i] * as_float[i]
    return np.clip(total, 0.5, 6.5)

def run_length_bin_thresholds(n_bins=6):
    """Calibrate score thresholds separately per essay-length bin and write a submission.

    The blend weights/sources come from ``blend_existing_params.json``. Global
    thresholds are fitted on the blended OOF predictions first; then training
    essays are split into character-length quantile bins and thresholds are
    re-fitted inside every bin that has at least 100 essays (smaller bins reuse
    the global thresholds). The per-bin thresholds are applied to the test set
    and the result is written to ``submission.csv``; the parameters go to
    ``blend_lenbin_params.json``.

    Args:
        n_bins: requested number of quantile bins (duplicate quantile edges
            are merged, so fewer bins may result).
    """
    t0 = time.time()
    train_df = pd.read_csv('train.csv')
    test_df = pd.read_csv('test.csv')
    y_true = train_df['score'].values.astype(int)
    # Use character length bins (robust, quick)
    len_tr = train_df['full_text'].astype(str).str.len().values.astype(float)
    len_te = test_df['full_text'].astype(str).str.len().values.astype(float)
    # Load blend config and preds
    weights, sources = load_blend_weights_and_sources()
    oofs, tests = load_oof_and_test_for_sources(sources)
    oof_blend = blend_with_weights(oofs, weights)
    te_blend = blend_with_weights(tests, weights)
    # Global thresholds as fallback
    opt_g = ThresholdOptimizer()
    th_global, q_global = opt_g.fit(y_true, oof_blend.copy(), iters=300, step=0.05)
    # Build bins on train only
    qs = np.linspace(0, 1, n_bins+1)
    edges = np.unique(np.quantile(len_tr, qs))
    if len(edges) <= 2:
        edges = np.unique(np.quantile(len_tr, [0, 0.33, 0.66, 1.0]))
    # Fit per-bin thresholds
    th_per = []  # list of (lo, hi, thresholds)
    # Start from global-threshold labels so a row that matches no bin still
    # receives a prediction-based label.
    oof_labels = np.digitize(oof_blend, th_global) + 1
    for i in range(len(edges)-1):
        # The last bin's upper edge is nudged up so the longest training essay is included.
        lo, hi = edges[i], edges[i+1] + (1e-6 if i == len(edges)-2 else 0.0)
        idx = np.where((len_tr >= lo) & (len_tr < hi))[0]
        if len(idx) < 100:
            th = th_global.copy()
        else:
            opt = ThresholdOptimizer(th_global.copy())
            th, _ = opt.fit(y_true[idx], oof_blend[idx], iters=200, step=0.05)
        th_per.append((lo, hi, th))
        # assign oof labels for sanity check
        if len(idx) > 0:
            oof_labels[idx] = np.digitize(oof_blend[idx], th) + 1
    oof_labels = np.clip(oof_labels, 1, 6).astype(int)
    q_lenbin = qwk(y_true, oof_labels)
    print(f'[LenBin] OOF QWK with per-bin thresholds={q_lenbin:.5f} (global={q_global:.5f}) bins={len(th_per)}', flush=True)
    # Apply to test
    # Test essays shorter/longer than every training essay fall outside all
    # bins. They used to keep the initial label 0 and were then clipped to
    # score 1 regardless of their prediction; they now use the global thresholds.
    labels = np.digitize(te_blend, th_global) + 1
    for lo, hi, th in th_per:
        idx = np.where((len_te >= lo) & (len_te < hi))[0]
        if len(idx) == 0: continue
        labels[idx] = np.digitize(te_blend[idx], th) + 1
    labels = np.clip(labels, 1, 6).astype(int)
    sub = pd.DataFrame({'essay_id': test_df['essay_id'], 'score': labels})
    sub.to_csv('submission.csv', index=False)
    meta = {
        'weights': list(weights) if isinstance(weights, tuple) else weights,
        'sources': sources,
        'global_thresholds': opt_g.thresholds.tolist(),
        'oof_qwk_global': float(q_global),
        'oof_qwk_lenbin': float(q_lenbin),
        'bins': [(float(lo), float(hi), [float(x) for x in th]) for (lo,hi,th) in th_per]
    }
    with open('blend_lenbin_params.json','w') as f:
        json.dump(meta, f)
    print('Saved submission.csv (length-bin thresholds) and blend_lenbin_params.json', flush=True)
    print(f'=== Length-bin calibration done in {(time.time()-t0)/60:.1f} min ===', flush=True)

# Run the length-bin calibration with six requested quantile bins; this writes submission.csv.
run_length_bin_thresholds(n_bins=6)

In [42]:
import time, json, numpy as np, pandas as pd
from sklearn.metrics import cohen_kappa_score

# Banner marking the start of this cell's log output (flushed immediately).
print('=== Fast affine calibration on blended preds (fixed thresholds) ===', flush=True)

def qwk(y_true, y_pred_int):
    """Quadratic-weighted Cohen's kappa of integer predictions against the true labels."""
    scorer_kwargs = {'weights': 'quadratic'}
    return cohen_kappa_score(y_true, y_pred_int, **scorer_kwargs)

def load_blend_meta():
    """Read weights, thresholds and source names from ``blend_existing_params.json``.

    Returns:
        ``(weights, th, sources)``. ``weights`` is a tuple when the JSON stores
        a list, ``th`` is a float array of the stored thresholds, and
        ``sources`` is the stored list or, when absent, the leading
        ``['tfidf', 'd1024', 'v3w512']`` names, one per weight.

    Raises:
        FileNotFoundError: if the params file is missing.
        KeyError: if ``weights`` or ``thresholds`` is missing from the file.
    """
    with open('blend_existing_params.json', 'r') as f:
        meta = json.load(f)
    weights = meta['weights']
    if isinstance(weights, list):
        weights = tuple(weights)
    th = np.array(meta['thresholds'], dtype=float)
    sources = meta.get('sources')
    if sources is None:
        # A fixed three-name fallback made a two-weight blend load (and require
        # on disk) a third model's files it never uses; size it to the weights.
        sources = ['tfidf', 'd1024', 'v3w512'][:len(weights)]
    return weights, th, sources

def load_oof_and_test(sources):
    """Load the saved OOF and test prediction arrays for each named source.

    Recognised names are ``'tfidf'``, ``'d1024'`` and ``'v3w512'``; any other
    name raises ``ValueError``. Returns ``(oofs, tests)``: two lists in the
    order of ``sources``, each array cast to float and clipped to [0.5, 6.5].
    """
    file_table = {
        'tfidf': ('oof_tfidf.npy', 'test_tfidf.npy'),
        'd1024': ('oof_deberta_base_1024.npy', 'test_deberta_base_1024.npy'),
        'v3w512': ('oof_deberta_v3_base_win512.npy', 'test_deberta_v3_base_win512.npy'),
    }
    loaded_oof = []
    loaded_test = []
    for s in sources:
        # Only string names can match; anything else is reported as unknown.
        pair = file_table.get(s) if isinstance(s, str) else None
        if pair is None:
            raise ValueError(f'Unknown source {s}')
        oof_path, test_path = pair
        loaded_oof.append(np.load(oof_path))
        loaded_test.append(np.load(test_path))
    oofs = [np.clip(arr.astype(float), 0.5, 6.5) for arr in loaded_oof]
    tests = [np.clip(arr.astype(float), 0.5, 6.5) for arr in loaded_test]
    return oofs, tests

def blend(arrs, weights):
    """Weighted sum of two arrays (when ``weights`` has length 2) or otherwise
    of three arrays, cast to float and clipped to [0.5, 6.5].
    """
    cols = [a.astype(float) for a in arrs]
    if len(weights) != 2:
        # Unpacking enforces exactly three weights on this path.
        wa, wb, wc = weights
        mix = wa*cols[0] + wb*cols[1] + wc*cols[2]
    else:
        wa, wb = weights
        mix = wa*cols[0] + wb*cols[1]
    return np.clip(mix, 0.5, 6.5)

def run_affine_calibration():
    """Grid-search an affine correction ``a*x + b`` of the blended predictions.

    Thresholds, weights and sources are read from
    ``blend_existing_params.json`` and the thresholds stay fixed. The search
    covers 21 scales in [0.95, 1.05] and 41 offsets in [-0.10, 0.10] (every
    second point of the finer grids) and keeps the pair with the highest OOF
    QWK. The winning pair is applied to the blended test predictions, which
    are written to ``submission.csv``; the parameters go to
    ``blend_affine_params.json``.
    """
    t0 = time.time()
    train_df = pd.read_csv('train.csv')
    test_df = pd.read_csv('test.csv')
    y_true = train_df['score'].values.astype(int)
    weights, th, sources = load_blend_meta()
    oofs, tests = load_oof_and_test(sources)
    oof_blend = blend(oofs, weights)
    te_blend = blend(tests, weights)
    # Grid over scale (a) and bias (b), keep thresholds fixed
    best = (-1.0, 1.0, 0.0)  # (qwk, a, b)
    A = np.linspace(0.95, 1.05, 41)
    B = np.linspace(-0.10, 0.10, 81)
    for a in A[::2]:
        # Scaled predictions are clipped once per scale, before the offset is added.
        xb = np.clip(a*oof_blend, 0.5, 6.5)
        for b in B[::2]:
            z = np.clip(xb + b, 0.5, 6.5)
            pred = np.digitize(z, th) + 1
            q = qwk(y_true, pred.astype(int))
            if q > best[0]:
                best = (float(q), float(a), float(b))
    q_best, a_best, b_best = best
    print(f'[Affine] Best OOF QWK={q_best:.5f} with a={a_best:.5f}, b={b_best:.5f} (fixed thr)', flush=True)
    # Apply to test
    # Use the same clip -> offset -> clip sequence that was scored on OOF. The
    # earlier single clip of a*x + b was a different transform whenever a*x
    # left [0.5, 6.5], so the submission did not match the validated mapping.
    zt = np.clip(np.clip(a_best*te_blend, 0.5, 6.5) + b_best, 0.5, 6.5)
    labels = np.digitize(zt, th) + 1
    labels = np.clip(labels, 1, 6).astype(int)
    sub = pd.DataFrame({'essay_id': test_df['essay_id'], 'score': labels})
    sub.to_csv('submission.csv', index=False)
    with open('blend_affine_params.json','w') as f:
        json.dump({'a': a_best, 'b': b_best, 'oof_qwk_affine': q_best, 'thresholds': th.tolist(), 'weights': list(weights), 'sources': sources}, f)
    print('Saved submission.csv (affine-calibrated blend) and blend_affine_params.json', flush=True)
    print(f'=== Affine calibration done in {(time.time()-t0)/60:.1f} min ===', flush=True)

# Run the affine calibration; this writes submission.csv and blend_affine_params.json.
run_affine_calibration()

In [44]:
import time, json, numpy as np, pandas as pd
from sklearn.isotonic import IsotonicRegression
from sklearn.metrics import cohen_kappa_score

# Banner marking the start of this cell's log output (flushed immediately).
print('=== Isotonic calibration on blended preds + threshold re-opt (iters<=80) ===', flush=True)

def qwk(y_true, y_pred_int):
    """Score integer predictions against true labels with quadratic-weighted Cohen's kappa."""
    return cohen_kappa_score(
        y_true,
        y_pred_int,
        weights='quadratic',
    )

class ThresholdOptimizer:
    """Coordinate-wise hill climbing over five score thresholds, scored with ``qwk``.

    Predictions are mapped to labels 1..6 by counting how many thresholds they
    reach (``np.digitize``) and adding one.
    """

    def __init__(self, init_thresholds=None):
        start = [1.5, 2.5, 3.5, 4.5, 5.5] if init_thresholds is None else init_thresholds
        self.thresholds = np.array(start, dtype=float)

    def _apply(self, preds):
        """Convert continuous predictions to labels using the current thresholds."""
        bucket = np.digitize(preds, self.thresholds)
        return bucket + 1

    def fit(self, y_true, preds, iters=80, step=0.05):
        """Hill-climb the thresholds to maximise QWK.

        Every pass perturbs each threshold by ``-step`` then ``+step`` and
        accepts any strictly better candidate immediately. When a full pass
        brings no improvement the step is halved, and the search stops once it
        falls below 1e-4 or after ``iters`` passes.

        Returns:
            ``(thresholds, score)``; ``self.thresholds`` is updated as well.
        """
        bounded = np.clip(preds, 0.5, 6.5)
        best = self.thresholds.copy()
        best_score = qwk(y_true, self._apply(bounded))
        for _ in range(iters):
            improved = False
            for which in range(5):
                one_hot = (np.arange(5) == which).astype(float)
                for shift in (-step, step):
                    cand = np.sort(np.clip(best + one_hot * shift, 0.5, 6.5))
                    # Accept only strictly increasing cut points strictly inside (0.5, 6.5).
                    if 0.5 < cand[0] < cand[1] < cand[2] < cand[3] < cand[4] < 6.5:
                        s = qwk(y_true, np.digitize(bounded, cand) + 1)
                        if s > best_score:
                            best_score = s
                            best = cand
                            improved = True
            if not improved:
                step = step * 0.5
                if step < 1e-4:
                    break
        self.thresholds = best
        return best, best_score

def load_blend_meta():
    """Read the saved blend configuration from ``blend_existing_params.json``.

    Returns:
        ``(weights, th, sources, oof_qwk)``. ``weights`` is a tuple when the
        JSON stores a list; ``th`` is a float array of the stored thresholds;
        ``sources`` is the stored list or, when absent, the leading
        ``['tfidf', 'd1024', 'v3w512']`` names, one per weight; ``oof_qwk`` is
        the stored ``oof_qwk`` value as a float, or 0.0 when the file has none.

    Raises:
        FileNotFoundError: if the params file is missing.
        KeyError: if ``weights`` or ``thresholds`` is missing from the file.
    """
    with open('blend_existing_params.json', 'r') as f:
        meta = json.load(f)
    weights = meta['weights']
    if isinstance(weights, list):
        weights = tuple(weights)
    th = np.array(meta['thresholds'], dtype=float)
    sources = meta.get('sources')
    if sources is None:
        # A fixed three-name fallback made a two-weight blend load (and require
        # on disk) a third model's files it never uses; size it to the weights.
        sources = ['tfidf', 'd1024', 'v3w512'][:len(weights)]
    return weights, th, sources, float(meta.get('oof_qwk', 0.0))

def load_oof_and_test(sources):
    """Load OOF and test predictions for the requested sources.

    Args:
        sources: iterable of names out of ``'tfidf'``, ``'d1024'``, ``'v3w512'``.

    Returns:
        ``(oofs, tests)`` lists ordered like ``sources``; arrays are cast to
        float and clipped to [0.5, 6.5].

    Raises:
        ValueError: for any other source name.
    """
    catalogue = [
        ('tfidf', 'oof_tfidf.npy', 'test_tfidf.npy'),
        ('d1024', 'oof_deberta_base_1024.npy', 'test_deberta_base_1024.npy'),
        ('v3w512', 'oof_deberta_v3_base_win512.npy', 'test_deberta_v3_base_win512.npy'),
    ]
    oofs, tests = [], []
    for s in sources:
        for name, oof_file, test_file in catalogue:
            if s == name:
                oofs.append(np.load(oof_file))
                tests.append(np.load(test_file))
                break
        else:
            # No catalogue entry matched this name.
            raise ValueError(f'Unknown source {s}')
    lo, hi = 0.5, 6.5
    oofs = [np.clip(x.astype(float), lo, hi) for x in oofs]
    tests = [np.clip(x.astype(float), lo, hi) for x in tests]
    return oofs, tests

def blend(arrs, weights):
    """Combine prediction arrays with the given weights and clip to [0.5, 6.5].

    Two weights blend the first two arrays; any other ``weights`` is unpacked
    as exactly three weights for the first three arrays.
    """
    floats = [a.astype(float) for a in arrs]
    two_way = len(weights) == 2
    if two_way:
        w1, w2 = weights
    else:
        w1, w2, w3 = weights
    combined = w1*floats[0] + w2*floats[1]
    if not two_way:
        combined = combined + w3*floats[2]
    return np.clip(combined, 0.5, 6.5)

def run_isotonic_calibration():
    """Isotonic-calibrate the blended predictions, re-fit thresholds, write a submission.

    The blend configuration comes from ``load_blend_meta``. An increasing
    isotonic regression (outputs bounded to [0.5, 6.5], out-of-range inputs
    clipped) is fitted from blended OOF predictions to the true scores and
    applied to both OOF and test blends. Thresholds are then re-optimised on
    the transformed OOF values (80 iterations, starting from the stored
    thresholds) and used to label the test set. Outputs: ``submission.csv``
    and ``blend_isotonic_params.json``.
    """
    started = time.time()
    train_df = pd.read_csv('train.csv')
    test_df = pd.read_csv('test.csv')
    y_true = train_df['score'].values.astype(int)

    weights, th_init, sources, q_base = load_blend_meta()
    oofs, tests = load_oof_and_test(sources)
    oof_blend = blend(oofs, weights)
    te_blend = blend(tests, weights)

    # Fit isotonic regression (monotonic) mapping blended preds -> target.
    # NOTE(review): the mapping is fitted and scored on the same OOF rows, so
    # the printed QWK is presumably optimistic -- confirm before relying on it.
    iso = IsotonicRegression(y_min=0.5, y_max=6.5, increasing=True, out_of_bounds='clip')
    iso.fit(oof_blend, y_true.astype(float))
    oof_iso, te_iso = [np.clip(iso.transform(x), 0.5, 6.5) for x in (oof_blend, te_blend)]

    # Re-opt thresholds on isotonic-transformed OOF with iters<=80
    searcher = ThresholdOptimizer(th_init.copy())
    th_best, q_iso = searcher.fit(y_true, oof_iso.copy(), iters=80, step=0.05)
    print(f'[Iso] OOF QWK after isotonic + re-threshold={q_iso:.5f} (base blend={q_base:.5f}) thr={th_best}', flush=True)

    # Apply to test
    labels = np.clip(np.digitize(te_iso, th_best) + 1, 1, 6).astype(int)
    submission = pd.DataFrame({'essay_id': test_df['essay_id'], 'score': labels})
    submission.to_csv('submission.csv', index=False)

    params = {
        'oof_qwk_isotonic': float(q_iso),
        'oof_qwk_base': float(q_base),
        'thresholds': th_best.tolist(),
    }
    with open('blend_isotonic_params.json','w') as f:
        json.dump(params, f)
    print('Saved submission.csv (isotonic-calibrated blend) and blend_isotonic_params.json')
    print(f'=== Isotonic calibration done in {(time.time()-started)/60:.2f} min ===', flush=True)

# Run the isotonic calibration; this writes submission.csv and blend_isotonic_params.json.
run_isotonic_calibration()