# Plan: chaii - Hindi and Tamil QA (MLE-Benchmark)

Objectives:
- Establish GPU availability and environment sanity.
- Inspect data schema and target (answer_start).
- Build a fast, reproducible baseline with proper CV mirroring test.
- Iterate to a medal via improved models (mBERT/XLM-R), span extraction heuristics, ensembling.

Validation:
- Stratify by language if available, and in either case group by article/context id to avoid leakage (the same context can appear in multiple rows).
- Use KFold with deterministic seed; save folds for reuse.

Modeling roadmap:
1) Baseline heuristic: character-based match of answer_text in context to verify target and compute a sanity Jaccard.
2) Transformer QA head (start/end token classification) using multilingual base (xlm-roberta-base → large), GPU-required.
3) OOF-based error analysis; adjust preprocessing, improve max_length/stride, post-processing.
4) Seed averaging and model ensembling.

Deliverables:
- Reliable CV (OOF Jaccard) and submission.csv.
- Logs with timing per fold; cached OOF/test predictions.

Next:
1) Check GPU
2) Load and profile data
3) Define CV splits and quick baseline

In [30]:
# Environment check: GPU availability
import subprocess, sys, time, os
# Show the GPU state first. `|| true` keeps the shell's exit status at zero
# even when nvidia-smi is missing, so the cell never fails on a CPU-only host.
print("Running nvidia-smi...", flush=True)
smi_cmd = ['bash', '-lc', 'nvidia-smi || true']
subprocess.run(smi_cmd, check=False)

# Record the interpreter version and the visible-device mask (None when unset).
print(f"Python: {sys.version}")
visible_devices = os.environ.get('CUDA_VISIBLE_DEVICES')
print(f"CUDA_VISIBLE_DEVICES: {visible_devices}")
print("Done GPU sanity.")

Running nvidia-smi...


Thu Sep 25 03:16:33 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.158.01             Driver Version: 580.65.06      CUDA Version: 13.0     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla V100-SXM2-16GB           Off |   00000000:00:04.0 Off |                    0 |
| N/A   61C    P0            276W /  300W |    8700MiB /  16384MiB |     96%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
# EDA: load data, inspect schema, verify labels and submission format
import pandas as pd, unicodedata, re, hashlib, time, os
# Start the EDA timer, then read both competition tables from the working dir.
t0 = time.time()
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Print shapes, then column lists, then a 3-row preview of each table.
for _name, _df in (('Train', train), ('Test', test)):
    print(f'{_name} shape:', _df.shape)
for _name, _df in (('Train', train), ('Test', test)):
    print(f'{_name} columns:', list(_df.columns))
for _name, _df in (('Train', train), ('Test', test)):
    print(f'\nHead {_name.lower()}:')
    print(_df.head(3))

# Check sample submission format
sub = pd.read_csv('sample_submission.csv')
print('\nSample submission columns:', list(sub.columns))
print(sub.head())

# Normalization helpers for label verification
# Zero-width characters removed during normalization:
# ZWSP (U+200B), ZWNJ (U+200C), ZWJ (U+200D) and BOM/ZWNBSP (U+FEFF).
ZW_CHARS = ''.join(map(chr, (0x200B, 0x200C, 0x200D, 0xFEFF)))
ZW_RE = re.compile('[' + re.escape(ZW_CHARS) + ']')
WS_RE = re.compile(r"\s+")


def normalize_text(s:str)->str:
    """Return *s* NFKC-normalized, without zero-width chars, whitespace-collapsed.

    Any non-``str`` input (e.g. NaN from pandas) yields the empty string.
    """
    if not isinstance(s, str):
        return ''
    composed = unicodedata.normalize('NFKC', s)
    without_zw = ZW_RE.sub('', composed)
    # Collapse every whitespace run to one space and drop the outer padding.
    return WS_RE.sub(' ', without_zw).strip()

# Verify answer alignment if columns exist
# Counters for the alignment audit over the first `sample_rows` training rows.
align_checks = dict.fromkeys(('total', 'ok', 'mismatch', 'nan_els'), 0)
sample_rows = min(2000, len(train))
cols = set(train.columns.str.lower())
has_answer_start = 'answer_start' in cols
has_answer_text = 'answer_text' in cols
print(f'Has answer_start: {has_answer_start}, has answer_text: {has_answer_text}')
if has_answer_start and has_answer_text:
    def col(name):
        """Return the real train column whose lowercase form is *name*, else *name*."""
        return next((c for c in train.columns if c.lower() == name), name)

    c_context = col('context')
    c_answer_text = col('answer_text')
    c_answer_start = col('answer_start')
    mism_examples = []
    rows = zip(train[c_context].astype(str),
               train[c_answer_text].astype(str),
               train[c_answer_start])
    for i, (ctx, ans, st) in enumerate(rows):
        if i >= sample_rows:
            break
        align_checks['total'] += 1
        if pd.isna(st):
            # Missing start offset: counted separately, not as a mismatch.
            align_checks['nan_els'] += 1
            continue
        st = int(st)
        in_range = 0 <= st < len(ctx)
        slice_txt = ctx[st:st + len(ans)] if in_range else ''
        # Accept a raw match first; otherwise compare the normalized forms.
        matched = slice_txt == ans or normalize_text(slice_txt) == normalize_text(ans)
        if matched:
            align_checks['ok'] += 1
            continue
        align_checks['mismatch'] += 1
        if len(mism_examples) < 5:
            mism_examples.append({'i': i, 'slice': slice_txt, 'ans': ans})
    print('Align checks on first', sample_rows, 'rows:', align_checks)
    if mism_examples:
        print('Examples of mismatches (up to 5):')
        for ex in mism_examples:
            print(ex)

# Propose grouping id by normalized context hash
# Locate the context column case-insensitively (None when the table has none).
ctx_col = next((c for c in train.columns if c.lower() == 'context'), None)
if ctx_col is not None:
    # Group id = MD5 of the normalized context, so rows sharing a passage
    # (up to normalization) land in the same group.
    norm_ctx = train[ctx_col].astype(str).map(normalize_text)
    grp = norm_ctx.map(lambda text: hashlib.md5(text.encode('utf-8')).hexdigest())
    uniq = grp.nunique()
    print('Proposed group ids (normalized context hash) unique:', uniq, 'over', len(train))
elapsed = time.time() - t0
print(f'EDA done in {elapsed:.2f}s')

# Notes printed; next: build CV splits and metric

In [None]:
# Build CV splits and metric; save folds
import numpy as np, pandas as pd, hashlib, unicodedata, re, time
from sklearn.model_selection import StratifiedGroupKFold, GroupKFold

def remove_zw(s: str) -> str:
    """Delete ZWSP/ZWNJ/ZWJ/BOM characters from *s*; non-str input gives ''."""
    if not isinstance(s, str):
        return ''
    # Mapping a code point to None makes str.translate drop that character.
    zero_width = dict.fromkeys((0x200B, 0x200C, 0x200D, 0xFEFF))
    return s.translate(zero_width)

def norm_for_metric(s: str) -> str:
    """Canonicalize *s* for scoring.

    Applies NFKC, removes zero-width characters (U+200B/C/D, U+FEFF) and
    collapses whitespace runs to single spaces with no outer padding.
    Non-str input gives ''.
    """
    if not isinstance(s, str):
        return ''
    text = unicodedata.normalize('NFKC', s)
    text = re.sub(r"[\u200B\u200C\u200D\uFEFF]", "", text)
    # split() with no argument splits on whitespace runs and ignores the edges.
    return " ".join(text.split())

def word_jaccard(a: str, b: str) -> float:
    """Jaccard similarity of the whitespace-token sets of *a* and *b*.

    Both strings go through ``norm_for_metric`` first. Two empty token sets
    score 1.0; exactly one empty set scores 0.0.
    """
    tokens_a = set(norm_for_metric(a).split())
    tokens_b = set(norm_for_metric(b).split())
    if not (tokens_a or tokens_b):
        return 1.0
    if not (tokens_a and tokens_b):
        return 0.0
    shared = tokens_a & tokens_b
    combined = tokens_a | tokens_b
    return len(shared) / len(combined)

# Prepare grouping by normalized context
ctx_col, lang_col = 'context', 'language'
# Group key: MD5 of the normalized context so duplicated passages share a fold.
norm_ctx = train[ctx_col].astype(str).map(norm_for_metric)
groups = norm_ctx.map(lambda text: hashlib.md5(text.encode('utf-8')).hexdigest())
y_len = train['answer_text'].astype(str).map(len)  # proxy to avoid constant y

n_splits = 5
if lang_col in train.columns:
    # Stratify on language while keeping each context group in one fold.
    cv = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=42)
    y_for_split = train[lang_col]
else:
    cv = GroupKFold(n_splits=n_splits)
    y_for_split = y_len
split_iter = cv.split(train, y_for_split, groups)

# Every row receives the index of the fold in which it is held out.
folds = np.full(len(train), -1, dtype=int)
for fold, (trn_idx, val_idx) in enumerate(split_iter):
    folds[val_idx] = fold
assert (folds>=0).all(), 'Some folds not assigned'
train['fold'] = folds
train.to_csv('train_folds.csv', index=False)
print('Saved train_folds.csv with fold distribution:')
print(train['fold'].value_counts().sort_index())

# Quick metric sanity: OOF using gold answers should be 1.0 on average
oof_j = []
for f in range(n_splits):
    val = train[train['fold'] == f]
    j = val['answer_text'].map(lambda gold: word_jaccard(gold, gold)).mean()
    oof_j.append(j)
print('Sanity OOF word-jaccard (gold vs gold) per fold:', [round(x,4) for x in oof_j], 'mean=', round(float(np.mean(oof_j)),4))

print('CV setup complete. Next: implement HF QA dataset + training loop.')

In [None]:
# Install PyTorch (cu121) and HF stack; verify GPU
import subprocess, sys, os, shutil, time
from pathlib import Path

def pip(*args):
    """Echo and run ``python -m pip <args>`` with the current interpreter.

    Raises ``subprocess.CalledProcessError`` when pip exits non-zero.
    """
    print('> pip', *args, flush=True)
    cmd = [sys.executable, '-m', 'pip']
    cmd.extend(args)
    subprocess.run(cmd, check=True)

# Uninstall any preexisting torch stack
# (check=False: a package that is not installed must not abort the cell).
for pkg in ('torch','torchvision','torchaudio'):
    subprocess.run([sys.executable, '-m', 'pip', 'uninstall', '-y', pkg], check=False)

# Clean stray site dirs (idempotent)
# Paths are hard-coded for two specific torch builds (2.8.0 and 2.4.1) under
# /app/.pip-target; anything else left behind is not removed by this loop.
for d in (
    '/app/.pip-target/torch',
    '/app/.pip-target/torchvision',
    '/app/.pip-target/torchaudio',
    '/app/.pip-target/torch-2.8.0.dist-info',
    '/app/.pip-target/torch-2.4.1.dist-info',
    '/app/.pip-target/torchvision-0.23.0.dist-info',
    '/app/.pip-target/torchvision-0.19.1.dist-info',
    '/app/.pip-target/torchaudio-2.8.0.dist-info',
    '/app/.pip-target/torchaudio-2.4.1.dist-info',
    '/app/.pip-target/torchgen',
    '/app/.pip-target/functorch',
):
    if os.path.exists(d):
        print('Removing', d); shutil.rmtree(d, ignore_errors=True)

# Install the pinned torch trio from the cu121 wheel index; PyPI stays
# available as an extra index for everything else pip needs to resolve.
print('Installing torch cu121 stack...', flush=True)
pip('install',
    '--index-url', 'https://download.pytorch.org/whl/cu121',
    '--extra-index-url', 'https://pypi.org/simple',
    'torch==2.4.1', 'torchvision==0.19.1', 'torchaudio==2.4.1')

# Constraints file pins the torch versions so the next install cannot
# silently replace them while resolving the HF packages.
Path('constraints.txt').write_text('torch==2.4.1\ntorchvision==0.19.1\ntorchaudio==2.4.1\n')

print('Installing transformers/datasets/accelerate...', flush=True)
pip('install', '-c', 'constraints.txt',
    'transformers==4.44.2', 'accelerate==0.34.2',
    'datasets==2.21.0', 'evaluate==0.4.2',
    'sentencepiece', 'scikit-learn', 'numpy')

# Import only after the reinstall, then fail fast if the wheel is not a
# CUDA 12.1 build or no GPU is usable.
import torch
print('torch:', torch.__version__, 'built CUDA:', getattr(torch.version, 'cuda', None))
print('CUDA available:', torch.cuda.is_available())
assert str(getattr(torch.version,'cuda','')).startswith('12.1'), f'Wrong CUDA build: {torch.version.cuda}'
assert torch.cuda.is_available(), 'CUDA not available after install'
print('GPU:', torch.cuda.get_device_name(0))
print('Setup complete.')

In [None]:
# HF QA pipeline: tokenizer and feature preparation (no training yet)
import pandas as pd, numpy as np, math, time, unicodedata, re, hashlib
import torch
from transformers import AutoTokenizer, DataCollatorWithPadding

def remove_zw(s: str) -> str:
    """Return *s* with U+200B, U+200C, U+200D and U+FEFF removed ('' if not a str)."""
    if not isinstance(s, str):
        return ''
    zero_width = "\u200B\u200C\u200D\uFEFF"
    return "".join(ch for ch in s if ch not in zero_width)

def norm_text(s: str) -> str:
    """NFKC-normalize *s*, drop zero-width characters and collapse whitespace.

    Non-str input gives ''. The result has no leading or trailing whitespace.
    """
    if not isinstance(s, str):
        return ''
    nfkc = unicodedata.normalize('NFKC', s)
    no_zw = re.sub(r"[\u200B\u200C\u200D\uFEFF]", "", nfkc)
    collapsed = re.sub(r"\s+", " ", no_zw)
    return collapsed.strip()

# Checkpoint and windowing settings used by the feature builders in this cell.
model_name = 'microsoft/mdeberta-v3-base'
# max_length caps each tokenized question+context window; doc_stride is the
# overlap (passed as `stride`) between consecutive windows of one context.
max_length = 384
doc_stride = 128
print('Loading tokenizer:', model_name)
# The feature builders request offset mappings and sequence_ids(), hence the
# fast tokenizer.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

# Build features with overflow and offsets; map gold start/end to token indices
def prepare_train_features(df: pd.DataFrame):
    """Tokenize question/context pairs into overlapping windows with span labels.

    Uses the module-level ``tokenizer``, ``max_length`` and ``doc_stride``.
    Only the context is truncated; long contexts overflow into several
    features. For each feature the gold character span
    ``[answer_start, answer_start + len(answer_text))`` is mapped to token
    indices. Features that do not fully contain the answer (or have no
    context tokens) are labelled ``(0, 0)``.

    Returns the tokenizer output with ``start_positions`` and
    ``end_positions`` added and ``overflow_to_sample_mapping`` removed.
    """
    questions = df['question'].astype(str).tolist()
    contexts = df['context'].astype(str).tolist()
    gold_texts = df['answer_text'].astype(str).tolist()
    gold_starts = df['answer_start'].astype(int).tolist()
    tokenized = tokenizer(
        questions,
        contexts,
        truncation='only_second',
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding=False,
    )
    sample_mapping = tokenized.pop('overflow_to_sample_mapping')

    start_positions, end_positions = [], []
    for feat_idx, offsets in enumerate(tokenized['offset_mapping']):
        src = sample_mapping[feat_idx]
        # sequence_ids marks question(0)/context(1)/special(None)
        seq_ids = tokenized.sequence_ids(feat_idx)
        start_char = gold_starts[src]
        end_char = start_char + len(gold_texts[src])

        ctx_positions = [k for k, sid in enumerate(seq_ids) if sid == 1]
        label = (0, 0)
        if ctx_positions:
            first_ctx, last_ctx = ctx_positions[0], ctx_positions[-1]
            answer_inside = (offsets[first_ctx][0] <= start_char
                             and offsets[last_ctx][1] >= end_char)
            if answer_inside:
                # Walk right past every token starting at or before the
                # answer start, then step back onto the last such token.
                start_token = first_ctx
                while start_token <= last_ctx and offsets[start_token][0] <= start_char:
                    start_token += 1
                start_token -= 1
                # Mirror image for the end boundary.
                end_token = last_ctx
                while end_token >= first_ctx and offsets[end_token][1] >= end_char:
                    end_token -= 1
                end_token += 1
                label = (start_token, end_token)
        start_positions.append(label[0])
        end_positions.append(label[1])

    tokenized['start_positions'] = start_positions
    tokenized['end_positions'] = end_positions
    return tokenized

def prepare_validation_features(df: pd.DataFrame):
    """Tokenize question/context pairs into overlapping windows, without labels.

    Same windowing as the training features (module-level ``tokenizer``,
    ``max_length``, ``doc_stride``). ``overflow_to_sample_mapping`` and
    ``offset_mapping`` are kept in the returned encoding.
    """
    questions = df['question'].astype(str).tolist()
    contexts = df['context'].astype(str).tolist()
    window_opts = dict(
        truncation='only_second',
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding=False,
    )
    return tokenizer(questions, contexts, **window_opts)

# Smoke-build features for one fold to validate pipeline speed and shapes
fold = 0
df_tr = pd.read_csv('train_folds.csv')
in_val = df_tr['fold'] == fold
trn_df = df_tr[~in_val].reset_index(drop=True)
val_df = df_tr[in_val].reset_index(drop=True)

# Time feature building on a small slice only; this is a smoke test.
t0 = time.time()
trn_feats = prepare_train_features(trn_df.head(512))  # subsample for quick check
val_feats = prepare_validation_features(val_df.head(128))
print('Train features keys:', list(trn_feats.keys()))
print('Num train features (overflowed examples):', len(trn_feats['input_ids']))
print('Num val features (overflowed examples):', len(val_feats['input_ids']))
print('First train feature lens:', len(trn_feats['input_ids'][0]))
print(f'Prep time: {time.time() - t0:.2f}s')

# Pad to a multiple of 8 only when a GPU is present.
pad_multiple = 8 if torch.cuda.is_available() else None
data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=pad_multiple)
print('Tokenizer and feature pre-processing ready. Next: implement training loop per fold with logging.')

In [None]:
# Train baseline QA (fold 0) with mdeberta-v3-base and evaluate OOF Jaccard
import time, math, numpy as np, pandas as pd, torch, os
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer, DataCollatorWithPadding

# Seed torch and numpy RNGs; the same seed is also passed to TrainingArguments.
seed = 42
torch.manual_seed(seed); np.random.seed(seed)

# Post-processing knobs passed to postprocess_predictions():
# max_answer_len bounds the candidate span size (applied there as an
# approximate character limit of 10x this value); n_best_size is how many
# top start/end logits are paired per feature.
max_answer_len = 30
n_best_size = 20

class QADataset(torch.utils.data.Dataset):
    """Torch dataset view over a dict of per-feature token lists.

    Each item holds ``input_ids`` and ``attention_mask`` as long tensors,
    plus ``token_type_ids`` when the features have them, plus
    ``start_positions``/``end_positions`` when ``with_labels`` is true.
    """

    def __init__(self, features: dict, with_labels: bool=True):
        self.features = features
        self.with_labels = with_labels

    def __len__(self):
        return len(self.features['input_ids'])

    def __getitem__(self, idx):
        # Assemble the key list in the order the item dict should carry them.
        wanted = ['input_ids', 'attention_mask']
        if 'token_type_ids' in self.features:
            wanted.append('token_type_ids')
        if self.with_labels:
            wanted.extend(('start_positions', 'end_positions'))
        return {
            key: torch.tensor(self.features[key][idx], dtype=torch.long)
            for key in wanted
        }

def postprocess_predictions(features, examples_df, start_logits, end_logits, max_answer_len=30, n_best_size=20):
    """Pick one answer span per example from per-feature start/end logits.

    Args:
        features: tokenizer output providing ``'overflow_to_sample_mapping'``,
            ``'offset_mapping'`` and ``sequence_ids(i)``.
        examples_df: frame with a ``'context'`` column, addressed with
            ``.loc[sample_idx]`` (assumes a 0..n-1 index, e.g. after
            ``reset_index(drop=True)`` -- TODO confirm for other callers).
        start_logits, end_logits: indexable by feature, one row of per-token
            scores each.
        max_answer_len: spans longer than ``max_answer_len * 10`` characters
            are rejected.
        n_best_size: number of top start and top end indices paired per feature.

    Returns:
        ``(preds_text, preds_start)``: for each example the stripped best span
        text (``''`` when no candidate survived) and the character offset of
        the chosen start token.
    """
    # Aggregate best span per example across overflowed features
    sample_mapping = features['overflow_to_sample_mapping']
    preds_text = [''] * len(examples_df)
    preds_start = [0] * len(examples_df)
    best_scores = [-1e30] * len(examples_df)
    for i in range(len(sample_mapping)):
        sample_idx = int(sample_mapping[i])
        offsets = features['offset_mapping'][i]
        seq_ids = features.sequence_ids(i)
        # context token range
        context_tokens = [j for j,sid in enumerate(seq_ids) if sid==1]
        if not context_tokens:
            continue
        c_start, c_end = context_tokens[0], context_tokens[-1]
        s_logits = start_logits[i]
        e_logits = end_logits[i]
        # consider top start/end within context
        # (argsort ascending -> take the last n_best_size, reversed to best-first)
        start_indexes = np.argsort(s_logits)[-n_best_size:][::-1]
        end_indexes = np.argsort(e_logits)[-n_best_size:][::-1]
        for si in start_indexes:
            if si < c_start or si > c_end: continue
            for ei in end_indexes:
                if ei < c_start or ei > c_end: continue
                if ei < si: continue
                length = offsets[ei][1] - offsets[si][0]
                # NOTE(review): the 512-token bound looks ineffective when
                # features are capped at max_length tokens -- confirm.
                if length <= 0 or (ei - si + 1) > 512: continue
                if (offsets[ei][1] - offsets[si][0]) > max_answer_len*10:
                    # approx char length constraint
                    continue
                # Raw logit sum; strict '>' keeps the earliest-seen candidate on ties.
                score = s_logits[si] + e_logits[ei]
                if score > best_scores[sample_idx]:
                    best_scores[sample_idx] = score
                    start_char = offsets[si][0]
                    end_char = offsets[ei][1]
                    ctx = examples_df.loc[sample_idx, 'context']
                    text = ctx[start_char:end_char].strip()
                    preds_text[sample_idx] = text
                    # start offset is the token start, not adjusted for strip()
                    preds_start[sample_idx] = start_char
    # fallback empty to first 0
    # NOTE(review): context[:0] is always the empty string, so for string
    # contexts this loop leaves empty predictions empty -- confirm the
    # intended fallback.
    for i in range(len(preds_text)):
        if preds_text[i] == '':
            preds_text[i] = examples_df.loc[i, 'context'][:0]
            preds_start[i] = 0
    return preds_text, preds_start

def word_jaccard(a: str, b: str) -> float:
    """Word-level Jaccard similarity after NFKC/zero-width/whitespace cleanup.

    Non-str inputs count as empty. Two empty token sets score 1.0; exactly
    one empty set scores 0.0.
    """
    import unicodedata, re

    def token_set(s):
        if not isinstance(s, str):
            return set()
        cleaned = unicodedata.normalize('NFKC', s)
        cleaned = re.sub(r"[\u200B\u200C\u200D\uFEFF]", "", cleaned)
        cleaned = re.sub(r"\s+", " ", cleaned).strip()
        return set(cleaned.split())

    left, right = token_set(a), token_set(b)
    if not left and not right:
        return 1.0
    if not left or not right:
        return 0.0
    return len(left & right) / len(left | right)

# ---- Data: split by the saved fold column; fold 0 is held out ----
fold = 0
df_tr = pd.read_csv('train_folds.csv')
trn_df = df_tr[df_tr['fold']!=fold].reset_index(drop=True)
val_df = df_tr[df_tr['fold']==fold].reset_index(drop=True)

# Full (not subsampled) feature build; relies on prepare_*_features and the
# tokenizer defined in the previous cell.
trn_feats = prepare_train_features(trn_df)
val_feats = prepare_validation_features(val_df)

train_ds = QADataset(trn_feats, with_labels=True)
val_ds_inputs = QADataset(val_feats, with_labels=False)

model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# ---- Training configuration ----
bsz = 16
args = TrainingArguments(
    output_dir=f'outputs_fold{fold}',
    per_device_train_batch_size=bsz,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=1,
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_ratio=0.1,
    fp16=True,
    logging_steps=50,
    save_steps=5000,
    # No in-training evaluation: validation is done manually below.
    evaluation_strategy='no',
    seed=seed,
    report_to=[]
)

# Use padding collator to handle variable-length sequences
pad_to_mult = 8 if torch.cuda.is_available() else None
data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=pad_to_mult)
trainer = Trainer(model=model, args=args, train_dataset=train_ds, data_collator=data_collator)

print('Starting training fold', fold)
t0=time.time()
trainer.train()
print('Training done in %.2fs' % (time.time()-t0))

# Predict on val features
# Manual inference loop (instead of trainer.predict) over unshuffled batches,
# so the concatenated logits stay aligned with val_feats row-for-row.
model.eval()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
val_loader = torch.utils.data.DataLoader(val_ds_inputs, batch_size=32, shuffle=False, collate_fn=data_collator)
all_start, all_end = [], []
with torch.no_grad():
    t1=time.time();
    for step, batch in enumerate(val_loader):
        for k in list(batch.keys()):
            batch[k] = batch[k].to(device)
        out = model(**batch)
        all_start.append(out.start_logits.detach().cpu().numpy())
        all_end.append(out.end_logits.detach().cpu().numpy())
        if step % 20 == 0:
            print(f'Infer step {step}, elapsed {time.time()-t1:.1f}s', flush=True)
# NOTE(review): batches are padded per batch, so this concatenation assumes
# every batch ends up with the same sequence width -- confirm.
start_logits = np.concatenate(all_start, axis=0)
end_logits = np.concatenate(all_end, axis=0)
print('Val features:', start_logits.shape[0])

# Post-process to text and start index
pred_texts, pred_starts = postprocess_predictions(val_feats, val_df, start_logits, end_logits, max_answer_len=max_answer_len, n_best_size=n_best_size)
val_df['pred_text'] = pred_texts
val_df['pred_start'] = pred_starts

# Per-row word-level Jaccard against the gold answer; the fold score is the mean.
val_df['jaccard'] = [word_jaccard(a, b) for a,b in zip(val_df['answer_text'].astype(str), val_df['pred_text'].astype(str))]
print('Fold', fold, 'OOF Jaccard:', round(float(val_df['jaccard'].mean()), 5))
val_df.to_csv(f'oof_fold{fold}.csv', index=False)
print('Saved oof to', f'oof_fold{fold}.csv')
print('Baseline fold0 complete. Next: expand to all folds + full inference & submission.')

In [2]:
# Upgrade to deepset xlm-roberta-large-squad2: 5-fold training with robust post-processing and OOF Jaccard
import os
# setdefault: only applies when the variable is not already set in the
# environment. It is placed ahead of this cell's torch import.
os.environ.setdefault('PYTORCH_CUDA_ALLOC_CONF', 'expandable_segments:True')
import sys
# Prefer bitsandbytes if already installed; otherwise fall back to Adafactor (no inline installs to avoid torch drift)
# The broad `except Exception` is a deliberate best-effort probe: any failure
# while importing bitsandbytes simply selects the fallback optimizer.
try:
    import bitsandbytes as bnb  # noqa: F401
    HAS_BNB = True
    print('bitsandbytes available: using adamw_bnb_8bit optimizer')
except Exception:
    HAS_BNB = False
    print('bitsandbytes not available: falling back to Adafactor optimizer')

import pandas as pd, numpy as np, time, math, re, unicodedata, torch, glob, gc
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer, DataCollatorWithPadding

# Precision and CUDA housekeeping
# Release cached CUDA blocks and collect Python garbage left by earlier cells.
torch.cuda.empty_cache(); gc.collect()
# Trade float32 matmul precision for speed.
torch.set_float32_matmul_precision('medium')
# Best-effort: the TF32 flag is set inside try/except so a build that rejects
# the assignment does not stop the cell.
try:
    torch.backends.cuda.matmul.allow_tf32 = True
except Exception:
    pass

# ===== Fast normalization helpers (precompute tables & regexes) =====
# Devanagari (U+0966..U+096F) and Tamil (U+0BE6..U+0BEF) digits -> ASCII '0'..'9'.
DEV_MAP = {0x0966 + i: ord('0') + i for i in range(10)}
TAM_MAP = {0x0BE6 + i: ord('0') + i for i in range(10)}
TRANS_TABLE = {**DEV_MAP, **TAM_MAP}
ZW_RE = re.compile(r"[\u200B\u200C\u200D\uFEFF]")
WS_RE = re.compile(r"\s+")


def norm_for_metric(s: str) -> str:
    """Canonicalize *s* for scoring; non-str input gives ''.

    Steps, in order: map Devanagari/Tamil digits to ASCII, apply NFKC, remove
    zero-width characters, collapse whitespace runs and trim the ends.
    """
    if not isinstance(s, str):
        return ''
    ascii_digits = s.translate(TRANS_TABLE)
    composed = unicodedata.normalize('NFKC', ascii_digits)
    visible = ZW_RE.sub('', composed)
    return WS_RE.sub(' ', visible).strip()

# Characters (besides whitespace) that are stripped from the edges of an answer.
PUNCT_STRIP = ''.join([
    '.,:;!?\'\"()[]{}',
    '\u2013\u2014\u2026\u00AB\u00BB\u201C\u201D\u2018\u2019',
    '\u0964\u0965',  # danda, double danda
    '\u060C\u061B',  # Arabic comma, semicolon
    '\uFF0C\u3001\uFF0E\uFF1A\uFF1B\uFF01\uFF1F\uFF08\uFF09\uFF3B\uFF3D\uFF5B\uFF5D',  # fullwidth/CJK
])
# Raw f-string: `\s` must reach the regex engine verbatim. In a non-raw
# literal it is an invalid string escape (DeprecationWarning, and a
# SyntaxWarning on Python 3.12+).
PUNCT_RE = re.compile(rf"^[{re.escape(PUNCT_STRIP)}\s]+|[{re.escape(PUNCT_STRIP)}\s]+$")
def edge_trim(text: str) -> str:
    """Strip leading/trailing punctuation (``PUNCT_STRIP``) and whitespace.

    Interior characters are left untouched. Non-str input gives ''.
    """
    if not isinstance(text, str): return ''
    return PUNCT_RE.sub('', text)

def word_jaccard(a: str, b: str) -> float:
    """Jaccard overlap of the token sets of *a* and *b* after ``norm_for_metric``.

    Returns 1.0 when both token sets are empty and 0.0 when exactly one is.
    """
    words_a = set(norm_for_metric(a).split())
    words_b = set(norm_for_metric(b).split())
    if not (words_a or words_b):
        return 1.0
    if not (words_a and words_b):
        return 0.0
    overlap = words_a & words_b
    everything = words_a | words_b
    return len(overlap) / len(everything)

# Model and lengths
# NOTE(review): this is the *base* SQuAD2 checkpoint; confirm whether the
# large variant was intended for this run.
xlmr_model = 'deepset/xlm-roberta-base-squad2'
max_length = 384  # per expert: 384/128 sweet spot
doc_stride = 128
epochs = 4  # for full training later; smoke will override
# bsz * grad_accum = 16 sequences per optimizer step, presumably wired into
# the training arguments further down -- verify at the call site.
bsz = 4
grad_accum = 4
lr = 2e-5  # per expert
warmup_ratio = 0.10
max_answer_len = 50
# Number of top start/end positions considered per feature when pooling n-best.
n_best_size = 50

print('Loading tokenizer:', xlmr_model, flush=True)
# The feature builders below request offset mappings and sequence_ids(),
# hence the fast tokenizer.
tokenizer_x = AutoTokenizer.from_pretrained(xlmr_model, use_fast=True)
tokenizer_x.padding_side = 'right'

# ===== New alignment helpers (trim boundaries, strict token snapping) =====
ZW_SET = {'\u200B', '\u200C', '\u200D', '\uFEFF'}  # ZWSP, ZWNJ, ZWJ, BOM/ZWNBSP

def _is_ws_or_punct(ch: str) -> bool:
    """True if *ch* is zero-width, whitespace, or in a Unicode 'P*' category.

    The empty string (or any falsy value) is reported as False.
    """
    if not ch:
        return False
    if ch in ZW_SET or ch.isspace():
        return True
    # Every punctuation general category (Pc, Pd, Po, ...) starts with 'P'.
    return unicodedata.category(ch).startswith('P')

def _is_combining(ch: str) -> bool:
    """True if *ch* is a non-spacing mark (Unicode general category 'Mn').

    This covers marks such as virama and nukta. Spacing vowel signs
    (category 'Mc') are not matched.
    """
    category = unicodedata.category(ch)
    return category == 'Mn'

def _trim_bounds(ctx: str, s: int, e: int) -> tuple[int,int]:
    """Shrink the half-open char span ``[s, e)`` of *ctx* past edge noise.

    The start moves right and the end moves left while the boundary
    character is whitespace, punctuation or zero-width and is not a
    combining mark. Returns the adjusted ``(start, end)``, which may be
    empty (``start == end``).
    """
    n_chars = len(ctx)

    def droppable(pos: int) -> bool:
        ch = ctx[pos]
        return _is_ws_or_punct(ch) and not _is_combining(ch)

    lo, hi = s, e
    while lo < hi and lo < n_chars and droppable(lo):
        lo += 1
    while hi > lo and hi - 1 < n_chars and droppable(hi - 1):
        hi -= 1
    return lo, hi

def prepare_train_features_x(df: pd.DataFrame):
    """Tokenize question/context pairs and label each feature by text matching.

    Uses the module-level ``tokenizer_x``, ``max_length`` and ``doc_stride``.
    For every overflowed feature it searches the context tokens for a span
    of at most 35 tokens whose text equals the gold answer, either raw,
    after ``edge_trim``, or after ``norm_for_metric``. Among matches it keeps
    the one with the fewest tokens, breaking ties by fewer characters. The
    chosen span is then tightened to exclude edge whitespace/punctuation.

    Features with no context tokens, an empty normalized gold answer, or no
    matching span are labelled ``(0, 0)``.

    NOTE(review): ``answer_start`` is never read here; labels come purely from
    matching ``answer_text`` inside each feature, so any occurrence of the
    answer text can be labelled -- confirm this is intended.

    Returns the tokenizer output with ``start_positions`` and
    ``end_positions`` added and ``overflow_to_sample_mapping`` removed.
    """
    # Normalized target strings (do not mutate context)
    questions = df['question'].astype(str).tolist()
    contexts = df['context'].astype(str).tolist()
    answers = df['answer_text'].astype(str).tolist()
    gold_norms = [norm_for_metric(a) for a in answers]
    gold_norms_trim = [edge_trim(x) for x in gold_norms]

    tok = tokenizer_x(
        questions, contexts,
        truncation='only_second', max_length=max_length, stride=doc_stride,
        return_overflowing_tokens=True, return_offsets_mapping=True, padding=False
    )
    sample_map = tok.pop('overflow_to_sample_mapping')
    offsets_list = tok['offset_mapping']

    start_positions, end_positions = [], []

    for i, offsets in enumerate(offsets_list):
        # ex: index of the source example this feature was cut from
        ex = int(sample_map[i])
        seq_ids = tok.sequence_ids(i)
        # context token range
        ctx_tokens = [j for j, sid in enumerate(seq_ids) if sid == 1]
        if not ctx_tokens:
            start_positions.append(0); end_positions.append(0); continue
        c0, c1 = ctx_tokens[0], ctx_tokens[-1]

        ctx = contexts[ex]
        gold = answers[ex]
        gold_n = gold_norms[ex]
        gold_nt = gold_norms_trim[ex]
        if gold_n == '':
            start_positions.append(0); end_positions.append(0); continue

        # Scan token spans (limit to 35 tokens) to find a normalized exact match to gold
        best = None  # (len_tokens, si, ei)
        for si in range(c0, c1+1):
            sj, ej = offsets[si]
            if sj is None or ej is None: continue
            for ei in range(si, min(c1, si+34)+1):
                s2, e2 = offsets[ei]
                if s2 is None or e2 is None: continue
                if e2 <= sj: continue
                span_text = ctx[sj:e2]
                # quick exact/edge-trim equality checks before normalization
                span_edge = edge_trim(span_text.strip())
                if span_text == gold or span_edge == gold:
                    cand_len = ei - si + 1
                    # prefer fewer tokens; on a tie prefer the shorter char span
                    if best is None or cand_len < best[0] or (cand_len == best[0] and (e2 - sj) < (offsets[best[2]][1] - offsets[best[1]][0])):
                        best = (cand_len, si, ei)
                        # the `continue` only fires when `best` was updated;
                        # otherwise control falls through to the normalized check
                        continue
                cand_n = norm_for_metric(span_text)
                if cand_n == gold_n or edge_trim(cand_n) == gold_n or cand_n == gold_nt:
                    cand_len = ei - si + 1
                    if best is None or cand_len < best[0] or (best is not None and cand_len == best[0] and (e2 - sj) < (offsets[best[2]][1] - offsets[best[1]][0])):
                        best = (cand_len, si, ei)
        if best is None:
            # no exact-normalized match within this feature
            start_positions.append(0); end_positions.append(0); continue
        _, si, ei = best

        # Final boundary tightening: trim ws/punct at char-level, then snap to strict token boundaries inside [si,ei]
        s_char = offsets[si][0]; e_char = offsets[ei][1]
        if s_char is None or e_char is None or e_char <= s_char:
            start_positions.append(0); end_positions.append(0); continue
        s_adj, e_adj = _trim_bounds(ctx, s_char, e_char)
        if s_adj < e_adj:
            # strict start: first token with token_start >= s_adj
            si2 = None
            for j in range(si, ei+1):
                sj, ej = offsets[j]
                if sj is None or ej is None: continue
                if sj >= s_adj:
                    si2 = j; break
            # strict end: last token with token_end <= e_adj
            ei2 = None
            for j in range(ei, si-1, -1):
                sj, ej = offsets[j]
                if sj is None or ej is None: continue
                if ej <= e_adj:
                    ei2 = j; break
            # adopt the tightened span only when both ends were found and
            # still form a valid range; otherwise keep the matched span
            if si2 is not None and ei2 is not None and si2 <= ei2:
                si, ei = si2, ei2

        start_positions.append(int(si))
        end_positions.append(int(ei))

    tok['start_positions'] = start_positions
    tok['end_positions'] = end_positions
    return tok

def prepare_features_only_x(df: pd.DataFrame):
    """Tokenize question/context pairs into overlapping windows, without labels.

    Uses the module-level ``tokenizer_x``, ``max_length`` and ``doc_stride``.
    The returned encoding keeps ``overflow_to_sample_mapping`` and
    ``offset_mapping``.
    """
    questions = df['question'].astype(str).tolist()
    contexts = df['context'].astype(str).tolist()
    window_opts = dict(
        truncation='only_second',
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding=False,
    )
    return tokenizer_x(questions, contexts, **window_opts)

class QADataset(torch.utils.data.Dataset):
    """Torch dataset view over a dict of per-feature token lists.

    Each item holds ``input_ids`` and ``attention_mask``, plus
    ``token_type_ids`` when present, plus the span labels when
    ``with_labels`` is true. Every item also carries ``feat_idx``, the
    feature's own index, so predictions can be re-aligned with features.
    All values are long tensors.
    """

    def __init__(self, feats: dict, with_labels: bool):
        self.f = feats
        self.with_labels = with_labels

    def __len__(self):
        return len(self.f['input_ids'])

    def __getitem__(self, idx):
        wanted = ['input_ids', 'attention_mask']
        if 'token_type_ids' in self.f:
            wanted.append('token_type_ids')
        if self.with_labels:
            wanted.extend(('start_positions', 'end_positions'))
        item = {
            key: torch.tensor(self.f[key][idx], dtype=torch.long)
            for key in wanted
        }
        # carry feature index for robust alignment in validation
        item['feat_idx'] = torch.tensor(idx, dtype=torch.long)
        return item

def _log_softmax_masked(x: np.ndarray, c0: int, c1: int) -> np.ndarray:
    """Log-softmax of *x* restricted to the inclusive index window ``[c0, c1]``.

    Positions outside the window are set to -1e9 before normalizing, so they
    come out as very large negative log-probabilities. The computation is
    done on a float32 copy of *x*.
    """
    masked = np.full_like(x, -1e9, dtype=np.float32)
    window = slice(c0, c1 + 1)
    masked[window] = x[window]
    # Subtract the max before exponentiating for numerical stability.
    shifted = masked - np.max(masked)
    partition = np.sum(np.exp(shifted))
    return shifted - np.log(partition + 1e-12)

# Matches one ASCII, Devanagari or Tamil digit.
DIGIT_PAT = re.compile(r"[0-9\u0966-\u096f\u0be6-\u0bef]")
def _is_punct(ch: str) -> bool:
    """True if *ch* is found in ``PUNCT_STRIP`` or is whitespace.

    The empty string also returns True, because ``'' in some_str`` is always
    true in Python.
    """
    if ch in PUNCT_STRIP:
        return True
    return ch.isspace()

def pool_nbest_over_features(features, examples_df, start_logits, end_logits, rerank_with_gold=False):
    """Select one answer span per example by pooling n-best candidates over its features.

    Args:
        features: tokenizer output providing 'overflow_to_sample_mapping',
            'offset_mapping' and `sequence_ids(i)`.
        examples_df: frame with 'question' and 'context' columns (plus
            'answer_text' when `rerank_with_gold` is true). Rows are addressed
            by POSITION, matching the values in 'overflow_to_sample_mapping',
            so the frame's index labels are irrelevant.
        start_logits, end_logits: per-feature 1-D logit arrays, indexable by
            feature number.
        rerank_with_gold: if true, candidates inside each feature are ordered
            by word-level Jaccard against the gold answer first (this uses the
            label, so it is only meaningful for analysis, not honest scoring).

    Returns:
        (preds_text, preds_start): per-example predicted text and the
        character offset in the context where the untrimmed span began.
        Examples with no valid candidate keep an empty prediction and start 0.

    Scoring per candidate: masked log-softmax start score + end score, with
    small additive adjustments (edge punctuation penalty, word-boundary bonus,
    per-token length penalty, digit agreement with the question). Uses the
    module-level `n_best_size`.
    """
    sample_mapping = features['overflow_to_sample_mapping']
    n_examples = len(examples_df)

    # Pull the needed columns once and index them positionally. The original
    # used examples_df.loc[ex_idx, ...], which is label-based and only agrees
    # with the positional sample mapping when the index is a fresh RangeIndex.
    questions = examples_df['question'].tolist()
    contexts = examples_df['context'].tolist()
    golds = examples_df['answer_text'].tolist() if rerank_with_gold else None

    preds_text = [''] * n_examples
    preds_start = [0] * n_examples
    best_score = [-1e30] * n_examples

    for i, mapped in enumerate(sample_mapping):
        ex_idx = int(mapped)
        offsets = features['offset_mapping'][i]
        seq_ids = features.sequence_ids(i)
        ctx_tokens = [j for j, sid in enumerate(seq_ids) if sid == 1]
        if not ctx_tokens:
            continue
        c0, c1 = ctx_tokens[0], ctx_tokens[-1]
        s_log = start_logits[i]
        e_log = end_logits[i]
        # probability-based scoring (log-softmax over context)
        s_lp = _log_softmax_masked(s_log, c0, c1)
        e_lp = _log_softmax_masked(e_log, c0, c1)
        start_idxes = np.argsort(s_lp)[-n_best_size:][::-1]
        end_idxes = np.argsort(e_lp)[-n_best_size:][::-1]
        cands = []
        qtext = str(questions[ex_idx])
        q_has_digit = DIGIT_PAT.search(qtext) is not None
        ctx = contexts[ex_idx]
        for si in start_idxes:
            if si < c0 or si > c1:
                continue
            for ei in end_idxes:
                if ei < c0 or ei > c1 or ei < si:
                    continue
                if (ei - si + 1) > 40:
                    continue  # joint token window constraint
                stc, enc = offsets[si][0], offsets[ei][1]
                if stc is None or enc is None or enc <= stc:
                    continue
                raw_span = ctx[stc:enc]
                # punctuation penalty BEFORE trim
                penalty = 0.0
                if raw_span:
                    if _is_punct(raw_span[0]):
                        penalty -= 0.02
                    if _is_punct(raw_span[-1]):
                        penalty -= 0.02
                # word-boundary bonus if aligns to boundaries
                left_ok = (stc == 0) or _is_punct(ctx[stc-1])
                right_ok = (enc >= len(ctx)) or _is_punct(ctx[enc:enc+1])
                if left_ok and right_ok:
                    penalty += 0.02  # small bonus
                text = edge_trim(raw_span.strip())
                if not text:
                    continue
                score = float(s_lp[si] + e_lp[ei]) + penalty
                # gentle token-length penalty
                score -= 0.002 * (ei - si + 1)
                # optional small numeric bonus
                if q_has_digit:
                    cand_has_digit = DIGIT_PAT.search(text) is not None
                    score += 0.02 if cand_has_digit else -0.02
                cands.append((score, text, stc))
        if not cands:
            continue
        if rerank_with_gold:
            gold = golds[ex_idx]
            cands.sort(key=lambda x: (word_jaccard(gold, x[1]), x[0], -len(x[1])), reverse=True)
        else:
            cands.sort(key=lambda x: (x[0], -len(x[1])), reverse=True)
        top = cands[0]
        # Across features of the same example, the highest model score wins.
        if top[0] > best_score[ex_idx]:
            best_score[ex_idx] = top[0]
            preds_text[ex_idx] = top[1]
            preds_start[ex_idx] = top[2]
    # fallback empties: the [:0] slice is always the empty string, so the
    # fallback prediction is whatever edge_trim returns for ''.
    for i in range(len(preds_text)):
        if preds_text[i] == '':
            preds_text[i] = edge_trim(contexts[i][:0])
            preds_start[i] = 0
    return preds_text, preds_start

def _try_load_fold_model(model_dir: str):
    """Load a fine-tuned QA model from `model_dir`, with a state-dict fallback.

    First tries `AutoModelForQuestionAnswering.from_pretrained` on the
    directory (local files only). If that fails and the directory contains
    `pytorch_model.bin`, a fresh model is instantiated from `xlmr_model` and
    the weights are loaded strictly from that file.

    Raises:
        FileNotFoundError: if neither path yields a model. The exception from
            the standard load attempt is attached as its cause.
    """
    # Try standard load
    try:
        return AutoModelForQuestionAnswering.from_pretrained(model_dir, local_files_only=True)
    except Exception as load_err:
        # Best-effort: continue to the fallback, but do not hide why the
        # regular load failed.
        print(f'Standard load failed for {model_dir}: {load_err!r}')
        first_err = load_err
    # Fallback: instantiate base and load state dict directly if bin exists without config
    bin_path = os.path.join(model_dir, 'pytorch_model.bin')
    if os.path.exists(bin_path):
        print(f'Fallback loading state_dict from {bin_path}')
        try:
            m = AutoModelForQuestionAnswering.from_pretrained(xlmr_model, attn_implementation='eager')
        except Exception as e:
            # Retry without the attention-implementation argument.
            print('Load error on fallback:', e)
            m = AutoModelForQuestionAnswering.from_pretrained(xlmr_model)
        state = torch.load(bin_path, map_location='cpu')
        m.load_state_dict(state, strict=True)
        return m
    raise FileNotFoundError(f'No loadable checkpoint in {model_dir}') from first_err

def _find_checkpoint_dir(model_dir: str):
    """Return a directory under (or equal to) `model_dir` that holds a loadable model.

    A directory counts as loadable when it contains `config.json` together
    with `pytorch_model.bin` or `model.safetensors`. `model_dir` itself is
    preferred; otherwise the loadable `checkpoint-*` subdirectory with the
    highest numeric suffix is returned. Returns None when nothing qualifies.
    """
    weight_files = ('pytorch_model.bin', 'model.safetensors')

    def has_model(path):
        if not os.path.exists(os.path.join(path, 'config.json')):
            return False
        return any(os.path.exists(os.path.join(path, name)) for name in weight_files)

    def step_number(path):
        # Non-numeric suffixes sort before every real step.
        suffix = path.split('-')[-1]
        return int(suffix) if suffix.isdigit() else -1

    if has_model(model_dir):
        return model_dir

    ordered = sorted(glob.glob(os.path.join(model_dir, 'checkpoint-*')), key=step_number)
    loadable = [path for path in ordered if has_model(path)]
    return loadable[-1] if loadable else None

def train_5fold_x():
    """Train (or reload) one QA model per fold and report out-of-fold Jaccard.

    Reads `train_folds.csv` and loops over folds 0-4. For each fold it:
      1. builds training features and keeps only those whose start label is > 0;
      2. reuses a model found under `xlmr_f{fold}` if one exists, otherwise
         fine-tunes `xlmr_model` with the HF `Trainer` and saves it there;
      3. runs inference on the validation features, pools spans with
         `pool_nbest_over_features`, and writes `oof_xlmr_fold{fold}.csv`.

    Relies on module-level settings (`bsz`, `grad_accum`, `epochs`, `lr`,
    `warmup_ratio`, `xlmr_model`, `HAS_BNB`, `tokenizer_x`).

    Returns:
        The mean of the per-row Jaccard scores over all folds, as a float.
    """
    df = pd.read_csv('train_folds.csv')
    all_oof = []
    for fold in range(5):
        t_fold = time.time()
        trn_df = df[df['fold']!=fold].reset_index(drop=True)
        val_df = df[df['fold']==fold].reset_index(drop=True)
        print(f'Fold {fold}: train {len(trn_df)} val {len(val_df)}')
        # Build raw train features (with positives and negatives)
        trn_feats_raw = prepare_train_features_x(trn_df)
        # Keep only positives to remove catastrophic CLS bias
        keep_mask = [int(sp) > 0 for sp in trn_feats_raw['start_positions']]
        def filt(key):
            # Returns the kept entries of a list-valued field, or None when the
            # field is missing or not a plain list.
            if key not in trn_feats_raw: return None
            vals = trn_feats_raw[key]
            if not isinstance(vals, list): return None
            return [v for v, k in zip(vals, keep_mask) if k]
        trn_feats = {}
        for key in trn_feats_raw.keys():
            fl = filt(key)
            if fl is not None:
                trn_feats[key] = fl
        # Ensure token_type_ids carried if present
        if 'token_type_ids' in trn_feats_raw and 'token_type_ids' not in trn_feats:
            trn_feats['token_type_ids'] = filt('token_type_ids')

        val_feats = prepare_features_only_x(val_df)
        train_ds = QADataset(trn_feats, with_labels=True)
        val_ds = QADataset(val_feats, with_labels=False)
        # Pre-train logging for ETA
        num_feats = len(trn_feats['input_ids'])
        eff_bsz = bsz * grad_accum
        steps_per_epoch = (num_feats + eff_bsz - 1) // eff_bsz
        print(f"Fold {fold}: features={num_feats}, eff_bsz={eff_bsz}, steps/epoch={steps_per_epoch}, epochs={epochs}")

        model_root = f'xlmr_f{fold}'
        # A previously saved model for this fold short-circuits training.
        ckpt_path = _find_checkpoint_dir(model_root)
        if ckpt_path is not None:
            print(f'Loading existing model for fold {fold} from {ckpt_path}')
            model = _try_load_fold_model(ckpt_path)
        else:
            model = AutoModelForQuestionAnswering.from_pretrained(
                xlmr_model,
                attn_implementation='eager'
            )
            # Enable gradient checkpointing (helps VRAM) and disable use_cache if present
            if hasattr(model, 'gradient_checkpointing_enable'):
                model.gradient_checkpointing_enable()
            if hasattr(model, 'config') and hasattr(model.config, 'use_cache'):
                try:
                    model.config.use_cache = False
                except Exception:
                    pass
            optim_name = 'adamw_bnb_8bit' if HAS_BNB else 'adafactor'
            print(f'Using optimizer: {optim_name}')
            args = TrainingArguments(
                output_dir=model_root,
                per_device_train_batch_size=bsz, per_device_eval_batch_size=4,
                gradient_accumulation_steps=grad_accum,
                num_train_epochs=epochs, learning_rate=lr, warmup_ratio=warmup_ratio, weight_decay=0.01,
                max_grad_norm=1.0, optim=optim_name, lr_scheduler_type='cosine', group_by_length=False,
                bf16=False, fp16=True,
                save_strategy='no', save_total_limit=1,
                logging_steps=10, evaluation_strategy='no',
                seed=42, report_to=[]
            )
            collator = DataCollatorWithPadding(tokenizer_x, pad_to_multiple_of=None)
            trainer = Trainer(model=model, args=args, train_dataset=train_ds, data_collator=collator)
            print(f'Training fold {fold}...', flush=True)
            t0 = time.time(); trainer.train(); print(f'Fold {fold} train time: {time.time()-t0:.1f}s')
            # No intermediate checkpoints are written (save_strategy='no'), so
            # save explicitly and reload from disk for inference.
            trainer.save_model(model_root)
            del trainer
            model = _try_load_fold_model(model_root)
        # Inference on val
        model.eval(); device = torch.device('cuda' if torch.cuda.is_available() else 'cpu'); model.to(device)
        collator = DataCollatorWithPadding(tokenizer_x, pad_to_multiple_of=None)
        val_loader = torch.utils.data.DataLoader(val_ds, batch_size=4, shuffle=False, collate_fn=collator, pin_memory=torch.cuda.is_available())
        N = len(val_feats['input_ids'])
        # Logits are stored by feature index (not batch order) via 'feat_idx'.
        s_logits_list = [None] * N
        e_logits_list = [None] * N
        with torch.no_grad():
            t1=time.time()
            for step, batch in enumerate(val_loader):
                # 'feat_idx' is bookkeeping only; it must not reach the model.
                feat_idx = batch.pop('feat_idx').cpu().numpy()
                for k in list(batch.keys()):
                    batch[k] = batch[k].to(device)
                out = model(**batch)
                s = out.start_logits.detach().cpu().numpy()  # (B, L)
                e = out.end_logits.detach().cpu().numpy()    # (B, L)
                for j, fi in enumerate(feat_idx):
                    s_logits_list[int(fi)] = s[j]
                    e_logits_list[int(fi)] = e[j]
                if step % 20 == 0: print(f'Val fold {fold} step {step}, {time.time()-t1:.1f}s', flush=True)
        assert all(x is not None for x in s_logits_list), 'Missing start logits entries' # noqa: E702
        assert all(x is not None for x in e_logits_list), 'Missing end logits entries' # noqa: E702
        pred_texts, pred_starts = pool_nbest_over_features(val_feats, val_df, s_logits_list, e_logits_list, rerank_with_gold=False)
        val_out = val_df.copy()
        val_out['pred_text'] = pred_texts; val_out['pred_start'] = pred_starts
        val_out['jaccard'] = [word_jaccard(a,b) for a,b in zip(val_out['answer_text'].astype(str), val_out['pred_text'].astype(str))]
        print(f'Fold {fold} OOF Jaccard: {val_out["jaccard"].mean():.5f}, elapsed {time.time()-t_fold:.1f}s')
        val_out.to_csv(f'oof_xlmr_fold{fold}.csv', index=False)
        all_oof.append(val_out[['id','jaccard']])
        # free
        del model, train_ds, val_ds, trn_feats, val_feats; torch.cuda.empty_cache(); gc.collect()
    # Row-level mean over all folds (not a mean of per-fold means).
    oof = pd.concat(all_oof, axis=0, ignore_index=True)
    print('OOF Jaccard mean:', float(oof['jaccard'].mean()))
    return float(oof['jaccard'].mean())

print('deepset/xlm-roberta-base-squad2 pipeline ready (FP16, gradient checkpointing, max_length=384). Next: run training.')

bitsandbytes available: using adamw_bnb_8bit optimizer


  from .autonotebook import tqdm as notebook_tqdm


Loading tokenizer: deepset/xlm-roberta-base-squad2




deepset/xlm-roberta-base-squad2 pipeline ready (FP16, gradient checkpointing, max_length=256). Next: run training.


In [None]:
# Diagnostic: how many fold-0 training features carry a zero start/end label
import pandas as pd, numpy as np, time
t0=time.time()
fold = 0
df_folds = pd.read_csv('train_folds.csv')
trn_df = df_folds[df_folds['fold']!=fold].reset_index(drop=True)
print(f'Train rows (fold != {fold}):', len(trn_df))

# Use the xlm-roberta feature prep defined in Cell 7
feats = prepare_train_features_x(trn_df)
sp = np.array(feats['start_positions'])
ep = np.array(feats['end_positions'])
n = len(sp)

# Boolean masks computed once and reused for both counts and proportions.
sp_is_zero = (sp == 0)
ep_is_zero = (ep == 0)
both_zero = sp_is_zero & ep_is_zero


def _fraction(mask):
    """Share of True entries among all n features (NaN when there are none)."""
    if n > 0:
        return float(mask.sum())/n
    return float('nan')


prop_sp0 = _fraction(sp_is_zero)
prop_ep0 = _fraction(ep_is_zero)
prop_both0 = _fraction(both_zero)
print('Num overflowed train features:', n)
print('start_positions==0:', sp_is_zero.sum(), f'({prop_sp0:.4f})')
print('end_positions==0  :', ep_is_zero.sum(), f'({prop_ep0:.4f})')
print('both start&end==0 :', both_zero.sum(), f'({prop_both0:.4f})')
print('Diag done in %.2fs' % (time.time()-t0))

In [None]:
# Diagnostic 2: per-example coverage of answer in overflowed features (fold 0)
# Counts, for every training example, how many of its overflowed features
# contain the whole gold answer span (by character offsets).
import pandas as pd, numpy as np, time
t0=time.time()
df_folds = pd.read_csv('train_folds.csv')
fold = 0
trn_df = df_folds[df_folds['fold']!=fold].reset_index(drop=True)
print(f'Train rows (fold != {fold}):', len(trn_df))

# Tokenize with overflow and offsets, keep mapping
tok = tokenizer_x(trn_df['question'].astype(str).tolist(), trn_df['context'].astype(str).tolist(),
                  truncation='only_second', max_length=max_length, stride=doc_stride,
                  return_overflowing_tokens=True, return_offsets_mapping=True, padding=False)
sample_mapping = np.array(tok['overflow_to_sample_mapping'])
offsets_list = tok['offset_mapping']

starts = trn_df['answer_start'].astype(int).values
answers = trn_df['answer_text'].astype(str).values

per_sample_pos = np.zeros(len(trn_df), dtype=np.int32)
total_pos = 0

for i, offsets in enumerate(offsets_list):
    sidx = int(sample_mapping[i])
    seq_ids = tok.sequence_ids(i)
    ctx_tokens = [j for j, sid in enumerate(seq_ids) if sid == 1]
    if not ctx_tokens:
        continue
    c0, c1 = ctx_tokens[0], ctx_tokens[-1]
    start_char = int(starts[sidx])
    end_char = start_char + len(answers[sidx])
    # inclusion check: answer fully covered by this feature's context span
    ok = (offsets[c0][0] is not None and offsets[c1][1] is not None and
          offsets[c0][0] <= start_char and offsets[c1][1] >= end_char)
    if ok:
        total_pos += 1
        per_sample_pos[sidx] += 1

num_with_any = int((per_sample_pos > 0).sum())
print('Total features:', len(offsets_list))
print('Positive (features containing answer):', total_pos, f'({total_pos/len(offsets_list):.4f})')
print('Samples with at least one positive feature:', num_with_any, f'({num_with_any/len(trn_df):.4f})')
print('Avg positive features per sample (over all):', float(per_sample_pos.mean()))
print('Avg positive features per covered sample:', float(per_sample_pos[per_sample_pos>0].mean()) if num_with_any>0 else float('nan'))
print('Max positive features for a sample:', int(per_sample_pos.max()))
# Read each count at its own index. The earlier version zipped indices in
# descending-count order with counts sorted ascending, so pairs were mismatched.
top_idx = np.argsort(-per_sample_pos)[:5]
print('Top 5 samples by positive count (idx, count):', list(zip(top_idx.tolist(), per_sample_pos[top_idx].tolist())))
print('Diag2 done in %.2fs' % (time.time()-t0))

In [None]:
# Run the 5-fold XLM-R training defined above, report the OOF score and persist it
import time, json, pathlib
t0=time.time()
oof_mean = train_5fold_x()
print('Final 5-fold OOF Jaccard:', oof_mean)
# Serialize first, then write, so the payload is easy to inspect when debugging.
metrics_payload = json.dumps({'oof_jaccard': float(oof_mean)}, ensure_ascii=False)
metrics_file = pathlib.Path('metrics.json')
metrics_file.write_text(metrics_payload)
print('Total elapsed: %.1fs' % (time.time()-t0))

In [None]:
# Debug: inspect existing fold 0 checkpoint and GPU mem
import os, sys, torch, glob
from pathlib import Path

def ls(path):
    """Print the sorted entries of `path`, then one line per entry with its size.

    A missing directory is reported as '<path> does not exist'. If an entry's
    size cannot be read, '??' is printed in place of the size.
    """
    try:
        entries = sorted(os.listdir(path))
    except FileNotFoundError:
        print(path, 'does not exist')
        return
    print(path, '->', entries)
    for name in entries:
        full_path = os.path.join(path, name)
        try:
            size = os.path.getsize(full_path)
        except Exception:
            print('  ', name, '??')
        else:
            print('  ', name, size)

# List the fold-0 model/output directories, then report CUDA memory usage.
for _debug_dir in ('xlmr_f0', 'outputs_fold0'):
    ls(_debug_dir)

_cuda_ok = torch.cuda.is_available()
print('CUDA available:', _cuda_ok)
if _cuda_ok:
    def _as_mb(n_bytes):
        """Convert a byte count to megabytes, rounded to one decimal."""
        return round(n_bytes/1024/1024,1)

    print('GPU:', torch.cuda.get_device_name(0))
    print('Allocated (MB):', _as_mb(torch.cuda.memory_allocated()))
    print('Reserved (MB):', _as_mb(torch.cuda.memory_reserved()))
    # Release cached blocks, then show what is still reserved.
    torch.cuda.empty_cache()
    print('After empty_cache reserved (MB):', _as_mb(torch.cuda.memory_reserved()))

In [None]:
# Cleanup: delete per-fold model directories left over from earlier runs
import shutil, os
dirs = ['xlmr_f%d' % i for i in range(5)]
# filter() is lazy, so each directory is checked right before it is removed.
for d in filter(os.path.isdir, dirs):
    print('Removing', d)
    shutil.rmtree(d, ignore_errors=True)
print('Cleanup done.')

In [None]:
# Install bitsandbytes with --no-deps (so pip leaves the torch stack alone) to enable 8-bit AdamW
import subprocess, sys
print('Installing bitsandbytes==0.43.3 with --no-deps (prevent torch drift)...', flush=True)
pip_cmd = [sys.executable, '-m', 'pip', 'install', 'bitsandbytes==0.43.3', '--no-deps']
# check=True: a failed install aborts the cell instead of continuing silently.
subprocess.run(pip_cmd, check=True)
try:
    import bitsandbytes as bnb
    bnb_version = getattr(bnb, '__version__', 'unknown')
    print('bitsandbytes version:', bnb_version)
    # Optional: basic CUDA presence check in bnb; absence of the helper is fine.
    try:
        from bitsandbytes.cuda_setup import get_compute_capabilities
        print('bitsandbytes compute capabilities:', get_compute_capabilities())
    except Exception:
        pass
except Exception as e:
    print('Failed to import bitsandbytes after install:', e)
import torch
print('torch:', torch.__version__, 'CUDA available:', torch.cuda.is_available())
print('Done. Re-run Cell 7 to pick up HAS_BNB=True, then run Cell 8.')

In [None]:
# Diagnostic 3 (fast): check on a small subset that labeled token spans decode back to the gold answer
# (exact, trimmed, or metric-normalized equality all count as a match)
import pandas as pd, numpy as np, random, time, sys
t0=time.time()
fold=0
df_folds = pd.read_csv('train_folds.csv')
trn_df_full = df_folds[df_folds['fold']!=fold].reset_index(drop=True)

# Subsample to speed up; rows keep their original relative order
n_samples = min(2, len(trn_df_full))
rng = np.random.RandomState(42)
idx = rng.choice(len(trn_df_full), size=n_samples, replace=False)
trn_df = trn_df_full.iloc[np.sort(idx)].reset_index(drop=True)
print(f'[Diag3] Using subset: {len(trn_df)} examples (of {len(trn_df_full)} total).', flush=True)

try:
    t1=time.time()
    raw = prepare_train_features_x(trn_df)
    print(f'[Diag3] prepare_train_features_x done in {time.time()-t1:.2f}s', flush=True)
except KeyboardInterrupt:
    print('[Diag3] Interrupted during prepare_train_features_x', flush=True)
    raise
except Exception as e:
    print('[Diag3] Error in prepare_train_features_x:', repr(e), flush=True)
    raise

# Re-tokenize with the same settings to recover the sample mapping and offsets
tok = tokenizer_x(
    trn_df['question'].astype(str).tolist(),
    trn_df['context'].astype(str).tolist(),
    truncation='only_second', max_length=max_length, stride=doc_stride,
    return_overflowing_tokens=True, return_offsets_mapping=True, padding=False,
)
sample_map = np.array(tok['overflow_to_sample_mapping'])
offsets_list = tok['offset_mapping']

sp = np.array(raw['start_positions']); ep = np.array(raw['end_positions'])
pos_idx = np.where(sp>0)[0].tolist()
print('Total features:', len(sp), 'positives:', len(pos_idx), flush=True)

# Cap verification to at most 200 positive features for speed
max_check = 200
if len(pos_idx) > max_check:
    pos_idx = pos_idx[:max_check]
    print(f'[Diag3] Capped positives to first {max_check} for verification', flush=True)


def _verify_feature(feat_i):
    """Check one positive feature; return (is_ok, issue_record_or_None)."""
    sample_i = int(sample_map[feat_i])
    feat_offsets = offsets_list[feat_i]
    tok_s = int(sp[feat_i]); tok_e = int(ep[feat_i])
    # bounds and context-only checks
    context_positions = [j for j, sid in enumerate(tok.sequence_ids(feat_i)) if sid==1]
    if not context_positions:
        return False, None
    lo, hi = context_positions[0], context_positions[-1]
    if not (lo <= tok_s <= hi and lo <= tok_e <= hi and tok_s <= tok_e):
        return False, {'type':'out_of_ctx', 'feat_i':feat_i, 'si':tok_s, 'ei':tok_e, 'c0':lo, 'c1':hi}
    char_s = feat_offsets[tok_s][0]; char_e = feat_offsets[tok_e][1]
    if char_s is None or char_e is None or char_e <= char_s:
        return False, {'type':'none_offsets', 'feat_i':feat_i, 'stc':char_s, 'enc':char_e}
    decoded = trn_df.loc[sample_i, 'context'][char_s:char_e]
    gold_text = trn_df.loc[sample_i, 'answer_text']
    # Consider normalized/trimmed equality due to SentencePiece leading-space offsets
    if decoded == gold_text:
        return True, None
    if edge_trim(decoded.strip()) == gold_text:
        return True, None
    if norm_for_metric(decoded) == norm_for_metric(gold_text):
        return True, None
    return False, {'type':'mismatch', 'feat_i':feat_i, 'pred':decoded, 'gold':gold_text,
                   'stc':char_s, 'enc':char_e, 'si':tok_s, 'ei':tok_e}


n_ok=0; n_bad=0; bad_examples=[]
for i in pos_idx:
    matched, issue = _verify_feature(i)
    if matched:
        n_ok += 1
        continue
    n_bad += 1
    # Keep at most five illustrative failures.
    if issue is not None and len(bad_examples) < 5:
        bad_examples.append(issue)

p = (n_ok/max(1,len(pos_idx)))
print(f'Positive features exact-match (normalized): {n_ok}/{len(pos_idx)} ({p:.4f})', flush=True)
print(f'Positive features bad: {n_bad}/{len(pos_idx)} ({(n_bad/max(1,len(pos_idx))):.4f})', flush=True)
if bad_examples:
    print('Examples of issues (up to 5):', flush=True)
    for ex in bad_examples:
        print(ex, flush=True)
print('Diag3 subset done in %.2fs' % (time.time()-t0))

In [7]:
# One-fold smoke training (fold 0) on GPU; 1:1 negatives; epochs adjustable
import os, time, torch, numpy as np, pandas as pd, gc
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer, DataCollatorWithPadding

def train_one_fold_smoke(fold: int = 0, epochs_override: int = 3):
    """Quick single-fold train + validate run to sanity-check the pipeline.

    Args:
        fold: fold id held out for validation (rows with `fold != fold` train).
        epochs_override: number of training epochs passed to the Trainer.

    Feature preparation temporarily forces the module globals `max_length=384`
    and `doc_stride=128`, restoring the previous values afterwards. Training
    uses all positive features plus an equal-sized random sample of negatives
    (seed 42). Any existing `xlmr_smoke_f{fold}` directory is removed first.
    Validation predictions are written to `oof_xlmr_smoke_f{fold}.csv`.

    Returns:
        Mean word-level Jaccard on the validation fold, as a float.
    """
    df = pd.read_csv('train_folds.csv')
    trn_df = df[df['fold']!=fold].reset_index(drop=True)
    val_df = df[df['fold']==fold].reset_index(drop=True)
    print(f'[SMOKE] Fold {fold}: train {len(trn_df)} val {len(val_df)}')

    # Ensure CUDA intended
    use_cpu = False
    if torch.cuda.is_available():
        try:
            free, total = torch.cuda.mem_get_info()
            print(f"[SMOKE] CUDA free MB: {int(free//(1024*1024))} / {int(total//(1024*1024))}")
        except Exception:
            pass
    else:
        print('[SMOKE] WARNING: CUDA not available; will fall back to CPU and run slowly.')
        use_cpu = True

    # Enforce identical sequence settings train/val for smoke
    global max_length, doc_stride
    old_max_len, old_stride = max_length, doc_stride
    max_length = 384
    doc_stride = 128
    print(f"[SMOKE] Seq settings (train & val): max_length={max_length}, doc_stride={doc_stride}")

    try:
        trn_feats_raw = prepare_train_features_x(trn_df)
        val_feats = prepare_features_only_x(val_df)
    finally:
        # Always restore the caller's sequence settings, even on failure.
        max_length, doc_stride = old_max_len, old_stride

    # Positives + sampled negatives (1:1 for smoke)
    is_pos = np.array([int(sp) > 0 for sp in trn_feats_raw['start_positions']])
    all_idx = np.arange(len(is_pos))
    pos_idx = all_idx[is_pos]
    neg_idx = all_idx[~is_pos]
    rng = np.random.RandomState(42)
    n_neg_keep = min(len(neg_idx), len(pos_idx))
    sampled_neg = rng.choice(neg_idx, size=n_neg_keep, replace=False) if n_neg_keep > 0 else np.array([], dtype=int)
    # With no positives at all, nothing is kept (negatives alone are not used).
    keep_idx = np.sort(np.concatenate([pos_idx, sampled_neg])) if len(pos_idx)>0 else np.array([], dtype=int)

    def filt_field(vals, idx):
        # Only plain-list fields are subset; anything else is dropped (None).
        return [vals[i] for i in idx] if isinstance(vals, list) else None
    trn_feats = {}
    for k, v in trn_feats_raw.items():
        fl = filt_field(v, keep_idx)
        if fl is not None:
            trn_feats[k] = fl
    if 'token_type_ids' in trn_feats_raw and 'token_type_ids' not in trn_feats:
        trn_feats['token_type_ids'] = filt_field(trn_feats_raw['token_type_ids'], keep_idx)

    train_ds = QADataset(trn_feats, with_labels=True)
    val_ds = QADataset(val_feats, with_labels=False)
    # Effective batch size = per-device batch * gradient accumulation (logged only).
    eff_bsz = (8 if not use_cpu else 2) * (2 if not use_cpu else 1)
    print(f"[SMOKE] kept_features={len(keep_idx)} (pos={len(pos_idx)}, neg_samp={len(sampled_neg)}), eff_bsz={eff_bsz}")

    out_dir = f'xlmr_smoke_f{fold}'
    if os.path.isdir(out_dir):
        import shutil; print('Removing', out_dir); shutil.rmtree(out_dir, ignore_errors=True)
    model = AutoModelForQuestionAnswering.from_pretrained(xlmr_model, attn_implementation='eager')
    if hasattr(model, 'gradient_checkpointing_enable'): model.gradient_checkpointing_enable()
    if hasattr(model, 'config') and hasattr(model.config, 'use_cache'):
        try: model.config.use_cache = False
        except Exception: pass
    # 8-bit AdamW only on GPU and only when HAS_BNB is defined and truthy.
    optim_name = 'adamw_bnb_8bit' if (not use_cpu and 'HAS_BNB' in globals() and HAS_BNB) else 'adafactor'
    args = TrainingArguments(
        output_dir=out_dir,
        per_device_train_batch_size=(8 if not use_cpu else 2), per_device_eval_batch_size=(8 if not use_cpu else 2),
        gradient_accumulation_steps=(2 if not use_cpu else 1),
        num_train_epochs=epochs_override, learning_rate=lr, warmup_ratio=warmup_ratio, weight_decay=0.01,
        max_grad_norm=1.0, optim=optim_name, lr_scheduler_type='cosine',
        bf16=False, fp16=(False if use_cpu else True), group_by_length=True,
        save_strategy='no', logging_steps=10, evaluation_strategy='no', seed=42, report_to=[]
    )
    collator = DataCollatorWithPadding(tokenizer_x, pad_to_multiple_of=None)
    trainer = Trainer(model=model, args=args, train_dataset=train_ds, data_collator=collator)
    print('[SMOKE] Training...', flush=True); t0=time.time(); trainer.train(); print('[SMOKE] Train time: %.1fs' % (time.time()-t0), flush=True)

    # Inference
    device = torch.device('cpu' if use_cpu else ('cuda' if torch.cuda.is_available() else 'cpu'))
    model.eval(); model.to(device)
    val_loader = torch.utils.data.DataLoader(val_ds, batch_size=(8 if not use_cpu else 2), shuffle=False, collate_fn=collator, pin_memory=(not use_cpu and torch.cuda.is_available()))
    N = len(val_feats['input_ids'])
    # Logits are slotted by feature index taken from each item's 'feat_idx'.
    s_logits_list = [None] * N; e_logits_list = [None] * N
    with torch.no_grad():
        t1=time.time()
        for step, batch in enumerate(val_loader):
            # 'feat_idx' is bookkeeping only; remove it before the forward pass.
            feat_idx = batch.pop('feat_idx').cpu().numpy()
            for k in list(batch.keys()): batch[k] = batch[k].to(device)
            out = model(**batch)
            s = out.start_logits.detach().cpu().numpy(); e = out.end_logits.detach().cpu().numpy()
            for j, fi in enumerate(feat_idx):
                s_logits_list[int(fi)] = s[j]; e_logits_list[int(fi)] = e[j]
            if step % 20 == 0: print(f'[SMOKE] Val step {step}, {time.time()-t1:.1f}s', flush=True)
    assert all(x is not None for x in s_logits_list) and all(x is not None for x in e_logits_list)
    pred_texts, pred_starts = pool_nbest_over_features(val_feats, val_df, s_logits_list, e_logits_list, rerank_with_gold=False)
    val_out = val_df.copy(); val_out['pred_text'] = pred_texts; val_out['pred_start'] = pred_starts
    val_out['jaccard'] = [word_jaccard(a,b) for a,b in zip(val_out['answer_text'].astype(str), val_out['pred_text'].astype(str))]
    print('[SMOKE] Fold %d OOF Jaccard: %.5f' % (fold, float(val_out['jaccard'].mean())))
    val_out.to_csv(f'oof_xlmr_smoke_f{fold}.csv', index=False)
    # free
    del model, trainer, train_ds, val_ds; torch.cuda.empty_cache(); gc.collect()
    return float(val_out['jaccard'].mean())

print('Smoke trainer ready (GPU by default, 1:1 negatives, epochs default=3, max_length=384/doc_stride=128). After GPU is clean, run train_one_fold_smoke(fold=0, epochs_override=3).')

Smoke trainer ready (consistent 256/128 seq, 1:1 negatives, CPU cap=150, epochs default=3, forced CPU). Next: run train_one_fold_smoke(fold=0, epochs_override=1..3).


In [3]:
# Patch v2: fast coverage-based snapping using gold answer_start (no exhaustive span search)
import numpy as np
def prepare_train_features_x(df: pd.DataFrame):
    """Tokenize QA rows with overflow and attach start/end token labels.

    Each row must provide `question`, `context`, `answer_text` and
    `answer_start` (character offset of the answer in the context). Every
    overflowed feature receives a (start, end) token pair; features that do
    not fully contain the (edge-trimmed) answer, or whose gold span is
    invalid, are labelled (0, 0).

    The 'overflow_to_sample_mapping' entry is removed from the returned
    encoding; 'start_positions' and 'end_positions' are added.
    """
    questions = df['question'].astype(str).tolist()
    contexts = df['context'].astype(str).tolist()
    answers = df['answer_text'].astype(str).tolist()
    starts = df['answer_start'].astype(int).tolist()

    tok = tokenizer_x(
        questions, contexts,
        truncation='only_second', max_length=max_length, stride=doc_stride,
        return_overflowing_tokens=True, return_offsets_mapping=True, padding=False
    )
    sample_map = tok.pop('overflow_to_sample_mapping')
    offsets_list = tok['offset_mapping']

    no_answer = (0, 0)

    def label_feature(feat_i, offsets):
        """Return the (start_token, end_token) label for one feature."""
        ex = int(sample_map[feat_i])
        ctx_tokens = [j for j, sid in enumerate(tok.sequence_ids(feat_i)) if sid == 1]
        if not ctx_tokens:
            return no_answer
        c0, c1 = ctx_tokens[0], ctx_tokens[-1]

        ctx = contexts[ex]
        s_char0 = int(starts[ex])
        e_char0 = s_char0 + len(answers[ex])
        # Basic sanity on provided span
        if s_char0 < 0 or e_char0 > len(ctx) or s_char0 >= e_char0:
            return no_answer

        # Tighten edges at char-level via _trim_bounds
        s_adj, e_adj = _trim_bounds(ctx, s_char0, e_char0)
        if s_adj >= e_adj:
            return no_answer

        # The feature's context window must cover the whole adjusted span
        win_lo, win_hi = offsets[c0][0], offsets[c1][1]
        if win_lo is None or win_hi is None or win_lo > s_adj or win_hi < e_adj:
            return no_answer

        window = range(c0, c1 + 1)

        def has_width(j):
            sj, ej = offsets[j]
            return sj is not None and ej is not None and ej > sj

        # First token whose char range contains the span start; last token
        # whose char range contains the span end.
        si2 = next(
            (j for j in window if has_width(j) and offsets[j][0] <= s_adj < offsets[j][1]),
            None,
        )
        ei2 = next(
            (j for j in reversed(window) if has_width(j) and offsets[j][0] < e_adj <= offsets[j][1]),
            None,
        )

        # Fallbacks to nearest within [c0, c1] if coverage missed due to offset
        # quirks; ties resolve to the lowest token index.
        if si2 is None:
            def start_gap(j):
                sj = offsets[j][0]
                return (abs((sj if sj is not None else 10**18) - s_adj), j)
            si2 = min(window, key=start_gap)
        if ei2 is None:
            def end_gap(j):
                ej = offsets[j][1]
                return (abs((ej if ej is not None else -10**18) - e_adj), j)
            ei2 = min(window, key=end_gap)

        # Both indices are set by now; reject only an inverted span.
        if si2 > ei2:
            return no_answer
        return int(si2), int(ei2)

    labels = [label_feature(i, offsets) for i, offsets in enumerate(offsets_list)]
    tok['start_positions'] = [start_tok for start_tok, _ in labels]
    tok['end_positions'] = [end_tok for _, end_tok in labels]
    return tok

print('[Patch v2] prepare_train_features_x updated: fast coverage-based snapping using gold answer_start.')

[Patch v2] prepare_train_features_x updated: fast coverage-based snapping using gold answer_start.


In [None]:
# Shrink sequence settings and raise gradient accumulation to avoid OOM during smoke runs
import torch, gc, os

max_length, doc_stride = 192, 64   # tighter window, less overlap -> fewer tokens
grad_accum, bsz = 16, 1            # minimal per-step batch, more accumulation

# Only set the allocator config when the user has not chosen one already.
# NOTE(review): this may need to be set before CUDA is initialized to take effect - confirm.
if 'PYTORCH_CUDA_ALLOC_CONF' not in os.environ:
    os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
try:
    torch.backends.cuda.matmul.allow_tf32 = True
except Exception:
    pass
torch.cuda.empty_cache()
gc.collect()
print('[SMOKE-SETTINGS] max_length=', max_length, 'doc_stride=', doc_stride, 'grad_accum=', grad_accum, 'bsz=', bsz, flush=True)

In [None]:
# GPU cleanup: show processes and kill stray VRAM holders (not this kernel)
import os, subprocess, time, signal, sys, gc, torch
def run(cmd):
    """Execute *cmd* (an argument list) and return its combined stdout/stderr as text.

    A non-zero exit status does not raise (``check=False``).
    """
    completed = subprocess.run(
        cmd,
        check=False,
        text=True,
        stderr=subprocess.STDOUT,  # fold stderr into the captured stdout stream
        stdout=subprocess.PIPE,
    )
    return completed.stdout
# Snapshot of GPU state before anything is killed.
print('[GPU] Before cleanup:')
print(run(['bash','-lc','nvidia-smi || true']))
pids_to_kill = []
try:
    # One "pid, used_memory" CSV row per compute process, without header or unit suffixes.
    # `|| true` keeps the shell exit status at 0 when nvidia-smi itself fails.
    q = run(['bash','-lc','nvidia-smi --query-compute-apps=pid,used_memory --format=csv,noheader,nounits || true'])
    cur = os.getpid()
    for line in q.strip().splitlines():
        parts = [x.strip() for x in line.split(',')]
        if len(parts) >= 2:
            try:
                pid = int(parts[0]); mem = int(parts[1])
            except Exception:
                # Rows whose fields are not plain integers are ignored.
                continue
            # WARNING: this targets every other GPU process above the threshold, regardless of who owns it.
            if pid != cur and mem >= 100:  # kill anything using >=100MB that's not us
                pids_to_kill.append(pid)
except Exception as e:
    print('[GPU] Query error:', repr(e))
if pids_to_kill:
    print('[GPU] Killing PIDs:', pids_to_kill)
    # First pass: ask each process to terminate.
    for pid in pids_to_kill:
        try:
            os.kill(pid, signal.SIGTERM)
        except Exception as e:
            print(' SIGTERM fail for', pid, e)
    # Grace period before escalating.
    time.sleep(2.0)
    # Second pass: signal 0 only probes for existence; survivors then receive SIGKILL.
    for pid in pids_to_kill:
        try:
            os.kill(pid, 0)
            os.kill(pid, signal.SIGKILL)
        except ProcessLookupError:
            # Process already exited after SIGTERM.
            pass
        except Exception as e:
            print(' SIGKILL fail for', pid, e)
else:
    print('[GPU] No other GPU processes detected (or <100MB).')
# Release this kernel's cached CUDA blocks and run the garbage collector.
torch.cuda.empty_cache(); gc.collect()
time.sleep(1.0)
# Snapshot of GPU state after cleanup, for comparison with the first report.
print('[GPU] After cleanup:')
print(run(['bash','-lc','nvidia-smi || true']))

In [28]:
# Smoke test: train fold 0 for 3 epochs, but only when torch can see a CUDA device.
import time
import torch

if torch.cuda.is_available():
    t0 = time.time()
    score = train_one_fold_smoke(fold=0, epochs_override=3)
    print('[SMOKE] Done in %.1fs, OOF Jaccard=%.5f' % (time.time()-t0, score))
else:
    # No CUDA device: print restart instructions instead of training.
    print('[SMOKE] CUDA not available. Restart the session (Power > Restart Session), then run Cells 4 → 7 → 16 → 20, and re-run this cell.')

[SMOKE] Fold 0: train 813 val 189
[SMOKE] CUDA free MB: 12781
[SMOKE] Forcing CPU mode for smoke validation (GPU wedged).
[SMOKE] Seq settings (train & val): max_length=256, doc_stride=128


[SMOKE] CPU mode downsample: pos_cap=150, neg_cap=150, kept=300
[SMOKE] features_kept=300 (total_pos=1246, total_neg_samp=1246), eff_bsz=1
Removing xlmr_smoke_f0


[SMOKE] Training...


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss


[SMOKE] Train time: 998.4s


[SMOKE] Val step 0, 0.3s


[SMOKE] Val step 20, 6.3s


[SMOKE] Val step 40, 12.3s


[SMOKE] Val step 60, 17.5s


[SMOKE] Val step 80, 23.0s


[SMOKE] Val step 100, 28.3s


[SMOKE] Val step 120, 33.5s


[SMOKE] Val step 140, 38.6s


[SMOKE] Val step 160, 43.8s


[SMOKE] Val step 180, 49.2s


[SMOKE] Val step 200, 54.4s


[SMOKE] Val step 220, 60.1s


[SMOKE] Val step 240, 66.0s


[SMOKE] Val step 260, 71.9s


[SMOKE] Val step 280, 77.0s


[SMOKE] Val step 300, 82.5s


[SMOKE] Val step 320, 87.9s


[SMOKE] Val step 340, 93.1s


[SMOKE] Val step 360, 98.5s


[SMOKE] Val step 380, 103.8s


[SMOKE] Val step 400, 109.2s


[SMOKE] Val step 420, 114.6s


[SMOKE] Val step 440, 119.7s


[SMOKE] Val step 460, 125.0s


[SMOKE] Val step 480, 130.2s


[SMOKE] Val step 500, 135.4s


[SMOKE] Val step 520, 140.6s


[SMOKE] Val step 540, 146.0s


[SMOKE] Val step 560, 151.1s


[SMOKE] Val step 580, 156.3s


[SMOKE] Val step 600, 161.4s


[SMOKE] Val step 620, 166.6s


[SMOKE] Val step 640, 171.8s


[SMOKE] Val step 660, 177.2s


[SMOKE] Val step 680, 182.4s


[SMOKE] Val step 700, 187.5s


[SMOKE] Val step 720, 192.7s


[SMOKE] Val step 740, 197.8s


[SMOKE] Val step 760, 203.0s


[SMOKE] Val step 780, 208.2s


[SMOKE] Val step 800, 213.5s


[SMOKE] Val step 820, 219.1s


[SMOKE] Val step 840, 224.2s


[SMOKE] Val step 860, 229.8s


[SMOKE] Val step 880, 235.4s


[SMOKE] Val step 900, 241.0s


[SMOKE] Val step 920, 246.6s


[SMOKE] Val step 940, 252.2s


[SMOKE] Val step 960, 257.7s


[SMOKE] Val step 980, 263.3s


[SMOKE] Val step 1000, 268.9s


[SMOKE] Val step 1020, 274.4s


[SMOKE] Val step 1040, 280.1s


[SMOKE] Val step 1060, 285.9s


[SMOKE] Val step 1080, 291.3s


[SMOKE] Val step 1100, 297.2s


[SMOKE] Val step 1120, 303.6s


[SMOKE] Val step 1140, 309.8s


[SMOKE] Val step 1160, 316.0s


[SMOKE] Val step 1180, 322.1s


[SMOKE] Val step 1200, 328.3s


[SMOKE] Val step 1220, 334.4s


[SMOKE] Val step 1240, 340.5s


[SMOKE] Val step 1260, 346.6s


[SMOKE] Val step 1280, 352.7s


[SMOKE] Fold 0 OOF Jaccard: 0.50960


[SMOKE] Done in 1428.8s, OOF Jaccard=0.50960


In [4]:
# Patch: simplified post-processing with n_best_size raised to 100 and a 50-token span cap
n_best_size = 100  # override previous setting


def pool_nbest_over_features(features, examples_df, start_logits, end_logits, rerank_with_gold=False):
    """Choose one answer span per example from per-feature start/end logits.

    For every feature the logits outside the context segment (sequence id 1)
    are masked, the top ``n_best_size`` start and end positions are paired, and
    each valid pair (end >= start, at most 50 tokens, non-empty trimmed text)
    is scored as ``start_logit + end_logit``. When ``DIGIT_PAT`` is defined and
    matches the question, the score gets +0.02 if the candidate text also
    matches and -0.02 otherwise. The best candidate of each feature competes
    with the other features of the same example on that score.

    Args:
        features: tokenizer output exposing ``'overflow_to_sample_mapping'``,
            ``'offset_mapping'`` and ``sequence_ids(i)``.
        examples_df: frame with ``'context'`` and ``'question'`` columns (plus
            ``'answer_text'`` when ``rerank_with_gold`` is set). Rows are read
            with ``.loc`` using the mapped example number, so the index is
            assumed to be 0..n-1 -- TODO confirm for every caller.
        start_logits, end_logits: per-feature logit sequences, indexed like
            ``features``.
        rerank_with_gold: rank a feature's candidates by word-level Jaccard
            against the gold answer first, then by score.

    Returns:
        ``(preds_text, preds_start)``: predicted text and the character offset
        of the chosen span's first token, one entry per example.
    """
    feat_to_example = features['overflow_to_sample_mapping']
    n_examples = len(examples_df)
    preds_text = [''] * n_examples
    preds_start = [0] * n_examples
    best_score = [-1e30] * n_examples

    for feat_i, mapped in enumerate(feat_to_example):
        ex_idx = int(mapped)
        offsets = features['offset_mapping'][feat_i]
        seq_ids = features.sequence_ids(feat_i)
        context_positions = [pos for pos, sid in enumerate(seq_ids) if sid == 1]
        if not context_positions:
            continue
        c0 = context_positions[0]
        c1 = context_positions[-1]

        s = np.array(start_logits[feat_i], dtype=np.float32).copy()
        e = np.array(end_logits[feat_i], dtype=np.float32).copy()
        # Push everything outside the context segment to a very negative value.
        for logits in (s, e):
            logits[:c0] = -1e9
        if c1 + 1 < len(s):
            for logits in (s, e):
                logits[c1 + 1:] = -1e9

        top_starts = np.argsort(s)[-n_best_size:][::-1]
        top_ends = np.argsort(e)[-n_best_size:][::-1]

        ctx = examples_df.loc[ex_idx, 'context']
        qtext = str(examples_df.loc[ex_idx, 'question'])
        q_has_digit = 'DIGIT_PAT' in globals() and (DIGIT_PAT.search(qtext) is not None)

        # End positions inside the context, kept in descending-logit order.
        in_ctx_ends = [ei for ei in top_ends if c0 <= ei <= c1]

        cands = []
        for si in top_starts:
            if not (c0 <= si <= c1):
                continue
            for ei in in_ctx_ends:
                span_tokens = ei - si + 1
                # Reject reversed spans and spans longer than the joint cap.
                if span_tokens < 1 or span_tokens > 50:
                    continue
                stc = offsets[si][0]
                enc = offsets[ei][1]
                if stc is None or enc is None or enc <= stc:
                    continue
                text = edge_trim(ctx[stc:enc].strip())
                if not text:
                    continue
                score = float(s[si] + e[ei])
                # Small numeric bonus/penalty, only for questions matching DIGIT_PAT.
                if q_has_digit:
                    score += 0.02 if DIGIT_PAT.search(text) is not None else -0.02
                cands.append((score, text, stc))
        if not cands:
            continue

        if rerank_with_gold:
            gold = examples_df.loc[ex_idx, 'answer_text']

            def rank_key(cand):
                return (word_jaccard(gold, cand[1]), cand[0], -len(cand[1]))
        else:
            def rank_key(cand):
                return (cand[0], -len(cand[1]))

        # max() returns the first maximal candidate, the same element a stable
        # descending sort would place at index 0.
        score, text, stc = max(cands, key=rank_key)
        if score > best_score[ex_idx]:
            best_score[ex_idx] = score
            preds_text[ex_idx] = text
            preds_start[ex_idx] = stc

    # Examples with no prediction: `[:0]` is always the empty string, so the
    # fallback is whatever edge_trim returns for ''.
    for row, text in enumerate(preds_text):
        if text == '':
            preds_text[row] = edge_trim(examples_df.loc[row, 'context'][:0])
            preds_start[row] = 0
    return preds_text, preds_start

print('[Patch] Post-processing: raw logits with context masking, n_best_size=100, max span=50, +numeric bonus if question has digits.')

[Patch] Post-processing simplified: raw logits with context masking, n_best_size=100, max span=50


In [None]:
# Patch: redefine train_5fold_x to include sampled negatives (~2x per positive)
def train_5fold_x():
    """Train (or reuse) one QA model per fold and return the overall OOF Jaccard.

    Reads ``train_folds.csv`` and loops over fold ids 0..4. For each fold it
    builds training features with ``prepare_train_features_x``, keeps every
    feature whose start label is non-zero plus a seeded random sample of the
    remaining features (at most twice the number of positives), and then
    either loads the checkpoint located by ``_find_checkpoint_dir`` or trains a
    fresh model with ``Trainer`` and saves it to ``xlmr_f{fold}``. Validation
    predictions are pooled with ``pool_nbest_over_features`` and written to
    ``oof_xlmr_fold{fold}.csv``.

    Depends on names defined elsewhere in the notebook: ``bsz``,
    ``grad_accum``, ``epochs``, ``lr``, ``warmup_ratio``, ``xlmr_model``,
    ``HAS_BNB``, ``tokenizer_x``, ``QADataset``, ``prepare_features_only_x``,
    ``word_jaccard``, ``_find_checkpoint_dir`` and ``_try_load_fold_model``.

    Returns:
        float: mean Jaccard over all out-of-fold rows of all folds.
    """
    import pandas as pd, numpy as np, time, torch, gc, os, glob
    from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer, DataCollatorWithPadding
    df = pd.read_csv('train_folds.csv')
    all_oof = []
    for fold in range(5):
        t_fold = time.time()
        # reset_index gives both splits a fresh 0..n-1 index.
        trn_df = df[df['fold']!=fold].reset_index(drop=True)
        val_df = df[df['fold']==fold].reset_index(drop=True)
        print(f'Fold {fold}: train {len(trn_df)} val {len(val_df)}')
        # Build raw train features
        trn_feats_raw = prepare_train_features_x(trn_df)
        # Positives + sampled negatives (2x)
        # A feature is positive when its start label is a non-zero token index
        # (presumably 0 marks "no answer in this window" -- confirm in prepare_train_features_x).
        is_pos = np.array([int(sp) > 0 for sp in trn_feats_raw['start_positions']])
        all_idx = np.arange(len(is_pos))
        pos_idx = all_idx[is_pos]
        neg_idx = all_idx[~is_pos]
        # Per-fold seed keeps the negative sample reproducible.
        rng = np.random.RandomState(42 + fold)
        n_neg_keep = min(len(neg_idx), 2*len(pos_idx))
        sampled_neg = rng.choice(neg_idx, size=n_neg_keep, replace=False) if n_neg_keep > 0 else np.array([], dtype=int)
        # With zero positives nothing is kept at all.
        keep_idx = np.sort(np.concatenate([pos_idx, sampled_neg])) if len(pos_idx)>0 else np.array([], dtype=int)
        def filt_field(vals, idx):
            # Subset list-valued fields only; any other value type yields None and is dropped below.
            return [vals[i] for i in idx] if isinstance(vals, list) else None
        trn_feats = {}
        for k, v in trn_feats_raw.items():
            fl = filt_field(v, keep_idx)
            if fl is not None:
                trn_feats[k] = fl
        # NOTE(review): this branch is only reached when token_type_ids was not a list,
        # in which case filt_field returns None and None is what gets stored.
        if 'token_type_ids' in trn_feats_raw and 'token_type_ids' not in trn_feats:
            trn_feats['token_type_ids'] = filt_field(trn_feats_raw['token_type_ids'], keep_idx)
        print(f"Fold {fold}: features_kept={len(keep_idx)} (pos={len(pos_idx)}, neg_samp={len(sampled_neg)})")

        val_feats = prepare_features_only_x(val_df)
        train_ds = QADataset(trn_feats, with_labels=True)
        val_ds = QADataset(val_feats, with_labels=False)
        num_feats = len(trn_feats['input_ids']) if 'input_ids' in trn_feats else 0
        eff_bsz = bsz * grad_accum
        # Ceiling division; used for the log line only.
        steps_per_epoch = (num_feats + eff_bsz - 1) // eff_bsz if eff_bsz>0 else 0
        print(f"Fold {fold}: features={num_feats}, eff_bsz={eff_bsz}, steps/epoch={steps_per_epoch}, epochs={epochs}")

        model_root = f'xlmr_f{fold}'
        # Reuse an existing checkpoint for this fold when one is found; training is then skipped.
        ckpt_path = _find_checkpoint_dir(model_root)
        if ckpt_path is not None:
            print(f'Loading existing model for fold {fold} from {ckpt_path}')
            model = _try_load_fold_model(ckpt_path)
        else:
            model = AutoModelForQuestionAnswering.from_pretrained(xlmr_model, attn_implementation='eager')
            if hasattr(model, 'gradient_checkpointing_enable'):
                model.gradient_checkpointing_enable()
            if hasattr(model, 'config') and hasattr(model.config, 'use_cache'):
                try: model.config.use_cache = False
                except Exception: pass
            # 8-bit AdamW when HAS_BNB is truthy, Adafactor otherwise.
            optim_name = 'adamw_bnb_8bit' if HAS_BNB else 'adafactor'
            print(f'Using optimizer: {optim_name}')
            # No intermediate checkpoints and no evaluation during training; the model is saved once afterwards.
            args = TrainingArguments(
                output_dir=model_root,
                per_device_train_batch_size=bsz, per_device_eval_batch_size=4,
                gradient_accumulation_steps=grad_accum,
                num_train_epochs=epochs, learning_rate=lr, warmup_ratio=warmup_ratio, weight_decay=0.01,
                max_grad_norm=1.0, optim=optim_name, lr_scheduler_type='cosine', group_by_length=True,
                bf16=False, fp16=True, save_strategy='no', save_total_limit=1,
                logging_steps=10, evaluation_strategy='no', seed=42, report_to=[]
            )
            collator = DataCollatorWithPadding(tokenizer_x, pad_to_multiple_of=None)
            trainer = Trainer(model=model, args=args, train_dataset=train_ds, data_collator=collator)
            print(f'Training fold {fold}...', flush=True)
            t0 = time.time(); trainer.train(); print(f'Fold {fold} train time: {time.time()-t0:.1f}s')
            trainer.save_model(model_root)
            # Drop the trainer and reload the saved weights from disk for inference.
            del trainer
            model = _try_load_fold_model(model_root)

        # Inference on val
        model.eval(); device = torch.device('cuda' if torch.cuda.is_available() else 'cpu'); model.to(device)
        collator = DataCollatorWithPadding(tokenizer_x, pad_to_multiple_of=None)
        val_loader = torch.utils.data.DataLoader(val_ds, batch_size=4, shuffle=False, collate_fn=collator, pin_memory=torch.cuda.is_available())
        N = len(val_feats['input_ids'])
        # One slot per validation feature, filled by feature index rather than by batch order.
        s_logits_list = [None] * N; e_logits_list = [None] * N
        with torch.no_grad():
            t1=time.time()
            for step, batch in enumerate(val_loader):
                # feat_idx is removed from the batch before the forward pass and used to place the logits
                # (presumably emitted per item by QADataset -- confirm).
                feat_idx = batch.pop('feat_idx').cpu().numpy()
                for k in list(batch.keys()): batch[k] = batch[k].to(device)
                out = model(**batch)
                s = out.start_logits.detach().cpu().numpy(); e = out.end_logits.detach().cpu().numpy()
                for j, fi in enumerate(feat_idx):
                    s_logits_list[int(fi)] = s[j]; e_logits_list[int(fi)] = e[j]
                if step % 20 == 0: print(f'Val fold {fold} step {step}, {time.time()-t1:.1f}s', flush=True)
        # Every feature must have received logits before pooling.
        assert all(x is not None for x in s_logits_list) and all(x is not None for x in e_logits_list)
        pred_texts, pred_starts = pool_nbest_over_features(val_feats, val_df, s_logits_list, e_logits_list, rerank_with_gold=False)
        val_out = val_df.copy(); val_out['pred_text'] = pred_texts; val_out['pred_start'] = pred_starts
        val_out['jaccard'] = [word_jaccard(a,b) for a,b in zip(val_out['answer_text'].astype(str), val_out['pred_text'].astype(str))]
        print(f'Fold {fold} OOF Jaccard: {val_out["jaccard"].mean():.5f}, elapsed {time.time()-t_fold:.1f}s')
        val_out.to_csv(f'oof_xlmr_fold{fold}.csv', index=False)
        all_oof.append(val_out[['id','jaccard']])
        # free
        del model, train_ds, val_ds, trn_feats, val_feats; torch.cuda.empty_cache(); gc.collect()
    # Mean over the concatenated rows of all folds, not a mean of per-fold means.
    oof = pd.concat(all_oof, axis=0, ignore_index=True)
    print('OOF Jaccard mean:', float(oof['jaccard'].mean()))
    return float(oof['jaccard'].mean())

print('[Patch] train_5fold_x updated to include sampled negatives (2x) and FP16 enabled on V100 as recommended.')

## RESTART CHECKLIST (do this via UI now)
- Power > Restart Session (to clear wedged GPU).
- Run Cell 1: confirm nvidia-smi shows ~0 MiB used.
- Run in order: Cell 4 (install torch/HF) → Cell 7 (init xlmr-base globals) → Cell 16 (prepare_train_features_x patch) → Cell 20 (post-process patch).
- Optional: need 8-bit optimizer? Run Cell 13 (bitsandbytes --no-deps), then re-run Cell 7 to set HAS_BNB=True.
- Smoke test: Cell 15 is GPU-ready; then run Cell 19 (epochs=3). Target OOF fold0 ≥ 0.70.
- Full training: ensure Cell 21 has bf16=False, fp16=True (already set). Then call train_5fold_x() in a new cell.

Globals for medal run (already set or in Cell 7):
- Model: deepset/xlm-roberta-base-squad2
- max_length=384, doc_stride=128
- per_device_train_batch_size=4, grad_accum=4 (eff=16). If OOM: bsz=2, grad_accum=8 or max_length=320.
- epochs=3–4, lr=2e-5, warmup_ratio=0.10, weight_decay=0.01
- fp16=True, bf16=False, gradient_checkpointing=True, group_by_length=True, use_cache=False
- Negatives ≈ 2x positives (Cell 21 handles this).

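Post-processing: simplified start+end logit sum with context masking, n_best=100, max span=50 tokens. No extra penalties apart from the ±0.02 `DIGIT_PAT` adjustment, which is applied only when `DIGIT_PAT` is defined and matches the question.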

In [15]:
# Inference: average fold logits on test and write submission.csv
import os, glob, time, numpy as np, pandas as pd, torch, gc
from transformers import AutoModelForQuestionAnswering, DataCollatorWithPadding

def predict_test_and_submit(model_glob='xlmr_f*', force_cpu=True):
    """Average fold-model logits over ``test.csv`` and write ``submission.csv``.

    Every directory matching *model_glob* is tried as a fold checkpoint: first
    with ``AutoModelForQuestionAnswering.from_pretrained``, then with the
    notebook's ``_try_load_fold_model``. Directories that neither loader can
    open are skipped. The start/end logits of all loaded models are averaged
    and decoded with ``pool_nbest_over_features``.

    Args:
        model_glob: glob pattern selecting fold model directories.
        force_cpu: when true, ``CUDA_VISIBLE_DEVICES`` is set to ``''`` and
            inference runs on CPU. The environment variable is NOT restored
            when the function returns.

    Returns:
        The first rows (``DataFrame.head()``) of the written submission.

    Raises:
        FileNotFoundError: no directory matches *model_glob*.
        RuntimeError: no checkpoint could be loaded, or a model produced no
            logits for some feature.
    """
    # Hard-disable CUDA if forcing CPU to avoid wedged GPU OOM during model load
    if force_cpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = ''
    t0 = time.time()
    test_df = pd.read_csv('test.csv')
    sub_tmpl = pd.read_csv('sample_submission.csv')
    id_col = sub_tmpl.columns[0]
    pred_col = sub_tmpl.columns[1]
    print(f'[TEST] id_col={id_col}, pred_col={pred_col}, test_rows={len(test_df)}')

    # Ensure tokenizer settings mirror training
    feats = prepare_features_only_x(test_df)
    test_ds = QADataset(feats, with_labels=False)
    collator = DataCollatorWithPadding(tokenizer_x, pad_to_multiple_of=None)

    # Discover fold model dirs
    fold_dirs = sorted([d for d in glob.glob(model_glob) if os.path.isdir(d)])
    if not fold_dirs:
        raise FileNotFoundError('No fold model directories found (pattern: %s)' % model_glob)
    print('[TEST] Using folds:', fold_dirs)

    N = len(feats['input_ids'])
    s_sum = None; e_sum = None
    used = 0
    device = torch.device('cpu') if force_cpu else torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    for k, d in enumerate(fold_dirs):
        print(f'[TEST] Loading {d} ...', flush=True)
        # Try the standard loader, then the fallback loader; if both fail, skip this fold dir.
        try:
            model = AutoModelForQuestionAnswering.from_pretrained(d, local_files_only=True, device_map='cpu', torch_dtype=torch.float32)
        except Exception as primary_err:
            # Report why the standard loader failed so the reason is not lost behind the fallback.
            print(f'[TEST] Standard load failed for {d}: {repr(primary_err)}; trying fallback loader', flush=True)
            try:
                model = _try_load_fold_model(d)
            except Exception as e:
                print(f'[TEST] Skip {d}: {repr(e)}')
                continue
        used += 1
        model.eval(); model.to(device)
        loader = torch.utils.data.DataLoader(test_ds, batch_size=8, shuffle=False, collate_fn=collator, pin_memory=False)
        # One slot per feature, filled by feature index rather than by batch order.
        s_logits = [None]*N; e_logits = [None]*N
        with torch.no_grad():
            t1 = time.time()
            for step, batch in enumerate(loader):
                # feat_idx must not reach the model; it says where each row's logits belong.
                feat_idx = batch.pop('feat_idx').cpu().numpy()
                for kk in list(batch.keys()): batch[kk] = batch[kk].to(device)
                out = model(**batch)
                s = out.start_logits.detach().cpu().numpy(); e = out.end_logits.detach().cpu().numpy()
                for j, fi in enumerate(feat_idx):
                    s_logits[int(fi)] = s[j]; e_logits[int(fi)] = e[j]
                if step % 50 == 0:
                    print(f'  [TEST] fold {k} step {step}, {time.time()-t1:.1f}s', flush=True)
        # Fail with a clear message instead of an opaque np.stack shape error.
        n_missing = sum(1 for a, b in zip(s_logits, e_logits) if a is None or b is None)
        if n_missing:
            raise RuntimeError(f'{d}: no logits produced for {n_missing} of {N} test features')
        s_arr = np.stack(s_logits, axis=0); e_arr = np.stack(e_logits, axis=0)
        if s_sum is None:
            s_sum = s_arr; e_sum = e_arr
        else:
            s_sum += s_arr; e_sum += e_arr
        del model, loader, s_logits, e_logits, s_arr, e_arr; torch.cuda.empty_cache(); gc.collect()

    if used == 0:
        raise RuntimeError('No usable fold checkpoints found to run inference.')

    # Plain mean of the logits over the models that actually loaded.
    s_avg = s_sum / used; e_avg = e_sum / used
    pred_texts, pred_starts = pool_nbest_over_features(feats, test_df, s_avg, e_avg, rerank_with_gold=False)

    sub = pd.DataFrame({id_col: test_df[id_col].values, pred_col: pred_texts})
    sub.to_csv('submission.csv', index=False)
    print('[TEST] Wrote submission.csv with', len(sub), 'rows in %.1fs (folds used=%d, device=%s)' % (time.time()-t0, used, str(device)))
    return sub.head()

print('Test inference helper ready: call predict_test_and_submit() after training completes.')

Test inference helper ready: call predict_test_and_submit() after training completes.


In [None]:
# One-click full pipeline: 5-fold train -> test inference -> submission.csv
import time, json, pathlib
def run_full_train_and_submit():
    """Run 5-fold training, record the OOF score, then build ``submission.csv``.

    Calls ``train_5fold_x``, writes its result to ``metrics.json`` under the
    key ``oof_jaccard``, runs ``predict_test_and_submit('xlmr_f*')`` and prints
    what it returns.

    Returns:
        float: the OOF Jaccard reported by ``train_5fold_x``.
    """
    started = time.time()
    print('[RUN] Starting 5-fold training...', flush=True)
    oof_jaccard = float(train_5fold_x())
    print('[RUN] 5-fold OOF Jaccard:', oof_jaccard)

    metrics_payload = json.dumps({'oof_jaccard': oof_jaccard}, ensure_ascii=False)
    pathlib.Path('metrics.json').write_text(metrics_payload)

    print('[RUN] Inference on test and submission build...', flush=True)
    submission_head = predict_test_and_submit('xlmr_f*')
    print(submission_head)
    print('[RUN] Done in %.1fs' % (time.time()-started))
    return oof_jaccard

print('Ready: after smoke success, call run_full_train_and_submit() to train 5 folds and create submission.csv.')

In [24]:
# CPU fallback: build submission.csv from whichever xlmr_f* fold models can be loaded.
# Any failure is reported on stdout instead of aborting the cell.
try:
    head = predict_test_and_submit(model_glob='xlmr_f*')
    print(head)
except Exception as err:
    print('Submission inference failed:', repr(err))

[TEST] id_col=id, pred_col=PredictionString, test_rows=112


[TEST] Using folds: ['xlmr_f0', 'xlmr_f1', 'xlmr_f2']
[TEST] Loading xlmr_f0 ...


  [TEST] fold 0 step 0, 0.1s


  [TEST] fold 0 step 50, 6.3s


  [TEST] fold 0 step 100, 12.4s


  [TEST] fold 0 step 150, 18.5s


  [TEST] fold 0 step 200, 24.6s


  [TEST] fold 0 step 250, 30.8s


  [TEST] fold 0 step 300, 36.9s


  [TEST] fold 0 step 350, 43.1s


[TEST] Loading xlmr_f1 ...


  [TEST] fold 1 step 0, 0.1s


  [TEST] fold 1 step 50, 6.3s


  [TEST] fold 1 step 100, 12.4s


  [TEST] fold 1 step 150, 18.6s


  [TEST] fold 1 step 200, 24.8s


  [TEST] fold 1 step 250, 30.9s


  [TEST] fold 1 step 300, 37.1s


  [TEST] fold 1 step 350, 43.3s


[TEST] Loading xlmr_f2 ...


[TEST] Skip xlmr_f2: FileNotFoundError('No loadable checkpoint in xlmr_f2')


[TEST] Wrote submission.csv with 112 rows in 148.9s (folds used=2)
          id               PredictionString
0  be799d365              मुंबई, महाराष्ट्र
1  26f356026  उदासिनाचार्य सुमेरुदास महाराज
2  57a56c43f                     brain stem
3  da062fdbb                       बिंबिसार
4  72fc0d5b5                 20 अप्रैल 1889


## ACTION REQUIRED: Restart Session Now to Unblock GPU

The GPU is wedged (nvidia-smi shows ~10.9GB used with no processes). To proceed to a medal-capable run, please do:

1) Power > Restart Session (clears phantom GPU processes).
2) Run cells in order:
- Cell 1: confirm GPU VRAM ~0 MiB used.
- Cell 4: install cu121 PyTorch and HF stack.
- Cell 7: initialize XLM-R base pipeline (HAS_BNB auto-detected).
- Cell 16: span-labeling patch (coverage-based snapping).
- Cell 20: simplified post-processing (n_best=100, max span=50).
  - Optional: Cell 13 to install bitsandbytes, then re-run Cell 7 so HAS_BNB=True.

3) Smoke test on GPU:
- Run Cell 19 (epochs_override=3). Expect OOF fold0 ≥ 0.70.

4) Full training (target medal):
- Run train_5fold_x() in a new cell or Cell 24's run_full_train_and_submit().
- Settings: max_length=384, doc_stride=128, fp16=True, gradient_checkpointing, group_by_length=True, AdamW 8-bit if BNB available, LR=2e-5, warmup_ratio=0.10, weight_decay=0.01, epochs=3.
- Negatives: ~2x per positive (already implemented).

5) Inference and submit:
- After training, call predict_test_and_submit('xlmr_f*') to write submission.csv, then submit.

If training time exceeds budget (>60 min per fold), stop after 4 folds and submit, or pivot to 3 folds with 2–3 seeds and average logits.

In [21]:
# CPU inference over every directory matching xlmr_* (so the smoke model directory is a candidate too);
# overwrites submission.csv. Any failure is reported on stdout instead of aborting the cell.
try:
    head_alt = predict_test_and_submit(model_glob='xlmr_*')
    print(head_alt)
except Exception as err:
    print('Alt submission inference failed:', repr(err))

[TEST] id_col=id, pred_col=PredictionString, test_rows=112


[TEST] Using folds: ['xlmr_f0', 'xlmr_f1', 'xlmr_f2', 'xlmr_smoke_f0']
[TEST] Loading xlmr_f0 ...


  [TEST] fold 0 step 0, 0.3s


  [TEST] fold 0 step 50, 9.9s


  [TEST] fold 0 step 100, 20.5s


  [TEST] fold 0 step 150, 29.1s


  [TEST] fold 0 step 200, 35.2s


  [TEST] fold 0 step 250, 41.4s


  [TEST] fold 0 step 300, 47.6s


  [TEST] fold 0 step 350, 53.8s


[TEST] Loading xlmr_f1 ...


  [TEST] fold 1 step 0, 0.1s


  [TEST] fold 1 step 50, 6.3s


  [TEST] fold 1 step 100, 12.5s


  [TEST] fold 1 step 150, 18.7s


  [TEST] fold 1 step 200, 24.8s


  [TEST] fold 1 step 250, 31.0s


  [TEST] fold 1 step 300, 37.2s


  [TEST] fold 1 step 350, 43.4s


[TEST] Loading xlmr_f2 ...


[TEST] Skip xlmr_f2: FileNotFoundError('No loadable checkpoint in xlmr_f2')
[TEST] Loading xlmr_smoke_f0 ...


[TEST] Skip xlmr_smoke_f0: FileNotFoundError('No loadable checkpoint in xlmr_smoke_f0')


[TEST] Wrote submission.csv with 112 rows in 164.7s (folds used=2)
          id               PredictionString
0  be799d365              मुंबई, महाराष्ट्र
1  26f356026  उदासिनाचार्य सुमेरुदास महाराज
2  57a56c43f                     brain stem
3  da062fdbb                       बिंबिसार
4  72fc0d5b5                 20 अप्रैल 1889


In [31]:
# TTA on CPU: run test inference once with longer tokenization settings, then restore the previous ones.
try:
    old_max_len = max_length
    old_stride = doc_stride
except NameError:
    # Either global missing: fall back to 384/128 for both.
    old_max_len, old_stride = 384, 128

# Test-time-only override; the finally clause restores the saved values even if inference fails.
max_length = 512
doc_stride = 192
try:
    print(f'[TTA] Test-time lengths: max_length={max_length}, doc_stride={doc_stride}')
    head_tta = predict_test_and_submit('xlmr_f*')
    print(head_tta)
finally:
    max_length = old_max_len
    doc_stride = old_stride
    print(f'[TTA] Restored lengths: max_length={max_length}, doc_stride={doc_stride}')

[TTA] Test-time lengths: max_length=512, doc_stride=192
[TEST] id_col=id, pred_col=PredictionString, test_rows=112


[TEST] Using folds: ['xlmr_f0', 'xlmr_f1', 'xlmr_f2']
[TEST] Loading xlmr_f0 ...


  [TEST] fold 0 step 0, 0.8s


  [TEST] fold 0 step 50, 37.6s


  [TEST] fold 0 step 100, 74.3s


[TEST] Loading xlmr_f1 ...


  [TEST] fold 1 step 0, 0.7s


  [TEST] fold 1 step 50, 37.7s


  [TEST] fold 1 step 100, 74.7s


[TEST] Loading xlmr_f2 ...


[TEST] Skip xlmr_f2: FileNotFoundError('No loadable checkpoint in xlmr_f2')


[TEST] Wrote submission.csv with 112 rows in 225.7s (folds used=2)
          id               PredictionString
0  be799d365              मुंबई, महाराष्ट्र
1  26f356026  उदासिनाचार्य सुमेरुदास महाराज
2  57a56c43f                          புறணி
3  da062fdbb                       बिंबिसार
4  72fc0d5b5                 २० अप्रैल १८८९
[TTA] Restored lengths: max_length=256, doc_stride=128


In [32]:
# CPU ensemble over two tokenization lengths: 256/128 vs 512/160, restricted to xlmr_f0/xlmr_f1 for speed.
import os, glob, time, numpy as np, pandas as pd, torch, gc
from transformers import AutoModelForQuestionAnswering, DataCollatorWithPadding

def _pool_with_scores(features, examples_df, start_logits, end_logits):
    """Decode one answer per example and also return the winning score.

    Candidate generation matches the notebook's n-best pooling: logits outside
    the context segment (sequence id 1) are masked, the top ``n_best_size``
    start/end positions are paired, spans longer than 50 tokens or with empty
    trimmed text are dropped, and the score is ``start_logit + end_logit`` with
    a +/-0.02 adjustment when ``DIGIT_PAT`` is defined and matches the
    question. The first candidate with the strictly highest score across all
    features of an example wins.

    Returns:
        ``(preds_text, preds_start, best_score)`` where ``best_score`` is a
        float32 array; examples without any candidate get -1e20.
    """
    feat_to_example = features['overflow_to_sample_mapping']
    n_examples = len(examples_df)
    preds_text = [''] * n_examples
    preds_start = [0] * n_examples
    best_score = [-1e30] * n_examples

    for feat_i, mapped in enumerate(feat_to_example):
        ex_idx = int(mapped)
        offsets = features['offset_mapping'][feat_i]
        seq_ids = features.sequence_ids(feat_i)
        context_positions = [pos for pos, sid in enumerate(seq_ids) if sid == 1]
        if not context_positions:
            continue
        c0 = context_positions[0]
        c1 = context_positions[-1]

        # np.array makes fresh arrays, so masking does not touch the caller's logits.
        s = np.array(start_logits[feat_i], dtype=np.float32)
        e = np.array(end_logits[feat_i], dtype=np.float32)
        for logits in (s, e):
            logits[:c0] = -1e9
        if c1 + 1 < len(s):
            for logits in (s, e):
                logits[c1 + 1:] = -1e9

        top_starts = np.argsort(s)[-n_best_size:][::-1]
        top_ends = np.argsort(e)[-n_best_size:][::-1]

        ctx = examples_df.loc[ex_idx, 'context']
        qtext = str(examples_df.loc[ex_idx, 'question'])
        q_has_digit = ('DIGIT_PAT' in globals()) and (DIGIT_PAT.search(qtext) is not None)

        # End positions inside the context, kept in descending-logit order.
        in_ctx_ends = [ei for ei in top_ends if c0 <= ei <= c1]

        for si in top_starts:
            if not (c0 <= si <= c1):
                continue
            for ei in in_ctx_ends:
                span_tokens = ei - si + 1
                # Reject reversed spans and spans above the 50-token cap.
                if span_tokens < 1 or span_tokens > 50:
                    continue
                stc = offsets[si][0]
                enc = offsets[ei][1]
                if stc is None or enc is None or enc <= stc:
                    continue
                text = edge_trim(ctx[stc:enc].strip())
                if not text:
                    continue
                score = float(s[si] + e[ei])
                if q_has_digit:
                    score += 0.02 if DIGIT_PAT.search(text) is not None else -0.02
                # Strict '>' keeps the earliest candidate on ties.
                if score > best_score[ex_idx]:
                    best_score[ex_idx] = score
                    preds_text[ex_idx] = text
                    preds_start[ex_idx] = stc

    # Examples with no prediction: `[:0]` is always the empty string, so the
    # text is whatever edge_trim returns for ''; the score is lifted to -1e20.
    for row, text in enumerate(preds_text):
        if text == '':
            preds_text[row] = edge_trim(examples_df.loc[row, 'context'][:0])
            preds_start[row] = 0
            if best_score[row] < -1e20:
                best_score[row] = -1e20
    return preds_text, preds_start, np.array(best_score, dtype=np.float32)

def _infer_once_token_length(model_dirs, max_len, stride):
    """Run CPU test inference once with the given tokenization lengths.

    The module-level ``max_length`` / ``doc_stride`` are temporarily replaced
    by *max_len* / *stride* while features are built and are always restored
    afterwards (presumably ``prepare_features_only_x`` reads those globals --
    confirm). Logits are averaged over every model in *model_dirs* that loads
    and decoded with ``_pool_with_scores``.

    Side effect: ``CUDA_VISIBLE_DEVICES`` is set to ``''`` and not restored.

    Args:
        model_dirs: directories of saved QA models to average.
        max_len: temporary value for the global ``max_length``.
        stride: temporary value for the global ``doc_stride``.

    Returns:
        ``(test_df, texts, starts, scores)`` as produced by ``pd.read_csv`` and
        ``_pool_with_scores``.

    Raises:
        RuntimeError: no model could be loaded, or a model produced no logits
            for some feature.
    """
    global max_length, doc_stride
    old_max, old_stride = max_length, doc_stride
    max_length, doc_stride = max_len, stride
    try:
        os.environ['CUDA_VISIBLE_DEVICES'] = ''
        device = torch.device('cpu')
        test_df = pd.read_csv('test.csv')
        feats = prepare_features_only_x(test_df)
        test_ds = QADataset(feats, with_labels=False)
        collator = DataCollatorWithPadding(tokenizer_x, pad_to_multiple_of=None)
        N = len(feats['input_ids'])
        s_sum = None; e_sum = None; used = 0
        for d in model_dirs:
            try:
                model = AutoModelForQuestionAnswering.from_pretrained(d, local_files_only=True, device_map='cpu', torch_dtype=torch.float32)
            except Exception as load_err:
                # Still best effort, but say which directory was dropped and why:
                # a silently skipped fold changes the ensemble without any trace in the log.
                print(f'    [PASS len={max_len}] skip {d}: {repr(load_err)}', flush=True)
                continue
            used += 1
            model.eval(); model.to(device)
            loader = torch.utils.data.DataLoader(test_ds, batch_size=8, shuffle=False, collate_fn=collator, pin_memory=False)
            # One slot per feature, filled by feature index rather than by batch order.
            s_logits = [None]*N; e_logits = [None]*N
            t1 = time.time()
            with torch.no_grad():
                for step, batch in enumerate(loader):
                    # feat_idx must not reach the model; it says where each row's logits belong.
                    feat_idx = batch.pop('feat_idx').cpu().numpy()
                    for k in list(batch.keys()): batch[k] = batch[k].to(device)
                    out = model(**batch)
                    s = out.start_logits.detach().cpu().numpy(); e = out.end_logits.detach().cpu().numpy()
                    for j, fi in enumerate(feat_idx):
                        s_logits[int(fi)] = s[j]; e_logits[int(fi)] = e[j]
                    if step % 50 == 0:
                        print(f'    [PASS len={max_len}] folddir {d}, step {step}, {time.time()-t1:.1f}s', flush=True)
            # Fail with a clear message instead of an opaque np.stack shape error.
            n_missing = sum(1 for a, b in zip(s_logits, e_logits) if a is None or b is None)
            if n_missing:
                raise RuntimeError(f'{d}: no logits produced for {n_missing} of {N} test features')
            s_arr = np.stack(s_logits, axis=0); e_arr = np.stack(e_logits, axis=0)
            if s_sum is None: s_sum, e_sum = s_arr, e_arr
            else: s_sum += s_arr; e_sum += e_arr
            del model, loader, s_logits, e_logits, s_arr, e_arr; torch.cuda.empty_cache(); gc.collect()
        if used == 0: raise RuntimeError('No usable fold checkpoints found.')
        # Plain mean of the logits over the models that actually loaded.
        s_avg = s_sum / used; e_avg = e_sum / used
        texts, starts, scores = _pool_with_scores(feats, test_df, s_avg, e_avg)
        return test_df, texts, starts, scores
    finally:
        # Restore the caller's tokenization settings on both success and failure.
        max_length, doc_stride = old_max, old_stride

def cpu_dual_length_ensemble():
    """Run two inference passes with different window sizes and keep the better answer per row.

    Pass A uses ``max_length=256, doc_stride=128`` and pass B uses
    ``max_length=512, doc_stride=160``; both go through
    ``_infer_once_token_length`` with the same fold directories. For each row
    the answer from pass B is used only when its score is strictly greater
    than pass A's score (ties keep pass A). The merged predictions are
    written to ``submission.csv`` using the column names taken from
    ``sample_submission.csv``.

    Returns:
        The first rows (``DataFrame.head()``) of the written submission.

    Raises:
        RuntimeError: if none of the expected fold directories exist.
        ValueError: if the two passes return their rows in a different id order.
    """
    sub_tmpl = pd.read_csv('sample_submission.csv')
    id_col, pred_col = sub_tmpl.columns[0], sub_tmpl.columns[1]

    # Restrict to known good folds for speed
    model_dirs = [d for d in ['xlmr_f0', 'xlmr_f1'] if os.path.isdir(d)]
    print('[DUAL] Using folds:', model_dirs)
    if not model_dirs:
        # Fail before the two slow passes start; same error type and message
        # that the inference helper raises when no checkpoint is usable.
        raise RuntimeError('No usable fold checkpoints found.')

    # run short
    print('[DUAL] Pass A: max_length=256, doc_stride=128')
    A_df, A_texts, A_starts, A_scores = _infer_once_token_length(model_dirs, 256, 128)
    # run long
    print('[DUAL] Pass B: max_length=512, doc_stride=160')
    B_df, B_texts, B_starts, B_scores = _infer_once_token_length(model_dirs, 512, 160)

    # An explicit check instead of `assert`: asserts are removed under
    # `python -O`, which would let misaligned rows be merged silently.
    if list(A_df[id_col].values) != list(B_df[id_col].values):
        raise ValueError('Pass A and pass B returned rows in a different id order; cannot merge per row.')

    # choose per-example by higher score
    A_scores = np.asarray(A_scores)
    B_scores = np.asarray(B_scores)
    take_B = B_scores > A_scores
    final_texts = [
        b_text if pick_b else a_text
        for a_text, b_text, pick_b in zip(A_texts, B_texts, take_B)
    ]

    sub = pd.DataFrame({id_col: A_df[id_col].values, pred_col: final_texts})
    sub.to_csv('submission.csv', index=False)
    print('[DUAL] Wrote submission.csv with', len(sub), 'rows. Chose B for', int(take_B.sum()), 'rows.')
    return sub.head()

# Run the dual-length ensemble (it writes submission.csv as a side effect)
# and print the preview rows it returns.
head_dual = cpu_dual_length_ensemble()
print(head_dual)

[DUAL] Using folds: ['xlmr_f0', 'xlmr_f1']
[DUAL] Pass A: max_length=256, doc_stride=128


    [PASS len=256] folddir xlmr_f0, step 0, 2.1s


    [PASS len=256] folddir xlmr_f0, step 50, 106.8s


    [PASS len=256] folddir xlmr_f0, step 100, 211.2s


    [PASS len=256] folddir xlmr_f0, step 150, 315.7s


    [PASS len=256] folddir xlmr_f0, step 200, 419.8s


    [PASS len=256] folddir xlmr_f0, step 250, 520.8s


    [PASS len=256] folddir xlmr_f0, step 300, 616.9s


    [PASS len=256] folddir xlmr_f0, step 350, 712.8s


    [PASS len=256] folddir xlmr_f1, step 0, 2.0s


    [PASS len=256] folddir xlmr_f1, step 50, 97.7s


    [PASS len=256] folddir xlmr_f1, step 100, 191.9s


    [PASS len=256] folddir xlmr_f1, step 150, 292.6s


    [PASS len=256] folddir xlmr_f1, step 200, 400.5s


    [PASS len=256] folddir xlmr_f1, step 250, 509.5s


    [PASS len=256] folddir xlmr_f1, step 300, 606.0s


    [PASS len=256] folddir xlmr_f1, step 350, 703.5s


[DUAL] Pass B: max_length=512, doc_stride=160


    [PASS len=512] folddir xlmr_f0, step 0, 6.5s


    [PASS len=512] folddir xlmr_f0, step 50, 335.2s


    [PASS len=512] folddir xlmr_f0, step 100, 663.4s


    [PASS len=512] folddir xlmr_f1, step 0, 4.8s


    [PASS len=512] folddir xlmr_f1, step 50, 236.9s


    [PASS len=512] folddir xlmr_f1, step 100, 444.1s


[DUAL] Wrote submission.csv with 112 rows. Chose B for 44 rows.
          id               PredictionString
0  be799d365              मुंबई, महाराष्ट्र
1  26f356026  उदासिनाचार्य सुमेरुदास महाराज
2  57a56c43f                     brain stem
3  da062fdbb                       बिंबिसार
4  72fc0d5b5                 20 अप्रैल 1889


In [33]:
# CPU single-pass (512/160) with strong re-ranking as per expert advice
import os, time, glob, gc, numpy as np, pandas as pd, torch
from transformers import AutoModelForQuestionAnswering, DataCollatorWithPadding

def _pool_with_scores_strong(features, examples_df, start_logits, end_logits,
                             top_k=15, max_span_tokens=30):
    """Select one answer span per example from per-window logits, with heuristic re-ranking.

    For every tokenized window, the start and end logits are passed through
    ``_log_softmax_masked`` together with the first/last context token index.
    The ``top_k`` highest start and end positions are paired, and each valid
    pair is scored as ``start_lp + end_lp`` plus small hand-tuned adjustments
    (span length, digit agreement with the question, boundary characters,
    leading/trailing punctuation, danda endings, repetition in the context).
    The highest-scoring span over all windows of an example is kept; a
    candidate within 0.02 of the current best that is in a substring relation
    with it and uses fewer tokens replaces it.

    Args:
        features: Tokenizer output providing ``overflow_to_sample_mapping``,
            ``offset_mapping`` and ``sequence_ids(i)``.
        examples_df: Frame with ``question`` and ``context`` columns. Rows are
            addressed with ``.loc`` using the values from the sample mapping
            (and ``0..len-1`` in the fallback), so this assumes a default
            integer index -- TODO confirm for callers other than a fresh
            ``read_csv`` frame.
        start_logits: Per-window start logits, indexable by window number.
        end_logits: Per-window end logits, indexable by window number.
        top_k: Number of start and of end positions considered per window.
        max_span_tokens: Candidates spanning more tokens than this are dropped.

    Returns:
        ``(preds_text, preds_start, best_score)``: the chosen answer text per
        example, its start character offset in the context, and a float32
        array of the winning scores (``-1e30`` where no candidate was found).
    """
    sample_mapping = features['overflow_to_sample_mapping']
    N_ex = len(examples_df)
    preds_text = [''] * N_ex
    preds_start = [0] * N_ex
    best_score = np.full(N_ex, -1e30, dtype=np.float32)
    best_len = np.full(N_ex, 10**9, dtype=np.int32)

    # Values that depend only on the example, not on the window. A long
    # context produces many windows, so compute them once per example
    # (assumes norm_for_metric is a pure function of its argument).
    per_example = {}

    for i, mapped in enumerate(sample_mapping):
        ex_idx = int(mapped)
        offsets = features['offset_mapping'][i]
        seq_ids = features.sequence_ids(i)
        ctx_tokens = [j for j, sid in enumerate(seq_ids) if sid == 1]
        if not ctx_tokens:
            continue
        c0, c1 = ctx_tokens[0], ctx_tokens[-1]
        s_log = np.asarray(start_logits[i], dtype=np.float32)
        e_log = np.asarray(end_logits[i], dtype=np.float32)
        # masked log-softmax probabilities within context
        s_lp = _log_softmax_masked(s_log, c0, c1)
        e_lp = _log_softmax_masked(e_log, c0, c1)
        start_idxes = np.argsort(s_lp)[-top_k:][::-1]
        end_idxes = np.argsort(e_lp)[-top_k:][::-1]

        cached = per_example.get(ex_idx)
        if cached is None:
            qtext = str(examples_df.loc[ex_idx, 'question'])
            ctx = examples_df.loc[ex_idx, 'context']
            cached = (ctx, DIGIT_PAT.search(qtext) is not None, norm_for_metric(ctx))
            per_example[ex_idx] = cached
        ctx, q_has_digit, ctx_norm = cached

        for si in start_idxes:
            if si < c0 or si > c1:
                continue
            for ei in end_idxes:
                if ei < c0 or ei > c1 or ei < si:
                    continue
                tok_len = (ei - si + 1)
                if tok_len > max_span_tokens:
                    continue
                stc, enc = offsets[si][0], offsets[ei][1]
                if stc is None or enc is None or enc <= stc:
                    continue
                raw_span = ctx[stc:enc]
                text = edge_trim(raw_span.strip())
                if not text:
                    continue

                # boundary/penalties
                left_ok = (stc == 0) or _is_punct(ctx[stc-1])
                right_ok = (enc >= len(ctx)) or _is_punct(ctx[enc:enc+1])
                lead_p = (raw_span and _is_punct(raw_span[0]))
                trail_p = (raw_span and _is_punct(raw_span[-1]))
                span_has_digit = DIGIT_PAT.search(text) is not None
                # repetition bonus: normalized span occurs at least twice in the normalized context
                text_norm = norm_for_metric(text)
                ctx_freq = (text_norm and ctx_norm.count(text_norm) >= 2)

                score = float(s_lp[si] + e_lp[ei])
                score += (-0.003 * tok_len)
                if q_has_digit:
                    score += (0.03 if span_has_digit else -0.03)
                if left_ok:
                    score += 0.02
                if right_ok:
                    score += 0.02
                if lead_p:
                    score -= 0.02
                if trail_p:
                    score -= 0.02
                if text.endswith(('।', '॥')):
                    score -= 0.03
                if ctx_freq:
                    score += 0.02

                # update with tie-breaks
                if score > best_score[ex_idx] + 1e-12:
                    replace = True
                else:
                    # if within 0.02 and substring relation, prefer shorter
                    prev = preds_text[ex_idx]
                    replace = bool(
                        abs(score - best_score[ex_idx]) <= 0.02
                        and prev
                        and (text in prev or prev in text)
                        and tok_len < best_len[ex_idx]
                    )
                if replace:
                    best_score[ex_idx] = score
                    best_len[ex_idx] = tok_len
                    preds_text[ex_idx] = text
                    preds_start[ex_idx] = stc

    # fallbacks
    # NOTE(review): `[:0]` is always the empty string, so this passes '' to
    # edge_trim for every example without a candidate. Kept as-is because the
    # intended fallback text is not known from this code.
    for i in range(N_ex):
        if preds_text[i] == '':
            preds_text[i] = edge_trim(examples_df.loc[i, 'context'][:0])
            preds_start[i] = 0
    return preds_text, preds_start, best_score

def cpu_single_pass_rank_and_submit():
    """Run one CPU inference pass at 512/160, re-rank spans, and write submission.csv.

    The module-level ``max_length`` and ``doc_stride`` are set to 512 and 160
    for the duration of the call and restored afterwards, including on error.
    Start/end logits are summed over every fold directory that loads and
    divided by the number of loaded folds; the averaged logits are passed to
    ``_pool_with_scores_strong``. ``CUDA_VISIBLE_DEVICES`` is set to an empty
    string and is not restored.

    Returns:
        The first rows (``DataFrame.head()``) of the written submission.

    Raises:
        FileNotFoundError: if neither ``xlmr_f0`` nor ``xlmr_f1`` is a directory.
        RuntimeError: if no fold model could be loaded.
    """
    global max_length, doc_stride

    # single pass len=512, stride=160; average logits over available two folds
    template = pd.read_csv('sample_submission.csv')
    id_col, pred_col = template.columns[0], template.columns[1]
    model_dirs = [name for name in ['xlmr_f0', 'xlmr_f1'] if os.path.isdir(name)]
    if len(model_dirs) == 0:
        raise FileNotFoundError('No usable fold dirs among xlmr_f0/xlmr_f1')
    print('[SINGLE] Using folds:', model_dirs, flush=True)

    # override sequence length temporarily
    saved_len, saved_stride = max_length, doc_stride
    max_length, doc_stride = 512, 160
    try:
        os.environ['CUDA_VISIBLE_DEVICES'] = ''
        cpu = torch.device('cpu')
        test_df = pd.read_csv('test.csv')
        feats = prepare_features_only_x(test_df)
        dataset = QADataset(feats, with_labels=False)
        pad_collate = DataCollatorWithPadding(tokenizer_x, pad_to_multiple_of=None)
        n_feats = len(feats['input_ids'])
        start_total = None
        end_total = None
        n_loaded = 0
        run_started = time.time()

        for fold_dir in model_dirs:
            try:
                model = AutoModelForQuestionAnswering.from_pretrained(
                    fold_dir,
                    local_files_only=True,
                    device_map='cpu',
                    torch_dtype=torch.float32,
                )
            except Exception as err:
                # Best effort: report the fold that failed to load and move on.
                print('[SINGLE] Skip', fold_dir, repr(err))
                continue
            n_loaded += 1
            model.eval()
            model.to(cpu)
            loader = torch.utils.data.DataLoader(
                dataset,
                batch_size=8,
                shuffle=False,
                collate_fn=pad_collate,
                pin_memory=False,
            )
            # Slots are filled by feature index, so rows land in feature order.
            fold_start = [None] * n_feats
            fold_end = [None] * n_feats
            fold_started = time.time()
            with torch.no_grad():
                for step, batch in enumerate(loader):
                    feat_idx = batch.pop('feat_idx').cpu().numpy()
                    for key in list(batch.keys()):
                        batch[key] = batch[key].to(cpu)
                    out = model(**batch)
                    s_batch = out.start_logits.detach().cpu().numpy()
                    e_batch = out.end_logits.detach().cpu().numpy()
                    for fi, s_row, e_row in zip(feat_idx, s_batch, e_batch):
                        slot = int(fi)
                        fold_start[slot] = s_row
                        fold_end[slot] = e_row
                    if step % 50 == 0:
                        print(f'  [SINGLE] {fold_dir} step {step}, {time.time()-fold_started:.1f}s', flush=True)

            s_arr = np.stack(fold_start, axis=0)
            e_arr = np.stack(fold_end, axis=0)
            if start_total is None:
                start_total, end_total = s_arr, e_arr
            else:
                start_total += s_arr
                end_total += e_arr
            del model, loader, fold_start, fold_end, s_arr, e_arr
            torch.cuda.empty_cache()
            gc.collect()

        if n_loaded == 0:
            raise RuntimeError('No models loaded for inference.')
        s_avg = start_total / n_loaded
        e_avg = end_total / n_loaded

        print('[SINGLE] Pooling with strong reranker...', flush=True)
        texts, _starts, _scores = _pool_with_scores_strong(
            feats, test_df, s_avg, e_avg, top_k=15, max_span_tokens=30)
        sub = pd.DataFrame({id_col: test_df[id_col].values, pred_col: texts})
        sub.to_csv('submission.csv', index=False)
        print('[SINGLE] Wrote submission.csv with', len(sub), 'rows in %.1fs' % (time.time()-run_started))
        return sub.head()
    finally:
        max_length, doc_stride = saved_len, saved_stride

# Run the single-pass re-ranked inference (it writes submission.csv as a side
# effect) and print the preview rows it returns.
head_single = cpu_single_pass_rank_and_submit()
print(head_single)

[SINGLE] Using folds: ['xlmr_f0', 'xlmr_f1']


  [SINGLE] xlmr_f0 step 0, 4.5s


  [SINGLE] xlmr_f0 step 50, 205.2s


  [SINGLE] xlmr_f0 step 100, 395.9s


  [SINGLE] xlmr_f1 step 0, 4.4s


  [SINGLE] xlmr_f1 step 50, 189.2s


  [SINGLE] xlmr_f1 step 100, 377.0s


[SINGLE] Pooling with strong reranker...


[SINGLE] Wrote submission.csv with 112 rows in 1022.3s
          id               PredictionString
0  be799d365              मुंबई, महाराष्ट्र
1  26f356026  उदासिनाचार्य सुमेरुदास महाराज
2  57a56c43f          1260 கன சென்டிமீட்டர்
3  da062fdbb                       बिंबिसार
4  72fc0d5b5                           1889
