In [1]:
# Minimal production decoder setup: inspect logits artifact and compute priors
import os, json, math, re, sys, gc, time, unicodedata as ud
from pathlib import Path
import numpy as np
import pandas as pd

print('CWD:', os.getcwd())

# Load data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
print('train/test shapes:', train.shape, test.shape, flush=True)

# Compute per-language char-length log-normal params from train
def clean_text(s: str) -> str:
    if not isinstance(s, str):
        return ''
    return s.strip()

train['answer_text'] = train['answer_text'].astype(str).map(clean_text)
train['char_len'] = train['answer_text'].str.len().clip(lower=1)
def fit_log_normal_params(df):
    x = np.log(df['char_len'].values.astype(float))
    return float(x.mean()), float(x.std() if x.std() > 1e-6 else 1e-6)

priors = {}
for lang, g in train.groupby('language'):
    mu, sigma = fit_log_normal_params(g)
    priors[lang] = {'mu': mu, 'sigma': sigma, 'n': int(len(g))}
print('Priors (log-space) by language:', priors, flush=True)

# Load strongest logits artifact
npz_path = 'xlmr_large_512_3seeds_avg.npz'
assert os.path.exists(npz_path), f'Missing {npz_path}'
npz = np.load(npz_path, allow_pickle=True)
print('NPZ keys:', list(npz.keys()))

# Try to infer shapes
shapes = {k: (npz[k].shape if hasattr(npz[k], 'shape') else type(npz[k]).__name__) for k in npz.keys()}
print('Shapes:', shapes, flush=True)

# Peek a few entries for mapping keys commonly used: example_id(s), offset_mapping, start_logits, end_logits
def safe_len(x):
    try:
        return len(x)
    except Exception:
        return None

for k in ['example_id', 'example_ids', 'example_id_list']:
    if k in npz:
        ex_ids = npz[k]
        print(k, 'len=', safe_len(ex_ids))
        break

for k in ['offset_mapping', 'offset_mappings', 'test_offset_mapping']:
    if k in npz:
        off = npz[k]
        print(k, 'dtype:', getattr(off, 'dtype', None), 'shape0:', off.shape[0] if hasattr(off,'shape') else None)
        break

for k in ['start_logits', 'test_start_logits', 'start_logits_avg']:
    if k in npz:
        sl = npz[k]
        print(k, 'shape:', getattr(sl, 'shape', None))
        break

for k in ['end_logits', 'test_end_logits', 'end_logits_avg']:
    if k in npz:
        el = npz[k]
        print(k, 'shape:', getattr(el, 'shape', None))
        break

print('Ready to implement decoding in next cell.', flush=True)

CWD: /app/agent_run_states/chaii-hindi-and-tamil-question-answering-20250924-155020


train/test shapes: (1002, 6) (112, 4)


Priors (log-space) by language: {'hindi': {'mu': 2.27352915467594, 'sigma': 0.7057322733289972, 'n': 662}, 'tamil': {'mu': 2.240137765846579, 'sigma': 0.8165516610717193, 'n': 340}}


NPZ keys: ['start', 'end']
Shapes: {'start': (1401, 512), 'end': (1401, 512)}


Ready to implement decoding in next cell.


In [5]:
# Single-stream decoder with char-length prior (lambda sweep) using xlmr_large_512_3seeds_avg.npz
import os, json, math, re, sys, gc, time, unicodedata as ud
from pathlib import Path
import numpy as np
import pandas as pd

npz_path = 'xlmr_large_512_3seeds_avg.npz'
npz = np.load(npz_path, allow_pickle=True)
start_logits = npz['start']  # (Nfeat, L)
end_logits = npz['end']      # (Nfeat, L)
Nfeat, L = start_logits.shape
print('Loaded logits:', start_logits.shape, end_logits.shape, flush=True)

# Load mapping from one seed's saved artifacts (same featureization/order)
map_dir = 'xlmr_large_512_test_logits'
eid_path = Path(map_dir) / 'test_example_id.json'
off_path = Path(map_dir) / 'test_offset_mapping.npy'
assert eid_path.exists() and off_path.exists(), 'Missing mapping files'
example_id_list = json.loads(Path(eid_path).read_text())  # list length Nfeat
offset_mapping = np.load(off_path, allow_pickle=True)     # often object array (Nfeat, L) of tuples
print('Mapping loaded:', len(example_id_list), getattr(offset_mapping, 'shape', None), 'dtype:', getattr(offset_mapping, 'dtype', None), flush=True)
assert len(example_id_list) == Nfeat

# Load test meta
test = pd.read_csv('test.csv')
id2lang = dict(zip(test['id'].tolist(), test['language'].tolist()))
id2context = dict(zip(test['id'].tolist(), test['context'].astype(str).tolist()))

# Priors computed in cell 0 (recompute here if needed)
train = pd.read_csv('train.csv')
train['answer_text'] = train['answer_text'].astype(str).str.strip()
train['char_len'] = train['answer_text'].str.len().clip(lower=1)
def fit_log_normal_params(df):
    x = np.log(df['char_len'].values.astype(float))
    mu = float(x.mean()); sd = float(x.std());
    if sd <= 1e-6: sd = 1e-6
    return mu, sd
priors = {}
for lang, g in train.groupby('language'):
    priors[lang] = dict(zip(['mu','sigma'], fit_log_normal_params(g)))
print('Priors:', priors, flush=True)

# Utility: unicode hygiene (minimal and safe)
ZW_CHARS = {
    '\u200B', '\u200C', '\u200D', '\u2060', '\ufeff',  # ZWSP, ZWNJ, ZWJ, WJ, BOM
}
NBSP_SET = {'\u00A0', '\u2002', '\u2003', '\u2004', '\u2005', '\u2006', '\u2007', '\u2008', '\u2009', '\u200A'}
HI_VIRAMA = '\u094D'
TA_PULLI = '\u0BCD'

def clean_span_text(s: str, lang: str) -> str:
    if not s:
        return ''
    # remove zero-width
    s = ''.join(ch for ch in s if ch not in ZW_CHARS)
    # collapse NBSP/thin spaces to space
    s = ''.join(' ' if ch in NBSP_SET else ch for ch in s)
    # collapse spaces
    s = re.sub(r'\s+', ' ', s).strip()
    # numeric glue inside digit runs
    s = re.sub(r'(?<=\d)[\s,._-](?=\d)', '', s)
    # drop one trailing combining mark or virama/pulli
    if s:
        last = s[-1]
        if ud.category(last) == 'Mn' or last in (HI_VIRAMA, TA_PULLI):
            s = s[:-1]
    # Hindi specific: collapse multiple dandas and strip final danda
    if lang == 'hindi':
        s = s.replace('\u0964\u0964', '\u0964')
        if s.endswith('\u0964'):
            s = s[:-1].rstrip()
    return s

# Scoring helpers
def log_normal_logpdf_len(char_len: int, mu: float, sigma: float) -> float:
    Lc = max(1, int(char_len))
    x = math.log(Lc)
    val = -0.5 * ((x - mu)/sigma)**2 - math.log(max(Lc,1e-6)) - math.log(sigma) - 0.5*math.log(2*math.pi)
    return val

def to_pair(t):
    try:
        if t is None:
            return (0, 0)
        if isinstance(t, (list, tuple, np.ndarray)):
            if len(t) >= 2:
                a = int(t[0]) if t[0] is not None else 0
                b = int(t[1]) if t[1] is not None else 0
                return (a, b)
            else:
                return (0, 0)
        # unexpected type
        return (0, 0)
    except Exception:
        return (0, 0)

def to_offs_matrix(offs_raw):
    # Convert a per-feature offset entry into an int32 (M,2) array; supports ragged/object formats
    if isinstance(offs_raw, np.ndarray) and offs_raw.ndim == 2 and offs_raw.shape[1] == 2 and offs_raw.dtype != object:
        return offs_raw.astype(np.int32, copy=False)
    try:
        pairs = [to_pair(t) for t in list(offs_raw)]
        arr = np.asarray(pairs, dtype=np.int32)
        if arr.ndim == 2 and arr.shape[1] == 2:
            return arr
    except Exception:
        pass
    return None

def decode_single(lambda_len: float, beta_freq: float = 0.0, temps=None, nbest_hi=180, nbest_ta=220, Lmax_hi=50, Lmax_ta=60, clip_prior=(-0.7, 0.0)):
    t0 = time.time()
    preds = {}  # id -> (score, text)
    counts = {'total_feats': Nfeat, 'skipped_feats': 0}
    # optional temperatures (unused by default)
    if temps is None: temps = {'hindi': (1.0, 1.0), 'tamil': (1.0, 1.0)}
    for fi in range(Nfeat):
        qid = example_id_list[fi]
        lang = id2lang[qid]
        context = id2context[qid]
        mu = priors.get(lang, {}).get('mu', 2.3 if lang=='hindi' else 2.1)
        sigma = priors.get(lang, {}).get('sigma', 0.8 if lang=='hindi' else 0.7)
        Lmax = Lmax_hi if lang == 'hindi' else Lmax_ta
        nbest = nbest_hi if lang == 'hindi' else nbest_ta
        Ts, Te = temps.get(lang, (1.0,1.0))
        s_logits_full = start_logits[fi] / Ts
        e_logits_full = end_logits[fi] / Te
        # Offsets for this feature
        offs = to_offs_matrix(offset_mapping[fi])
        if offs is None:
            counts['skipped_feats'] += 1
            continue
        M = offs.shape[0]
        # Align logits to available offsets length
        s_logits = s_logits_full[:M]
        e_logits = e_logits_full[:M]
        # Build valid mask: end > start (positive span within context); disallow specials where end==start
        valid = (offs[:,1] > offs[:,0])
        if not valid.any():
            counts['skipped_feats'] += 1
            continue
        # Top-N start indices among valid
        valid_idx = np.where(valid)[0]
        s_candidates = valid_idx[np.argsort(s_logits[valid_idx])[::-1][:nbest]]
        best_score_f = -1e18
        best_span_f = None  # (si, ei, score)
        for si in s_candidates:
            s_off = offs[si]
            if s_off[1] <= s_off[0]:
                continue
            # Bound ends: >= si and within Lmax chars
            end_range = np.arange(si, M, dtype=np.int32)
            e_offs = offs[end_range]
            clen = e_offs[:,1] - s_off[0]
            mask = (e_offs[:,1] > e_offs[:,0]) & (clen > 0) & (clen <= Lmax)
            if not mask.any():
                continue
            cand_e_idx = end_range[mask]
            raw_scores = s_logits[si] + e_logits[cand_e_idx]
            # length prior
            if lambda_len > 0.0:
                clen_valid = (offs[cand_e_idx][:,1] - s_off[0]).astype(int)
                lp = np.array([log_normal_logpdf_len(int(c), mu, sigma) for c in clen_valid], dtype=np.float64)
                lp = np.clip(lp, clip_prior[0], clip_prior[1])
                raw_scores = raw_scores + lambda_len * lp
            # optional frequency prior (compute only on top few)
            if beta_freq > 0.0:
                order = np.argsort(raw_scores)[::-1][:8]
                for idx in order:
                    ei = int(cand_e_idx[idx])
                    a = int(offs[si][0]); b = int(offs[ei][1])
                    if not (0 <= a < len(context)) or not (0 < b <= len(context)) or a >= b:
                        continue
                    span_text = context[a:b]
                    freq = context.count(span_text)
                    raw_scores[idx] += beta_freq * math.log(1 + freq)
            # take best end
            best_idx = int(np.argmax(raw_scores))
            ei = int(cand_e_idx[best_idx])
            score = float(np.max(raw_scores))
            if score > best_score_f:
                best_score_f = score
                best_span_f = (si, ei, score)
        if best_span_f is None:
            continue
        si, ei, sc = best_span_f
        a = int(offs[si][0]); b = int(offs[ei][1])
        # Sanity to ensure offsets within context
        if not (0 <= a < len(context)) or not (0 < b <= len(context)) or a >= b:
            # Fallback: pick maximal logit end within cap ignoring offsets (should rarely happen)
            ei = int(np.argmax(e_logits))
            si = int(np.argmax(s_logits[:ei+1]))
            a = int(offs[si][0]); b = int(offs[ei][1])
            a = max(0, min(a, len(context)-1))
            b = max(a+1, min(b, len(context)))
        text = context[a:b]
        text = clean_span_text(text, lang)
        if not text:
            # force non-empty by expanding one char to the right within cap
            b = min(len(context), a + 1)
            text = clean_span_text(context[a:b], lang) or context[a:b]
        # Keep best across features for this example id
        prev = preds.get(qid)
        if (prev is None) or (sc > prev[0]):
            preds[qid] = (sc, text)
        if (fi+1) % 200 == 0:
            print(f'Processed feat {fi+1}/{Nfeat} (elapsed {time.time()-t0:.1f}s)')
    # Build submission DataFrame
    out = []
    for qid in test['id'].tolist():
        if qid in preds:
            out.append((qid, preds[qid][1]))
        else:
            # extremely rare: fallback blank -> use first Lmax chars of context
            lang = id2lang[qid]; ctx = id2context[qid]; cap = Lmax_hi if lang=='hindi' else Lmax_ta
            fallback = clean_span_text(ctx[:max(1, min(cap, len(ctx)))], lang)
            out.append((qid, fallback if fallback else ctx[:1]))
    sub = pd.DataFrame(out, columns=['id','PredictionString'])
    # Diagnostics
    empties = (sub['PredictionString'].astype(str).str.len() == 0).sum()
    mean_len = sub['PredictionString'].astype(str).str.len().mean()
    print('Decode done. Empties:', empties, 'Mean char len:', round(mean_len,2))
    return sub

# Run variants
variants = [
    ('A_lenprior015', dict(lambda_len=0.15, beta_freq=0.0, nbest_hi=180, nbest_ta=220, Lmax_hi=50, Lmax_ta=60)),
    ('B_lenprior030', dict(lambda_len=0.30, beta_freq=0.0, nbest_hi=180, nbest_ta=220, Lmax_hi=50, Lmax_ta=60)),
    ('C_lenprior000', dict(lambda_len=0.00, beta_freq=0.0, nbest_hi=180, nbest_ta=220, Lmax_hi=50, Lmax_ta=60)),
]

subs = {}
for tag, kwargs in variants:
    print(f'Running variant {tag} with kwargs:', kwargs, flush=True)
    sub = decode_single(**kwargs)
    out_name = f'submission_512only_lenprior_{tag}.csv'
    sub.to_csv(out_name, index=False)
    subs[tag] = out_name
    print('Wrote', out_name, flush=True)

# Set primary submission.csv to Variant A
primary = subs['A_lenprior015']
pd.read_csv(primary).to_csv('submission.csv', index=False)
print('submission.csv updated ->', primary, flush=True)

Loaded logits: (1401, 512) (1401, 512)


Mapping loaded: 1401 (1401, 512) dtype: object


Priors: {'hindi': {'mu': 2.27352915467594, 'sigma': 0.7057322733289972}, 'tamil': {'mu': 2.240137765846579, 'sigma': 0.8165516610717193}}


Running variant A_lenprior015 with kwargs: {'lambda_len': 0.15, 'beta_freq': 0.0, 'nbest_hi': 180, 'nbest_ta': 220, 'Lmax_hi': 50, 'Lmax_ta': 60}


Processed feat 200/1401 (elapsed 2.7s)


Processed feat 400/1401 (elapsed 5.3s)


Processed feat 600/1401 (elapsed 7.9s)


Processed feat 800/1401 (elapsed 10.5s)


Processed feat 1000/1401 (elapsed 13.1s)


Processed feat 1200/1401 (elapsed 15.8s)


Processed feat 1400/1401 (elapsed 18.6s)
Decode done. Empties: 0 Mean char len: 10.67
Wrote submission_512only_lenprior_A_lenprior015.csv


Running variant B_lenprior030 with kwargs: {'lambda_len': 0.3, 'beta_freq': 0.0, 'nbest_hi': 180, 'nbest_ta': 220, 'Lmax_hi': 50, 'Lmax_ta': 60}


Processed feat 200/1401 (elapsed 2.8s)


Processed feat 400/1401 (elapsed 5.6s)


Processed feat 600/1401 (elapsed 8.4s)


Processed feat 800/1401 (elapsed 11.2s)


Processed feat 1000/1401 (elapsed 13.9s)


Processed feat 1200/1401 (elapsed 16.5s)


Processed feat 1400/1401 (elapsed 19.4s)
Decode done. Empties: 0 Mean char len: 10.67
Wrote submission_512only_lenprior_B_lenprior030.csv


Running variant C_lenprior000 with kwargs: {'lambda_len': 0.0, 'beta_freq': 0.0, 'nbest_hi': 180, 'nbest_ta': 220, 'Lmax_hi': 50, 'Lmax_ta': 60}


Processed feat 200/1401 (elapsed 1.3s)


Processed feat 400/1401 (elapsed 2.5s)


Processed feat 600/1401 (elapsed 3.9s)


Processed feat 800/1401 (elapsed 5.2s)


Processed feat 1000/1401 (elapsed 6.5s)


Processed feat 1200/1401 (elapsed 7.8s)


Processed feat 1400/1401 (elapsed 9.1s)
Decode done. Empties: 0 Mean char len: 10.67
Wrote submission_512only_lenprior_C_lenprior000.csv


submission.csv updated -> submission_512only_lenprior_A_lenprior015.csv


In [6]:
# Char-level fusion decoder (single-stream 512 3-seed), lambda=0.15 primary
import numpy as np, pandas as pd, time, math, json, re, unicodedata as ud
from pathlib import Path

npz_path = 'xlmr_large_512_3seeds_avg.npz'
npz = np.load(npz_path, allow_pickle=True)
start_logits = npz['start']
end_logits = npz['end']
Nfeat, Ltok = start_logits.shape

map_dir = 'xlmr_large_512_test_logits'
eid_path = Path(map_dir) / 'test_example_id.json'
off_path = Path(map_dir) / 'test_offset_mapping.npy'
example_id_list = json.loads(Path(eid_path).read_text())
offset_mapping = np.load(off_path, allow_pickle=True)

test = pd.read_csv('test.csv')
id2lang = dict(zip(test['id'].tolist(), test['language'].tolist()))
id2context = dict(zip(test['id'].tolist(), test['context'].astype(str).tolist()))

train = pd.read_csv('train.csv')
train['answer_text'] = train['answer_text'].astype(str).str.strip()
train['char_len'] = train['answer_text'].str.len().clip(lower=1)
def fit_log_normal_params(df):
    x = np.log(df['char_len'].values.astype(float))
    mu = float(x.mean()); sd = float(x.std());
    if sd <= 1e-6: sd = 1e-6
    return mu, sd
priors = {lang: dict(zip(['mu','sigma'], fit_log_normal_params(g))) for lang, g in train.groupby('language')}

ZW_CHARS = {'\u200B','\u200C','\u200D','\u2060','\ufeff'}
NBSP_SET = {'\u00A0','\u2002','\u2003','\u2004','\u2005','\u2006','\u2007','\u2008','\u2009','\u200A'}
HI_VIRAMA = '\u094D'
TA_PULLI = '\u0BCD'
def clean_span_text(s: str, lang: str) -> str:
    if not s: return ''
    s = ''.join(ch for ch in s if ch not in ZW_CHARS)
    s = ''.join(' ' if ch in NBSP_SET else ch for ch in s)
    s = re.sub(r'\s+', ' ', s).strip()
    s = re.sub(r'(?<=\d)[\s,._-](?=\d)', '', s)
    if s:
        last = s[-1]
        if ud.category(last) == 'Mn' or last in (HI_VIRAMA, TA_PULLI):
            s = s[:-1]
    if lang == 'hindi':
        s = s.replace('\u0964\u0964','\u0964')
        if s.endswith('\u0964'): s = s[:-1].rstrip()
    return s

def log_normal_logpdf_len(char_len: int, mu: float, sigma: float) -> float:
    Lc = max(1, int(char_len))
    x = math.log(Lc)
    return -0.5*((x-mu)/sigma)**2 - math.log(max(Lc,1e-6)) - math.log(sigma) - 0.5*math.log(2*math.pi)

def to_pair(t):
    try:
        if t is None: return (0,0)
        if isinstance(t,(list,tuple,np.ndarray)) and len(t)>=2:
            a = int(t[0]) if t[0] is not None else 0
            b = int(t[1]) if t[1] is not None else 0
            return (a,b)
    except Exception:
        pass
    return (0,0)

# Build index lists per example
qid_to_feat_idx = {}
for i, qid in enumerate(example_id_list):
    qid_to_feat_idx.setdefault(qid, []).append(i)

def maxpool1d(x):
    if len(x) == 0: return x
    y = x.copy()
    if len(x) == 1: return y
    # 1D max-pool with kernel=3, stride=1, padding=1
    y[0] = max(x[0], x[1])
    for i in range(1, len(x)-1):
        y[i] = max(x[i-1], x[i], x[i+1])
    y[-1] = max(x[-2], x[-1])
    return y

def decode_charfusion(lambda_len=0.15, clip_prior=(-0.7,0.0), nbest_hi=200, nbest_ta=240, Lmax_hi=50, Lmax_ta=60, do_pool=True):
    t0 = time.time()
    out_rows = []
    for qid in test['id'].tolist():
        lang = id2lang[qid]
        ctx = id2context[qid]
        mu = priors.get(lang, {}).get('mu', 2.3 if lang=='hindi' else 2.1)
        sigma = priors.get(lang, {}).get('sigma', 0.8 if lang=='hindi' else 0.7)
        Lmax = Lmax_hi if lang=='hindi' else Lmax_ta
        K = nbest_hi if lang=='hindi' else nbest_ta
        S = np.zeros(len(ctx), dtype=np.float32)
        E = np.zeros(len(ctx), dtype=np.float32)
        idxs = qid_to_feat_idx.get(qid, [])
        for fi in idxs:
            offs_raw = offset_mapping[fi]
            # offs_raw is (Ltok,) of pairs
            # Align token logits length to offs length
            M = len(offs_raw) if hasattr(offs_raw,'__len__') else Ltok
            s_log = start_logits[fi][:M]
            e_log = end_logits[fi][:M]
            # accumulate
            for ti in range(M):
                a, b = to_pair(offs_raw[ti])
                if b > a and 0 <= a < len(ctx) and 1 <= b <= len(ctx):
                    S[a] += s_log[ti]
                    E[b-1] += e_log[ti]
        if do_pool:
            S = maxpool1d(S)
            E = maxpool1d(E)
        # Top-K starts
        if len(S) == 0:
            out_rows.append((qid, ctx[:1]))
            continue
        starts = np.argsort(S)[::-1][:K]
        best_score = -1e18
        best_span = (0, max(0, min(Lmax, len(ctx))-1))
        for si in starts:
            end_lo = si
            end_hi = min(len(ctx)-1, si + Lmax - 1)
            if end_hi < end_lo: continue
            # choose best end by E
            seg = E[end_lo:end_hi+1]
            ej_rel = int(np.argmax(seg))
            ej = end_lo + ej_rel
            raw = float(S[si] + E[ej])
            if lambda_len > 0.0:
                clen = ej - si + 1
                lp = log_normal_logpdf_len(clen, mu, sigma)
                lp = max(clip_prior[0], min(clip_prior[1], lp))
                raw += lambda_len * lp
            if raw > best_score:
                best_score = raw
                best_span = (si, ej)
        a, b = best_span
        text = ctx[a:b+1]
        text = clean_span_text(text, lang)
        if not text:
            b2 = min(len(ctx), a+1)
            text = clean_span_text(ctx[a:b2], lang) or ctx[a:b2]
        out_rows.append((qid, text))
    sub = pd.DataFrame(out_rows, columns=['id','PredictionString'])
    empties = (sub['PredictionString'].astype(str).str.len()==0).sum()
    mean_len = sub['PredictionString'].astype(str).str.len().mean()
    print(f'Char-fusion decode done in {time.time()-t0:.1f}s. Empties={empties}, mean_len={mean_len:.2f}')
    return sub

# Run char-fusion primary and write submission
sub_char = decode_charfusion(lambda_len=0.15, nbest_hi=200, nbest_ta=240, Lmax_hi=50, Lmax_ta=60, do_pool=True)
out_name = 'submission_charfusion_512_lambda015.csv'
sub_char.to_csv(out_name, index=False)
pd.read_csv(out_name).to_csv('submission.csv', index=False)
print('submission.csv updated ->', out_name)

Char-fusion decode done in 2.2s. Empties=0, mean_len=10.67
submission.csv updated -> submission_charfusion_512_lambda015.csv


In [10]:
# Multi-stream char-level fusion: 512(3seeds)+384(+MuRIL for Hindi) with per-language weights, lambda=0.15
import json, math, time, re, unicodedata as ud
import numpy as np, pandas as pd
from pathlib import Path

# Common resources
test = pd.read_csv('test.csv')
id2lang = dict(zip(test['id'].tolist(), test['language'].tolist()))
id2context = dict(zip(test['id'].tolist(), test['context'].astype(str).tolist()))
train = pd.read_csv('train.csv')
train['answer_text'] = train['answer_text'].astype(str).str.strip()
train['char_len'] = train['answer_text'].str.len().clip(lower=1)
def fit_log_normal_params(df):
    x = np.log(df['char_len'].values.astype(float))
    mu = float(x.mean()); sd = float(x.std());
    if sd <= 1e-6: sd = 1e-6
    return mu, sd
priors = {lang: dict(zip(['mu','sigma'], fit_log_normal_params(g))) for lang, g in train.groupby('language')}

ZW_CHARS = {'\u200B','\u200C','\u200D','\u2060','\ufeff'}
NBSP_SET = {'\u00A0','\u2002','\u2003','\u2004','\u2005','\u2006','\u2007','\u2008','\u2009','\u200A'}
HI_VIRAMA = '\u094D'
TA_PULLI = '\u0BCD'
def clean_span_text(s: str, lang: str) -> str:
    if not s: return ''
    s = ''.join(ch for ch in s if ch not in ZW_CHARS)
    s = ''.join(' ' if ch in NBSP_SET else ch for ch in s)
    s = re.sub(r'\s+', ' ', s).strip()
    s = re.sub(r'(?<=\d)[\s,._-](?=\d)', '', s)
    if s:
        last = s[-1]
        if ud.category(last) == 'Mn' or last in (HI_VIRAMA, TA_PULLI): s = s[:-1]
    if lang == 'hindi':
        s = s.replace('\u0964\u0964','\u0964')
        if s.endswith('\u0964'): s = s[:-1].rstrip()
    return s

def log_normal_logpdf_len(char_len: int, mu: float, sigma: float) -> float:
    Lc = max(1, int(char_len))
    x = math.log(Lc)
    return -0.5*((x-mu)/sigma)**2 - math.log(max(Lc,1e-6)) - math.log(sigma) - 0.5*math.log(2*math.pi)

def to_pair(t):
    try:
        if t is None: return (0,0)
        if isinstance(t,(list,tuple,np.ndarray)) and len(t)>=2:
            a = int(t[0]) if t[0] is not None else 0
            b = int(t[1]) if t[1] is not None else 0
            return (a,b)
    except Exception:
        pass
    return (0,0)

def load_npz_logits(npz_path):
    arr = np.load(npz_path, allow_pickle=True)
    keys = list(arr.keys())
    if 'start' in keys and 'end' in keys:
        return arr['start'], arr['end']
    # fallback key names
    for sk in ['start_logits','test_start_logits','start_logits_avg']:
        for ek in ['end_logits','test_end_logits','end_logits_avg']:
            if sk in keys and ek in keys:
                return arr[sk], arr[ek]
    raise ValueError(f'Unknown keys in {npz_path}: {keys}')

def maxpool1d(x):
    if len(x) == 0: return x
    y = x.copy()
    if len(x) == 1: return y
    y[0] = max(x[0], x[1])
    for i in range(1, len(x)-1): y[i] = max(x[i-1], x[i], x[i+1])
    y[-1] = max(x[-2], x[-1])
    return y

# Streams config: paths and map dirs
streams = [
    dict(name='xlmr512', npz='xlmr_large_512_3seeds_avg.npz', map_dir='xlmr_large_512_test_logits'),
    dict(name='xlmr384', npz='xlmr_large_test_avg.npz', map_dir='xlmr_large_test_logits'),
    dict(name='muril',   npz='muril_large_test_avg.npz',   map_dir='muril_large_test_logits'),
]

# Per-language weights
weights_hi = {'xlmr512': 0.80, 'xlmr384': 0.15, 'muril': 0.05}
weights_ta = {'xlmr512': 0.95, 'xlmr384': 0.05, 'muril': 0.00}

# Load all streams
loaded = []
for s in streams:
    try:
        s_start, s_end = load_npz_logits(s['npz'])
        eid_path = Path(s['map_dir']) / 'test_example_id.json'
        offs_path = Path(s['map_dir']) / 'test_offset_mapping.npy'
        eid = json.loads(eid_path.read_text())
        offs = np.load(offs_path, allow_pickle=True)
        loaded.append((s['name'], s_start, s_end, eid, offs))
        print('Loaded stream', s['name'], s_start.shape, 'features and offsets shape', getattr(offs,'shape',None))
    except Exception as e:
        print('Skip stream due to load error:', s['name'], e)

def decode_charfusion_multistream(lambda_len=0.15, clip_prior=(-0.7,0.0), nbest_hi=200, nbest_ta=240, Lmax_hi=50, Lmax_ta=60, do_pool=True):
    t0 = time.time()
    # Build index per stream: qid -> feat idx list
    idx_maps = []
    for name, s_start, s_end, eid, offs in loaded:
        m = {}
        for i, qid in enumerate(eid):
            m.setdefault(qid, []).append(i)
        idx_maps.append(m)
    out_rows = []
    for qid in test['id'].tolist():
        lang = id2lang[qid]
        ctx = id2context[qid]
        mu = priors.get(lang, {}).get('mu', 2.3 if lang=='hindi' else 2.1)
        sigma = priors.get(lang, {}).get('sigma', 0.8 if lang=='hindi' else 0.7)
        Lmax = Lmax_hi if lang=='hindi' else Lmax_ta
        K = nbest_hi if lang=='hindi' else nbest_ta
        S = np.zeros(len(ctx), dtype=np.float32)
        E = np.zeros(len(ctx), dtype=np.float32)
        for (name, s_start, s_end, eid, offs), m in zip(loaded, idx_maps):
            w = (weights_hi if lang=='hindi' else weights_ta).get(name, 0.0)
            if w <= 0.0: continue
            feat_idx = m.get(qid, [])
            for fi in feat_idx:
                offs_raw = offs[fi]
                M = len(offs_raw) if hasattr(offs_raw,'__len__') else s_start.shape[1]
                s_log = s_start[fi][:M] * w
                e_log = s_end[fi][:M] * w
                for ti in range(M):
                    a, b = to_pair(offs_raw[ti])
                    if b > a and 0 <= a < len(ctx) and 1 <= b <= len(ctx):
                        S[a] += s_log[ti]
                        E[b-1] += e_log[ti]
        if do_pool:
            S = maxpool1d(S); E = maxpool1d(E)
        # Decode
        starts = np.argsort(S)[::-1][:K]
        best_score = -1e18
        best_span = (0, max(0, min(Lmax, len(ctx))-1))
        for si in starts:
            end_lo = si; end_hi = min(len(ctx)-1, si + Lmax - 1)
            if end_hi < end_lo: continue
            seg = E[end_lo:end_hi+1]
            ej = end_lo + int(np.argmax(seg))
            raw = float(S[si] + E[ej])
            if lambda_len > 0.0:
                clen = ej - si + 1
                lp = log_normal_logpdf_len(clen, mu, sigma)
                lp = max(clip_prior[0], min(clip_prior[1], lp))
                raw += lambda_len * lp
            if raw > best_score:
                best_score = raw; best_span = (si, ej)
        a, b = best_span
        text = clean_span_text(ctx[a:b+1], lang)
        if not text:
            b2 = min(len(ctx), a+1)
            text = clean_span_text(ctx[a:b2], lang) or ctx[a:b2]
        out_rows.append((qid, text))
    sub = pd.DataFrame(out_rows, columns=['id','PredictionString'])
    empties = (sub['PredictionString'].astype(str).str.len()==0).sum()
    mean_len = sub['PredictionString'].astype(str).str.len().mean()
    print(f'Multi-stream char-fusion done in {time.time()-t0:.1f}s. Empties={empties}, mean_len={mean_len:.2f}')
    return sub

# Build submission
sub_ms = decode_charfusion_multistream(lambda_len=0.15, nbest_hi=200, nbest_ta=240, Lmax_hi=50, Lmax_ta=60, do_pool=True)
out_ms = 'submission_charfusion_multistream_512_384_muril_hi80_15_5_ta95_5.csv'
sub_ms.to_csv(out_ms, index=False)
pd.read_csv(out_ms).to_csv('submission.csv', index=False)
print('submission.csv updated ->', out_ms)

Loaded stream xlmr512 (1401, 512) features and offsets shape (1401, 512)


Loaded stream xlmr384 (1921, 384) features and offsets shape (1921, 384)


Loaded stream muril (1513, 384) features and offsets shape (1513, 384)


Multi-stream char-fusion done in 3.8s. Empties=0, mean_len=10.03
submission.csv updated -> submission_charfusion_multistream_512_384_muril_hi80_15_5_ta95_5.csv


In [11]:
# Expert-recommended final variants: Primary (multistream+snap) and Safety (512-only+freq)
import json, math, time, re, unicodedata as ud
import numpy as np, pandas as pd
from pathlib import Path

# Common data
test = pd.read_csv('test.csv')
id2lang = dict(zip(test['id'].tolist(), test['language'].tolist()))
id2context = dict(zip(test['id'].tolist(), test['context'].astype(str).tolist()))
train = pd.read_csv('train.csv')
train['answer_text'] = train['answer_text'].astype(str).str.strip()
train['char_len'] = train['answer_text'].str.len().clip(lower=1)
def fit_log_normal_params(df):
    x = np.log(df['char_len'].values.astype(float))
    mu = float(x.mean()); sd = float(x.std());
    if sd <= 1e-6: sd = 1e-6
    return mu, sd
priors = {lang: dict(zip(['mu','sigma'], fit_log_normal_params(g))) for lang, g in train.groupby('language')}

ZW_CHARS = {'\u200B','\u200C','\u200D','\u2060','\ufeff'}
NBSP_SET = {'\u00A0','\u2002','\u2003','\u2004','\u2005','\u2006','\u2007','\u2008','\u2009','\u200A'}
HI_VIRAMA = '\u094D'
TA_PULLI = '\u0BCD'
DANDA = '\u0964'
def clean_span_text(s: str, lang: str) -> str:
    if not s: return ''
    s = ''.join(ch for ch in s if ch not in ZW_CHARS)
    s = ''.join(' ' if ch in NBSP_SET else ch for ch in s)
    s = re.sub(r'\s+', ' ', s).strip()
    s = re.sub(r'(?<=\d)[\s,._-](?=\d)', '', s)
    if s:
        last = s[-1]
        if ud.category(last) == 'Mn' or last in (HI_VIRAMA, TA_PULLI): s = s[:-1]
    if lang == 'hindi':
        s = s.replace(DANDA+DANDA, DANDA)
        if s.endswith(DANDA): s = s[:-1].rstrip()
    return s

def log_normal_logpdf_len(char_len: int, mu: float, sigma: float) -> float:
    Lc = max(1, int(char_len))
    x = math.log(Lc)
    return -0.5*((x-mu)/sigma)**2 - math.log(max(Lc,1e-6)) - math.log(sigma) - 0.5*math.log(2*math.pi)

def to_pair(t):
    try:
        if t is None: return (0,0)
        if isinstance(t,(list,tuple,np.ndarray)) and len(t)>=2:
            a = int(t[0]) if t[0] is not None else 0
            b = int(t[1]) if t[1] is not None else 0
            return (a,b)
    except Exception:
        pass
    return (0,0)

def load_npz_logits(npz_path):
    arr = np.load(npz_path, allow_pickle=True)
    keys = list(arr.keys())
    if 'start' in keys and 'end' in keys:
        return arr['start'], arr['end']
    for sk in ['start_logits','test_start_logits','start_logits_avg']:
        for ek in ['end_logits','test_end_logits','end_logits_avg']:
            if sk in keys and ek in keys:
                return arr[sk], arr[ek]
    raise ValueError(f'Unknown keys in {npz_path}: {keys}')

def maxpool1d(x):
    if len(x) == 0: return x
    y = x.copy()
    if len(x) == 1: return y
    y[0] = max(x[0], x[1])
    for i in range(1, len(x)-1): y[i] = max(x[i-1], x[i], x[i+1])
    y[-1] = max(x[-2], x[-1])
    return y

def is_boundary_char(ch: str) -> bool:
    return ch.isspace() or ch in {'.', ',', '!', '?', DANDA}

def snap_span(ctx: str, S: np.ndarray, E: np.ndarray, a: int, b: int, delta: float = 0.04):
    base = float(S[a] + E[b])
    # find previous boundary for left
    a2 = a
    for i in range(a-1, -1, -1):
        if is_boundary_char(ctx[i]):
            a2 = i+1
            break
    # find next boundary for right (b inclusive index for E)
    b2 = b
    for j in range(b, len(ctx)):
        if is_boundary_char(ctx[j]):
            b2 = max(a2, j-1)
            break
    # evaluate snapped vs original
    cand = float(S[a2] + E[b2])
    if cand >= base - delta:
        return a2, b2
    return a, b

# Load streams
streams = [
    dict(name='xlmr512', npz='xlmr_large_512_3seeds_avg.npz', map_dir='xlmr_large_512_test_logits'),
    dict(name='xlmr384', npz='xlmr_large_test_avg.npz', map_dir='xlmr_large_test_logits'),
    dict(name='muril',   npz='muril_large_test_avg.npz',   map_dir='muril_large_test_logits'),
]
loaded = []
for s in streams:
    try:
        s_start, s_end = load_npz_logits(s['npz'])
        eid_path = Path(s['map_dir']) / 'test_example_id.json'
        offs_path = Path(s['map_dir']) / 'test_offset_mapping.npy'
        eid = json.loads(eid_path.read_text())
        offs = np.load(offs_path, allow_pickle=True)
        loaded.append((s['name'], s_start, s_end, eid, offs))
        print('Loaded', s['name'], s_start.shape, 'offsets', getattr(offs,'shape',None))
    except Exception as e:
        print('Skip stream', s['name'], '->', e)

def decode_primary_multistream():
    # Weights per language
    weights_hi = {'xlmr512': 0.85, 'xlmr384': 0.10, 'muril': 0.05}
    weights_ta = {'xlmr512': 0.97, 'xlmr384': 0.03, 'muril': 0.00}
    lambda_len = 0.15
    K_hi, K_ta = 210, 250
    Lmax_hi, Lmax_ta = 52, 62
    t0 = time.time()
    # Build per-stream index maps
    idx_maps = []
    for name, s_start, s_end, eid, offs in loaded:
        m = {}
        for i, qid in enumerate(eid): m.setdefault(qid, []).append(i)
        idx_maps.append(m)
    out_rows = []
    for qid in test['id'].tolist():
        lang = id2lang[qid]; ctx = id2context[qid]
        mu = priors.get(lang, {}).get('mu', 2.3 if lang=='hindi' else 2.1)
        sigma = priors.get(lang, {}).get('sigma', 0.8 if lang=='hindi' else 0.7)
        Lmax = Lmax_hi if lang=='hindi' else Lmax_ta
        K = K_hi if lang=='hindi' else K_ta
        S = np.zeros(len(ctx), dtype=np.float32)
        E = np.zeros(len(ctx), dtype=np.float32)
        for (name, s_start, s_end, eid, offs), m in zip(loaded, idx_maps):
            w = (weights_hi if lang=='hindi' else weights_ta).get(name, 0.0)
            if w <= 0.0: continue
            feat_idx = m.get(qid, [])
            for fi in feat_idx:
                offs_raw = offs[fi]
                M = len(offs_raw) if hasattr(offs_raw,'__len__') else s_start.shape[1]
                s_log = s_start[fi][:M] * w
                e_log = s_end[fi][:M] * w
                for ti in range(M):
                    a, b = to_pair(offs_raw[ti])
                    if b > a and 0 <= a < len(ctx) and 1 <= b <= len(ctx):
                        S[a] += s_log[ti]
                        E[b-1] += e_log[ti]
        S = maxpool1d(S); E = maxpool1d(E)
        starts = np.argsort(S)[::-1][:K]
        best_score = -1e18
        best_span = (0, max(0, min(Lmax, len(ctx))-1))
        for si in starts:
            end_lo = si; end_hi = min(len(ctx)-1, si + Lmax - 1)
            if end_hi < end_lo: continue
            seg = E[end_lo:end_hi+1]
            ej = end_lo + int(np.argmax(seg))
            raw = float(S[si] + E[ej])
            if lambda_len > 0.0:
                clen = ej - si + 1
                lp = log_normal_logpdf_len(clen, mu, sigma)
                lp = max(-0.7, min(0.0, lp))
                raw += lambda_len * lp
            if raw > best_score:
                best_score = raw; best_span = (si, ej)
        a, b = best_span
        a, b = snap_span(ctx, S, E, a, b, delta=0.04)
        text = clean_span_text(ctx[a:b+1], lang)
        if not text:
            b2 = min(len(ctx), a+1)
            text = clean_span_text(ctx[a:b2], lang) or ctx[a:b2]
        out_rows.append((qid, text))
    sub = pd.DataFrame(out_rows, columns=['id','PredictionString'])
    empties = (sub['PredictionString'].astype(str).str.len()==0).sum()
    mean_len = sub['PredictionString'].astype(str).str.len().mean()
    print(f'Primary multistream+snap done in {time.time()-t0:.1f}s. Empties={empties}, mean_len={mean_len:.2f}')
    return sub

def decode_safety_512only():
    # Use only xlmr512
    lambda_len = 0.10; beta = 0.10
    K_hi, K_ta = 210, 250
    Lmax_hi, Lmax_ta = 48, 58
    t0 = time.time()
    # find 512 stream
    s512 = None
    for tpl in loaded:
        if tpl[0] == 'xlmr512': s512 = tpl
    assert s512 is not None, 'xlmr512 stream missing'
    name, s_start, s_end, eid, offs_all = s512
    # index map
    m = {}
    for i, qid in enumerate(eid): m.setdefault(qid, []).append(i)
    out_rows = []
    for qid in test['id'].tolist():
        lang = id2lang[qid]; ctx = id2context[qid]
        mu = priors.get(lang, {}).get('mu', 2.3 if lang=='hindi' else 2.1)
        sigma = priors.get(lang, {}).get('sigma', 0.8 if lang=='hindi' else 0.7)
        Lmax = Lmax_hi if lang=='hindi' else Lmax_ta
        K = K_hi if lang=='hindi' else K_ta
        S = np.zeros(len(ctx), dtype=np.float32)
        E = np.zeros(len(ctx), dtype=np.float32)
        for fi in m.get(qid, []):
            offs_raw = offs_all[fi]
            M = len(offs_raw) if hasattr(offs_raw,'__len__') else s_start.shape[1]
            s_log = s_start[fi][:M]
            e_log = s_end[fi][:M]
            for ti in range(M):
                a, b = to_pair(offs_raw[ti])
                if b > a and 0 <= a < len(ctx) and 1 <= b <= len(ctx):
                    S[a] += s_log[ti]
                    E[b-1] += e_log[ti]
        S = maxpool1d(S); E = maxpool1d(E)
        starts = np.argsort(S)[::-1][:K]
        best_score = -1e18
        best_span = (0, max(0, min(Lmax, len(ctx))-1))
        for si in starts:
            end_lo = si; end_hi = min(len(ctx)-1, si + Lmax - 1)
            if end_hi < end_lo: continue
            seg = E[end_lo:end_hi+1]
            # shortlist top ends (8-10)
            top = np.argsort(seg)[::-1][:10]
            for rel in top:
                ej = end_lo + int(rel)
                raw = float(S[si] + E[ej])
                clen = ej - si + 1
                if lambda_len > 0.0:
                    lp = log_normal_logpdf_len(clen, mu, sigma)
                    lp = max(-0.7, min(0.0, lp))
                    raw += lambda_len * lp
                # frequency prior on shortlist
                span_text = ctx[si:ej+1]
                if span_text:
                    raw += beta * math.log(1 + ctx.count(span_text))
                if raw > best_score:
                    best_score = raw; best_span = (si, ej)
        a, b = best_span
        text = clean_span_text(ctx[a:b+1], lang)
        if not text:
            b2 = min(len(ctx), a+1)
            text = clean_span_text(ctx[a:b2], lang) or ctx[a:b2]
        out_rows.append((qid, text))
    sub = pd.DataFrame(out_rows, columns=['id','PredictionString'])
    empties = (sub['PredictionString'].astype(str).str.len()==0).sum()
    mean_len = sub['PredictionString'].astype(str).str.len().mean()
    print(f'Safety 512-only+freq done in {time.time()-t0:.1f}s. Empties={empties}, mean_len={mean_len:.2f}')
    return sub

# Build both submissions
sub_primary = decode_primary_multistream()
primary_path = 'submission_primary_multistream_snap_hi85_10_5_ta97_3_0_lambda015.csv'
sub_primary.to_csv(primary_path, index=False)
sub_safety = decode_safety_512only()
safety_path = 'submission_safety_512only_lambda010_freq010.csv'
sub_safety.to_csv(safety_path, index=False)
# Set primary as submission.csv
pd.read_csv(primary_path).to_csv('submission.csv', index=False)
print('submission.csv updated ->', primary_path)

Loaded xlmr512 (1401, 512) offsets (1401, 512)


Loaded xlmr384 (1921, 384) offsets (1921, 384)


Loaded muril (1513, 384) offsets (1513, 384)


Primary multistream+snap done in 3.8s. Empties=0, mean_len=10.55


Safety 512-only+freq done in 4.6s. Empties=0, mean_len=13.18
submission.csv updated -> submission_primary_multistream_snap_hi85_10_5_ta97_3_0_lambda015.csv


In [12]:
# Switch submission.csv to Safety variant
import pandas as pd
safety_path = 'submission_safety_512only_lambda010_freq010.csv'
pd.read_csv(safety_path).to_csv('submission.csv', index=False)
print('submission.csv updated ->', safety_path)

submission.csv updated -> submission_safety_512only_lambda010_freq010.csv


In [13]:
# Optional third variant: Multi-stream, drop MuRIL entirely + snap (lambda=0.10)
import json, math, time, re, unicodedata as ud
import numpy as np, pandas as pd
from pathlib import Path

test = pd.read_csv('test.csv')
id2lang = dict(zip(test['id'].tolist(), test['language'].tolist()))
id2context = dict(zip(test['id'].tolist(), test['context'].astype(str).tolist()))
train = pd.read_csv('train.csv')
train['answer_text'] = train['answer_text'].astype(str).str.strip()
train['char_len'] = train['answer_text'].str.len().clip(lower=1)
def fit_log_normal_params(df):
    x = np.log(df['char_len'].values.astype(float))
    mu = float(x.mean()); sd = float(x.std());
    if sd <= 1e-6: sd = 1e-6
    return mu, sd
priors = {lang: dict(zip(['mu','sigma'], fit_log_normal_params(g))) for lang, g in train.groupby('language')}

ZW_CHARS = {'\u200B','\u200C','\u200D','\u2060','\ufeff'}
NBSP_SET = {'\u00A0','\u2002','\u2003','\u2004','\u2005','\u2006','\u2007','\u2008','\u2009','\u200A'}
HI_VIRAMA = '\u094D'
TA_PULLI = '\u0BCD'
DANDA = '\u0964'
def clean_span_text(s: str, lang: str) -> str:
    if not s: return ''
    s = ''.join(ch for ch in s if ch not in ZW_CHARS)
    s = ''.join(' ' if ch in NBSP_SET else ch for ch in s)
    s = re.sub(r'\s+', ' ', s).strip()
    s = re.sub(r'(?<=\d)[\s,._-](?=\d)', '', s)
    if s:
        last = s[-1]
        if ud.category(last) == 'Mn' or last in (HI_VIRAMA, TA_PULLI): s = s[:-1]
    if lang == 'hindi':
        s = s.replace(DANDA+DANDA, DANDA)
        if s.endswith(DANDA): s = s[:-1].rstrip()
    return s

def log_normal_logpdf_len(char_len: int, mu: float, sigma: float) -> float:
    Lc = max(1, int(char_len))
    x = math.log(Lc)
    return -0.5*((x-mu)/sigma)**2 - math.log(max(Lc,1e-6)) - math.log(sigma) - 0.5*math.log(2*math.pi)

def to_pair(t):
    try:
        if t is None: return (0,0)
        if isinstance(t,(list,tuple,np.ndarray)) and len(t)>=2:
            a = int(t[0]) if t[0] is not None else 0
            b = int(t[1]) if t[1] is not None else 0
            return (a,b)
    except Exception:
        pass
    return (0,0)

def load_npz_logits(npz_path):
    arr = np.load(npz_path, allow_pickle=True)
    keys = list(arr.keys())
    if 'start' in keys and 'end' in keys:
        return arr['start'], arr['end']
    for sk in ['start_logits','test_start_logits','start_logits_avg']:
        for ek in ['end_logits','test_end_logits','end_logits_avg']:
            if sk in keys and ek in keys:
                return arr[sk], arr[ek]
    raise ValueError(f'Unknown keys in {npz_path}: {keys}')

def maxpool1d(x):
    if len(x) == 0: return x
    y = x.copy()
    if len(x) == 1: return y
    y[0] = max(x[0], x[1])
    for i in range(1, len(x)-1): y[i] = max(x[i-1], x[i], x[i+1])
    y[-1] = max(x[-2], x[-1])
    return y

def is_boundary_char(ch: str) -> bool:
    return ch.isspace() or ch in {'.', ',', '!', '?', DANDA}

def snap_span(ctx: str, S: np.ndarray, E: np.ndarray, a: int, b: int, delta: float = 0.04):
    base = float(S[a] + E[b])
    a2 = a
    for i in range(a-1, -1, -1):
        if is_boundary_char(ctx[i]):
            a2 = i+1
            break
    b2 = b
    for j in range(b, len(ctx)):
        if is_boundary_char(ctx[j]):
            b2 = max(a2, j-1)
            break
    cand = float(S[a2] + E[b2])
    if cand >= base - delta:
        return a2, b2
    return a, b

# Load only required streams (drop MuRIL)
streams = [
    dict(name='xlmr512', npz='xlmr_large_512_3seeds_avg.npz', map_dir='xlmr_large_512_test_logits'),
    dict(name='xlmr384', npz='xlmr_large_test_avg.npz', map_dir='xlmr_large_test_logits'),
]
loaded = []
for s in streams:
    try:
        s_start, s_end = load_npz_logits(s['npz'])
        eid_path = Path(s['map_dir']) / 'test_example_id.json'
        offs_path = Path(s['map_dir']) / 'test_offset_mapping.npy'
        eid = json.loads(eid_path.read_text())
        offs = np.load(offs_path, allow_pickle=True)
        loaded.append((s['name'], s_start, s_end, eid, offs))
        print('Loaded', s['name'], s_start.shape, 'offsets', getattr(offs,'shape',None))
    except Exception as e:
        print('Skip stream', s['name'], '->', e)

def decode_third_variant():
    # Weights (drop MuRIL entirely)
    weights_hi = {'xlmr512': 0.90, 'xlmr384': 0.10}
    weights_ta = {'xlmr512': 0.99, 'xlmr384': 0.01}
    lambda_len = 0.10
    K_hi, K_ta = 200, 240
    Lmax_hi, Lmax_ta = 50, 58
    t0 = time.time()
    idx_maps = []
    for name, s_start, s_end, eid, offs in loaded:
        m = {}
        for i, qid in enumerate(eid): m.setdefault(qid, []).append(i)
        idx_maps.append(m)
    out_rows = []
    for qid in test['id'].tolist():
        lang = id2lang[qid]; ctx = id2context[qid]
        mu = priors.get(lang, {}).get('mu', 2.3 if lang=='hindi' else 2.1)
        sigma = priors.get(lang, {}).get('sigma', 0.8 if lang=='hindi' else 0.7)
        Lmax = Lmax_hi if lang=='hindi' else Lmax_ta
        K = K_hi if lang=='hindi' else K_ta
        S = np.zeros(len(ctx), dtype=np.float32)
        E = np.zeros(len(ctx), dtype=np.float32)
        for (name, s_start, s_end, eid, offs), m in zip(loaded, idx_maps):
            w = (weights_hi if lang=='hindi' else weights_ta).get(name, 0.0)
            if w <= 0.0: continue
            feat_idx = m.get(qid, [])
            for fi in feat_idx:
                offs_raw = offs[fi]
                M = len(offs_raw) if hasattr(offs_raw,'__len__') else s_start.shape[1]
                s_log = s_start[fi][:M] * w
                e_log = s_end[fi][:M] * w
                for ti in range(M):
                    a, b = to_pair(offs_raw[ti])
                    if b > a and 0 <= a < len(ctx) and 1 <= b <= len(ctx):
                        S[a] += s_log[ti]
                        E[b-1] += e_log[ti]
        S = maxpool1d(S); E = maxpool1d(E)
        starts = np.argsort(S)[::-1][:K]
        best_score = -1e18
        best_span = (0, max(0, min(Lmax, len(ctx))-1))
        for si in starts:
            end_lo = si; end_hi = min(len(ctx)-1, si + Lmax - 1)
            if end_hi < end_lo: continue
            seg = E[end_lo:end_hi+1]
            ej = end_lo + int(np.argmax(seg))
            raw = float(S[si] + E[ej])
            if lambda_len > 0.0:
                clen = ej - si + 1
                lp = log_normal_logpdf_len(clen, mu, sigma)
                lp = max(-0.7, min(0.0, lp))
                raw += lambda_len * lp
            if raw > best_score:
                best_score = raw; best_span = (si, ej)
        a, b = best_span
        a, b = snap_span(ctx, S, E, a, b, delta=0.04)
        text = clean_span_text(ctx[a:b+1], lang)
        if not text:
            b2 = min(len(ctx), a+1)
            text = clean_span_text(ctx[a:b2], lang) or ctx[a:b2]
        out_rows.append((qid, text))
    sub = pd.DataFrame(out_rows, columns=['id','PredictionString'])
    empties = (sub['PredictionString'].astype(str).str.len()==0).sum()
    mean_len = sub['PredictionString'].astype(str).str.len().mean()
    print(f'Third variant (drop MuRIL) done in {time.time()-t0:.1f}s. Empties={empties}, mean_len={mean_len:.2f}')
    return sub

sub_third = decode_third_variant()
third_path = 'submission_third_multistream_drop_muril_snap_lambda010.csv'
sub_third.to_csv(third_path, index=False)
pd.read_csv(third_path).to_csv('submission.csv', index=False)
print('submission.csv updated ->', third_path)

Loaded xlmr512 (1401, 512) offsets (1401, 512)


Loaded xlmr384 (1921, 384) offsets (1921, 384)


In [14]:
# Force submission.csv to the third-variant file and print quick diagnostics
import pandas as pd, os, time
third_path = 'submission_third_multistream_drop_muril_snap_lambda010.csv'
assert os.path.exists(third_path), f"Missing {third_path}"
sub = pd.read_csv(third_path)
empties = (sub['PredictionString'].astype(str).str.len()==0).sum()
mean_len = sub['PredictionString'].astype(str).str.len().mean()
sub.to_csv('submission.csv', index=False)
print('submission.csv updated ->', third_path)
print('Diagnostics: empties=', int(empties), 'mean_len=', round(float(mean_len), 2))
print('mtime(submission.csv)=', time.ctime(os.path.getmtime('submission.csv')))

submission.csv updated -> submission_third_multistream_drop_muril_snap_lambda010.csv
Diagnostics: empties= 0 mean_len= 10.79
mtime(submission.csv)= Thu Sep 25 10:55:33 2025


In [15]:
# Medal-push multistream (xlmr512 + tiny xlmr384), no MuRIL, tight snap, tiny freq prior
import json, math, time, re, unicodedata as ud
import numpy as np, pandas as pd
from pathlib import Path

test = pd.read_csv('test.csv')
id2lang = dict(zip(test['id'].tolist(), test['language'].tolist()))
id2context = dict(zip(test['id'].tolist(), test['context'].astype(str).tolist()))
train = pd.read_csv('train.csv')
train['answer_text'] = train['answer_text'].astype(str).str.strip()
train['char_len'] = train['answer_text'].str.len().clip(lower=1)
def fit_log_normal_params(df):
    x = np.log(df['char_len'].values.astype(float))
    mu = float(x.mean()); sd = float(x.std());
    if sd <= 1e-6: sd = 1e-6
    return mu, sd
priors = {lang: dict(zip(['mu','sigma'], fit_log_normal_params(g))) for lang, g in train.groupby('language')}

ZW_CHARS = {'\u200B','\u200C','\u200D','\u2060','\ufeff'}
NBSP_SET = {'\u00A0','\u2002','\u2003','\u2004','\u2005','\u2006','\u2007','\u2008','\u2009','\u200A'}
HI_VIRAMA = '\u094D'; TA_PULLI = '\u0BCD'; DANDA = '\u0964'
def clean_span_text(s: str, lang: str) -> str:
    if not s: return ''
    s = ''.join(ch for ch in s if ch not in ZW_CHARS)
    s = ''.join(' ' if ch in NBSP_SET else ch for ch in s)
    s = re.sub(r'\s+', ' ', s).strip()
    s = re.sub(r'(?<=\d)[\s,._-](?=\d)', '', s)
    if s and (ud.category(s[-1]) == 'Mn' or s[-1] in (HI_VIRAMA, TA_PULLI)):
        s = s[:-1]
    if lang == 'hindi':
        s = s.replace(DANDA+DANDA, DANDA)
        if s.endswith(DANDA): s = s[:-1].rstrip()
    return s

def log_normal_logpdf_len(char_len: int, mu: float, sigma: float) -> float:
    Lc = max(1, int(char_len))
    x = math.log(Lc)
    return -0.5*((x-mu)/sigma)**2 - math.log(max(Lc,1e-6)) - math.log(sigma) - 0.5*math.log(2*math.pi)

def to_pair(t):
    try:
        if t is None: return (0,0)
        if isinstance(t,(list,tuple,np.ndarray)) and len(t)>=2:
            a = int(t[0]) if t[0] is not None else 0
            b = int(t[1]) if t[1] is not None else 0
            return (a,b)
    except Exception:
        pass
    return (0,0)

def load_npz_logits(npz_path):
    arr = np.load(npz_path, allow_pickle=True)
    keys = list(arr.keys())
    if 'start' in keys and 'end' in keys:
        return arr['start'], arr['end']
    for sk in ['start_logits','test_start_logits','start_logits_avg']:
        for ek in ['end_logits','test_end_logits','end_logits_avg']:
            if sk in keys and ek in keys: return arr[sk], arr[ek]
    raise ValueError(f'Unknown keys in {npz_path}: {keys}')

def maxpool1d(x):
    if len(x) == 0: return x
    y = x.copy()
    if len(x) == 1: return y
    y[0] = max(x[0], x[1])
    for i in range(1, len(x)-1): y[i] = max(x[i-1], x[i], x[i+1])
    y[-1] = max(x[-2], x[-1])
    return y

def is_boundary_char(ch: str) -> bool:
    return ch.isspace() or ch in {'.', ',', '!', '?', DANDA, '"', "'", '(', ')', '[', ']', '{', '}'}

def snap_span(ctx: str, S: np.ndarray, E: np.ndarray, a: int, b: int, delta: float = 0.03):
    base = float(S[a] + E[b])
    a2 = a
    for i in range(a-1, -1, -1):
        if is_boundary_char(ctx[i]):
            a2 = i+1; break
    b2 = b
    for j in range(b, len(ctx)):
        if is_boundary_char(ctx[j]):
            b2 = max(a2, j-1); break
    cand = float(S[a2] + E[b2])
    if cand >= base - delta:
        return a2, b2
    if a > 0 and b < len(ctx)-1 and ctx[a-1] in {'(', '"', "'"} and ctx[b+1] in {')', '"', "'"}:
        cand2 = float(S[a-1] + E[b+1])
        if cand2 >= base - delta and (b+1 - (a-1) <= (b - a) + 2):
            return a-1, b+1
    return a, b

# Load only xlmr512 and xlmr384
streams = [
    dict(name='xlmr512', npz='xlmr_large_512_3seeds_avg.npz', map_dir='xlmr_large_512_test_logits'),
    dict(name='xlmr384', npz='xlmr_large_test_avg.npz',       map_dir='xlmr_large_test_logits'),
]
loaded = []
for s in streams:
    try:
        s_start, s_end = load_npz_logits(s['npz'])
        eid_path = Path(s['map_dir']) / 'test_example_id.json'
        offs_path = Path(s['map_dir']) / 'test_offset_mapping.npy'
        eid = json.loads(eid_path.read_text())
        offs = np.load(offs_path, allow_pickle=True)
        loaded.append((s['name'], s_start, s_end, eid, offs))
        print('Loaded', s['name'], s_start.shape, 'offsets', getattr(offs,'shape',None))
    except Exception as e:
        print('Skip stream', s['name'], '->', e)

def decode_medal_push():
    # Weights
    weights_hi = {'xlmr512': 0.92,  'xlmr384': 0.08}
    weights_ta = {'xlmr512': 0.995, 'xlmr384': 0.005}
    # Hyperparams
    lambda_len = 0.12
    K_hi, K_ta = 240, 280
    Lmax_hi, Lmax_ta = 52, 62
    clip_prior = (-0.8, 0.0)
    beta = 0.06    # tiny frequency prior
    shortlist = 6  # ends per start

    # Build per-stream index maps
    idx_maps = []
    for name, s_start, s_end, eid, offs in loaded:
        m = {}
        for i, qid in enumerate(eid): m.setdefault(qid, []).append(i)
        idx_maps.append(m)

    out_rows = []
    t0 = time.time()
    for qid in test['id'].tolist():
        lang = id2lang[qid]; ctx = id2context[qid]
        mu = priors[lang]['mu']; sigma = priors[lang]['sigma']
        Lmax = Lmax_hi if lang=='hindi' else Lmax_ta
        K = K_hi if lang=='hindi' else K_ta
        weights = weights_hi if lang=='hindi' else weights_ta

        S = np.zeros(len(ctx), dtype=np.float32)
        E = np.zeros(len(ctx), dtype=np.float32)

        # Accumulate streams
        for (name, s_start, s_end, eid, offs), m in zip(loaded, idx_maps):
            w = weights.get(name, 0.0)
            if w <= 0.0: continue
            for fi in m.get(qid, []):
                offs_raw = offs[fi]
                M = len(offs_raw) if hasattr(offs_raw,'__len__') else s_start.shape[1]
                s_log = s_start[fi][:M] * w
                e_log = s_end[fi][:M] * w
                for ti in range(M):
                    a, b = to_pair(offs_raw[ti])
                    if b > a and 0 <= a < len(ctx) and 1 <= b <= len(ctx):
                        S[a] += s_log[ti]
                        E[b-1] += e_log[ti]

        # Smooth
        S = maxpool1d(S); E = maxpool1d(E)

        # Decode
        starts = np.argsort(S)[::-1][:K]
        best_score = -1e18
        best_span = (0, max(0, min(Lmax, len(ctx))-1))
        for si in starts:
            end_lo = si; end_hi = min(len(ctx)-1, si + Lmax - 1)
            if end_hi < end_lo: continue
            seg = E[end_lo:end_hi+1]
            top_rel = np.argsort(seg)[::-1][:shortlist]
            for rel in top_rel:
                ej = end_lo + int(rel)
                raw = float(S[si] + E[ej])
                clen = ej - si + 1
                lp = log_normal_logpdf_len(clen, mu, sigma)
                lp = max(clip_prior[0], min(clip_prior[1], lp))
                raw += lambda_len * lp
                if beta > 0.0:
                    span_text = ctx[si:ej+1]
                    if span_text and ctx.count(span_text) > 1:
                        raw += beta * math.log(1 + ctx.count(span_text))
                if raw > best_score:
                    best_score = raw; best_span = (si, ej)

        a, b = best_span
        a, b = snap_span(ctx, S, E, a, b, delta=0.03)
        text = clean_span_text(ctx[a:b+1], lang)
        if not text:
            b2 = min(len(ctx), a+1)
            text = clean_span_text(ctx[a:b2], lang) or ctx[a:b2]
        out_rows.append((qid, text))

    sub = pd.DataFrame(out_rows, columns=['id','PredictionString'])
    empties = (sub['PredictionString'].astype(str).str.len()==0).sum()
    mean_len = sub['PredictionString'].astype(str).str.len().mean()
    print(f'Medal-push multistream done in {time.time()-t0:.1f}s. Empties={empties}, mean_len={mean_len:.2f}')
    return sub

sub = decode_medal_push()
out_path = 'submission_medalpush_2stream_lambda012_K240_280_Lmax52_62_beta006_delta003.csv'
sub.to_csv(out_path, index=False)
pd.read_csv(out_path).to_csv('submission.csv', index=False)
print('submission.csv updated ->', out_path)

Loaded xlmr512 (1401, 512) offsets (1401, 512)


Loaded xlmr384 (1921, 384) offsets (1921, 384)


Medal-push multistream done in 4.5s. Empties=0, mean_len=13.77
submission.csv updated -> submission_medalpush_2stream_lambda012_K240_280_Lmax52_62_beta006_delta003.csv


In [16]:
# 384-only sanity decode (alignment check): char-fusion single-stream with its own mapping
import numpy as np, pandas as pd, time, math, json, re, unicodedata as ud
from pathlib import Path

def load_npz_logits(npz_path):
    arr = np.load(npz_path, allow_pickle=True)
    keys = list(arr.keys())
    if 'start' in keys and 'end' in keys:
        return arr['start'], arr['end']
    for sk in ['start_logits','test_start_logits','start_logits_avg']:
        for ek in ['end_logits','test_end_logits','end_logits_avg']:
            if sk in keys and ek in keys: return arr[sk], arr[ek]
    raise ValueError(f'Unknown keys in {npz_path}: {keys}')

npz_path = 'xlmr_large_test_avg.npz'  # 384
start_logits, end_logits = load_npz_logits(npz_path)
Nfeat, Ltok = start_logits.shape
print('Loaded 384 logits:', start_logits.shape, end_logits.shape, flush=True)

map_dir = 'xlmr_large_test_logits'  # 384 mapping (must match)
eid_path = Path(map_dir) / 'test_example_id.json'
off_path = Path(map_dir) / 'test_offset_mapping.npy'
example_id_list = json.loads(Path(eid_path).read_text())
offset_mapping = np.load(off_path, allow_pickle=True)
print('Loaded 384 mapping:', len(example_id_list), getattr(offset_mapping,'shape',None), flush=True)
assert len(example_id_list) == Nfeat

test = pd.read_csv('test.csv')
id2lang = dict(zip(test['id'].tolist(), test['language'].tolist()))
id2context = dict(zip(test['id'].tolist(), test['context'].astype(str).tolist()))

train = pd.read_csv('train.csv')
train['answer_text'] = train['answer_text'].astype(str).str.strip()
train['char_len'] = train['answer_text'].str.len().clip(lower=1)
def fit_log_normal_params(df):
    x = np.log(df['char_len'].values.astype(float))
    mu = float(x.mean()); sd = float(x.std());
    if sd <= 1e-6: sd = 1e-6
    return mu, sd
priors = {lang: dict(zip(['mu','sigma'], fit_log_normal_params(g))) for lang, g in train.groupby('language')}

ZW_CHARS = {'\u200B','\u200C','\u200D','\u2060','\ufeff'}
NBSP_SET = {'\u00A0','\u2002','\u2003','\u2004','\u2005','\u2006','\u2007','\u2008','\u2009','\u200A'}
HI_VIRAMA = '\u094D'; TA_PULLI='\u0BCD'; DANDA='\u0964'
def clean_span_text(s: str, lang: str) -> str:
    if not s: return ''
    s = ''.join(ch for ch in s if ch not in ZW_CHARS)
    s = ''.join(' ' if ch in NBSP_SET else ch for ch in s)
    s = re.sub(r'\s+', ' ', s).strip()
    s = re.sub(r'(?<=\d)[\s,._-](?=\d)', '', s)
    if s:
        last = s[-1]
        if ud.category(last) == 'Mn' or last in (HI_VIRAMA, TA_PULLI): s = s[:-1]
    if lang == 'hindi':
        s = s.replace(DANDA+DANDA, DANDA)
        if s.endswith(DANDA): s = s[:-1].rstrip()
    return s

def log_normal_logpdf_len(char_len: int, mu: float, sigma: float) -> float:
    Lc = max(1, int(char_len))
    x = math.log(Lc)
    return -0.5*((x-mu)/sigma)**2 - math.log(max(Lc,1e-6)) - math.log(sigma) - 0.5*math.log(2*math.pi)

def to_pair(t):
    try:
        if t is None: return (0,0)
        if isinstance(t,(list,tuple,np.ndarray)) and len(t)>=2:
            a = int(t[0]) if t[0] is not None else 0
            b = int(t[1]) if t[1] is not None else 0
            return (a,b)
    except Exception:
        pass
    return (0,0)

qid_to_feat_idx = {}
for i, qid in enumerate(example_id_list):
    qid_to_feat_idx.setdefault(qid, []).append(i)

def maxpool1d(x):
    if len(x) == 0: return x
    y = x.copy()
    if len(x) == 1: return y
    y[0] = max(x[0], x[1])
    for i in range(1, len(x)-1): y[i] = max(x[i-1], x[i], x[i+1])
    y[-1] = max(x[-2], x[-1])
    return y

def decode_charfusion_384(lambda_len=0.15, clip_prior=(-0.7,0.0), nbest_hi=220, nbest_ta=260, Lmax_hi=50, Lmax_ta=60, do_pool=True):
    t0 = time.time()
    out_rows = []
    for qid in test['id'].tolist():
        lang = id2lang[qid]; ctx = id2context[qid]
        mu = priors[lang]['mu']; sigma = priors[lang]['sigma']
        Lmax = Lmax_hi if lang=='hindi' else Lmax_ta
        K = nbest_hi if lang=='hindi' else nbest_ta
        S = np.zeros(len(ctx), dtype=np.float32)
        E = np.zeros(len(ctx), dtype=np.float32)
        for fi in qid_to_feat_idx.get(qid, []):
            offs_raw = offset_mapping[fi]
            M = len(offs_raw) if hasattr(offs_raw,'__len__') else Ltok
            s_log = start_logits[fi][:M]
            e_log = end_logits[fi][:M]
            for ti in range(M):
                a,b = to_pair(offs_raw[ti])
                if b > a and 0 <= a < len(ctx) and 1 <= b <= len(ctx):
                    S[a] += s_log[ti]
                    E[b-1] += e_log[ti]
        if do_pool:
            S = maxpool1d(S); E = maxpool1d(E)
        if len(S) == 0:
            out_rows.append((qid, ctx[:1] if len(ctx)>0 else ''))
            continue
        starts = np.argsort(S)[::-1][:K]
        best_score = -1e18; best_span = (0, max(0, min(Lmax, len(ctx))-1))
        for si in starts:
            end_lo = si; end_hi = min(len(ctx)-1, si + Lmax - 1)
            if end_hi < end_lo: continue
            seg = E[end_lo:end_hi+1]
            ej = end_lo + int(np.argmax(seg))
            raw = float(S[si] + E[ej])
            clen = ej - si + 1
            lp = log_normal_logpdf_len(clen, mu, sigma)
            lp = max(clip_prior[0], min(clip_prior[1], lp))
            raw += lambda_len * lp
            if raw > best_score:
                best_score = raw; best_span = (si, ej)
        a,b = best_span
        text = clean_span_text(ctx[a:b+1], lang)
        if not text:
            b2 = min(len(ctx), a+1); text = clean_span_text(ctx[a:b2], lang) or ctx[a:b2]
        out_rows.append((qid, text))
    sub = pd.DataFrame(out_rows, columns=['id','PredictionString'])
    empties = (sub['PredictionString'].astype(str).str.len()==0).sum()
    mean_len = sub['PredictionString'].astype(str).str.len().mean()
    print(f'384-only char-fusion done in {time.time()-t0:.1f}s. Empties={empties}, mean_len={mean_len:.2f}')
    return sub

sub384 = decode_charfusion_384(lambda_len=0.15, nbest_hi=220, nbest_ta=260, Lmax_hi=50, Lmax_ta=60, do_pool=True)
out384 = 'submission_384only_charfusion_lambda015.csv'
sub384.to_csv(out384, index=False)
pd.read_csv(out384).to_csv('submission.csv', index=False)
print('submission.csv updated ->', out384)

Loaded 384 logits: (1921, 384) (1921, 384)


Loaded 384 mapping: 1921 (1921, 384)


384-only char-fusion done in 2.3s. Empties=0, mean_len=9.81
submission.csv updated -> submission_384only_charfusion_lambda015.csv


In [17]:
# Medal-push (alignment-safe): use 512 single-seed test_avg + 384; no MuRIL
import json, math, time, re, unicodedata as ud
import numpy as np, pandas as pd
from pathlib import Path

test = pd.read_csv('test.csv')
id2lang = dict(zip(test['id'].tolist(), test['language'].tolist()))
id2context = dict(zip(test['id'].tolist(), test['context'].astype(str).tolist()))
train = pd.read_csv('train.csv')
train['answer_text'] = train['answer_text'].astype(str).str.strip()
train['char_len'] = train['answer_text'].str.len().clip(lower=1)
def fit_log_normal_params(df):
    x = np.log(df['char_len'].values.astype(float))
    mu = float(x.mean()); sd = float(x.std());
    if sd <= 1e-6: sd = 1e-6
    return mu, sd
priors = {lang: dict(zip(['mu','sigma'], fit_log_normal_params(g))) for lang, g in train.groupby('language')}

ZW_CHARS = {'\u200B','\u200C','\u200D','\u2060','\ufeff'}
NBSP_SET = {'\u00A0','\u2002','\u2003','\u2004','\u2005','\u2006','\u2007','\u2008','\u2009','\u200A'}
HI_VIRAMA = '\u094D'; TA_PULLI = '\u0BCD'; DANDA = '\u0964'
def clean_span_text(s: str, lang: str) -> str:
    if not s: return ''
    s = ''.join(ch for ch in s if ch not in ZW_CHARS)
    s = ''.join(' ' if ch in NBSP_SET else ch for ch in s)
    s = re.sub(r'\s+', ' ', s).strip()
    s = re.sub(r'(?<=\d)[\s,._-](?=\d)', '', s)
    if s and (ud.category(s[-1]) == 'Mn' or s[-1] in (HI_VIRAMA, TA_PULLI)):
        s = s[:-1]
    if lang == 'hindi':
        s = s.replace(DANDA+DANDA, DANDA)
        if s.endswith(DANDA): s = s[:-1].rstrip()
    return s

def log_normal_logpdf_len(char_len: int, mu: float, sigma: float) -> float:
    Lc = max(1, int(char_len))
    x = math.log(Lc)
    return -0.5*((x-mu)/sigma)**2 - math.log(max(Lc,1e-6)) - math.log(sigma) - 0.5*math.log(2*math.pi)

def to_pair(t):
    try:
        if t is None: return (0,0)
        if isinstance(t,(list,tuple,np.ndarray)) and len(t)>=2:
            a = int(t[0]) if t[0] is not None else 0
            b = int(t[1]) if t[1] is not None else 0
            return (a,b)
    except Exception:
        pass
    return (0,0)

def load_npz_logits(npz_path):
    arr = np.load(npz_path, allow_pickle=True)
    keys = list(arr.keys())
    if 'start' in keys and 'end' in keys:
        return arr['start'], arr['end']
    for sk in ['start_logits','test_start_logits','start_logits_avg']:
        for ek in ['end_logits','test_end_logits','end_logits_avg']:
            if sk in keys and ek in keys: return arr[sk], arr[ek]
    raise ValueError(f'Unknown keys in {npz_path}: {keys}')

def maxpool1d(x):
    if len(x) == 0: return x
    y = x.copy()
    if len(x) == 1: return y
    y[0] = max(x[0], x[1])
    for i in range(1, len(x)-1): y[i] = max(x[i-1], x[i], x[i+1])
    y[-1] = max(x[-2], x[-1])
    return y

def is_boundary_char(ch: str) -> bool:
    return ch.isspace() or ch in {'.', ',', '!', '?', DANDA, '"', "'", '(', ')', '[', ']', '{', '}'}

def snap_span(ctx: str, S: np.ndarray, E: np.ndarray, a: int, b: int, delta: float = 0.03):
    base = float(S[a] + E[b])
    a2 = a
    for i in range(a-1, -1, -1):
        if is_boundary_char(ctx[i]):
            a2 = i+1; break
    b2 = b
    for j in range(b, len(ctx)):
        if is_boundary_char(ctx[j]):
            b2 = max(a2, j-1); break
    cand = float(S[a2] + E[b2])
    if cand >= base - delta:
        return a2, b2
    if a > 0 and b < len(ctx)-1 and ctx[a-1] in {'(', '"', "'"} and ctx[b+1] in {')', '"', "'"}:
        cand2 = float(S[a-1] + E[b+1])
        if cand2 >= base - delta and (b+1 - (a-1) <= (b - a) + 2):
            return a-1, b+1
    return a, b

# Streams: 512 single-seed test_avg (with its own mapping) + 384
streams = [
    dict(name='xlmr512s', npz='xlmr_large_512_test_avg.npz', map_dir='xlmr_large_512_test_logits'),
    dict(name='xlmr384',  npz='xlmr_large_test_avg.npz',     map_dir='xlmr_large_test_logits'),
]
loaded = []
for s in streams:
    try:
        s_start, s_end = load_npz_logits(s['npz'])
        eid_path = Path(s['map_dir']) / 'test_example_id.json'
        offs_path = Path(s['map_dir']) / 'test_offset_mapping.npy'
        eid = json.loads(eid_path.read_text())
        offs = np.load(offs_path, allow_pickle=True)
        loaded.append((s['name'], s_start, s_end, eid, offs))
        print('Loaded', s['name'], s_start.shape, 'offsets', getattr(offs,'shape',None))
    except Exception as e:
        print('Skip stream', s['name'], '->', e)

def decode_medal_push_alignsafe():
    # Per-language weights (no MuRIL)
    weights_hi = {'xlmr512s': 0.92,  'xlmr384': 0.08}
    weights_ta = {'xlmr512s': 0.995, 'xlmr384': 0.005}
    # Hyperparams
    lambda_len = 0.12
    K_hi, K_ta = 240, 280
    Lmax_hi, Lmax_ta = 52, 62
    clip_prior = (-0.8, 0.0)
    beta = 0.06
    shortlist = 6

    # index maps per stream
    idx_maps = []
    for name, s_start, s_end, eid, offs in loaded:
        m = {}
        for i, qid in enumerate(eid): m.setdefault(qid, []).append(i)
        idx_maps.append(m)

    out_rows = []
    t0 = time.time()
    for qid in test['id'].tolist():
        lang = id2lang[qid]; ctx = id2context[qid]
        mu = priors[lang]['mu']; sigma = priors[lang]['sigma']
        Lmax = Lmax_hi if lang=='hindi' else Lmax_ta
        K = K_hi if lang=='hindi' else K_ta
        weights = weights_hi if lang=='hindi' else weights_ta

        S = np.zeros(len(ctx), dtype=np.float32)
        E = np.zeros(len(ctx), dtype=np.float32)

        for (name, s_start, s_end, eid, offs), m in zip(loaded, idx_maps):
            w = weights.get(name, 0.0)
            if w <= 0.0: continue
            for fi in m.get(qid, []):
                offs_raw = offs[fi]
                M = len(offs_raw) if hasattr(offs_raw,'__len__') else s_start.shape[1]
                s_log = s_start[fi][:M] * w
                e_log = s_end[fi][:M] * w
                for ti in range(M):
                    a, b = to_pair(offs_raw[ti])
                    if b > a and 0 <= a < len(ctx) and 1 <= b <= len(ctx):
                        S[a] += s_log[ti]
                        E[b-1] += e_log[ti]

        S = maxpool1d(S); E = maxpool1d(E)

        starts = np.argsort(S)[::-1][:K]
        best_score = -1e18
        best_span = (0, max(0, min(Lmax, len(ctx))-1))
        for si in starts:
            end_lo = si; end_hi = min(len(ctx)-1, si + Lmax - 1)
            if end_hi < end_lo: continue
            seg = E[end_lo:end_hi+1]
            top_rel = np.argsort(seg)[::-1][:shortlist]
            for rel in top_rel:
                ej = end_lo + int(rel)
                raw = float(S[si] + E[ej])
                clen = ej - si + 1
                lp = log_normal_logpdf_len(clen, mu, sigma)
                lp = max(clip_prior[0], min(clip_prior[1], lp))
                raw += lambda_len * lp
                if beta > 0.0:
                    span_text = ctx[si:ej+1]
                    if span_text and ctx.count(span_text) > 1:
                        raw += beta * math.log(1 + ctx.count(span_text))
                if raw > best_score:
                    best_score = raw; best_span = (si, ej)

        a, b = best_span
        a, b = snap_span(ctx, S, E, a, b, delta=0.03)
        text = clean_span_text(ctx[a:b+1], lang)
        if not text:
            b2 = min(len(ctx), a+1)
            text = clean_span_text(ctx[a:b2], lang) or ctx[a:b2]
        out_rows.append((qid, text))

    sub = pd.DataFrame(out_rows, columns=['id','PredictionString'])
    empties = (sub['PredictionString'].astype(str).str.len()==0).sum()
    mean_len = sub['PredictionString'].astype(str).str.len().mean()
    print(f'Align-safe 2-stream decode done in {time.time()-t0:.1f}s. Empties={empties}, mean_len={mean_len:.2f}')
    return sub

sub_as = decode_medal_push_alignsafe()
out_path_as = 'submission_alignsafe_512single_384_lambda012_K240_280_Lmax52_62_beta006_delta003.csv'
sub_as.to_csv(out_path_as, index=False)
pd.read_csv(out_path_as).to_csv('submission.csv', index=False)
print('submission.csv updated ->', out_path_as)

Loaded xlmr512s (1401, 512) offsets (1401, 512)


Loaded xlmr384 (1921, 384) offsets (1921, 384)


Align-safe 2-stream decode done in 4.6s. Empties=0, mean_len=13.80
submission.csv updated -> submission_alignsafe_512single_384_lambda012_K240_280_Lmax52_62_beta006_delta003.csv


In [18]:
# Hedge variant: alignment-safe 2-stream (512 single-seed + 384), beta=0.0, snap delta=0.02
import json, math, time, re, unicodedata as ud
import numpy as np, pandas as pd
from pathlib import Path

test = pd.read_csv('test.csv')
id2lang = dict(zip(test['id'].tolist(), test['language'].tolist()))
id2context = dict(zip(test['id'].tolist(), test['context'].astype(str).tolist()))
train = pd.read_csv('train.csv')
train['answer_text'] = train['answer_text'].astype(str).str.strip()
train['char_len'] = train['answer_text'].str.len().clip(lower=1)
def fit_log_normal_params(df):
    x = np.log(df['char_len'].values.astype(float))
    mu = float(x.mean()); sd = float(x.std());
    if sd <= 1e-6: sd = 1e-6
    return mu, sd
priors = {lang: dict(zip(['mu','sigma'], fit_log_normal_params(g))) for lang, g in train.groupby('language')}

ZW_CHARS = {'\u200B','\u200C','\u200D','\u2060','\ufeff'}
NBSP_SET = {'\u00A0','\u2002','\u2003','\u2004','\u2005','\u2006','\u2007','\u2008','\u2009','\u200A'}
HI_VIRAMA = '\u094D'; TA_PULLI = '\u0BCD'; DANDA = '\u0964'
def clean_span_text(s: str, lang: str) -> str:
    if not s: return ''
    s = ''.join(ch for ch in s if ch not in ZW_CHARS)
    s = ''.join(' ' if ch in NBSP_SET else ch for ch in s)
    s = re.sub(r'\s+', ' ', s).strip()
    s = re.sub(r'(?<=\d)[\s,._-](?=\d)', '', s)
    if s and (ud.category(s[-1]) == 'Mn' or s[-1] in (HI_VIRAMA, TA_PULLI)):
        s = s[:-1]
    if lang == 'hindi':
        s = s.replace(DANDA+DANDA, DANDA)
        if s.endswith(DANDA): s = s[:-1].rstrip()
    return s

def log_normal_logpdf_len(char_len: int, mu: float, sigma: float) -> float:
    Lc = max(1, int(char_len))
    x = math.log(Lc)
    return -0.5*((x-mu)/sigma)**2 - math.log(max(Lc,1e-6)) - math.log(sigma) - 0.5*math.log(2*math.pi)

def to_pair(t):
    try:
        if t is None: return (0,0)
        if isinstance(t,(list,tuple,np.ndarray)) and len(t)>=2:
            a = int(t[0]) if t[0] is not None else 0
            b = int(t[1]) if t[1] is not None else 0
            return (a,b)
    except Exception:
        pass
    return (0,0)

def load_npz_logits(npz_path):
    arr = np.load(npz_path, allow_pickle=True)
    keys = list(arr.keys())
    if 'start' in keys and 'end' in keys:
        return arr['start'], arr['end']
    for sk in ['start_logits','test_start_logits','start_logits_avg']:
        for ek in ['end_logits','test_end_logits','end_logits_avg']:
            if sk in keys and ek in keys: return arr[sk], arr[ek]
    raise ValueError(f'Unknown keys in {npz_path}: {keys}')

def maxpool1d(x):
    if len(x) == 0: return x
    y = x.copy()
    if len(x) == 1: return y
    y[0] = max(x[0], x[1])
    for i in range(1, len(x)-1): y[i] = max(x[i-1], x[i], x[i+1])
    y[-1] = max(x[-2], x[-1])
    return y

def is_boundary_char(ch: str) -> bool:
    return ch.isspace() or ch in {'.', ',', '!', '?', DANDA, '"', "'", '(', ')', '[', ']', '{', '}'}

def snap_span(ctx: str, S: np.ndarray, E: np.ndarray, a: int, b: int, delta: float = 0.02):
    base = float(S[a] + E[b])
    a2 = a
    for i in range(a-1, -1, -1):
        if is_boundary_char(ctx[i]):
            a2 = i+1; break
    b2 = b
    for j in range(b, len(ctx)):
        if is_boundary_char(ctx[j]):
            b2 = max(a2, j-1); break
    cand = float(S[a2] + E[b2])
    if cand >= base - delta:
        return a2, b2
    if a > 0 and b < len(ctx)-1 and ctx[a-1] in {'(', '"', "'"} and ctx[b+1] in {')', '"', "'"}:
        cand2 = float(S[a-1] + E[b+1])
        if cand2 >= base - delta and (b+1 - (a-1) <= (b - a) + 2):
            return a-1, b+1
    return a, b

# Streams: 512 single-seed test_avg (with its own mapping) + 384
streams = [
    dict(name='xlmr512s', npz='xlmr_large_512_test_avg.npz', map_dir='xlmr_large_512_test_logits'),
    dict(name='xlmr384',  npz='xlmr_large_test_avg.npz',     map_dir='xlmr_large_test_logits'),
]
loaded = []
for s in streams:
    try:
        s_start, s_end = load_npz_logits(s['npz'])
        eid_path = Path(s['map_dir']) / 'test_example_id.json'
        offs_path = Path(s['map_dir']) / 'test_offset_mapping.npy'
        eid = json.loads(eid_path.read_text())
        offs = np.load(offs_path, allow_pickle=True)
        loaded.append((s['name'], s_start, s_end, eid, offs))
        print('Loaded', s['name'], s_start.shape, 'offsets', getattr(offs,'shape',None))
    except Exception as e:
        print('Skip stream', s['name'], '->', e)

def decode_alignsafe_beta0():
    weights_hi = {'xlmr512s': 0.92,  'xlmr384': 0.08}
    weights_ta = {'xlmr512s': 0.995, 'xlmr384': 0.005}
    lambda_len = 0.12
    K_hi, K_ta = 240, 280
    Lmax_hi, Lmax_ta = 52, 62
    clip_prior = (-0.8, 0.0)
    beta = 0.0
    shortlist = 6

    idx_maps = []
    for name, s_start, s_end, eid, offs in loaded:
        m = {}
        for i, qid in enumerate(eid): m.setdefault(qid, []).append(i)
        idx_maps.append(m)

    out_rows = []
    t0 = time.time()
    for qid in test['id'].tolist():
        lang = id2lang[qid]; ctx = id2context[qid]
        mu = priors[lang]['mu']; sigma = priors[lang]['sigma']
        Lmax = Lmax_hi if lang=='hindi' else Lmax_ta
        K = K_hi if lang=='hindi' else K_ta
        weights = weights_hi if lang=='hindi' else weights_ta

        S = np.zeros(len(ctx), dtype=np.float32)
        E = np.zeros(len(ctx), dtype=np.float32)

        for (name, s_start, s_end, eid, offs), m in zip(loaded, idx_maps):
            w = weights.get(name, 0.0)
            if w <= 0.0: continue
            for fi in m.get(qid, []):
                offs_raw = offs[fi]
                M = len(offs_raw) if hasattr(offs_raw,'__len__') else s_start.shape[1]
                s_log = s_start[fi][:M] * w
                e_log = s_end[fi][:M] * w
                for ti in range(M):
                    a, b = to_pair(offs_raw[ti])
                    if b > a and 0 <= a < len(ctx) and 1 <= b <= len(ctx):
                        S[a] += s_log[ti]
                        E[b-1] += e_log[ti]

        S = maxpool1d(S); E = maxpool1d(E)

        starts = np.argsort(S)[::-1][:K]
        best_score = -1e18
        best_span = (0, max(0, min(Lmax, len(ctx))-1))
        for si in starts:
            end_lo = si; end_hi = min(len(ctx)-1, si + Lmax - 1)
            if end_hi < end_lo: continue
            seg = E[end_lo:end_hi+1]
            top_rel = np.argsort(seg)[::-1][:shortlist]
            for rel in top_rel:
                ej = end_lo + int(rel)
                raw = float(S[si] + E[ej])
                clen = ej - si + 1
                lp = log_normal_logpdf_len(clen, mu, sigma)
                lp = max(clip_prior[0], min(clip_prior[1], lp))
                raw += lambda_len * lp
                if beta > 0.0:
                    span_text = ctx[si:ej+1]
                    if span_text and ctx.count(span_text) > 1:
                        raw += beta * math.log(1 + ctx.count(span_text))
                if raw > best_score:
                    best_score = raw; best_span = (si, ej)

        a, b = best_span
        a, b = snap_span(ctx, S, E, a, b, delta=0.02)
        text = clean_span_text(ctx[a:b+1], lang)
        if not text:
            b2 = min(len(ctx), a+1)
            text = clean_span_text(ctx[a:b2], lang) or ctx[a:b2]
        out_rows.append((qid, text))

    sub = pd.DataFrame(out_rows, columns=['id','PredictionString'])
    empties = (sub['PredictionString'].astype(str).str.len()==0).sum()
    mean_len = sub['PredictionString'].astype(str).str.len().mean()
    print(f'Align-safe beta0 snap002 done in {time.time()-t0:.1f}s. Empties={empties}, mean_len={mean_len:.2f}')
    return sub

sub_beta0 = decode_alignsafe_beta0()
out_path = 'submission_alignsafe_beta0_snap002_lambda012_K240_280_Lmax52_62.csv'
sub_beta0.to_csv(out_path, index=False)
pd.read_csv(out_path).to_csv('submission.csv', index=False)
print('submission.csv updated ->', out_path)

Loaded xlmr512s (1401, 512) offsets (1401, 512)


Loaded xlmr384 (1921, 384) offsets (1921, 384)


Align-safe beta0 snap002 done in 3.5s. Empties=0, mean_len=14.01
submission.csv updated -> submission_alignsafe_beta0_snap002_lambda012_K240_280_Lmax52_62.csv


In [19]:
# Token-level per-stream decode and per-id selection (512 single-seed vs 384), no MuRIL
import json, math, time, re, unicodedata as ud
import numpy as np, pandas as pd
from pathlib import Path

def load_npz_logits(npz_path):
    arr = np.load(npz_path, allow_pickle=True)
    keys = list(arr.keys())
    if 'start' in keys and 'end' in keys:
        return arr['start'], arr['end']
    for sk in ['start_logits','test_start_logits','start_logits_avg']:
        for ek in ['end_logits','test_end_logits','end_logits_avg']:
            if sk in keys and ek in keys: return arr[sk], arr[ek]
    raise ValueError(f'Unknown keys in {npz_path}: {keys}')

test = pd.read_csv('test.csv')
id2lang = dict(zip(test['id'].tolist(), test['language'].tolist()))
id2context = dict(zip(test['id'].tolist(), test['context'].astype(str).tolist()))
train = pd.read_csv('train.csv')
train['answer_text'] = train['answer_text'].astype(str).str.strip()
train['char_len'] = train['answer_text'].str.len().clip(lower=1)
def fit_log_normal_params(df):
    x = np.log(df['char_len'].values.astype(float))
    mu = float(x.mean()); sd = float(x.std());
    if sd <= 1e-6: sd = 1e-6
    return mu, sd
priors = {lang: dict(zip(['mu','sigma'], fit_log_normal_params(g))) for lang, g in train.groupby('language')}

ZW_CHARS = {'\u200B','\u200C','\u200D','\u2060','\ufeff'}
NBSP_SET = {'\u00A0','\u2002','\u2003','\u2004','\u2005','\u2006','\u2007','\u2008','\u2009','\u200A'}
HI_VIRAMA = '\u094D'; TA_PULLI='\u0BCD'; DANDA='\u0964'
def clean_span_text(s: str, lang: str) -> str:
    if not s: return ''
    s = ''.join(ch for ch in s if ch not in ZW_CHARS)
    s = ''.join(' ' if ch in NBSP_SET else ch for ch in s)
    s = re.sub(r'\s+', ' ', s).strip()
    s = re.sub(r'(?<=\d)[\s,._-](?=\d)', '', s)
    if s:
        last = s[-1]
        if ud.category(last) == 'Mn' or last in (HI_VIRAMA, TA_PULLI): s = s[:-1]
    if lang == 'hindi':
        s = s.replace(DANDA+DANDA, DANDA)
        if s.endswith(DANDA): s = s[:-1].rstrip()
    return s

def log_normal_logpdf_len(char_len: int, mu: float, sigma: float) -> float:
    Lc = max(1, int(char_len))
    x = math.log(Lc)
    return -0.5*((x-mu)/sigma)**2 - math.log(max(Lc,1e-6)) - math.log(sigma) - 0.5*math.log(2*math.pi)

def to_pair(t):
    try:
        if t is None: return (0,0)
        if isinstance(t,(list,tuple,np.ndarray)) and len(t)>=2:
            a = int(t[0]) if t[0] is not None else 0
            b = int(t[1]) if t[1] is not None else 0
            return (a,b)
    except Exception:
        pass
    return (0,0)

def build_qid_index(example_id_list):
    m = {}
    for i, qid in enumerate(example_id_list):
        m.setdefault(qid, []).append(i)
    return m

def decode_stream_token_level(start_logits, end_logits, example_id_list, offset_mapping, lambda_len=0.12, clip_prior=(-0.8,0.0),
                               K_hi=240, K_ta=280, Lmax_hi=52, Lmax_ta=62, shortlist=6):
    qid_index = build_qid_index(example_id_list)
    preds = {}  # qid -> (score, text)
    for qid in test['id'].tolist():
        lang = id2lang[qid]; ctx = id2context[qid]
        mu = priors[lang]['mu']; sigma = priors[lang]['sigma']
        Lmax = Lmax_hi if lang=='hindi' else Lmax_ta
        K = K_hi if lang=='hindi' else K_ta
        best_score = -1e18; best_text = None
        for fi in qid_index.get(qid, []):
            offs = offset_mapping[fi]
            M = len(offs) if hasattr(offs,'__len__') else start_logits.shape[1]
            s_log = start_logits[fi][:M]
            e_log = end_logits[fi][:M]
            # candidate starts: top-K among tokens that begin a valid span
            valid = []
            for ti in range(M):
                a,b = to_pair(offs[ti])
                if b > a and 0 <= a < len(ctx) and 1 <= b <= len(ctx):
                    valid.append(ti)
            if not valid: continue
            valid = np.array(valid, dtype=np.int32)
            s_top = valid[np.argsort(s_log[valid])[::-1][:K]]
            for si in s_top:
                a0, _ = to_pair(offs[si])
                end_lo = si; end_hi = M-1
                # restrict ends to within Lmax chars and valid offsets
                cand = []
                for ei in range(end_lo, M):
                    a2, b2 = to_pair(offs[ei])
                    if not (b2 > a2 and 0 <= a2 < len(ctx) and 1 <= b2 <= len(ctx)):
                        continue
                    clen = b2 - a0
                    if clen <= 0 or clen > Lmax:
                        continue
                    cand.append(ei)
                if not cand: continue
                cand = np.array(cand, dtype=np.int32)
                seg = e_log[cand]
                top_idx = cand[np.argsort(seg)[::-1][:shortlist]]
                for ei in top_idx:
                    a, b = to_pair(offs[si])[0], to_pair(offs[ei])[1]
                    if not (0 <= a < len(ctx) and 1 <= b <= len(ctx) and b > a):
                        continue
                    raw = float(s_log[si] + e_log[ei])
                    clen = b - a
                    lp = log_normal_logpdf_len(clen, mu, sigma)
                    lp = max(clip_prior[0], min(clip_prior[1], lp))
                    raw += lambda_len * lp
                    if raw > best_score:
                        best_score = raw
                        best_text = clean_span_text(ctx[a:b], lang) or ctx[a:b]
        if best_text is None:
            best_text = clean_span_text(ctx[:1], lang) or ctx[:1]
        preds[qid] = best_text
    return preds

# Load 512 single-seed (alignment-safe) stream
s512, e512 = load_npz_logits('xlmr_large_512_test_avg.npz')
eid_512 = json.loads(Path('xlmr_large_512_test_logits/test_example_id.json').read_text())
off_512 = np.load('xlmr_large_512_test_logits/test_offset_mapping.npy', allow_pickle=True)
print('512 single-seed:', s512.shape, 'mapping', len(eid_512))
# Load 384 stream
s384, e384 = load_npz_logits('xlmr_large_test_avg.npz')
eid_384 = json.loads(Path('xlmr_large_test_logits/test_example_id.json').read_text())
off_384 = np.load('xlmr_large_test_logits/test_offset_mapping.npy', allow_pickle=True)
print('384:', s384.shape, 'mapping', len(eid_384))

t0 = time.time()
preds_512 = decode_stream_token_level(s512, e512, eid_512, off_512, lambda_len=0.12, clip_prior=(-0.8,0.0),
                                      K_hi=240, K_ta=280, Lmax_hi=52, Lmax_ta=62, shortlist=6)
preds_384 = decode_stream_token_level(s384, e384, eid_384, off_384, lambda_len=0.12, clip_prior=(-0.8,0.0),
                                      K_hi=240, K_ta=280, Lmax_hi=52, Lmax_ta=62, shortlist=6)
print(f'Token-level per-stream decodes done in {time.time()-t0:.1f}s')

# Simple per-id selection: prefer 512 unless 384 gives shorter clean text within 20% length
rows = []
for qid in test['id'].tolist():
    t512 = preds_512.get(qid, '')
    t384 = preds_384.get(qid, '')
    if not t512:
        best = t384
    elif not t384:
        best = t512
    else:
        # Heuristic: pick the one closer to per-lang median length in log space
        lang = id2lang[qid]
        mu = priors[lang]['mu']
        def neg_ll(s):
            Lc = max(1, len(s)); return abs(math.log(Lc) - mu)
        d512, d384 = neg_ll(t512), neg_ll(t384)
        best = t512 if d512 <= d384 else t384
    rows.append((qid, best))
sub_tok = pd.DataFrame(rows, columns=['id','PredictionString'])
empties = (sub_tok['PredictionString'].astype(str).str.len()==0).sum()
mean_len = sub_tok['PredictionString'].astype(str).str.len().mean()
print('Token-level selection diagnostics: empties=', int(empties), 'mean_len=', round(float(mean_len),2))
out_tok = 'submission_tokenselect_512single_or_384_lambda012.csv'
sub_tok.to_csv(out_tok, index=False)
pd.read_csv(out_tok).to_csv('submission.csv', index=False)
print('submission.csv updated ->', out_tok)

512 single-seed: (1401, 512) mapping 1401


384: (1921, 384) mapping 1921


Token-level per-stream decodes done in 117.6s
Token-level selection diagnostics: empties= 0 mean_len= 10.9
submission.csv updated -> submission_tokenselect_512single_or_384_lambda012.csv


In [20]:
# Align-safe 2-stream char-fusion with tight length control (per expert). 512 single-seed + tiny 384; no MuRIL.
import json, math, time, re, unicodedata as ud
import numpy as np, pandas as pd
from pathlib import Path

test = pd.read_csv('test.csv')
id2lang = dict(zip(test['id'].tolist(), test['language'].tolist()))
id2context = dict(zip(test['id'].tolist(), test['context'].astype(str).tolist()))
train = pd.read_csv('train.csv')
train['answer_text'] = train['answer_text'].astype(str).str.strip()
train['char_len'] = train['answer_text'].str.len().clip(lower=1)
def fit_log_normal_params(df):
    x = np.log(df['char_len'].values.astype(float))
    mu = float(x.mean()); sd = float(x.std());
    if sd <= 1e-6: sd = 1e-6
    return mu, sd
priors = {lang: dict(zip(['mu','sigma'], fit_log_normal_params(g))) for lang, g in train.groupby('language')}

ZW_CHARS = {'\u200B','\u200C','\u200D','\u2060','\ufeff'}
NBSP_SET = {'\u00A0','\u2002','\u2003','\u2004','\u2005','\u2006','\u2007','\u2008','\u2009','\u200A'}
HI_VIRAMA = '\u094D'; TA_PULLI='\u0BCD'; DANDA='\u0964'
def clean_span_text(s: str, lang: str) -> str:
    if not s: return ''
    s = ''.join(ch for ch in s if ch not in ZW_CHARS)
    s = ''.join(' ' if ch in NBSP_SET else ch for ch in s)
    s = re.sub(r'\s+', ' ', s).strip()
    s = re.sub(r'(?<=\d)[\s,._-](?=\d)', '', s)
    if s:
        last = s[-1]
        if ud.category(last) == 'Mn' or last in (HI_VIRAMA, TA_PULLI): s = s[:-1]
    if lang == 'hindi':
        s = s.replace(DANDA+DANDA, DANDA)
        if s.endswith(DANDA): s = s[:-1].rstrip()
    return s

def log_normal_logpdf_len(char_len: int, mu: float, sigma: float) -> float:
    Lc = max(1, int(char_len))
    x = math.log(Lc)
    return -0.5*((x-mu)/sigma)**2 - math.log(max(Lc,1e-6)) - math.log(sigma) - 0.5*math.log(2*math.pi)

def to_pair(t):
    try:
        if t is None: return (0,0)
        if isinstance(t,(list,tuple,np.ndarray)) and len(t)>=2:
            a = int(t[0]) if t[0] is not None else 0
            b = int(t[1]) if t[1] is not None else 0
            return (a,b)
    except Exception:
        pass
    return (0,0)

def load_npz_logits(npz_path):
    arr = np.load(npz_path, allow_pickle=True)
    keys = list(arr.keys())
    if 'start' in keys and 'end' in keys:
        return arr['start'], arr['end']
    for sk in ['start_logits','test_start_logits','start_logits_avg']:
        for ek in ['end_logits','test_end_logits','end_logits_avg']:
            if sk in keys and ek in keys: return arr[sk], arr[ek]
    raise ValueError(f'Unknown keys in {npz_path}: {keys}')

def maxpool1d(x):
    if len(x) == 0: return x
    y = x.copy()
    if len(x) == 1: return y
    y[0] = max(x[0], x[1])
    for i in range(1, len(x)-1): y[i] = max(x[i-1], x[i], x[i+1])
    y[-1] = max(x[-2], x[-1])
    return y

def is_boundary_char(ch: str) -> bool:
    return ch.isspace() or ch in {'.', ',', '!', '?', DANDA, '"', "'", '(', ')', '[', ']', '{', '}', '“', '”', '‘', '’', '«', '»'}

def snap_span(ctx: str, S: np.ndarray, E: np.ndarray, a: int, b: int, delta: float = 0.02):
    base = float(S[a] + E[b])
    a2 = a
    for i in range(a-1, -1, -1):
        if is_boundary_char(ctx[i]):
            a2 = i+1; break
    b2 = b
    for j in range(b, len(ctx)):
        if is_boundary_char(ctx[j]):
            b2 = max(a2, j-1); break
    cand = float(S[a2] + E[b2])
    if cand >= base - delta:
        return a2, b2
    if a > 0 and b < len(ctx)-1 and ctx[a-1] in {'(', '"', "'"} and ctx[b+1] in {')', '"', "'"}:
        cand2 = float(S[a-1] + E[b+1])
        if cand2 >= base - delta and (b+1 - (a-1) <= (b - a) + 2):
            return a-1, b+1
    return a, b

# Streams: 512 single-seed test_avg (with its mapping) + 384
streams = [
    dict(name='xlmr512s', npz='xlmr_large_512_test_avg.npz', map_dir='xlmr_large_512_test_logits'),
    dict(name='xlmr384',  npz='xlmr_large_test_avg.npz',     map_dir='xlmr_large_test_logits'),
]
loaded = []
for s in streams:
    try:
        s_start, s_end = load_npz_logits(s['npz'])
        eid_path = Path(s['map_dir']) / 'test_example_id.json'
        offs_path = Path(s['map_dir']) / 'test_offset_mapping.npy'
        eid = json.loads(eid_path.read_text())
        offs = np.load(offs_path, allow_pickle=True)
        loaded.append((s['name'], s_start, s_end, eid, offs))
        print('Loaded', s['name'], s_start.shape, 'offsets', getattr(offs,'shape',None))
    except Exception as e:
        print('Skip stream', s['name'], '->', e)

def decode_alignsafe_tight():
    # Weights per language
    weights_hi = {'xlmr512s': 0.90, 'xlmr384': 0.10}
    weights_ta = {'xlmr512s': 1.00, 'xlmr384': 0.00}
    # Hyperparams
    lambda_len = 0.12
    clip_prior = (-0.8, 0.0)
    beta = 0.0
    shortlist = 6
    K_hi, K_ta = 200, 240
    Lmax_hi, Lmax_ta = 48, 58

    # index maps
    idx_maps = []
    for name, s_start, s_end, eid, offs in loaded:
        m = {}
        for i, qid in enumerate(eid): m.setdefault(qid, []).append(i)
        idx_maps.append(m)

    out_rows = []
    t0 = time.time()
    for qid in test['id'].tolist():
        lang = id2lang[qid]; ctx = id2context[qid]
        mu = priors[lang]['mu']; sigma = priors[lang]['sigma']
        Lmax = Lmax_hi if lang=='hindi' else Lmax_ta
        K = K_hi if lang=='hindi' else K_ta
        weights = weights_hi if lang=='hindi' else weights_ta

        S = np.zeros(len(ctx), dtype=np.float32)
        E = np.zeros(len(ctx), dtype=np.float32)

        for (name, s_start, s_end, eid, offs), m in zip(loaded, idx_maps):
            w = weights.get(name, 0.0)
            if w <= 0.0: continue
            for fi in m.get(qid, []):
                offs_raw = offs[fi]
                M = len(offs_raw) if hasattr(offs_raw,'__len__') else s_start.shape[1]
                s_log = s_start[fi][:M] * w
                e_log = s_end[fi][:M] * w
                for ti in range(M):
                    a, b = to_pair(offs_raw[ti])
                    if b > a and 0 <= a < len(ctx) and 1 <= b <= len(ctx):
                        S[a] += s_log[ti]
                        E[b-1] += e_log[ti]

        S = maxpool1d(S); E = maxpool1d(E)

        starts = np.argsort(S)[::-1][:K]
        best_score = -1e18
        best_span = (0, max(0, min(Lmax, len(ctx))-1))
        for si in starts:
            end_lo = si; end_hi = min(len(ctx)-1, si + Lmax - 1)
            if end_hi < end_lo: continue
            seg = E[end_lo:end_hi+1]
            top_rel = np.argsort(seg)[::-1][:shortlist]
            for rel in top_rel:
                ej = end_lo + int(rel)
                raw = float(S[si] + E[ej])
                clen = ej - si + 1
                lp = log_normal_logpdf_len(clen, mu, sigma)
                lp = max(clip_prior[0], min(clip_prior[1], lp))
                raw += lambda_len * lp
                if raw > best_score:
                    best_score = raw; best_span = (si, ej)

        a, b = best_span
        a, b = snap_span(ctx, S, E, a, b, delta=0.02)
        text = clean_span_text(ctx[a:b+1], lang)
        if not text:
            b2 = min(len(ctx), a+1)
            text = clean_span_text(ctx[a:b2], lang) or ctx[a:b2]
        out_rows.append((qid, text))

    sub = pd.DataFrame(out_rows, columns=['id','PredictionString'])
    empties = (sub['PredictionString'].astype(str).str.len()==0).sum()
    mean_len = sub['PredictionString'].astype(str).str.len().mean()
    print(f'Align-safe tight decode done in {time.time()-t0:.1f}s. Empties={empties}, mean_len={mean_len:.2f}')
    return sub

sub_tight = decode_alignsafe_tight()
out_tight = 'submission_alignsafe_tight_512single_384_wHI90_10_wTA100_0_lambda012_K200_240_Lmax48_58_delta002.csv'
sub_tight.to_csv(out_tight, index=False)
pd.read_csv(out_tight).to_csv('submission.csv', index=False)
print('submission.csv updated ->', out_tight)

Loaded xlmr512s (1401, 512) offsets (1401, 512)


Loaded xlmr384 (1921, 384) offsets (1921, 384)


Align-safe tight decode done in 3.2s. Empties=0, mean_len=13.80
submission.csv updated -> submission_alignsafe_tight_512single_384_wHI90_10_wTA100_0_lambda012_K200_240_Lmax48_58_delta002.csv


In [21]:
# Align-safe 512-only char-fusion (tight). Tamil 512-only; Hindi also 512-only. No 384, no MuRIL.
import json, math, time, re, unicodedata as ud
import numpy as np, pandas as pd
from pathlib import Path

test = pd.read_csv('test.csv')
id2lang = dict(zip(test['id'].tolist(), test['language'].tolist()))
id2context = dict(zip(test['id'].tolist(), test['context'].astype(str).tolist()))
train = pd.read_csv('train.csv')
train['answer_text'] = train['answer_text'].astype(str).str.strip()
train['char_len'] = train['answer_text'].str.len().clip(lower=1)
def fit_log_normal_params(df):
    x = np.log(df['char_len'].values.astype(float))
    mu = float(x.mean()); sd = float(x.std());
    if sd <= 1e-6: sd = 1e-6
    return mu, sd
priors = {lang: dict(zip(['mu','sigma'], fit_log_normal_params(g))) for lang, g in train.groupby('language')}

ZW_CHARS = {'\u200B','\u200C','\u200D','\u2060','\ufeff'}
NBSP_SET = {'\u00A0','\u2002','\u2003','\u2004','\u2005','\u2006','\u2007','\u2008','\u2009','\u200A'}
HI_VIRAMA='\u094D'; TA_PULLI='\u0BCD'; DANDA='\u0964'
def clean_span_text(s: str, lang: str) -> str:
    if not s: return ''
    s = ''.join(ch for ch in s if ch not in ZW_CHARS)
    s = ''.join(' ' if ch in NBSP_SET else ch for ch in s)
    s = re.sub(r'\s+', ' ', s).strip()
    s = re.sub(r'(?<=\d)[\s,._-](?=\d)', '', s)
    if s:
        last = s[-1]
        if ud.category(last) == 'Mn' or last in (HI_VIRAMA, TA_PULLI): s = s[:-1]
    if lang == 'hindi':
        s = s.replace(DANDA+DANDA, DANDA)
        if s.endswith(DANDA): s = s[:-1].rstrip()
    return s

def log_normal_logpdf_len(char_len: int, mu: float, sigma: float) -> float:
    Lc = max(1, int(char_len))
    x = math.log(Lc)
    return -0.5*((x-mu)/sigma)**2 - math.log(max(Lc,1e-6)) - math.log(sigma) - 0.5*math.log(2*math.pi)

def to_pair(t):
    try:
        if t is None: return (0,0)
        if isinstance(t,(list,tuple,np.ndarray)) and len(t)>=2:
            a = int(t[0]) if t[0] is not None else 0
            b = int(t[1]) if t[1] is not None else 0
            return (a,b)
    except Exception:
        pass
    return (0,0)

def load_npz_logits(npz_path):
    arr = np.load(npz_path, allow_pickle=True)
    keys = list(arr.keys())
    if 'start' in keys and 'end' in keys:
        return arr['start'], arr['end']
    for sk in ['start_logits','test_start_logits','start_logits_avg']:
        for ek in ['end_logits','test_end_logits','end_logits_avg']:
            if sk in keys and ek in keys: return arr[sk], arr[ek]
    raise ValueError(f'Unknown keys in {npz_path}: {keys}')

def maxpool1d(x):
    if len(x) == 0: return x
    y = x.copy()
    if len(x) == 1: return y
    y[0] = max(x[0], x[1])
    for i in range(1, len(x)-1): y[i] = max(x[i-1], x[i], x[i+1])
    y[-1] = max(x[-2], x[-1])
    return y

def is_boundary_char(ch: str) -> bool:
    return ch.isspace() or ch in {'.', ',', '!', '?', DANDA, '"', "'", '(', ')', '[', ']', '{', '}', '\u201c', '\u201d', '\u2018', '\u2019', '\u00ab', '\u00bb'}

def snap_span(ctx: str, S: np.ndarray, E: np.ndarray, a: int, b: int, delta: float = 0.02):
    base = float(S[a] + E[b])
    a2 = a
    for i in range(a-1, -1, -1):
        if is_boundary_char(ctx[i]):
            a2 = i+1; break
    b2 = b
    for j in range(b, len(ctx)):
        if is_boundary_char(ctx[j]):
            b2 = max(a2, j-1); break
    cand = float(S[a2] + E[b2])
    if cand >= base - delta:
        return a2, b2
    if a > 0 and b < len(ctx)-1 and ctx[a-1] in {'(', '"', "'"} and ctx[b+1] in {')', '"', "'"}:
        cand2 = float(S[a-1] + E[b+1])
        if cand2 >= base - delta and (b+1 - (a-1) <= (b - a) + 2):
            return a-1, b+1
    return a, b

# Load 512 single-seed stream only
s_start, s_end = load_npz_logits('xlmr_large_512_test_avg.npz')
eid = json.loads(Path('xlmr_large_512_test_logits/test_example_id.json').read_text())
offs_all = np.load('xlmr_large_512_test_logits/test_offset_mapping.npy', allow_pickle=True)
print('Loaded 512 single-seed:', s_start.shape, 'offsets', getattr(offs_all,'shape',None))

def decode_512only_tight():
    lambda_len = 0.10
    clip_prior = (-0.8, 0.0)
    K_hi, K_ta = 200, 240
    Lmax_hi, Lmax_ta = 48, 58
    shortlist = 6
    # index map
    m = {}
    for i, qid in enumerate(eid): m.setdefault(qid, []).append(i)
    out_rows = []
    t0 = time.time()
    for qid in test['id'].tolist():
        lang = id2lang[qid]; ctx = id2context[qid]
        mu = priors[lang]['mu']; sigma = priors[lang]['sigma']
        Lmax = Lmax_hi if lang=='hindi' else Lmax_ta
        K = K_hi if lang=='hindi' else K_ta
        S = np.zeros(len(ctx), dtype=np.float32)
        E = np.zeros(len(ctx), dtype=np.float32)
        for fi in m.get(qid, []):
            offs_raw = offs_all[fi]
            M = len(offs_raw) if hasattr(offs_raw,'__len__') else s_start.shape[1]
            s_log = s_start[fi][:M]
            e_log = s_end[fi][:M]
            for ti in range(M):
                a, b = to_pair(offs_raw[ti])
                if b > a and 0 <= a < len(ctx) and 1 <= b <= len(ctx):
                    S[a] += s_log[ti]
                    E[b-1] += e_log[ti]
        S = maxpool1d(S); E = maxpool1d(E)
        starts = np.argsort(S)[::-1][:K]
        best_score = -1e18; best_span = (0, max(0, min(Lmax, len(ctx))-1))
        for si in starts:
            end_lo = si; end_hi = min(len(ctx)-1, si + Lmax - 1)
            if end_hi < end_lo: continue
            seg = E[end_lo:end_hi+1]
            top_rel = np.argsort(seg)[::-1][:shortlist]
            for rel in top_rel:
                ej = end_lo + int(rel)
                raw = float(S[si] + E[ej])
                clen = ej - si + 1
                lp = log_normal_logpdf_len(clen, mu, sigma)
                lp = max(clip_prior[0], min(clip_prior[1], lp))
                raw += lambda_len * lp
                if raw > best_score:
                    best_score = raw; best_span = (si, ej)
        a, b = best_span
        a, b = snap_span(ctx, S, E, a, b, delta=0.02)
        text = clean_span_text(ctx[a:b+1], lang)
        if not text:
            b2 = min(len(ctx), a+1)
            text = clean_span_text(ctx[a:b2], lang) or ctx[a:b2]
        out_rows.append((qid, text))
    sub = pd.DataFrame(out_rows, columns=['id','PredictionString'])
    empties = (sub['PredictionString'].astype(str).str.len()==0).sum()
    mean_len = sub['PredictionString'].astype(str).str.len().mean()
    print(f'512-only tight decode done in {time.time()-t0:.1f}s. Empties={empties}, mean_len={mean_len:.2f}')
    return sub

sub_512tight = decode_512only_tight()
out_512tight = 'submission_512only_alignsafe_tight_lambda010_K200_240_Lmax48_58_delta002.csv'
sub_512tight.to_csv(out_512tight, index=False)
pd.read_csv(out_512tight).to_csv('submission.csv', index=False)
print('submission.csv updated ->', out_512tight)

Loaded 512 single-seed: (1401, 512) offsets (1401, 512)


512-only tight decode done in 2.5s. Empties=0, mean_len=14.40
submission.csv updated -> submission_512only_alignsafe_tight_lambda010_K200_240_Lmax48_58_delta002.csv


In [22]:
# 3-seed 512 + 384 (no MuRIL), tight settings per expert (beta=0, K/Lmax tight, delta=0.02, expanded boundaries)
import json, math, time, re, unicodedata as ud
import numpy as np, pandas as pd
from pathlib import Path

test = pd.read_csv('test.csv')
id2lang = dict(zip(test['id'].tolist(), test['language'].tolist()))
id2context = dict(zip(test['id'].tolist(), test['context'].astype(str).tolist()))
train = pd.read_csv('train.csv')
train['answer_text'] = train['answer_text'].astype(str).str.strip()
train['char_len'] = train['answer_text'].str.len().clip(lower=1)
def fit_log_normal_params(df):
    x = np.log(df['char_len'].values.astype(float))
    mu = float(x.mean()); sd = float(x.std());
    if sd <= 1e-6: sd = 1e-6
    return mu, sd
priors = {lang: dict(zip(['mu','sigma'], fit_log_normal_params(g))) for lang, g in train.groupby('language')}

ZW_CHARS = {'\u200B','\u200C','\u200D','\u2060','\ufeff'}
NBSP_SET = {'\u00A0','\u2002','\u2003','\u2004','\u2005','\u2006','\u2007','\u2008','\u2009','\u200A'}
HI_VIRAMA='\u094D'; TA_PULLI='\u0BCD'; DANDA='\u0964'
def clean_span_text(s: str, lang: str) -> str:
    if not s: return ''
    s = ''.join(ch for ch in s if ch not in ZW_CHARS)
    s = ''.join(' ' if ch in NBSP_SET else ch for ch in s)
    s = re.sub(r'\s+', ' ', s).strip()
    s = re.sub(r'(?<=\d)[\s,._-](?=\d)', '', s)
    if s:
        last = s[-1]
        if ud.category(last) == 'Mn' or last in (HI_VIRAMA, TA_PULLI): s = s[:-1]
    if lang == 'hindi':
        s = s.replace(DANDA+DANDA, DANDA)
        if s.endswith(DANDA): s = s[:-1].rstrip()
    return s

def log_normal_logpdf_len(char_len: int, mu: float, sigma: float) -> float:
    Lc = max(1, int(char_len))
    x = math.log(Lc)
    return -0.5*((x-mu)/sigma)**2 - math.log(max(Lc,1e-6)) - math.log(sigma) - 0.5*math.log(2*math.pi)

def to_pair(t):
    try:
        if t is None: return (0,0)
        if isinstance(t,(list,tuple,np.ndarray)) and len(t)>=2:
            a = int(t[0]) if t[0] is not None else 0
            b = int(t[1]) if t[1] is not None else 0
            return (a,b)
    except Exception:
        pass
    return (0,0)

def load_npz_logits(npz_path):
    arr = np.load(npz_path, allow_pickle=True)
    keys = list(arr.keys())
    if 'start' in keys and 'end' in keys:
        return arr['start'], arr['end']
    for sk in ['start_logits','test_start_logits','start_logits_avg']:
        for ek in ['end_logits','test_end_logits','end_logits_avg']:
            if sk in keys and ek in keys: return arr[sk], arr[ek]
    raise ValueError(f'Unknown keys in {npz_path}: {keys}')

def maxpool1d(x):
    if len(x) == 0: return x
    y = x.copy()
    if len(x) == 1: return y
    y[0] = max(x[0], x[1])
    for i in range(1, len(x)-1): y[i] = max(x[i-1], x[i], x[i+1])
    y[-1] = max(x[-2], x[-1])
    return y

def is_boundary_char(ch: str) -> bool:
    return ch.isspace() or ch in {'.', ',', '!', '?', DANDA, '"', "'", '(', ')', '[', ']', '{', '}', '\u201c', '\u201d', '\u2018', '\u2019', '\u00ab', '\u00bb'}

def snap_span(ctx: str, S: np.ndarray, E: np.ndarray, a: int, b: int, delta: float = 0.02):
    base = float(S[a] + E[b])
    a2 = a
    for i in range(a-1, -1, -1):
        if is_boundary_char(ctx[i]):
            a2 = i+1; break
    b2 = b
    for j in range(b, len(ctx)):
        if is_boundary_char(ctx[j]):
            b2 = max(a2, j-1); break
    cand = float(S[a2] + E[b2])
    if cand >= base - delta:
        return a2, b2
    if a > 0 and b < len(ctx)-1 and ctx[a-1] in {'(', '"', "'"} and ctx[b+1] in {')', '"', "'"}:
        cand2 = float(S[a-1] + E[b+1])
        if cand2 >= base - delta and (b+1 - (a-1) <= (b - a) + 2):
            return a-1, b+1
    return a, b

# Streams: 512 (3-seed avg) + 384, no MuRIL
streams = [
    dict(name='xlmr512', npz='xlmr_large_512_3seeds_avg.npz', map_dir='xlmr_large_512_test_logits'),
    dict(name='xlmr384', npz='xlmr_large_test_avg.npz',       map_dir='xlmr_large_test_logits'),
]
loaded = []
for s in streams:
    try:
        s_start, s_end = load_npz_logits(s['npz'])
        eid_path = Path(s['map_dir']) / 'test_example_id.json'
        offs_path = Path(s['map_dir']) / 'test_offset_mapping.npy'
        eid = json.loads(eid_path.read_text())
        offs = np.load(offs_path, allow_pickle=True)
        loaded.append((s['name'], s_start, s_end, eid, offs))
        print('Loaded', s['name'], s_start.shape, 'offsets', getattr(offs,'shape',None))
    except Exception as e:
        print('Skip stream', s['name'], '->', e)

def decode_3seed_tight():
    # Weights per language
    weights_hi = {'xlmr512': 0.90, 'xlmr384': 0.10}
    weights_ta = {'xlmr512': 1.00, 'xlmr384': 0.00}
    # Hyperparams
    lambda_len = 0.12
    clip_prior = (-0.8, 0.0)
    beta = 0.0
    shortlist = 6
    K_hi, K_ta = 200, 240
    Lmax_hi, Lmax_ta = 48, 58

    # index maps per stream
    idx_maps = []
    for name, s_start, s_end, eid, offs in loaded:
        m = {}
        for i, qid in enumerate(eid): m.setdefault(qid, []).append(i)
        idx_maps.append(m)

    out_rows = []
    t0 = time.time()
    for qid in test['id'].tolist():
        lang = id2lang[qid]; ctx = id2context[qid]
        mu = priors[lang]['mu']; sigma = priors[lang]['sigma']
        Lmax = Lmax_hi if lang=='hindi' else Lmax_ta
        K = K_hi if lang=='hindi' else K_ta
        weights = weights_hi if lang=='hindi' else weights_ta

        S = np.zeros(len(ctx), dtype=np.float32)
        E = np.zeros(len(ctx), dtype=np.float32)

        for (name, s_start, s_end, eid, offs), m in zip(loaded, idx_maps):
            w = weights.get(name, 0.0)
            if w <= 0.0: continue
            for fi in m.get(qid, []):
                offs_raw = offs[fi]
                M = len(offs_raw) if hasattr(offs_raw,'__len__') else s_start.shape[1]
                s_log = s_start[fi][:M] * w
                e_log = s_end[fi][:M] * w
                for ti in range(M):
                    a, b = to_pair(offs_raw[ti])
                    if b > a and 0 <= a < len(ctx) and 1 <= b <= len(ctx):
                        S[a] += s_log[ti]
                        E[b-1] += e_log[ti]

        S = maxpool1d(S); E = maxpool1d(E)

        starts = np.argsort(S)[::-1][:K]
        best_score = -1e18
        best_span = (0, max(0, min(Lmax, len(ctx))-1))
        for si in starts:
            end_lo = si; end_hi = min(len(ctx)-1, si + Lmax - 1)
            if end_hi < end_lo: continue
            seg = E[end_lo:end_hi+1]
            top_rel = np.argsort(seg)[::-1][:shortlist]
            for rel in top_rel:
                ej = end_lo + int(rel)
                raw = float(S[si] + E[ej])
                clen = ej - si + 1
                lp = log_normal_logpdf_len(clen, mu, sigma)
                lp = max(clip_prior[0], min(clip_prior[1], lp))
                raw += lambda_len * lp
                if raw > best_score:
                    best_score = raw; best_span = (si, ej)

        a, b = best_span
        a, b = snap_span(ctx, S, E, a, b, delta=0.02)
        text = clean_span_text(ctx[a:b+1], lang)
        if not text:
            b2 = min(len(ctx), a+1)
            text = clean_span_text(ctx[a:b2], lang) or ctx[a:b2]
        out_rows.append((qid, text))

    sub = pd.DataFrame(out_rows, columns=['id','PredictionString'])
    empties = (sub['PredictionString'].astype(str).str.len()==0).sum()
    mean_len = sub['PredictionString'].astype(str).str.len().mean()
    print(f'3-seed+384 tight decode done in {time.time()-t0:.1f}s. Empties={empties}, mean_len={mean_len:.2f}')
    return sub

sub_3tight = decode_3seed_tight()
out_3tight = 'submission_3seed_512_plus_384_tight_wHI90_10_wTA100_0_lambda012_K200_240_Lmax48_58_delta002.csv'
sub_3tight.to_csv(out_3tight, index=False)
pd.read_csv(out_3tight).to_csv('submission.csv', index=False)
print('submission.csv updated ->', out_3tight)

Loaded xlmr512 (1401, 512) offsets (1401, 512)


Loaded xlmr384 (1921, 384) offsets (1921, 384)


In [23]:
# 512 single-seed char-fusion with question-aware Lmax cap, tighter search, expanded boundaries, beta=0
import json, math, time, re, unicodedata as ud
import numpy as np, pandas as pd
from pathlib import Path

test = pd.read_csv('test.csv')
id2lang = dict(zip(test['id'].tolist(), test['language'].tolist()))
id2context = dict(zip(test['id'].tolist(), test['context'].astype(str).tolist()))
id2question = dict(zip(test['id'].tolist(), test.get('question', pd.Series(['']*len(test))).astype(str).tolist()))
train = pd.read_csv('train.csv')
train['answer_text'] = train['answer_text'].astype(str).str.strip()
train['char_len'] = train['answer_text'].str.len().clip(lower=1)
def fit_log_normal_params(df):
    x = np.log(df['char_len'].values.astype(float))
    mu = float(x.mean()); sd = float(x.std());
    if sd <= 1e-6: sd = 1e-6
    return mu, sd
priors = {lang: dict(zip(['mu','sigma'], fit_log_normal_params(g))) for lang, g in train.groupby('language')}

ZW_CHARS = {'\u200B','\u200C','\u200D','\u2060','\ufeff'}
NBSP_SET = {'\u00A0','\u2002','\u2003','\u2004','\u2005','\u2006','\u2007','\u2008','\u2009','\u200A'}
HI_VIRAMA='\u094D'; TA_PULLI='\u0BCD'; DANDA='\u0964'
def clean_span_text(s: str, lang: str) -> str:
    if not s: return ''
    s = ''.join(ch for ch in s if ch not in ZW_CHARS)
    s = ''.join(' ' if ch in NBSP_SET else ch for ch in s)
    s = re.sub(r'\s+', ' ', s).strip()
    s = re.sub(r'(?<=\d)[\s,._-](?=\d)', '', s)
    if s:
        last = s[-1]
        if ud.category(last) == 'Mn' or last in (HI_VIRAMA, TA_PULLI): s = s[:-1]
    if lang == 'hindi':
        s = s.replace(DANDA+DANDA, DANDA)
        if s.endswith(DANDA): s = s[:-1].rstrip()
    return s

def log_normal_logpdf_len(char_len: int, mu: float, sigma: float) -> float:
    Lc = max(1, int(char_len))
    x = math.log(Lc)
    return -0.5*((x-mu)/sigma)**2 - math.log(max(Lc,1e-6)) - math.log(sigma) - 0.5*math.log(2*math.pi)

def to_pair(t):
    try:
        if t is None: return (0,0)
        if isinstance(t,(list,tuple,np.ndarray)) and len(t)>=2:
            a = int(t[0]) if t[0] is not None else 0
            b = int(t[1]) if t[1] is not None else 0
            return (a,b)
    except Exception:
        pass
    return (0,0)

def load_npz_logits(npz_path):
    arr = np.load(npz_path, allow_pickle=True)
    keys = list(arr.keys())
    if 'start' in keys and 'end' in keys:
        return arr['start'], arr['end']
    for sk in ['start_logits','test_start_logits','start_logits_avg']:
        for ek in ['end_logits','test_end_logits','end_logits_avg']:
            if sk in keys and ek in keys: return arr[sk], arr[ek]
    raise ValueError(f'Unknown keys in {npz_path}: {keys}')

def maxpool1d(x):
    if len(x) == 0: return x
    y = x.copy()
    if len(x) == 1: return y
    y[0] = max(x[0], x[1])
    for i in range(1, len(x)-1): y[i] = max(x[i-1], x[i], x[i+1])
    y[-1] = max(x[-2], x[-1])
    return y

def is_boundary_char(ch: str) -> bool:
    return ch.isspace() or ch in {'.', ',', '!', '?', DANDA, '"', "'", '(', ')', '[', ']', '{', '}', '\u201c', '\u201d', '\u2018', '\u2019', '\u00ab', '\u00bb'}

def snap_span(ctx: str, S: np.ndarray, E: np.ndarray, a: int, b: int, delta: float = 0.02):
    base = float(S[a] + E[b])
    a2 = a
    for i in range(a-1, -1, -1):
        if is_boundary_char(ctx[i]):
            a2 = i+1; break
    b2 = b
    for j in range(b, len(ctx)):
        if is_boundary_char(ctx[j]):
            b2 = max(a2, j-1); break
    cand = float(S[a2] + E[b2])
    if cand >= base - delta:
        return a2, b2
    if a > 0 and b < len(ctx)-1 and ctx[a-1] in {'(', '"', "'"} and ctx[b+1] in {')', '"', "'"}:
        cand2 = float(S[a-1] + E[b+1])
        if cand2 >= base - delta and (b+1 - (a-1) <= (b - a) + 2):
            return a-1, b+1
    return a, b

# Question-aware cap detection
NUM_PAT = re.compile(r'\d')
DATE_PAT = re.compile(r'(\d{1,2}[\-/\.]\d{1,2}|\d{4})')
HI_NUM_WORDS = {'कब','कितने','कितना','संख्या','दिन','साल','तारीख'}
TA_NUM_WORDS = {'எப்போது','எத்தனை','எவ்வளவு','எண்','ஆண்டு','தேதி'}

def cap_Lmax_by_question(qid: str, base_Lmax: int) -> int:
    q = id2question.get(qid, '')
    lang = id2lang[qid]
    has_num = bool(NUM_PAT.search(q)) or bool(DATE_PAT.search(q))
    if lang == 'hindi':
        for w in HI_NUM_WORDS:
            if w in q: has_num = True; break
    else:
        for w in TA_NUM_WORDS:
            if w in q: has_num = True; break
    if has_num:
        return min(base_Lmax, 20)
    return base_Lmax

# Load 512 single-seed only (alignment-safe)
s_start, s_end = load_npz_logits('xlmr_large_512_test_avg.npz')
eid = json.loads(Path('xlmr_large_512_test_logits/test_example_id.json').read_text())
offs_all = np.load('xlmr_large_512_test_logits/test_offset_mapping.npy', allow_pickle=True)
print('Loaded 512 single-seed:', s_start.shape, 'offsets', getattr(offs_all,'shape',None))

def decode_512_alignsafe_qcap():
    lambda_len = 0.12
    clip_prior = (-0.8, 0.0)
    K_hi, K_ta = 180, 200
    Lmax_hi, Lmax_ta = 46, 56
    shortlist = 5
    # index map
    m = {}
    for i, qid in enumerate(eid): m.setdefault(qid, []).append(i)
    out_rows = []
    t0 = time.time()
    for qid in test['id'].tolist():
        lang = id2lang[qid]; ctx = id2context[qid]
        mu = priors[lang]['mu']; sigma = priors[lang]['sigma']
        Lmax_base = Lmax_hi if lang=='hindi' else Lmax_ta
        Lmax = cap_Lmax_by_question(qid, Lmax_base)
        K = K_hi if lang=='hindi' else K_ta
        S = np.zeros(len(ctx), dtype=np.float32)
        E = np.zeros(len(ctx), dtype=np.float32)
        for fi in m.get(qid, []):
            offs_raw = offs_all[fi]
            M = len(offs_raw) if hasattr(offs_raw,'__len__') else s_start.shape[1]
            s_log = s_start[fi][:M]
            e_log = s_end[fi][:M]
            for ti in range(M):
                a, b = to_pair(offs_raw[ti])
                if b > a and 0 <= a < len(ctx) and 1 <= b <= len(ctx):
                    S[a] += s_log[ti]
                    E[b-1] += e_log[ti]
        S = maxpool1d(S); E = maxpool1d(E)
        starts = np.argsort(S)[::-1][:K]
        best_score = -1e18; best_span = (0, max(0, min(Lmax, len(ctx))-1))
        for si in starts:
            end_lo = si; end_hi = min(len(ctx)-1, si + Lmax - 1)
            if end_hi < end_lo: continue
            seg = E[end_lo:end_hi+1]
            top_rel = np.argsort(seg)[::-1][:shortlist]
            for rel in top_rel:
                ej = end_lo + int(rel)
                raw = float(S[si] + E[ej])
                clen = ej - si + 1
                lp = log_normal_logpdf_len(clen, mu, sigma)
                lp = max(clip_prior[0], min(clip_prior[1], lp))
                raw += lambda_len * lp
                if raw > best_score:
                    best_score = raw; best_span = (si, ej)
        a, b = best_span
        a, b = snap_span(ctx, S, E, a, b, delta=0.02)
        text = clean_span_text(ctx[a:b+1], lang)
        if not text:
            b2 = min(len(ctx), a+1)
            text = clean_span_text(ctx[a:b2], lang) or ctx[a:b2]
        out_rows.append((qid, text))
    sub = pd.DataFrame(out_rows, columns=['id','PredictionString'])
    empties = (sub['PredictionString'].astype(str).str.len()==0).sum()
    mean_len = sub['PredictionString'].astype(str).str.len().mean()
    print(f'512 align-safe q-cap decode done in {time.time()-t0:.1f}s. Empties={empties}, mean_len={mean_len:.2f}')
    return sub

sub_qcap = decode_512_alignsafe_qcap()
out_qcap = 'submission_512_alignsafe_qcap_lambda012_K180_200_Lmax46_56_short5_delta002.csv'
sub_qcap.to_csv(out_qcap, index=False)
pd.read_csv(out_qcap).to_csv('submission.csv', index=False)
print('submission.csv updated ->', out_qcap)

Loaded 512 single-seed: (1401, 512) offsets (1401, 512)


512 align-safe q-cap decode done in 2.5s. Empties=0, mean_len=13.91
submission.csv updated -> submission_512_alignsafe_qcap_lambda012_K180_200_Lmax46_56_short5_delta002.csv


In [24]:
# Single-stream 3-seed 512 char-fusion with boundary snap (alignment as before), no MuRIL/384
import json, math, time, re, unicodedata as ud
import numpy as np, pandas as pd
from pathlib import Path

test = pd.read_csv('test.csv')
id2lang = dict(zip(test['id'].tolist(), test['language'].tolist()))
id2context = dict(zip(test['id'].tolist(), test['context'].astype(str).tolist()))
train = pd.read_csv('train.csv')
train['answer_text'] = train['answer_text'].astype(str).str.strip()
train['char_len'] = train['answer_text'].str.len().clip(lower=1)
def fit_log_normal_params(df):
    x = np.log(df['char_len'].values.astype(float))
    mu = float(x.mean()); sd = float(x.std());
    if sd <= 1e-6: sd = 1e-6
    return mu, sd
priors = {lang: dict(zip(['mu','sigma'], fit_log_normal_params(g))) for lang, g in train.groupby('language')}

ZW_CHARS = {'\u200B','\u200C','\u200D','\u2060','\ufeff'}
NBSP_SET = {'\u00A0','\u2002','\u2003','\u2004','\u2005','\u2006','\u2007','\u2008','\u2009','\u200A'}
HI_VIRAMA='\u094D'; TA_PULLI='\u0BCD'; DANDA='\u0964'
def clean_span_text(s: str, lang: str) -> str:
    if not s: return ''
    s = ''.join(ch for ch in s if ch not in ZW_CHARS)
    s = ''.join(' ' if ch in NBSP_SET else ch for ch in s)
    s = re.sub(r'\s+', ' ', s).strip()
    s = re.sub(r'(?<=\d)[\s,._-](?=\d)', '', s)
    if s:
        last = s[-1]
        if ud.category(last) == 'Mn' or last in (HI_VIRAMA, TA_PULLI): s = s[:-1]
    if lang == 'hindi':
        s = s.replace(DANDA+DANDA, DANDA)
        if s.endswith(DANDA): s = s[:-1].rstrip()
    return s

def log_normal_logpdf_len(char_len: int, mu: float, sigma: float) -> float:
    Lc = max(1, int(char_len))
    x = math.log(Lc)
    return -0.5*((x-mu)/sigma)**2 - math.log(max(Lc,1e-6)) - math.log(sigma) - 0.5*math.log(2*math.pi)

def to_pair(t):
    try:
        if t is None: return (0,0)
        if isinstance(t,(list,tuple,np.ndarray)) and len(t)>=2:
            a = int(t[0]) if t[0] is not None else 0
            b = int(t[1]) if t[1] is not None else 0
            return (a,b)
    except Exception:
        pass
    return (0,0)

def maxpool1d(x):
    if len(x) == 0: return x
    y = x.copy()
    if len(x) == 1: return y
    y[0] = max(x[0], x[1])
    for i in range(1, len(x)-1): y[i] = max(x[i-1], x[i], x[i+1])
    y[-1] = max(x[-2], x[-1])
    return y

def is_boundary_char(ch: str) -> bool:
    return ch.isspace() or ch in {'.', ',', '!', '?', DANDA, '"', "'", '(', ')', '[', ']', '{', '}', '\u201c', '\u201d', '\u2018', '\u2019', '\u00ab', '\u00bb'}

def snap_span(ctx: str, S: np.ndarray, E: np.ndarray, a: int, b: int, delta: float = 0.02):
    base = float(S[a] + E[b])
    a2 = a
    for i in range(a-1, -1, -1):
        if is_boundary_char(ctx[i]):
            a2 = i+1; break
    b2 = b
    for j in range(b, len(ctx)):
        if is_boundary_char(ctx[j]):
            b2 = max(a2, j-1); break
    cand = float(S[a2] + E[b2])
    if cand >= base - delta:
        return a2, b2
    if a > 0 and b < len(ctx)-1 and ctx[a-1] in {'(', '"', "'"} and ctx[b+1] in {')', '"', "'"}:
        cand2 = float(S[a-1] + E[b+1])
        if cand2 >= base - delta and (b+1 - (a-1) <= (b - a) + 2):
            return a-1, b+1
    return a, b

# Load 3-seed 512 logits and 512 mapping
s512, e512 = np.load('xlmr_large_512_3seeds_avg.npz', allow_pickle=True)['start'], np.load('xlmr_large_512_3seeds_avg.npz', allow_pickle=True)['end']
eid_512 = json.loads(Path('xlmr_large_512_test_logits/test_example_id.json').read_text())
off_512 = np.load('xlmr_large_512_test_logits/test_offset_mapping.npy', allow_pickle=True)
print('Loaded 3-seed 512:', s512.shape, 'mapping', len(eid_512))

# Build qid -> feat indices
qid_to_feat_idx = {}
for i, qid in enumerate(eid_512):
    qid_to_feat_idx.setdefault(qid, []).append(i)

def decode_3seed_single(lambda_len=0.12, clip_prior=(-0.8,0.0), nbest_hi=200, nbest_ta=240, Lmax_hi=50, Lmax_ta=60):
    t0 = time.time()
    out_rows = []
    for qid in test['id'].tolist():
        lang = id2lang[qid]; ctx = id2context[qid]
        mu = priors[lang]['mu']; sigma = priors[lang]['sigma']
        Lmax = Lmax_hi if lang=='hindi' else Lmax_ta
        K = nbest_hi if lang=='hindi' else nbest_ta
        S = np.zeros(len(ctx), dtype=np.float32)
        E = np.zeros(len(ctx), dtype=np.float32)
        for fi in qid_to_feat_idx.get(qid, []):
            offs_raw = off_512[fi]
            M = len(offs_raw) if hasattr(offs_raw,'__len__') else s512.shape[1]
            s_log = s512[fi][:M]
            e_log = e512[fi][:M]
            for ti in range(M):
                a,b = to_pair(offs_raw[ti])
                if b > a and 0 <= a < len(ctx) and 1 <= b <= len(ctx):
                    S[a] += s_log[ti]
                    E[b-1] += e_log[ti]
        S = maxpool1d(S); E = maxpool1d(E)
        if len(S) == 0:
            out_rows.append((qid, ctx[:1] if len(ctx)>0 else '')); continue
        starts = np.argsort(S)[::-1][:K]
        best_score = -1e18; best_span = (0, max(0, min(Lmax, len(ctx))-1))
        for si in starts:
            end_lo = si; end_hi = min(len(ctx)-1, si + Lmax - 1)
            if end_hi < end_lo: continue
            seg = E[end_lo:end_hi+1]
            ej = end_lo + int(np.argmax(seg))
            raw = float(S[si] + E[ej])
            clen = ej - si + 1
            lp = log_normal_logpdf_len(clen, mu, sigma)
            lp = max(clip_prior[0], min(clip_prior[1], lp))
            raw += lambda_len * lp
            if raw > best_score:
                best_score = raw; best_span = (si, ej)
        a,b = best_span
        a,b = snap_span(ctx, S, E, a, b, delta=0.02)
        text = clean_span_text(ctx[a:b+1], lang)
        if not text:
            b2 = min(len(ctx), a+1); text = clean_span_text(ctx[a:b2], lang) or ctx[a:b2]
        out_rows.append((qid, text))
    sub = pd.DataFrame(out_rows, columns=['id','PredictionString'])
    empties = (sub['PredictionString'].astype(str).str.len()==0).sum()
    mean_len = sub['PredictionString'].astype(str).str.len().mean()
    print(f'3-seed single-stream snap decode done in {time.time()-t0:.1f}s. Empties={empties}, mean_len={mean_len:.2f}')
    return sub

sub3s = decode_3seed_single(lambda_len=0.12, nbest_hi=200, nbest_ta=240, Lmax_hi=50, Lmax_ta=60)
out3s = 'submission_3seed512_single_charfusion_snap_lambda012_K200_240_Lmax50_60_delta002.csv'
sub3s.to_csv(out3s, index=False)
pd.read_csv(out3s).to_csv('submission.csv', index=False)
print('submission.csv updated ->', out3s)

Loaded 3-seed 512: (1401, 512) mapping 1401


3-seed single-stream snap decode done in 2.3s. Empties=0, mean_len=11.25
submission.csv updated -> submission_3seed512_single_charfusion_snap_lambda012_K200_240_Lmax50_60_delta002.csv


In [25]:
# Consensus ensemble over multiple healthy submissions via average word-level Jaccard
import pandas as pd, numpy as np, math, json, time

test = pd.read_csv('test.csv')
id_list = test['id'].tolist()
lang_map = dict(zip(test['id'], test['language']))

# Candidate submissions (only healthy diagnostics, no long-drift variants)
cand_files = [
    'submission_charfusion_512_lambda015.csv',
    'submission_384only_charfusion_lambda015.csv',
    'submission_primary_multistream_snap_hi85_10_5_ta97_3_0_lambda015.csv',
    'submission_third_multistream_drop_muril_snap_lambda010.csv',
    'submission_tokenselect_512single_or_384_lambda012.csv',
    'submission_3seed512_single_charfusion_snap_lambda012_K200_240_Lmax50_60_delta002.csv',
]

# Load all candidates that exist
cands = []
for f in cand_files:
    try:
        df = pd.read_csv(f)
        if set(df.columns) >= {'id','PredictionString'} and len(df)==len(test):
            df = df[['id','PredictionString']].copy()
            df['PredictionString'] = df['PredictionString'].astype(str)
            cands.append((f, df.set_index('id')['PredictionString'].to_dict()))
            print('Loaded candidate:', f)
        else:
            print('Skip (shape/cols mismatch):', f)
    except Exception as e:
        print('Skip (read error):', f, '->', e)

assert len(cands) >= 3, 'Not enough candidate submissions loaded for consensus'

def words(s):
    return [w for w in str(s).strip().split() if len(w)>0]

def jaccard(a, b):
    wa, wb = set(words(a)), set(words(b))
    if not wa and not wb: return 1.0
    if not wa or not wb: return 0.0
    inter = len(wa & wb); uni = len(wa | wb)
    return inter / uni if uni>0 else 0.0

# Per-language log-normal priors
train = pd.read_csv('train.csv')
train['answer_text'] = train['answer_text'].astype(str).str.strip()
train['char_len'] = train['answer_text'].str.len().clip(lower=1)
def fit_log_normal_params(df):
    x = np.log(df['char_len'].values.astype(float))
    mu = float(x.mean()); sd = float(x.std());
    if sd <= 1e-6: sd = 1e-6
    return mu, sd
priors = {lang: dict(zip(['mu','sigma'], fit_log_normal_params(g))) for lang, g in train.groupby('language')}

def len_prior_penalty(s, lang):
    L = max(1, len(str(s)))
    mu = priors.get(lang, {}).get('mu', 2.3)
    return -abs(math.log(L) - mu)  # higher is better (closer to mean)

# Consensus: for each id, score each candidate by average Jaccard vs others; tie-break by len prior proximity
out_rows = []
t0 = time.time()
for qid in id_list:
    lang = lang_map[qid]
    texts = [d[qid] for _, d in cands]
    best_idx, best_score, best_lp = -1, -1e18, -1e18
    for i, ti in enumerate(texts):
        # avg Jaccard vs others
        if len(texts) == 1:
            avg_j = 0.0
        else:
            s = 0.0; cnt = 0
            for j, tj in enumerate(texts):
                if i==j: continue
                s += jaccard(ti, tj); cnt += 1
            avg_j = s / max(1, cnt)
        lp = len_prior_penalty(ti, lang)
        score = avg_j + 0.02 * lp  # small regularization toward prior
        if score > best_score or (abs(score-best_score) < 1e-9 and lp > best_lp):
            best_score, best_idx, best_lp = score, i, lp
    out_rows.append((qid, texts[best_idx]))

sub = pd.DataFrame(out_rows, columns=['id','PredictionString'])
empties = (sub['PredictionString'].astype(str).str.len()==0).sum()
mean_len = sub['PredictionString'].astype(str).str.len().mean()
print(f'Consensus built from {len(cands)} submissions in {time.time()-t0:.2f}s. Empties={empties}, mean_len={mean_len:.2f}')

out_path = 'submission_consensus_avgjacc_lenreg002.csv'
sub.to_csv(out_path, index=False)
pd.read_csv(out_path).to_csv('submission.csv', index=False)
print('submission.csv updated ->', out_path)

Loaded candidate: submission_charfusion_512_lambda015.csv
Loaded candidate: submission_384only_charfusion_lambda015.csv
Loaded candidate: submission_primary_multistream_snap_hi85_10_5_ta97_3_0_lambda015.csv
Loaded candidate: submission_third_multistream_drop_muril_snap_lambda010.csv
Loaded candidate: submission_tokenselect_512single_or_384_lambda012.csv
Loaded candidate: submission_3seed512_single_charfusion_snap_lambda012_K200_240_Lmax50_60_delta002.csv


Consensus built from 6 submissions in 0.01s. Empties=0, mean_len=10.78
submission.csv updated -> submission_consensus_avgjacc_lenreg002.csv


In [27]:
# Per-language consensus with majority override and length regularization
import pandas as pd, numpy as np, math, time

test = pd.read_csv('test.csv')
id_list = test['id'].tolist()
lang_map = dict(zip(test['id'], test['language']))

# Global candidate pool (healthy only)
all_cands = [
    'submission_charfusion_512_lambda015.csv',
    'submission_384only_charfusion_lambda015.csv',
    'submission_primary_multistream_snap_hi85_10_5_ta97_3_0_lambda015.csv',
    'submission_third_multistream_drop_muril_snap_lambda010.csv',
    'submission_tokenselect_512single_or_384_lambda012.csv',
    'submission_3seed512_single_charfusion_snap_lambda012_K200_240_Lmax50_60_delta002.csv',
]

# Tamil-restricted candidate set (avoid 384 pull); include third_variant optionally if exists
ta_pref = [
    'submission_charfusion_512_lambda015.csv',
    'submission_3seed512_single_charfusion_snap_lambda012_K200_240_Lmax50_60_delta002.csv',
    'submission_tokenselect_512single_or_384_lambda012.csv',
    'submission_third_multistream_drop_muril_snap_lambda010.csv',
]

def load_cands(paths):
    loaded = []
    for f in paths:
        try:
            df = pd.read_csv(f)
            if set(df.columns) >= {'id','PredictionString'} and len(df)==len(test):
                ser = df.set_index('id')['PredictionString'].astype(str)
                loaded.append((f, ser.to_dict()))
                print('Loaded:', f)
            else:
                print('Skip (shape/cols):', f)
        except Exception as e:
            print('Skip:', f, '->', e)
    return loaded

# Build per-language candidate lists
cands_hi = load_cands(all_cands)
cands_ta = load_cands([f for f in ta_pref if f in all_cands])
if len(cands_ta) < 3:
    # Fallback: use all_cands if restriction too small
    cands_ta = cands_hi.copy()

def words(s):
    return [w for w in str(s).strip().split() if len(w)>0]

def jaccard(a, b):
    wa, wb = set(words(a)), set(words(b))
    if not wa and not wb: return 1.0
    if not wa or not wb: return 0.0
    inter = len(wa & wb); uni = len(wa | wb)
    return inter/uni if uni>0 else 0.0

# Fit per-language log-normal priors from train
train = pd.read_csv('train.csv')
train['answer_text'] = train['answer_text'].astype(str).str.strip()
train['char_len'] = train['answer_text'].str.len().clip(lower=1)
def fit_log_normal_params(df):
    x = np.log(df['char_len'].values.astype(float))
    mu = float(x.mean()); sd = float(x.std());
    if sd <= 1e-6: sd = 1e-6
    return mu, sd
priors = {lang: dict(zip(['mu','sigma'], fit_log_normal_params(g))) for lang, g in train.groupby('language')}

def len_prior_score(s, lang):
    L = max(1, len(str(s)))
    mu = priors.get(lang, {}).get('mu', 2.3)
    return -abs(math.log(L) - mu)  # higher is better (closer to mean)

def majority_override(texts, min_votes=3):
    from collections import Counter
    cnt = Counter(texts)
    text, votes = cnt.most_common(1)[0]
    return (text if votes >= min_votes else None), votes

# Per-language lambda for length regularization
lambda_hi = 0.02
lambda_ta = 0.03

t0 = time.time()
rows = []
for qid in id_list:
    lang = lang_map[qid]
    cands = cands_hi if lang=='hindi' else cands_ta
    texts = [d[qid] for _, d in cands]
    # Majority override
    maj_text, votes = majority_override(texts, min_votes=3)
    if maj_text is not None:
        rows.append((qid, maj_text))
        continue
    # Score by avg word-Jaccard + len prior
    best_idx, best_score = -1, -1e18
    best_len = 10**9
    lam = lambda_hi if lang=='hindi' else lambda_ta
    for i, ti in enumerate(texts):
        if len(texts) == 1:
            avg_j = 0.0
        else:
            s = 0.0; cnt = 0
            for j, tj in enumerate(texts):
                if i==j: continue
                s += jaccard(ti, tj); cnt += 1
            avg_j = s / max(1, cnt)
        lp = len_prior_score(ti, lang)
        score = avg_j + lam * lp
        # Tie-breaker within 0.01: prefer shorter string
        L = len(str(ti))
        if (score > best_score + 1e-12) or (abs(score - best_score) <= 0.01 and L < best_len):
            best_score = score; best_idx = i; best_len = L
    rows.append((qid, texts[best_idx]))

sub = pd.DataFrame(rows, columns=['id','PredictionString'])
empties = (sub['PredictionString'].astype(str).str.len()==0).sum()
mean_len = sub['PredictionString'].astype(str).str.len().mean()
print(f'Per-language consensus built. Empties={int(empties)}, mean_len={mean_len:.2f}, time={time.time()-t0:.2f}s')

out_path = 'submission_consensus_perlang_majority_lenhi002_lenta003.csv'
sub.to_csv(out_path, index=False)
pd.read_csv(out_path).to_csv('submission.csv', index=False)
print('submission.csv updated ->', out_path)

Loaded: submission_charfusion_512_lambda015.csv
Loaded: submission_384only_charfusion_lambda015.csv
Loaded: submission_primary_multistream_snap_hi85_10_5_ta97_3_0_lambda015.csv
Loaded: submission_third_multistream_drop_muril_snap_lambda010.csv
Loaded: submission_tokenselect_512single_or_384_lambda012.csv
Loaded: submission_3seed512_single_charfusion_snap_lambda012_K200_240_Lmax50_60_delta002.csv
Loaded: submission_charfusion_512_lambda015.csv
Loaded: submission_3seed512_single_charfusion_snap_lambda012_K200_240_Lmax50_60_delta002.csv
Loaded: submission_tokenselect_512single_or_384_lambda012.csv
Loaded: submission_third_multistream_drop_muril_snap_lambda010.csv


Per-language consensus built. Empties=0, mean_len=10.71, time=0.01s
submission.csv updated -> submission_consensus_perlang_majority_lenhi002_lenta003.csv


In [28]:
# Global stronger length-regularized consensus (lambda_len=0.05), same healthy candidates
import pandas as pd, numpy as np, math, time

test = pd.read_csv('test.csv')
id_list = test['id'].tolist()
lang_map = dict(zip(test['id'], test['language']))

cand_files = [
    'submission_charfusion_512_lambda015.csv',
    'submission_384only_charfusion_lambda015.csv',
    'submission_primary_multistream_snap_hi85_10_5_ta97_3_0_lambda015.csv',
    'submission_third_multistream_drop_muril_snap_lambda010.csv',
    'submission_tokenselect_512single_or_384_lambda012.csv',
    'submission_3seed512_single_charfusion_snap_lambda012_K200_240_Lmax50_60_delta002.csv',
]

cands = []
for f in cand_files:
    try:
        df = pd.read_csv(f)
        if set(df.columns) >= {'id','PredictionString'} and len(df)==len(test):
            cands.append((f, df.set_index('id')['PredictionString'].astype(str).to_dict()))
            print('Loaded candidate:', f)
        else:
            print('Skip (shape/cols mismatch):', f)
    except Exception as e:
        print('Skip (read error):', f, '->', e)
assert len(cands) >= 3, 'Need at least 3 candidates'

def words(s):
    return [w for w in str(s).strip().split() if len(w)>0]

def jaccard(a, b):
    wa, wb = set(words(a)), set(words(b))
    if not wa and not wb: return 1.0
    if not wa or not wb: return 0.0
    inter = len(wa & wb); uni = len(wa | wb)
    return inter/uni if uni>0 else 0.0

# Per-language log-normal prior (for length regularization)
train = pd.read_csv('train.csv')
train['answer_text'] = train['answer_text'].astype(str).str.strip()
train['char_len'] = train['answer_text'].str.len().clip(lower=1)
def fit_log_normal_params(df):
    x = np.log(df['char_len'].values.astype(float))
    mu = float(x.mean()); sd = float(x.std());
    if sd <= 1e-6: sd = 1e-6
    return mu, sd
priors = {lang: dict(zip(['mu','sigma'], fit_log_normal_params(g))) for lang, g in train.groupby('language')}

def len_prior_score(s, lang):
    L = max(1, len(str(s)))
    mu = priors.get(lang, {}).get('mu', 2.3)
    return -abs(math.log(L) - mu)  # higher is better when closer to lang mean

lambda_len = 0.05
rows = []
t0 = time.time()
for qid in id_list:
    lang = lang_map[qid]
    texts = [d[qid] for _, d in cands]
    best_idx, best_score, best_len = -1, -1e18, 10**9
    for i, ti in enumerate(texts):
        if len(texts) == 1:
            avg_j = 0.0
        else:
            s = 0.0; cnt = 0
            for j, tj in enumerate(texts):
                if i==j: continue
                s += jaccard(ti, tj); cnt += 1
            avg_j = s / max(1, cnt)
        lp = len_prior_score(ti, lang)
        score = avg_j + lambda_len * lp
        L = len(str(ti))
        if (score > best_score + 1e-12) or (abs(score - best_score) <= 0.01 and L < best_len):
            best_score = score; best_idx = i; best_len = L
    rows.append((qid, texts[best_idx]))

sub = pd.DataFrame(rows, columns=['id','PredictionString'])
empties = (sub['PredictionString'].astype(str).str.len()==0).sum()
mean_len = sub['PredictionString'].astype(str).str.len().mean()
print(f'Stronger len-reg consensus built from {len(cands)} in {time.time()-t0:.2f}s. Empties={int(empties)}, mean_len={mean_len:.2f}')

out_path = 'submission_consensus_lenreg005.csv'
sub.to_csv(out_path, index=False)
pd.read_csv(out_path).to_csv('submission.csv', index=False)
print('submission.csv updated ->', out_path)

Loaded candidate: submission_charfusion_512_lambda015.csv
Loaded candidate: submission_384only_charfusion_lambda015.csv
Loaded candidate: submission_primary_multistream_snap_hi85_10_5_ta97_3_0_lambda015.csv
Loaded candidate: submission_third_multistream_drop_muril_snap_lambda010.csv
Loaded candidate: submission_tokenselect_512single_or_384_lambda012.csv
Loaded candidate: submission_3seed512_single_charfusion_snap_lambda012_K200_240_Lmax50_60_delta002.csv


Stronger len-reg consensus built from 6 in 0.01s. Empties=0, mean_len=10.77
submission.csv updated -> submission_consensus_lenreg005.csv


In [29]:
# Coverage-normalized decoders (per expert) + per-id consensus over A/B/C
import numpy as np, pandas as pd, math, json, time, re, unicodedata as ud
from pathlib import Path

test = pd.read_csv('test.csv')
id_list = test['id'].tolist()
id2lang = dict(zip(test['id'], test['language']))
id2ctx = dict(zip(test['id'], test['context'].astype(str)))

train = pd.read_csv('train.csv')
train['answer_text'] = train['answer_text'].astype(str).str.strip()
train['char_len'] = train['answer_text'].str.len().clip(lower=1)
def fit_log_normal_params(df):
    x = np.log(df['char_len'].values.astype(float))
    mu = float(x.mean()); sd = float(x.std());
    if sd <= 1e-6: sd = 1e-6
    return mu, sd
priors = {lang: dict(zip(['mu','sigma'], fit_log_normal_params(g))) for lang, g in train.groupby('language')}

def to_pair(t):
    try:
        if t is None: return (0,0)
        if isinstance(t,(list,tuple,np.ndarray)) and len(t)>=2:
            a = int(t[0]) if t[0] is not None else 0
            b = int(t[1]) if t[1] is not None else 0
            return (a,b)
    except Exception:
        pass
    return (0,0)

def softmax(x):
    x = np.asarray(x, dtype=np.float64)
    m = np.max(x) if x.size else 0.0
    e = np.exp(x - m)
    s = e.sum()
    return e / s if s > 0 else np.zeros_like(x, dtype=np.float64)

ZW_CHARS = {'\u200B','\u200C','\u200D','\u2060','\ufeff'}
NBSP_SET = {'\u00A0','\u2002','\u2003','\u2004','\u2005','\u2006','\u2007','\u2008','\u2009','\u200A'}
HI_VIRAMA='\u094D'; TA_PULLI='\u0BCD'; DANDA='\u0964'
def clean_span_text(s: str, lang: str) -> str:
    if not s: return ''
    s = ''.join(ch for ch in s if ch not in ZW_CHARS)
    s = ''.join(' ' if ch in NBSP_SET else ch for ch in s)
    s = re.sub(r'\s+', ' ', s).strip()
    s = re.sub(r'(?<=\d)[\s,._-](?=\d)', '', s)
    if s:
        last = s[-1]
        if ud.category(last) == 'Mn' or last in (HI_VIRAMA, TA_PULLI): s = s[:-1]
    if lang == 'hindi':
        s = s.replace(DANDA+DANDA, DANDA)
        if s.endswith(DANDA): s = s[:-1].rstrip()
    return s

# Load 512 single-seed (alignment-safe) artifacts
def load_npz_logits(npz_path):
    arr = np.load(npz_path, allow_pickle=True)
    keys = list(arr.keys())
    if 'start' in keys and 'end' in keys: return arr['start'], arr['end']
    for sk in ['start_logits','test_start_logits','start_logits_avg']:
        for ek in ['end_logits','test_end_logits','end_logits_avg']:
            if sk in keys and ek in keys: return arr[sk], arr[ek]
    raise ValueError(f'Unknown keys in {npz_path}: {keys}')

s512, e512 = load_npz_logits('xlmr_large_512_test_avg.npz')
eid_512 = json.loads(Path('xlmr_large_512_test_logits/test_example_id.json').read_text())
off_512 = np.load('xlmr_large_512_test_logits/test_offset_mapping.npy', allow_pickle=True)
print('512 single-seed logits:', s512.shape, 'mapping', len(eid_512))

# Load 384 artifacts
s384, e384 = load_npz_logits('xlmr_large_test_avg.npz')
eid_384 = json.loads(Path('xlmr_large_test_logits/test_example_id.json').read_text())
off_384 = np.load('xlmr_large_test_logits/test_offset_mapping.npy', allow_pickle=True)
print('384 logits:', s384.shape, 'mapping', len(eid_384))

def build_index(eids):
    m = {}
    for i, q in enumerate(eids): m.setdefault(q, []).append(i)
    return m

idx512 = build_index(eid_512)
idx384 = build_index(eid_384)

def decode_charfusion_covnorm(start_logits, end_logits, eids, offs_all, idx_map, tag: str):
    rows = []
    t0 = time.time()
    for qid in id_list:
        ctx = id2ctx[qid]; lang = id2lang[qid]
        Lmax = 50 if lang=='hindi' else 60
        K = 200 if lang=='hindi' else 240
        S = np.zeros(len(ctx), dtype=np.float64)
        E = np.zeros(len(ctx), dtype=np.float64)
        CS = np.zeros(len(ctx), dtype=np.float64)  # coverage counters
        CE = np.zeros(len(ctx), dtype=np.float64)
        for fi in idx_map.get(qid, []):
            offs = offs_all[fi]
            M = len(offs) if hasattr(offs,'__len__') else start_logits.shape[1]
            ps = softmax(start_logits[fi][:M])
            pe = softmax(end_logits[fi][:M])
            for ti in range(M):
                a,b = to_pair(offs[ti])
                if b > a and 0 <= a < len(ctx) and 1 <= b <= len(ctx):
                    # distribute uniformly over covered chars
                    S[a:b] += ps[ti]
                    CS[a:b] += 1.0
                    E[a:b] += pe[ti]
                    CE[a:b] += 1.0
        # coverage-normalize
        CS = np.maximum(CS, 1.0); CE = np.maximum(CE, 1.0)
        S /= CS; E /= CE
        # decode with tight caps, no priors
        if len(S) == 0:
            rows.append((qid, ctx[:1] if len(ctx)>0 else '')); continue
        starts = np.argsort(S)[::-1][:K]
        best = (-1e18, 0, max(0, min(Lmax, len(ctx))-1))
        for si in starts:
            end_lo = si; end_hi = min(len(ctx)-1, si + Lmax - 1)
            if end_hi < end_lo: continue
            seg = E[end_lo:end_hi+1]
            ej = end_lo + int(np.argmax(seg))
            score = float(S[si] + E[ej])
            if score > best[0]: best = (score, si, ej)
        _, a, b = best
        text = clean_span_text(ctx[a:b+1], lang)
        if not text:
            b2 = min(len(ctx), a+1)
            text = clean_span_text(ctx[a:b2], lang) or ctx[a:b2]
        rows.append((qid, text))
    sub = pd.DataFrame(rows, columns=['id','PredictionString'])
    empties = (sub['PredictionString'].astype(str).str.len()==0).sum()
    mean_len = sub['PredictionString'].astype(str).str.len().mean()
    print(f'Cov-norm char-fusion {tag}: Empties={int(empties)}, mean_len={mean_len:.2f}, time={time.time()-t0:.2f}s')
    return sub

def decode_token_dp_512():
    rows = []
    t0 = time.time()
    for qid in id_list:
        ctx = id2ctx[qid]; lang = id2lang[qid]
        Lmax = 50 if lang=='hindi' else 60
        K = 200 if lang=='hindi' else 240
        best = (-1e18, '')
        for fi in idx512.get(qid, []):
            offs = off_512[fi]
            M = len(offs) if hasattr(offs,'__len__') else s512.shape[1]
            ps = softmax(s512[fi][:M]); pe = softmax(e512[fi][:M])
            # valid token indices
            valid = []
            for ti in range(M):
                a,b = to_pair(offs[ti])
                if b > a and 0 <= a < len(ctx) and 1 <= b <= len(ctx): valid.append(ti)
            if not valid: continue
            valid = np.array(valid, dtype=np.int32)
            s_top = valid[np.argsort(ps[valid])[::-1][:K]]
            for si in s_top:
                a0, _ = to_pair(offs[si])
                cand_e = []
                for ei in range(si, M):
                    a2,b2 = to_pair(offs[ei])
                    if not (b2 > a2 and 0 <= a2 < len(ctx) and 1 <= b2 <= len(ctx)): continue
                    clen = b2 - a0
                    if clen <= 0 or clen > Lmax: continue
                    cand_e.append(ei)
                if not cand_e: continue
                cand_e = np.array(cand_e, dtype=np.int32)
                # score = log ps + log pe
                score_vec = np.log(np.maximum(ps[si], 1e-15)) + np.log(np.maximum(pe[cand_e], 1e-15))
                ei = int(cand_e[int(np.argmax(score_vec))])
                a,b = to_pair(offs[si])[0], to_pair(offs[ei])[1]
                if not (0 <= a < len(ctx) and 1 <= b <= len(ctx) and b > a): continue
                text = clean_span_text(ctx[a:b], lang) or ctx[a:b]
                sc = float(np.max(score_vec))
                if sc > best[0]: best = (sc, text)
        if best[1] == '':
            best = (best[0], clean_span_text(ctx[:1], lang) or ctx[:1])
        rows.append((qid, best[1]))
    sub = pd.DataFrame(rows, columns=['id','PredictionString'])
    empties = (sub['PredictionString'].astype(str).str.len()==0).sum()
    mean_len = sub['PredictionString'].astype(str).str.len().mean()
    print(f'Token-DP 512: Empties={int(empties)}, mean_len={mean_len:.2f}, time={time.time()-t0:.2f}s')
    return sub

# Run three decoders
subA = decode_charfusion_covnorm(s512, e512, eid_512, off_512, idx512, tag='A(512)')
pathA = 'submission_covnorm_charfusion_512.csv'; subA.to_csv(pathA, index=False)
subB = decode_charfusion_covnorm(s384, e384, eid_384, off_384, idx384, tag='B(384)')
pathB = 'submission_covnorm_charfusion_384.csv'; subB.to_csv(pathB, index=False)
subC = decode_token_dp_512()
pathC = 'submission_tokendp_512.csv'; subC.to_csv(pathC, index=False)

# Per-id consensus over A/B/C: avg word-Jaccard; tie-break by length proximity to lang prior
def words(s):
    return [w for w in str(s).strip().split() if len(w)>0]
def jaccard(a, b):
    wa, wb = set(words(a)), set(words(b))
    if not wa and not wb: return 1.0
    if not wa or not wb: return 0.0
    inter = len(wa & wb); uni = len(wa | wb)
    return inter/uni if uni>0 else 0.0
def len_prior_dist(s, lang):
    L = max(1, len(str(s))); mu = priors.get(lang, {}).get('mu', 2.3)
    return abs(math.log(L) - mu)

mA = subA.set_index('id')['PredictionString'].to_dict()
mB = subB.set_index('id')['PredictionString'].to_dict()
mC = subC.set_index('id')['PredictionString'].to_dict()
rows = []
t0 = time.time()
for qid in id_list:
    lang = id2lang[qid]
    cand = [mA[qid], mB[qid], mC[qid]]
    best_i, best_s, best_d = 0, -1e18, 1e9
    for i, ti in enumerate(cand):
        s = 0.0; cnt = 0
        for j, tj in enumerate(cand):
            if i==j: continue
            s += jaccard(ti, tj); cnt += 1
        avg_j = s / max(1, cnt)
        d = len_prior_dist(ti, lang)
        # tie-break: prefer closer to length prior if within 0.01
        if (avg_j > best_s + 1e-12) or (abs(avg_j - best_s) <= 0.01 and d < best_d):
            best_s, best_i, best_d = avg_j, i, d
    rows.append((qid, cand[best_i]))
sub = pd.DataFrame(rows, columns=['id','PredictionString'])
empties = (sub['PredictionString'].astype(str).str.len()==0).sum()
mean_len = sub['PredictionString'].astype(str).str.len().mean()
print(f'Consensus A/B/C built. Empties={int(empties)}, mean_len={mean_len:.2f}')
out_path = 'submission_consensus_covnorm_tokendp.csv'
sub.to_csv(out_path, index=False)
pd.read_csv(out_path).to_csv('submission.csv', index=False)
print('submission.csv updated ->', out_path)

512 single-seed logits: (1401, 512) mapping 1401


384 logits: (1921, 384) mapping 1921


Cov-norm char-fusion A(512): Empties=0, mean_len=7.57, time=5.25s


Cov-norm char-fusion B(384): Empties=0, mean_len=7.62, time=5.39s


Token-DP 512: Empties=0, mean_len=10.71, time=50.22s
Consensus A/B/C built. Empties=0, mean_len=8.65
submission.csv updated -> submission_consensus_covnorm_tokendp.csv


In [30]:
# Coverage-normalized decoders with Lmax+4 hedge and new consensus
import numpy as np, pandas as pd, math, json, time, re, unicodedata as ud
from pathlib import Path

test = pd.read_csv('test.csv')
id_list = test['id'].tolist()
id2lang = dict(zip(test['id'], test['language']))
id2ctx = dict(zip(test['id'], test['context'].astype(str)))

train = pd.read_csv('train.csv')
train['answer_text'] = train['answer_text'].astype(str).str.strip()
train['char_len'] = train['answer_text'].str.len().clip(lower=1)
def fit_log_normal_params(df):
    x = np.log(df['char_len'].values.astype(float))
    mu = float(x.mean()); sd = float(x.std());
    if sd <= 1e-6: sd = 1e-6
    return mu, sd
priors = {lang: dict(zip(['mu','sigma'], fit_log_normal_params(g))) for lang, g in train.groupby('language')}

def to_pair(t):
    try:
        if t is None: return (0,0)
        if isinstance(t,(list,tuple,np.ndarray)) and len(t)>=2:
            a = int(t[0]) if t[0] is not None else 0
            b = int(t[1]) if t[1] is not None else 0
            return (a,b)
    except Exception:
        pass
    return (0,0)

def softmax(x):
    x = np.asarray(x, dtype=np.float64)
    m = np.max(x) if x.size else 0.0
    e = np.exp(x - m)
    s = e.sum()
    return e / s if s > 0 else np.zeros_like(x, dtype=np.float64)

ZW_CHARS = {'\u200B','\u200C','\u200D','\u2060','\ufeff'}
NBSP_SET = {'\u00A0','\u2002','\u2003','\u2004','\u2005','\u2006','\u2007','\u2008','\u2009','\u200A'}
HI_VIRAMA='\u094D'; TA_PULLI='\u0BCD'; DANDA='\u0964'
def clean_span_text(s: str, lang: str) -> str:
    if not s: return ''
    s = ''.join(ch for ch in s if ch not in ZW_CHARS)
    s = ''.join(' ' if ch in NBSP_SET else ch for ch in s)
    s = re.sub(r'\s+', ' ', s).strip()
    s = re.sub(r'(?<=\d)[\s,._-](?=\d)', '', s)
    if s:
        last = s[-1]
        if ud.category(last) == 'Mn' or last in (HI_VIRAMA, TA_PULLI): s = s[:-1]
    if lang == 'hindi':
        s = s.replace(DANDA+DANDA, DANDA)
        if s.endswith(DANDA): s = s[:-1].rstrip()
    return s

def load_npz_logits(npz_path):
    arr = np.load(npz_path, allow_pickle=True)
    keys = list(arr.keys())
    if 'start' in keys and 'end' in keys: return arr['start'], arr['end']
    for sk in ['start_logits','test_start_logits','start_logits_avg']:
        for ek in ['end_logits','test_end_logits','end_logits_avg']:
            if sk in keys and ek in keys: return arr[sk], arr[ek]
    raise ValueError(f'Unknown keys in {npz_path}: {keys}')

# Load 512 single-seed and 384 artifacts
s512, e512 = load_npz_logits('xlmr_large_512_test_avg.npz')
eid_512 = json.loads(Path('xlmr_large_512_test_logits/test_example_id.json').read_text())
off_512 = np.load('xlmr_large_512_test_logits/test_offset_mapping.npy', allow_pickle=True)
s384, e384 = load_npz_logits('xlmr_large_test_avg.npz')
eid_384 = json.loads(Path('xlmr_large_test_logits/test_example_id.json').read_text())
off_384 = np.load('xlmr_large_test_logits/test_offset_mapping.npy', allow_pickle=True)

def build_index(eids):
    m = {}
    for i, q in enumerate(eids): m.setdefault(q, []).append(i)
    return m
idx512 = build_index(eid_512)
idx384 = build_index(eid_384)

def decode_charfusion_covnorm_plus4(start_logits, end_logits, offs_all, idx_map, tag: str):
    rows = []; t0 = time.time()
    for qid in id_list:
        ctx = id2ctx[qid]; lang = id2lang[qid]
        Lmax = (50 if lang=='hindi' else 60) + 4
        K = (200 if lang=='hindi' else 240) + 20
        S = np.zeros(len(ctx), dtype=np.float64)
        E = np.zeros(len(ctx), dtype=np.float64)
        CS = np.zeros(len(ctx), dtype=np.float64)
        CE = np.zeros(len(ctx), dtype=np.float64)
        for fi in idx_map.get(qid, []):
            offs = offs_all[fi]
            M = len(offs) if hasattr(offs,'__len__') else start_logits.shape[1]
            ps = softmax(start_logits[fi][:M])
            pe = softmax(end_logits[fi][:M])
            for ti in range(M):
                a,b = to_pair(offs[ti])
                if b > a and 0 <= a < len(ctx) and 1 <= b <= len(ctx):
                    S[a:b] += ps[ti]; CS[a:b] += 1.0
                    E[a:b] += pe[ti]; CE[a:b] += 1.0
        CS = np.maximum(CS, 1.0); CE = np.maximum(CE, 1.0)
        S /= CS; E /= CE
        if len(S) == 0:
            rows.append((qid, ctx[:1] if len(ctx)>0 else '')); continue
        starts = np.argsort(S)[::-1][:K]
        best = (-1e18, 0, max(0, min(Lmax, len(ctx))-1))
        for si in starts:
            end_lo = si; end_hi = min(len(ctx)-1, si + Lmax - 1)
            if end_hi < end_lo: continue
            seg = E[end_lo:end_hi+1]
            ej = end_lo + int(np.argmax(seg))
            score = float(S[si] + E[ej])
            if score > best[0]: best = (score, si, ej)
        _, a, b = best
        text = clean_span_text(ctx[a:b+1], lang)
        if not text:
            b2 = min(len(ctx), a+1)
            text = clean_span_text(ctx[a:b2], lang) or ctx[a:b2]
        rows.append((qid, text))
    sub = pd.DataFrame(rows, columns=['id','PredictionString'])
    empties = (sub['PredictionString'].astype(str).str.len()==0).sum()
    mean_len = sub['PredictionString'].astype(str).str.len().mean()
    print(f'Cov-norm+4 {tag}: Empties={int(empties)}, mean_len={mean_len:.2f}, time={time.time()-t0:.2f}s')
    return sub

def decode_tokendp_512_plus4():
    rows = []; t0 = time.time()
    for qid in id_list:
        ctx = id2ctx[qid]; lang = id2lang[qid]
        Lmax = (50 if lang=='hindi' else 60) + 4
        K = (200 if lang=='hindi' else 240) + 20
        best = (-1e18, '')
        for fi in idx512.get(qid, []):
            offs = off_512[fi]
            M = len(offs) if hasattr(offs,'__len__') else s512.shape[1]
            ps = softmax(s512[fi][:M]); pe = softmax(e512[fi][:M])
            valid = []
            for ti in range(M):
                a,b = to_pair(offs[ti])
                if b > a and 0 <= a < len(ctx) and 1 <= b <= len(ctx): valid.append(ti)
            if not valid: continue
            valid = np.array(valid, dtype=np.int32)
            s_top = valid[np.argsort(ps[valid])[::-1][:K]]
            for si in s_top:
                a0, _ = to_pair(offs[si])
                cand_e = []
                for ei in range(si, M):
                    a2,b2 = to_pair(offs[ei])
                    if not (b2 > a2 and 0 <= a2 < len(ctx) and 1 <= b2 <= len(ctx)): continue
                    clen = b2 - a0
                    if clen <= 0 or clen > Lmax: continue
                    cand_e.append(ei)
                if not cand_e: continue
                cand_e = np.array(cand_e, dtype=np.int32)
                score_vec = np.log(np.maximum(ps[si], 1e-15)) + np.log(np.maximum(pe[cand_e], 1e-15))
                ei = int(cand_e[int(np.argmax(score_vec))])
                a,b = to_pair(offs[si])[0], to_pair(offs[ei])[1]
                if not (0 <= a < len(ctx) and 1 <= b <= len(ctx) and b > a): continue
                text = clean_span_text(ctx[a:b], lang) or ctx[a:b]
                sc = float(np.max(score_vec))
                if sc > best[0]: best = (sc, text)
        if best[1] == '':
            best = (best[0], clean_span_text(ctx[:1], lang) or ctx[:1])
        rows.append((qid, best[1]))
    sub = pd.DataFrame(rows, columns=['id','PredictionString'])
    empties = (sub['PredictionString'].astype(str).str.len()==0).sum()
    mean_len = sub['PredictionString'].astype(str).str.len().mean()
    print(f'Token-DP+4 512: Empties={int(empties)}, mean_len={mean_len:.2f}, time={time.time()-t0:.2f}s')
    return sub

# Run decoders with Lmax+4
subA = decode_charfusion_covnorm_plus4(s512, e512, off_512, idx512, tag='A(512)')
pathA = 'submission_covnorm_charfusion_512_LmaxPlus4.csv'; subA.to_csv(pathA, index=False)
subB = decode_charfusion_covnorm_plus4(s384, e384, off_384, idx384, tag='B(384)')
pathB = 'submission_covnorm_charfusion_384_LmaxPlus4.csv'; subB.to_csv(pathB, index=False)
subC = decode_tokendp_512_plus4()
pathC = 'submission_tokendp_512_LmaxPlus4.csv'; subC.to_csv(pathC, index=False)

# Consensus A/B/C (avg word Jaccard, tie-break by length prior proximity)
def words(s):
    return [w for w in str(s).strip().split() if len(w)>0]
def jaccard(a, b):
    wa, wb = set(words(a)), set(words(b))
    if not wa and not wb: return 1.0
    if not wa or not wb: return 0.0
    inter = len(wa & wb); uni = len(wa | wb)
    return inter/uni if uni>0 else 0.0
def len_prior_dist(s, lang):
    L = max(1, len(str(s))); mu = priors.get(lang, {}).get('mu', 2.3)
    return abs(math.log(L) - mu)

mA = subA.set_index('id')['PredictionString'].to_dict()
mB = subB.set_index('id')['PredictionString'].to_dict()
mC = subC.set_index('id')['PredictionString'].to_dict()
rows = []
for qid in id_list:
    lang = id2lang[qid]
    cand = [mA[qid], mB[qid], mC[qid]]
    best_i, best_s, best_d = 0, -1e18, 1e9
    for i, ti in enumerate(cand):
        s = 0.0; cnt = 0
        for j, tj in enumerate(cand):
            if i==j: continue
            s += jaccard(ti, tj); cnt += 1
        avg_j = s / max(1, cnt)
        d = len_prior_dist(ti, lang)
        if (avg_j > best_s + 1e-12) or (abs(avg_j - best_s) <= 0.01 and d < best_d):
            best_s, best_i, best_d = avg_j, i, d
    rows.append((qid, cand[best_i]))
sub = pd.DataFrame(rows, columns=['id','PredictionString'])
empties = (sub['PredictionString'].astype(str).str.len()==0).sum()
mean_len = sub['PredictionString'].astype(str).str.len().mean()
print(f'Consensus A/B/C (Lmax+4) built. Empties={int(empties)}, mean_len={mean_len:.2f}')
out_path = 'submission_consensus_covnorm_tokendp_LmaxPlus4.csv'
sub.to_csv(out_path, index=False)
pd.read_csv(out_path).to_csv('submission.csv', index=False)
print('submission.csv updated ->', out_path)

Cov-norm+4 A(512): Empties=0, mean_len=7.57, time=5.12s


Cov-norm+4 B(384): Empties=0, mean_len=7.62, time=5.40s


Token-DP+4 512: Empties=0, mean_len=10.71, time=55.48s
Consensus A/B/C (Lmax+4) built. Empties=0, mean_len=8.65
submission.csv updated -> submission_consensus_covnorm_tokendp_LmaxPlus4.csv


In [33]:
# Meta-consensus per expert: per-language candidate pools, majority override, mean Jaccard + len prior
import pandas as pd, numpy as np, math, time

test = pd.read_csv('test.csv')
id_list = test['id'].tolist()
lang_map = dict(zip(test['id'], test['language']))

# Candidates (healthy only). Hindi uses all 6; Tamil restrict to 4 (no 384-only).
cand_hi_files = [
    'submission_3seed512_single_charfusion_snap_lambda012_K200_240_Lmax50_60_delta002.csv',
    'submission_charfusion_512_lambda015.csv',
    'submission_tokenselect_512single_or_384_lambda012.csv',
    'submission_384only_charfusion_lambda015.csv',
    'submission_primary_multistream_snap_hi85_10_5_ta97_3_0_lambda015.csv',
    'submission_third_multistream_drop_muril_snap_lambda010.csv',
]
cand_ta_files = [
    'submission_3seed512_single_charfusion_snap_lambda012_K200_240_Lmax50_60_delta002.csv',
    'submission_charfusion_512_lambda015.csv',
    'submission_tokenselect_512single_or_384_lambda012.csv',
    'submission_third_multistream_drop_muril_snap_lambda010.csv',
]
# Optional: include token-DP 512 for Hindi only if present
optional_hi = ['submission_tokendp_512.csv']

def load_cands(paths):
    out = []
    for f in paths:
        try:
            df = pd.read_csv(f)
            if set(df.columns) >= {'id','PredictionString'} and len(df)==len(test):
                out.append((f, df.set_index('id')['PredictionString'].astype(str).to_dict()))
                print('Loaded:', f)
            else:
                print('Skip (shape/cols):', f)
        except Exception as e:
            print('Skip:', f, '->', e)
    return out

cands_hi = load_cands(cand_hi_files + [f for f in optional_hi if pd.Series([f]).apply(lambda x: os.path.exists(x) if 'os' in globals() else __import__('os').path.exists(x)).iloc[0]])
cands_ta = load_cands(cand_ta_files)
if len(cands_hi) < 4: cands_hi = load_cands(cand_hi_files)  # fallback if optional missing

def words(s):
    return [w for w in str(s).strip().split() if len(w)>0]
def jaccard(a, b):
    wa, wb = set(words(a)), set(words(b))
    if not wa and not wb: return 1.0
    if not wa or not wb: return 0.0
    inter = len(wa & wb); uni = len(wa | wb)
    return inter/uni if uni>0 else 0.0

# Per-language log-normal priors
train = pd.read_csv('train.csv')
train['answer_text'] = train['answer_text'].astype(str).str.strip()
train['char_len'] = train['answer_text'].str.len().clip(lower=1)
def fit_log_normal_params(df):
    x = np.log(df['char_len'].values.astype(float))
    mu = float(x.mean()); sd = float(x.std());
    if sd <= 1e-6: sd = 1e-6
    return mu, sd
priors = {lang: dict(zip(['mu','sigma'], fit_log_normal_params(g))) for lang, g in train.groupby('language')}

def len_prior_dist(s, lang):
    L = max(1, len(str(s))); mu = priors.get(lang, {}).get('mu', 2.3)
    return abs(math.log(L) - mu)

from collections import Counter
def majority_override(texts, min_votes=3):
    cnt = Counter(texts)
    text, votes = cnt.most_common(1)[0]
    return (text if votes >= min_votes else None), votes

# Length regularization weights
lambda_hi = 0.02
lambda_ta = 0.03

rows = []
t0 = time.time()
for qid in id_list:
    lang = lang_map[qid]
    pool = cands_hi if lang=='hindi' else cands_ta
    texts = [d[qid] for _, d in pool]
    # Majority override first
    maj, votes = majority_override(texts, min_votes=3)
    if maj is not None:
        rows.append((qid, maj));
        continue
    # Score by mean Jaccard; add small len prior; tie-break by closer to prior, then shorter within 0.01
    best_i, best_s, best_d, best_L = -1, -1e18, 1e9, 10**9
    lam = lambda_hi if lang=='hindi' else lambda_ta
    for i, ti in enumerate(texts):
        s = 0.0; cnt = 0
        for j, tj in enumerate(texts):
            if i==j: continue
            s += jaccard(ti, tj); cnt += 1
        avg_j = s / max(1, cnt)
        dlen = len_prior_dist(ti, lang)
        score = avg_j - lam * dlen  # equivalent to + lam * (-|logL-mu|)
        L = len(str(ti))
        if (score > best_s + 1e-12) or (abs(score - best_s) <= 0.01 and (dlen < best_d or (abs(dlen-best_d) <= 1e-6 and L < best_L))):
            best_s, best_i, best_d, best_L = score, i, dlen, L
    rows.append((qid, texts[best_i]))

sub = pd.DataFrame(rows, columns=['id','PredictionString'])
empties = (sub['PredictionString'].astype(str).str.len()==0).sum()
mean_len = sub['PredictionString'].astype(str).str.len().mean()
print(f'Meta-consensus built. Empties={int(empties)}, mean_len={mean_len:.2f}, time={time.time()-t0:.2f}s')

out_path = 'submission_meta_consensus_perlang_majority_lenhi002_lenta003_plusjacc.csv'
sub.to_csv(out_path, index=False)
pd.read_csv(out_path).to_csv('submission.csv', index=False)
print('submission.csv updated ->', out_path)

Loaded: submission_3seed512_single_charfusion_snap_lambda012_K200_240_Lmax50_60_delta002.csv
Loaded: submission_charfusion_512_lambda015.csv
Loaded: submission_tokenselect_512single_or_384_lambda012.csv
Loaded: submission_384only_charfusion_lambda015.csv
Loaded: submission_primary_multistream_snap_hi85_10_5_ta97_3_0_lambda015.csv
Loaded: submission_third_multistream_drop_muril_snap_lambda010.csv
Loaded: submission_tokendp_512.csv
Loaded: submission_3seed512_single_charfusion_snap_lambda012_K200_240_Lmax50_60_delta002.csv
Loaded: submission_charfusion_512_lambda015.csv
Loaded: submission_tokenselect_512single_or_384_lambda012.csv
Loaded: submission_third_multistream_drop_muril_snap_lambda010.csv


Meta-consensus built. Empties=0, mean_len=10.95, time=0.00s
submission.csv updated -> submission_meta_consensus_perlang_majority_lenhi002_lenta003_plusjacc.csv


In [45]:
# Improved 512 single-seed token-DP (K=280, Lmax=54/64, ends shortlist=10); overwrite submission_tokendp_512.csv
import numpy as np, pandas as pd, json, math, time, re, unicodedata as ud
from pathlib import Path

test = pd.read_csv('test.csv')
id_list = test['id'].tolist()
id2lang = dict(zip(test['id'], test['language']))
id2ctx = dict(zip(test['id'], test['context'].astype(str)))

train = pd.read_csv('train.csv')
train['answer_text'] = train['answer_text'].astype(str).str.strip()
train['char_len'] = train['answer_text'].str.len().clip(lower=1)
def fit_log_normal_params(df):
    x = np.log(df['char_len'].values.astype(float))
    mu = float(x.mean()); sd = float(x.std());
    if sd <= 1e-6: sd = 1e-6
    return mu, sd
priors = {lang: dict(zip(['mu','sigma'], fit_log_normal_params(g))) for lang, g in train.groupby('language')}

def to_pair(t):
    try:
        if t is None: return (0,0)
        if isinstance(t,(list,tuple,np.ndarray)) and len(t)>=2:
            a = int(t[0]) if t[0] is not None else 0
            b = int(t[1]) if t[1] is not None else 0
            return (a,b)
    except Exception:
        pass
    return (0,0)

def softmax(x):
    x = np.asarray(x, dtype=np.float64)
    m = np.max(x) if x.size else 0.0
    e = np.exp(x - m)
    s = e.sum()
    return e / s if s > 0 else np.zeros_like(x, dtype=np.float64)

ZW_CHARS = {'\u200B','\u200C','\u200D','\u2060','\ufeff'}
NBSP_SET = {'\u00A0','\u2002','\u2003','\u2004','\u2005','\u2006','\u2007','\u2008','\u2009','\u200A'}
HI_VIRAMA='\u094D'; TA_PULLI='\u0BCD'; DANDA='\u0964'
def clean_span_text(s: str, lang: str) -> str:
    if not s: return ''
    s = ''.join(ch for ch in s if ch not in ZW_CHARS)
    s = ''.join(' ' if ch in NBSP_SET else ch for ch in s)
    s = re.sub(r'\s+', ' ', s).strip()
    s = re.sub(r'(?<=\d)[\s,._-](?=\d)', '', s)
    if s:
        last = s[-1]
        if ud.category(last) == 'Mn' or last in (HI_VIRAMA, TA_PULLI): s = s[:-1]
    if lang == 'hindi':
        s = s.replace(DANDA+DANDA, DANDA)
        if s.endswith(DANDA): s = s[:-1].rstrip()
    return s

def log_normal_logpdf_len(char_len: int, mu: float, sigma: float) -> float:
    Lc = max(1, int(char_len))
    x = math.log(Lc)
    return -0.5*((x-mu)/sigma)**2 - math.log(max(Lc,1e-6)) - math.log(sigma) - 0.5*math.log(2*math.pi)

def load_npz_logits(npz_path):
    arr = np.load(npz_path, allow_pickle=True)
    keys = list(arr.keys())
    if 'start' in keys and 'end' in keys: return arr['start'], arr['end']
    for sk in ['start_logits','test_start_logits','start_logits_avg']:
        for ek in ['end_logits','test_end_logits','end_logits_avg']:
            if sk in keys and ek in keys: return arr[sk], arr[ek]
    raise ValueError(f'Unknown keys in {npz_path}: {keys}')

# Load 512 single-seed (alignment-safe)
s512, e512 = load_npz_logits('xlmr_large_512_test_avg.npz')
eid_512 = json.loads(Path('xlmr_large_512_test_logits/test_example_id.json').read_text())
off_512 = np.load('xlmr_large_512_test_logits/test_offset_mapping.npy', allow_pickle=True)
print('512 single-seed logits:', s512.shape, 'mapping', len(eid_512))

def build_index(eids):
    m = {}
    for i, q in enumerate(eids): m.setdefault(q, []).append(i)
    return m
idx512 = build_index(eid_512)

def decode_tokendp_512_fixed():
    rows = []; t0 = time.time()
    # Expert tweak: ends shortlist 12 (was 10); K bumped by +20 (hi 260, ta 300); keep Lmax 54/64 and lambda_len 0.12
    K_hi, K_ta = 260, 300
    Lmax_hi, Lmax_ta = 54, 64
    shortlist = 12
    lam = 0.12; clip_prior = (-0.8, 0.0)
    for qid in id_list:
        ctx = id2ctx[qid]; lang = id2lang[qid]
        mu = priors[lang]['mu']; sigma = priors[lang]['sigma']
        Lmax = Lmax_hi if lang=='hindi' else Lmax_ta
        K = K_hi if lang=='hindi' else K_ta
        best = (-1e18, '')
        for fi in idx512.get(qid, []):
            offs = off_512[fi]
            M = len(offs) if hasattr(offs,'__len__') else s512.shape[1]
            ps = softmax(s512[fi][:M]); pe = softmax(e512[fi][:M])
            valid = []
            for ti in range(M):
                a,b = to_pair(offs[ti])
                if b > a and 0 <= a < len(ctx) and 1 <= b <= len(ctx): valid.append(ti)
            if not valid: continue
            valid = np.array(valid, dtype=np.int32)
            s_top = valid[np.argsort(ps[valid])[::-1][:K]]
            for si in s_top:
                a0, _ = to_pair(offs[si])
                cand_e = []
                for ei in range(si, M):
                    a2,b2 = to_pair(offs[ei])
                    if not (b2 > a2 and 0 <= a2 < len(ctx) and 1 <= b2 <= len(ctx)): continue
                    clen = b2 - a0
                    if clen <= 0 or clen > Lmax: continue
                    cand_e.append(ei)
                if not cand_e: continue
                cand_e = np.array(cand_e, dtype=np.int32)
                seg = pe[cand_e]
                top_e = cand_e[np.argsort(seg)[::-1][:shortlist]]
                for ei in top_e:
                    a,b = to_pair(offs[si])[0], to_pair(offs[ei])[1]
                    if not (0 <= a < len(ctx) and 1 <= b <= len(ctx) and b > a): continue
                    raw = float(np.log(max(ps[si],1e-15)) + np.log(max(pe[ei],1e-15)))
                    clen = b - a
                    lp = log_normal_logpdf_len(clen, mu, sigma)
                    lp = max(clip_prior[0], min(clip_prior[1], lp))
                    raw += lam * lp
                    if raw > best[0]:
                        best = (raw, clean_span_text(ctx[a:b], lang) or ctx[a:b])
        if best[1] == '':
            best = (best[0], clean_span_text(ctx[:1], lang) or ctx[:1])
        rows.append((qid, best[1]))
    sub = pd.DataFrame(rows, columns=['id','PredictionString'])
    empties = (sub['PredictionString'].astype(str).str.len()==0).sum()
    mean_len = sub['PredictionString'].astype(str).str.len().mean()
    print(f'Token-DP 512 fixed: Empties={int(empties)}, mean_len={mean_len:.2f}, time={time.time()-t0:.2f}s')
    return sub

sub_fix = decode_tokendp_512_fixed()
out_path = 'submission_tokendp_512.csv'  # overwrite existing to be picked by meta-consensus
sub_fix.to_csv(out_path, index=False)
pd.read_csv(out_path).to_csv('submission.csv', index=False)
print('submission.csv updated ->', out_path)

512 single-seed logits: (1401, 512) mapping 1401


Token-DP 512 fixed: Empties=0, mean_len=10.71, time=77.24s
submission.csv updated -> submission_tokendp_512.csv


In [34]:
# Meta-consensus with normalized scoring (punct/quote-insensitive) and per-language pools
import pandas as pd, numpy as np, math, time, re

test = pd.read_csv('test.csv')
id_list = test['id'].tolist()
lang_map = dict(zip(test['id'], test['language']))

# Candidate pools (same as expert meta-consensus):
cand_hi_files = [
    'submission_3seed512_single_charfusion_snap_lambda012_K200_240_Lmax50_60_delta002.csv',
    'submission_charfusion_512_lambda015.csv',
    'submission_tokenselect_512single_or_384_lambda012.csv',
    'submission_384only_charfusion_lambda015.csv',
    'submission_primary_multistream_snap_hi85_10_5_ta97_3_0_lambda015.csv',
    'submission_third_multistream_drop_muril_snap_lambda010.csv',
]
cand_ta_files = [
    'submission_3seed512_single_charfusion_snap_lambda012_K200_240_Lmax50_60_delta002.csv',
    'submission_charfusion_512_lambda015.csv',
    'submission_tokenselect_512single_or_384_lambda012.csv',
    'submission_third_multistream_drop_muril_snap_lambda010.csv',
]
optional_hi = ['submission_tokendp_512.csv']

def load_cands(paths):
    out = []
    for f in paths:
        try:
            df = pd.read_csv(f)
            if set(df.columns) >= {'id','PredictionString'} and len(df)==len(test):
                out.append((f, df.set_index('id')['PredictionString'].astype(str).to_dict()))
                print('Loaded:', f)
            else:
                print('Skip (shape/cols):', f)
        except Exception as e:
            print('Skip:', f, '->', e)
    return out

import os
cands_hi = load_cands(cand_hi_files + [f for f in optional_hi if os.path.exists(f)])
cands_ta = load_cands(cand_ta_files)
if len(cands_hi) < 4: cands_hi = load_cands(cand_hi_files)

# Normalization for scoring only (do not change final output text)
PUNCT = set(list('.,!?;:"\'()[]{}“”‘’«»'))
def norm_text(s: str) -> str:
    s = str(s).strip()
    # trim surrounding quotes/brackets
    while len(s) >= 2 and ((s[0], s[-1]) in [('"','"'),("'","'"),('“','”'),('‘','’'),('(',')'),('[',']'),('{','}'),('«','»')]):
        s = s[1:-1].strip()
    # collapse spaces
    s = re.sub(r'\s+', ' ', s)
    # strip leading/trailing punctuation
    s = s.strip(''.join(PUNCT))
    return s.strip()

def words_norm(s):
    s = norm_text(s)
    # split on whitespace; strip punctuation from tokens
    toks = []
    for w in s.split():
        w = w.strip(''.join(PUNCT))
        if w: toks.append(w)
    return toks

def jaccard_norm(a, b):
    wa, wb = set(words_norm(a)), set(words_norm(b))
    if not wa and not wb: return 1.0
    if not wa or not wb: return 0.0
    inter = len(wa & wb); uni = len(wa | wb)
    return inter/uni if uni>0 else 0.0

# Per-language log-normal priors (for small length regularization in log-space)
train = pd.read_csv('train.csv')
train['answer_text'] = train['answer_text'].astype(str).str.strip()
train['char_len'] = train['answer_text'].str.len().clip(lower=1)
def fit_log_normal_params(df):
    x = np.log(df['char_len'].values.astype(float))
    mu = float(x.mean()); sd = float(x.std());
    if sd <= 1e-6: sd = 1e-6
    return mu, sd
priors = {lang: dict(zip(['mu','sigma'], fit_log_normal_params(g))) for lang, g in train.groupby('language')}

def len_prior_dist(s, lang):
    L = max(1, len(str(s))); mu = priors.get(lang, {}).get('mu', 2.3)
    return abs(math.log(L) - mu)

from collections import Counter
def majority_override_norm(texts, min_votes=3):
    # vote on normalized form
    norm_map = [norm_text(t) for t in texts]
    cnt = Counter(norm_map)
    top_text, votes = cnt.most_common(1)[0]
    if votes >= min_votes:
        # return the first candidate whose normalized form matches the winner
        for orig, nt in zip(texts, norm_map):
            if nt == top_text:
                return orig, votes
    return None, votes

# Length regularization weights
lambda_hi = 0.02
lambda_ta = 0.03

rows = []
t0 = time.time()
for qid in id_list:
    lang = lang_map[qid]
    pool = cands_hi if lang=='hindi' else cands_ta
    texts = [d[qid] for _, d in pool]
    # Majority override on normalized strings
    maj, votes = majority_override_norm(texts, min_votes=3)
    if maj is not None:
        rows.append((qid, maj))
        continue
    # Score by mean normalized Jaccard; add small per-language len prior; tie-break by closer to prior, then shorter within 0.01
    best_i, best_s, best_d, best_L = -1, -1e18, 1e9, 10**9
    lam = lambda_hi if lang=='hindi' else lambda_ta
    for i, ti in enumerate(texts):
        s = 0.0; cnt = 0
        for j, tj in enumerate(texts):
            if i==j: continue
            s += jaccard_norm(ti, tj); cnt += 1
        avg_j = s / max(1, cnt)
        dlen = len_prior_dist(ti, lang)
        score = avg_j - lam * dlen
        L = len(str(ti))
        if (score > best_s + 1e-12) or (abs(score - best_s) <= 0.01 and (dlen < best_d or (abs(dlen-best_d) <= 1e-6 and L < best_L))):
            best_s, best_i, best_d, best_L = score, i, dlen, L
    rows.append((qid, texts[best_i]))

sub = pd.DataFrame(rows, columns=['id','PredictionString'])
empties = (sub['PredictionString'].astype(str).str.len()==0).sum()
mean_len = sub['PredictionString'].astype(str).str.len().mean()
print(f'Meta-consensus (normalized scoring) built. Empties={int(empties)}, mean_len={mean_len:.2f}, time={time.time()-t0:.2f}s')

out_path = 'submission_meta_consensus_normscore_perlang_majority_lenhi002_lenta003.csv'
sub.to_csv(out_path, index=False)
pd.read_csv(out_path).to_csv('submission.csv', index=False)
print('submission.csv updated ->', out_path)

Loaded: submission_3seed512_single_charfusion_snap_lambda012_K200_240_Lmax50_60_delta002.csv
Loaded: submission_charfusion_512_lambda015.csv
Loaded: submission_tokenselect_512single_or_384_lambda012.csv
Loaded: submission_384only_charfusion_lambda015.csv
Loaded: submission_primary_multistream_snap_hi85_10_5_ta97_3_0_lambda015.csv
Loaded: submission_third_multistream_drop_muril_snap_lambda010.csv
Loaded: submission_tokendp_512.csv
Loaded: submission_3seed512_single_charfusion_snap_lambda012_K200_240_Lmax50_60_delta002.csv
Loaded: submission_charfusion_512_lambda015.csv
Loaded: submission_tokenselect_512single_or_384_lambda012.csv
Loaded: submission_third_multistream_drop_muril_snap_lambda010.csv


Meta-consensus (normalized scoring) built. Empties=0, mean_len=10.96, time=0.01s
submission.csv updated -> submission_meta_consensus_normscore_perlang_majority_lenhi002_lenta003.csv


In [56]:
# Switch submission.csv to 512 single-seed token-DP variant and print diagnostics
import pandas as pd, os, time
path = 'submission_tokendp_512.csv'
assert os.path.exists(path), f'Missing {path}'
sub = pd.read_csv(path)
empties = (sub['PredictionString'].astype(str).str.len()==0).sum()
mean_len = sub['PredictionString'].astype(str).str.len().mean()
sub.to_csv('submission.csv', index=False)
print('submission.csv updated ->', path)
print('Diagnostics: empties=', int(empties), 'mean_len=', round(float(mean_len), 2))
print('mtime(submission.csv)=', time.ctime(os.path.getmtime('submission.csv')))

submission.csv updated -> submission_tokendp_512.csv
Diagnostics: empties= 0 mean_len= 10.71
mtime(submission.csv)= Thu Sep 25 12:42:02 2025


In [39]:
# Alignment-safe 512 single-seed char-fusion + meta-consensus over alignment-safe candidates
import numpy as np, pandas as pd, json, time, math, re, unicodedata as ud
from pathlib import Path

test = pd.read_csv('test.csv')
id_list = test['id'].tolist()
id2lang = dict(zip(test['id'], test['language']))
id2ctx = dict(zip(test['id'], test['context'].astype(str)))

train = pd.read_csv('train.csv')
train['answer_text'] = train['answer_text'].astype(str).str.strip()
train['char_len'] = train['answer_text'].str.len().clip(lower=1)
def fit_log_normal_params(df):
    x = np.log(df['char_len'].values.astype(float))
    mu = float(x.mean()); sd = float(x.std());
    if sd <= 1e-6: sd = 1e-6
    return mu, sd
priors = {lang: dict(zip(['mu','sigma'], fit_log_normal_params(g))) for lang, g in train.groupby('language')}

ZW_CHARS = {'\u200B','\u200C','\u200D','\u2060','\ufeff'}
NBSP_SET = {'\u00A0','\u2002','\u2003','\u2004','\u2005','\u2006','\u2007','\u2008','\u2009','\u200A'}
HI_VIRAMA='\u094D'; TA_PULLI='\u0BCD'; DANDA='\u0964'
def clean_span_text(s: str, lang: str) -> str:
    if not s: return ''
    s = ''.join(ch for ch in s if ch not in ZW_CHARS)
    s = ''.join(' ' if ch in NBSP_SET else ch for ch in s)
    s = re.sub(r'\s+', ' ', s).strip()
    s = re.sub(r'(?<=\d)[\s,._-](?=\d)', '', s)
    if s:
        last = s[-1]
        if ud.category(last) == 'Mn' or last in (HI_VIRAMA, TA_PULLI): s = s[:-1]
    if lang == 'hindi':
        s = s.replace(DANDA+DANDA, DANDA)
        if s.endswith(DANDA): s = s[:-1].rstrip()
    return s

def to_pair(t):
    try:
        if t is None: return (0,0)
        if isinstance(t,(list,tuple,np.ndarray)) and len(t)>=2:
            a = int(t[0]) if t[0] is not None else 0
            b = int(t[1]) if t[1] is not None else 0
            return (a,b)
    except Exception:
        pass
    return (0,0)

def load_npz_logits(npz_path):
    arr = np.load(npz_path, allow_pickle=True)
    keys = list(arr.keys())
    if 'start' in keys and 'end' in keys: return arr['start'], arr['end']
    for sk in ['start_logits','test_start_logits','start_logits_avg']:
        for ek in ['end_logits','test_end_logits','end_logits_avg']:
            if sk in keys and ek in keys: return arr[sk], arr[ek]
    raise ValueError(f'Unknown keys in {npz_path}: {keys}')

# Load alignment-safe 512 single-seed logits + mapping
s512, e512 = load_npz_logits('xlmr_large_512_test_avg.npz')
eid_512 = json.loads(Path('xlmr_large_512_test_logits/test_example_id.json').read_text())
off_512 = np.load('xlmr_large_512_test_logits/test_offset_mapping.npy', allow_pickle=True)
print('512 single-seed:', s512.shape, 'mapping', len(eid_512))

def build_index(eids):
    m = {}
    for i, q in enumerate(eids): m.setdefault(q, []).append(i)
    return m
idx512 = build_index(eid_512)

def maxpool1d(x):
    if len(x) == 0: return x
    y = x.copy()
    if len(x) == 1: return y
    y[0] = max(x[0], x[1])
    for i in range(1, len(x)-1): y[i] = max(x[i-1], x[i], x[i+1])
    y[-1] = max(x[-2], x[-1])
    return y

def log_normal_logpdf_len(char_len: int, mu: float, sigma: float) -> float:
    Lc = max(1, int(char_len))
    x = math.log(Lc)
    return -0.5*((x-mu)/sigma)**2 - math.log(max(Lc,1e-6)) - math.log(sigma) - 0.5*math.log(2*math.pi)

def decode_charfusion_512_single(lambda_len=0.12, clip_prior=(-0.8,0.0), K_hi=200, K_ta=240, Lmax_hi=50, Lmax_ta=60):
    rows = []; t0 = time.time()
    for qid in id_list:
        lang = id2lang[qid]; ctx = id2ctx[qid]
        mu = priors[lang]['mu']; sigma = priors[lang]['sigma']
        Lmax = Lmax_hi if lang=='hindi' else Lmax_ta
        K = K_hi if lang=='hindi' else K_ta
        S = np.zeros(len(ctx), dtype=np.float32)
        E = np.zeros(len(ctx), dtype=np.float32)
        for fi in idx512.get(qid, []):
            offs_raw = off_512[fi]
            M = len(offs_raw) if hasattr(offs_raw,'__len__') else s512.shape[1]
            s_log = s512[fi][:M]; e_log = e512[fi][:M]
            for ti in range(M):
                a,b = to_pair(offs_raw[ti])
                if b > a and 0 <= a < len(ctx) and 1 <= b <= len(ctx):
                    S[a] += s_log[ti]
                    E[b-1] += e_log[ti]
        S = maxpool1d(S); E = maxpool1d(E)
        if len(S) == 0:
            rows.append((qid, ctx[:1] if len(ctx)>0 else '')); continue
        starts = np.argsort(S)[::-1][:K]
        best_score = -1e18; best_span = (0, max(0, min(Lmax, len(ctx))-1))
        for si in starts:
            end_lo = si; end_hi = min(len(ctx)-1, si + Lmax - 1)
            if end_hi < end_lo: continue
            seg = E[end_lo:end_hi+1]
            ej = end_lo + int(np.argmax(seg))
            raw = float(S[si] + E[ej])
            clen = ej - si + 1
            lp = log_normal_logpdf_len(clen, mu, sigma)
            lp = max(clip_prior[0], min(clip_prior[1], lp))
            raw += lambda_len * lp
            if raw > best_score:
                best_score = raw; best_span = (si, ej)
        a,b = best_span
        text = clean_span_text(ctx[a:b+1], lang)
        if not text:
            b2 = min(len(ctx), a+1)
            text = clean_span_text(ctx[a:b2], lang) or ctx[a:b2]
        rows.append((qid, text))
    sub = pd.DataFrame(rows, columns=['id','PredictionString'])
    empties = (sub['PredictionString'].astype(str).str.len()==0).sum()
    mean_len = sub['PredictionString'].astype(str).str.len().mean()
    print(f'512 single-seed char-fusion: Empties={int(empties)}, mean_len={mean_len:.2f}, time={time.time()-t0:.2f}s')
    return sub

# Build 512 single-seed char-fusion submission
sub_512cf = decode_charfusion_512_single(lambda_len=0.12, K_hi=200, K_ta=240, Lmax_hi=50, Lmax_ta=60)
path_512cf = 'submission_charfusion_512_single_seed_alignsafe.csv'
sub_512cf.to_csv(path_512cf, index=False)

# Meta-consensus over alignment-safe candidates only
def words(s):
    return [w for w in str(s).strip().split() if len(w)>0]
def jaccard(a, b):
    wa, wb = set(words(a)), set(words(b))
    if not wa and not wb: return 1.0
    if not wa or not wb: return 0.0
    inter = len(wa & wb); uni = len(wa | wb)
    return inter/uni if uni>0 else 0.0
def len_prior_dist(s, lang):
    L = max(1, len(str(s))); mu = priors.get(lang, {}).get('mu', 2.3)
    return abs(math.log(L) - mu)
from collections import Counter
def majority_override(texts, min_votes=3):
    cnt = Counter(texts); text, votes = cnt.most_common(1)[0]
    return (text if votes >= min_votes else None), votes

# Pools: Hindi uses {512 single-seed CF, tokendp_512, tokenselect}; Tamil uses {512 single-seed CF, tokendp_512, tokenselect}
cand_hi_files = [
    path_512cf,
    'submission_tokendp_512.csv',
    'submission_tokenselect_512single_or_384_lambda012.csv'
]
cand_ta_files = [
    path_512cf,
    'submission_tokendp_512.csv',
    'submission_tokenselect_512single_or_384_lambda012.csv',
]
def load_cands(paths):
    out = []
    for f in paths:
        try:
            df = pd.read_csv(f)
            if set(df.columns) >= {'id','PredictionString'} and len(df)==len(test):
                out.append((f, df.set_index('id')['PredictionString'].astype(str).to_dict()))
                print('Loaded:', f)
            else:
                print('Skip (shape/cols):', f)
        except Exception as e:
            print('Skip:', f, '->', e)
    return out
cands_hi = load_cands(cand_hi_files)
cands_ta = load_cands(cand_ta_files)

lambda_hi = 0.02; lambda_ta = 0.045
rows = []; t0 = time.time()
for qid in id_list:
    lang = id2lang[qid]
    pool = cands_hi if lang=='hindi' else cands_ta
    texts = [d[qid] for _, d in pool]
    maj, votes = majority_override(texts, min_votes=3)
    if maj is not None:
        rows.append((qid, maj)); continue
    best_i, best_s, best_d, best_L = -1, -1e18, 1e9, 10**9
    lam = lambda_hi if lang=='hindi' else lambda_ta
    for i, ti in enumerate(texts):
        s = 0.0; cnt = 0
        for j, tj in enumerate(texts):
            if i==j: continue
            s += jaccard(ti, tj); cnt += 1
        avg_j = s / max(1, cnt)
        dlen = len_prior_dist(ti, lang)
        score = avg_j - lam * dlen
        L = len(str(ti))
        if (score > best_s + 1e-12) or (abs(score - best_s) <= 0.01 and (dlen < best_d or (abs(dlen-best_d) <= 1e-6 and L < best_L))):
            best_s, best_i, best_d, best_L = score, i, dlen, L
    rows.append((qid, texts[best_i]))

sub = pd.DataFrame(rows, columns=['id','PredictionString'])
empties = (sub['PredictionString'].astype(str).str.len()==0).sum()
mean_len = sub['PredictionString'].astype(str).str.len().mean()
print(f'Alignment-safe meta-consensus built. Empties={int(empties)}, mean_len={mean_len:.2f}, time={time.time()-t0:.2f}s')

out_path = 'submission_meta_consensus_alignsafe_512singlecf_tokendp_tokenselect.csv'
sub.to_csv(out_path, index=False)
pd.read_csv(out_path).to_csv('submission.csv', index=False)
print('submission.csv updated ->', out_path)

512 single-seed: (1401, 512) mapping 1401


512 single-seed char-fusion: Empties=0, mean_len=10.79, time=2.26s
Loaded: submission_charfusion_512_single_seed_alignsafe.csv
Loaded: submission_tokendp_512.csv
Loaded: submission_tokenselect_512single_or_384_lambda012.csv
Loaded: submission_charfusion_512_single_seed_alignsafe.csv
Loaded: submission_tokendp_512.csv
Loaded: submission_tokenselect_512single_or_384_lambda012.csv
Alignment-safe meta-consensus built. Empties=0, mean_len=10.86, time=0.00s
submission.csv updated -> submission_meta_consensus_alignsafe_512singlecf_tokendp_tokenselect.csv


In [61]:
# Alignment-safe meta-consensus with normalized scoring (no 3-seed; Tamil excludes 384-only, Hindi also drops 384-only)
import pandas as pd, numpy as np, math, time, re, os, unicodedata as ud

test = pd.read_csv('test.csv')
id_list = test['id'].tolist()
lang_map = dict(zip(test['id'], test['language']))

# Alignment-safe candidate pools
# Hindi: 512 single-seed char-fusion, 512 token-DP, 512 vs 384 token-select (DROP 384-only CF)
# Tamil: 512 single-seed char-fusion, 512 token-DP, 512 vs 384 token-select
cand_hi_files = [
    'submission_charfusion_512_single_seed_alignsafe.csv',
    'submission_tokendp_512.csv',
    'submission_tokenselect_512single_or_384_lambda012.csv',
]
cand_ta_files = [
    'submission_charfusion_512_single_seed_alignsafe.csv',
    'submission_tokendp_512.csv',
    'submission_tokenselect_512single_or_384_lambda012.csv',
]

def load_cands(paths):
    out = []
    for f in paths:
        try:
            if not os.path.exists(f):
                print('Missing:', f); continue
            df = pd.read_csv(f)
            if set(df.columns) >= {'id','PredictionString'} and len(df)==len(test):
                out.append((f, df.set_index('id')['PredictionString'].astype(str).to_dict()))
                print('Loaded:', f)
            else:
                print('Skip (shape/cols):', f)
        except Exception as e:
            print('Skip:', f, '->', e)
    return out

cands_hi = load_cands(cand_hi_files)
cands_ta = load_cands(cand_ta_files)
assert len(cands_hi) >= 3 and len(cands_ta) >= 3, 'Insufficient alignment-safe candidates'

# Normalization for scoring (do not change final output text) + digit/sep normalization
PUNCT = set(list('.,!?;:\"\'()[]{}\u201c\u201d\u2018\u2019\u00ab\u00bb'))
DIG_MAP = str.maketrans({
    '\u0966':'0','\u0967':'1','\u0968':'2','\u0969':'3','\u096a':'4','\u096b':'5','\u096c':'6','\u096d':'7','\u096e':'8','\u096f':'9',
    '\u0be6':'0','\u0be7':'1','\u0be8':'2','\u0be9':'3','\u0bea':'4','\u0beb':'5','\u0bec':'6','\u0bed':'7','\u0bee':'8','\u0bef':'9'
})
SEP_PAT = re.compile(r'[\u2010\u2011\u2012\u2013\u2014\u2212]+')  # various dashes
DANDA = '\u0964'
def norm_text(s: str) -> str:
    s = str(s).strip()
    s = s.translate(DIG_MAP)
    s = SEP_PAT.sub('-', s)
    s = s.replace('\u00A0',' ').replace('\u2009',' ').replace('\u200A',' ')
    while len(s) >= 2 and ((s[0], s[-1]) in [("\"","\""),("'","'"),('\u201c','\u201d'),('\u2018','\u2019'),('(',')'),('[',']'),('{','}'),('\u00ab','\u00bb')]):
        s = s[1:-1].strip()
    # drop standalone danda at ends for matching robustness
    s = s.strip(DANDA)
    s = re.sub(r'\s+', ' ', s)
    s = s.strip(''.join(PUNCT))
    return s.strip()

def words_norm(s):
    s = norm_text(s)
    toks = []
    for w in s.split():
        w = w.strip(''.join(PUNCT))
        if w: toks.append(w)
    return toks

def jaccard_norm(a, b):
    wa, wb = set(words_norm(a)), set(words_norm(b))
    if not wa and not wb: return 1.0
    if not wa or not wb: return 0.0
    inter = len(wa & wb); uni = len(wa | wb)
    return inter/uni if uni>0 else 0.0

# Per-language log-normal priors
train = pd.read_csv('train.csv')
train['answer_text'] = train['answer_text'].astype(str).str.strip()
train['char_len'] = train['answer_text'].str.len().clip(lower=1)
def fit_log_normal_params(df):
    x = np.log(df['char_len'].values.astype(float))
    mu = float(x.mean()); sd = float(x.std());
    if sd <= 1e-6: sd = 1e-6
    return mu, sd
priors = {lang: dict(zip(['mu','sigma'], fit_log_normal_params(g))) for lang, g in train.groupby('language')}

def len_prior_dist(s, lang):
    L = max(1, len(str(s))); mu = priors.get(lang, {}).get('mu', 2.3)
    return abs(math.log(L) - mu)

from collections import Counter
def majority_override_norm(texts, min_votes=3):
    norm_map = [norm_text(t) for t in texts]
    cnt = Counter(norm_map)
    top_text, votes = cnt.most_common(1)[0]
    if votes >= min_votes:
        for orig, nt in zip(texts, norm_map):
            if nt == top_text:
                return orig, votes
    return None, votes

# Length regularization weights
lambda_hi = 0.02
lambda_ta = 0.045

# Numeric/date micro-rule: match on normalized TD text; allow digits/separators; len <= 16; output raw TD
num_re = re.compile(r'^[\d\u0966-\u096f\u0be6-\u0bef\s\-/\.]+$')

def tokendp_index(pool):
    for i, (fname, _) in enumerate(pool):
        if fname.endswith('submission_tokendp_512.csv'):
            return i
    return None

def tokenselect_index(pool):
    for i, (fname, _) in enumerate(pool):
        if fname.endswith('submission_tokenselect_512single_or_384_lambda012.csv'):
            return i
    return None

# Ultra-safe micro-trim (apply only if trimmed normalized form exists among candidates)
TA_PULLI = '\u0BCD'
def apply_micro_trim(chosen: str, lang: str, cand_norms: set, norm_text_fn):
    if not chosen:
        return chosen
    # Hindi: final danda
    if lang == 'hindi' and chosen.endswith(DANDA):
        trimmed = chosen[:-1].rstrip()
        if norm_text_fn(trimmed) in cand_norms:
            return trimmed
    # Any trailing combining mark (incl. virama/pulli)
    if len(chosen) >= 2 and ud.category(chosen[-1]) == 'Mn':
        trimmed = chosen[:-1]
        if norm_text_fn(trimmed) in cand_norms:
            return trimmed
    return chosen

rows = []
t0 = time.time()
for qid in id_list:
    lang = lang_map[qid]
    pool = cands_hi if lang=='hindi' else cands_ta
    texts = [d[qid] for _, d in pool]
    cand_norms = {norm_text(t) for t in texts}
    # Micro-override: prefer token-DP if numeric/date-like and short (per-language cap) on normalized form
    tdp_idx = tokendp_index(pool)
    if tdp_idx is not None:
        tdp_raw = texts[tdp_idx]
        tdp_norm = norm_text(tdp_raw).replace('\u0964','')
        if tdp_raw and (((lang == 'hindi') and (len(tdp_raw) <= 18)) or ((lang != 'hindi') and (len(tdp_raw) <= 16))) and num_re.match(tdp_norm):
            pick = apply_micro_trim(tdp_raw, lang, cand_norms, norm_text)
            rows.append((qid, pick))
            continue
    # Majority override on normalized strings
    maj, votes = majority_override_norm(texts, min_votes=3)
    if maj is not None:
        pick = apply_micro_trim(maj, lang, cand_norms, norm_text)
        # Tie-break: if token-select normalized equals chosen normalized, prefer token-select raw
        ts_idx = tokenselect_index(pool)
        if ts_idx is not None:
            TS = texts[ts_idx]
            if norm_text(TS) == norm_text(pick) and TS != pick:
                pick = apply_micro_trim(TS, lang, cand_norms, norm_text)
        rows.append((qid, pick))
        continue
    # Removed contained-shorter rule to avoid over-shortening
    # Score by mean normalized Jaccard; add small per-language len prior; tie-break by closer to prior, then shorter within 0.01
    best_i, best_s, best_d, best_L = -1, -1e18, 1e9, 10**9
    lam = lambda_hi if lang=='hindi' else lambda_ta
    for i, ti in enumerate(texts):
        s = 0.0; cnt = 0
        for j, tj in enumerate(texts):
            if i==j: continue
            s += jaccard_norm(ti, tj); cnt += 1
        avg_j = s / max(1, cnt)
        dlen = len_prior_dist(ti, lang)
        score = avg_j - lam * dlen
        L = len(str(ti))
        if (score > best_s + 1e-12) or (abs(score - best_s) <= 0.01 and (dlen < best_d or (abs(dlen-best_d) <= 1e-6 and L < best_L))):
            best_s, best_i, best_d, best_L = score, i, dlen, L
    pick = apply_micro_trim(texts[best_i], lang, cand_norms, norm_text)
    # Tie-break after scoring: if token-select normalized equals chosen normalized, prefer token-select raw
    ts_idx = tokenselect_index(pool)
    if ts_idx is not None:
        TS = texts[ts_idx]
        if norm_text(TS) == norm_text(pick) and TS != pick:
            pick = apply_micro_trim(TS, lang, cand_norms, norm_text)
    rows.append((qid, pick))

sub = pd.DataFrame(rows, columns=['id','PredictionString'])
empties = (sub['PredictionString'].astype(str).str.len()==0).sum()
mean_len = sub['PredictionString'].astype(str).str.len().mean()
print(f'Align-safe meta-consensus (norm scoring, +numeric<=16 +digit/sep norm +micro-trim, TS-norm tie-break, no-contained-shorter, HI no 384-only) built. Empties={int(empties)}, mean_len={mean_len:.2f}, time={time.time()-t0:.2f}s')

out_path = 'submission_meta_consensus_alignsafe_normscore_512singlecf_tokendp_tokenselect.csv'
sub.to_csv(out_path, index=False)
pd.read_csv(out_path).to_csv('submission.csv', index=False)
print('submission.csv updated ->', out_path)

Loaded: submission_charfusion_512_single_seed_alignsafe.csv
Loaded: submission_tokendp_512.csv
Loaded: submission_tokenselect_512single_or_384_lambda012.csv
Loaded: submission_charfusion_512_single_seed_alignsafe.csv
Loaded: submission_tokendp_512.csv
Loaded: submission_tokenselect_512single_or_384_lambda012.csv


Align-safe meta-consensus (norm scoring, +numeric<=16 +digit/sep norm +micro-trim, TS-norm tie-break, no-contained-shorter, HI no 384-only) built. Empties=0, mean_len=10.70, time=0.02s
submission.csv updated -> submission_meta_consensus_alignsafe_normscore_512singlecf_tokendp_tokenselect.csv


In [57]:
# Hedge 2 (per-id confidence gate): fallback to token-DP on low-consensus; trailing-punct diff prefers token-DP; numeric/date override <=16
import pandas as pd, numpy as np, re, time, math, os, unicodedata as ud

test = pd.read_csv('test.csv')
id_list = test['id'].tolist()
lang_map = dict(zip(test['id'], test['language']))

# Inputs:
# - Cell 28 normalized meta-consensus (primary) WITH numeric/date micro-override
sub28_path = 'submission_meta_consensus_alignsafe_normscore_512singlecf_tokendp_tokenselect.csv'
# - Cell 27 non-normalized meta-consensus (older pool; approximate use for trailing-punct check if present)
sub27_path = 'submission_meta_consensus_alignsafe_512singlecf_tokendp_tokenselect.csv'
# - Alignment-safe three candidates for confidence computation: 512 CF single-seed, token-DP 512, token-select
cf_path  = 'submission_charfusion_512_single_seed_alignsafe.csv'
tdp_path = 'submission_tokendp_512.csv'
tsel_path= 'submission_tokenselect_512single_or_384_lambda012.csv'

def must_load(path):
    assert os.path.exists(path), f'Missing {path}'
    df = pd.read_csv(path)
    assert set(df.columns) >= {'id','PredictionString'} and len(df)==len(test), f'Bad shape/cols for {path}'
    return df.set_index('id')['PredictionString'].astype(str).to_dict()

m28 = must_load(sub28_path)
mCF = must_load(cf_path)
mTD = must_load(tdp_path)
mTS = must_load(tsel_path)
m27 = None
if os.path.exists(sub27_path):
    try:
        m27 = must_load(sub27_path)
    except Exception:
        m27 = None

# Normalized scoring utils (match Cell 28 digit/sep + punctuation handling)
PUNCT = set(list('.,!?;:\"\'()[]{}“”‘’«»'))
DIG_MAP = str.maketrans({
    '०':'0','१':'1','२':'2','३':'3','४':'4','५':'5','६':'6','७':'7','८':'8','९':'9',
    '௦':'0','௧':'1','௨':'2','௩':'3','௪':'4','௫':'5','௬':'6','௭':'7','௮':'8','௯':'9'
})
SEP_PAT = re.compile(r'[\u2010\u2011\u2012\u2013\u2014\u2212]+')
DANDA = '\u0964'
def norm_text(s: str) -> str:
    s = str(s).strip()
    s = s.translate(DIG_MAP)
    s = SEP_PAT.sub('-', s)
    s = s.replace('\u00A0',' ').replace('\u2009',' ').replace('\u200A',' ')
    while len(s) >= 2 and ((s[0], s[-1]) in [("\"","\""),("'","'"),('“','”'),('‘','’'),('(',')'),('[',']'),('{','}'),('«','»')]):
        s = s[1:-1].strip()
    s = s.strip(DANDA)
    s = re.sub(r'\s+', ' ', s)
    s = s.strip(''.join(PUNCT))
    return s.strip()

def words_norm(s):
    s = norm_text(s)
    toks = []
    for w in s.split():
        w = w.strip(''.join(PUNCT))
        if w: toks.append(w)
    return toks

def jaccard_norm(a, b):
    wa, wb = set(words_norm(a)), set(words_norm(b))
    if not wa and not wb: return 1.0
    if not wa or not wb: return 0.0
    inter = len(wa & wb); uni = len(wa | wb)
    return inter/uni if uni>0 else 0.0

# Numeric/date micro-override (HI len <= 18, TA len <= 16; match on normalized TD); output TD raw when fired
num_re = re.compile(r'^[\d\u0966-\u096f\u0be6-\u0bef\s\-/\.]+$')

# Ultra-safe micro-trim (same as Cell 28): trim final danda or combining mark if another candidate's normalized form matches the trimmed)
TA_PULLI = '\u0BCD'
def apply_micro_trim(chosen: str, lang: str, cand_norms: set, norm_text_fn):
    if not chosen:
        return chosen
    # Hindi: final danda
    if lang == 'hindi' and chosen.endswith(DANDA):
        trimmed = chosen[:-1].rstrip()
        if norm_text_fn(trimmed) in cand_norms:
            return trimmed
    # Any trailing combining mark
    if len(chosen) >= 2 and ud.category(chosen[-1]) == 'Mn':
        trimmed = chosen[:-1]
        if norm_text_fn(trimmed) in cand_norms:
            return trimmed
    return chosen

rows = []
t0 = time.time()
for qid in id_list:
    lang = lang_map[qid]
    # Numeric/date override first (per-language len cap)
    TD = mTD[qid]
    if TD:
        td_norm = norm_text(TD).replace(DANDA,'')
        len_cap = 18 if lang == 'hindi' else 16
        if len(TD) <= len_cap and num_re.match(td_norm):
            rows.append((qid, TD))
            continue
    # Three alignment-safe candidates for confidence computation
    CF = mCF[qid]; TS = mTS[qid]
    cands = [CF, TD, TS]
    cand_norms = {norm_text(t) for t in cands}
    # Preference 1: if token-select normalized equals char-fusion normalized, prefer token-select raw
    if norm_text(TS) == norm_text(CF) and TS != CF:
        pick = apply_micro_trim(TS, lang, cand_norms, norm_text)
        rows.append((qid, pick))
        continue
    # Preference 2 (new): if token-select normalized equals token-DP normalized, prefer token-select raw
    if norm_text(TS) == norm_text(TD) and TS != TD:
        pick = apply_micro_trim(TS, lang, cand_norms, norm_text)
        rows.append((qid, pick))
        continue
    # Compute mean normalized Jaccard for each candidate vs the other two
    conf_scores = []
    for i in range(3):
        s = 0.0; cnt = 0
        for j in range(3):
            if i==j: continue
            s += jaccard_norm(cands[i], cands[j]); cnt += 1
        conf_scores.append(s / max(1, cnt))
    conf = max(conf_scores) if conf_scores else 0.0
    pick = m28[qid]  # default to Cell 28 primary
    # Optional trailing-punctuation/quotes diff rule using Cell 27 (if available)
    if m27 is not None:
        t27n = norm_text(m27[qid]); t28n = norm_text(m28[qid])
        if t27n == t28n and m27[qid] != m28[qid]:
            pick = mTD[qid]
            pick = apply_micro_trim(pick, lang, cand_norms, norm_text)
            rows.append((qid, pick)); continue
    # Per-language low-confidence gate
    thresh = 0.37 if lang == 'tamil' else 0.33
    if conf < thresh:
        pick = mTD[qid]
    pick = apply_micro_trim(pick, lang, cand_norms, norm_text)
    rows.append((qid, pick))

sub = pd.DataFrame(rows, columns=['id','PredictionString'])
empties = (sub['PredictionString'].astype(str).str.len()==0).sum()
mean_len = sub['PredictionString'].astype(str).str.len().mean()
print(f'Per-id hedge built (TA conf<0.37/HI conf<0.33, +HI numeric<=18/TA<=16, +TS=CF|TD norm prefer TS, +micro-trim). Empties={int(empties)}, mean_len={mean_len:.2f}, time={time.time()-t0:.2f}s')

out_path = 'submission_perid_hedge_tokendp_lowconf.csv'
sub.to_csv(out_path, index=False)
pd.read_csv(out_path).to_csv('submission.csv', index=False)
print('submission.csv updated ->', out_path)

Per-id hedge built (TA conf<0.37/HI conf<0.33, +HI numeric<=18/TA<=16, +TS=CF|TD norm prefer TS, +micro-trim). Empties=0, mean_len=10.47, time=0.02s
submission.csv updated -> submission_perid_hedge_tokendp_lowconf.csv


In [46]:
# Hedge 1: Non-normalized meta-consensus (standard Jaccard) + numeric/date micro-override; strict 3-candidate pool
import pandas as pd, numpy as np, math, time, re, os

test = pd.read_csv('test.csv')
id_list = test['id'].tolist()
lang_map = dict(zip(test['id'], test['language']))

# Strict alignment-safe pools (exactly 3 candidates for both HI and TA, as in Cell 27 before 384 addition)
cand_hi_files = [
    'submission_charfusion_512_single_seed_alignsafe.csv',
    'submission_tokendp_512.csv',
    'submission_tokenselect_512single_or_384_lambda012.csv',
]
cand_ta_files = [
    'submission_charfusion_512_single_seed_alignsafe.csv',
    'submission_tokendp_512.csv',
    'submission_tokenselect_512single_or_384_lambda012.csv',
]

def load_cands(paths):
    out = []
    for f in paths:
        try:
            if not os.path.exists(f):
                print('Missing:', f); continue
            df = pd.read_csv(f)
            if set(df.columns) >= {'id','PredictionString'} and len(df)==len(test):
                out.append((f, df.set_index('id')['PredictionString'].astype(str).to_dict()))
                print('Loaded:', f)
            else:
                print('Skip (shape/cols):', f)
        except Exception as e:
            print('Skip:', f, '->', e)
    return out

cands_hi = load_cands(cand_hi_files)
cands_ta = load_cands(cand_ta_files)
assert len(cands_hi) == 3 and len(cands_ta) == 3, 'Expected exactly 3 candidates per language'

# Standard word-level Jaccard
def words(s):
    return [w for w in str(s).strip().split() if len(w)>0]
def jaccard(a, b):
    wa, wb = set(words(a)), set(words(b))
    if not wa and not wb: return 1.0
    if not wa or not wb: return 0.0
    inter = len(wa & wb); uni = len(wa | wb)
    return inter/uni if uni>0 else 0.0

# Per-language log-normal priors
train = pd.read_csv('train.csv')
train['answer_text'] = train['answer_text'].astype(str).str.strip()
train['char_len'] = train['answer_text'].str.len().clip(lower=1)
def fit_log_normal_params(df):
    x = np.log(df['char_len'].values.astype(float))
    mu = float(x.mean()); sd = float(x.std());
    if sd <= 1e-6: sd = 1e-6
    return mu, sd
priors = {lang: dict(zip(['mu','sigma'], fit_log_normal_params(g))) for lang, g in train.groupby('language')}
def len_prior_dist(s, lang):
    L = max(1, len(str(s))); mu = priors.get(lang, {}).get('mu', 2.3)
    return abs(math.log(L) - mu)

from collections import Counter
def majority_override(texts, min_votes=3):
    cnt = Counter(texts)
    text, votes = cnt.most_common(1)[0]
    return (text if votes >= min_votes else None), votes

# Length regularization weights (keep as advised)
lambda_hi = 0.02
lambda_ta = 0.045

# Numeric/date micro-rule regex per expert: prefer token-DP if matches and <=12 chars
num_re = re.compile(r'^[\d\u0966-\u096f\u0be6-\u0bef\s\-/\.]+$')
def tokendp_index(pool):
    for i, (fname, _) in enumerate(pool):
        if fname.endswith('submission_tokendp_512.csv'):
            return i
    return None

rows = []
t0 = time.time()
for qid in id_list:
    lang = lang_map[qid]
    pool = cands_hi if lang=='hindi' else cands_ta
    texts = [d[qid] for _, d in pool]
    # Micro-override (numeric/date-like and short) -> token-DP
    tdp_idx = tokendp_index(pool)
    if tdp_idx is not None:
        tdp_txt = texts[tdp_idx]
        if tdp_txt and len(tdp_txt) <= 12 and num_re.match(tdp_txt):
            rows.append((qid, tdp_txt))
            continue
    # Majority override on raw strings
    maj, votes = majority_override(texts, min_votes=3)
    if maj is not None:
        rows.append((qid, maj))
        continue
    # Score by mean word-level Jaccard + small len prior; tie-break by closer to prior, then shorter within 0.01
    best_i, best_s, best_d, best_L = -1, -1e18, 1e9, 10**9
    lam = lambda_hi if lang=='hindi' else lambda_ta
    for i, ti in enumerate(texts):
        s = 0.0; cnt = 0
        for j, tj in enumerate(texts):
            if i==j: continue
            s += jaccard(ti, tj); cnt += 1
        avg_j = s / max(1, cnt)
        dlen = len_prior_dist(ti, lang)
        score = avg_j - lam * dlen
        L = len(str(ti))
        if (score > best_s + 1e-12) or (abs(score - best_s) <= 0.01 and (dlen < best_d or (abs(dlen-best_d) <= 1e-6 and L < best_L))):
            best_s, best_i, best_d, best_L = score, i, dlen, L
    rows.append((qid, texts[best_i]))

sub = pd.DataFrame(rows, columns=['id','PredictionString'])
empties = (sub['PredictionString'].astype(str).str.len()==0).sum()
mean_len = sub['PredictionString'].astype(str).str.len().mean()
print(f'Hedge1 non-normalized (std Jaccard, +numeric override) built. Empties={int(empties)}, mean_len={mean_len:.2f}, time={time.time()-t0:.2f}s')

out_path = 'submission_meta_consensus_alignsafe_nonorm_numeric_3cand.csv'
sub.to_csv(out_path, index=False)
pd.read_csv(out_path).to_csv('submission.csv', index=False)
print('submission.csv updated ->', out_path)

Loaded: submission_charfusion_512_single_seed_alignsafe.csv
Loaded: submission_tokendp_512.csv
Loaded: submission_tokenselect_512single_or_384_lambda012.csv
Loaded: submission_charfusion_512_single_seed_alignsafe.csv
Loaded: submission_tokendp_512.csv
Loaded: submission_tokenselect_512single_or_384_lambda012.csv


Hedge1 non-normalized (std Jaccard, +numeric override) built. Empties=0, mean_len=10.71, time=0.00s
submission.csv updated -> submission_meta_consensus_alignsafe_nonorm_numeric_3cand.csv


In [48]:
# Meta-hedge: combine current norm+containment (Cell 28), older norm (384hi variant if present), and non-norm (Cell 30), with TD low-conf gate
import pandas as pd, numpy as np, re, math, time, os

test = pd.read_csv('test.csv')
id_list = test['id'].tolist()
lang_map = dict(zip(test['id'], test['language']))

# Candidates
path_curr = 'submission_meta_consensus_alignsafe_normscore_512singlecf_tokendp_tokenselect.csv'  # Cell 28 current (digit/sep+containment)
path_old  = 'submission_meta_consensus_alignsafe_normscore_512singlecf_tokendp_tokenselect_384hi.csv'  # older norm-safe (no containment)
path_non  = 'submission_meta_consensus_alignsafe_nonorm_numeric_3cand.csv'  # Cell 30 non-normalized numeric override
path_tdp  = 'submission_tokendp_512.csv'

def must_load(path):
    assert os.path.exists(path), f'Missing {path}'
    df = pd.read_csv(path)
    assert set(df.columns) >= {'id','PredictionString'} and len(df)==len(test), f'Bad shape for {path}'
    return df.set_index('id')['PredictionString'].astype(str).to_dict()

m_curr = must_load(path_curr)
m_non  = must_load(path_non)
m_tdp  = must_load(path_tdp)
m_old  = None
if os.path.exists(path_old):
    try:
        m_old = must_load(path_old)
    except Exception:
        m_old = None

# Normalized scoring helpers
PUNCT = set(list('.,!?;:"\'()[]{}“”‘’«»'))
def norm_text(s: str) -> str:
    s = str(s).strip()
    while len(s) >= 2 and ((s[0], s[-1]) in [("\"","\""),("'","'"),('“','”'),('‘','’'),('(',')'),('[',']'),('{','}'),('«','»')]):
        s = s[1:-1].strip()
    s = re.sub(r'\s+', ' ', s)
    s = s.strip(''.join(PUNCT))
    return s.strip()
def words_norm(s):
    s = norm_text(s)
    toks = []
    for w in s.split():
        w = w.strip(''.join(PUNCT))
        if w: toks.append(w)
    return toks
def jaccard_norm(a, b):
    wa, wb = set(words_norm(a)), set(words_norm(b))
    if not wa and not wb: return 1.0
    if not wa or not wb: return 0.0
    inter = len(wa & wb); uni = len(wa | wb)
    return inter/uni if uni>0 else 0.0

# Length prior from train
train = pd.read_csv('train.csv')
train['answer_text'] = train['answer_text'].astype(str).str.strip()
train['char_len'] = train['answer_text'].str.len().clip(lower=1)
def fit_log_normal_params(df):
    x = np.log(df['char_len'].values.astype(float))
    mu = float(x.mean()); sd = float(x.std());
    if sd <= 1e-6: sd = 1e-6
    return mu, sd
priors = {lang: dict(zip(['mu','sigma'], fit_log_normal_params(g))) for lang, g in train.groupby('language')}
def len_prior_dist(s, lang):
    L = max(1, len(str(s))); mu = priors.get(lang, {}).get('mu', 2.3)
    return abs(math.log(L) - mu)

# Numeric/date micro-rule
num_re = re.compile(r'^[\d\u0966-\u096f\u0be6-\u0bef\s\-/\.]+$')

lambda_hi = 0.02
lambda_ta = 0.045
CONF_THRESH = 0.35

rows = []
t0 = time.time()
for qid in id_list:
    lang = lang_map[qid]
    # build trio: current norm, old norm (fallback to current if missing), non-norm
    A = m_curr[qid]
    B = m_old[qid] if m_old is not None else A
    C = m_non[qid]
    TD = m_tdp[qid]
    # Numeric/date override: prefer TD if short numeric/date
    if TD and len(TD) <= 12 and num_re.match(TD):
        rows.append((qid, TD))
        continue
    # Confidence over A/B/C
    texts = [A, B, C]
    confs = []
    best_idx, best_score, best_d, best_L = -1, -1e18, 1e9, 10**9
    lam = lambda_hi if lang=='hindi' else lambda_ta
    for i, ti in enumerate(texts):
        s = 0.0; cnt = 0
        for j, tj in enumerate(texts):
            if i==j: continue
            s += jaccard_norm(ti, tj); cnt += 1
        avg_j = s / max(1, cnt)
        confs.append(avg_j)
        dlen = len_prior_dist(ti, lang)
        score = avg_j - lam * dlen
        L = len(str(ti))
        if (score > best_score + 1e-12) or (abs(score - best_score) <= 0.01 and (dlen < best_d or (abs(dlen-best_d) <= 1e-6 and L < best_L))):
            best_score, best_idx, best_d, best_L = score, i, dlen, L
    conf = max(confs) if confs else 0.0
    pick = texts[best_idx]
    if conf < CONF_THRESH:
        pick = TD
    rows.append((qid, pick))

sub = pd.DataFrame(rows, columns=['id','PredictionString'])
empties = (sub['PredictionString'].astype(str).str.len()==0).sum()
mean_len = sub['PredictionString'].astype(str).str.len().mean()
print(f'Meta-hedge (A=current norm, B=old norm 384hi, C=non-norm, TD low-conf) Empties={int(empties)}, mean_len={mean_len:.2f}, time={time.time()-t0:.2f}s')

out_path = 'submission_metahedge_curr_old_non_tdp.csv'
sub.to_csv(out_path, index=False)
pd.read_csv(out_path).to_csv('submission.csv', index=False)
print('submission.csv updated ->', out_path)

Meta-hedge (A=current norm, B=old norm 384hi, C=non-norm, TD low-conf) Empties=0, mean_len=10.47, time=0.01s
submission.csv updated -> submission_metahedge_curr_old_non_tdp.csv


In [51]:
# Switch submission.csv to older normalized 384hi variant and print diagnostics
import pandas as pd, os, time
path = 'submission_meta_consensus_alignsafe_normscore_512singlecf_tokendp_tokenselect_384hi.csv'
assert os.path.exists(path), f'Missing {path}'
sub = pd.read_csv(path)
empties = (sub['PredictionString'].astype(str).str.len()==0).sum()
mean_len = sub['PredictionString'].astype(str).str.len().mean()
sub.to_csv('submission.csv', index=False)
print('submission.csv updated ->', path)
print('Diagnostics: empties=', int(empties), 'mean_len=', round(float(mean_len), 2))
print('mtime(submission.csv)=', time.ctime(os.path.getmtime('submission.csv')))

submission.csv updated -> submission_meta_consensus_alignsafe_normscore_512singlecf_tokendp_tokenselect_384hi.csv
Diagnostics: empties= 0 mean_len= 10.65
mtime(submission.csv)= Thu Sep 25 12:33:08 2025


In [52]:
# Hedge 1b: Non-normalized consensus (std Jaccard) + numeric/date override <=16; add 384-only to Hindi pool
import pandas as pd, numpy as np, math, time, re, os

test = pd.read_csv('test.csv')
id_list = test['id'].tolist()
lang_map = dict(zip(test['id'], test['language']))

# Pools: Hindi uses 4 candidates (add 384-only CF for diversity); Tamil stays at 3
cand_hi_files = [
    'submission_charfusion_512_single_seed_alignsafe.csv',
    'submission_tokendp_512.csv',
    'submission_tokenselect_512single_or_384_lambda012.csv',
    'submission_384only_charfusion_lambda015.csv',
]
cand_ta_files = [
    'submission_charfusion_512_single_seed_alignsafe.csv',
    'submission_tokendp_512.csv',
    'submission_tokenselect_512single_or_384_lambda012.csv',
]

def load_cands(paths):
    out = []
    for f in paths:
        try:
            if not os.path.exists(f):
                print('Missing:', f); continue
            df = pd.read_csv(f)
            if set(df.columns) >= {'id','PredictionString'} and len(df)==len(test):
                out.append((f, df.set_index('id')['PredictionString'].astype(str).to_dict()))
                print('Loaded:', f)
            else:
                print('Skip (shape/cols):', f)
        except Exception as e:
            print('Skip:', f, '->', e)
    return out

cands_hi = load_cands(cand_hi_files)
cands_ta = load_cands(cand_ta_files)
assert len(cands_hi) >= 3 and len(cands_ta) == 3, 'Unexpected candidate counts'

# Standard word-level Jaccard
def words(s):
    return [w for w in str(s).strip().split() if len(w)>0]
def jaccard(a, b):
    wa, wb = set(words(a)), set(words(b))
    if not wa and not wb: return 1.0
    if not wa or not wb: return 0.0
    inter = len(wa & wb); uni = len(wa | wb)
    return inter/uni if uni>0 else 0.0

# Per-language log-normal priors
train = pd.read_csv('train.csv')
train['answer_text'] = train['answer_text'].astype(str).str.strip()
train['char_len'] = train['answer_text'].str.len().clip(lower=1)
def fit_log_normal_params(df):
    x = np.log(df['char_len'].values.astype(float))
    mu = float(x.mean()); sd = float(x.std());
    if sd <= 1e-6: sd = 1e-6
    return mu, sd
priors = {lang: dict(zip(['mu','sigma'], fit_log_normal_params(g))) for lang, g in train.groupby('language')}
def len_prior_dist(s, lang):
    L = max(1, len(str(s))); mu = priors.get(lang, {}).get('mu', 2.3)
    return abs(math.log(L) - mu)

from collections import Counter
def majority_override(texts, min_votes=3):
    cnt = Counter(texts)
    text, votes = cnt.most_common(1)[0]
    return (text if votes >= min_votes else None), votes

# Length regularization weights
lambda_hi = 0.02
lambda_ta = 0.045

# Numeric/date micro-rule: prefer token-DP if normalized numeric/date-like and len<=16; output TD raw
num_re = re.compile(r'^[\d\u0966-\u096f\u0be6-\u0bef\s\-/\.]+$')
PUNCT = set(list('.,!?;:\"\'()[]{}“”‘’«»'))
def norm_text(s: str) -> str:
    s = str(s).strip()
    while len(s) >= 2 and ((s[0], s[-1]) in [("\"","\""),("'","'"),('“','”'),('‘','’'),('(',')'),('[',']'),('{','}'),('«','»')]):
        s = s[1:-1].strip()
    s = re.sub(r'\s+', ' ', s)
    s = s.strip(''.join(PUNCT))
    return s.strip()
def tokendp_index(pool):
    for i, (fname, _) in enumerate(pool):
        if fname.endswith('submission_tokendp_512.csv'):
            return i
    return None

rows = []
t0 = time.time()
for qid in id_list:
    lang = lang_map[qid]
    pool = cands_hi if lang=='hindi' else cands_ta
    texts = [d[qid] for _, d in pool]
    # Numeric/date override first (normalized match), len<=16 -> pick TD raw
    tdp_idx = tokendp_index(pool)
    if tdp_idx is not None:
        td_raw = texts[tdp_idx]
        if td_raw:
            td_norm = norm_text(td_raw).replace('\u0964','')
            if len(td_raw) <= 16 and num_re.match(td_norm):
                rows.append((qid, td_raw)); continue
    # Majority override on raw strings
    maj, votes = majority_override(texts, min_votes=3)
    if maj is not None:
        rows.append((qid, maj)); continue
    # Score by mean word-Jaccard + small len prior; tie-break by closer to prior, then shorter within 0.01
    best_i, best_s, best_d, best_L = -1, -1e18, 1e9, 10**9
    lam = lambda_hi if lang=='hindi' else lambda_ta
    for i, ti in enumerate(texts):
        s = 0.0; cnt = 0
        for j, tj in enumerate(texts):
            if i==j: continue
            s += jaccard(ti, tj); cnt += 1
        avg_j = s / max(1, cnt)
        dlen = len_prior_dist(ti, lang)
        score = avg_j - lam * dlen
        L = len(str(ti))
        if (score > best_s + 1e-12) or (abs(score - best_s) <= 0.01 and (dlen < best_d or (abs(dlen-best_d) <= 1e-6 and L < best_L))):
            best_s, best_i, best_d, best_L = score, i, dlen, L
    rows.append((qid, texts[best_i]))

sub = pd.DataFrame(rows, columns=['id','PredictionString'])
empties = (sub['PredictionString'].astype(str).str.len()==0).sum()
mean_len = sub['PredictionString'].astype(str).str.len().mean()
print(f'Hedge1b non-normalized (std Jaccard, +numeric<=16, HI+384-only) built. Empties={int(empties)}, mean_len={mean_len:.2f}, time={time.time()-t0:.2f}s')

out_path = 'submission_meta_consensus_alignsafe_nonorm_numeric_hi384.csv'
sub.to_csv(out_path, index=False)
pd.read_csv(out_path).to_csv('submission.csv', index=False)
print('submission.csv updated ->', out_path)

Loaded: submission_charfusion_512_single_seed_alignsafe.csv
Loaded: submission_tokendp_512.csv
Loaded: submission_tokenselect_512single_or_384_lambda012.csv
Loaded: submission_384only_charfusion_lambda015.csv
Loaded: submission_charfusion_512_single_seed_alignsafe.csv
Loaded: submission_tokendp_512.csv
Loaded: submission_tokenselect_512single_or_384_lambda012.csv


Hedge1b non-normalized (std Jaccard, +numeric<=16, HI+384-only) built. Empties=0, mean_len=10.52, time=0.01s
submission.csv updated -> submission_meta_consensus_alignsafe_nonorm_numeric_hi384.csv


In [64]:
# Final meta-hedge: combine Cell 28 (norm meta-consensus), Cell 29 (per-id hedge), and token-DP
import pandas as pd, numpy as np, re, math, time, os, unicodedata as ud

test = pd.read_csv('test.csv')
id_list = test['id'].tolist()
lang_map = dict(zip(test['id'], test['language']))

path_28 = 'submission_meta_consensus_alignsafe_normscore_512singlecf_tokendp_tokenselect.csv'
path_29 = 'submission_perid_hedge_tokendp_lowconf.csv'
path_td = 'submission_tokendp_512.csv'

def must_load(path):
    assert os.path.exists(path), f'Missing {path}'
    df = pd.read_csv(path)
    assert set(df.columns) >= {'id','PredictionString'} and len(df)==len(test), f'Bad shape/cols for {path}'
    return df.set_index('id')['PredictionString'].astype(str).to_dict()

m28 = must_load(path_28)
m29 = must_load(path_29)
mTD = must_load(path_td)

# Normalization for scoring only (digits/dashes/quotes-insensitive, matches Cell 28)
PUNCT = set(list('.,!?;:\"\'()[]{}“”‘’«»'))
DIG_MAP = str.maketrans({
    '०':'0','१':'1','२':'2','३':'3','४':'4','५':'5','६':'6','७':'7','८':'8','९':'9',
    '௦':'0','௧':'1','௨':'2','௩':'3','௪':'4','௫':'5','௬':'6','௭':'7','௮':'8','௯':'9'
})
SEP_PAT = re.compile(r'[\u2010\u2011\u2012\u2013\u2014\u2212]+')
DANDA = '\u0964'
def norm_text(s: str) -> str:
    s = str(s).strip()
    s = s.translate(DIG_MAP)
    s = SEP_PAT.sub('-', s)
    s = s.replace('\u00A0',' ').replace('\u2009',' ').replace('\u200A',' ')
    while len(s) >= 2 and ((s[0], s[-1]) in [("\"","\""),("'","'"),('“','”'),('‘','’'),('(',')'),('[',']'),('{','}'),('«','»')]):
        s = s[1:-1].strip()
    s = s.strip(DANDA)
    s = re.sub(r'\s+', ' ', s)
    s = s.strip(''.join(PUNCT))
    return s.strip()

def words_norm(s):
    s = norm_text(s)
    toks = []
    for w in s.split():
        w = w.strip(''.join(PUNCT))
        if w: toks.append(w)
    return toks

def jaccard_norm(a, b):
    wa, wb = set(words_norm(a)), set(words_norm(b))
    if not wa and not wb: return 1.0
    if not wa or not wb: return 0.0
    inter = len(wa & wb); uni = len(wa | wb)
    return inter/uni if uni>0 else 0.0

# Per-language length prior from train
train = pd.read_csv('train.csv')
train['answer_text'] = train['answer_text'].astype(str).str.strip()
train['char_len'] = train['answer_text'].str.len().clip(lower=1)
def fit_log_normal_params(df):
    x = np.log(df['char_len'].values.astype(float))
    mu = float(x.mean()); sd = float(x.std());
    if sd <= 1e-6: sd = 1e-6
    return mu, sd
priors = {lang: dict(zip(['mu','sigma'], fit_log_normal_params(g))) for lang, g in train.groupby('language')}
def len_prior_dist(s, lang):
    L = max(1, len(str(s))); mu = priors.get(lang, {}).get('mu', 2.3)
    return abs(math.log(L) - mu)

# Numeric/date micro-override: HI len<=18, TA len<=16; normalized numeric/date-like -> pick TD raw
num_re = re.compile(r'^[\d\u0966-\u096f\u0be6-\u0bef\s\-/\.]+$')

# Ultra-safe micro-trim: trim final danda or combining mark if another candidate's normalized form matches the trimmed
def apply_micro_trim(chosen: str, lang: str, cand_norms: set):
    if not chosen:
        return chosen
    if lang == 'hindi' and chosen.endswith(DANDA):
        trimmed = chosen[:-1].rstrip()
        if norm_text(trimmed) in cand_norms:
            return trimmed
    if len(chosen) >= 2 and ud.category(chosen[-1]) == 'Mn':
        trimmed = chosen[:-1]
        if norm_text(trimmed) in cand_norms:
            return trimmed
    return chosen

lambda_hi = 0.02
lambda_ta = 0.045

rows = []
t0 = time.time()
for qid in id_list:
    lang = lang_map[qid]
    A = m28[qid]; B = m29[qid]; TD = mTD[qid]
    # One-line tweak: if A and B are norm-equivalent but differ raw, prefer B
    if A != B and norm_text(A) == norm_text(B): A = B
    # Numeric/date override first
    if TD:
        td_norm = norm_text(TD).replace(DANDA,'')
        len_cap = 18 if lang == 'hindi' else 16
        if len(TD) <= len_cap and num_re.match(td_norm):
            rows.append((qid, TD))
            continue
    texts = [A, B, TD]
    cand_norms = {norm_text(t) for t in texts}
    # Majority on normalized strings
    from collections import Counter
    cnt = Counter([norm_text(t) for t in texts])
    top_norm, votes = cnt.most_common(1)[0]
    if votes >= 2 and top_norm != '':
        for t in texts:
            if norm_text(t) == top_norm:
                pick = apply_micro_trim(t, lang, cand_norms)
                rows.append((qid, pick))
                break
        continue
    # Score by mean normalized Jaccard + small len prior (per-language)
    best_i, best_s, best_d, best_L = -1, -1e18, 1e9, 10**9
    lam = lambda_hi if lang == 'hindi' else lambda_ta
    for i, ti in enumerate(texts):
        s = 0.0; cntp = 0
        for j, tj in enumerate(texts):
            if i==j: continue
            s += jaccard_norm(ti, tj); cntp += 1
        avg_j = s / max(1, cntp)
        dlen = len_prior_dist(ti, lang)
        score = avg_j - lam * dlen
        L = len(str(ti))
        if (score > best_s + 1e-12) or (abs(score - best_s) <= 0.01 and (dlen < best_d or (abs(dlen-best_d) <= 1e-6 and L < best_L))):
            best_s, best_i, best_d, best_L = score, i, dlen, L
    pick = apply_micro_trim(texts[best_i], lang, cand_norms)
    rows.append((qid, pick))

sub = pd.DataFrame(rows, columns=['id','PredictionString'])
empties = (sub['PredictionString'].astype(str).str.len()==0).sum()
mean_len = sub['PredictionString'].astype(str).str.len().mean()
print(f'Final meta-hedge (Cell28 vs Cell29 vs TD) built. Empties={int(empties)}, mean_len={mean_len:.2f}, time={time.time()-t0:.2f}s')

out_path = 'submission_metahedge_28_29_tokendp.csv'
sub.to_csv(out_path, index=False)
pd.read_csv(out_path).to_csv('submission.csv', index=False)
print('submission.csv updated ->', out_path)

Final meta-hedge (Cell28 vs Cell29 vs TD) built. Empties=0, mean_len=10.71, time=0.01s
submission.csv updated -> submission_metahedge_28_29_tokendp.csv


In [63]:
# Zero-shot QA (deepset/xlm-roberta-large-squad2) + ultra-safe meta with Cell 34 pick
import pandas as pd, numpy as np, time, re, os, unicodedata as ud
from transformers import pipeline, AutoTokenizer, AutoModelForQuestionAnswering
import torch

test = pd.read_csv('test.csv')
ids = test['id'].tolist()
questions = test['question'].astype(str).tolist()
contexts = test['context'].astype(str).tolist()

# Load zero-shot QA pipeline (prefer GPU if available)
model_name = 'deepset/xlm-roberta-large-squad2'
device = 0 if torch.cuda.is_available() else -1
qa = pipeline('question-answering', model=model_name, tokenizer=model_name, device=device)

rows = []
t0 = time.time()
for i, (qid, q, ctx) in enumerate(zip(ids, questions, contexts)):
    try:
        out = qa({'question': q, 'context': ctx}, handle_impossible_answer=True)
        ans = str(out.get('answer', '')).strip()
    except Exception:
        ans = ''
    if not ans:
        ans = ctx[:1] if len(ctx) > 0 else ''
    rows.append((qid, ans))
    if (i+1) % 20 == 0:
        print(f'Zero-shot decoded {i+1}/{len(ids)} in {time.time()-t0:.1f}s', flush=True)

sub_zz = pd.DataFrame(rows, columns=['id','PredictionString'])
out_path_zz = 'submission_zeroshot_xlmr_squad2.csv'
sub_zz.to_csv(out_path_zz, index=False)
print('Wrote', out_path_zz, 'empties=', int((sub_zz['PredictionString'].astype(str).str.len()==0).sum()),
      'mean_len=', float(sub_zz['PredictionString'].astype(str).str.len().mean()))

# Ultra-safe merge: prefer zero-shot raw ONLY if its normalized form equals current meta-hedge pick
curr_path = 'submission_metahedge_28_29_tokendp.csv'
assert os.path.exists(curr_path), f'Missing {curr_path}'
curr = pd.read_csv(curr_path).set_index('id')['PredictionString'].astype(str).to_dict()
zz = sub_zz.set_index('id')['PredictionString'].astype(str).to_dict()

PUNCT = set(list('.,!?;:\"\'()[]{}\u201c\u201d\u2018\u2019\u00ab\u00bb'))
DIG_MAP = str.maketrans({
    '\u0966':'0','\u0967':'1','\u0968':'2','\u0969':'3','\u096a':'4','\u096b':'5','\u096c':'6','\u096d':'7','\u096e':'8','\u096f':'9',
    '\u0be6':'0','\u0be7':'1','\u0be8':'2','\u0be9':'3','\u0bea':'4','\u0beb':'5','\u0bec':'6','\u0bed':'7','\u0bee':'8','\u0bef':'9'
})
SEP_PAT = re.compile(r'[\u2010\u2011\u2012\u2013\u2014\u2212]+')
DANDA = '\u0964'
def norm_text(s: str) -> str:
    s = str(s).strip()
    s = s.translate(DIG_MAP)
    s = SEP_PAT.sub('-', s)
    s = s.replace('\u00A0',' ').replace('\u2009',' ').replace('\u200A',' ')
    while len(s) >= 2 and ((s[0], s[-1]) in [("\"","\""),("'","'"),('\u201c','\u201d'),('\u2018','\u2019'),('(',')'),('[',']'),('{','}'),('\u00ab','\u00bb')]):
        s = s[1:-1].strip()
    s = s.strip(DANDA)
    s = re.sub(r'\s+', ' ', s)
    s = s.strip(''.join(PUNCT))
    return s.strip()

def apply_micro_trim(chosen: str, lang: str, cand_norms: set):
    if not chosen:
        return chosen
    if lang == 'hindi' and chosen.endswith(DANDA):
        trimmed = chosen[:-1].rstrip()
        if norm_text(trimmed) in cand_norms:
            return trimmed
    if len(chosen) >= 2 and ud.category(chosen[-1]) == 'Mn':
        trimmed = chosen[:-1]
        if norm_text(trimmed) in cand_norms:
            return trimmed
    return chosen

lang_map = dict(zip(test['id'], test['language']))
rows2 = []
for qid in ids:
    base = curr[qid]
    z = zz[qid]
    lang = lang_map[qid]
    cand_norms = {norm_text(base), norm_text(z)}
    pick = base
    if norm_text(z) == norm_text(base) and z != base:
        pick = apply_micro_trim(z, lang, cand_norms)
    rows2.append((qid, pick))

sub_final = pd.DataFrame(rows2, columns=['id','PredictionString'])
empties = (sub_final['PredictionString'].astype(str).str.len()==0).sum()
mean_len = sub_final['PredictionString'].astype(str).str.len().mean()
print(f'Zero-shot ultra-safe merge done. Empties={int(empties)}, mean_len={mean_len:.2f}')

out_merge = 'submission_metahedge_28_29_tokendp_plus_zeroshot.csv'
sub_final.to_csv(out_merge, index=False)
pd.read_csv(out_merge).to_csv('submission.csv', index=False)
print('submission.csv updated ->', out_merge)

  from .autonotebook import tqdm as notebook_tqdm


Some weights of the model checkpoint at deepset/xlm-roberta-large-squad2 were not used when initializing XLMRobertaForQuestionAnswering: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).




You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Zero-shot decoded 20/112 in 16.3s


Zero-shot decoded 40/112 in 36.6s


Zero-shot decoded 60/112 in 55.0s


Zero-shot decoded 80/112 in 67.8s


Zero-shot decoded 100/112 in 79.0s


Wrote submission_zeroshot_xlmr_squad2.csv empties= 0 mean_len= 9.375
Zero-shot ultra-safe merge done. Empties=0, mean_len=10.82
submission.csv updated -> submission_metahedge_28_29_tokendp_plus_zeroshot.csv
