In [None]:
import subprocess
import sys
import os
import shutil
from pathlib import Path

def pip(*args):
    print('>', *args, flush=True)
    subprocess.run([sys.executable, '-m', 'pip', *args], check=True)

# Set PIP_TARGET to writable directory
pip_target = '/app/.pip-target'
os.environ['PIP_TARGET'] = pip_target
if os.path.exists(pip_target):
    print('Removing existing', pip_target)
    shutil.rmtree(pip_target, ignore_errors=True)

# 0) Hard reset any prior torch stacks
for pkg in ('torch', 'torchvision', 'torchaudio'):
    subprocess.run([sys.executable, '-m', 'pip', 'uninstall', '-y', pkg], check=False)

# Clean stray site dirs
for d in (
    f'{pip_target}/torch',
    f'{pip_target}/torch-2.8.0.dist-info',
    f'{pip_target}/torch-2.4.1.dist-info',
    f'{pip_target}/torchvision',
    f'{pip_target}/torchvision-0.23.0.dist-info',
    f'{pip_target}/torchvision-0.19.1.dist-info',
    f'{pip_target}/torchaudio',
    f'{pip_target}/torchaudio-2.8.0.dist-info',
    f'{pip_target}/torchaudio-2.4.1.dist-info',
    f'{pip_target}/torchgen',
    f'{pip_target}/functorch',
):
    if os.path.exists(d):
        print('Removing', d)
        shutil.rmtree(d, ignore_errors=True)

# 1) Install the EXACT cu121 torch stack FIRST with --no-deps to avoid system dir installs
pip('install',
    '--index-url', 'https://download.pytorch.org/whl/cu121',
    '--extra-index-url', 'https://pypi.org/simple',
    '--force-reinstall', '--no-deps',
    'torch==2.4.1', 'torchvision==0.19.1', 'torchaudio==2.4.1')

# 2) Create a constraints file
Path('constraints.txt').write_text(
    'torch==2.4.1\n'
    'torchvision==0.19.1\n'
    'torchaudio==2.4.1\n'
)

# 3) Install NON-torch deps
pip('install', '-c', 'constraints.txt',
    'transformers==4.44.2', 'accelerate==0.34.2',
    'datasets==2.21.0', 'evaluate==0.4.2',
    'sentencepiece', 'scikit-learn',
    '--upgrade-strategy', 'only-if-needed')

# 4) Sanity gate - add pip_target to sys.path
sys.path.insert(0, pip_target)
import torch
print('torch:', torch.__version__, 'built CUDA:', getattr(torch.version, 'cuda', None))
print('CUDA available:', torch.cuda.is_available())
assert str(getattr(torch.version,'cuda','')).startswith('12.1'), f'Wrong CUDA build: {torch.version.cuda}'
assert torch.cuda.is_available(), 'CUDA not available'
print('GPU:', torch.cuda.get_device_name(0))

# Install additional packages with PIP_TARGET
pip('install', 'rank_bm25')
pip('install', 'langdetect')
pip('install', 'indic-nlp-library', 'pyarrow')

# Downgrade fsspec
pip('install', '-c', 'constraints.txt', 'fsspec[http]<=2024.6.1,>=2023.1.0', '--upgrade')

# Verify additional imports
try:
    from rank_bm25 import BM25Okapi
    print('BM25 available')
except ImportError:
    print('BM25 not available')
try:
    from langdetect import detect
    print('langdetect available')
except ImportError:
    print('langdetect not available')
print('Environment setup complete')

Removing existing /app/.pip-target






> install --index-url https://download.pytorch.org/whl/cu121 --extra-index-url https://pypi.org/simple --force-reinstall --no-deps torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1




Looking in indexes: https://download.pytorch.org/whl/cu121, https://pypi.org/simple


Collecting torch==2.4.1
  Downloading https://download.pytorch.org/whl/cu121/torch-2.4.1%2Bcu121-cp311-cp311-linux_x86_64.whl (799.0 MB)


In [None]:
import os
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
import gc
import ast
import sys
import copy
import json
import math
import random
import time
from datetime import datetime

import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, TensorDataset
from torch.cuda.amp import autocast, GradScaler

from transformers import (
    AutoTokenizer,
    AutoModel,
    AutoConfig,
    get_linear_schedule_with_warmup,
    TrainingArguments,
    Trainer,
    AutoModelForQuestionAnswering,
    )
from transformers import default_data_collator

from datasets import load_dataset
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import hashlib

import subprocess
import shutil
import unicodedata

# Add pip_target to sys.path if not already
pip_target = '/app/.pip-target'
if pip_target not in sys.path:
    sys.path.insert(0, pip_target)

# BM25 and langdetect
BM25_AVAILABLE = False
try:
    from rank_bm25 import BM25Okapi
    BM25_AVAILABLE = True
    print('BM25 available')
except ImportError:
    print('BM25 not available, falling back to TF-IDF only')

LANGDETECT_AVAILABLE = False
try:
    from langdetect import detect
    LANGDETECT_AVAILABLE = True
    print('langdetect available')
except ImportError:
    print('langdetect not available, using script fallback')

# Script-based language detection fallback
def detect_lang(text):
    if not isinstance(text, str):
        return 'hindi'
    for c in text:
        if 0x0B80 <= ord(c) <= 0x0BFF:  # Tamil Unicode range
            return 'tamil'
    return 'hindi'

# Set seeds
def set_seed(seed=123):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

set_seed(123)

# Constants with coach tweaks for seed 123
DEBUG = False  # Set to True for rapid prototyping
MAX_LEN = 512
DOC_STRIDE = 128
N_SPLITS = 5
BATCH_SIZE = 2
GRAD_ACCUM_STEPS = 12
EPOCHS = 4
LR = 2.5e-5
WEIGHT_DECAY = 0.01
NEG_WEIGHT = 0.3
USE_RETRIEVAL = True
TOP_K_CHUNKS_TRAIN = 12
TOP_K_CHUNKS_EVAL_HINDI = 10
TOP_K_CHUNKS_EVAL_TAMIL = 35  # Coach tweak for better Tamil recall
CHUNK_SIZE = 1800
OVERLAP = 250
NEG_POS_RATIO = 2
MODEL_NAME = 'deepset/xlm-roberta-large-squad2'
PUNCT = '\u0964,.\uff0c!\uff01?\uff1f"\\\'\u201c\u201d\u2018\u2019()[\]{}:;'
MAX_ANSWER_LENGTH = 80  # Coach tweak for longer spans

# Load data
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

if DEBUG:
    train_df = train_df.sample(n=200, random_state=123).reset_index(drop=True)
    print(f'DEBUG mode: using {len(train_df)} samples')
else:
    print(f'Full mode: using {len(train_df)} samples')

print('Train shape:', train_df.shape)
print('Test shape:', test_df.shape)

# Label alignment fix with progress tracking
print('Before fix_span')
def fix_span(row):
    ctx, ans, s = row['context'], row['answer_text'], row['answer_start']
    if s < 0 or ctx[s:s+len(ans)] != ans:
        idx = ctx.find(ans)
        if idx != -1:
            row['answer_start'] = idx
    return row

train_df = train_df.apply(fix_span, axis=1)
print('After fix_span')

# Context groups for CV (hash first 1024 chars to group same articles)
def get_context_hash(context):
    return hashlib.md5(context[:1024].encode()).hexdigest()

train_df['context_hash'] = train_df['context'].apply(get_context_hash)
print('Context hashes computed')

# Jaccard metric with NFKC normalization
def jaccard_word(pred, true):
    pred = unicodedata.normalize('NFKC', pred).lower()
    true = unicodedata.normalize('NFKC', true).lower()
    if not pred or not true:
        return 0.0
    pw, tw = set(pred.split()), set(true.split())
    return len(pw & tw) / len(pw | tw) if pw and tw else 0.0

def compute_jaccard(preds, trues):
    return np.mean([jaccard_word(p, t) for p, t in zip(preds, trues)])

# Assign language to test_df using langdetect or fallback
print('Assigning language to test_df...')
if LANGDETECT_AVAILABLE:
    test_df['language'] = test_df['question'].apply(lambda x: {'ta':'tamil','hi':'hindi'}.get(detect(x), 'hindi') if isinstance(x, str) else 'hindi')
else:
    test_df['language'] = test_df['question'].apply(detect_lang)
print('Test language dist:', test_df['language'].value_counts())

# CV splitting with StratifiedGroupKFold
sgkf = StratifiedGroupKFold(n_splits=N_SPLITS, shuffle=True, random_state=123)
train_df['fold'] = -1
for fold, (trn_idx, val_idx) in enumerate(sgkf.split(train_df, train_df['language'], groups=train_df['context_hash'])):
    train_df.loc[val_idx, 'fold'] = fold

print('Fold distribution:')
print(train_df.groupby(['fold', 'language']).size())
print(f'Folds created: {train_df["fold"].nunique()}')

N_FOLDS = 3 if DEBUG else N_SPLITS
print(f'Using {N_FOLDS} folds for training')

In [None]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
print('Tokenizer loaded:', tokenizer.name_or_path)

# TF-IDF Retrieval setup with language-specific vectorizers
if USE_RETRIEVAL:
    print('Fitting language-specific TF-IDF vectorizers...')
    hindi_df = train_df[train_df['language'] == 'hindi']
    tamil_df = train_df[train_df['language'] == 'tamil']
    
    # Hindi vectorizer
    print('Processing Hindi...')
    hindi_questions = hindi_df['question'].tolist()
    hindi_contexts = hindi_df['context'].tolist()
    hindi_chunks = []
    for ctx in tqdm(hindi_contexts, desc='Chunking Hindi contexts'):
        chunks = []
        for i in range(0, len(ctx), CHUNK_SIZE - OVERLAP):
            chunk = ctx[i:i + CHUNK_SIZE]
            if len(chunk) > 100:
                chunks.append(chunk)
        hindi_chunks.extend(chunks)
    print(f'Hindi chunks total: {len(hindi_chunks)}')
    hindi_corpus = hindi_questions + random.sample(hindi_chunks, min(3000, len(hindi_chunks)))
    print(f'Hindi corpus size: {len(hindi_corpus)}')
    hindi_vectorizer = TfidfVectorizer(
        analyzer='char_wb',
        ngram_range=(2, 4),
        max_features=5000,
        min_df=2,
        max_df=0.95,
        lowercase=False,
        sublinear_tf=True,
        dtype=np.float32
    )
    print('Fitting Hindi vectorizer...')
    start_time = time.time()
    hindi_vectorizer.fit(hindi_corpus)
    fit_time = time.time() - start_time
    print(f'Hindi TF-IDF fitted in {fit_time:.2f}s: {len(hindi_corpus)} docs')
    
    # Tamil vectorizer - fixed to char n-grams for better recall
    print('Processing Tamil...')
    tamil_questions = tamil_df['question'].tolist()
    tamil_contexts = tamil_df['context'].tolist()
    tamil_chunks = []
    for ctx in tqdm(tamil_contexts, desc='Chunking Tamil contexts'):
        chunks = []
        for i in range(0, len(ctx), CHUNK_SIZE - OVERLAP):
            chunk = ctx[i:i + CHUNK_SIZE]
            if len(chunk) > 100:
                chunks.append(chunk)
        tamil_chunks.extend(chunks)
    print(f'Tamil chunks total: {len(tamil_chunks)}')
    tamil_corpus = tamil_questions + random.sample(tamil_chunks, min(1500, len(tamil_chunks)))
    print(f'Tamil corpus size: {len(tamil_corpus)}')
    tamil_vectorizer = TfidfVectorizer(
        analyzer='char_wb',
        ngram_range=(3, 5),
        max_features=15000,
        min_df=3,
        max_df=0.9,
        lowercase=False,
        sublinear_tf=True,
        dtype=np.float32
    )
    print('Fitting Tamil vectorizer...')
    start_time = time.time()
    tamil_vectorizer.fit(tamil_corpus)
    fit_time = time.time() - start_time
    print(f'Tamil TF-IDF fitted in {fit_time:.2f}s: {len(tamil_corpus)} docs')
else:
    hindi_vectorizer = tamil_vectorizer = None

In [None]:
# Prepare training features with hybrid retrieval and sliding windows
def prepare_train_features(examples, neg_pos_ratio=NEG_POS_RATIO):
    features = []
    for ex in examples:
        q, ctx, ans, ex_id, lang = ex['question'].strip(), ex['context'].strip(), {'text': ex['answer_text'], 'answer_start': ex['answer_start']}, ex['id'], ex['language']
        
        if USE_RETRIEVAL:
            # Chunk context
            chunks = []
            chunk_starts = []
            for i in range(0, len(ctx), CHUNK_SIZE - OVERLAP):
                chunk = ctx[i:i + CHUNK_SIZE]
                if len(chunk) > 100:
                    chunks.append(chunk)
                    chunk_starts.append(i)
            
            if not chunks:
                continue
            
            # Select vectorizer by language
            if lang == 'hindi':
                vectorizer = hindi_vectorizer
            else:
                vectorizer = tamil_vectorizer
            
            # TF-IDF retrieval
            q_vec = vectorizer.transform([q])
            chunk_vecs = vectorizer.transform(chunks)
            similarities = cosine_similarity(q_vec, chunk_vecs).flatten()
            
            # BM25 hybrid if available
            if BM25_AVAILABLE:
                tokenized_chunks = [chunk.lower().split() for chunk in chunks]
                bm25 = BM25Okapi(tokenized_chunks)
                q_tokens = q.lower().split()
                bm25_scores = bm25.get_scores(q_tokens)
                if np.max(bm25_scores) > 0:
                    norm_bm25 = bm25_scores / np.max(bm25_scores)
                else:
                    norm_bm25 = np.zeros_like(bm25_scores)
                hybrid_scores = 0.5 * norm_bm25 + 0.5 * similarities
            else:
                hybrid_scores = similarities
            top_indices = np.argsort(hybrid_scores)[-TOP_K_CHUNKS_TRAIN:]
            
            # Guarantee gold chunk inclusion for training by replacing lowest sim if needed
            start_char = ans['answer_start']
            end_char = start_char + len(ans['text'])
            pos_idx = None
            for ci, st in enumerate(chunk_starts):
                if start_char >= st and end_char <= st + len(chunks[ci]):
                    pos_idx = ci
                    break
            if pos_idx is not None and pos_idx not in top_indices:
                # Replace the lowest hybrid score in top_indices with pos_idx
                min_hybrid_arg = np.argmin(hybrid_scores[top_indices])
                top_indices[min_hybrid_arg] = pos_idx
            # Sort by hybrid descending
            sort_args = np.argsort(hybrid_scores[top_indices])[::-1]
            top_indices = top_indices[sort_args]
            
            # Get top chunks with their global start positions
            top_chunks = [(hybrid_scores[idx], chunk_starts[idx], chunks[idx]) for idx in top_indices]
        else:
            top_chunks = [(1.0, 0, ctx)]  # full context if no retrieval
        
        # Now process each top chunk with sliding windows
        pos_feats, neg_feats = [], []
        for sim, chunk_start, chunk in top_chunks:
            tokenized = tokenizer(
                q,
                chunk,
                truncation='only_second',
                max_length=MAX_LEN,
                stride=DOC_STRIDE,
                return_overflowing_tokens=True,
                return_offsets_mapping=True,
                padding=False,
            )
            
            for j in range(len(tokenized['input_ids'])):
                input_ids = tokenized['input_ids'][j]
                attention_mask = tokenized['attention_mask'][j]
                offsets = tokenized['offset_mapping'][j]
                sequence_ids = tokenized.sequence_ids(j)
                
                # Skip windows without context tokens
                if 1 not in sequence_ids:
                    continue
                
                # Global offsets: add chunk_start to context offsets
                global_offsets = []
                ctx_start = 0
                while ctx_start < len(sequence_ids) and sequence_ids[ctx_start] != 1:
                    global_offsets.append(None)
                    ctx_start += 1
                while ctx_start < len(sequence_ids) and sequence_ids[ctx_start] == 1:
                    local_offset = offsets[ctx_start]
                    global_offset = (local_offset[0] + chunk_start, local_offset[1] + chunk_start) if local_offset else None
                    global_offsets.append(global_offset)
                    ctx_start += 1
                while ctx_start < len(sequence_ids):
                    global_offsets.append(None)
                    ctx_start += 1
                
                # Find start/end positions using global offsets
                start_pos = -1
                end_pos = -1
                is_positive = False
                start_char = ans['answer_start']
                end_char = start_char + len(ans['text'])
                
                for tok_idx, off in enumerate(global_offsets):
                    if off is not None and off[0] <= start_char < off[1]:
                        start_pos = tok_idx
                    if off is not None and off[0] < end_char <= off[1]:
                        end_pos = tok_idx
                if start_pos != -1 and end_pos != -1 and end_pos >= start_pos:
                    is_positive = True
                else:
                    start_pos = 0
                    end_pos = 0
                
                # Pad/truncate
                pad_len = MAX_LEN - len(input_ids)
                if pad_len > 0:
                    input_ids += [tokenizer.pad_token_id] * pad_len
                    attention_mask += [0] * pad_len
                else:
                    input_ids = input_ids[:MAX_LEN]
                    attention_mask = attention_mask[:MAX_LEN]
                
                feat = {
                    'input_ids': input_ids,
                    'attention_mask': attention_mask,
                    'start_positions': start_pos,
                    'end_positions': end_pos,
                    'example_id': ex_id,
                    'is_positive': is_positive
                }
                (pos_feats if is_positive else neg_feats).append(feat)
        
        # Oversample positives as per expert advice
        if pos_feats:
            pos_feats = pos_feats * 2
        
        # Cap negatives
        if pos_feats:
            features.extend(pos_feats)
            random.shuffle(neg_feats)
            n_neg = min(len(neg_feats), neg_pos_ratio * len(pos_feats))
            features.extend(neg_feats[:n_neg])
        elif neg_feats:
            features.append(random.choice(neg_feats))
    return features

# Prepare validation features (lang-specific TOP_K_EVAL)
def prepare_validation_features(examples):
    features = []
    for ex in examples:
        q, ctx, ex_id, lang = ex['question'].strip(), ex['context'].strip(), ex['id'], ex['language']
        
        if USE_RETRIEVAL:
            # Same chunking and retrieval as train, but use lang-specific TOP_K_EVAL
            chunks = []
            chunk_starts = []
            for i in range(0, len(ctx), CHUNK_SIZE - OVERLAP):
                chunk = ctx[i:i + CHUNK_SIZE]
                if len(chunk) > 100:
                    chunks.append(chunk)
                    chunk_starts.append(i)
            
            if not chunks:
                continue
            
            # Select vectorizer by language
            if lang == 'hindi':
                vectorizer = hindi_vectorizer
                top_k_eval = TOP_K_CHUNKS_EVAL_HINDI
            else:
                vectorizer = tamil_vectorizer
                top_k_eval = TOP_K_CHUNKS_EVAL_TAMIL
            
            # TF-IDF
            q_vec = vectorizer.transform([q])
            chunk_vecs = vectorizer.transform(chunks)
            similarities = cosine_similarity(q_vec, chunk_vecs).flatten()
            
            # BM25 hybrid if available
            if BM25_AVAILABLE:
                tokenized_chunks = [chunk.lower().split() for chunk in chunks]
                bm25 = BM25Okapi(tokenized_chunks)
                q_tokens = q.lower().split()
                bm25_scores = bm25.get_scores(q_tokens)
                if np.max(bm25_scores) > 0:
                    norm_bm25 = bm25_scores / np.max(bm25_scores)
                else:
                    norm_bm25 = np.zeros_like(bm25_scores)
                hybrid_scores = 0.5 * norm_bm25 + 0.5 * similarities
            else:
                hybrid_scores = similarities
            top_indices = np.argsort(hybrid_scores)[-top_k_eval:]
            top_chunks = [(hybrid_scores[idx], chunk_starts[idx], chunks[idx]) for idx in top_indices]
        else:
            top_chunks = [(1.0, 0, ctx)]
        
        # Process each top chunk
        for sim, chunk_start, chunk in top_chunks:
            tokenized = tokenizer(
                q,
                chunk,
                truncation='only_second',
                max_length=MAX_LEN,
                stride=DOC_STRIDE,
                return_overflowing_tokens=True,
                return_offsets_mapping=True,
                padding=False,
            )
            
            for j in range(len(tokenized['input_ids'])):
                input_ids = tokenized['input_ids'][j]
                attention_mask = tokenized['attention_mask'][j]
                offsets = tokenized['offset_mapping'][j]
                sequence_ids = tokenized.sequence_ids(j)
                
                # Skip windows without context tokens
                if 1 not in sequence_ids:
                    continue
                
                # Global offsets for post-processing
                global_offsets = []
                ctx_start = 0
                while ctx_start < len(sequence_ids) and sequence_ids[ctx_start] != 1:
                    global_offsets.append(None)
                    ctx_start += 1
                while ctx_start < len(sequence_ids) and sequence_ids[ctx_start] == 1:
                    local_offset = offsets[ctx_start]
                    global_offset = (local_offset[0] + chunk_start, local_offset[1] + chunk_start) if local_offset else None
                    global_offsets.append(global_offset)
                    ctx_start += 1
                while ctx_start < len(sequence_ids):
                    global_offsets.append(None)
                    ctx_start += 1
                
                # Pad/truncate
                pad_len = MAX_LEN - len(input_ids)
                if pad_len > 0:
                    input_ids += [tokenizer.pad_token_id] * pad_len
                    attention_mask += [0] * pad_len
                    global_offsets += [None] * pad_len
                else:
                    input_ids = input_ids[:MAX_LEN]
                    attention_mask = attention_mask[:MAX_LEN]
                    global_offsets = global_offsets[:MAX_LEN]
                
                features.append({
                    'input_ids': input_ids,
                    'attention_mask': attention_mask,
                    'offset_mapping': global_offsets,
                    'example_id': ex_id,
                })
    return features

# Test on small batch
test_examples = train_df.head(1).to_dict('records')
print('Testing on example:', test_examples[0]['id'], 'Language:', test_examples[0]['language'])
print('Gold answer:', test_examples[0]['answer_text'], 'at', test_examples[0]['answer_start'])
train_features = prepare_train_features(test_examples)
val_features = prepare_validation_features(test_examples)
print(f'Train features: {len(train_features)}')
print(f'Val features: {len(val_features)}')
if train_features:
    print('Sample train feature keys:', list(train_features[0].keys()))
    print('Sample input_ids len:', len(train_features[0]['input_ids']))
    print('Sample is_positive:', train_features[0]['is_positive'])
if val_features:
    print('Sample val offset_mapping len:', len(val_features[0]['offset_mapping']))

In [None]:
import torch.nn.functional as F

# Post-processing to aggregate predictions across sliding windows with improved scoring
def get_predictions(features, start_logits, end_logits, n_best_size=50, max_answer_length=80):
    example_to_features = {}
    for i, f in enumerate(features):
        example_to_features.setdefault(f['example_id'], []).append((i, f))

    pred_dict = {}
    for example_id, feat_list in example_to_features.items():
        prelim_predictions = []
        for feat_idx, f in feat_list:
            offsets = f['offset_mapping']
            sl = start_logits[feat_idx]
            el = end_logits[feat_idx]

            # Context indices (non-None offsets)
            ctx_idx = [i for i, o in enumerate(offsets) if o is not None]
            if not ctx_idx:
                continue

            # Log-softmax on context logits only
            start_log = log_softmax_np(sl[ctx_idx])
            end_log = log_softmax_np(el[ctx_idx])

            # Top n_best_size start/end positions in context
            top_start_idx = np.argsort(sl[ctx_idx])[-n_best_size:].tolist()[::-1]
            top_end_idx = np.argsort(el[ctx_idx])[-n_best_size:].tolist()[::-1]

            # Global indices
            top_start = [ctx_idx[i] for i in top_start_idx]
            top_end = [ctx_idx[i] for i in top_end_idx]

            # Generate candidates
            for s in top_start:
                for e in top_end:
                    if e < s:
                        continue
                    length = e - s + 1
                    if length > max_answer_length:
                        continue
                    sc, ec = offsets[s][0], offsets[e][1]
                    # Score with softened length penalty
                    score = start_log[top_start_idx[top_start.index(s)]] + end_log[top_end_idx[top_end.index(e)]] - 0.001 * max(0, length - 25)
                    prelim_predictions.append((score, sc, ec))

        if prelim_predictions:
            _, sc, ec = max(prelim_predictions, key=lambda x: x[0])
            pred_dict[example_id] = (sc, ec)
        else:
            # Fallback: best single-token span in context across all features
            best_score = -np.inf
            best_sc, best_ec = 0, 0
            for feat_idx, f in feat_list:
                offsets = f['offset_mapping']
                sl = start_logits[feat_idx]
                ctx_idx = [i for i, o in enumerate(offsets) if o is not None]
                if not ctx_idx:
                    continue
                s_log = log_softmax_np(sl[ctx_idx])
                best_s_local = np.argmax(sl[ctx_idx])
                s_global = ctx_idx[best_s_local]
                sc, ec = offsets[s_global][0], offsets[s_global][1]
                score = s_log[best_s_local]
                if score > best_score:
                    best_score = score
                    best_sc, best_ec = sc, ec
            pred_dict[example_id] = (best_sc, best_ec)
    return pred_dict

# Function to extract answer from context with NFKC and punctuation trim
def extract_answer(context, start_char, end_char):
    if start_char == 0 and end_char == 0:
        return ''
    s = context[start_char:end_char]
    s = unicodedata.normalize('NFKC', s).strip().strip(PUNCT)
    return s

# Dataset class - updated to include is_positive for training
class QADataset(Dataset):
    def __init__(self, features):
        self.input_ids = [f['input_ids'] for f in features]
        self.attention_mask = [f['attention_mask'] for f in features]
        if 'start_positions' in features[0]:
            self.start_positions = [f['start_positions'] for f in features]
            self.end_positions = [f['end_positions'] for f in features]
            self.is_positive = [f['is_positive'] for f in features]
        else:
            self.start_positions = None
            self.end_positions = None
            self.is_positive = None
        self.offset_mapping = [f.get('offset_mapping') for f in features]
        self.example_id = [f['example_id'] for f in features]

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        item = {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx]
        }
        assert len(item['input_ids']) == MAX_LEN, 'Input ids not padded correctly'
        assert len(item['attention_mask']) == MAX_LEN, 'Attention mask not padded correctly'
        if self.start_positions is not None:
            item['start_positions'] = self.start_positions[idx]
            item['end_positions'] = self.end_positions[idx]
            item['is_positive'] = self.is_positive[idx]
        return item

# Custom Weighted Trainer to down-weight negative examples (fixed per-example weighting)
class WeightedQATrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        start_positions = inputs.pop('start_positions')
        end_positions = inputs.pop('end_positions')
        is_positive = inputs.pop('is_positive', None)  # tensor [bs] or None

        outputs = model(**inputs)
        start_logits = outputs.start_logits
        end_logits = outputs.end_logits

        start_loss = F.cross_entropy(start_logits, start_positions, reduction='none')
        end_loss = F.cross_entropy(end_logits, end_positions, reduction='none')
        loss = (start_loss + end_loss) / 2.0

        if is_positive is not None:
            ispos = is_positive.bool()
            weights = torch.where(ispos, torch.ones_like(loss), torch.full_like(loss, NEG_WEIGHT))
            loss = (loss * weights).mean()
        else:
            loss = loss.mean()

        return (loss, outputs) if return_outputs else loss

# Numpy log_softmax for numpy arrays
def log_softmax_np(x):
    x = x - np.max(x, axis=-1, keepdims=True)
    return x - np.log(np.sum(np.exp(x), axis=-1, keepdims=True))

# Test dataset creation
val_features_test = prepare_validation_features(train_df.head(1).to_dict('records'))
val_dataset_test = QADataset(val_features_test)
print(f'Dataset length: {len(val_dataset_test)}')
sample_item = val_dataset_test[0]
print('Sample item keys:', list(sample_item.keys()))
print('Sample input_ids len:', len(sample_item['input_ids']))

# Test train dataset with is_positive
trn_features_test = prepare_train_features(train_df.head(1).to_dict('records'))
if trn_features_test:
    trn_dataset_test = QADataset(trn_features_test)
    sample_trn_item = trn_dataset_test[0]
    print('Sample train item keys:', list(sample_trn_item.keys()))
    print('Sample is_positive:', sample_trn_item['is_positive'])

In [1]:
from transformers import TrainingArguments, Trainer

# Precompute test features once (language already set in Cell 1)
print('Test language distribution:', test_df['language'].value_counts())
test_features = prepare_validation_features(test_df.to_dict('records'))
test_dataset = QADataset(test_features)
test_start_sum = None
test_end_sum = None

# Training loop for seed 123
oof_preds = []
oof_trues = []
oof_ids = []
fold_jaccards = []

for fold in range(N_FOLDS):
    print(f'\n=== Fold {fold} ===')
    trn_df = train_df[train_df['fold'] != fold].reset_index(drop=True)
    val_df = train_df[train_df['fold'] == fold].reset_index(drop=True)
    print(f'Train: {len(trn_df)}, Val: {len(val_df)}')

    # 2x Tamil oversampling for better balance
    trn_df = pd.concat([trn_df, trn_df[trn_df['language'] == 'tamil']]).reset_index(drop=True)

    print('Preparing train features...')
    start_time = time.time()
    trn_features = prepare_train_features(trn_df.to_dict('records'))
    prep_time = time.time() - start_time
    print(f'Trn features prepared in {prep_time:.2f}s: {len(trn_features)}')

    print('Preparing val features...')
    start_time = time.time()
    val_features = prepare_validation_features(val_df.to_dict('records'))
    prep_time = time.time() - start_time
    print(f'Val features prepared in {prep_time:.2f}s: {len(val_features)}')

    trn_dataset = QADataset(trn_features)
    val_dataset = QADataset(val_features)

    model = AutoModelForQuestionAnswering.from_pretrained(MODEL_NAME)
    model.gradient_checkpointing_enable()
    param_count = sum(p.numel() for p in model.parameters())
    print(f'Model param count: {param_count:,}')

    args = TrainingArguments(
        output_dir=f'/tmp/model_seed123_{fold}',
        bf16=True,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=16,
        gradient_accumulation_steps=GRAD_ACCUM_STEPS,
        num_train_epochs=EPOCHS,
        learning_rate=LR,
        weight_decay=WEIGHT_DECAY,
        save_strategy='no',
        report_to='none',
        dataloader_pin_memory=False,
        dataloader_num_workers=2,
        remove_unused_columns=False,
        warmup_ratio=0.1,
        lr_scheduler_type='linear',
        max_grad_norm=1.0,
        logging_steps=10,  # More frequent logging
    )

    trainer = WeightedQATrainer(
        model=model,
        args=args,
        train_dataset=trn_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        data_collator=default_data_collator,
    )

    print('Starting training...')
    train_start = time.time()
    trainer.train()
    train_time = time.time() - train_start
    print(f'Training completed in {train_time:.2f}s')

    predictions = trainer.predict(val_dataset)
    pred_dict = get_predictions(val_features, predictions.predictions[0], predictions.predictions[1], n_best_size=50, max_answer_length=MAX_ANSWER_LENGTH)

    fold_preds = []
    for idx, row in val_df.iterrows():
        start_char, end_char = pred_dict.get(row['id'], (0, 0))
        pred = extract_answer(row['context'], start_char, end_char)
        fold_preds.append(pred)

    print('Empty OOF preds:', (np.array(fold_preds) == '').mean())

    fold_trues = val_df['answer_text'].tolist()
    fold_jacc = compute_jaccard(fold_preds, fold_trues)
    fold_jaccards.append(fold_jacc)
    print(f'Fold {fold} Jaccard: {fold_jacc:.4f}')

    oof_preds.extend(fold_preds)
    oof_trues.extend(fold_trues)
    oof_ids.extend(val_df['id'].tolist())

    # Per language
    hindi_mask = val_df['language'] == 'hindi'
    if hindi_mask.sum() > 0:
        pred_hindi = np.array(fold_preds)[hindi_mask]
        true_hindi = val_df.loc[hindi_mask, 'answer_text'].tolist()
        jacc_hindi = compute_jaccard(pred_hindi, true_hindi)
        print(f'  Hindi Jaccard: {jacc_hindi:.4f}')
    tamil_mask = val_df['language'] == 'tamil'
    if tamil_mask.sum() > 0:
        pred_tamil = np.array(fold_preds)[tamil_mask]
        true_tamil = val_df.loc[tamil_mask, 'answer_text'].tolist()
        jacc_tamil = compute_jaccard(pred_tamil, true_tamil)
        print(f'  Tamil Jaccard: {jacc_tamil:.4f}')

    # Accumulate test logits
    test_out = trainer.predict(test_dataset)
    if test_start_sum is None:
        test_start_sum = test_out.predictions[0]
        test_end_sum = test_out.predictions[1]
    else:
        test_start_sum += test_out.predictions[0]
        test_end_sum += test_out.predictions[1]

    del model, trainer, trn_dataset, val_dataset, trn_features, val_features
    gc.collect()
    torch.cuda.empty_cache()

print(f'\nMean fold Jaccard: {np.mean(fold_jaccards):.4f} (+/- {np.std(fold_jaccards):.4f})')
overall_jacc = compute_jaccard(oof_preds, oof_trues)
print(f'Overall OOF Jaccard: {overall_jacc:.4f}')

# Save OOF for analysis
oof_df = pd.DataFrame({'id': oof_ids, 'pred': oof_preds, 'true': oof_trues})
oof_df.to_csv('oof_predictions_seed123.csv', index=False)
print('OOF saved to oof_predictions_seed123.csv')

# Generate submission from averaged test logits with per-language max_answer_length
test_start_avg = test_start_sum / N_FOLDS
test_end_avg = test_end_sum / N_FOLDS

# Compute predictions with different max lengths
pred60 = get_predictions(test_features, test_start_avg, test_end_avg, n_best_size=50, max_answer_length=60)
pred90 = get_predictions(test_features, test_start_avg, test_end_avg, n_best_size=50, max_answer_length=90)

# Select per language
test_pred_dict = {}
for idx, row in test_df.iterrows():
    ex_id = row['id']
    if row['language'] == 'tamil':
        test_pred_dict[ex_id] = pred90.get(ex_id, (0, 0))
    else:
        test_pred_dict[ex_id] = pred60.get(ex_id, (0, 0))

submission_preds = []
for idx, row in test_df.iterrows():
    start_char, end_char = test_pred_dict.get(row['id'], (0, 0))
    pred = extract_answer(row['context'], start_char, end_char)
    submission_preds.append(pred)

submission = pd.DataFrame({'id': test_df['id'], 'PredictionString': submission_preds})
submission.to_csv('submission_seed123.csv', index=False)
print('Submission saved to submission_seed123.csv')

# Save test logits and feature order for ensembling (seed 123)
import json
np.savez('test_logits_seed123_sum.npz', start=test_start_sum, end=test_end_sum, n_folds=N_FOLDS)
json.dump([f['example_id'] for f in test_features], open('test_features_order_seed123.json', 'w'))
print('Test logits and feature order saved for ensembling')

Test language distribution: language
hindi    84
tamil    28
Name: count, dtype: int64



=== Fold 0 ===
Train: 800, Val: 202
Preparing train features...


Trn features prepared in 22.27s: 6343
Preparing val features...


Val features prepared in 3.73s: 1914


Some weights of the model checkpoint at deepset/xlm-roberta-large-squad2 were not used when initializing XLMRobertaForQuestionAnswering: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Model param count: 558,842,882


Starting training...


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss


Training completed in 2067.23s


Empty OOF preds: 0.0
Fold 0 Jaccard: 0.6256
  Hindi Jaccard: 0.6636
  Tamil Jaccard: 0.5627



=== Fold 1 ===
Train: 797, Val: 205
Preparing train features...


Trn features prepared in 22.44s: 6406
Preparing val features...


Val features prepared in 4.05s: 1957


Some weights of the model checkpoint at deepset/xlm-roberta-large-squad2 were not used when initializing XLMRobertaForQuestionAnswering: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Model param count: 558,842,882


Starting training...


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss


Training completed in 2085.02s
