In [1]:
import subprocess
import sys
from pathlib import Path
import os
import shutil
import glob

# Clean conflicting system and user packages
pip_targets = [
    '/app/.pip-target',
    os.path.expanduser('~/.pip-target')
]
for target in pip_targets:
    if os.path.exists(target):
        print(f'Removing pip target: {target}')
        shutil.rmtree(target, ignore_errors=True)

# Create local writable directory for packages in CWD
LOCAL_PKGS = Path.cwd() / 'pkgs'
LOCAL_PKGS.mkdir(exist_ok=True)
print(f'Local pkgs dir: {LOCAL_PKGS}')

# Install PyTorch cu121 only if not already installed
if not (LOCAL_PKGS / 'torch').exists():
    print('Installing PyTorch cu121...')
    subprocess.check_call([
        sys.executable, '-m', 'pip', 'install',
        '--index-url', 'https://download.pytorch.org/whl/cu121',
        '--extra-index-url', 'https://pypi.org/simple',
        '--target', str(LOCAL_PKGS),
        '--no-cache-dir',
        'torch==2.4.1', 'torchvision==0.19.1', 'torchaudio==2.4.1'
    ])
else:
    print('PyTorch already installed in local pkgs, skipping.')

# Install transformers and all key dependencies to local dir (with --upgrade)
print('Installing transformers and deps...')
hf_deps = [
    'transformers==4.44.2',
    'tokenizers==0.19.1',
    'huggingface-hub==0.24.6',
    'safetensors>=0.4.3',
    'accelerate==0.34.2',
    'datasets==2.21.0',
    'evaluate==0.4.2',
    'sentencepiece==0.2.0',
    'protobuf<=4.36.0',
    'scikit-learn'
]
subprocess.check_call([
    sys.executable, '-m', 'pip', 'install',
    '--target', str(LOCAL_PKGS),
    '--no-cache-dir',
    '--upgrade',
    '--no-deps',
    *hf_deps
])
# Install minimal deps with compatible versions, including certifi
subprocess.check_call([
    sys.executable, '-m', 'pip', 'install',
    '--target', str(LOCAL_PKGS),
    '--no-cache-dir',
    '--upgrade',
    'numpy>=1.17',
    'filelock',
    'fsspec==2024.6.1',
    'requests',
    'tqdm',
    'pyyaml',
    'packaging',
    'regex!=2019.12.17',
    'certifi>=2023.7.22',
    'urllib3<3,>=1.21.1',
    'charset-normalizer<4,>=2'
])

# Prepend local pkgs to sys.path (before system paths)
if str(LOCAL_PKGS) not in sys.path:
    sys.path.insert(0, str(LOCAL_PKGS))
print('Added local pkgs to sys.path')

# Patch to fix import issues (before sanity check)
import importlib
os.environ['TRANSFORMERS_NO_TF'] = '1'
os.environ['TRANSFORMERS_NO_FLAX'] = '1'
os.environ['HF_HOME'] = str(LOCAL_PKGS / 'hf_cache')
os.environ['REQUESTS_CA_BUNDLE'] = '/etc/ssl/certs/ca-certificates.crt'
os.environ['CURL_CA_BUNDLE'] = '/etc/ssl/certs/ca-certificates.crt'

# Clean sys.path further
sys.path = [p for p in sys.path if '.pip-target' not in p]
sys.path.insert(0, str(LOCAL_PKGS))

# Purge modules more aggressively
modules_to_purge = ['transformers', 'tokenizers', 'huggingface_hub', 'safetensors', 'accelerate', 'requests', 'certifi', 'urllib3', 'charset_normalizer', 'idna', 'torch']
for mod in modules_to_purge:
    if mod in sys.modules:
        del sys.modules[mod]
    for k in list(sys.modules.keys()):
        if k.startswith(mod + '.') or k == mod:
            del sys.modules[k]
importlib.invalidate_caches()

# Re-import torch after purge to ensure local version
import torch
print(f'torch: {torch.__version__} built CUDA: {getattr(torch.version, "cuda", None)}')
print(f'CUDA available: {torch.cuda.is_available()}')
assert torch.cuda.is_available(), 'CUDA not available'
assert str(getattr(torch.version, 'cuda', '')).startswith('12.1'), f'Wrong CUDA: {torch.version.cuda}'
print(f'GPU: {torch.cuda.get_device_name(0)}')
# Test torch.utils.checkpoint
import torch.utils.checkpoint
print('torch.utils.checkpoint imported successfully.')

import transformers
print('transformers version:', transformers.__version__)
print('transformers file:', transformers.__file__)

from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('roberta-base')
print('Tokenizer loaded successfully from local install.')
print('Installation complete and verified (model load skipped to avoid checkpoint issue).')

# Clean up memory
torch.cuda.empty_cache()

Removing pip target: /app/.pip-target
Removing pip target: /app/.pip-target
Local pkgs dir: /var/lib/simon/agent_run_states/google-quest-challenge-20250928-161614/pkgs
PyTorch already installed in local pkgs, skipping.
Installing transformers and deps...


Collecting transformers==4.44.2
  Downloading transformers-4.44.2-py3-none-any.whl (9.5 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 9.5/9.5 MB 118.9 MB/s eta 0:00:00


Collecting tokenizers==0.19.1
  Downloading tokenizers-0.19.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3.6/3.6 MB 167.2 MB/s eta 0:00:00
Collecting huggingface-hub==0.24.6
  Downloading huggingface_hub-0.24.6-py3-none-any.whl (417 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 417.5/417.5 KB 511.5 MB/s eta 0:00:00


Collecting safetensors>=0.4.3
  Downloading safetensors-0.6.2-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (485 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 485.8/485.8 KB 528.3 MB/s eta 0:00:00
Collecting accelerate==0.34.2
  Downloading accelerate-0.34.2-py3-none-any.whl (324 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 324.4/324.4 KB 501.4 MB/s eta 0:00:00
Collecting datasets==2.21.0
  Downloading datasets-2.21.0-py3-none-any.whl (527 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 527.3/527.3 KB 277.7 MB/s eta 0:00:00
Collecting evaluate==0.4.2
  Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 84.1/84.1 KB 179.8 MB/s eta 0:00:00
Collecting sentencepiece==0.2.0
  Downloading sentencepiece-0.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.3/1.3 MB 160.5 MB/s eta 0:00:00


Collecting protobuf<=4.36.0
  Downloading protobuf-4.25.8-cp37-abi3-manylinux2014_x86_64.whl (294 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 294.9/294.9 KB 455.6 MB/s eta 0:00:00
Collecting scikit-learn
  Downloading scikit_learn-1.7.2-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (9.7 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 9.7/9.7 MB 345.6 MB/s eta 0:00:00


Installing collected packages: sentencepiece, transformers, tokenizers, scikit-learn, safetensors, protobuf, huggingface-hub, evaluate, datasets, accelerate


Successfully installed accelerate-0.34.2 datasets-2.21.0 evaluate-0.4.2 huggingface-hub-0.24.6 protobuf-4.25.8 safetensors-0.6.2 scikit-learn-1.7.2 sentencepiece-0.2.0 tokenizers-0.19.1 transformers-4.44.2


Collecting numpy>=1.17
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 18.3/18.3 MB 251.5 MB/s eta 0:00:00
Collecting filelock
  Downloading filelock-3.19.1-py3-none-any.whl (15 kB)


Collecting fsspec==2024.6.1
  Downloading fsspec-2024.6.1-py3-none-any.whl (177 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 177.6/177.6 KB 461.0 MB/s eta 0:00:00
Collecting requests
  Downloading requests-2.32.5-py3-none-any.whl (64 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 64.7/64.7 KB 416.9 MB/s eta 0:00:00
Collecting tqdm
  Downloading tqdm-4.67.1-py3-none-any.whl (78 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 78.5/78.5 KB 444.1 MB/s eta 0:00:00
Collecting pyyaml
  Downloading pyyaml-6.0.3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (806 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 806.6/806.6 KB 444.2 MB/s eta 0:00:00
Collecting packaging
  Downloading packaging-25.0-py3-none-any.whl (66 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 66.5/66.5 KB 412.9 MB/s eta 0:00:00




Collecting regex!=2019.12.17
  Downloading regex-2025.9.18-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (798 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 799.0/799.0 KB 505.1 MB/s eta 0:00:00
Collecting certifi>=2023.7.22
  Downloading certifi-2025.8.3-py3-none-any.whl (161 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 161.2/161.2 KB 467.2 MB/s eta 0:00:00
Collecting urllib3<3,>=1.21.1
  Downloading urllib3-2.5.0-py3-none-any.whl (129 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 129.8/129.8 KB 452.1 MB/s eta 0:00:00


Collecting charset-normalizer<4,>=2
  Downloading charset_normalizer-3.4.3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (150 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 150.3/150.3 KB 480.9 MB/s eta 0:00:00
Collecting idna<4,>=2.5
  Downloading idna-3.10-py3-none-any.whl (70 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 70.4/70.4 KB 422.3 MB/s eta 0:00:00


Installing collected packages: urllib3, tqdm, regex, pyyaml, packaging, numpy, idna, fsspec, filelock, charset-normalizer, certifi, requests


Successfully installed certifi-2025.8.3 charset-normalizer-3.4.3 filelock-3.19.1 fsspec-2024.6.1 idna-3.10 numpy-1.26.4 packaging-25.0 pyyaml-6.0.3 regex-2025.9.18 requests-2.32.5 tqdm-4.67.1 urllib3-2.5.0


Added local pkgs to sys.path


torch: 2.4.1+cu121 built CUDA: 12.1
CUDA available: True
GPU: NVIDIA A10-24Q


torch.utils.checkpoint imported successfully.


  from .autonotebook import tqdm as notebook_tqdm


transformers version: 4.44.2
transformers file: /var/lib/simon/agent_run_states/google-quest-challenge-20250928-161614/pkgs/transformers/__init__.py


Tokenizer loaded successfully from local install.
Installation complete and verified (model load skipped to avoid checkpoint issue).




In [2]:
# Data Loading and Preparation for Transformer Baseline
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler
from scipy.stats import spearmanr
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
import gc

# Load data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Define target columns (30)
target_cols = [
    'question_asker_intent_understanding', 'question_body_critical', 'question_conversational',
    'question_expect_short_answer', 'question_fact_seeking', 'question_has_commonly_accepted_answer',
    'question_interestingness_others', 'question_interestingness_self', 'question_multi_intent',
    'question_not_really_a_question', 'question_opinion_seeking', 'question_type_choice',
    'question_type_compare', 'question_type_consequence', 'question_type_definition',
    'question_type_entity', 'question_type_instructions', 'question_type_procedure',
    'question_type_reason_explanation', 'question_type_spelling', 'question_well_written',
    'answer_helpful', 'answer_level_of_information', 'answer_plausible', 'answer_relevance',
    'answer_satisfaction', 'answer_type_instructions', 'answer_type_procedure',
    'answer_type_reason_explanation', 'answer_well_written'
]

y_train = train[target_cols].values
print('Train shape:', train.shape, 'Targets shape:', y_train.shape)

# Groups for CV: by url (stricter to avoid leakage)
train['url_group'] = pd.factorize(train['url'])[0]
groups = train['url_group'].values
print('Number of url groups:', len(np.unique(groups)))

# Tokenizer
MODEL_NAME = 'roberta-base'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
MAX_LEN = 512
sep = f' {tokenizer.sep_token} '

# Prepare input texts using tokenizer's sep_token
train['input_text'] = train['question_title'] + sep + train['question_body'] + sep + train['answer']
test['input_text'] = test['question_title'] + sep + test['question_body'] + sep + test['answer']

# Tokenize function
def tokenize_texts(texts):
    return tokenizer(
        texts.tolist(),
        max_length=MAX_LEN,
        truncation=True,
        padding='max_length',
        return_tensors='pt',
        return_attention_mask=True
    )

# Example tokenization (full will be in dataset)
train_enc = tokenize_texts(train['input_text'])
print('Tokenized shapes:', {k: v.shape for k, v in train_enc.items()})

# Custom Dataset (fixed: squeeze single-sample tensors)
class QADataset(Dataset):
    def __init__(self, texts, targets=None, tokenizer=None, max_len=512):
        self.texts = texts
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts.iloc[idx])
        encoding = self.tokenizer(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt'
        )
        item = {
            key: val.squeeze(0)
            for key, val in encoding.items()
        }
        if self.targets is not None:
            item['targets'] = torch.tensor(self.targets[idx], dtype=torch.float)
        return item

# NaN-safe Spearman scorer
def column_spearman_scorer(y_true, y_pred):
    scores = []
    for i in range(y_true.shape[1]):
        s, _ = spearmanr(y_true[:, i], y_pred[:, i])
        scores.append(0.0 if np.isnan(s) else s)
    return np.mean(scores)

Train shape: (5471, 41) Targets shape: (5471, 30)
Number of url groups: 3392




Tokenized shapes: {'input_ids': torch.Size([5471, 512]), 'attention_mask': torch.Size([5471, 512])}


In [5]:
import torch.nn as nn
from transformers import RobertaModel, get_linear_schedule_with_warmup
from torch.amp import autocast, GradScaler
import os
import torch.nn.utils

# Set env to avoid tokenizers warning
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

class RobertaRegression(nn.Module):
    def __init__(self, model_name, num_targets=30, dropout=0.2):
        super().__init__()
        self.encoder = RobertaModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout)
        self.regressor = nn.Linear(768, num_targets)
    
    def forward(self, input_ids, attention_mask):
        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden = outputs.last_hidden_state
        mask = attention_mask.unsqueeze(-1).float()
        pooled = (last_hidden * mask).sum(1) / mask.sum(1).clamp(min=1e-6)
        pooled = self.dropout(pooled)
        logits = self.regressor(pooled)
        return logits

# Training setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

BATCH_SIZE = 16
EPOCHS = 3
LR = 2e-5
WEIGHT_DECAY = 0.01
NUM_FOLDS = 5

# Loss and optimizer
criterion = nn.BCEWithLogitsLoss()
sigmoid = nn.Sigmoid()

# Initialize OOF and test preds
oof_preds = np.zeros((len(train), len(target_cols)))
test_preds = np.zeros((len(test), len(target_cols)))

# GroupKFold by url_group
gkf = GroupKFold(n_splits=NUM_FOLDS)

for fold, (tr_idx, val_idx) in enumerate(gkf.split(train, y_train, groups=groups)):
    print(f'Fold {fold+1}/{NUM_FOLDS}')
    tr_texts = train.iloc[tr_idx]['input_text']
    val_texts = train.iloc[val_idx]['input_text']
    tr_targets = y_train[tr_idx]
    val_targets = y_train[val_idx]
    
    # Datasets and loaders
    tr_dataset = QADataset(tr_texts, tr_targets, tokenizer, MAX_LEN)
    val_dataset = QADataset(val_texts, val_targets, tokenizer, MAX_LEN)
    test_dataset = QADataset(test['input_text'], None, tokenizer, MAX_LEN)
    
    tr_loader = DataLoader(tr_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=2)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=2)
    
    # Model
    model = RobertaRegression(MODEL_NAME).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
    
    # Scheduler
    num_training_steps = len(tr_loader) * EPOCHS
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(0.1 * num_training_steps),
        num_training_steps=num_training_steps
    )
    
    # Training loop with FP16
    scaler = GradScaler('cuda')
    model.train()
    for epoch in range(EPOCHS):
        total_loss = 0
        for batch in tr_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            targets = batch['targets'].to(device)
            
            optimizer.zero_grad()
            with autocast('cuda'):
                logits = model(input_ids, attention_mask)
                loss = criterion(logits, targets)
            scaler.scale(loss).backward()
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()
            total_loss += loss.item()
        print(f'Epoch {epoch+1}/{EPOCHS}, Loss: {total_loss/len(tr_loader):.4f}')
    
    # Validation preds
    model.eval()
    val_preds = []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            with autocast('cuda'):
                logits = model(input_ids, attention_mask)
            preds = torch.clip(sigmoid(logits), 0, 1).cpu().numpy()
            val_preds.append(preds)
    val_preds = np.vstack(val_preds)
    oof_preds[val_idx] = val_preds
    
    # Fold score
    fold_score = column_spearman_scorer(val_targets, val_preds)
    print(f'Fold {fold+1} Spearman: {fold_score:.4f}')
    
    # Test preds
    test_fold_preds = []
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            with autocast('cuda'):
                logits = model(input_ids, attention_mask)
            preds = torch.clip(sigmoid(logits), 0, 1).cpu().numpy()
            test_fold_preds.append(preds)
    test_fold = np.vstack(test_fold_preds)
    test_preds += test_fold / NUM_FOLDS
    
    # Clean up
    del model, tr_loader, val_loader, test_loader
    gc.collect()
    torch.cuda.empty_cache()

# Overall CV score
cv_score = column_spearman_scorer(y_train, oof_preds)
print(f'\nMean CV Spearman: {cv_score:.4f}')

# Save OOF and test preds
np.save('roberta_oof.npy', oof_preds)
np.save('roberta_test.npy', test_preds)

# Clip test preds
test_preds = np.clip(test_preds, 0, 1)

# Submission
sub_df = pd.DataFrame(test_preds, columns=target_cols)
sub_df.insert(0, 'qa_id', test['qa_id'])
sub_df.to_csv('submission_roberta.csv', index=False)
print('\nRoBERTa submission saved. Shape:', sub_df.shape)

Using device: cuda
Fold 1/5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3, Loss: 0.4412


Epoch 2/3, Loss: 0.3793


Epoch 3/3, Loss: 0.3673


Fold 1 Spearman: 0.3608


Fold 2/5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3, Loss: 0.4390


Epoch 2/3, Loss: 0.3788


Epoch 3/3, Loss: 0.3674


Fold 2 Spearman: 0.3475


Fold 3/5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3, Loss: 0.4467


Epoch 2/3, Loss: 0.3826


Epoch 3/3, Loss: 0.3704


Fold 3 Spearman: 0.3425


  s, _ = spearmanr(y_true[:, i], y_pred[:, i])


Fold 4/5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3, Loss: 0.4467


Epoch 2/3, Loss: 0.3814


Epoch 3/3, Loss: 0.3692


Fold 4 Spearman: 0.3586


Fold 5/5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3, Loss: 0.4396


Epoch 2/3, Loss: 0.3782


Epoch 3/3, Loss: 0.3667


Fold 5 Spearman: 0.3552



Mean CV Spearman: 0.3514

RoBERTa submission saved. Shape: (608, 31)


In [None]:
# Quick check after kernel restart
import transformers, tokenizers, torch
from transformers import AutoTokenizer, AutoModel
print('transformers', transformers.__version__, '| tokenizers', tokenizers.__version__)
print('torch', torch.__version__, 'CUDA', torch.version.cuda, 'GPU OK?', torch.cuda.is_available())

In [6]:
# Blending RoBERTa + TF-IDF for Final Submission (with weight tuning)
import numpy as np
import pandas as pd
from scipy.stats import spearmanr

# Load saved predictions
roberta_oof = np.load('roberta_oof.npy')
roberta_test = np.load('roberta_test.npy')
tfidf_oof = np.load('tfidf_oof_v2.npy')
tfidf_test = np.load('tfidf_test_v2.npy')

# Load y_train for CV scoring
train = pd.read_csv('train.csv')
target_cols = [
    'question_asker_intent_understanding', 'question_body_critical', 'question_conversational',
    'question_expect_short_answer', 'question_fact_seeking', 'question_has_commonly_accepted_answer',
    'question_interestingness_others', 'question_interestingness_self', 'question_multi_intent',
    'question_not_really_a_question', 'question_opinion_seeking', 'question_type_choice',
    'question_type_compare', 'question_type_consequence', 'question_type_definition',
    'question_type_entity', 'question_type_instructions', 'question_type_procedure',
    'question_type_reason_explanation', 'question_type_spelling', 'question_well_written',
    'answer_helpful', 'answer_level_of_information', 'answer_plausible', 'answer_relevance',
    'answer_satisfaction', 'answer_type_instructions', 'answer_type_procedure',
    'answer_type_reason_explanation', 'answer_well_written'
]
y_train = train[target_cols].values

# Compute individual CV scores
def column_spearman_scorer(y_true, y_pred):
    scores = []
    for i in range(y_true.shape[1]):
        s, _ = spearmanr(y_true[:, i], y_pred[:, i])
        scores.append(0.0 if np.isnan(s) else s)
    return np.mean(scores)

roberta_cv = column_spearman_scorer(y_train, roberta_oof)
tfidf_cv = column_spearman_scorer(y_train, tfidf_oof)
print(f'RoBERTa CV: {roberta_cv:.4f}')
print(f'TF-IDF CV: {tfidf_cv:.4f}')

# Tune blend weight: grid search on OOF for best w_roberta (0.6 to 0.95)
weights = np.arange(0.60, 0.96, 0.05)
best_weight = 0.85
best_cv = -1
for w_roberta in weights:
    w_tfidf = 1 - w_roberta
    blend_oof = w_roberta * roberta_oof + w_tfidf * tfidf_oof
    blend_oof = np.clip(blend_oof, 0, 1)
    blend_cv = column_spearman_scorer(y_train, blend_oof)
    print(f'w_roberta={w_roberta:.2f}, CV: {blend_cv:.4f}')
    if blend_cv > best_cv:
        best_cv = blend_cv
        best_weight = w_roberta

print(f'\nBest w_roberta: {best_weight:.2f}, Best CV: {best_cv:.4f}')

# Use best weights for test blend
w_roberta = best_weight
w_tfidf = 1 - w_roberta
blend_test = w_roberta * roberta_test + w_tfidf * tfidf_test
blend_test = np.clip(blend_test, 0, 1)

# If best CV >= 0.39597 (Silver), use for submission
test = pd.read_csv('test.csv')
sub_df = pd.DataFrame(blend_test, columns=target_cols)
sub_df.insert(0, 'qa_id', test['qa_id'])
sub_df.to_csv('submission.csv', index=False)
print(f'\nBlended submission saved with w_roberta={w_roberta:.2f}. Shape: {sub_df.shape}')
if best_cv >= 0.39597:
    print('CV meets Silver threshold (>=0.39597). Ready for submit_final_answer.')
else:
    print('CV below Silver. Consider pure RoBERTa or further improvements.')

RoBERTa CV: 0.3514
TF-IDF CV: 0.2984
w_roberta=0.60, CV: 0.3640
w_roberta=0.65, CV: 0.3661
w_roberta=0.70, CV: 0.3675
w_roberta=0.75, CV: 0.3679
w_roberta=0.80, CV: 0.3672
w_roberta=0.85, CV: 0.3654
w_roberta=0.90, CV: 0.3622


w_roberta=0.95, CV: 0.3576

Best w_roberta: 0.75, Best CV: 0.3679

Blended submission saved with w_roberta=0.75. Shape: (608, 31)
CV below Silver. Consider pure RoBERTa or further improvements.
