In [1]:
import subprocess
import sys
result = subprocess.run(['bash', '-lc', 'nvidia-smi || true'], capture_output=True, text=True)
print(result.stdout)
print(result.stderr)

Sun Sep 28 16:20:36 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.144.06             Driver Version: 550.144.06     CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A10-24Q                 On  |   00000002:00:00.0 Off |                    0 |
| N/A   N/A    P0             N/A /  N/A  |     128MiB /  24512MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

# Initial Plan for Google QUEST Q&A Labeling Competition

## Objective
Predict multiple subjective quality aspects of question-answer pairs using column-wise Spearman correlation. Target columns include 'question_type_definition' and likely others (need to confirm from data). Aim for Gold medal: >=0.42278

## Step 1: Environment Setup
- GPU confirmed available (NVIDIA A10-24Q, CUDA 12.4).
- Install necessary packages if needed (e.g., PyTorch with cu121, transformers for NLP).

## Step 2: Data Loading and EDA
- Load train.csv and test.csv.
- Explore shapes, columns, missing values, distributions of targets.
- Since it's text data (questions, answers), analyze text lengths, types, etc.
- Identify all target columns (likely 30+ subjective labels).

## Step 3: Feature Engineering
- Text features: TF-IDF, word counts, sentiment, etc.
- Advanced: Pre-trained embeddings (BERT, RoBERTa) fine-tuned for this task.
- Metadata features if any (e.g., url, product).

## Step 4: Modeling
- Baselines: Ridge regression on TF-IDF.
- Advanced: Neural networks with transformers, multi-task learning since multiple targets.
- Cross-validation: 5-fold stratified or group KF (by qid?).
- Metric: Column-wise Spearman.

## Step 5: Ensembling and Submission
- Blend models.
- Generate submission.csv with all target columns.

## Milestones for Expert Review
- After EDA.
- After baseline model.
- After feature engineering.
- Before final submission.

Next: Load data and perform initial EDA.

In [2]:
import pandas as pd
import numpy as np

train = pd.read_csv('train.csv')
print('Train shape:', train.shape)
print('\nColumns:', train.columns.tolist())
print('\nHead:')
print(train.head())
print('\nMissing values:')
print(train.isnull().sum())

Train shape: (5471, 41)

Columns: ['qa_id', 'question_title', 'question_body', 'question_user_name', 'question_user_page', 'answer', 'answer_user_name', 'answer_user_page', 'url', 'category', 'host', 'question_asker_intent_understanding', 'question_body_critical', 'question_conversational', 'question_expect_short_answer', 'question_fact_seeking', 'question_has_commonly_accepted_answer', 'question_interestingness_others', 'question_interestingness_self', 'question_multi_intent', 'question_not_really_a_question', 'question_opinion_seeking', 'question_type_choice', 'question_type_compare', 'question_type_consequence', 'question_type_definition', 'question_type_entity', 'question_type_instructions', 'question_type_procedure', 'question_type_reason_explanation', 'question_type_spelling', 'question_well_written', 'answer_helpful', 'answer_level_of_information', 'answer_plausible', 'answer_relevance', 'answer_satisfaction', 'answer_type_instructions', 'answer_type_procedure', 'answer_type_rea

In [3]:
# Deeper EDA
test = pd.read_csv('test.csv')
print('Test shape:', test.shape)
print('\nTest columns:', test.columns.tolist())

# Target columns
target_cols = [col for col in train.columns if col not in ['qa_id', 'question_title', 'question_body', 'question_user_name', 'question_user_page', 'answer', 'answer_user_name', 'answer_user_page', 'url', 'category', 'host']]
print('\nTarget columns (30):', len(target_cols))
print(target_cols)

# Target distributions
print('\nTarget min/max:')
for col in target_cols:
    print(f'{col}: {train[col].min():.3f} - {train[col].max():.3f}, mean: {train[col].mean():.3f}')

# Text features
train['question_text'] = train['question_title'] + ' ' + train['question_body']
train['answer_text'] = train['answer']
train['full_text'] = train['question_text'] + ' ' + train['answer_text']

print('\nText lengths:')
print('Question title len - mean:', train['question_title'].str.len().mean())
print('Question body len - mean:', train['question_body'].str.len().mean())
print('Answer len - mean:', train['answer'].str.len().mean())
print('Full text len - mean:', train['full_text'].str.len().mean())

# Categories
print('\nCategories:')
print(train['category'].value_counts())

# Hosts
print('\nHosts:')
print(train['host'].value_counts().head())

Test shape: (608, 11)

Test columns: ['qa_id', 'question_title', 'question_body', 'question_user_name', 'question_user_page', 'answer', 'answer_user_name', 'answer_user_page', 'url', 'category', 'host']

Target columns (30): 30
['question_asker_intent_understanding', 'question_body_critical', 'question_conversational', 'question_expect_short_answer', 'question_fact_seeking', 'question_has_commonly_accepted_answer', 'question_interestingness_others', 'question_interestingness_self', 'question_multi_intent', 'question_not_really_a_question', 'question_opinion_seeking', 'question_type_choice', 'question_type_compare', 'question_type_consequence', 'question_type_definition', 'question_type_entity', 'question_type_instructions', 'question_type_procedure', 'question_type_reason_explanation', 'question_type_spelling', 'question_well_written', 'answer_helpful', 'answer_level_of_information', 'answer_plausible', 'answer_relevance', 'answer_satisfaction', 'answer_type_instructions', 'answer_type

# Updated Plan Based on Expert Review

## Key Insights from Experts
- Multi-task prediction for 30 targets (0-1 scales, some skewed like question_type_definition mean=0.03).
- Medal winners: Fine-tuned transformers (RoBERTa/DeBERTa) with shared encoder + 30-regression head, GroupKFold CV by url/host, TF-IDF Ridge baseline blended in.
- Pitfalls: Leakage (use GroupKFold by url or host), metric (column-wise Spearman), truncation, no random KFold.

## Immediate Next Steps: TF-IDF Baseline (Aim CV >=0.34)
1. Imports: sklearn, scipy for Spearman.
2. Define targets, create GroupKFold (groups by host, mapped to int).
3. Features: Separate TF-IDF (word 1-2g, char 3-5g) for title/body/answer; hstack; add one-hot category/host, lengths (title/body/answer/full), counts (questions marks, exclamations, code blocks via ```).
4. Model: MultiOutputRegressor(Ridge(alpha=1)) or per-target Ridge; fit per fold.
5. OOF predictions, clip [0,1], compute mean column-wise Spearman on OOF vs train targets.
6. Generate test predictions, save submission.csv, submit to check LB.

## Then: Transformer Baseline
- Install PyTorch cu121, transformers.
- RoBERTa-base, input: [CLS] title [SEP] body [SEP] answer, max_len=512 with truncation budget.
- Multi-task MSE loss, 5-fold GroupKFold, 3-5 epochs, lr=2e-5, batch=16, FP16.
- Blend with TF-IDF (e.g., 0.85 trans + 0.15 TFIDF).

For rare targets (e.g., type_* with low mean), consider logistic for binary (y>0) + calibration, blend with Ridge.
Track CV Spearman per column to identify weak ones.
After baseline execution, request expert review on CV score and pipeline.

In [19]:
# TF-IDF Baseline Setup (Multi-Target: 30 columns)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import GroupKFold
from scipy.stats import spearmanr
from scipy import sparse

# Define all 30 targets
target_cols = [
    'question_asker_intent_understanding', 'question_body_critical', 'question_conversational',
    'question_expect_short_answer', 'question_fact_seeking', 'question_has_commonly_accepted_answer',
    'question_interestingness_others', 'question_interestingness_self', 'question_multi_intent',
    'question_not_really_a_question', 'question_opinion_seeking', 'question_type_choice',
    'question_type_compare', 'question_type_consequence', 'question_type_definition',
    'question_type_entity', 'question_type_instructions', 'question_type_procedure',
    'question_type_reason_explanation', 'question_type_spelling', 'question_well_written',
    'answer_helpful', 'answer_level_of_information', 'answer_plausible', 'answer_relevance',
    'answer_satisfaction', 'answer_type_instructions', 'answer_type_procedure',
    'answer_type_reason_explanation', 'answer_well_written'
]

# Prepare data
X_train = train.drop(columns=target_cols + ['qa_id'])
y_train = train[target_cols]

# Create groups by host (63 unique for proper GroupKFold)
host_to_id = {host: idx for idx, host in enumerate(X_train['host'].unique())}
groups = X_train['host'].map(host_to_id).values

print('Number of groups:', len(np.unique(groups)))
print('Target shape:', y_train.shape)

Number of groups: 63
Target shape: (5471, 30)


In [22]:
# Feature Engineering for TF-IDF Baseline

# Prepare test data similarly
X_test = test.drop(columns=['qa_id'])

# Text fields (already in train from EDA)
X_train['question_title'] = X_train['question_title'].fillna('')
X_train['question_body'] = X_train['question_body'].fillna('')
X_train['answer'] = X_train['answer'].fillna('')
X_test['question_title'] = X_test['question_title'].fillna('')
X_test['question_body'] = X_test['question_body'].fillna('')
X_test['answer'] = X_test['answer'].fillna('')

# Metadata features: lengths and counts
def get_metadata_features(df):
    df = df.copy()
    df['title_len'] = df['question_title'].str.len()
    df['body_len'] = df['question_body'].str.len()
    df['answer_len'] = df['answer'].str.len()
    df['full_len'] = df['title_len'] + df['body_len'] + df['answer_len']
    df['title_qmarks'] = df['question_title'].str.count(r'\?')
    df['body_qmarks'] = df['question_body'].str.count(r'\?')
    df['answer_qmarks'] = df['answer'].str.count(r'\?')
    df['title_excl'] = df['question_title'].str.count(r'!')
    df['body_excl'] = df['question_body'].str.count(r'!')
    df['answer_excl'] = df['answer'].str.count(r'!')
    df['code_blocks'] = df['question_body'].str.count('```') + df['answer'].str.count('```')
    # Regex features for definition questions (general for multi-task)
    definition_pattern = r'(?i)(what is|what are|define|definition|meaning|means|stand for|acronym)'
    df['title_definition'] = df['question_title'].str.contains(definition_pattern, na=False).astype(int)
    df['body_definition'] = df['question_body'].str.contains(definition_pattern, na=False).astype(int)
    return df

X_train = get_metadata_features(X_train)
X_test = get_metadata_features(X_test)

# One-hot encoding for category only (drop host to avoid leakage with grouping)
cat_ohe_train = pd.get_dummies(X_train['category'], prefix='cat')
cat_ohe_test = pd.get_dummies(X_test['category'], prefix='cat').reindex(columns=cat_ohe_train.columns, fill_value=0)

# Metadata columns to include (added regex)
meta_cols = ['title_len', 'body_len', 'answer_len', 'full_len', 'title_qmarks', 'body_qmarks', 'answer_qmarks',
             'title_excl', 'body_excl', 'answer_excl', 'code_blocks', 'title_definition', 'body_definition']

print('Metadata shape:', X_train[meta_cols].shape)
print('Category OHE shape:', cat_ohe_train.shape)

# TF-IDF Vectorizers (will fit per fold)
word_vectorizer = TfidfVectorizer(max_features=50000, ngram_range=(1,2), sublinear_tf=True, min_df=2)
char_vectorizer = TfidfVectorizer(max_features=100000, ngram_range=(3,5), sublinear_tf=True, min_df=2, analyzer='char_wb')

# Function to get TF-IDF features per fold (title, body, answer)
def get_tfidf_features(X_fold):
    title_word = word_vectorizer.fit_transform(X_fold['question_title'])
    body_word = word_vectorizer.fit_transform(X_fold['question_body'])
    answer_word = word_vectorizer.fit_transform(X_fold['answer'])
    title_char = char_vectorizer.fit_transform(X_fold['question_title'])
    body_char = char_vectorizer.fit_transform(X_fold['question_body'])
    answer_char = char_vectorizer.fit_transform(X_fold['answer'])
    tfidf_features = sparse.hstack([
        title_word, body_word, answer_word,
        title_char, body_char, answer_char
    ]).tocsr()
    return tfidf_features

# Function to combine all features (TF-IDF + metadata + cat OHE only)
def get_full_features(tfidf_feat, meta, cat_ohe):
    meta_sparse = sparse.csr_matrix(meta.values.astype(float))
    cat_sparse = sparse.csr_matrix(cat_ohe.values.astype(float))
    full_feat = sparse.hstack([tfidf_feat, meta_sparse, cat_sparse]).tocsr()
    return full_feat

  df['title_definition'] = df['question_title'].str.contains(definition_pattern, na=False).astype(int)
  df['body_definition'] = df['question_body'].str.contains(definition_pattern, na=False).astype(int)


Metadata shape: (5471, 13)
Category OHE shape: (5471, 5)


  df['title_definition'] = df['question_title'].str.contains(definition_pattern, na=False).astype(int)
  df['body_definition'] = df['question_body'].str.contains(definition_pattern, na=False).astype(int)


In [26]:
# CV Loop for Improved TF-IDF Baseline (Multi-Target, 30 columns)

# Custom scorer for column-wise Spearman (NaN-safe)
def column_spearman_scorer(y_true, y_pred):
    spearman_scores = []
    for i in range(y_true.shape[1]):
        score, _ = spearmanr(y_true[:, i], y_pred[:, i])
        spearman_scores.append(0.0 if np.isnan(score) else score)
    return np.mean(spearman_scores)

# Initialize OOF and test predictions (n, 30)
n_splits = 5
oof_preds = np.zeros((len(X_train), len(target_cols)))
test_preds = np.zeros((len(X_test), len(target_cols)))

# GroupKFold by url (stricter, consistent with transformer)
url_groups = pd.factorize(train['url'])[0]
gkf = GroupKFold(n_splits=n_splits)

for fold, (tr_idx, val_idx) in enumerate(gkf.split(X_train, y_train, url_groups)):
    print(f'Fold {fold+1}/{n_splits}')
    X_tr_fold = X_train.iloc[tr_idx]
    X_val_fold = X_train.iloc[val_idx]
    y_tr_fold = y_train.iloc[tr_idx]
    y_val_fold = y_train.iloc[val_idx]
    
    # Separate vectorizers per field inside fold (increased max_features for better fit)
    word_title = TfidfVectorizer(max_features=50000, ngram_range=(1,2), sublinear_tf=True, min_df=2)
    word_body = TfidfVectorizer(max_features=50000, ngram_range=(1,2), sublinear_tf=True, min_df=2)
    word_answer = TfidfVectorizer(max_features=50000, ngram_range=(1,2), sublinear_tf=True, min_df=2)
    char_title = TfidfVectorizer(max_features=100000, ngram_range=(3,5), sublinear_tf=True, min_df=2, analyzer='char_wb')
    char_body = TfidfVectorizer(max_features=100000, ngram_range=(3,5), sublinear_tf=True, min_df=2, analyzer='char_wb')
    char_answer = TfidfVectorizer(max_features=100000, ngram_range=(3,5), sublinear_tf=True, min_df=2, analyzer='char_wb')
    
    # Fit and transform train fold
    title_word_tr = word_title.fit_transform(X_tr_fold['question_title'])
    body_word_tr = word_body.fit_transform(X_tr_fold['question_body'])
    answer_word_tr = word_answer.fit_transform(X_tr_fold['answer'])
    title_char_tr = char_title.fit_transform(X_tr_fold['question_title'])
    body_char_tr = char_body.fit_transform(X_tr_fold['question_body'])
    answer_char_tr = char_answer.fit_transform(X_tr_fold['answer'])
    tfidf_tr = sparse.hstack([title_word_tr, body_word_tr, answer_word_tr, title_char_tr, body_char_tr, answer_char_tr]).tocsr()
    
    # Transform val fold
    title_word_val = word_title.transform(X_val_fold['question_title'])
    body_word_val = word_body.transform(X_val_fold['question_body'])
    answer_word_val = word_answer.transform(X_val_fold['answer'])
    title_char_val = char_title.transform(X_val_fold['question_title'])
    body_char_val = char_body.transform(X_val_fold['question_body'])
    answer_char_val = char_answer.transform(X_val_fold['answer'])
    tfidf_val = sparse.hstack([title_word_val, body_word_val, answer_word_val, title_char_val, body_char_val, answer_char_val]).tocsr()
    
    # Full features for train and val
    meta_tr = X_tr_fold[meta_cols]
    meta_val = X_val_fold[meta_cols]
    full_tr = get_full_features(tfidf_tr, meta_tr, cat_ohe_train.iloc[tr_idx])
    full_val = get_full_features(tfidf_val, meta_val, cat_ohe_train.iloc[val_idx])
    
    # Model (MultiOutput Ridge, lower alpha=1.0, sparse_cg solver for better fit on sparse data)
    model = MultiOutputRegressor(Ridge(alpha=1.0, fit_intercept=True, solver='sparse_cg'))
    model.fit(full_tr, y_tr_fold)
    
    # OOF pred
    oof_fold = model.predict(full_val)
    oof_fold = np.clip(oof_fold, 0, 1)
    oof_preds[val_idx] = oof_fold
    
    # Test pred
    title_word_test = word_title.transform(X_test['question_title'])
    body_word_test = word_body.transform(X_test['question_body'])
    answer_word_test = word_answer.transform(X_test['answer'])
    title_char_test = char_title.transform(X_test['question_title'])
    body_char_test = char_body.transform(X_test['question_body'])
    answer_char_test = char_answer.transform(X_test['answer'])
    tfidf_test = sparse.hstack([title_word_test, body_word_test, answer_word_test, title_char_test, body_char_test, answer_char_test]).tocsr()
    full_test = get_full_features(tfidf_test, X_test[meta_cols], cat_ohe_test)
    test_fold = model.predict(full_test)
    test_preds += test_fold / n_splits
    
    # Fold score
    fold_score = column_spearman_scorer(y_val_fold.values, oof_fold)
    print(f'Fold {fold+1} Spearman: {fold_score:.4f}')

# Overall CV score
cv_score = column_spearman_scorer(y_train.values, oof_preds)
print(f'\nMean CV Spearman: {cv_score:.4f}')

# Save OOF and test preds for blending (v2)
np.save('tfidf_oof_v2.npy', oof_preds)
np.save('tfidf_test_v2.npy', test_preds)

# Clip test preds
test_preds = np.clip(test_preds, 0, 1)

# Submission (30 columns)
sub_df = pd.DataFrame(test_preds, columns=target_cols)
sub_df.insert(0, 'qa_id', test['qa_id'])
sub_df.to_csv('submission_tfidf_v2.csv', index=False)
print('\nSubmission saved (v2). Shape:', sub_df.shape)
print('Head:')
print(sub_df.head())

# Per-column scores (NaN-safe)
per_col_scores = [spearmanr(y_train.iloc[:, i], oof_preds[:, i])[0] if not np.isnan(spearmanr(y_train.iloc[:, i], oof_preds[:, i])[0]) else 0.0 for i in range(len(target_cols))]
print('\nPer-column Spearman mean:', np.mean(per_col_scores))
print('Low scoring columns:')
for col, score in zip(target_cols, per_col_scores):
    if score < 0.2:
        print(f'{col}: {score:.4f}')

Fold 1/5


Fold 1 Spearman: 0.2977
Fold 2/5


Fold 2 Spearman: 0.2981
Fold 3/5




Fold 3 Spearman: 0.2960
Fold 4/5


Fold 4 Spearman: 0.2981
Fold 5/5


Fold 5 Spearman: 0.3001

Mean CV Spearman: 0.2984

Submission saved (v2). Shape: (608, 31)
Head:
   qa_id  question_asker_intent_understanding  question_body_critical  \
0   6516                             0.950378                0.523153   
1   6168                             0.858214                0.484438   
2   8575                             0.955203                0.732544   
3    618                             0.818749                0.675022   
4   3471                             0.973680                0.615796   

   question_conversational  question_expect_short_answer  \
0                 0.024698                      0.853590   
1                 0.009032                      0.692637   
2                 0.086021                      0.894099   
3                 0.000000                      0.749484   
4                 0.140890                      0.978866   

   question_fact_seeking  question_has_commonly_accepted_answer  \
0               0.855309            