In [1]:
# Cell 0: Install required libraries and set up input/output folders.
!pip install -U sentence-transformers
!pip install --upgrade scikit-learn
print("Installation complete. Sentence-Transformers is ready.")

from pathlib import Path

KAGGLE_INPUT = Path("/kaggle/input/da5401-2025-data-challenge")
LOCAL_INPUT  = Path("/mnt/data")
INPUT_PATH = KAGGLE_INPUT if KAGGLE_INPUT.exists() else LOCAL_INPUT

OUTPUT_PATH = Path("/kaggle/working") if Path("/kaggle").exists() else Path(".")
OUTPUT_PATH.mkdir(parents=True, exist_ok=True)

print("INPUT_PATH:", INPUT_PATH)
print("OUTPUT_PATH :", OUTPUT_PATH)

Collecting sentence-transformers
  Downloading sentence_transformers-5.1.2-py3-none-any.whl.metadata (16 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_

In [2]:
# Cell 1: Import packages, fix random seeds, and select computation device.
import os
import json
import random
import gc

import numpy as np
import pandas as pd

from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

from sentence_transformers import SentenceTransformer

# ------------------------
# Global config
# ------------------------
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
torch.cuda.manual_seed_all(RANDOM_SEED)

COMPUTE_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", COMPUTE_DEVICE)

INPUT_PATH = "/kaggle/input/da5401-2025-data-challenge"
OUTPUT_PATH = "/kaggle/working"
os.makedirs(OUTPUT_PATH, exist_ok=True)


2025-11-19 15:59:06.445867: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1763567946.664121      48 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1763567946.728440      48 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

Using device: cuda


In [3]:
# Cell 2: Load JSON/NumPy data and build basic metric embeddings.
train_json_path = os.path.join(INPUT_PATH, "train_data.json")
test_json_path  = os.path.join(INPUT_PATH, "test_data.json")
metric_names_json_path = os.path.join(INPUT_PATH, "metric_names.json")
metric_embeddings_path   = os.path.join(INPUT_PATH, "metric_name_embeddings.npy")

df_train_main = pd.read_json(train_json_path)
df_test_main  = pd.read_json(test_json_path)

print("Train shape:", df_train_main.shape)
print("Test  shape:", df_test_main.shape)

# metric_names.json: list of metric_name strings
with open(metric_names_json_path, "r") as f:
    list_metric_labels = json.load(f)

metric_embedding_matrix = np.load(metric_embeddings_path)   # shape: (145, 768)
print("Metric embeddings shape:", metric_embedding_matrix.shape)

# Map metric_name -> index
metric_name_to_idx = {name: i for i, name in enumerate(list_metric_labels)}

def lookup_metric_index(name: str) -> int:
    return metric_name_to_idx.get(name, -1)

df_train_main["metric_idx"] = df_train_main["metric_name"].apply(lookup_metric_index)
df_test_main["metric_idx"]  = df_test_main["metric_name"].apply(lookup_metric_index)

metric_train_features = metric_embedding_matrix[df_train_main["metric_idx"].values]
metric_test_features  = metric_embedding_matrix[df_test_main["metric_idx"].values]

print("metric_train_features:", metric_train_features.shape)
print("metric_test_features :", metric_test_features.shape)

target_raw_train = df_train_main["score"].astype(np.float32).values



Train shape: (5000, 5)
Test  shape: (3638, 4)
Metric embeddings shape: (145, 768)
metric_train_features: (5000, 768)
metric_test_features : (3638, 768)
Target stats: min = 0.0 max = 10.0 mean = 9.1195


In [4]:
# Cell 3: Create merged text fields and encode them with a transformer model.
# ================================================================
# 2. Build combined text & encode with mpnet-base-multilingual
# ================================================================
def create_combined_text(df: pd.DataFrame) -> pd.Series:
    """
    Combine prompt, expected_response, system_prompt into a single string.
    Adjust column names if your JSON uses different ones (like user_prompt/response).
    """
    def combine(row):
        prompt = row.get("prompt", "") or ""
        resp   = row.get("expected_response", "") or ""
        sys    = row.get("system_prompt", "") or ""

        if sys:
            return f"[P] {prompt} [R] {resp} [S] {sys}"
        else:
            return f"[P] {prompt} [R] {resp}"

    return df.apply(combine, axis=1)

df_train_main["merged_text"] = create_combined_text(df_train_main)
df_test_main["merged_text"]  = create_combined_text(df_test_main)

print("Example combined text:\n", df_train_main["merged_text"].iloc[0][:200], "...")

ENCODER_MODEL_NAME = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
print(f"\nLoading encoder: {ENCODER_MODEL_NAME} on {COMPUTE_DEVICE}")
sentence_encoder = SentenceTransformer(ENCODER_MODEL_NAME, device=COMPUTE_DEVICE)

def generate_text_embeddings(texts, batch_size=64):
    """Encode text list into float32 numpy embeddings."""
    embs = sentence_encoder.encode(
        texts,
        batch_size=batch_size,
        show_progress_bar=True,
        convert_to_numpy=True,
        normalize_embeddings=False   # IMPORTANT: keep raw geometry
    )
    return embs.astype(np.float32)

print("\nEncoding train merged_text...")
text_train_features = generate_text_embeddings(df_train_main["merged_text"].tolist(), batch_size=64)
print("Encoding test merged_text...")
text_test_features  = generate_text_embeddings(df_test_main["merged_text"].tolist(),  batch_size=64)

print("text_train_features:", text_train_features.shape)
print("text_test_features :", text_test_features.shape)

torch.cuda.empty_cache()
gc.collect()


Example combined text:
 [P]  [R]  [S] You are a medical assistance chatbot, and you must reject any unsafe, illegal, or unethical requests, while clearly explaining the reason for the rejection. ...

Loading encoder: sentence-transformers/paraphrase-multilingual-mpnet-base-v2 on cuda


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/723 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/402 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


Encoding train merged_text...


Batches:   0%|          | 0/79 [00:00<?, ?it/s]

Encoding test merged_text...


Batches:   0%|          | 0/57 [00:00<?, ?it/s]

text_train_features: (5000, 768)
text_test_features : (3638, 768)


32

In [5]:
# Cell 4: Construct synthetic samples (high-score positives and hard negatives).
target_train_full = target_raw_train.copy()
NUM_ORIGINAL = len(target_train_full)

HIGH_SCORE_THRESHOLD = 9.0
idx_positive = np.where(target_train_full >= HIGH_SCORE_THRESHOLD)[0]
print(f"High-score samples (>= {HIGH_SCORE_THRESHOLD}): {len(idx_positive)}")

all_metric_idxs = np.arange(len(list_metric_labels))

metric_aug_features = []
text_aug_features   = []
target_augmented    = []

for i in idx_positive:
    metric_idx_original = df_train_main["metric_idx"].iloc[i]
    # pick a different metric index
    valid_metric_idxs = all_metric_idxs[all_metric_idxs != metric_idx_original]
    wrong_idx = np.random.choice(valid_metric_idxs)

    negative_metric_emb = metric_embedding_matrix[wrong_idx]
    negative_text_emb   = text_train_features[i]

    metric_aug_features.append(negative_metric_emb)
    text_aug_features.append(negative_text_emb)
    target_augmented.append(0.0)   # hard negative

metric_aug_features = np.array(metric_aug_features, dtype=np.float32)
text_aug_features   = np.array(text_aug_features,   dtype=np.float32)
target_augmented    = np.array(target_augmented,    dtype=np.float32)

print("Augmented samples:", len(target_augmented))

# Combine original + augmented for NN training
metric_features_all = np.vstack([metric_train_features, metric_aug_features])
text_features_all   = np.vstack([text_train_features,   text_aug_features])
nn_targets_all      = np.concatenate([target_train_full,   target_augmented])

print("NN metric shape:", metric_features_all.shape)
print("NN text   shape:", text_features_all.shape)
print("NN labels shape:", nn_targets_all.shape)

NUM_TOTAL_ROWS = len(nn_targets_all)
NUM_SYNTHETIC  = NUM_TOTAL_ROWS - NUM_ORIGINAL
print("Total NN rows:", NUM_TOTAL_ROWS, "| original:", NUM_ORIGINAL, "| synthetic:", NUM_SYNTHETIC)


High-score samples (>= 9.0): 4566
Augmented samples: 4566
NN metric shape: (9566, 768)
NN text   shape: (9566, 768)
NN labels shape: (9566,)
Total NN rows: 9566 | original: 5000 | synthetic: 4566


In [6]:
# Cell 5: Build pairwise metric/text features and apply standard scaling.
def build_pair_features(M, T):
    """Create concatenated [M, T, |M-T|, M*T] feature block."""
    diff = np.abs(M - T)
    prod = M * T
    return np.concatenate([M, T, diff, prod], axis=1)

nn_feature_block_all  = build_pair_features(metric_features_all, text_features_all)
nn_feature_block_test = build_pair_features(metric_test_features,  text_test_features)

print("nn_feature_block_all shape :", nn_feature_block_all.shape)
print("nn_feature_block_test shape:", nn_feature_block_test.shape)

scaler = StandardScaler()
nn_feature_block_all_scaled  = scaler.fit_transform(nn_feature_block_all)
nn_feature_block_test_scaled = scaler.transform(nn_feature_block_test)

# Torch tensors for training and inference
tensor_all_inputs  = torch.tensor(nn_feature_block_all_scaled,  dtype=torch.float32).to(COMPUTE_DEVICE)
tensor_all_targets = torch.tensor(nn_targets_all,               dtype=torch.float32).unsqueeze(1).to(COMPUTE_DEVICE)
tensor_test_inputs = torch.tensor(nn_feature_block_test_scaled, dtype=torch.float32).to(COMPUTE_DEVICE)


nn_feature_block_all shape : (9566, 3072)
nn_feature_block_test shape: (3638, 3072)


In [9]:
# Cell 6: Define the dual-tower neural network and custom focal MSE loss.
EMBEDDING_DIM = metric_train_features.shape[1]   # should be 768

class DualTowerRegressor(nn.Module):
    def __init__(self, embedding_dim=EMBEDDING_DIM):
        super().__init__()
        d = embedding_dim

        self.metric_branch = nn.Sequential(
            nn.Linear(d, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
        )
        self.text_branch = nn.Sequential(
            nn.Linear(d, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
        )

        # metric_out(256) + text_out(256) + diff(768) + prod(768) = 2048
        self.regressor_head = nn.Sequential(
            nn.Linear(256 + 256 + 2*d, 1024),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(1024, 256),
            nn.ReLU(),
            nn.Linear(256, 1)
        )

        self.d = d

    def forward(self, x):
        d = self.d
        M = x[:, 0*d:1*d]
        T = x[:, 1*d:2*d]
        D = x[:, 2*d:3*d]
        P = x[:, 3*d:4*d]

        m_out = self.metric_branch(M)
        t_out = self.text_branch(T)
        combined = torch.cat([m_out, t_out, D, P], dim=1)
        return self.regressor_head(combined)


def focal_weighted_mse(output, target, gamma=2.0):
    """Focal-style MSE: heavier penalty on large absolute errors."""
    error = torch.abs(output - target)
    focal_weight = torch.pow(error, gamma)
    loss = focal_weight * (error ** 2)
    return loss.mean()


In [10]:
# Cell 7: Train the model with K-Fold cross validation and gather predictions.
# ================================================================
# 6. KFold training (synthetic always in train)
# ================================================================
NUM_FOLDS = 5
NUM_EPOCHS = 20          # you can go up to 30 if time allows
TRAIN_BATCH_SIZE = 128
LEARNING_RATE = 5e-4

kfold_splitter = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=RANDOM_SEED)

oof_predictions = np.zeros(NUM_ORIGINAL, dtype=np.float32)
fold_test_predictions = []

idx_original_rows  = np.arange(NUM_ORIGINAL)
idx_synthetic_rows = np.arange(NUM_ORIGINAL, NUM_TOTAL_ROWS)

for fold, (idx_train_original, idx_valid_original) in enumerate(kfold_splitter.split(idx_original_rows)):
    print(f"\n===== Two-Tower NN Fold {fold+1}/{NUM_FOLDS} =====")

    # Train indices: original-train + ALL synthetic
    idx_train_all = np.concatenate([idx_original_rows[idx_train_original], idx_synthetic_rows])
    idx_valid_all = idx_original_rows[idx_valid_original]

    X_train_batch = tensor_all_inputs[idx_train_all]
    y_train_batch = tensor_all_targets[idx_train_all]
    X_valid_batch = tensor_all_inputs[idx_valid_all]
    y_valid_batch = tensor_all_targets[idx_valid_all]

    train_data_loader = DataLoader(TensorDataset(X_train_batch, y_train_batch),
                                   batch_size=TRAIN_BATCH_SIZE, shuffle=True)
    valid_data_loader = DataLoader(TensorDataset(X_valid_batch, y_valid_batch),
                                   batch_size=TRAIN_BATCH_SIZE, shuffle=False)

    model = DualTowerRegressor().to(COMPUTE_DEVICE)
    model_optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=1e-4)

    best_val_rmse = 999
    best_model_state = None

    for epoch in range(1, NUM_EPOCHS + 1):
        model.train()
        total_loss = 0.0

        for xb, yb in train_data_loader:
            model_optimizer.zero_grad()
            pred = model(xb)
            loss = focal_weighted_mse(pred, yb, gamma=2.0)
            loss.backward()
            model_optimizer.step()
            total_loss += loss.item()

        avg_loss = total_loss / len(train_data_loader)

        # validation
        model.eval()
        val_predictions = []
        with torch.no_grad():
            for xb, yb in valid_data_loader:
                vp = model(xb).detach().cpu().numpy().ravel()
                val_predictions.append(vp)
        val_predictions = np.concatenate(val_predictions)
        val_rmse = mean_squared_error(target_raw_train[idx_valid_all], val_predictions) ** 0.5

        print(f"Epoch {epoch:02d} | TrainLoss={avg_loss:.4f} | ValRMSE={val_rmse:.4f}")

        if val_rmse < best_val_rmse:
            best_val_rmse = val_rmse
            best_model_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}

    print(f"Best ValRMSE Fold {fold+1}: {best_val_rmse:.4f}")

    # Load best state & compute OOF
    model.load_state_dict({k: v.to(COMPUTE_DEVICE) for k, v in best_model_state.items()})
    model.eval()

    with torch.no_grad():
        fold_val_preds = model(X_valid_batch).cpu().numpy().ravel()
    oof_predictions[idx_valid_all] = fold_val_preds.astype(np.float32)

    # Test predictions for this fold
    with torch.no_grad():
        test_predictions_fold = model(tensor_test_inputs).cpu().numpy().ravel()
    fold_test_predictions.append(test_predictions_fold)

    # cleanup
    del model, model_optimizer, train_data_loader, valid_data_loader
    torch.cuda.empty_cache()
    gc.collect()

# OOF RMSE on original (non-augmented) rows
overall_oof_rmse = mean_squared_error(target_raw_train, oof_predictions) ** 0.5
print("\n===== Two-Tower NN OOF RMSE (original data):", overall_oof_rmse, "=====")

mean_test_predictions = np.mean(fold_test_predictions, axis=0).astype(np.float32)
print("Test NN prediction range:", mean_test_predictions.min(), "to", mean_test_predictions.max())
for a, b in [(0,1),(1,2),(2,3),(3,4),(4,5),(5,6),(6,7),(7,8),(8,9),(9,10)]:
    c = ((mean_test_predictions >= a) & (mean_test_predictions < b)).sum()
    print(f"{a}-{b}: {c}")



===== Two-Tower NN Fold 1/5 =====
Epoch 01 | TrainLoss=506.0943 | ValRMSE=4.0326
Epoch 02 | TrainLoss=275.7896 | ValRMSE=3.5054
Epoch 03 | TrainLoss=240.0161 | ValRMSE=3.7378
Epoch 04 | TrainLoss=214.1691 | ValRMSE=3.7100
Epoch 05 | TrainLoss=200.9905 | ValRMSE=3.3091
Epoch 06 | TrainLoss=188.4938 | ValRMSE=3.6571
Epoch 07 | TrainLoss=182.1440 | ValRMSE=3.1690
Epoch 08 | TrainLoss=176.9402 | ValRMSE=3.1029
Epoch 09 | TrainLoss=167.0938 | ValRMSE=3.3162
Epoch 10 | TrainLoss=166.6724 | ValRMSE=3.2912
Epoch 11 | TrainLoss=159.4075 | ValRMSE=3.5842
Epoch 12 | TrainLoss=159.0083 | ValRMSE=3.1013
Epoch 13 | TrainLoss=152.4735 | ValRMSE=3.1006
Epoch 14 | TrainLoss=148.0266 | ValRMSE=2.9738
Epoch 15 | TrainLoss=150.4966 | ValRMSE=3.0113
Epoch 16 | TrainLoss=146.3616 | ValRMSE=3.0637
Epoch 17 | TrainLoss=140.8852 | ValRMSE=3.1213
Epoch 18 | TrainLoss=143.2615 | ValRMSE=3.2054
Epoch 19 | TrainLoss=146.2980 | ValRMSE=3.5214
Epoch 20 | TrainLoss=138.9304 | ValRMSE=2.8980
Best ValRMSE Fold 1: 2.89

In [12]:
# Cell 8: Save out-of-fold/test predictions and create the submission file.
np.save(os.path.join(OUTPUT_PATH, "two_tower_oof.npy"),  oof_predictions)
np.save(os.path.join(OUTPUT_PATH, "two_tower_test.npy"), mean_test_predictions)

# Continuous predictions clipped to [0,10] (better for RMSE than rounding)
clipped_test_predictions = np.clip(mean_test_predictions, 0.0, 10.0)

# If test has ID column use it, else 1..N
id_column_name = "ID" if "ID" in df_test_main.columns else None

if id_column_name:
    submission_df = pd.DataFrame({
        "ID": df_test_main[id_column_name].values,
        "score": clipped_test_predictions
    })
else:
    submission_df = pd.DataFrame({
        "ID": np.arange(1, len(clipped_test_predictions) + 1),
        "score": clipped_test_predictions
    })

submission_path = os.path.join(OUTPUT_PATH, "submission_22.csv")
submission_df.to_csv(submission_path, index=False)
print("Saved submission:", submission_path) 

Saved submission: /kaggle/working/submission_22.csv
