In [None]:
### Step 1: Import all necessary libraries used for data processing, text embeddings, and model training
import json
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from pathlib import Path
import torch.nn.functional as F
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from math import sqrt

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
### Step 2: Set the random seed, choose the compute device, and load all training/testing data and metric embeddings
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Device: {device}")

DATA_DIR = Path("data") 

with open(f"{DATA_DIR}/train_data.json", encoding='utf-8') as f:
    train_data = json.load(f)
with open(f"{DATA_DIR}/test_data.json", encoding='utf-8') as f:
    test_data = json.load(f)

metric_emb_matrix = np.load(f"{DATA_DIR}/metric_name_embeddings.npy")
with open(f"{DATA_DIR}/metric_names.json") as f:
    metric_names_obj = json.load(f)

In [None]:
### Step 3: Cleaning Metric Names and Building Text Processing Utilities

def clean_name(x):
    if isinstance(x, list) and len(x) > 0:
        x = x[0]
    return str(x).strip()

if isinstance(metric_names_obj, list):
    name_to_idx = {clean_name(n): i for i, n in enumerate(metric_names_obj)}
else:
    name_to_idx = {clean_name(k): int(v) for k, v in metric_names_obj.items()}

def find_key(options, record):
    for opt in options:
        if opt in record:
            return record[opt]
    return ""

def build_text(item):
    metric = item.get('metric_name', '')
    prompt = find_key(['prompt', 'input', 'query', 'question'], item)
    system = find_key(['system_prompt', 'instruction', 'system', 'context'], item)
    response = find_key(['expected_response', 'response', 'answer', 'output'], item)

    parts = [f"[M] {metric}", f"[P] {prompt}", f"[S] {system}", f"[R] {response}"]
    return " ".join(parts)

### Step 4: Build Training & Pseudo-Training Data and Prepare Text Embeddings
train_texts = [build_text(d) for d in train_data]
train_scores = np.array([float(find_key(['score', 'target', 'fitness', 'label'], d)) for d in train_data])
train_metric_names = [d['metric_name'] for d in train_data]

test_texts = [build_text(d) for d in test_data]
test_metric_names = [d['metric_name'] for d in test_data]

pseudo_df = pd.read_csv("submission_minilm_embedder.csv")
pseudo_scores = pseudo_df['score'].values

high_conf_mask = (pseudo_scores <= 2.0) | (pseudo_scores >= 8.0)
high_conf_idx = np.where(high_conf_mask)[0]

pseudo_texts = [test_texts[i] for i in high_conf_idx]
pseudo_scores_selected = pseudo_scores[high_conf_idx]
pseudo_metric_names = [test_metric_names[i] for i in high_conf_idx]

combined_texts = train_texts + pseudo_texts
combined_scores = np.concatenate([train_scores, pseudo_scores_selected])
combined_metric_names = train_metric_names + pseudo_metric_names

print(f"Training samples: {len(train_texts)}")
print(f"Pseudo-labeled samples: {len(pseudo_texts)}")
print(f"Total training: {len(combined_texts)}")

print("\nEncoding with MiniLM...")
encoder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

### Step 5: Generate Text & Metric Embeddings and Prepare Train/Validation Splits
train_embeddings = encoder.encode(combined_texts, batch_size=64, show_progress_bar=True, convert_to_numpy=True)
test_embeddings = encoder.encode(test_texts, batch_size=64, show_progress_bar=True, convert_to_numpy=True)

train_metric_indices = [name_to_idx[clean_name(m)] for m in combined_metric_names]
test_metric_indices = [name_to_idx[clean_name(m)] for m in test_metric_names]

train_metric_embs = metric_emb_matrix[train_metric_indices]
test_metric_embs = metric_emb_matrix[test_metric_indices]

if train_metric_embs.shape[1] > train_embeddings.shape[1]:
    train_metric_embs = train_metric_embs[:, :train_embeddings.shape[1]]
    test_metric_embs = test_metric_embs[:, :test_embeddings.shape[1]]
elif train_metric_embs.shape[1] < train_embeddings.shape[1]:
    pad_width = train_embeddings.shape[1] - train_metric_embs.shape[1]
    train_metric_embs = np.pad(train_metric_embs, ((0, 0), (0, pad_width)), mode='constant')
    test_metric_embs = np.pad(test_metric_embs, ((0, 0), (0, pad_width)), mode='constant')

X_train_text = train_embeddings.astype(np.float32)
X_train_metric = train_metric_embs.astype(np.float32)
y_train = combined_scores.astype(np.float32)

X_test_text = test_embeddings.astype(np.float32)
X_test_metric = test_metric_embs.astype(np.float32)

X_tr_text, X_val_text, X_tr_metric, X_val_metric, y_tr, y_val = train_test_split(
    X_train_text, X_train_metric, y_train, test_size=0.15, random_state=SEED
)

### Step 6: Create the Dataset Class and Define a Cross-Attention Layer
class ScoreDataset(Dataset):
    def __init__(self, text_emb, metric_emb, scores):
        self.text = torch.FloatTensor(text_emb)
        self.metric = torch.FloatTensor(metric_emb)
        self.scores = torch.FloatTensor(scores)

    def __len__(self):
        return len(self.scores)

    def __getitem__(self, idx):
        return self.text[idx], self.metric[idx], self.scores[idx]

class CrossAttention(nn.Module):
    def __init__(self, dim, num_heads=4):
        super().__init__()
        self.num_heads = num_heads
        self.dim = dim
        self.head_dim = dim // num_heads

        self.q_proj = nn.Linear(dim, dim)
        self.k_proj = nn.Linear(dim, dim)
        self.v_proj = nn.Linear(dim, dim)
        self.out_proj = nn.Linear(dim, dim)

    def forward(self, query, key_value):
        B = query.size(0)

        Q = self.q_proj(query).view(B, 1, self.num_heads, self.head_dim).transpose(1, 2)
        K = self.k_proj(key_value).view(B, 1, self.num_heads, self.head_dim).transpose(1, 2)
        V = self.v_proj(key_value).view(B, 1, self.num_heads, self.head_dim).transpose(1, 2)

        attn_weights = torch.matmul(Q, K.transpose(-2, -1)) / (self.head_dim ** 0.5)
        attn_weights = F.softmax(attn_weights, dim=-1)

        attn_output = torch.matmul(attn_weights, V)
        attn_output = attn_output.transpose(1, 2).contiguous().view(B, self.dim)

        return self.out_proj(attn_output)
    
### Step 7: Build the Attention-Based Regressor Model
class AttentionRegressor(nn.Module):
    def __init__(self, input_dim, hidden_dim=512, num_heads=4):
        super().__init__()

        self.text_proj = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.LayerNorm(hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.2)
        )

        self.metric_proj = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.LayerNorm(hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.2)
        )

        self.cross_attn_1 = CrossAttention(hidden_dim, num_heads)
        self.cross_attn_2 = CrossAttention(hidden_dim, num_heads)

        self.fusion = nn.Sequential(
            nn.Linear(hidden_dim * 3, 768),
            nn.LayerNorm(768),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(768, 256),
            nn.LayerNorm(256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

    def forward(self, text_emb, metric_emb):
        text_feat = self.text_proj(text_emb)
        metric_feat = self.metric_proj(metric_emb)

        text_attended = self.cross_attn_1(text_feat, metric_feat)
        metric_attended = self.cross_attn_2(metric_feat, text_feat)

        combined = torch.cat([text_feat, text_attended, metric_attended], dim=1)

        output = self.fusion(combined)
        return output.squeeze()
    
### Step 8: Train the Attention-Based Regressor With Early Stopping and Learning-Rate Scheduling
train_dataset = ScoreDataset(X_tr_text, X_tr_metric, y_tr)
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True, drop_last=False)

model = AttentionRegressor(X_train_text.shape[1], hidden_dim=512, num_heads=4).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0005, weight_decay=0.0001)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=50, eta_min=1e-6)
criterion = nn.MSELoss()

X_val_text_t = torch.FloatTensor(X_val_text).to(device)
X_val_metric_t = torch.FloatTensor(X_val_metric).to(device)

best_val_rmse = float('inf')
patience = 0
max_patience = 10

print("\nTraining attention-based regressor...")
for epoch in range(60):
    model.train()
    train_loss = 0.0

    for text_batch, metric_batch, score_batch in train_loader:
        text_batch = text_batch.to(device)
        metric_batch = metric_batch.to(device)
        score_batch = score_batch.to(device)

        optimizer.zero_grad()
        predictions = model(text_batch, metric_batch)
        loss = criterion(predictions, score_batch)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    scheduler.step()

    model.eval()
    with torch.no_grad():
        val_predictions = model(X_val_text_t, X_val_metric_t).cpu().numpy()
        val_rmse = sqrt(mean_squared_error(y_val, val_predictions))

    if (epoch + 1) % 5 == 0:
        print(f"Epoch {epoch+1}: Train Loss = {train_loss/len(train_loader):.4f}, Val RMSE = {val_rmse:.4f}")

    if val_rmse < best_val_rmse:
        best_val_rmse = val_rmse
        patience = 0
        torch.save(model.state_dict(), 'best_attention_model.pt')
    else:
        patience += 1
        if patience >= max_patience:
            print(f"Early stopping at epoch {epoch+1}")
            break

print(f"\nBest validation RMSE: {best_val_rmse:.4f}")

### Step 9: Run the Best Model on Test Data, Adjust Scores, and Create the Submission File
model.load_state_dict(torch.load('best_attention_model.pt'))
model.eval()

X_test_text_t = torch.FloatTensor(X_test_text).to(device)
X_test_metric_t = torch.FloatTensor(X_test_metric).to(device)

with torch.no_grad():
    test_predictions = model(X_test_text_t, X_test_metric_t).cpu().numpy()

test_predictions = np.clip(test_predictions, 0, 10)

score_ranges = [(0, 2.5), (2.5, 5.5), (5.5, 7.5), (7.5, 10)]
for low, high in score_ranges:
    mask = (test_predictions >= low) & (test_predictions < high)
    if mask.sum() > 0:
        offset = (y_val[(val_predictions >= low) & (val_predictions < high)] -
                 val_predictions[(val_predictions >= low) & (val_predictions < high)]).mean()
        if not np.isnan(offset):
            test_predictions[mask] += offset

test_predictions = np.clip(test_predictions, 0, 10)
test_predictions = np.round(test_predictions, 1)

submission = pd.DataFrame({
    "ID": range(1, len(test_predictions) + 1),
    "score": test_predictions
})

submission.to_csv("submission.csv", index=False)

print("\nTest predictions:")
print(f"  Mean: {test_predictions.mean():.2f}")
print(f"  Std: {test_predictions.std():.2f}")
print(f"  Range: [{test_predictions.min():.1f}, {test_predictions.max():.1f}]")
print(f"\nDistribution:")
print(f"  0-3: {(test_predictions <= 3).sum()} ({100*(test_predictions <= 3).sum()/len(test_predictions):.1f}%)")
print(f"  4-7: {((test_predictions > 3) & (test_predictions <= 7)).sum()} ({100*((test_predictions > 3) & (test_predictions <= 7)).sum()/len(test_predictions):.1f}%)")
print(f"  8-10: {(test_predictions >= 8).sum()} ({100*(test_predictions >= 8).sum()/len(test_predictions):.1f}%)")

print("\nSubmission saved to: submission.csv")



Device: cuda
Training samples: 5000
Pseudo-labeled samples: 2742
Total training: 7742

Encoding with MiniLM...


Batches:   0%|          | 0/121 [00:00<?, ?it/s]

Batches:   0%|          | 0/57 [00:00<?, ?it/s]


Training attention-based regressor...
Epoch 5: Train Loss = 8.4894, Val RMSE = 2.8611
Epoch 10: Train Loss = 6.3938, Val RMSE = 2.5065
Epoch 15: Train Loss = 3.6127, Val RMSE = 2.3141
Epoch 20: Train Loss = 2.7446, Val RMSE = 2.2710
Epoch 25: Train Loss = 1.8927, Val RMSE = 2.2001
Epoch 30: Train Loss = 1.5793, Val RMSE = 2.1381
Epoch 35: Train Loss = 1.3779, Val RMSE = 2.1513
Epoch 40: Train Loss = 1.2318, Val RMSE = 2.1165
Epoch 45: Train Loss = 1.1054, Val RMSE = 2.0843
Early stopping at epoch 48

Best validation RMSE: 2.0581

Test predictions:
  Mean: 6.35
  Std: 3.01
  Range: [1.8, 8.9]

Distribution:
  0-3: 1140 (31.3%)
  4-7: 153 (4.2%)
  8-10: 2165 (59.5%)

Submission saved to: submission.csv


  model.load_state_dict(torch.load('best_attention_model.pt'))
