In [1]:
pip install rtdl_revisiting_models


Collecting rtdl_revisiting_models
  Downloading rtdl_revisiting_models-0.0.2-py3-none-any.whl.metadata (888 bytes)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3,>=1.8->rtdl_revisiting_models)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3,>=1.8->rtdl_revisiting_models)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3,>=1.8->rtdl_revisiting_models)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3,>=1.8->rtdl_revisiting_models)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<3,>=1.8->rtdl_revisiting_models)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylin

In [None]:
# ===============================================================
# HIGGS BOSON FTTransformer + SupCon + LGBM + XGB + Stacking (Tier1 + Tier2)
# - FTTransformer backbone + SupCon projection head (hard-negative top-k)
# - Weighted Focal loss (uses physics sample Weight)
# - 5-fold CV for NN, LGB, XGB -> OOF preds
# - Meta-learner (LogisticRegression) on OOF predictions (stacking)
# - AMS thresholding & Kaggle submission
# ===============================================================

import os, math, zipfile
import numpy as np
import pandas as pd
import random
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
import xgboost as xgb

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from rtdl_revisiting_models import FTTransformer

# ---------------- Settings ----------------
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
SEED = 42
torch.manual_seed(SEED); np.random.seed(SEED); random.seed(SEED)

BATCH_SIZE = 512
EPOCHS = 40
LR = 7e-4                 # slightly lower LR
WEIGHT_DECAY = 3e-5
PROJ_DIM = 64

# Tuned SupCon params (from suggestions)
SUPCON_TEMPERATURE = 0.08
HARD_NEG_TOPK = 30
INITIAL_SUP_W, MAX_SUP_W = 0.12, 0.45

# FT params (conservative)
D_BLOCK = 128
N_BLOCKS = 4
ATTN_HEADS = 8
FFN_MULT = 4

# CV / ensemble
N_FOLDS = 5

# Paths (Kaggle)
zip_files = {
    "train": "/kaggle/input/higgs-boson/training.zip",
    "test": "/kaggle/input/higgs-boson/test.zip",
    "submission": "/kaggle/input/higgs-boson/random_submission.zip"
}
extract_dir = "/kaggle/working/higgs_data/"
os.makedirs(extract_dir, exist_ok=True)
for key, path in zip_files.items():
    if os.path.exists(path):
        with zipfile.ZipFile(path, "r") as z:
            z.extractall(extract_dir)
            print(f"{key} unzipped.")
    else:
        print(f"{key} zip not found.")

TRAIN_CSV = os.path.join(extract_dir, "training.csv")
TEST_CSV = os.path.join(extract_dir, "test.csv")
OUT_SUB = "/kaggle/working/submission.csv"

# ---------------- AMS metric ----------------
def ams_score(s, b):
    b_reg = 10.0
    rad = 2.0 * ((s + b + b_reg) * math.log(1.0 + s / (b + b_reg)) - s)
    return math.sqrt(rad) if rad > 0 else 0.0

# ---------------- Dataset ----------------
class HiggsDataset(Dataset):
    def __init__(self, X, y=None, sample_weight=None):
        self.X = X.astype(np.float32)
        self.y = y
        self.sample_weight = sample_weight
    def __len__(self):
        return len(self.X)
    def __getitem__(self, idx):
        x = torch.tensor(self.X[idx], dtype=torch.float32)
        if self.y is None:
            return x, torch.tensor(-1, dtype=torch.long), torch.tensor(0.0, dtype=torch.float32)
        y = torch.tensor(int(self.y[idx]), dtype=torch.long)
        w = torch.tensor(float(self.sample_weight[idx]) if self.sample_weight is not None else 1.0, dtype=torch.float32)
        return x, y, w

# ---------------- Hard-negative SupCon Loss ----------------
class HardNegSupConLoss(nn.Module):
    def __init__(self, temperature=0.08, top_k=30):
        super().__init__()
        self.temperature = temperature
        self.top_k = top_k

    def forward(self, features, labels):
        device = features.device
        labels = labels.contiguous().view(-1, 1)
        batch_size = features.shape[0]

        sim = torch.div(torch.matmul(features, features.T), self.temperature)

        same_label = torch.eq(labels, labels.T).float().to(device)
        diag = torch.eye(batch_size, device=device)
        pos_mask = same_label - diag
        neg_mask = 1.0 - same_label
        pos_counts = pos_mask.sum(dim=1)

        sim_max, _ = torch.max(sim, dim=1, keepdim=True)
        sim_stable = sim - sim_max.detach()
        exp_sim = torch.exp(sim_stable)

        pos_exp = exp_sim * pos_mask

        neg_sim_masked = sim.clone()
        neg_sim_masked[neg_mask == 0] = -1e9
        if batch_size > 1:
            max_neg_per_row = neg_mask.sum(dim=1).max().int().item()
            k = min(self.top_k, max_neg_per_row) if max_neg_per_row > 0 else 0
        else:
            k = 0

        eps = 1e-12
        if k == 0:
            denom = pos_exp.sum(dim=1) + (exp_sim * (1.0 - diag - pos_mask)).sum(dim=1)
            log_prob = torch.log((pos_exp.sum(dim=1) + eps) / (denom + eps))
            valid = pos_counts > 0
            if valid.sum() == 0:
                return torch.tensor(0.0, device=device)
            loss = -log_prob[valid.bool()].mean()
            return loss

        topk_vals, topk_idx = torch.topk(neg_sim_masked, k=k, dim=1)
        selected_neg_mask = torch.zeros_like(neg_mask)
        arange = torch.arange(batch_size, device=device).unsqueeze(1).expand(-1, k)
        selected_neg_mask[arange.reshape(-1), topk_idx.reshape(-1)] = 1.0
        selected_neg_mask = selected_neg_mask * neg_mask

        selected_neg_exp = exp_sim * selected_neg_mask
        sum_pos_exp = pos_exp.sum(dim=1)
        sum_neg_exp = selected_neg_exp.sum(dim=1)

        valid = (pos_counts > 0)
        if valid.sum() == 0:
            return torch.tensor(0.0, device=device)

        log_prob = torch.log((sum_pos_exp + eps) / (sum_pos_exp + sum_neg_exp + eps))
        loss = -log_prob[valid.bool()].mean()
        return loss

# ---------------- FTTransformer + SupCon wrapper ----------------
class FTTransformerSupCon(nn.Module):
    def __init__(self, n_features, proj_dim=PROJ_DIM, d_block=D_BLOCK, n_blocks=N_BLOCKS):
        super().__init__()
        # instantiate according to rtdl_revisiting_models API used earlier
        self.ft = FTTransformer(
            n_cont_features=n_features,
            cat_cardinalities=[],
            d_block=d_block,
            n_blocks=n_blocks,
            attention_n_heads=ATTN_HEADS,
            ffn_d_hidden_multiplier=FFN_MULT,
            attention_dropout=0.2,
            ffn_dropout=0.2,
            residual_dropout=0.1,
            d_out=2
        )
        # projection head: from model logits (2-d) -> proj_dim
        self.proj_head = nn.Sequential(
            nn.Linear(2, 128),
            nn.ReLU(inplace=True),
            nn.Linear(128, proj_dim)
        )

    def forward(self, x, return_emb=False):
        # pass x_cat=None for purely numerical inputs
        out = self.ft(x, x_cat=None)
        if isinstance(out, tuple):
            out = out[0]
        emb = F.normalize(self.proj_head(out), dim=1)
        if return_emb:
            return out, emb
        return out

# ---------------- Weighted focal loss ----------------
class WeightedFocalLoss(nn.Module):
    def __init__(self, alpha=0.75, gamma=2.0):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, logits, targets, sample_weight=None):
        # logits: [B, C], targets: [B] (int)
        ce = F.cross_entropy(logits, targets, reduction='none')  # per-sample CE
        p_t = torch.exp(-ce)
        focal = ((1 - p_t) ** self.gamma) * ce
        if sample_weight is not None:
            # sample_weight shape [B]
            focal = focal * sample_weight
            denom = sample_weight.sum()
            return focal.sum() / (denom + 1e-12)
        return focal.mean()

# ---------------- Data prep ----------------
train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV)

# replace -999 with nan and create missing flags
train_df.replace(-999.0, np.nan, inplace=True)
test_df.replace(-999.0, np.nan, inplace=True)

for c in train_df.columns:
    if c in ['EventId', 'Weight', 'Label']:
        continue
    if (train_df[c] == -999).any() or (test_df[c] == -999).any():
        train_df[c + '_miss'] = (train_df[c].isna()).astype(int)
        test_df[c + '_miss'] = (test_df[c].isna()).astype(int)

# fill numeric nan with median of train
# Select numeric columns but exclude 'Weight' (not present in test)
numeric_cols = [c for c in train_df.select_dtypes(include=np.number).columns if c != "Weight"]

train_df[numeric_cols] = train_df[numeric_cols].fillna(train_df[numeric_cols].median())

# Only use intersection of numeric cols that exist in test_df
num_cols_test = [c for c in numeric_cols if c in test_df.columns]
test_df[num_cols_test] = test_df[num_cols_test].fillna(train_df[num_cols_test].median())


# basic physics features (keep your previous ones)
if 'DER_mass_MMC' in train_df.columns and 'DER_mass_vis' in train_df.columns:
    train_df['mass_ratio'] = train_df['DER_mass_MMC'] / (train_df['DER_mass_vis'] + 1e-6)
    test_df['mass_ratio']  = test_df['DER_mass_MMC'] / (test_df['DER_mass_vis'] + 1e-6)
if 'PRI_tau_pt' in train_df.columns and 'PRI_met' in train_df.columns:
    train_df['pt_ratio'] = train_df['PRI_tau_pt'] / (train_df['PRI_met'] + 1e-6)
    test_df['pt_ratio']  = test_df['PRI_tau_pt'] / (test_df['PRI_met'] + 1e-6)

# label and weights
y = (train_df['Label'] == 's').astype(int).values
weights = train_df['Weight'].values
event_ids_test = test_df['EventId'].values if 'EventId' in test_df.columns else None

# drop meta columns
train_features = train_df.drop(columns=['EventId','Weight','Label'], errors='ignore')
test_features  = test_df.drop(columns=['EventId'], errors='ignore')

# fill any remaining na and scale
train_features = train_features.fillna(train_features.median())
test_features = test_features.fillna(train_features.median())

scaler = StandardScaler()
X = scaler.fit_transform(train_features.values.astype(np.float32))
X_test = scaler.transform(test_features.values.astype(np.float32))

# ---------------- CV setup ----------------
kf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

# containers for OOF and test preds
oof_nn = np.zeros(len(X), dtype=np.float32)
test_pred_nn = np.zeros(len(X_test), dtype=np.float32)

oof_lgb = np.zeros(len(X), dtype=np.float32)
test_pred_lgb_folds = []  # accumulate fold test preds, average later

oof_xgb = np.zeros(len(X), dtype=np.float32)
test_pred_xgb_folds = []

# SupCon instance
supcon = HardNegSupConLoss(temperature=SUPCON_TEMPERATURE, top_k=HARD_NEG_TOPK)

# ---------------- CV: train NN (FT+SupCon) and gather OOF preds ----------------
for fold, (tr_idx, va_idx) in enumerate(kf.split(X, y)):
    print(f"\n===== NN Fold {fold+1}/{N_FOLDS} =====")
    X_tr, X_va = X[tr_idx], X[va_idx]
    y_tr, y_va = y[tr_idx], y[va_idx]
    w_tr, w_va = weights[tr_idx], weights[va_idx]

    tr_ds = HiggsDataset(X_tr, y_tr, sample_weight=w_tr)
    va_ds = HiggsDataset(X_va, y_va, sample_weight=w_va)
    tr_loader = DataLoader(tr_ds, batch_size=BATCH_SIZE, shuffle=True, drop_last=False)
    va_loader = DataLoader(va_ds, batch_size=BATCH_SIZE, shuffle=False)

    model = FTTransformerSupCon(X.shape[1], proj_dim=PROJ_DIM).to(DEVICE)
    optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS)
    criterion_cls = WeightedFocalLoss(alpha=0.75, gamma=2.0)

    best_auc = 0.0
    best_state = None
    best_val_pred = None

    for epoch in range(EPOCHS):
        model.train()
        frac = epoch / max(1, EPOCHS - 1)
        # dynamic supcon weight schedule
        if frac <= 0.5:
            sup_w = INITIAL_SUP_W + (MAX_SUP_W - INITIAL_SUP_W) * (frac / 0.5)
        else:
            sup_w = MAX_SUP_W * (1.0 - 0.5 * (frac - 0.5) / 0.5)
        sup_w = float(sup_w)

        train_losses = []
        for xb, yb, wb in tqdm(tr_loader, desc=f"Fold{fold+1} Epoch{epoch+1}", leave=False):
            xb = xb.to(DEVICE)
            yb = yb.to(DEVICE)
            wb = wb.to(DEVICE)

            optimizer.zero_grad()
            logits, emb = model(xb, return_emb=True)  # logits shape [B,2]
            loss_cls = criterion_cls(logits, yb, sample_weight=wb)
            loss_sup = supcon(emb, yb)
            loss = loss_cls + sup_w * loss_sup
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)
            optimizer.step()
            train_losses.append(loss.item())

        scheduler.step()

        # validation
        model.eval()
        val_probs = []
        with torch.no_grad():
            for xb, yb, wb in va_loader:
                xb = xb.to(DEVICE)
                out = model(xb)
                p = F.softmax(out, dim=1)[:, 1].cpu().numpy()
                val_probs.extend(p)
        val_auc = roc_auc_score(y_va, val_probs)
        print(f"Epoch {epoch+1}/{EPOCHS} | train_loss={np.mean(train_losses):.5f} | val_auc={val_auc:.5f} | sup_w={sup_w:.4f}")
        if val_auc > best_auc:
            best_auc = val_auc
            best_state = {k: v.cpu() for k, v in model.state_dict().items()}
            best_val_pred = np.array(val_probs)

    # load best
    if best_state is not None:
        model.load_state_dict(best_state)
    model.eval()

    # record OOF preds
    oof_nn[va_idx] = best_val_pred

    # test preds for this fold
    preds_test_fold = []
    test_loader = DataLoader(HiggsDataset(X_test, None, None), batch_size=1024, shuffle=False)
    with torch.no_grad():
        for xb, _, _ in test_loader:
            xb = xb.to(DEVICE)
            out = model(xb)
            p = F.softmax(out, dim=1)[:, 1].cpu().numpy()
            preds_test_fold.extend(p)
    test_pred_nn += np.array(preds_test_fold) / N_FOLDS

    print(f"Fold {fold+1} NN best AUC={best_auc:.5f}")

# ---------------- CV: train LightGBM OOF preds ----------------
print("\n=== LightGBM OOF training ===")
for fold, (tr_idx, va_idx) in enumerate(kf.split(X, y)):
    X_tr, X_va = X[tr_idx], X[va_idx]
    y_tr, y_va = y[tr_idx], y[va_idx]

    lgbm = lgb.LGBMClassifier(
        n_estimators=1200, learning_rate=0.01, max_depth=-1, num_leaves=64,
        subsample=0.8, colsample_bytree=0.8, reg_lambda=1.0, random_state=SEED
    )
    lgbm.fit(
    X_tr, y_tr,
    eval_set=[(X_va, y_va)],
    callbacks=[
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(100)
    ]
    )

    oof_lgb[va_idx] = lgbm.predict_proba(X_va)[:, 1]
    pred_test = lgbm.predict_proba(X_test)[:, 1]
    test_pred_lgb_folds.append(pred_test)
    print(f"Fold {fold+1} LGB AUC={roc_auc_score(y_va, oof_lgb[va_idx]):.5f}")

test_pred_lgb = np.mean(test_pred_lgb_folds, axis=0)

# ---------------- CV: train XGBoost OOF preds ----------------
print("\n=== XGBoost OOF training ===")
for fold, (tr_idx, va_idx) in enumerate(kf.split(X, y)):
    X_tr, X_va = X[tr_idx], X[va_idx]
    y_tr, y_va = y[tr_idx], y[va_idx]

    xgbm = xgb.XGBClassifier(
        n_estimators=1200, learning_rate=0.01, max_depth=6,
        subsample=0.8, colsample_bytree=0.8, reg_lambda=1.0,
        eval_metric='auc', tree_method='hist', random_state=SEED, use_label_encoder=False
    )
    xgbm.fit(X_tr, y_tr, eval_set=[(X_va, y_va)], early_stopping_rounds=50, verbose=100)
    oof_xgb[va_idx] = xgbm.predict_proba(X_va)[:, 1]
    pred_test = xgbm.predict_proba(X_test)[:, 1]
    test_pred_xgb_folds.append(pred_test)
    print(f"Fold {fold+1} XGB AUC={roc_auc_score(y_va, oof_xgb[va_idx]):.5f}")

test_pred_xgb = np.mean(test_pred_xgb_folds, axis=0)

# ---------------- OOF summary ----------------
print("\nNN CV AUC:", roc_auc_score(y, oof_nn))
print("LGB CV AUC:", roc_auc_score(y, oof_lgb))
print("XGB CV AUC:", roc_auc_score(y, oof_xgb))

# ---------------- Tier-2: meta-learner stacking ----------------
print("\n=== Training meta-learner (stacking) ===")
stack_oof = np.vstack([oof_nn, oof_lgb, oof_xgb]).T
stack_test = np.vstack([test_pred_nn, test_pred_lgb, test_pred_xgb]).T

meta = LogisticRegression(max_iter=2000, solver='lbfgs')
meta.fit(stack_oof, y, sample_weight=weights)   # use physics weights in meta training

final_test_pred = meta.predict_proba(stack_test)[:, 1]
# also compute oof pred by meta for AMS tuning
oof_meta = meta.predict_proba(stack_oof)[:, 1]
print("Stacked (meta) CV AUC:", roc_auc_score(y, oof_meta))

# ---------------- AMS threshold search (on meta OOF) ----------------
thr_range = np.linspace(0.01, 0.99, 99)
best_thr, best_ams = 0.5, -1.0
for t in thr_range:
    s = weights[(y == 1) & (oof_meta > t)].sum()
    b = weights[(y == 0) & (oof_meta > t)].sum()
    score = ams_score(s, b)
    if score > best_ams:
        best_ams, best_thr = score, t
print(f"Best AMS on stacked OOF = {best_ams:.3f} @ thr={best_thr:.4f}")

# ---------------- Submission ----------------
print("\nWriting submission...")
rankorder = np.argsort(np.argsort(final_test_pred)) + 1
classes = np.where(final_test_pred > best_thr, 's', 'b')
sub = pd.DataFrame({
    "EventId": event_ids_test,
    "RankOrder": rankorder,
    "Class": classes
})
sub.to_csv(OUT_SUB, index=False)
print("Saved submission to:", OUT_SUB)
print("Final stacked OOF AUC:", roc_auc_score(y, oof_meta))
print("Final stacked OOF AMS:", best_ams)


train unzipped.
test unzipped.
submission unzipped.

===== NN Fold 1/5 =====


                                                               

Epoch 1/40 | train_loss=0.01706 | val_auc=0.87292 | sup_w=0.1200


                                                               

Epoch 2/40 | train_loss=0.01818 | val_auc=0.88398 | sup_w=0.1369


                                                               

Epoch 3/40 | train_loss=0.02001 | val_auc=0.88827 | sup_w=0.1538


                                                               

Epoch 4/40 | train_loss=0.02185 | val_auc=0.89106 | sup_w=0.1708


                                                               

Epoch 5/40 | train_loss=0.02372 | val_auc=0.89244 | sup_w=0.1877


                                                               

Epoch 6/40 | train_loss=0.02558 | val_auc=0.89231 | sup_w=0.2046


                                                               

Epoch 7/40 | train_loss=0.02744 | val_auc=0.89624 | sup_w=0.2215


                                                               

Epoch 8/40 | train_loss=0.02931 | val_auc=0.89474 | sup_w=0.2385


                                                               

Epoch 9/40 | train_loss=0.03118 | val_auc=0.89206 | sup_w=0.2554


                                                                

Epoch 10/40 | train_loss=0.03305 | val_auc=0.89660 | sup_w=0.2723


Fold1 Epoch11:  57%|█████▋    | 224/391 [00:13<00:09, 17.02it/s]