In [None]:
import os

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, average_precision_score, accuracy_score

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.amp import GradScaler, autocast
from tqdm import tqdm

In [None]:
# 1. Load the dataset
# Read the CSV once, then separate it into a float32 feature matrix (every
# column except "label") and a float32 label vector (the "label" column).

df = pd.read_csv("HIGGS_short.csv")
X = df.drop(columns=["label"]).to_numpy(dtype=np.float32)
y = df["label"].to_numpy(dtype=np.float32)

In [None]:
# 2. Train/Val/Test Split (70/15/15)
# Stage 1: hold out 30% of the rows, stratified on the label.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=42
)

# Stage 2: cut the held-out 30% in half -> validation and test, again
# stratified so each part keeps the label balance of the held-out rows.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, stratify=y_temp, random_state=42
)

In [5]:
# 3. Dataset / Dataloaders

class HiggsDataset(Dataset):
    """In-memory dataset pairing a feature matrix with one label per row.

    Both inputs are converted to float32 tensors once, at construction time,
    so ``__getitem__`` is a plain index into an existing tensor instead of
    building a fresh tensor from NumPy for every single sample.

    Args:
        X: Array-like of features, indexed by sample along the first axis.
        y: Array-like of labels with the same length as ``X``.

    Raises:
        ValueError: If ``X`` and ``y`` do not contain the same number of rows.

    Note:
        Items returned by ``__getitem__`` are views into the stored tensors;
        do not mutate them in place.
    """

    def __init__(self, X, y):
        # as_tensor avoids an extra copy when the input is already float32.
        self.X = torch.as_tensor(X, dtype=torch.float32)
        self.y = torch.as_tensor(y, dtype=torch.float32)
        if len(self.X) != len(self.y):
            raise ValueError(
                f"X and y must have the same length, got {len(self.X)} and {len(self.y)}"
            )

    def __len__(self):
        """Return the number of samples."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair at position ``idx`` as float32 tensors."""
        return self.X[idx], self.y[idx]

# One dataset per split.
train_ds = HiggsDataset(X_train, y_train)
val_ds   = HiggsDataset(X_val, y_val)
test_ds  = HiggsDataset(X_test, y_test)

# Shared loader settings. Page-locked (pinned) host memory only helps when
# batches are copied to a CUDA device, and requesting it without CUDA makes
# the DataLoader emit a warning, so it is enabled only when CUDA is present.
BATCH_SIZE = 8192
PIN_MEMORY = torch.cuda.is_available()

# Only the training loader shuffles; validation/test keep a fixed order.
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=0, pin_memory=PIN_MEMORY)
val_loader   = DataLoader(val_ds,   batch_size=BATCH_SIZE, shuffle=False, num_workers=0, pin_memory=PIN_MEMORY)
test_loader  = DataLoader(test_ds,  batch_size=BATCH_SIZE, shuffle=False, num_workers=0, pin_memory=PIN_MEMORY)

In [None]:
# 4. Model Definition

class ResidualSwiGLUBlock(nn.Module):
    """Pre-norm gated residual block with a learnable per-feature output scale.

    For an input ``x`` whose last dimension is ``dim`` the block returns::

        x + fc2(a * sigmoid(b)) * layer_scale

    where ``a`` and ``b`` are the two halves of ``fc1(LayerNorm(x))`` split
    along the last dimension. Note that the gate applied here is a plain
    sigmoid on ``b`` (the GLU form).

    Args:
        dim: Size of the feature (last) dimension; input and output match.
    """

    def __init__(self, dim):
        super().__init__()

        self.norm = nn.LayerNorm(dim)
        # A single projection yields both the value half and the gate half.
        self.fc1 = nn.Linear(dim, 2 * dim)
        self.fc2 = nn.Linear(dim, dim)

        # LayerScale: per-feature multiplier on the branch output, starting at 0.1.
        self.layer_scale = nn.Parameter(torch.full((dim,), 0.1))

    def forward(self, x):
        """Apply norm -> gated projection -> output projection -> scale, then add ``x``."""
        normed = self.norm(x)

        value, gate = torch.chunk(self.fc1(normed), 2, dim=-1)
        branch = self.fc2(value * gate.sigmoid())

        return x + branch * self.layer_scale


class DeepMLP(nn.Module):
    """Residual MLP producing one logit per input row.

    Layout: ``Linear(input_dim, hidden) -> LayerNorm`` followed by ``depth``
    ``ResidualSwiGLUBlock(hidden)`` blocks and a final ``Linear(hidden, 1)``.

    Args:
        input_dim: Number of input features per sample.
        hidden: Width of the hidden representation (default 1536).
        depth: Number of residual blocks (default 10).
    """

    def __init__(self, input_dim, hidden=1536, depth=10):
        super().__init__()

        self.input_layer = nn.Sequential(
            nn.Linear(input_dim, hidden),
            nn.LayerNorm(hidden),
        )

        self.blocks = nn.Sequential(
            *[ResidualSwiGLUBlock(hidden) for _ in range(depth)]
        )

        self.output_layer = nn.Linear(hidden, 1)

    def forward(self, x):
        """Return raw logits with the trailing size-1 dimension removed.

        A ``(batch, input_dim)`` input yields a ``(batch,)`` tensor; squeezing
        the last axis (rather than axis 1) also lets a single unbatched
        ``(input_dim,)`` sample through, yielding a scalar tensor.
        """
        x = self.input_layer(x)
        x = self.blocks(x)
        x = self.output_layer(x)
        return x.squeeze(-1)

In [None]:
# 5. Device + Model Initialization

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU so the
# notebook still runs end to end. Kept as a plain string because the same
# value is passed as `device_type` to autocast and as `device` to GradScaler.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using:", device)

# Seed PyTorch's global RNG before the model is built so that weight
# initialisation is repeatable from a fresh kernel.
torch.manual_seed(42)

model = DeepMLP(X_train.shape[1]).to(device)

# Binary classification on raw logits; the model emits one logit per row.
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4, weight_decay=1e-4)

print("Device:", device)
print("Model on:", next(model.parameters()).device)

Using: cuda
Device: cuda
Model on: cuda:0


In [8]:
# 6. LR Warmup + Cosine Annealing

EPOCHS = 50
# The scheduler is stepped once per batch, so the horizon is counted in batches.
total_steps = len(train_loader) * EPOCHS
warmup_steps = int(total_steps * 0.2)

def lr_lambda(step):
    """Return the multiplier applied to the base learning rate at ``step``.

    The factor rises linearly from 0 to 1 over the first ``warmup_steps``
    steps, then follows a half cosine from 1 down to 0 at ``total_steps``.
    Past ``total_steps`` it stays at 0 instead of climbing back up the cosine,
    which matters if the scheduler keeps being stepped (for example when the
    training cell is re-run without re-creating the scheduler).
    """
    if step < warmup_steps:
        return step / warmup_steps
    # max(1, ...) avoids a zero division when there is no decay phase at all.
    decay_steps = max(1, total_steps - warmup_steps)
    progress = min(1.0, (step - warmup_steps) / decay_steps)
    return float(0.5 * (1 + np.cos(np.pi * progress)))

scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)

In [9]:
# 7. Model EMA

class EMA:
    """Exponential moving average over every tensor in a model's ``state_dict``.

    ``shadow`` maps each state-dict key to its averaged tensor and can be
    saved with ``torch.save`` or loaded into a model with ``apply``.

    Args:
        model: Module whose current ``state_dict`` seeds the average.
        decay: Weight kept from the previous average on each update; the new
            value contributes ``1 - decay``.

    Note:
        ``update`` modifies the tensors stored in ``shadow`` in place, so keep
        a ``clone()`` if an earlier snapshot has to be preserved in memory.
    """

    def __init__(self, model, decay=0.999):
        self.decay = decay
        self.shadow = {k: v.clone().detach() for k, v in model.state_dict().items()}

    @torch.no_grad()
    def update(self, model):
        """Blend the model's current state into the running average.

        Floating-point tensors are updated as
        ``shadow = decay * shadow + (1 - decay) * value`` without allocating a
        new tensor per entry. Non-floating entries (integer counters such as
        BatchNorm's ``num_batches_tracked``) are copied verbatim, because
        averaging them would silently turn them into float tensors.
        """
        for k, v in model.state_dict().items():
            avg = self.shadow[k]
            if avg.is_floating_point():
                avg.mul_(self.decay).add_(v.detach(), alpha=1.0 - self.decay)
            else:
                avg.copy_(v)

    def apply(self, model):
        """Load the averaged tensors into ``model`` via ``load_state_dict``."""
        model.load_state_dict(self.shadow)

# Start the moving average from the model's current state, using the EMA
# class's default decay.
ema = EMA(model)

In [10]:
# 8. GradScaler (CPU-safe)

# Gradient scaler for the mixed-precision training loop, created for the same
# device string the model was moved to.
scaler = GradScaler(device=device)

# 9. Validation Function

def evaluate_auc(loader):
    """Return the ROC-AUC of the global ``model`` over every batch in ``loader``.

    Puts ``model`` in eval mode (it is not switched back here) and runs
    inference without gradients under autocast on the global ``device``.

    Args:
        loader: Iterable yielding ``(features, labels)`` tensor batches.

    Returns:
        The ROC-AUC computed by ``roc_auc_score`` on the concatenated labels
        and predicted probabilities.
    """
    model.eval()
    preds, trues = [], []

    with torch.no_grad(), autocast(device_type=device):
        for xb, yb in loader:
            xb = xb.to(device)

            logits = model(xb)
            # Autocast can hand back reduced-precision logits (fp16/bf16).
            # Upcast before the sigmoid: low-precision probabilities collapse
            # into ties that distort the ranking metric, and bf16 tensors
            # cannot be converted to NumPy at all.
            prob = torch.sigmoid(logits.float())

            preds.append(prob.cpu().numpy())
            # Labels are only consumed on the host, so they never need to
            # visit the device.
            trues.append(yb.cpu().numpy())

    return roc_auc_score(np.concatenate(trues), np.concatenate(preds))

In [11]:
# 10. Training Loop with Early Stopping

PATIENCE = 2   # epochs without a validation-AUC improvement before stopping
best_auc = 0
counter = 0    # consecutive epochs without improvement
step = 0       # number of optimizer steps taken so far

for epoch in range(EPOCHS):
    model.train()
    loop = tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}")

    for xb, yb in loop:
        # non_blocking lets the host-to-device copy overlap with other work
        # when the loader provides pinned memory; otherwise it behaves like a
        # normal synchronous copy.
        xb = xb.to(device, non_blocking=True)
        yb = yb.to(device, non_blocking=True)

        optimizer.zero_grad()

        # Forward pass and loss under mixed precision.
        with autocast(device_type=device):
            logits = model(xb)
            loss = criterion(logits, yb)

        # Scaled backward, optimizer step, then refresh the scale factor.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        # The LR schedule is defined per batch, so it advances every step.
        scheduler.step()

        # Update EMA after optimizer step
        ema.update(model)
        step += 1

        loop.set_postfix(loss=loss.item())

    # Validation AUC is measured on the raw (non-EMA) weights currently in `model`.
    val_auc = evaluate_auc(val_loader)
    print(f"Epoch {epoch+1} Validation AUC = {val_auc:.5f}")

    if val_auc > best_auc:
        best_auc = val_auc
        counter = 0

        # Save both raw weights AND EMA weights
        # NOTE(review): the checkpoint is selected on the raw-weight AUC above,
        # while the final evaluation cell loads the EMA file - confirm this
        # selection criterion is intended.
        torch.save(model.state_dict(), "best_raw.pt")
        torch.save(ema.shadow, "best_ema.pt")

        print("New Best Model")
    else:
        counter += 1
        print(f"  No improvement ({counter}/{PATIENCE})")
        if counter >= PATIENCE:
            print("Early stopping!")
            break

Epoch 1/50: 100%|██████████| 939/939 [05:47<00:00,  2.70it/s, loss=0.591]


Epoch 1 Validation AUC = 0.75037
New Best Model


Epoch 2/50: 100%|██████████| 939/939 [05:55<00:00,  2.64it/s, loss=0.559]


Epoch 2 Validation AUC = 0.79634
New Best Model


Epoch 3/50: 100%|██████████| 939/939 [05:54<00:00,  2.65it/s, loss=0.5]  


Epoch 3 Validation AUC = 0.82914
New Best Model


Epoch 4/50: 100%|██████████| 939/939 [05:50<00:00,  2.68it/s, loss=0.481]


Epoch 4 Validation AUC = 0.84157
New Best Model


Epoch 5/50: 100%|██████████| 939/939 [05:39<00:00,  2.77it/s, loss=0.492]


Epoch 5 Validation AUC = 0.84903
New Best Model


Epoch 6/50: 100%|██████████| 939/939 [05:46<00:00,  2.71it/s, loss=0.478]


Epoch 6 Validation AUC = 0.85579
New Best Model


Epoch 7/50: 100%|██████████| 939/939 [05:59<00:00,  2.61it/s, loss=0.47] 


Epoch 7 Validation AUC = 0.85926
New Best Model


Epoch 8/50: 100%|██████████| 939/939 [05:57<00:00,  2.63it/s, loss=0.449]


Epoch 8 Validation AUC = 0.86216
New Best Model


Epoch 9/50: 100%|██████████| 939/939 [05:49<00:00,  2.68it/s, loss=0.453]


Epoch 9 Validation AUC = 0.86698
New Best Model


Epoch 10/50: 100%|██████████| 939/939 [05:48<00:00,  2.69it/s, loss=0.443]


Epoch 10 Validation AUC = 0.87036
New Best Model


Epoch 11/50: 100%|██████████| 939/939 [05:46<00:00,  2.71it/s, loss=0.428]


Epoch 11 Validation AUC = 0.87283
New Best Model


Epoch 12/50: 100%|██████████| 939/939 [05:47<00:00,  2.71it/s, loss=0.44] 


Epoch 12 Validation AUC = 0.87522
New Best Model


Epoch 13/50: 100%|██████████| 939/939 [05:39<00:00,  2.77it/s, loss=0.415]


Epoch 13 Validation AUC = 0.87730
New Best Model


Epoch 14/50: 100%|██████████| 939/939 [05:39<00:00,  2.77it/s, loss=0.412]


Epoch 14 Validation AUC = 0.87798
New Best Model


Epoch 15/50: 100%|██████████| 939/939 [05:41<00:00,  2.75it/s, loss=0.407]


Epoch 15 Validation AUC = 0.87871
New Best Model


Epoch 16/50: 100%|██████████| 939/939 [05:44<00:00,  2.73it/s, loss=0.399]


Epoch 16 Validation AUC = 0.87779
  No improvement (1/2)


Epoch 17/50: 100%|██████████| 939/939 [05:46<00:00,  2.71it/s, loss=0.408]


Epoch 17 Validation AUC = 0.87797
  No improvement (2/2)
Early stopping!


In [None]:
# 11. Final Evaluation (EMA weights)

# The checkpoint is a plain dict of tensors, so it can be loaded with
# weights_only=True (no arbitrary unpickling); map_location places the tensors
# on the active device even if they were saved from a different one.
ema_weights = torch.load("best_ema.pt", map_location=device, weights_only=True)
model.load_state_dict(ema_weights)

model.eval()
test_preds, test_trues = [], []

with torch.no_grad(), autocast(device_type=device):
    for xb, yb in test_loader:
        xb = xb.to(device)
        # Upcast the autocast output before the sigmoid so the reported
        # metrics are computed from full-precision probabilities (and so the
        # tensor is always convertible to NumPy).
        prob = torch.sigmoid(model(xb).float())
        test_preds.append(prob.cpu().numpy())
        # Labels are only used on the host.
        test_trues.append(yb.cpu().numpy())

test_preds = np.concatenate(test_preds)
test_trues = np.concatenate(test_trues)

auc = roc_auc_score(test_trues, test_preds)
pr_auc = average_precision_score(test_trues, test_preds)
# Accuracy uses a fixed 0.5 probability threshold.
acc = accuracy_score(test_trues, (test_preds > 0.5).astype(int))

print("\n================= FINAL MLP RESULTS =================")
print("ROC-AUC:", round(auc, 5))
print("PR-AUC:", round(pr_auc, 5))
print("Accuracy:", round(acc, 5))
print("====================================================")

# Create the output directory first so a missing folder cannot fail the cell
# after the whole run has finished.
os.makedirs("Models", exist_ok=True)
torch.save(model.state_dict(), "Models/DNN.pt")


ROC-AUC: 0.88525
PR-AUC: 0.89509
Accuracy: 0.79986
