In [None]:
import os

import lightgbm as lgb
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.metrics import roc_auc_score, accuracy_score, average_precision_score
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset

# Seed torch's global RNG so autoencoder weight initialisation and DataLoader
# shuffling are repeatable across "Restart & Run All" (the data splits below
# already use a fixed random_state of 42).
SEED = 42
torch.manual_seed(SEED)

# Prefer the GPU when one is visible to torch; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

In [None]:
# 1. Load data

df = pd.read_csv("HIGGS_short.csv")

# Feature matrix: every column except "label", cast to float32 for torch.
X = df.drop(columns=["label"]).values.astype("float32")
# Target vector: the "label" column, kept in whatever dtype pandas parsed.
y = df["label"].values

In [None]:
# 2. Train/Val/Test Split (70/15/15)

# Stage 1: hold out 30% of the rows, stratified on the label.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, stratify=y, test_size=0.30, random_state=42
)

# Stage 2: cut the held-out 30% in half -> validation and test (15% each).
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, stratify=y_temp, test_size=0.50, random_state=42
)

print(f"Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")

In [None]:
# 3. Torch datasets

# Feature-only datasets: the autoencoder reconstructs its input, so no labels
# are attached and every batch is a 1-tuple `(xb,)`.
train_ds = TensorDataset(torch.tensor(X_train))
val_ds = TensorDataset(torch.tensor(X_val))
test_ds = TensorDataset(torch.tensor(X_test))

# Only the training loader shuffles; validation and test keep stored row order.
train_dl = DataLoader(dataset=train_ds, batch_size=1024, shuffle=True)
val_dl = DataLoader(dataset=val_ds, batch_size=2048, shuffle=False)
test_dl = DataLoader(dataset=test_ds, batch_size=2048, shuffle=False)

In [None]:
# 4. Autoencoder Model

class Autoencoder(nn.Module):
    """Symmetric fully-connected autoencoder with one hidden layer per side.

    Layout: input_dim -> hidden_dim -> latent_dim -> hidden_dim -> input_dim,
    with a ReLU after each hidden layer and no activation on the latent or
    output layers.

    Args:
        input_dim: Number of input (and reconstructed) features.
        latent_dim: Width of the bottleneck embedding.
        hidden_dim: Width of the hidden layer in both encoder and decoder.
            Defaults to 14, the value that was previously hard-coded, so
            existing call sites and saved state dicts keep working.
    """

    def __init__(self, input_dim=28, latent_dim=8, hidden_dim=14):
        super().__init__()
        # Sub-module names and positions are kept as-is so state_dict keys
        # ("encoder.0.*", "encoder.2.*", "decoder.0.*", "decoder.2.*") do not change.
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, latent_dim)
        )
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, input_dim)
        )

    def forward(self, x):
        """Encode `x`, decode it again, and return both results.

        Returns:
            A tuple `(recon, z)`: the decoder output and the encoder output.
        """
        z = self.encoder(x)
        recon = self.decoder(z)
        return recon, z

latent_dim = 8

# Size the input layer from the loaded feature matrix rather than a constant.
n_features = X.shape[1]
model = Autoencoder(input_dim=n_features, latent_dim=latent_dim)
model = model.to(device)

# Reconstruction objective and optimiser for the autoencoder.
loss_fn = nn.MSELoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

In [None]:
# 5. Train Autoencoder

EPOCHS = 20
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0.0  # batch losses weighted by batch size, summed over the epoch
    n_samples = 0     # number of rows seen this epoch
    for (xb,) in train_dl:
        xb = xb.to(device)

        optimizer.zero_grad()
        recon, _ = model(xb)
        loss = loss_fn(recon, xb)
        loss.backward()
        optimizer.step()

        # loss_fn (nn.MSELoss with its default reduction) yields a per-batch mean.
        # Weight it by the batch size so the final, possibly smaller, batch does
        # not count as much as a full one.
        batch_size = xb.size(0)
        total_loss += loss.item() * batch_size
        n_samples += batch_size

    # Report the mean loss per sample instead of a sum over batches, so the
    # number is comparable across batch sizes and dataset sizes.
    epoch_loss = total_loss / max(n_samples, 1)
    print(f"Epoch {epoch+1}/{EPOCHS}, Loss = {epoch_loss:.4f}")

In [None]:
# 6. Extract latent embeddings

def get_latent(loader):
    """Run every batch of `loader` through the global `model` and collect the embeddings.

    The model is put in eval mode and gradients are disabled. Each batch is
    expected to be a 1-tuple `(xb,)`; the second output of `model(xb)` is taken
    as the embedding.

    Returns:
        A NumPy array with the per-batch embeddings stacked vertically, in the
        order the loader yields them (so a shuffling loader gives shuffled rows).
    """
    model.eval()
    with torch.no_grad():
        chunks = [
            model(batch.to(device))[1].cpu().numpy()
            for (batch,) in loader
        ]
    return np.vstack(chunks)

# train_dl shuffles, so embeddings taken from it would come back in a random
# row order and no longer line up with X_train / y_train when they are stacked
# side by side later. Walk the training set in its stored order instead.
train_eval_dl = DataLoader(train_ds, batch_size=2048, shuffle=False)

Z_train = get_latent(train_eval_dl)
Z_val   = get_latent(val_dl)
Z_test  = get_latent(test_dl)

print("Latent shape:", Z_train.shape)

In [None]:
# 7. Combine original features + latent

# Append the latent columns to the right of the raw feature columns, split by split.
X_train_aug = np.concatenate([X_train, Z_train], axis=1)
X_val_aug = np.concatenate([X_val, Z_val], axis=1)
X_test_aug = np.concatenate([X_test, Z_test], axis=1)

In [None]:
# 8. Train LightGBM on augmented features

train_data = lgb.Dataset(X_train_aug, label=y_train)
val_data = lgb.Dataset(X_val_aug, label=y_val)

# Binary classifier scored by AUC on the validation set.
params = dict(
    objective="binary",
    metric="auc",
    learning_rate=0.05,
    num_leaves=64,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=5,
    max_depth=-1,
    min_data_in_leaf=50,
    verbose=-1,
)

# Early stopping with a 100-round window; evaluation logged every 100 rounds.
lgb_callbacks = [
    lgb.early_stopping(stopping_rounds=100),
    lgb.log_evaluation(period=100),
]

model_lgb = lgb.train(
    params,
    train_data,
    num_boost_round=2000,
    valid_sets=[val_data],
    callbacks=lgb_callbacks,
)

In [None]:
# 9. Evaluate

# Scores on the held-out test split, plus hard labels at a 0.5 threshold.
preds_proba = model_lgb.predict(X_test_aug)
preds = (preds_proba > 0.5).astype(int)

auc = roc_auc_score(y_test, preds_proba)
pr_auc = average_precision_score(y_test, preds_proba)
acc = accuracy_score(y_test, preds)

print("\n================= HYBRID RESULTS =================")
for metric_label, metric_value in (
    ("ROC-AUC:", auc),
    ("PR-AUC:", pr_auc),
    ("Accuracy:", acc),
):
    print(metric_label, round(metric_value, 5))
print("==================================================")

In [None]:
# 10. Save Model

# torch.save does not create missing parent directories, so make sure the
# target folder exists before writing the autoencoder weights.
# NOTE(review): only the autoencoder state dict is written here; the LightGBM
# booster (model_lgb) is not persisted by this cell.
MODEL_PATH = "Models/autoencoder_lgbm.pth"
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
torch.save(model.state_dict(), MODEL_PATH)