# In [None]:
# ===============================================================
# HIGGS BOSON — Autoencoder (PyTorch) + XGBoost (Classification)
# ===============================================================

import os, math, zipfile
import numpy as np
import pandas as pd
import random
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import xgboost as xgb
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# ---------------- Settings ----------------
# Use the GPU when PyTorch reports one is available, otherwise the CPU.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# Seed PyTorch, NumPy and Python's `random` module with the same value.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

# Autoencoder training hyper-parameters.
BATCH_SIZE = 512            # mini-batch size for AE training and batched encoding
EPOCHS = 40                 # number of AE training epochs per fold
LR = 7e-4                   # AdamW learning rate
WEIGHT_DECAY = 3e-5         # AdamW weight decay
PROJ_DIM = 64               # autoencoder bottleneck dim

# Number of stratified cross-validation folds.
N_FOLDS = 5

# ---------------- Kaggle File Handling ----------------
# Input archives, keyed by the short label used in the progress messages.
zip_files = {
    "train": "/kaggle/input/higgs-boson/training.zip",
    "test": "/kaggle/input/higgs-boson/test.zip",
    "submission": "/kaggle/input/higgs-boson/random_submission.zip"
}

# Every archive is unpacked into this single working directory.
extract_dir = "/kaggle/working/higgs_data/"
os.makedirs(extract_dir, exist_ok=True)

for key, path in zip_files.items():
    # A missing archive is reported and skipped rather than treated as fatal.
    if not os.path.exists(path):
        print(f"{key} zip not found at {path}")
        continue
    with zipfile.ZipFile(path, "r") as archive:
        archive.extractall(extract_dir)
    print(f"{key} unzipped.")

# CSV locations inside the extraction directory, and the submission output path.
TRAIN_CSV = os.path.join(extract_dir, "training.csv")
TEST_CSV = os.path.join(extract_dir, "test.csv")
OUT_SUB = "/kaggle/working/submission.csv"

# ---------------- AMS Metric ----------------
def ams_score(s, b):
    """Return the AMS value for weighted signal ``s`` and background ``b``.

    Computes ``sqrt(2 * ((s + b + b_reg) * ln(1 + s / (b + b_reg)) - s))``
    with a fixed regularisation term ``b_reg = 10.0``.

    Args:
        s: Sum of weights of selected signal events.
        b: Sum of weights of selected background events.

    Returns:
        The square root of the expression above, or ``0.0`` when the
        radicand is not strictly positive.
    """
    b_reg = 10.0
    signal_ratio = s / (b + b_reg)
    radicand = 2.0 * ((s + b + b_reg) * math.log(1.0 + signal_ratio) - s)
    if radicand > 0:
        return math.sqrt(radicand)
    return 0.0

# ---------------- Dataset ----------------
class HiggsDataset(Dataset):
    """Map-style dataset yielding ``(features, label, weight)`` tensor triples.

    Args:
        X: Feature array; converted to ``float32`` on construction.
        y: Optional per-row labels. When omitted the dataset is treated as
            unlabelled.
        sample_weight: Optional per-row weights, only consulted when ``y``
            is given.
    """

    def __init__(self, X, y=None, sample_weight=None):
        self.X = X.astype(np.float32)
        self.y = y
        self.sample_weight = sample_weight

    def __len__(self):
        """Return the number of rows in the feature array."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the feature tensor, label tensor and weight tensor for ``idx``."""
        features = torch.tensor(self.X[idx], dtype=torch.float32)

        if self.y is None:
            # Unlabelled data: placeholder label -1 and weight 0.0.
            placeholder_label = torch.tensor(-1, dtype=torch.long)
            return features, placeholder_label, torch.tensor(0.0)

        label = torch.tensor(int(self.y[idx]), dtype=torch.long)
        # Rows default to unit weight when no weights were supplied.
        if self.sample_weight is None:
            weight_value = 1.0
        else:
            weight_value = float(self.sample_weight[idx])
        return features, label, torch.tensor(weight_value)

# ---------------- Autoencoder ----------------
class Autoencoder(nn.Module):
    """Fully connected autoencoder with a mirrored encoder and decoder.

    The encoder maps ``n_features -> hidden_dims... -> bottleneck_dim`` and the
    decoder maps back through the hidden sizes in reverse order. ReLU follows
    every linear layer except the last one of each half.

    Args:
        n_features: Width of the input (and of the reconstruction).
        bottleneck_dim: Width of the latent representation.
        hidden_dims: Sizes of the hidden layers between input and bottleneck.
            Any sequence (list or tuple) is accepted; ``None`` selects the
            default ``[128, 64]``.
    """

    def __init__(self, n_features, bottleneck_dim=64, hidden_dims=None):
        super().__init__()
        # Avoid a mutable default argument and accept any sequence type.
        hidden = [128, 64] if hidden_dims is None else list(hidden_dims)
        self.encoder = self._build_mlp([n_features, *hidden, bottleneck_dim])
        # decoder symmetric
        self.decoder = self._build_mlp([bottleneck_dim, *reversed(hidden), n_features])

    @staticmethod
    def _build_mlp(dims):
        """Stack Linear layers over consecutive ``dims``, ReLU between but not after."""
        layers = []
        last = len(dims) - 2
        for i, (d_in, d_out) in enumerate(zip(dims[:-1], dims[1:])):
            layers.append(nn.Linear(d_in, d_out))
            if i < last:
                layers.append(nn.ReLU())
        return nn.Sequential(*layers)

    def forward(self, x):
        """Return ``(reconstruction, latent)`` for the input batch ``x``."""
        z = self.encoder(x)
        recon = self.decoder(z)
        return recon, z

    def encode(self, x):
        """Return only the latent (bottleneck) representation of ``x``."""
        return self.encoder(x)

# ---------------- Weighted Focal Loss (kept for reference but not used) ----------------
class WeightedFocalLoss(nn.Module):
    """Focal loss on top of cross-entropy with optional per-sample weights.

    Args:
        alpha: Stored on the module; ``forward`` does not currently apply it.
        gamma: Exponent of the ``(1 - p_t)`` modulating factor.
    """

    def __init__(self, alpha=0.75, gamma=2.0):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, logits, targets, sample_weight=None):
        """Return the scalar focal loss for a batch.

        Args:
            logits: Class scores accepted by ``F.cross_entropy``.
            targets: Class indices accepted by ``F.cross_entropy``.
            sample_weight: Optional per-sample weights. When given, the loss is
                the weighted sum divided by the weight total; when ``None`` it
                is the plain mean over the batch.
        """
        ce = F.cross_entropy(logits, targets, reduction='none')
        # exp(-CE) recovers the predicted probability of the true class.
        p_t = torch.exp(-ce)
        focal = ((1 - p_t) ** self.gamma) * ce
        if sample_weight is None:
            # Previously this path dereferenced None; fall back to a plain mean.
            return focal.mean()
        weighted = focal * sample_weight
        # Small epsilon guards against an all-zero weight vector.
        return weighted.sum() / (sample_weight.sum() + 1e-12)

# ---------------- Load Data ----------------
train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV)

# Values of -999.0 are treated as missing and converted to NaN.
train_df.replace(-999.0, np.nan, inplace=True)
test_df.replace(-999.0, np.nan, inplace=True)

# Add a 0/1 "<col>_miss" indicator for every feature that has gaps in the
# training data; the same indicator is created on the test frame.
for c in train_df.columns:
    if c in ['EventId','Weight','Label']: continue
    if train_df[c].isna().any():
        train_df[c+'_miss'] = train_df[c].isna().astype(int)
        test_df[c+'_miss'] = test_df[c].isna().astype(int)

# Impute remaining NaNs with the training-set median (also for the test frame).
numeric_cols = [c for c in train_df.select_dtypes(include=np.number).columns if c != "Weight"]
train_df[numeric_cols] = train_df[numeric_cols].fillna(train_df[numeric_cols].median())
num_cols_test = [c for c in numeric_cols if c in test_df.columns]
test_df[num_cols_test] = test_df[num_cols_test].fillna(train_df[num_cols_test].median())

# Engineered ratio features; the 1e-6 offset avoids division by exactly zero.
if {'DER_mass_MMC','DER_mass_vis'}.issubset(train_df.columns):
    train_df['mass_ratio'] = train_df['DER_mass_MMC']/(train_df['DER_mass_vis']+1e-6)
    test_df['mass_ratio'] = test_df['DER_mass_MMC']/(test_df['DER_mass_vis']+1e-6)
if {'PRI_tau_pt','PRI_met'}.issubset(train_df.columns):
    train_df['pt_ratio'] = train_df['PRI_tau_pt']/(train_df['PRI_met']+1e-6)
    test_df['pt_ratio'] = test_df['PRI_tau_pt']/(test_df['PRI_met']+1e-6)

# Binary target: 1 for label 's', 0 otherwise.
y = (train_df['Label'] == 's').astype(int).values
weights = train_df['Weight'].values
event_ids_test = test_df['EventId'].values

train_features = train_df.drop(columns=['EventId','Weight','Label'], errors='ignore')

# The matrices below are plain NumPy arrays, so train and test columns must be
# matched by name here; relying on implicit column order would silently feed
# misaligned features to the scaler and the models.
missing_in_test = [c for c in train_features.columns if c not in test_df.columns]
if missing_in_test:
    raise ValueError(f"Test data is missing feature columns: {missing_in_test}")
test_features = test_df[train_features.columns]

# Standardise using statistics fitted on the training features only.
scaler = StandardScaler()
X = scaler.fit_transform(train_features.values.astype(np.float32))
X_test = scaler.transform(test_features.values.astype(np.float32))

kf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

# ---------------- Training (AE per-fold + XGBoost) ----------------
def _encode_in_batches(model, data, batch_size, device):
    """Return the bottleneck embeddings of ``data`` as a single NumPy array.

    Rows are pushed through ``model.encode`` in slices of ``batch_size`` with
    gradient tracking disabled, and the per-slice results are stacked.
    """
    chunks = []
    with torch.no_grad():
        for start in range(0, len(data), batch_size):
            xb = torch.tensor(data[start:start + batch_size], dtype=torch.float32).to(device)
            chunks.append(model.encode(xb).cpu().numpy())
    return np.vstack(chunks)


# Out-of-fold predictions for the training rows and fold-averaged test predictions.
oof_xgb = np.zeros(len(X))
test_pred_xgb = np.zeros(len(X_test))

tsne_before_list, tsne_after_list, tsne_labels = [], [], []

for fold, (tr_idx, va_idx) in enumerate(kf.split(X, y)):
    print(f"\n===== Fold {fold+1}/{N_FOLDS} =====")
    X_tr, X_va = X[tr_idx], X[va_idx]
    y_tr, y_va = y[tr_idx], y[va_idx]
    w_tr, w_va = weights[tr_idx], weights[va_idx]

    # Dataloaders for AE training (unsupervised)
    tr_loader = DataLoader(HiggsDataset(X_tr), batch_size=BATCH_SIZE, shuffle=True)
    # A fresh autoencoder is trained for every fold.
    ae = Autoencoder(n_features=X.shape[1], bottleneck_dim=PROJ_DIM, hidden_dims=[128, 64]).to(DEVICE)

    optimizer_ae = torch.optim.AdamW(ae.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
    # Cosine schedule over the whole run; stepped once per epoch below.
    scheduler_ae = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer_ae, T_max=EPOCHS)

    # Raw validation features and labels are kept alongside the latent ones
    # collected further down (the labels are used by the t-SNE plot).
    tsne_before_list.append(X_va.copy())
    tsne_labels.append(y_va)

    # AE training: minimise the reconstruction MSE on the training fold.
    ae.train()
    for epoch in range(EPOCHS):
        epoch_loss = 0.0
        for xb, _, _ in tr_loader:
            xb = xb.to(DEVICE)
            optimizer_ae.zero_grad()
            recon, _ = ae(xb)
            loss = F.mse_loss(recon, xb)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(ae.parameters(), 5.0)
            optimizer_ae.step()
            # Weight by batch size so the epoch figure is a per-sample mean.
            epoch_loss += loss.item() * xb.size(0)
        scheduler_ae.step()
        # Report the first epoch and every tenth epoch.
        if (epoch+1) % 10 == 0 or epoch==0:
            print(f"  AE Epoch {epoch+1}/{EPOCHS} — loss: {epoch_loss / len(X_tr):.6f}")

    ae.eval()
    # encode train/val/test to get bottleneck features
    X_tr_lat = _encode_in_batches(ae, X_tr, BATCH_SIZE, DEVICE)
    X_va_lat = _encode_in_batches(ae, X_va, BATCH_SIZE, DEVICE)
    X_test_lat = _encode_in_batches(ae, X_test, BATCH_SIZE, DEVICE)

    tsne_after_list.append(X_va_lat.copy())

    # ---------------- XGBoost classifier on latent features ----------------
    # `early_stopping_rounds` is configured on the estimator: passing it to
    # `fit()` is deprecated since XGBoost 1.6 and rejected by 2.x. The obsolete
    # `use_label_encoder` flag is dropped for the same reason.
    xgb_clf = xgb.XGBClassifier(
        n_estimators=2000,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        tree_method='auto',
        eval_metric='auc',
        early_stopping_rounds=50,
        random_state=SEED
    )

    # Fit with early stopping on the validation latent set.
    # NOTE: the same fold also provides the out-of-fold predictions below, so
    # the reported validation AUC is measured on data used to pick the stopping
    # round.
    xgb_clf.fit(
        X_tr_lat, y_tr,
        sample_weight=w_tr,
        eval_set=[(X_va_lat, y_va)],
        sample_weight_eval_set=[w_va],
        verbose=False
    )

    # validation preds
    val_probs = xgb_clf.predict_proba(X_va_lat)[:,1]
    oof_xgb[va_idx] = val_probs

    # test preds (average)
    test_pred_xgb += xgb_clf.predict_proba(X_test_lat)[:,1] / N_FOLDS

    val_auc = roc_auc_score(y_va, val_probs)
    print(f"Fold {fold+1} XGBoost val AUC: {val_auc:.5f}")

# ---------------- t-SNE Visualization (latent AFTER only) ----------------
# Stack the per-fold validation embeddings and their labels. Every training
# row lands in exactly one validation fold, so this embeds the full training
# set; expect t-SNE to be slow on large inputs.
X_after = np.vstack(tsne_after_list)
y_tsne = np.concatenate(tsne_labels)
print("Running t-SNE on validation latent embeddings (after AE training)...")
# The iteration count is left at the library default (1000): the keyword for it
# was renamed from `n_iter` to `max_iter` across scikit-learn releases, so
# omitting it keeps the same setting while working on both old and new versions.
tsne = TSNE(n_components=2, perplexity=50, random_state=SEED, init='pca')
X2d_after = tsne.fit_transform(X_after)
plt.figure(figsize=(7,6))
# Points are coloured by the binary label.
plt.scatter(X2d_after[:,0], X2d_after[:,1], c=y_tsne, s=6, alpha=0.7)
plt.title("t-SNE of Validation Latent Embeddings — AFTER (Autoencoder)")
plt.xlabel("TSNE-1"); plt.ylabel("TSNE-2")
plt.show()

# ---------------- AMS Optimization ----------------
# Scan 99 evenly spaced probability cut-offs and keep the one whose selected
# out-of-fold events give the highest AMS.
thr_range = np.linspace(0.01,0.99,99)
best_thr, best_ams = 0.5, -1

# Class masks do not depend on the threshold, so build them once.
signal_mask = (y == 1)
background_mask = (y == 0)

for t in thr_range:
    selected = oof_xgb > t
    s = weights[signal_mask & selected].sum()
    b = weights[background_mask & selected].sum()
    sc = ams_score(s, b)
    # Strict comparison keeps the lowest threshold among ties.
    if sc > best_ams:
        best_ams = sc
        best_thr = t

print(f"Best AMS on OOF = {best_ams:.3f} @ thr={best_thr:.4f}")

# ---------------- Submission ----------------
print("\nWriting submission...")

# Double argsort turns scores into 1-based ranks (the lowest score gets rank 1).
ascending_order = np.argsort(test_pred_xgb)
rankorder = np.argsort(ascending_order) + 1

# Events scoring above the chosen threshold are labelled 's', the rest 'b'.
predicted_signal = test_pred_xgb > best_thr
classes = np.where(predicted_signal, 's', 'b')

sub = pd.DataFrame({
    "EventId": event_ids_test,
    "RankOrder": rankorder,
    "Class": classes,
})
sub.to_csv(OUT_SUB, index=False)

print("Saved submission to:", OUT_SUB)
print("Final AMS:", best_ams)
