In [None]:
# ===============================================================
# HIGGS BOSON — XGBoost only
# ===============================================================

import os, math, zipfile
import numpy as np
import pandas as pd
import random
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
import xgboost as xgb

# ---------------- Settings ----------------
SEED = 42      # shared seed for every random_state / RNG used in this script
N_FOLDS = 5    # number of cross-validation folds

# Seed Python's and NumPy's global generators (in that order).
for _seed_fn in (random.seed, np.random.seed):
    _seed_fn(SEED)

# ---------------- Kaggle File Handling ----------------
# The inputs are zip archives: unpack every archive that exists into the
# working directory and report (without failing) the ones that do not.
extract_dir = "/kaggle/working/higgs_data/"
zip_files = {
    "train": "/kaggle/input/higgs-boson/training.zip",
    "test":  "/kaggle/input/higgs-boson/test.zip",
    "submission": "/kaggle/input/higgs-boson/random_submission.zip"
}
os.makedirs(extract_dir, exist_ok=True)

for label, archive_path in zip_files.items():
    if not os.path.exists(archive_path):
        print(f"{label} zip not found at {archive_path}")
        continue
    with zipfile.ZipFile(archive_path, "r") as archive:
        archive.extractall(extract_dir)
        print(f"{label} unzipped.")

# Paths of the extracted CSVs and of the submission file written at the end.
TRAIN_CSV = os.path.join(extract_dir, "training.csv")
TEST_CSV = os.path.join(extract_dir, "test.csv")
OUT_SUB = "/kaggle/working/submission.csv"

# ---------------- AMS Metric ----------------
def ams_score(s, b):
    """Return the AMS value for signal term ``s`` and background term ``b``.

    Computes ``sqrt(2 * ((s + b + b_reg) * ln(1 + s / (b + b_reg)) - s))``
    with a fixed regularisation term ``b_reg = 10``.

    Args:
        s: Signal term (the caller passes a sum of weights of selected
            true-signal events).
        b: Background term (the caller passes a sum of weights of selected
            true-background events).

    Returns:
        The square root of the radicand when it is strictly positive,
        otherwise ``0.0``.
    """
    regulariser = 10.0
    total = s + b + regulariser
    ratio = s / (b + regulariser)
    radicand = 2.0 * (total * math.log(1.0 + ratio) - s)
    # "not > 0" (rather than "<= 0") so a NaN radicand also yields 0.0.
    if not radicand > 0:
        return 0.0
    return math.sqrt(radicand)

# ---------------- Load Data ----------------
train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV)

# -999.0 is treated as the "missing" sentinel: turn it into NaN in both frames.
for frame in (train_df, test_df):
    frame.replace(-999.0, np.nan, inplace=True)

# For every input column that has gaps in the TRAINING data, add a 0/1
# "<col>_miss" indicator to both frames (the test flag reflects test NaNs).
non_feature_cols = ('EventId', 'Weight', 'Label')
cols_with_gaps = [
    col for col in train_df.columns
    if col not in non_feature_cols and train_df[col].isna().any()
]
for col in cols_with_gaps:
    flag_name = col + '_miss'
    train_df[flag_name] = train_df[col].isna().astype(int)
    test_df[flag_name] = test_df[col].isna().astype(int)

# Median-impute every numeric column except the event weight.
numeric_cols = [
    col for col in train_df.select_dtypes(include=np.number).columns
    if col != "Weight"
]
train_numeric = train_df[numeric_cols]
train_df[numeric_cols] = train_numeric.fillna(train_numeric.median())

# The test frame is filled with medians taken from the (already imputed)
# training frame, restricted to the columns the test frame actually has.
num_cols_test = [col for col in numeric_cols if col in test_df.columns]
train_medians = train_df[num_cols_test].median()
test_df[num_cols_test] = test_df[num_cols_test].fillna(train_medians)

# Add derived features (same as original)
# Each entry is (new column, numerator column, denominator column). A ratio is
# only built when both source columns are present in the training frame; the
# denominator is offset by 1e-6, presumably to avoid dividing by exactly zero.
ratio_specs = (
    ('mass_ratio', 'DER_mass_MMC', 'DER_mass_vis'),
    ('pt_ratio', 'PRI_tau_pt', 'PRI_met'),
)
for new_col, numer_col, denom_col in ratio_specs:
    if not {numer_col, denom_col}.issubset(train_df.columns):
        continue
    for frame in (train_df, test_df):
        frame[new_col] = frame[numer_col]/(frame[denom_col]+1e-6)

# Targets, weights, ids
# Label 's' -> 1, anything else -> 0.
y = (train_df['Label'] == 's').astype(int).values
weights = train_df['Weight'].values
event_ids_test = test_df['EventId'].values

# Features
train_features = train_df.drop(columns=['EventId','Weight','Label'], errors='ignore')

# The matrices below are handed over as bare ``.values`` arrays, so columns are
# matched purely by position. Select the test columns by the training feature
# list so both matrices are guaranteed to share the same column order, and fail
# with a clear message if the test data lacks a feature.
feature_cols = list(train_features.columns)
missing_in_test = [col for col in feature_cols if col not in test_df.columns]
if missing_in_test:
    raise ValueError(f"Test data is missing feature columns: {missing_in_test}")
test_features = test_df[feature_cols]

# Scaling (same as original)
# The scaler is fitted on the training features only and re-used for the test set.
scaler = StandardScaler()
X = scaler.fit_transform(train_features.values.astype(np.float32))
X_test = scaler.transform(test_features.values.astype(np.float32))

# Shuffled, label-stratified folds with a fixed seed.
kf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

# ---------------- Containers ----------------
oof_xgb = np.zeros(len(X))   # out-of-fold signal probability for every training row
test_pred_xgb_folds = []     # one test-set probability vector per fold

# ---------------- XGBoost (only model) ----------------
print("\n=== XGBoost ===")
for fold, (tr_idx, va_idx) in enumerate(kf.split(X, y)):
    print(f"\n--- Fold {fold+1}/{N_FOLDS} ---")
    X_tr, X_va = X[tr_idx], X[va_idx]
    y_tr, y_va = y[tr_idx], y[va_idx]
    w_tr, w_va = weights[tr_idx], weights[va_idx]

    # `early_stopping_rounds` is configured on the estimator: passing it to
    # fit() has been deprecated since XGBoost 1.6 and is rejected by 2.0+.
    # `use_label_encoder` is omitted because it is deprecated and ignored.
    xgbm = xgb.XGBClassifier(
        n_estimators=1200,
        learning_rate=0.01,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_lambda=1.0,
        tree_method='hist',
        eval_metric='auc',
        early_stopping_rounds=50,
        random_state=SEED
    )

    # Train with the event weights and monitor the validation fold for early
    # stopping. The validation weights are passed as well so the monitored AUC
    # is weighted the same way as the training objective.
    xgbm.fit(
        X_tr, y_tr,
        sample_weight=w_tr,
        eval_set=[(X_va, y_va)],
        sample_weight_eval_set=[w_va],
        verbose=100
    )

    # OOF and test predictions (column 1 = probability of the positive class)
    oof_xgb[va_idx] = xgbm.predict_proba(X_va)[:,1]
    test_pred_xgb_folds.append(xgbm.predict_proba(X_test)[:,1])

# Average test predictions across folds
test_pred_xgb = np.mean(test_pred_xgb_folds, axis=0)

# ---------------- Evaluation & Threshold search (AMS) ----------------
# Scan 99 evenly spaced cut-offs in [0.01, 0.99]; for each one, sum the weights
# of true-signal and true-background rows whose out-of-fold score exceeds the
# cut-off, and keep the first cut-off that gives the highest AMS.
thr_range = np.linspace(0.01, 0.99, 99)
is_signal = (y == 1)
is_background = (y == 0)

best_thr, best_ams = 0.5, -1
for thr in thr_range:
    selected = oof_xgb > thr
    sig_weight = weights[is_signal & selected].sum()
    bkg_weight = weights[is_background & selected].sum()
    score = ams_score(sig_weight, bkg_weight)
    if score > best_ams:
        best_ams = score
        best_thr = thr

print(f"\nBest AMS on XGBoost OOF = {best_ams:.6f} @ thr={best_thr:.4f}")
print("CV AUC (OOF):", roc_auc_score(y, oof_xgb))

# ---------------- Submission ----------------
print("\nWriting submission...")

# Rank of each test event by averaged score: 1 = lowest score, N = highest.
ascending_order = np.argsort(test_pred_xgb)
rankorder = np.argsort(ascending_order) + 1

# Events scoring above the threshold chosen on the OOF predictions are 's'.
classes = np.where(test_pred_xgb > best_thr, 's', 'b')

submission_columns = {
    "EventId": event_ids_test,
    "RankOrder": rankorder,
    "Class": classes,
}
sub = pd.DataFrame(submission_columns)
sub.to_csv(OUT_SUB, index=False)

print("Saved submission to:", OUT_SUB)
print("Done.")


train unzipped.
test unzipped.
submission unzipped.

=== XGBoost ===

--- Fold 1/5 ---




[0]	validation_0-auc:0.77143
[100]	validation_0-auc:0.84352
[200]	validation_0-auc:0.85335
[300]	validation_0-auc:0.86197
[400]	validation_0-auc:0.86924
[500]	validation_0-auc:0.87583
[600]	validation_0-auc:0.88061
[700]	validation_0-auc:0.88304
[800]	validation_0-auc:0.88533
[900]	validation_0-auc:0.88703
[1000]	validation_0-auc:0.88856
[1100]	validation_0-auc:0.88967
[1199]	validation_0-auc:0.89057

--- Fold 2/5 ---
[0]	validation_0-auc:0.76911
[100]	validation_0-auc:0.84912
[200]	validation_0-auc:0.85767
[300]	validation_0-auc:0.86524
[400]	validation_0-auc:0.87303
[500]	validation_0-auc:0.87891
[600]	validation_0-auc:0.88312
[700]	validation_0-auc:0.88572
[800]	validation_0-auc:0.88785
[900]	validation_0-auc:0.88983
[1000]	validation_0-auc:0.89128
[1100]	validation_0-auc:0.89239
[1199]	validation_0-auc:0.89330

--- Fold 3/5 ---
[0]	validation_0-auc:0.77058
[100]	validation_0-auc:0.84759
[200]	validation_0-auc:0.85654
[300]	validation_0-auc:0.86445
[400]	validation_0-auc:0.87232
[50