```load data```

In [1]:
import numpy as np
import pandas as pd
import os
import gc
import warnings

from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, roc_auc_score
from xgboost import XGBClassifier
from scipy.stats import rankdata
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier


train_df_raw = pd.read_csv("/kaggle/input/playground-series-s6e2/train.csv")
test_df = pd.read_csv("/kaggle/input/playground-series-s6e2/test.csv")

SAVE_DIR = "/kaggle/working/model_outputs"
os.makedirs(SAVE_DIR, exist_ok=True)

print(train_df_raw.shape, test_df.shape)

(630000, 15) (270000, 14)


In [2]:
warnings.filterwarnings("ignore")

try:
    import torch
    USE_GPU = torch.cuda.is_available()
except Exception:
    USE_GPU = False

print(f"Hardware: {'GPU' if USE_GPU else 'CPU'}")

Hardware: CPU


```Separate Features & Target```

In [3]:
ID_COL = "id"
TARGET_COL = "Heart Disease"
train_df = train_df_raw.copy()
train_df.drop('id', axis=1, inplace=True) # id is not used for model training and prediction
X = train_df.drop(columns=[TARGET_COL]).copy()

y_raw = train_df[TARGET_COL].copy()

# Convert target to 0/1
# Absence -> 0, Presence -> 1
y = y_raw.map({"Absence": 0, "Presence": 1}).astype(int)

X_test = test_df

# Drop ID column
if ID_COL in X.columns:
    X = X.drop(columns=[ID_COL])
if ID_COL in X_test.columns:
    X_test = X_test.drop(columns=[ID_COL])

print("X shape:", X.shape)
print("X_test shape:", X_test.shape)
print("Target distribution:")
print(y.value_counts())

X shape: (630000, 13)
X_test shape: (270000, 13)
Target distribution:
Heart Disease
0    347546
1    282454
Name: count, dtype: int64


```Identify Column```

In [4]:
cat_cols = [
    "Sex",
    "Chest pain type",
    "FBS over 120",
    "EKG results",
    "Exercise angina",
    "Slope of ST",
    "Number of vessels fluro",
    "Thallium"
]

num_cols = [
    "Age",
    "BP",
    "Cholesterol",
    "Max HR",
    "ST depression"
]
extra_cat_cols = ["Chol_bin", "Age_bin", "ST_bin"]

```Frequency Encoding```

In [5]:
SEEDS = [42, 202, 777, 1001, 2023]
# SEEDS = [42]
# N_SPLITS = 3
N_SPLITS = 10
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

In [6]:
def make_freq_features(df_train, df_test, cols):
    df_all = pd.concat([df_train[cols], df_test[cols]], axis=0, ignore_index=True)
    tr_out = pd.DataFrame(index=df_train.index)
    te_out = pd.DataFrame(index=df_test.index)

    for c in cols:
        freqs = df_all[c].value_counts(normalize=True)
        tr_out[f"{c}_freq"] = df_train[c].map(freqs).astype(np.float32)
        te_out[f"{c}_freq"] = df_test[c].map(freqs).astype(np.float32)

    return tr_out, te_out

train_freq, test_freq = make_freq_features(train_df, test_df, cat_cols + num_cols)

```K-fold target mean encoding```

In [7]:
def make_target_mean_features(df_train, y, df_test, cols, n_splits=N_SPLITS, seed=42, alpha=50):
    tr_out = pd.DataFrame(index=df_train.index)
    te_out = pd.DataFrame(index=df_test.index)

    global_mean = float(y.mean())

    for c in cols:
        tr_feat = pd.Series(index=df_train.index, dtype=np.float32)

        for tr_idx, va_idx in skf.split(df_train, y):
            tr_part = df_train.iloc[tr_idx]
            y_part = y.iloc[tr_idx]

            stats = (
                pd.DataFrame({c: tr_part[c].values, "y": y_part.values})
                .groupby(c)["y"]
                .agg(["mean", "count"])
            )

            smooth = (stats["mean"] * stats["count"] + global_mean * alpha) / (stats["count"] + alpha)
            tr_feat.iloc[va_idx] = df_train.iloc[va_idx][c].map(smooth)

        full_stats = (
            pd.DataFrame({c: df_train[c].values, "y": y.values})
            .groupby(c)["y"]
            .agg(["mean", "count"])
        )

        full_smooth = (full_stats["mean"] * full_stats["count"] + global_mean * alpha) / (full_stats["count"] + alpha)

        tr_out[f"{c}_te"] = tr_feat.fillna(global_mean).astype(np.float32)
        te_out[f"{c}_te"] = df_test[c].map(full_smooth).fillna(global_mean).astype(np.float32)

    return tr_out, te_out

train_te, test_te = make_target_mean_features(train_df, y, test_df, cat_cols, n_splits=N_SPLITS, seed=42, alpha=50)


```Build train-test metrics```

In [8]:
X = pd.concat(
    [train_df[cat_cols + num_cols], train_freq, train_te],
    axis=1
)

X_test = pd.concat(
    [test_df[cat_cols + num_cols], test_freq, test_te],
    axis=1
)


```Feature Engineering```

In [9]:
# STEP 1: Age × Max HR
X["Age_x_MaxHR"] = X["Age"] * X["Max HR"]
X_test["Age_x_MaxHR"] = X_test["Age"] * X_test["Max HR"]

# STEP 2: ST depression × Exercise angina
X["ST_x_Angina"] = X["ST depression"] * X["Exercise angina"]
X_test["ST_x_Angina"] = X_test["ST depression"] * X_test["Exercise angina"]

# STEP 3: High blood pressure flag (BP ≥ 140)
X["High_BP"] = (X["BP"] >= 140).astype(int)
X_test["High_BP"] = (X_test["BP"] >= 140).astype(int)

# STEP 4: Cholesterol risk bin (0=normal, 1=borderline, 2=high)
X["Chol_bin"] = pd.cut(X["Cholesterol"], bins=[0, 200, 240, 2000], labels=[0, 1, 2]).astype(int)
X_test["Chol_bin"] = pd.cut(X_test["Cholesterol"], bins=[0, 200, 240, 2000], labels=[0, 1, 2]).astype(int)

# STEP 5: Age risk bucket
X["Age_bin"] = pd.cut(
    X["Age"],
    bins=[0, 40, 50, 60, 70, 120],
    labels=[0, 1, 2, 3, 4]
).astype(int)

X_test["Age_bin"] = pd.cut(
    X_test["Age"],
    bins=[0, 40, 50, 60, 70, 120],
    labels=[0, 1, 2, 3, 4]
).astype(int)

# STEP 6: Low maximum heart rate flag
X["Low_MaxHR"] = (X["Max HR"] < 120).astype(int)
X_test["Low_MaxHR"] = (X_test["Max HR"] < 120).astype(int)

# STEP 7: ST depression severity bucket
X["ST_bin"] = pd.cut(
    X["ST depression"],
    bins=[-1, 0.5, 1.5, 10],
    labels=[0, 1, 2]
).astype(int)

X_test["ST_bin"] = pd.cut(
    X_test["ST depression"],
    bins=[-1, 0.5, 1.5, 10],
    labels=[0, 1, 2]
).astype(int)


# STEP 8: Simple risk score
X["Risk_score"] = (
    X["High_BP"]
    + X["Chol_bin"]
    + X["Exercise angina"]
    + X["Age_bin"]
)

X_test["Risk_score"] = (
    X_test["High_BP"]
    + X_test["Chol_bin"]
    + X_test["Exercise angina"]
    + X_test["Age_bin"]
)

In [10]:
all_cat_cols = cat_cols + extra_cat_cols

cat_features = [X.columns.get_loc(c) for c in all_cat_cols]

print("X_train shape:", X.shape, "| X_test shape:", X_test.shape)

X_train shape: (630000, 42) | X_test shape: (270000, 42)


In [11]:
neg = (y == 0).sum()
pos = (y == 1).sum()
scale_pos_weight = neg / pos

print("scale_pos_weight:", scale_pos_weight)


scale_pos_weight: 1.2304516841680415


In [12]:
# def make_lgb(seed: int) -> LGBMClassifier:
#     return LGBMClassifier(
#         n_estimators=20000,
#         learning_rate=0.02,
#         num_leaves=64,
#         max_depth=-1,
#         min_child_samples=200,
#         subsample=0.8,
#         colsample_bytree=0.8,
#         reg_alpha=0.0,
#         reg_lambda=1.0,
#         objective="binary",
#         random_state=seed,
#         early_stopping_round=200,
#         metric="auc",
#         categorical_feature=cat_features,
#         n_jobs=-1,
#         verbosity=-1#to silent
#     )

```Train multi-seed LightGBoost (OOF + test probs)```

In [13]:
# oof_lgb = np.zeros(len(train_df), dtype=np.float32)
# test_lgb = np.zeros(len(test_df), dtype=np.float32)

# lgb_oof_aucs = []

# for seed in SEEDS:
#     oof_s = np.zeros(len(train_df), dtype=np.float32)
#     test_s = np.zeros(len(test_df), dtype=np.float32)

#     for fold, (tr_idx, va_idx) in enumerate(skf.split(X, y), 1):
#         X_tr, y_tr = X.iloc[tr_idx], y.iloc[tr_idx]
#         X_va, y_va = X.iloc[va_idx], y.iloc[va_idx]

#         model = make_lgb(seed + fold)
#         model.fit(
#             X_tr, y_tr,
#             eval_set=[(X_va, y_va)]
#         )

#         va_pred = model.predict_proba(X_va)[:, 1]
#         oof_s[va_idx] = va_pred

#         test_s += model.predict_proba(X_test)[:, 1] / N_SPLITS

#         del model
#         gc.collect()

#     auc_s = roc_auc_score(y, oof_s)
#     lgb_oof_aucs.append(auc_s)

#     oof_lgb += oof_s / len(SEEDS)
#     test_lgb += test_s / len(SEEDS)

#     print(f"LGB Seed {seed} | OOF AUC: {auc_s:.6f}")

# print("\nLGB Mean OOF AUC:", float(np.mean(lgb_oof_aucs)))
# print("LGB Std  OOF AUC:", float(np.std(lgb_oof_aucs)))


```Save LightGBM predictions```

In [14]:
# pd.DataFrame({
#     "id": train_df_raw[ID_COL],
#     "oof_lgb": oof_lgb
# }).to_csv(f"{SAVE_DIR}/lgb_oof.csv", index=False)

# pd.DataFrame({
#     "id": test_df[ID_COL],
#     "test_lgb": test_lgb
# }).to_csv(f"{SAVE_DIR}/lgb_test.csv", index=False)

# print("Saved LGB predictions")


```XGBClassifier```

In [15]:
def make_xgb(seed: int) -> XGBClassifier:
    return XGBClassifier(
        n_estimators=4000,
        learning_rate=0.03,
        
        max_depth=6,
        min_child_weight=3,
        
        subsample=0.85,
        colsample_bytree=0.85,
        
        scale_pos_weight=scale_pos_weight,
        early_stopping_rounds=150,
        
        gamma=0.1,
        reg_alpha=0.1,
        reg_lambda=1.0,
        objective="binary:logistic",
        eval_metric="error",
        
        tree_method="hist",
        random_state=seed,
        n_jobs=-1
    )


```Cross-validation and seeds```

```Train multi-seed XGBoost (OOF + test probs)```

In [16]:
oof_xgb = np.zeros(len(train_df), dtype=np.float32)
test_xgb = np.zeros(len(test_df), dtype=np.float32)
xgb_oof_aucs = []

for s in SEEDS:
    oof_s = np.zeros(len(train_df), dtype=np.float32)
    test_s = np.zeros(len(test_df), dtype=np.float32)

    for fold, (tr_idx, va_idx) in enumerate(skf.split(X, y), 1):
        X_tr, y_tr = X.iloc[tr_idx], y.iloc[tr_idx]
        X_va, y_va = X.iloc[va_idx], y.iloc[va_idx]

        model = make_xgb(seed=s + fold)
        model.fit(
            X_tr, y_tr,
            eval_set=[(X_va, y_va)],
            verbose=False
        )

        va_pred = model.predict_proba(X_va)[:, 1]
        oof_s[va_idx] = va_pred

        test_s += model.predict_proba(X_test)[:, 1] / skf.n_splits

        del model
        gc.collect()

    auc_s = roc_auc_score(y, oof_s)
    xgb_oof_aucs.append(auc_s)

    oof_xgb += oof_s / len(SEEDS)
    test_xgb += test_s / len(SEEDS)

    print(f"XGB Seed {s} | OOF AUC: {auc_s:.6f}")

print("XGB Mean OOF AUC:", float(np.mean(xgb_oof_aucs)))


XGB Seed 42 | OOF AUC: 0.955286
XGB Seed 202 | OOF AUC: 0.955255
XGB Seed 777 | OOF AUC: 0.955199
XGB Seed 1001 | OOF AUC: 0.955242
XGB Seed 2023 | OOF AUC: 0.955292
XGB Mean OOF AUC: 0.9552547626231294


```Save XGBoost predictions```

In [17]:
# Save OOF
pd.DataFrame({
    "id": train_df_raw[ID_COL],
    "oof_xgb": oof_xgb
}).to_csv(f"{SAVE_DIR}/xgb_oof.csv", index=False)

# Save test predictions
pd.DataFrame({
    "id": test_df[ID_COL],
    "test_xgb": test_xgb
}).to_csv(f"{SAVE_DIR}/xgb_test.csv", index=False)

print("Saved XGB predictions")


Saved XGB predictions


In [18]:
# def make_cat(seed, USE_GPU=True):
#     return CatBoostClassifier(
#         iterations=8000,
#         learning_rate=0.03,
#         depth=6,
#         loss_function="Logloss",
#         eval_metric="AUC",
#         auto_class_weights="Balanced",
#         early_stopping_rounds=200,
#         task_type="GPU" if USE_GPU else "CPU",
#         logging_level="Silent",
#         random_seed=seed,
#         cat_features=cat_features
#     )


```Train multi-seed CatBoost (OOF + test probs)```

In [19]:
# oof_cat = np.zeros(len(train_df), dtype=np.float32)
# test_cat = np.zeros(len(test_df), dtype=np.float32)
# cat_oof_aucs = []

# for s in SEEDS:
#     oof_s = np.zeros(len(train_df), dtype=np.float32)
#     test_s = np.zeros(len(test_df), dtype=np.float32)

#     for fold, (tr_idx, va_idx) in enumerate(skf.split(X, y), 1):
#         X_tr, y_tr = X.iloc[tr_idx], y.iloc[tr_idx]
#         X_va, y_va = X.iloc[va_idx], y.iloc[va_idx]

#         model = make_cat(seed + fold, USE_GPU)
#         model.fit(
#             X_tr, y_tr,
#             eval_set=(X_va, y_va)
#         )

#         va_pred = model.predict_proba(X_va)[:, 1]
#         oof_s[va_idx] = va_pred

#         test_s += model.predict_proba(X_test)[:, 1] / skf.n_splits

#         del model
#         gc.collect()

#     auc_s = roc_auc_score(y, oof_s)
#     cat_oof_aucs.append(auc_s)

#     oof_cat += oof_s / len(SEEDS)
#     test_cat += test_s / len(SEEDS)

#     print(f"CAT Seed {s} | OOF AUC: {auc_s:.6f}")

# print("CAT Mean OOF AUC:", float(np.mean(cat_oof_aucs)))


```Save CatBoost predictions```

In [20]:
# pd.DataFrame({
#     "id": train_df[ID_COL],
#     "oof_cat": oof_cat
# }).to_csv(f"{SAVE_DIR}/cat_oof.csv", index=False)

# pd.DataFrame({
#     "id": test_df[ID_COL],
#     "test_cat": test_cat
# }).to_csv(f"{SAVE_DIR}/cat_test.csv", index=False)

# print("Saved CatBoost predictions")


```File loading```

In [21]:
# xgb_oof_df = pd.read_csv(f"{SAVE_DIR}/xgb_oof.csv")
# cat_oof_df = pd.read_csv(f"{SAVE_DIR}/cat_oof.csv")
# lgb_oof_df = pd.read_csv(f"{SAVE_DIR}/lgb_oof.csv")

# # Sort by ID_COL to guarantee alignment
# xgb_oof_df = xgb_oof_df.sort_values(ID_COL).reset_index(drop=True)
# cat_oof_df = cat_oof_df.sort_values(ID_COL).reset_index(drop=True)
# lgb_oof_df = lgb_oof_df.sort_values(ID_COL).reset_index(drop=True)

# # Safety check (very important)
# assert (
#     xgb_oof_df[ID_COL].equals(cat_oof_df[ID_COL]) and
#     xgb_oof_df[ID_COL].equals(lgb_oof_df[ID_COL])
# ), "ID mismatch detected between OOF files!"

# # Extract numpy arrays
# oof_xgb = xgb_oof_df["oof_xgb"].values
# oof_cat = cat_oof_df["oof_cat"].values
# oof_lgb = lgb_oof_df["oof_lgb"].values

# print("Loaded shapes:",
#       oof_xgb.shape,
#       oof_cat.shape,
#       oof_lgb.shape)

In [22]:
# # Rank-transform OOF predictions
# rank_oof_xgb = rankdata(oof_xgb)
# rank_oof_cat = rankdata(oof_cat)
# rank_oof_lgb = rankdata(oof_lgb)

# # Optional: scale ranks to [0, 1] (not required for AUC, but nice)
# rank_oof_xgb = (rank_oof_xgb - rank_oof_xgb.min()) / (rank_oof_xgb.max() - rank_oof_xgb.min())
# rank_oof_cat = (rank_oof_cat - rank_oof_cat.min()) / (rank_oof_cat.max() - rank_oof_cat.min())
# rank_oof_lgb = (rank_oof_lgb - rank_oof_lgb.min()) / (rank_oof_lgb.max() - rank_oof_lgb.min())

# best = {"auc": -1.0, "w_xgb": 0.33, "w_cat": 0.33, "w_lgb": 0.34}
# w0_xgb, w0_cat, w0_lgb = best["w_xgb"], best["w_cat"], best["w_lgb"]
# fine = np.arange(-0.10, 0.10 + 1e-9, 0.01)

# best_fine = best.copy()

# for dx in fine:
#     for dc in fine:
#         w_xgb = w0_xgb + dx
#         w_cat = w0_cat + dc
#         w_lgb = 1.0 - w_xgb - w_cat

#         if (w_xgb < 0) or (w_cat < 0) or (w_lgb < 0) or (w_xgb > 1) or (w_cat > 1) or (w_lgb > 1):
#             continue

#         oof_blend_rank = (
#             w_xgb * rank_oof_xgb +
#             w_cat * rank_oof_cat +
#             w_lgb * rank_oof_lgb
#         )

#         auc = roc_auc_score(y, oof_blend_rank)
#         if auc > best_fine["auc"]:
#             best_fine.update({"auc": auc, "w_xgb": w_xgb, "w_cat": w_cat, "w_lgb": w_lgb})

# print("Best blend OOF AUC (rank):", best_fine["auc"])
# print(
#     f"Best weights | w_xgb: {best_fine['w_xgb']:.2f} | "
#     f"w_cat: {best_fine['w_cat']:.2f} | w_lgb: {best_fine['w_lgb']:.2f}"
# )




```optimal bend weight```

In [23]:
# rank_test_xgb = rankdata(test_xgb)
# rank_test_cat = rankdata(test_cat)
# rank_test_lgb = rankdata(test_lgb)

# rank_test_xgb = (rank_test_xgb - rank_test_xgb.min()) / (rank_test_xgb.max() - rank_test_xgb.min())
# rank_test_cat = (rank_test_cat - rank_test_cat.min()) / (rank_test_cat.max() - rank_test_cat.min())
# rank_test_lgb = (rank_test_lgb - rank_test_lgb.min()) / (rank_test_lgb.max() - rank_test_lgb.min())

# final_test_probs = (
#     best["w_cat"] * rank_test_cat +
#     best["w_xgb"] * rank_test_xgb +
#     best["w_lgb"] * rank_test_lgb
# )

```create submission```

In [24]:
# if os.path.exists("submission.csv"):
#     os.remove("submission.csv")
    
# submission = pd.DataFrame({
#     "id": test_df["id"],
#     "Heart Disease": final_test_probs  # probabilities
# })
# submission.to_csv("submission.csv", index=False)
# print("Saved new submission.csv")