```load data```

In [None]:
import numpy as np
import pandas as pd
import os

from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from xgboost import XGBClassifier

# Read the competition's train and test splits from the Kaggle input mount.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/playground-series-s6e2/train.csv",
        "/kaggle/input/playground-series-s6e2/test.csv",
    )
)

# Quick sanity check: (rows, columns) of each split.
print(train_df.shape, test_df.shape)
# For ad-hoc inspection: train_df.head(10), train_df.columns, test_df.columns

```Separate Features & Target```

In [None]:
# Split the raw frames into model inputs (X, X_test) and the binary target (y).
# The raw frames are never mutated, so this cell can be re-run safely and
# test_df keeps its id column for building the submission.
ID_COL = "id"
TARGET_COL = "Heart Disease"
TARGET_MAP = {"Absence": 0, "Presence": 1}  # Absence -> 0, Presence -> 1

# The id is not used for model training or prediction; errors="ignore" keeps
# the cell working even if the id column is already gone.
X = train_df.drop(columns=[TARGET_COL]).drop(columns=[ID_COL], errors="ignore")

y_raw = train_df[TARGET_COL].copy()

# Convert the target to 0/1 and fail loudly on labels the mapping does not
# cover, rather than with an opaque NaN-to-int conversion error.
y = y_raw.map(TARGET_MAP)
unmapped_labels = y_raw[y.isna()].unique()
if len(unmapped_labels) > 0:
    raise ValueError(
        f"Unexpected values in {TARGET_COL!r}: {list(unmapped_labels)}"
    )
y = y.astype(int)

# drop() returns a new frame, so later feature engineering on X_test cannot
# leak back into test_df.
X_test = test_df.drop(columns=[ID_COL], errors="ignore")

print("X shape:", X.shape)
print("X_test shape:", X_test.shape)
print("Target distribution:")
print(y.value_counts())

```Feature Engineering```

In [None]:
def _bucketize(values: pd.Series, edges: list) -> pd.Series:
    """Map each value to the integer index (0..len(edges)-2) of its bin.

    Bins are right-closed; ``include_lowest=True`` additionally closes the
    first bin on the left so a value equal to the lowest edge (for example a
    cholesterol reading of 0) lands in bucket 0 instead of becoming NaN.

    Raises:
        ValueError: if any value is missing or falls outside ``edges``.
    """
    codes = pd.cut(
        values,
        bins=edges,
        labels=list(range(len(edges) - 1)),
        include_lowest=True,
    )
    if codes.isna().any():
        raise ValueError(
            f"Column {values.name!r} has missing or out-of-range values "
            f"for bin edges {edges}"
        )
    return codes.astype(int)


def add_engineered_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with the engineered columns appended.

    One function is applied to both train and test so the two frames cannot
    drift apart. Columns are appended in a fixed order: Age_x_MaxHR,
    ST_x_Angina, High_BP, Chol_bin, Age_bin, Low_MaxHR, ST_bin, Risk_score.
    """
    out = df.copy()

    # STEP 1: Age × Max HR
    out["Age_x_MaxHR"] = out["Age"] * out["Max HR"]

    # STEP 2: ST depression × Exercise angina
    out["ST_x_Angina"] = out["ST depression"] * out["Exercise angina"]

    # STEP 3: High blood pressure flag (BP ≥ 140)
    out["High_BP"] = (out["BP"] >= 140).astype(int)

    # STEP 4: Cholesterol risk bin (0=normal, 1=borderline, 2=high)
    out["Chol_bin"] = _bucketize(out["Cholesterol"], [0, 200, 240, 2000])

    # STEP 5: Age risk bucket
    out["Age_bin"] = _bucketize(out["Age"], [0, 40, 50, 60, 70, 120])

    # STEP 6: Low maximum heart rate flag
    out["Low_MaxHR"] = (out["Max HR"] < 120).astype(int)

    # STEP 7: ST depression severity bucket
    out["ST_bin"] = _bucketize(out["ST depression"], [-1, 0.5, 1.5, 10])

    # STEP 8: Simple risk score (sum of the individual risk indicators)
    out["Risk_score"] = (
        out["High_BP"]
        + out["Chol_bin"]
        + out["Exercise angina"]
        + out["Age_bin"]
    )
    return out


X = add_engineered_features(X)
X_test = add_engineered_features(X_test)

In [None]:
# Class-imbalance ratio (negatives per positive), passed to XGBoost as
# scale_pos_weight.
is_positive = y == 1
is_negative = y == 0
pos = is_positive.sum()
neg = is_negative.sum()
scale_pos_weight = neg / pos

print("scale_pos_weight:", scale_pos_weight)


```Build a pipeline```


Random Forest Classifier

In [None]:
# pipe = Pipeline(steps=[
#     ("imputer", SimpleImputer(strategy="median")),
#     ("model", RandomForestClassifier(
#         n_estimators=300,
#         max_depth=12,
#         min_samples_leaf=10,
#         random_state=42,
#         n_jobs=-1
#     ))
# ])

```Build a pipeline```


Extra Trees Classifier

In [None]:
# pipe = Pipeline(steps=[
#     ("imputer", SimpleImputer(strategy="median")),
#     ("model", ExtraTreesClassifier(
#         n_estimators=800,
#         max_depth=None,
#         min_samples_leaf=5,
#         max_features="sqrt",
#         class_weight="balanced",
#         random_state=42,
#         n_jobs=-1
#     ))
# ])


```identify columns```

In [None]:
# Columns treated as categorical codes.
categorical_cols = [
    "Sex", "Chest pain type", "FBS over 120", "EKG results",
    "Exercise angina", "Slope of ST", "Number of vessels fluro", "Thallium",
]

# Columns treated as continuous measurements.
numerical_cols = ["Age", "BP", "Cholesterol", "Max HR", "ST depression"]


```build a pipeline```
Gradient Boosting Classifier

In [None]:
# pipe = Pipeline(steps=[
#     ("imputer", SimpleImputer(strategy="median")),
#     ("model", GradientBoostingClassifier(
#         n_estimators=600,
#         learning_rate=0.05,
#         max_depth=3,
#         subsample=0.8,
#         random_state=42
#     ))
# ])


```XGBClassifier```

In [None]:
def make_xgb(seed: int, early_stopping_rounds: "int | None" = None) -> XGBClassifier:
    """Build an unfitted XGBoost classifier with the notebook's hyper-parameters.

    Args:
        seed: Random seed for the booster (``random_state``).
        early_stopping_rounds: When given, training stops after this many
            rounds without improvement on the validation set, which requires
            ``eval_set`` to be passed to ``fit``. Leave as ``None`` (the
            default) for fits without a validation set, such as the
            full-data fit.

    Returns:
        A configured ``XGBClassifier``. ``scale_pos_weight`` is read from the
        notebook-level variable of the same name.
    """
    params = dict(
        n_estimators=4000,
        learning_rate=0.03,

        max_depth=6,
        min_child_weight=1,

        subsample=0.85,
        colsample_bytree=0.85,

        gamma=0.1,
        reg_alpha=0.1,
        reg_lambda=1.0,

        scale_pos_weight=scale_pos_weight,

        objective="binary:logistic",
        eval_metric="error",

        tree_method="hist",
        random_state=seed,
        n_jobs=-1,
    )
    # Only forward the option when requested, so the default call builds
    # exactly the same estimator as before.
    if early_stopping_rounds is not None:
        params["early_stopping_rounds"] = early_stopping_rounds
    return XGBClassifier(**params)


```define model: RF```

In [None]:
# def make_rf(seed: int) -> RandomForestClassifier:
#     return RandomForestClassifier(
#         n_estimators=600,
#         max_depth=14,
#         min_samples_leaf=8,
#         max_features="sqrt",
#         random_state=seed,
#         n_jobs=-1
#     )

```define model: ET```

In [None]:
# def make_et(seed: int) -> ExtraTreesClassifier:
#     return ExtraTreesClassifier(
#         n_estimators=1200,
#         max_depth=None,
#         min_samples_leaf=2,
#         max_features=0.5,
#         bootstrap=False,
#         random_state=seed,
#         n_jobs=-1
#     )


```build a pipeline```
logistic regression

In [None]:
# preprocessor = ColumnTransformer(
#     transformers=[
#         ("num", StandardScaler(), numerical_cols),
#         ("cat", OneHotEncoder(handle_unknown="ignore", drop="first"), categorical_cols)
#     ]
# )

# pipe = Pipeline(steps=[
#     ("preprocess", preprocessor),
#     ("model", LogisticRegression(
#         C=15,
#         max_iter=4000,
#         solver="saga",
#         l1_ratio=0.2,
#         penalty="elasticnet",
#         class_weight="balanced"
#     ))
# ])

In [None]:
# Fit a single XGBoost model on the complete training set (no CV split) to
# inspect which features it relies on.
model = make_xgb(seed=999)
model.fit(X, y)

# Pair each importance with its column name, then rank from most to least
# important.
importance = pd.Series(data=model.feature_importances_, index=X.columns)
importance = importance.sort_values(ascending=False)

importance.head(15)

In [None]:
# Display up to the 50 highest-ranked features from the sorted importance Series.
importance.head(50)

In [None]:
# 5-fold stratified CV for XGBoost. Collects out-of-fold (OOF) probabilities
# for every training row and fold-averaged probabilities for the test rows.
# The ExtraTrees branch is currently disabled; its arrays stay all-zero.
EARLY_STOPPING_ROUNDS = 150

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

oof_et = np.zeros(len(train_df))
oof_xgb = np.zeros(len(train_df))

test_et  = np.zeros(len(test_df))
test_xgb = np.zeros(len(test_df))

best_iters = []

for fold, (tr_idx, va_idx) in enumerate(skf.split(X, y), 1):
    # The fold split is needed by every model, so it must stay active even
    # while the ExtraTrees block below is commented out.
    X_tr, y_tr = X.iloc[tr_idx], y.iloc[tr_idx]
    X_va, y_va = X.iloc[va_idx], y.iloc[va_idx]

    # --- ExtraTrees (disabled) ---
    # et = make_et(seed=100 + fold)
    # et.fit(X_tr, y_tr)

    # et_va_prob = et.predict_proba(X_va)[:, 1]
    # oof_et[va_idx] = et_va_prob

    # test_et += et.predict_proba(X_test)[:, 1] / skf.n_splits
    # et_auc  = roc_auc_score(y_va, et_va_prob)
    # print(f"Fold {fold} | ET AUC: {et_auc:.6f}")

    # --- XGBoost ---
    xgb = make_xgb(seed=200 + fold)
    # Early stopping is enabled here rather than in make_xgb because the
    # full-data fit has no validation set. Without it `best_iteration` is
    # undefined (XGBoost 2.x raises AttributeError when it is read).
    xgb.set_params(early_stopping_rounds=EARLY_STOPPING_ROUNDS)
    xgb.fit(
        X_tr, y_tr,
        eval_set=[(X_va, y_va)],
        verbose=False
    )

    best_iters.append(xgb.best_iteration)

    xgb_va_prob = xgb.predict_proba(X_va)[:, 1]
    oof_xgb[va_idx] = xgb_va_prob

    test_xgb += xgb.predict_proba(X_test)[:, 1] / skf.n_splits

    xgb_auc = roc_auc_score(y_va, xgb_va_prob)
    print(f"Fold {fold} | XGB AUC: {xgb_auc:.6f} | XGB best_iter: {xgb.best_iteration}")

print("\nMean XGB best_iteration:", int(np.mean(best_iters)))
# An all-zero OOF vector means ExtraTrees never ran; scoring it would print a
# meaningless 0.5.
if np.any(oof_et):
    print("OOF ET AUC :", roc_auc_score(y, oof_et))
else:
    print("OOF ET AUC : n/a (ExtraTrees disabled)")
print("OOF XGB AUC:", roc_auc_score(y, oof_xgb))

In [None]:
# skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# oof = np.zeros(len(train_df))
# test_preds = np.zeros(len(test_df))

# for fold, (tr_idx, va_idx) in enumerate(skf.split(X, y), 1):
#     X_tr, y_tr = X.iloc[tr_idx], y.iloc[tr_idx]
#     X_va, y_va = X.iloc[va_idx], y.iloc[va_idx]

#     pipe.fit(X_tr, y_tr)

#     va_pred = pipe.predict_proba(X_va)[:, 1]
#     oof[va_idx] = va_pred

#     fold_auc = roc_auc_score(y_va, va_pred)
#     print(f"Fold {fold} AUC: {fold_auc:.6f}")

#     test_preds += pipe.predict_proba(X_test)[:, 1] / skf.n_splits

# print(f"\nOverall OOF AUC: {roc_auc_score(y, oof):.6f}")


```Find optimal blending weight```

In [None]:
# Coarse search (step 0.05) for the XGB share of the XGB/ET blend that
# maximises OOF AUC. The ET share is 1 - weight.
weights = np.arange(0.0, 1.01, 0.05)

best_w = {"auc": -1.0, "weight_xgb": 1.0}

if not np.any(oof_et):
    # ExtraTrees was not trained (all-zero OOF vector). Every weight > 0 then
    # ties on AUC, and picking the smallest would shrink the blend far below
    # the thresholds searched later. Use XGBoost alone instead.
    best_w["auc"] = roc_auc_score(y, oof_xgb)
    best_w["weight_xgb"] = 1.0
else:
    for w in weights:
        oof_blend = w * oof_xgb + (1.0 - w) * oof_et
        auc = roc_auc_score(y, oof_blend)

        # Strict '>' keeps the first (smallest) weight on ties.
        if auc > best_w["auc"]:
            best_w["auc"] = auc
            best_w["weight_xgb"] = w

print("\nBest blend by AUC (coarse):", best_w)


```Refine the blend weight (fine search by AUC)```

In [None]:
# Fine search (step 0.01) within ±0.10 of the coarse optimum, clipped to [0, 1].
w0 = best_w["weight_xgb"]

w_start = max(0.0, w0 - 0.10)
w_end   = min(1.0, w0 + 0.10)

# linspace hits both end points exactly; arange with an epsilon could step
# slightly past 1.0 and give ExtraTrees a negative weight.
n_steps = int(round((w_end - w_start) / 0.01)) + 1
fine_weights = np.linspace(w_start, w_end, n_steps)

best_w_fine = {"auc": -1.0, "weight_xgb": w0}

if not np.any(oof_et):
    # ExtraTrees was not trained (all-zero OOF vector), so there is nothing
    # to blend: use XGBoost alone so the probabilities are not rescaled.
    best_w_fine["auc"] = roc_auc_score(y, oof_xgb)
    best_w_fine["weight_xgb"] = 1.0
else:
    for w in fine_weights:
        oof_blend = w * oof_xgb + (1.0 - w) * oof_et
        auc = roc_auc_score(y, oof_blend)

        # Strict '>' keeps the first (smallest) weight on ties.
        if auc > best_w_fine["auc"]:
            best_w_fine["auc"] = auc
            best_w_fine["weight_xgb"] = w

print("Best blend by AUC (fine):", best_w_fine)


In [None]:
# thresholds = np.arange(0.10, 0.90, 0.01)

# # ---- Coarse weight search
# weights = np.arange(0.0, 1.01, 0.05)

# best = {"acc": 0.0, "weight_xgb": 0.5, "threshold": 0.5}

# for w in weights:
#     oof_ens = w * oof_xgb + (1.0 - w) * oof_et

#     for t in thresholds:
#         acc = accuracy_score(y, (oof_ens >= t).astype(int))
#         if acc > best["acc"]:
#             best.update({"acc": acc, "weight_xgb": w, "threshold": t})

# print("\nBest (coarse) ->", best)

# # ---- Fine weight search around best
# w0 = best["weight_xgb"]
# w_start = max(0.0, w0 - 0.05)
# w_end   = min(1.0, w0 + 0.05)

# fine_weights = np.arange(w_start, w_end + 1e-9, 0.01)

# best_fine = best.copy()

# for w in fine_weights:
#     oof_ens = w * oof_xgb + (1.0 - w) * oof_et

#     for t in thresholds:
#         acc = accuracy_score(y, (oof_ens >= t).astype(int))
#         if acc > best_fine["acc"]:
#             best_fine.update({"acc": acc, "weight_xgb": w, "threshold": t})

# print("Best (fine)  ->", best_fine)

# # Final ensemble settings
# w_best = best_fine["weight_xgb"]
# t_best = best_fine["threshold"]

# oof_ens_final = w_best * oof_xgb + (1.0 - w_best) * oof_et
# print("\nFinal ensemble OOF AUC:", roc_auc_score(y, oof_ens_final))
# print("Final ensemble CV accuracy:", accuracy_score(y, (oof_ens_final >= t_best).astype(int)))
# print("Final weight_xgb:", w_best, "| weight_et:", 1.0 - w_best, "| threshold:", t_best)


```Find best threshold```

In [None]:
# Blend the OOF probabilities with the tuned weight, then pick the decision
# threshold that gives the highest OOF accuracy.
w_best = best_w_fine["weight_xgb"]
oof_final = w_best * oof_xgb + (1.0 - w_best) * oof_et

thresholds = np.arange(0.10, 0.90, 0.01)

# Accuracy at every candidate cut-off, in the same order as `thresholds`.
accuracies = [
    accuracy_score(y, (oof_final >= cut).astype(int))
    for cut in thresholds
]

best_t = {"acc": 0.0, "threshold": 0.5}

# argmax returns the first maximum, i.e. the lowest threshold on ties.
top = int(np.argmax(accuracies))
if accuracies[top] > best_t["acc"]:
    best_t["acc"] = accuracies[top]
    best_t["threshold"] = thresholds[top]

print("\nFinal blend OOF AUC:", roc_auc_score(y, oof_final))
print("Best CV accuracy:", best_t["acc"])
print("Best threshold:", best_t["threshold"])
print("Final weight_xgb:", w_best, "| weight_et:", 1.0 - w_best)


```create submission```

In [None]:
# Apply the tuned blend weight and threshold to the test predictions and
# write the submission file.
test_final = w_best * test_xgb + (1.0 - w_best) * test_et
test_preds_binary = (test_final >= best_t["threshold"]).astype(int)

# Remove any stale file from a previous run before writing the new one.
out_path = "submission.csv"
if os.path.exists(out_path):
    os.remove(out_path)

submission = pd.DataFrame({"id": test_df[ID_COL]})
submission["Heart Disease"] = test_preds_binary

submission.to_csv(out_path, index=False)
print("Saved new submission.csv")
submission.head()
