```load data```

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import roc_auc_score

train_df = pd.read_csv("/kaggle/input/playground-series-s6e2/train.csv")
test_df = pd.read_csv("/kaggle/input/playground-series-s6e2/test.csv")

print(train_df.shape, test_df.shape)
# train_df.columns
# test_df.columns

(630000, 15) (270000, 14)


```Separate Features & Target```

In [2]:
ID_COL = "id"
TARGET_COL = "Heart Disease"   
train_df.drop('id', axis=1, inplace=True) # id is not used for model training and prediction
X = train_df.drop(columns=[TARGET_COL]).copy()

y_raw = train_df[TARGET_COL].copy()

# Convert target to 0/1
# Absence -> 0, Presence -> 1
y = y_raw.map({"Absence": 0, "Presence": 1}).astype(int)

X_test = test_df

# Drop ID column
if ID_COL in X.columns:
    X = X.drop(columns=[ID_COL])
if ID_COL in X_test.columns:
    X_test = X_test.drop(columns=[ID_COL])

print("X shape:", X.shape)
print("X_test shape:", X_test.shape)
print("Target distribution:")
print(y.value_counts())

X shape: (630000, 13)
X_test shape: (270000, 13)
Target distribution:
Heart Disease
0    347546
1    282454
Name: count, dtype: int64


```Build a pipeline```


Random Forest Classifier

In [3]:
# pipe = Pipeline(steps=[
#     ("imputer", SimpleImputer(strategy="median")),
#     ("model", RandomForestClassifier(
#         n_estimators=300,
#         max_depth=12,
#         min_samples_leaf=10,
#         random_state=42,
#         n_jobs=-1
#     ))
# ])

```Build a pipeline```


Extra Trees Classifier

In [4]:
pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("model", ExtraTreesClassifier(
        n_estimators=800,
        max_depth=None,
        min_samples_leaf=5,
        max_features="sqrt",
        class_weight="balanced",
        random_state=42,
        n_jobs=-1
    ))
])


In [5]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

oof = np.zeros(len(train_df))
test_preds = np.zeros(len(test_df))

for fold, (tr_idx, va_idx) in enumerate(skf.split(X, y), 1):
    X_tr, y_tr = X.iloc[tr_idx], y.iloc[tr_idx]
    X_va, y_va = X.iloc[va_idx], y.iloc[va_idx]

    pipe.fit(X_tr, y_tr)

    va_pred = pipe.predict_proba(X_va)[:, 1]
    oof[va_idx] = va_pred

    fold_auc = roc_auc_score(y_va, va_pred)
    print(f"Fold {fold} AUC: {fold_auc:.6f}")

    test_preds += pipe.predict_proba(X_test)[:, 1] / skf.n_splits

print(f"\nOverall OOF AUC: {roc_auc_score(y, oof):.6f}")


Fold 1 AUC: 0.951963
Fold 2 AUC: 0.951307
Fold 3 AUC: 0.951866
Fold 4 AUC: 0.951375
Fold 5 AUC: 0.952094

Overall OOF AUC: 0.951716


```create submission```

In [6]:
import os

# Delete previous submission file if it exists
if os.path.exists("submission.csv"):
    os.remove("submission.csv")
    
# Convert probabilities to binary predictions
test_preds_binary = (test_preds >= 0.5).astype(int)

submission = pd.DataFrame({
    "id": test_df[ID_COL],
    "Heart Disease": test_preds_binary
})

submission.to_csv("submission.csv", index=False)
print("Saved submission.csv")
submission.head()

Saved submission.csv


Unnamed: 0,id,Heart Disease
0,630000,1
1,630001,0
2,630002,1
3,630003,0
4,630004,0
