```Load data```

In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer

# Competition CSVs as mounted by the Kaggle runtime.
TRAIN_CSV = "/kaggle/input/playground-series-s6e2/train.csv"
TEST_CSV = "/kaggle/input/playground-series-s6e2/test.csv"

train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV)

# Quick sanity check: (rows, columns) of the train and test frames.
# Inspect `train_df.columns` / `test_df.columns` for the column names.
print(train_df.shape, test_df.shape)

```Separate Features & Target```

In [None]:
ID_COL = "id"
TARGET_COL = "Heart Disease"
# Target labels as they appear in the CSV -> integer class used for training.
LABEL_TO_INT = {"Absence": 0, "Presence": 1}

# Build the feature matrix WITHOUT mutating `train_df`: an in-place drop makes
# this cell fail with a KeyError when it is executed a second time, and it
# silently changes the frame every other cell sees.
# The id is not used for model training or prediction; `errors="ignore"` keeps
# the cell working if the id column is already absent.
X = train_df.drop(columns=[TARGET_COL]).drop(columns=[ID_COL], errors="ignore")

y_raw = train_df[TARGET_COL].copy()

# Convert target to 0/1 (Absence -> 0, Presence -> 1).
# Labels outside the mapping become NaN; fail loudly with the offending values
# instead of the opaque error `astype(int)` would raise on NaN.
y_mapped = y_raw.map(LABEL_TO_INT)
if y_mapped.isna().any():
    unexpected = sorted(y_raw[y_mapped.isna()].astype(str).unique())
    raise ValueError(f"Unexpected values in '{TARGET_COL}': {unexpected}")
y = y_mapped.astype(int)

# `test_df` itself keeps its id column; it is needed to build the submission.
X_test = test_df.drop(columns=[ID_COL], errors="ignore")

print("X shape:", X.shape)
print("X_test shape:", X_test.shape)
print("Target distribution:")
print(y.value_counts())

```Build a pipeline```


Random Forest Classifier

In [None]:
# Disabled alternative: median imputation followed by a RandomForestClassifier.
# Kept commented out for reference only; running this cell defines nothing.
# pipe = Pipeline(steps=[
#     ("imputer", SimpleImputer(strategy="median")),
#     ("model", RandomForestClassifier(
#         n_estimators=300,
#         max_depth=12,
#         min_samples_leaf=10,
#         random_state=42,
#         n_jobs=-1
#     ))
# ])

```Build a pipeline```


Extra Trees Classifier

In [None]:
# Disabled alternative: median imputation followed by an ExtraTreesClassifier
# with balanced class weights.
# Kept commented out for reference only; running this cell defines nothing.
# pipe = Pipeline(steps=[
#     ("imputer", SimpleImputer(strategy="median")),
#     ("model", ExtraTreesClassifier(
#         n_estimators=800,
#         max_depth=None,
#         min_samples_leaf=5,
#         max_features="sqrt",
#         class_weight="balanced",
#         random_state=42,
#         n_jobs=-1
#     ))
# ])


```Identify columns```

In [None]:
# Columns treated as categorical features.
categorical_cols = [
    "Sex", "Chest pain type", "FBS over 120", "EKG results",
    "Exercise angina", "Slope of ST", "Number of vessels fluro", "Thallium",
]

# Columns treated as numerical features.
numerical_cols = ["Age", "BP", "Cholesterol", "Max HR", "ST depression"]


```Build a pipeline```
Logistic Regression

In [None]:
# Per-column preprocessing: standardise the numerical columns and one-hot
# encode the categorical ones. Categories seen only at predict time are
# encoded as all-zeros (handle_unknown="ignore") instead of raising.
# Columns listed in neither group are dropped (ColumnTransformer default).
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numerical_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols)
    ]
)

# Elastic-net logistic regression on the preprocessed features.
# The "saga" solver shuffles the data while optimising, so `random_state` is
# fixed here; without it, fold scores and the final predictions can differ
# from run to run even though the CV splitter is seeded.
pipe = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("model", LogisticRegression(
        C=10,
        max_iter=3000,
        solver="saga",
        l1_ratio=0.3,
        penalty="elasticnet",
        class_weight="balanced",
        random_state=42
    ))
])

In [None]:
# 5-fold stratified cross-validation with a fixed shuffle seed.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# oof: out-of-fold probability for every training row (each row is filled once,
#      by the fold in which it is held out).
# test_preds: running average of the per-fold test probabilities.
oof = np.zeros(len(train_df))
test_preds = np.zeros(len(test_df))

for fold, (fit_rows, holdout_rows) in enumerate(skf.split(X, y), start=1):
    # Refit the whole pipeline on this fold's training rows only.
    pipe.fit(X.iloc[fit_rows], y.iloc[fit_rows])

    # Positive-class probability for the held-out rows.
    holdout_scores = pipe.predict_proba(X.iloc[holdout_rows])[:, 1]
    oof[holdout_rows] = holdout_scores

    fold_auc = roc_auc_score(y.iloc[holdout_rows], holdout_scores)
    print(f"Fold {fold} AUC: {fold_auc:.6f}")

    # Each fold contributes 1/n_splits of the final test prediction.
    test_preds += pipe.predict_proba(X_test)[:, 1] / skf.n_splits

print(f"\nOverall OOF AUC: {roc_auc_score(y, oof):.6f}")


```Create submission```

In [None]:
import os

# Remove any previous submission up front, so that a failure further down in
# this cell cannot leave a stale file behind that gets uploaded by mistake.
if os.path.exists("submission.csv"):
    os.remove("submission.csv")

# The predictions are matched to ids purely by position; make sure they line up.
assert len(test_preds) == len(test_df), "test_preds and test_df are misaligned"

# Submit the fold-averaged probabilities instead of thresholded 0/1 labels.
# This notebook scores the model with ROC AUC, a ranking metric; collapsing the
# scores to two values at 0.5 throws away the ranking AUC measures.
# NOTE(review): this assumes the competition is scored on AUC as well. If it
# turns out to be scored on accuracy, submit
# `(test_preds >= 0.5).astype(int)` instead.
submission = pd.DataFrame({
    "id": test_df[ID_COL],
    "Heart Disease": test_preds
})

submission.to_csv("submission.csv", index=False)
print("Saved submission.csv")
submission.head()