In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/heart-disease-cleveland-uci/heart_cleveland_upload.csv


In [4]:
import pandas as pd
# Load the Cleveland heart-disease table, list its column labels,
# and show the first rows as the cell's rich output.
df = pd.read_csv(
    "/kaggle/input/heart-disease-cleveland-uci/heart_cleveland_upload.csv"
)
print(df.columns)
df.head()


Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal', 'condition'],
      dtype='object')


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,condition
0,69,1,0,160,234,1,2,131,0,0.1,1,1,0,0
1,69,0,0,140,239,0,0,151,0,1.8,0,2,0,0
2,66,0,0,150,226,0,0,114,0,2.6,2,0,0,0
3,65,1,0,138,282,1,2,174,0,1.4,1,1,0,1
4,64,1,0,110,211,0,2,144,1,1.8,1,0,0,0


In [9]:
# Full Kaggle-ready Heart Disease Pipeline (single file)
# - Auto target detection & rename
# - Robust FE + per-fold transforms (no leakage)
# - XGBoost / LightGBM / CatBoost ensemble (safe fallbacks)
# - Stratified K-Fold OOF + final full-data retrain
# - Robust submission builder that uses test.csv or sample_submission if available
# - Saves artifacts to ./artifacts_kaggle
#
# Copy-paste into a Kaggle notebook cell and run. Adjust TRAIN_PATH / TEST_PATH if needed.

import os, gc, glob, warnings, json
from datetime import datetime
warnings.filterwarnings("ignore")
SEED = 42

import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score

# Optional ML libraries detection
HAS_XGB = HAS_LGB = HAS_CAT = HAS_OPTUNA = False
try:
    import xgboost as xgb; HAS_XGB = True
except Exception:
    pass
try:
    import lightgbm as lgb; HAS_LGB = True
except Exception:
    pass
try:
    import catboost as cat; HAS_CAT = True
except Exception:
    pass
try:
    import optuna; HAS_OPTUNA = True
except Exception:
    pass

# Seed NumPy's legacy global RNG so NumPy-based randomness is repeatable across runs.
np.random.seed(SEED)

# -------------------- CONFIG --------------------
# Everything a reader is expected to tune lives in this section.
# Adjust paths if your files are placed differently.
TRAIN_PATH = "/kaggle/input/heart-disease-cleveland-uci/heart_cleveland_upload.csv"
# Optional separate test file. The loader returns None when this path does not exist,
# and the submission step then looks for a sample_submission file under INPUT_ROOT.
# NOTE(review): the input listing printed by the first cell shows only
# heart_cleveland_upload.csv, so this path is absent in that run.
TEST_PATH = "/kaggle/input/heart-disease-cleveland-uci/heart_cleveland_upload_test.csv"
# Directory searched for a sample_submission CSV.
INPUT_ROOT = "/kaggle/input/heart-disease-cleveland-uci"
# Directory (relative to the working directory) where OOF predictions and submissions are written.
OUT_DIR = "artifacts_kaggle"
os.makedirs(OUT_DIR, exist_ok=True)

# Ensemble blend weights per model family. They are re-normalised to sum to 1 over
# whichever models actually produced predictions, so a missing library is tolerated.
WEIGHT_XGB = 0.5
WEIGHT_LGB = 0.3
WEIGHT_CAT = 0.2

# Number of stratified CV folds.
NFOLD = 10
# Early-stopping patience (rounds without improvement) passed to every model.
EARLY_STOP = 50
# Upper bound on boosting rounds / iterations for every model.
MAX_ROUNDS = 2000

# Submission config
# Canonical name the detected label column is renamed to.
TARGET_COL = "target"
# Column names tried, in order, as the id column of the test / sample_submission frame.
ID_COL_CANDIDATES = ['id','ID','Id','patient_id','PatientID','index']
AS_INTEGER_SUBMISSION = False  # set True if competition expects 0/1 labels
# ------------------------------------------------

def safe_read_csv(path):
    """Load a CSV into a DataFrame, or return None when there is nothing to load.

    Parameters
    ----------
    path : str or None
        Location of the CSV file. ``None`` is accepted and short-circuits.

    Returns
    -------
    pandas.DataFrame or None
        The parsed file, or ``None`` when ``path`` is ``None`` or does not
        exist on disk.
    """
    file_available = path is not None and os.path.exists(path)
    return pd.read_csv(path) if file_available else None

def save_submission_df(df_sub, filename="submission_final.csv"):
    """Write a submission frame as CSV (without the index) inside OUT_DIR.

    Parameters
    ----------
    df_sub : pandas.DataFrame
        Frame to persist.
    filename : str
        File name created inside the module-level ``OUT_DIR`` directory.

    Returns
    -------
    str
        Path of the file that was written.
    """
    destination = os.path.join(OUT_DIR, filename)
    df_sub.to_csv(destination, index=False)
    print("Saved submission to:", destination)
    return destination

# -------------------- Load Data --------------------
# The training table is mandatory; abort early with a clear message if it is missing.
df = safe_read_csv(TRAIN_PATH)
if df is None:
    raise FileNotFoundError(f"Train file not found. Update TRAIN_PATH: {TRAIN_PATH}")

# The test table is optional: df_test stays None when TEST_PATH does not exist.
df_test = safe_read_csv(TEST_PATH)

# -------------------- Auto-detect target --------------------
# First choice: the first well-known label name that appears among the columns.
possible_targets = ["target", "num", "condition", "HeartDisease", "diagnosis"]
target_col = next((name for name in possible_targets if name in df.columns), None)

# Second choice: the first low-cardinality integer column that is not one of the
# feature names listed below.
if target_col is None:
    excluded_feature_names = ['age','sex','chol','trestbps','thalach','ca','thal']
    for c in df.columns:
        small_integer_column = df[c].dtype.kind in "iu" and df[c].nunique() <= 6
        if small_integer_column and c.lower() not in excluded_feature_names:
            target_col = c
            print(f"Auto-heuristic selected '{c}' as target; please verify.")
            break

if target_col is None:
    raise ValueError(f"No target column found. Columns: {df.columns.tolist()}")

# Normalise the label column name so the rest of the pipeline can rely on TARGET_COL.
if target_col != TARGET_COL:
    df = df.rename(columns={target_col: TARGET_COL})
    print(f"Renamed target column: {target_col} -> {TARGET_COL}")

# Collapse a multi-valued label to binary: any value above 0 becomes 1.
if df[TARGET_COL].nunique() > 2:
    df[TARGET_COL] = df[TARGET_COL].gt(0).astype(int)

# -------------------- Basic cleaning & FE --------------------
def safe_numeric_convert(d):
    """Return a copy of ``d`` with numeric-looking object columns cast to numbers.

    Each object-dtype column is passed through ``pd.to_numeric``. Columns that
    cannot be parsed are left unchanged. The input frame is never modified.

    Parameters
    ----------
    d : pandas.DataFrame
        Frame to convert.

    Returns
    -------
    pandas.DataFrame
        A copy of ``d`` with convertible object columns replaced by numeric ones.
    """
    d = d.copy()
    for c in d.columns:
        if d[c].dtype == object:
            try:
                d[c] = pd.to_numeric(d[c])
            except (ValueError, TypeError):
                # Unparseable text / non-scalar objects: keep the column as it is.
                # Only conversion errors are caught, so KeyboardInterrupt and
                # other unrelated exceptions still propagate.
                pass
    return d

def feature_engineer(d):
    """Return a copy of ``d`` extended with derived features.

    Every feature is added only when the columns it needs are present:

    * ``age_chol``            -- age * (chol + 1e-6)
    * ``age_thalach``         -- age * (thalach + 1e-6)
    * ``chol_trestbps_ratio`` -- chol / (trestbps + 1e-6)
    * ``age_bin``             -- integer bucket of age over the edges
                                 0, 40, 50, 60, 70, 120
    * ``ca_missing`` / ``thal_missing`` -- 1 where the source value is NaN, else 0

    Parameters
    ----------
    d : pandas.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``d`` with the applicable columns appended.
    """
    out = d.copy()
    available = set(out.columns)

    # Pairwise interactions; the small epsilon keeps the ratio's denominator non-zero.
    if {"age", "chol"} <= available:
        out["age_chol"] = out["age"] * (out["chol"] + 1e-6)
    if {"age", "thalach"} <= available:
        out["age_thalach"] = out["age"] * (out["thalach"] + 1e-6)
    if {"chol", "trestbps"} <= available:
        out["chol_trestbps_ratio"] = out["chol"] / (out["trestbps"] + 1e-6)

    # Coarse age bucket encoded as the bin's ordinal position.
    if "age" in available:
        out["age_bin"] = pd.cut(out["age"], bins=[0,40,50,60,70,120], labels=False)

    # Missingness indicators for the two columns singled out by the pipeline.
    for source_col in ("ca", "thal"):
        if source_col in available:
            out[f"{source_col}_missing"] = out[source_col].isna().astype(int)

    return out

# Clean and enrich the training frame, and the test frame when one was loaded,
# using exactly the same two steps.
df = feature_engineer(safe_numeric_convert(df))
if df_test is not None:
    df_test = feature_engineer(safe_numeric_convert(df_test))

# -------------------- Features list & detect types --------------------
# Every column except the label is a model feature.
features = list(filter(lambda name: name != TARGET_COL, df.columns))
if not features:
    raise RuntimeError("No features found after excluding target.")

def detect_cats(d, thresh=8):
    """List the columns of ``d`` that should be treated as categorical.

    A column qualifies when it is integer-typed (signed or unsigned) with at
    most ``thresh`` distinct values, or when its dtype is ``object``.

    Parameters
    ----------
    d : pandas.DataFrame
        Frame whose columns are inspected.
    thresh : int
        Largest number of distinct values an integer column may have and
        still count as categorical.

    Returns
    -------
    list
        Qualifying column names, in the frame's column order.
    """
    def _is_categorical(name):
        column = d[name]
        if column.dtype.kind in "iu":
            return column.nunique() <= thresh
        return column.dtype == object

    return [name for name in d.columns if _is_categorical(name)]

# Partition the features: whatever is not detected as categorical is numeric.
categorical_feats = detect_cats(df[features])
numeric_feats = [name for name in features if name not in categorical_feats]

# The numeric preprocessing needs at least one column, so add a constant
# placeholder to every loaded frame when no numeric feature exists.
if not numeric_feats:
    for _frame in (df, df_test):
        if _frame is not None:
            _frame["_dummy_num"] = 0.0
    numeric_feats = ["_dummy_num"]
    features.append("_dummy_num")

print("Numeric features:", numeric_feats)
print("Categorical features:", categorical_feats)

# -------------------- Per-fold transform (no leakage) --------------------
def transform_local(X_tr, X_val, X_test=None):
    """Fit preprocessing on the training split only and apply it to every split.

    Numeric columns (module-level ``numeric_feats``) are median-imputed and then
    standardised. Categorical columns (module-level ``categorical_feats``) are
    ordinal-encoded, with categories unseen during fitting mapped to -1. All
    transformers are fitted on ``X_tr`` and only applied to ``X_val`` / ``X_test``.

    Parameters
    ----------
    X_tr, X_val : pandas.DataFrame
        Training and validation features for the current fold.
    X_test : pandas.DataFrame or None
        Optional test features, transformed with the fold's fitted objects.

    Returns
    -------
    tuple
        ``(X_tr_t, X_val_t, X_test_t, fitted)``. The three frames have a fresh
        0..n-1 index with numeric columns first, then categorical columns.
        ``X_test_t`` is ``None`` when ``X_test`` is ``None``. ``fitted`` is a
        dict with keys ``"num_imp"``, ``"scaler"`` and ``"ord_enc"`` (the last
        is ``None`` when there are no categorical features).
    """
    num_imp = SimpleImputer(strategy="median")
    scaler = StandardScaler()
    ord_enc = None
    if len(categorical_feats) > 0:
        ord_enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)

    # numeric: impute with the training medians
    Xn_tr = pd.DataFrame(num_imp.fit_transform(X_tr[numeric_feats]), columns=numeric_feats, index=X_tr.index)
    Xn_val = pd.DataFrame(num_imp.transform(X_val[numeric_feats]), columns=numeric_feats, index=X_val.index)
    Xn_test = None
    if X_test is not None:
        Xn_test = pd.DataFrame(num_imp.transform(X_test[numeric_feats]), columns=numeric_feats, index=X_test.index)

    # numeric: standardise with the training mean / std
    Xn_tr[numeric_feats] = scaler.fit_transform(Xn_tr[numeric_feats])
    Xn_val[numeric_feats] = scaler.transform(Xn_val[numeric_feats])
    if Xn_test is not None:
        Xn_test[numeric_feats] = scaler.transform(Xn_test[numeric_feats])

    # categorical
    if ord_enc is not None:
        Xc_tr = pd.DataFrame(ord_enc.fit_transform(X_tr[categorical_feats]), columns=categorical_feats, index=X_tr.index)
        Xc_val = pd.DataFrame(ord_enc.transform(X_val[categorical_feats]), columns=categorical_feats, index=X_val.index)
        Xc_test = pd.DataFrame(ord_enc.transform(X_test[categorical_feats]), columns=categorical_feats, index=X_test.index) if X_test is not None else None
    else:
        Xc_tr = pd.DataFrame(index=X_tr.index)
        Xc_val = pd.DataFrame(index=X_val.index)
        # Use an empty frame (not None) when a test set exists, so the concat
        # below does not call .reset_index on None.
        Xc_test = pd.DataFrame(index=X_test.index) if X_test is not None else None

    # Reset indices before the column-wise concat so rows align by position.
    X_tr_t = pd.concat([Xn_tr.reset_index(drop=True), Xc_tr.reset_index(drop=True)], axis=1)
    X_val_t = pd.concat([Xn_val.reset_index(drop=True), Xc_val.reset_index(drop=True)], axis=1)
    X_test_t = pd.concat([Xn_test.reset_index(drop=True), Xc_test.reset_index(drop=True)], axis=1) if Xn_test is not None else None

    return X_tr_t, X_val_t, X_test_t, {"num_imp": num_imp, "scaler": scaler, "ord_enc": ord_enc}

# -------------------- Model wrappers --------------------
def train_xgb(X_tr, y_tr, X_val, y_val, params=None):
    """Train an XGBoost booster with early stopping on the validation set.

    Parameters
    ----------
    X_tr, y_tr : array-like
        Training features and labels.
    X_val, y_val : array-like
        Validation features and labels. The validation set is listed last in
        ``evals``.
    params : dict or None
        Overrides merged on top of the built-in defaults.

    Returns
    -------
    xgboost.Booster
        The booster returned by ``xgb.train``.

    Raises
    ------
    RuntimeError
        If XGBoost could not be imported.
    """
    if not HAS_XGB:
        raise RuntimeError("XGBoost not installed.")

    booster_params = {
        "objective": "binary:logistic",
        "eval_metric": "auc",
        "seed": SEED,
        "verbosity": 0,
        "nthread": 4,
        "tree_method": "hist",
    }
    booster_params.update(params or {})

    train_matrix = xgb.DMatrix(X_tr, label=y_tr)
    valid_matrix = xgb.DMatrix(X_val, label=y_val)
    watchlist = [(train_matrix, "train"), (valid_matrix, "valid")]

    return xgb.train(
        booster_params,
        train_matrix,
        num_boost_round=MAX_ROUNDS,
        evals=watchlist,
        early_stopping_rounds=EARLY_STOP,
        verbose_eval=False,
    )

def train_lgb(X_tr, y_tr, X_val, y_val, params=None):
    """Train a LightGBM booster with early stopping.

    The callback-based API (``early_stopping`` / ``log_evaluation``) is used
    when the installed LightGBM provides it. If those callbacks cannot be
    imported, the legacy keyword arguments are used instead. Errors raised
    during training itself are not retried: they propagate to the caller, so
    the real failure is reported rather than a secondary error from the
    legacy code path.

    Parameters
    ----------
    X_tr, y_tr : array-like
        Training features and labels.
    X_val, y_val : array-like
        Validation features and labels.
    params : dict or None
        Overrides merged on top of the built-in defaults.

    Returns
    -------
    lightgbm.Booster
        The booster returned by ``lgb.train``.

    Raises
    ------
    RuntimeError
        If LightGBM could not be imported.
    """
    if not HAS_LGB:
        raise RuntimeError("LightGBM not installed.")
    ltrain = lgb.Dataset(X_tr, label=y_tr)
    lval = lgb.Dataset(X_val, label=y_val, reference=ltrain)
    default = {"objective":"binary","metric":"auc","verbosity":-1,"seed":SEED,"n_jobs":4}
    if params: default.update(params)

    try:
        from lightgbm import early_stopping, log_evaluation
    except ImportError:
        # Older LightGBM without the callback helpers: use the legacy keywords.
        return lgb.train(default, ltrain, num_boost_round=MAX_ROUNDS, valid_sets=[ltrain,lval],
                         early_stopping_rounds=EARLY_STOP, verbose_eval=False)

    return lgb.train(default, ltrain, num_boost_round=MAX_ROUNDS, valid_sets=[ltrain,lval],
                     callbacks=[early_stopping(stopping_rounds=EARLY_STOP), log_evaluation(0)])

def train_cat(X_tr, y_tr, X_val, y_val, params=None):
    """Fit a CatBoost classifier with early stopping on the validation set.

    Parameters
    ----------
    X_tr, y_tr : array-like or DataFrame
        Training features and labels.
    X_val, y_val : array-like or DataFrame
        Validation features and labels, passed as ``eval_set``.
    params : dict or None
        Overrides merged on top of the built-in defaults.

    Returns
    -------
    catboost.CatBoostClassifier
        The fitted classifier.

    Raises
    ------
    RuntimeError
        If CatBoost could not be imported.
    """
    if not HAS_CAT:
        raise RuntimeError("CatBoost not installed.")

    settings = {
        "iterations": MAX_ROUNDS,
        "learning_rate": 0.03,
        "depth": 6,
        "loss_function": "Logloss",
        "eval_metric": "AUC",
        "random_seed": SEED,
        "verbose": False,
    }
    settings.update(params or {})

    classifier = cat.CatBoostClassifier(**settings)
    classifier.fit(
        X_tr,
        y_tr,
        eval_set=(X_val, y_val),
        early_stopping_rounds=EARLY_STOP,
        verbose=False,
    )
    return classifier

# default params for models
# Per-library hyper-parameter overrides that are merged over each trainer's
# built-in defaults.
xgb_default = dict(eta=0.02, max_depth=4, subsample=0.8, colsample_bytree=0.8)
lgb_default = dict(
    learning_rate=0.02,
    num_leaves=31,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=5,
)
cat_default = dict(learning_rate=0.02, depth=5)

# optional light tuning with optuna for fold-1
def optuna_light_xgb(X_tr, y_tr, X_val, y_val, n_trials=12):
    """Run a short Optuna search over four XGBoost hyper-parameters.

    Each trial trains a booster via ``train_xgb`` and is scored by ROC AUC on
    ``(X_val, y_val)``. The search maximises that score with a seeded TPE
    sampler.

    Parameters
    ----------
    X_tr, y_tr : array-like
        Training features and labels.
    X_val, y_val : array-like
        Validation features and labels used to score every trial.
    n_trials : int
        Number of Optuna trials.

    Returns
    -------
    dict
        Best parameters found, or ``xgb_default`` when Optuna or XGBoost is
        not available.
    """
    if not HAS_OPTUNA or not HAS_XGB:
        return xgb_default

    def _validation_auc(trial):
        candidate = dict(
            eta=trial.suggest_float("eta", 0.005, 0.1, log=True),
            max_depth=trial.suggest_int("max_depth", 3, 7),
            subsample=trial.suggest_float("subsample", 0.6, 1.0),
            colsample_bytree=trial.suggest_float("colsample_bytree", 0.5, 1.0),
        )
        booster = train_xgb(X_tr, y_tr, X_val, y_val, params=candidate)
        return roc_auc_score(y_val, booster.predict(xgb.DMatrix(X_val)))

    sampler = optuna.samplers.TPESampler(seed=SEED)
    study = optuna.create_study(direction="maximize", sampler=sampler)
    study.optimize(_validation_auc, n_trials=n_trials, n_jobs=1)

    best = study.best_params
    print("Optuna best params:", best)
    return best

# -------------------- CV OOF training --------------------
# Feature matrix and label vector with a clean 0..n-1 index so that the
# positional indices produced by the splitter can be used with .iloc.
X = df[features].reset_index(drop=True)
y = df[TARGET_COL].reset_index(drop=True)

skf = StratifiedKFold(n_splits=NFOLD, shuffle=True, random_state=SEED)
# Out-of-fold predictions: each row is filled once, by the fold that held it out.
oof = np.zeros(len(X), dtype=float)
# One blended test-set prediction vector per fold (stays empty without a test set).
test_fold_preds = []
# Blended validation AUC of every fold.
cv_scores = []

for fold, (tr_idx, val_idx) in enumerate(skf.split(X, y), start=1):
    print(f"\n=== Fold {fold}/{NFOLD} ===")
    X_tr, X_val = X.iloc[tr_idx].reset_index(drop=True), X.iloc[val_idx].reset_index(drop=True)
    y_tr, y_val = y.iloc[tr_idx].reset_index(drop=True), y.iloc[val_idx].reset_index(drop=True)

    # Preprocessing is fitted on this fold's training rows only.
    X_tr_t, X_val_t, X_test_t, fitted_objs = transform_local(X_tr, X_val, X_test=df_test[features] if df_test is not None else None)

    # Validation predictions of the models that trained successfully in this
    # fold, with their matching blend weights (same order).
    preds_components = []
    weights = []

    # XGBoost
    test_pred_x = None
    if HAS_XGB:
        try:
            # NOTE(review): in fold 1 the hyper-parameters are tuned against this
            # fold's validation split, which is then also used to score the fold,
            # so the fold-1 AUC is optimistic. The tuned parameters are not reused
            # in later folds, which fall back to xgb_default.
            if fold == 1 and HAS_OPTUNA:
                params_x = optuna_light_xgb(X_tr_t.values, y_tr.values, X_val_t.values, y_val.values, n_trials=12)
            else:
                params_x = xgb_default
            m_x = train_xgb(X_tr_t.values, y_tr.values, X_val_t.values, y_val.values, params=params_x)
            pv = m_x.predict(xgb.DMatrix(X_val_t.values))
            preds_components.append(pv); weights.append(WEIGHT_XGB)
            if X_test_t is not None:
                test_pred_x = m_x.predict(xgb.DMatrix(X_test_t.values))
        except Exception as e:
            # Best effort: a failing model is reported and left out of the blend.
            print("XGB fold error:", e)

    # LightGBM
    test_pred_l = None
    if HAS_LGB:
        try:
            m_l = train_lgb(X_tr_t.values, y_tr.values, X_val_t.values, y_val.values, params=lgb_default)
            # Predict with the early-stopped iteration when one is recorded.
            best_it = getattr(m_l, "best_iteration", None)
            if best_it and best_it > 0:
                pv = m_l.predict(X_val_t.values, num_iteration=best_it)
            else:
                pv = m_l.predict(X_val_t.values)
            preds_components.append(pv); weights.append(WEIGHT_LGB)
            if X_test_t is not None:
                test_pred_l = m_l.predict(X_test_t.values, num_iteration=best_it) if best_it else m_l.predict(X_test_t.values)
        except Exception as e:
            print("LGB fold error:", e)

    # CatBoost (receives the DataFrames directly rather than .values)
    test_pred_c = None
    if HAS_CAT:
        try:
            m_c = train_cat(X_tr_t, y_tr.values, X_val_t, y_val.values, params=cat_default)
            # Column 1 of predict_proba is the positive-class probability.
            pv = m_c.predict_proba(X_val_t)[:,1]
            preds_components.append(pv); weights.append(WEIGHT_CAT)
            if X_test_t is not None:
                test_pred_c = m_c.predict_proba(X_test_t)[:,1]
        except Exception as e:
            print("CAT fold error:", e)

    if len(preds_components) == 0:
        raise RuntimeError("No models trained this fold. Install xgboost, lightgbm or catboost.")

    # Re-normalise the weights over the models that actually produced predictions.
    weights = np.array(weights, dtype=float)
    weights = weights / weights.sum()

    # Weighted average of the per-model validation predictions.
    fold_preds = np.zeros_like(preds_components[0], dtype=float)
    for p,w in zip(preds_components, weights):
        fold_preds += p * w

    oof[val_idx] = fold_preds
    auc_fold = roc_auc_score(y_val, fold_preds)
    cv_scores.append(auc_fold)
    print(f"Fold {fold} AUC: {auc_fold:.6f}")

    # Blend this fold's test predictions the same way, using only the models
    # that produced a test prediction.
    test_components = []
    test_weights = []
    if test_pred_x is not None:
        test_components.append(test_pred_x); test_weights.append(WEIGHT_XGB)
    if test_pred_l is not None:
        test_components.append(test_pred_l); test_weights.append(WEIGHT_LGB)
    if test_pred_c is not None:
        test_components.append(test_pred_c); test_weights.append(WEIGHT_CAT)
    if len(test_components) > 0:
        tw = np.array(test_weights, dtype=float)
        tw = tw / tw.sum()
        tpred = np.zeros_like(test_components[0], dtype=float)
        for p,w in zip(test_components, tw):
            tpred += p * w
        test_fold_preds.append(tpred)

    # Drop this fold's transformed matrices before the next iteration.
    del X_tr_t, X_val_t
    gc.collect()

# CV summary: per-fold AUCs, their mean, and the AUC of the pooled
# out-of-fold predictions; the OOF predictions are also written to OUT_DIR.
print("\nCV AUCs:", list(map(lambda score: round(score, 6), cv_scores)))
print("Mean CV AUC:", np.mean(cv_scores))

oof_auc = roc_auc_score(y, oof)
print("OOF AUC:", oof_auc)

oof_path = os.path.join(OUT_DIR,"oof_preds.csv")
pd.DataFrame({"oof":oof, "target":y}).to_csv(oof_path, index=False)

# -------------------- Final full-data retrain + test predict --------------------
print("\nRetraining on full data and producing final test predictions...")

# Fresh transformers fitted on the complete training set
# (the per-fold ones were fitted on fold subsets only).
full_num_imp = SimpleImputer(strategy="median")
full_scaler = StandardScaler()
full_ord = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1) if len(categorical_feats)>0 else None

X_full = df[features].reset_index(drop=True)
y_full = df[TARGET_COL].reset_index(drop=True)

# Numeric columns: median-impute, then standardise. Categorical columns: ordinal-encode.
Xn_full = pd.DataFrame(full_num_imp.fit_transform(X_full[numeric_feats]), columns=numeric_feats)
Xn_full[numeric_feats] = full_scaler.fit_transform(Xn_full[numeric_feats])
if full_ord is not None:
    Xc_full = pd.DataFrame(full_ord.fit_transform(X_full[categorical_feats]), columns=categorical_feats)
else:
    Xc_full = pd.DataFrame(index=Xn_full.index)
X_full_trans = pd.concat([Xn_full.reset_index(drop=True), Xc_full.reset_index(drop=True)], axis=1)

# Transform the test set with the full-data transformers. When there is no test
# frame, X_test_trans is None and no final model prediction is produced below.
if df_test is not None:
    Xn_test = pd.DataFrame(full_num_imp.transform(df_test[numeric_feats]), columns=numeric_feats)
    Xn_test[numeric_feats] = full_scaler.transform(Xn_test[numeric_feats])
    if full_ord is not None:
        Xc_test = pd.DataFrame(full_ord.transform(df_test[categorical_feats]), columns=categorical_feats)
    else:
        Xc_test = pd.DataFrame(index=Xn_test.index)
    X_test_trans = pd.concat([Xn_test.reset_index(drop=True), Xc_test.reset_index(drop=True)], axis=1)
else:
    X_test_trans = None

# Test predictions of each successfully retrained model, with matching blend weights.
final_components = []
final_weights = []

# NOTE(review): every full-data model below receives the training matrix as its
# validation set as well, so early stopping monitors the training-set metric
# rather than held-out performance. The models are also trained even when
# X_test_trans is None, in which case they are never used for prediction.

# XGB full
if HAS_XGB:
    try:
        m_x_full = train_xgb(X_full_trans.values, y_full.values, X_full_trans.values, y_full.values, params=xgb_default)
        if X_test_trans is not None:
            pred_x_test = m_x_full.predict(xgb.DMatrix(X_test_trans.values))
            final_components.append(pred_x_test); final_weights.append(WEIGHT_XGB)
    except Exception as e:
        # Best effort: report the failure and continue with the remaining models.
        print("XGB full retrain error:", e)

# LGB full
if HAS_LGB:
    try:
        m_l_full = train_lgb(X_full_trans.values, y_full.values, X_full_trans.values, y_full.values, params=lgb_default)
        best_it = getattr(m_l_full, "best_iteration", None)
        if X_test_trans is not None:
            pred_l_test = m_l_full.predict(X_test_trans.values, num_iteration=best_it) if best_it and best_it>0 else m_l_full.predict(X_test_trans.values)
            final_components.append(pred_l_test); final_weights.append(WEIGHT_LGB)
    except Exception as e:
        print("LGB full retrain error:", e)

# CAT full
if HAS_CAT:
    try:
        m_c_full = train_cat(X_full_trans, y_full.values, X_full_trans, y_full.values, params=cat_default)
        if X_test_trans is not None:
            # Column 1 of predict_proba is the positive-class probability.
            pred_c_test = m_c_full.predict_proba(X_test_trans)[:,1]
            final_components.append(pred_c_test); final_weights.append(WEIGHT_CAT)
    except Exception as e:
        print("CAT full retrain error:", e)

# Resolve final_preds:
#   1. no retrained-model predictions but CV test predictions exist -> mean over folds
#   2. neither exists -> None (the submission step picks a fallback)
#   3. otherwise -> weighted blend of the retrained models, clipped to [0, 1]
if len(final_components) == 0 and len(test_fold_preds) > 0:
    final_preds = np.mean(np.vstack(test_fold_preds), axis=0)
    print("No full retrain preds available; using averaged CV-fold test predictions.")
elif len(final_components) == 0:
    final_preds = None
    print("No test predictions available from models or CV. Will attempt fallback submission later.")
else:
    # Re-normalise the weights over the models that produced predictions.
    fw = np.array(final_weights, dtype=float)
    fw = fw / fw.sum()
    final_preds = np.zeros_like(final_components[0], dtype=float)
    for p,w in zip(final_components, fw):
        final_preds += p * w
    final_preds = np.clip(final_preds, 0.0, 1.0)

# -------------------- Robust Submission Builder --------------------
def find_sample_submission(input_root=INPUT_ROOT):
    """Locate a sample-submission CSV somewhere under ``input_root``.

    A file matches when its lower-cased name ends with ``.csv`` and contains
    both ``"sample"`` and ``"sub"``.

    Parameters
    ----------
    input_root : str
        Directory that is searched recursively.

    Returns
    -------
    str or None
        Path of the first match in ``os.walk`` order, or ``None`` when the
        directory does not exist or holds no matching file.
    """
    if not os.path.exists(input_root):
        return None

    def _looks_like_sample(file_name):
        lowered = file_name.lower()
        return "sample" in lowered and "sub" in lowered and lowered.endswith(".csv")

    matches = (
        os.path.join(folder, file_name)
        for folder, _subdirs, file_names in os.walk(input_root)
        for file_name in file_names
        if _looks_like_sample(file_name)
    )
    return next(matches, None)

def build_submission_from_predictions(preds, test_df=None, out_name=None, id_col_candidates=ID_COL_CANDIDATES, target_col=TARGET_COL, as_int=False):
    """Assemble a submission frame from predictions and save it under OUT_DIR.

    Predictions are flattened and clipped to [0, 1]. When ``test_df`` is given
    and the lengths differ, the predictions are trimmed, or padded with their
    mean, to ``len(test_df)``. This happens before the frame is built, so the
    id and prediction columns always have the same length. A constant vector
    then gets a tiny jitter. The 0/1 thresholding requested by ``as_int`` is
    applied last, so it is never undone by the length fix.

    Parameters
    ----------
    preds : array-like
        Predicted probabilities.
    test_df : pandas.DataFrame or None
        Frame supplying the ids (a test set or a sample submission). With
        ``None``, positional indices are used as a placeholder ``id`` column.
    out_name : str or None
        Output file name; a timestamped name is generated when ``None``.
    id_col_candidates : list of str
        Column names tried, in order, as the id column of ``test_df``. If none
        is present, the frame's index is used under the name ``id``.
    target_col : str
        Name of the prediction column.
    as_int : bool
        When true, predictions are converted to 0/1 with a 0.5 threshold.

    Returns
    -------
    tuple
        ``(out_path, sub)``: the written file path and the submission frame.

    Raises
    ------
    RuntimeError
        If ``preds`` is ``None`` or empty.
    ValueError
        If the assembled submission contains NaN values.
    """
    if preds is None:
        raise RuntimeError("No prediction array provided for submission.")
    preds = np.asarray(preds).ravel()
    if preds.size == 0:
        # Nothing to clip, pad from or jitter; fail with a clear message
        # instead of an IndexError further down.
        raise RuntimeError("No prediction array provided for submission.")
    preds = np.clip(preds, 0.0, 1.0)

    # Reconcile lengths first, so the DataFrame constructor below never sees
    # columns of different lengths.
    if test_df is not None and len(preds) != len(test_df):
        n_rows = len(test_df)
        if len(preds) > n_rows:
            preds = preds[:n_rows]
            print("Trimmed predictions to match test length.")
        else:
            pad_len = n_rows - len(preds)
            preds = np.concatenate([preds, np.full(pad_len, preds.mean())])
            print(f"Padded predictions by {pad_len} to match test length.")

    # Break exact ties in a constant prediction vector with a negligible jitter.
    if len(preds) > 0 and np.all(preds == preds[0]):
        preds = preds + np.linspace(-1e-9, 1e-9, len(preds))
        preds = np.clip(preds, 0.0, 1.0)

    if test_df is None:
        # placeholder using indices
        id_name, id_values = "id", np.arange(len(preds))
        print("WARNING: No test file / sample_submission found. Saving placeholder submission using indices as 'id'. This will likely not match the official test set.")
    else:
        id_col = next((c for c in id_col_candidates if c in test_df.columns), None)
        if id_col is not None:
            id_name, id_values = id_col, test_df[id_col].values
        else:
            # use index as id
            id_name, id_values = "id", test_df.index.values

    sub = pd.DataFrame({id_name: id_values, target_col: preds})

    if as_int:
        sub[target_col] = (sub[target_col] >= 0.5).astype(int)

    if sub.isnull().any().any():
        raise ValueError("Submission contains NaN values. Aborting save.")

    if out_name is None:
        out_name = f"submission_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
    out_path = os.path.join(OUT_DIR, out_name)
    sub.to_csv(out_path, index=False)
    print("Saved submission to:", out_path)
    print("Submission preview:\n", sub.head())
    return out_path, sub

# Decide which preds to use (prefer final_preds, else averaged CV test folds, else OOF fallback)
# Choose the prediction vector, in order of preference:
# final blended predictions, mean of the CV-fold test predictions, OOF placeholder.
if final_preds is not None:
    preds_to_use = final_preds
elif test_fold_preds:
    preds_to_use = np.mean(np.vstack(test_fold_preds), axis=0)
elif 'oof' in globals():
    print("No test predictions available; using OOF as placeholder predictions.")
    preds_to_use = oof
else:
    preds_to_use = None
    raise RuntimeError("No predictions available for submission. Run training first.")

# Choose the frame that supplies the ids: the explicit test set when it was
# loaded, otherwise a sample_submission file if one can be found and read.
if df_test is not None:
    test_df_for_submission = df_test.copy()
else:
    test_df_for_submission = None
    sample_path = find_sample_submission()
    if sample_path:
        print("Found sample_submission at:", sample_path)
        try:
            test_df_for_submission = pd.read_csv(sample_path).copy()
        except Exception as e:
            print("Failed to read sample_submission:", e)

# Any length mismatch between predictions and ids is left to the builder.
submission_name = f"submission_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
out_path, submission_df = build_submission_from_predictions(
    preds_to_use,
    test_df=test_df_for_submission,
    out_name=submission_name,
    target_col=TARGET_COL,
    as_int=AS_INTEGER_SUBMISSION,
)

# Final report; NaN is shown for a score that is not available.
mean_cv_auc_shown = np.mean(cv_scores) if len(cv_scores)>0 else float('nan')
oof_auc_shown = oof_auc if 'oof_auc' in globals() else float('nan')

print("\n--- Done ---")
print("Artifacts directory:", OUT_DIR)
print("Mean CV AUC: {:.6f} | OOF AUC: {:.6f}".format(mean_cv_auc_shown, oof_auc_shown))
print("Submission file:", out_path)
print("If Kaggle rejects the submission, paste the exact error message and I will adapt the CSV format accordingly.")


[I 2025-10-22 04:02:35,936] A new study created in memory with name: no-name-0821fbae-f550-4822-a1e7-a3737cba0803
[I 2025-10-22 04:02:35,996] Trial 0 finished with value: 0.7321428571428571 and parameters: {'eta': 0.015355286838886862, 'max_depth': 7, 'subsample': 0.892797576724562, 'colsample_bytree': 0.7993292420985183}. Best is trial 0 with value: 0.7321428571428571.
[I 2025-10-22 04:02:36,034] Trial 1 finished with value: 0.7388392857142857 and parameters: {'eta': 0.007979118876474874, 'max_depth': 3, 'subsample': 0.6232334448672797, 'colsample_bytree': 0.9330880728874675}. Best is trial 1 with value: 0.7388392857142857.
[I 2025-10-22 04:02:36,088] Trial 2 finished with value: 0.7321428571428572 and parameters: {'eta': 0.03027182927734624, 'max_depth': 6, 'subsample': 0.608233797718321, 'colsample_bytree': 0.9849549260809971}. Best is trial 1 with value: 0.7388392857142857.


Renamed target column: condition -> target
Numeric features: ['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'age_chol', 'age_thalach', 'chol_trestbps_ratio']
Categorical features: ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal', 'age_bin', 'ca_missing', 'thal_missing']

=== Fold 1/10 ===


[I 2025-10-22 04:02:36,128] Trial 3 finished with value: 0.71875 and parameters: {'eta': 0.060534484680010825, 'max_depth': 4, 'subsample': 0.6727299868828402, 'colsample_bytree': 0.5917022549267169}. Best is trial 1 with value: 0.7388392857142857.
[I 2025-10-22 04:02:36,175] Trial 4 finished with value: 0.7276785714285714 and parameters: {'eta': 0.012439367209907218, 'max_depth': 5, 'subsample': 0.7727780074568463, 'colsample_bytree': 0.645614570099021}. Best is trial 1 with value: 0.7388392857142857.
[I 2025-10-22 04:02:36,212] Trial 5 finished with value: 0.7366071428571428 and parameters: {'eta': 0.03126143958203108, 'max_depth': 3, 'subsample': 0.7168578594140873, 'colsample_bytree': 0.6831809216468459}. Best is trial 1 with value: 0.7388392857142857.
[I 2025-10-22 04:02:36,266] Trial 6 finished with value: 0.7053571428571429 and parameters: {'eta': 0.019603369861210685, 'max_depth': 6, 'subsample': 0.6798695128633439, 'colsample_bytree': 0.7571172192068059}. Best is trial 1 with 

Optuna best params: {'eta': 0.029493012052163467, 'max_depth': 3, 'subsample': 0.8430179407605753, 'colsample_bytree': 0.5852620618436457}
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.904217	valid_1's auc: 0.850446
Fold 1 AUC: 0.758929

=== Fold 2/10 ===
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[31]	training's auc: 0.942553	valid_1's auc: 0.866071
Fold 2 AUC: 0.875000

=== Fold 3/10 ===
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[20]	training's auc: 0.943626	valid_1's auc: 0.9375
Fold 3 AUC: 0.915179

=== Fold 4/10 ===
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[52]	training's auc: 0.96014	valid_1's auc: 0.754464
Fold 4 AUC: 0.732143

=== Fold 5/10 ===
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[30]	trai