In [1]:
# ==============================================================================
# 1. LIBRARIES
# ==============================================================================
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

# ==============================================================================
# 2. LOAD DATA
# ==============================================================================
INPUT_PATH = "/kaggle/input/playground-series-s5e12/"

# Competition files: the training table and the test table to be scored.
train_main = pd.read_csv(INPUT_PATH + "train.csv")
test_df = pd.read_csv(INPUT_PATH + "test.csv")

# External dataset that is appended to the competition training rows below.
new_df = pd.read_csv("/kaggle/input/diabetes-health-indicators-dataset/diabetes_dataset.csv")

# Remove these columns from the external data before merging.
# NOTE(review): presumably they are missing from the competition schema
# and/or leak the target -- confirm against train.csv.
new_df = new_df.drop(
    columns=[
        "glucose_fasting",
        "glucose_postprandial",
        "insulin_level",
        "hba1c",
        "diabetes_risk_score",
        "diabetes_stage",
    ]
)

# External rows get the placeholder id -1; "id" is dropped before modelling.
new_df["id"] = -1

# Stack both sources and renumber the rows 0..n-1 so positional fold indices
# line up with the frame's index.
train_df = pd.concat([train_main, new_df], axis=0, ignore_index=True)

target = "diagnosed_diabetes"

# ==============================================================================
# 3. FEATURE ENGINEERING
# ==============================================================================
def engineer_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with derived feature columns appended.

    The input frame is not modified. Added columns:

    * ``pulse_pressure``  -- systolic minus diastolic blood pressure
    * ``bp_ratio``        -- systolic / (diastolic + 1)
    * ``sedentary_ratio`` -- screen hours per day / (activity minutes per week + 1)
    * ``chol_hdl_ratio``  -- total cholesterol / HDL
    * ``lipid_risk``      -- (total + LDL + triglycerides) / HDL
    * ``bmi_age``         -- BMI * age
    * ``waist_bmi``       -- waist-to-hip ratio * BMI
    * ``history_sum``     -- row-wise sum of the three history flag columns

    Rows whose ``hdl_cholesterol`` is exactly 0 get NaN (not +/-inf) in the
    two HDL ratios.

    Args:
        df: Frame containing the raw columns referenced above.

    Returns:
        A new DataFrame with the original columns plus the derived ones.

    Raises:
        KeyError: If a required input column is missing.
    """
    df = df.copy()

    # The other ratios add 1 to the denominator; the HDL ratios divide by the
    # raw value. Mask a zero HDL to NaN so the division yields a missing value
    # instead of inf (infinite inputs are rejected by XGBoost by default).
    hdl = df["hdl_cholesterol"].where(df["hdl_cholesterol"] != 0)

    df["pulse_pressure"] = df["systolic_bp"] - df["diastolic_bp"]
    df["bp_ratio"] = df["systolic_bp"] / (df["diastolic_bp"] + 1)
    df["sedentary_ratio"] = df["screen_time_hours_per_day"] / (df["physical_activity_minutes_per_week"] + 1)
    df["chol_hdl_ratio"] = df["cholesterol_total"] / hdl
    df["lipid_risk"] = (
        df["cholesterol_total"] +
        df["ldl_cholesterol"] +
        df["triglycerides"]
    ) / hdl
    df["bmi_age"] = df["bmi"] * df["age"]
    df["waist_bmi"] = df["waist_to_hip_ratio"] * df["bmi"]

    history_cols = [
        "family_history_diabetes",
        "hypertension_history",
        "cardiovascular_history"
    ]
    df["history_sum"] = df[history_cols].sum(axis=1)

    return df

# Derive the same extra columns on the training and the test frame, in that order.
train_df, test_df = (engineer_features(frame) for frame in (train_df, test_df))

# ==============================================================================
# 4. FEATURE PREPARATION
# ==============================================================================
drop_cols = ["id", target]
# Object-dtype columns of the training frame are treated as categorical.
cat_cols = train_df.select_dtypes(include="object").columns.tolist()

X_cat = train_df.drop(columns=drop_cols)
y = train_df[target]
X_test_cat = test_df.drop(columns=["id"])

# Ensure strings for CatBoost (missing values become the literal "nan")
for c in cat_cols:
    X_cat[c] = X_cat[c].astype(str)
    X_test_cat[c] = X_test_cat[c].astype(str)

# CatBoost receives the categorical columns as positional indices.
cat_features_idx = [X_cat.columns.get_loc(c) for c in cat_cols]

# One-hot encoding for LGB/XGB.
# Encode train and test together: with drop_first=True, encoding them
# separately drops a different baseline level whenever the two frames do not
# contain exactly the same categories, and a later zero-filled align would
# then silently map some test rows onto the wrong category.
n_train_rows = len(X_cat)
combined_ohe = pd.get_dummies(
    pd.concat([X_cat, X_test_cat], axis=0, ignore_index=True),
    drop_first=True,
)
X_ohe = combined_ohe.iloc[:n_train_rows].reset_index(drop=True)
X_test_ohe = combined_ohe.iloc[n_train_rows:].reset_index(drop=True)
del combined_ohe  # release the stacked copy; only the two splits are needed

# ==============================================================================
# 5. MODELS
# ==============================================================================
# CatBoost consumes the raw (string) categorical columns directly; they are
# identified by the positional indices in cat_features_idx.
cat_model = CatBoostClassifier(
    iterations=1500,
    depth=7,
    learning_rate=0.03,
    l2_leaf_reg=6,
    eval_metric="AUC",
    loss_function="Logloss",
    cat_features=cat_features_idx,
    random_seed=42,
    verbose=0
)

# LightGBM is trained on the one-hot encoded matrix.
lgb_model = LGBMClassifier(
    n_estimators=1200,
    learning_rate=0.03,
    num_leaves=31,
    subsample=0.8,
    # Row subsampling is only active when subsample_freq > 0; with the
    # default of 0 the subsample=0.8 setting above is silently ignored.
    subsample_freq=1,
    colsample_bytree=0.8,
    random_state=42
)

# XGBoost is trained on the one-hot encoded matrix.
xgb_model = XGBClassifier(
    n_estimators=1000,
    learning_rate=0.03,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="logloss",
    random_state=42
)

# ==============================================================================
# 6. STRATIFIED K-FOLD CV
# ==============================================================================
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Out-of-fold predictions: one slot per training row, filled by the fold in
# which that row is held out.
oof_cat = np.zeros(len(X_cat))
oof_lgb = np.zeros(len(X_cat))
oof_xgb = np.zeros(len(X_cat))

# Test predictions, accumulated as the mean over all folds.
test_cat = np.zeros(len(X_test_cat))
test_lgb = np.zeros(len(X_test_cat))
test_xgb = np.zeros(len(X_test_cat))

# (model, training matrix, test matrix, OOF buffer, test accumulator).
# CatBoost uses the raw categorical frame, LightGBM/XGBoost the one-hot one.
fold_jobs = [
    (cat_model, X_cat, X_test_cat, oof_cat, test_cat),
    (lgb_model, X_ohe, X_test_ohe, oof_lgb, test_lgb),
    (xgb_model, X_ohe, X_test_ohe, oof_xgb, test_xgb),
]

for fold, (tr, val) in enumerate(skf.split(X_cat, y)):
    print(f"🔹 Fold {fold+1}")

    y_tr = y.iloc[tr]
    for model, X_fit, X_pred, oof_buf, test_acc in fold_jobs:
        # The same estimator object is refit on each fold's training rows.
        model.fit(X_fit.iloc[tr], y_tr)
        oof_buf[val] = model.predict_proba(X_fit.iloc[val])[:, 1]
        # Must stay an in-place "+=": it updates the module-level test_*
        # array that this local name refers to.
        test_acc += model.predict_proba(X_pred)[:, 1] / skf.n_splits

# ==============================================================================
# 7. ENSEMBLE
# ==============================================================================
# Blend weights for (CatBoost, LightGBM, XGBoost). Defined once so the OOF
# blend and the test blend cannot drift apart.
w_cat, w_lgb, w_xgb = 0.5, 0.3, 0.2

oof_ensemble = w_cat * oof_cat + w_lgb * oof_lgb + w_xgb * oof_xgb
# y holds every row of train_df, so this score covers the competition rows
# and the appended external rows together.
cv_auc = roc_auc_score(y, oof_ensemble)
print(f"\n🔥 FINAL CV ROC-AUC: {cv_auc:.4f}")

final_test_pred = w_cat * test_cat + w_lgb * test_lgb + w_xgb * test_xgb

# Submission: one blended probability per test id.
submission = pd.DataFrame({
    "id": test_df["id"],
    "diagnosed_diabetes": final_test_pred
})

submission.to_csv("submission.csv", index=False)
print("\n✅ submission.csv created")


🔹 Fold 1
[LightGBM] [Info] Number of positive: 446674, number of negative: 273326
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.088387 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3458
[LightGBM] [Info] Number of data points in the train set: 720000, number of used features: 44
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.620381 -> initscore=0.491164
[LightGBM] [Info] Start training from score 0.491164
🔹 Fold 2
[LightGBM] [Info] Number of positive: 446674, number of negative: 273326
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.161943 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3456
[LightGBM] [Info] Number of data points in the train set: 720000, number of used features: 44
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.6203