<a href="https://www.kaggle.com/code/samithsachidanandan/predict-heart-disease-xgb-lgb-cat-ensemble?scriptVersionId=295987961" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Importing Libraries and Loading the Data

In [1]:


import pandas as pd
import numpy as np
from scipy import stats

import warnings

warnings.filterwarnings("ignore")
np.random.seed(42)

# Load data
train_file = "/kaggle/input/playground-series-s6e2/train.csv"
test_file = "/kaggle/input/playground-series-s6e2/test.csv"

train_df = pd.read_csv(train_file)
test_df = pd.read_csv(test_file)

submission_df = pd.read_csv("/kaggle/input/playground-series-s6e2/sample_submission.csv")

TARGET = "Heart Disease"
ID_COL = "id"

train_df.shape, test_df.shape

((630000, 15), (270000, 14))

# Base features

In [2]:
base_features = [col for col in train_df.columns if col not in [TARGET, ID_COL]]


CATS = train_df.select_dtypes("object").columns.to_list()
print("CATS:", CATS)

CATS: ['Heart Disease']


In [3]:
train_df.head()

Unnamed: 0,id,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium,Heart Disease
0,0,58,1,4,152,239,0,0,158,1,3.6,2,2,7,Presence
1,1,52,1,1,125,325,0,2,171,0,0.0,1,0,3,Absence
2,2,56,0,2,160,188,0,2,151,0,0.0,1,0,3,Absence
3,3,44,0,3,134,229,0,2,150,0,1.0,2,0,3,Absence
4,4,58,1,4,140,234,0,2,125,1,3.8,2,3,3,Presence


# Feature Engineering 

In [4]:
def preprocess(df):
    
    df = df.copy()

    df['BP_Category'] = pd.cut(df['BP'], 
                               bins=[0, 120, 130, 140, np.inf],
                               labels=['Normal', 'Elevated', 'Stage1_HTN', 'Stage2_HTN'])
    
    df['Cholesterol_Risk'] = pd.cut(df['Cholesterol'],
                                     bins=[0, 200, 240, np.inf],
                                     labels=['Desirable', 'Borderline', 'High'])
    

    df['Pulse_Pressure'] = df['Max HR'] - df['BP']
    df['MAP'] = (df['BP'] + (2 * (df['Max HR'] * 0.4))) / 3
    
    df['BP_Cholesterol_Interaction'] = df['BP'] * df['Cholesterol'] / 1000
    df['Age_BP_Risk'] = df['Age'] * df['BP'] / 100
    df['Metabolic_Burden'] = (df['BP'] + df['Cholesterol'] / 100 + df['Max HR'] / 100) / 3
    

    df['High_BP_High_Chol'] = ((df['BP'] > 140) & (df['Cholesterol'] > 240)).astype(int)
    df['Age_Risk_Factor'] = (df['Age'] > 50).astype(int) * (df['BP'] > 130).astype(int)

    df['Cholesterol_per_100_BP'] = df['Cholesterol'] / (df['BP'] + 1)
    df['Age_normalized_HR'] = df['Max HR'] / (df['Age'] + 1)
    

    df['HR_BP_Ratio'] = df['Max HR'] / (df['BP'] + 1)
    df['Cardiac_Load_Index'] = (df['Max HR'] * df['BP']) / 10000
    

    df['FBS_Risk'] = (df['FBS over 120'] == 1).astype(int)
    
   
    df['Age_Squared'] = df['Age'] ** 2
    df['Age_Cubed'] = df['Age'] ** 3
    df['BP_Squared'] = df['BP'] ** 2

    numeric_cols = ['Age', 'BP', 'Cholesterol', 'Max HR']
    for col in numeric_cols:
        df[f'{col}_ZScore'] = np.abs(stats.zscore(df[col]))
    

    df['Abnormal_Count'] = (
        (df['BP'] > 140).astype(int) +
        (df['Cholesterol'] > 240).astype(int) +
        (df['FBS over 120'] == 1).astype(int) +
        (df['EKG results'] > 0).astype(int) +
        (df['Exercise angina'] == 1).astype(int) +
        (df['ST depression'] > 1.0).astype(int)
    )
    

    df['Is_Male'] = df['Sex']
    df['Is_Female'] = 1 - df['Sex']
    

    chest_pain_severity = {1: 'Typical_Angina', 2: 'Atypical_Angina', 
                          3: 'Non_Anginal_Pain', 4: 'Asymptomatic'}
    df['Chest pain type_Encoded'] = df['Chest pain type'].map(chest_pain_severity)
    chest_pain_dummies = pd.get_dummies(df['Chest pain type_Encoded'], prefix='CPT')
    df = pd.concat([df, chest_pain_dummies], axis=1)
    

    df['Risk_Profile'] = 'Low'
    mask_moderate = (df['BP'] > 130) | (df['Cholesterol'] > 200)
    mask_high = (df['BP'] > 140) & (df['Cholesterol'] > 240)
    df.loc[mask_moderate, 'Risk_Profile'] = 'Moderate'
    df.loc[mask_high, 'Risk_Profile'] = 'High'
    
    risk_dummies = pd.get_dummies(df['Risk_Profile'], prefix='Risk')
    df = pd.concat([df, risk_dummies], axis=1)

    df['HR_Reserve'] = 220 - df['Age'] - df['Max HR']
    df['HR_Reserve_Ratio'] = df['Max HR'] / (220 - df['Age'] + 1)
    

    df['Cardiac_Efficiency'] = (220 - df['Age']) / (df['Cardiac_Load_Index'] + 1)
    
    return df

In [5]:
train_df = preprocess(train_df)
test_df = preprocess(test_df)

# Spliting the Target

In [6]:
y_train = train_df['Heart Disease']
X_train = train_df.drop('Heart Disease', axis=1)

X_test = test_df.copy()

In [7]:
if 'id' in X_train.columns:
    X_train = X_train.drop('id', axis=1)
if 'id' in X_test.columns:
    X_test = X_test.drop('id', axis=1)

print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}\n")

X_train shape: (630000, 49)
y_train shape: (630000,)
X_test shape: (270000, 49)



# One - Hot Endcoding 

In [8]:
def prepare_data(X_train, X_test):
    
    X_train = X_train.copy()
    X_test = X_test.copy()
    

    bool_cols = X_train.select_dtypes(include=['bool']).columns.tolist()
    if bool_cols:

        for col in bool_cols:
            X_train[col] = X_train[col].astype(int)
            X_test[col] = X_test[col].astype(int)


    category_cols = X_train.select_dtypes(include=['category']).columns.tolist()
    if category_cols:
     
        X_train = pd.get_dummies(X_train, columns=category_cols, prefix=None, drop_first=False)
        X_test = pd.get_dummies(X_test, columns=category_cols, prefix=None, drop_first=False)

    

    object_cols = X_train.select_dtypes(include=['object']).columns.tolist()
    if object_cols:

        X_train = pd.get_dummies(X_train, columns=object_cols, prefix=None, drop_first=False)
        X_test = pd.get_dummies(X_test, columns=object_cols, prefix=None, drop_first=False)

    

    
    return X_train, X_test


In [9]:
X_train, X_test = prepare_data(X_train, X_test)

In [10]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 630000 entries, 0 to 629999
Data columns (total 59 columns):
 #   Column                                    Non-Null Count   Dtype  
---  ------                                    --------------   -----  
 0   Age                                       630000 non-null  int64  
 1   Sex                                       630000 non-null  int64  
 2   Chest pain type                           630000 non-null  int64  
 3   BP                                        630000 non-null  int64  
 4   Cholesterol                               630000 non-null  int64  
 5   FBS over 120                              630000 non-null  int64  
 6   EKG results                               630000 non-null  int64  
 7   Max HR                                    630000 non-null  int64  
 8   Exercise angina                           630000 non-null  int64  
 9   ST depression                             630000 non-null  float64
 10  Slope of ST         

# Model Training

In [11]:
xgb_params = {
    "objective": "binary:logistic",  
    "eval_metric": "auc",            
    "n_estimators": 5000,
    "learning_rate": 0.02,
    "max_depth": 6,
    "subsample": 0.85,
    "colsample_bytree": 0.85,
    "colsample_bynode": 0.7,
    "min_child_weight": 3,
    "reg_lambda": 2.0,
    "gamma": 0.1,
    "tree_method": "hist",
    "device": "cuda",
    "random_state": 42,
    "verbosity": 0,
    "n_jobs": -1,
}

lgb_params = {
    "objective": "binary",
    "metric": "auc",
    "n_estimators": 5000,
    "learning_rate": 0.02,
    "max_depth": 6,
    "subsample": 0.85,
    "colsample_bytree": 0.85,
    "min_child_weight": 3,
    "reg_lambda": 2.0,
    "verbosity": -1,
    "random_state": 42,
    "n_jobs": -1,
}


cat_params = {
    "objective": "Logloss",
    "eval_metric": "AUC",
    "iterations": 5000,
    "learning_rate": 0.02,
    "depth": 6,
    "subsample": 0.85,
    "colsample_bylevel": 0.85,
    "l2_leaf_reg": 2.0,
    "random_state": 42,
    "verbose": 0,
    "thread_count": -1,
}


In [12]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_train = le.fit_transform(y_train)

In [13]:

from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score, roc_curve, auc
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier


n_splits = 5

kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)


oof_xgb = np.zeros(len(X_train))
oof_lgb = np.zeros(len(X_train))
oof_cat = np.zeros(len(X_train))

test_xgb = np.zeros((len(X_test), n_splits))
test_lgb = np.zeros((len(X_test), n_splits))
test_cat = np.zeros((len(X_test), n_splits))

print("="*70)
print(f"Running {n_splits}-Fold Ensemble (XGBoost, LightGBM, CatBoost)")
print("="*70)

for fold, (tr_idx, val_idx) in enumerate(kf.split(X_train), 1):
    X_tr, X_val = X_train.iloc[tr_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train[tr_idx], y_train[val_idx]
    
    print(f"\nFold {fold}/{n_splits}")
    print("-" * 70)
    

    xgb_model = XGBClassifier(**xgb_params)
    xgb_model.fit(
        X_tr, y_tr,
        eval_set=[(X_val, y_val)],
        verbose=False
    )
    
    val_xgb = xgb_model.predict_proba(X_val)[:, 1]
    test_xgb[:, fold-1] = xgb_model.predict_proba(X_test)[:, 1]
    oof_xgb[val_idx] = val_xgb
    roc_auc_xgb = roc_auc_score(y_val, val_xgb)
    print(f"  XGBoost   | ROC-AUC: {roc_auc_xgb:.4f}")
    

    lgb_model = LGBMClassifier(**lgb_params)
    lgb_model.fit(
        X_tr, y_tr,
        eval_set=[(X_val, y_val)],
        callbacks=[],
    )
    
    val_lgb = lgb_model.predict_proba(X_val)[:, 1]
    test_lgb[:, fold-1] = lgb_model.predict_proba(X_test)[:, 1]
    oof_lgb[val_idx] = val_lgb
    roc_auc_lgb = roc_auc_score(y_val, val_lgb)
    print(f"  LightGBM  | ROC-AUC: {roc_auc_lgb:.4f}")
    

    cat_model = CatBoostClassifier(**cat_params)
    cat_model.fit(
        X_tr, y_tr,
        eval_set=[(X_val, y_val)],
        verbose=False,
    )
    
    val_cat = cat_model.predict_proba(X_val)[:, 1]
    test_cat[:, fold-1] = cat_model.predict_proba(X_test)[:, 1]
    oof_cat[val_idx] = val_cat
    roc_auc_cat = roc_auc_score(y_val, val_cat)
    print(f"  CatBoost  | ROC-AUC: {roc_auc_cat:.4f}")






Running 5-Fold Ensemble (XGBoost, LightGBM, CatBoost)

Fold 1/5
----------------------------------------------------------------------
  XGBoost   | ROC-AUC: 0.9542
  LightGBM  | ROC-AUC: 0.9547
  CatBoost  | ROC-AUC: 0.9553

Fold 2/5
----------------------------------------------------------------------
  XGBoost   | ROC-AUC: 0.9544
  LightGBM  | ROC-AUC: 0.9548
  CatBoost  | ROC-AUC: 0.9555

Fold 3/5
----------------------------------------------------------------------
  XGBoost   | ROC-AUC: 0.9551
  LightGBM  | ROC-AUC: 0.9554
  CatBoost  | ROC-AUC: 0.9561

Fold 4/5
----------------------------------------------------------------------
  XGBoost   | ROC-AUC: 0.9540
  LightGBM  | ROC-AUC: 0.9542
  CatBoost  | ROC-AUC: 0.9549

Fold 5/5
----------------------------------------------------------------------
  XGBoost   | ROC-AUC: 0.9541
  LightGBM  | ROC-AUC: 0.9545
  CatBoost  | ROC-AUC: 0.9552


In [14]:
print("\n" + "="*70)
print("Training final models on full training set...")
print("="*70)

final_xgb = XGBClassifier(**xgb_params)
final_xgb.fit(X_train, y_train, verbose=False)

final_lgb = LGBMClassifier(**lgb_params)
final_lgb.fit(X_train, y_train)

final_cat = CatBoostClassifier(**cat_params)
final_cat.fit(X_train, y_train, verbose=False)


y_test_xgb = final_xgb.predict_proba(X_test)[:, 1]
y_test_lgb = final_lgb.predict_proba(X_test)[:, 1]
y_test_cat = final_cat.predict_proba(X_test)[:, 1]


cv_auc_xgb = roc_auc_score(y_train, oof_xgb)
cv_auc_lgb = roc_auc_score(y_train, oof_lgb)
cv_auc_cat = roc_auc_score(y_train, oof_cat)

print("\nCV Results (OOF):")
print(f"  XGBoost   | ROC-AUC: {cv_auc_xgb:.4f}")
print(f"  LightGBM  | ROC-AUC: {cv_auc_lgb:.4f}")
print(f"  CatBoost  | ROC-AUC: {cv_auc_cat:.4f}")


Training final models on full training set...

CV Results (OOF):
  XGBoost   | ROC-AUC: 0.9544
  LightGBM  | ROC-AUC: 0.9547
  CatBoost  | ROC-AUC: 0.9554


In [15]:
print("\n" + "="*70)
print("Creating Ensemble Predictions")
print("="*70)


y_test_ensemble_mean = (y_test_xgb + y_test_lgb + y_test_cat) / 3
oof_ensemble_mean = (oof_xgb + oof_lgb + oof_cat) / 3
cv_auc_ensemble_mean = roc_auc_score(y_train, oof_ensemble_mean)
print(f"Ensemble (Mean) | ROC-AUC: {cv_auc_ensemble_mean:.4f}")


y_test_ensemble_median = np.median([y_test_xgb, y_test_lgb, y_test_cat], axis=0)
oof_ensemble_median = np.median([oof_xgb, oof_lgb, oof_cat], axis=0)
cv_auc_ensemble_median = roc_auc_score(y_train, oof_ensemble_median)
print(f"Ensemble (Median) | ROC-AUC: {cv_auc_ensemble_median:.4f}")


weights = np.array([0.10, 0.20, 0.70])  
y_test_ensemble_weighted = (
    weights[0] * y_test_xgb +
    weights[1] * y_test_lgb +
    weights[2] * y_test_cat
)
oof_ensemble_weighted = (
    weights[0] * oof_xgb +
    weights[1] * oof_lgb +
    weights[2] * oof_cat
)
cv_auc_ensemble_weighted = roc_auc_score(y_train, oof_ensemble_weighted)
print(f"Ensemble (Weighted) | ROC-AUC: {cv_auc_ensemble_weighted:.4f}")


print("\n" + "="*70)
print("Ensemble Comparison:")
print("="*70)
ensemble_results = {
    "Mean": cv_auc_ensemble_mean,
    "Median": cv_auc_ensemble_median,
    "Weighted": cv_auc_ensemble_weighted,
}

best_method = max(ensemble_results, key=ensemble_results.get)
best_score = ensemble_results[best_method]
print(f"Best Ensemble: {best_method} (ROC-AUC: {best_score:.4f})")


if best_method == "Mean":
    y_test_final = y_test_ensemble_mean
elif best_method == "Median":
    y_test_final = y_test_ensemble_median
else:
    y_test_final = y_test_ensemble_weighted

print(f"\nFinal Test predictions shape: {y_test_final.shape}")
print(f"Prediction range: [{y_test_final.min():.4f}, {y_test_final.max():.4f}]")



Creating Ensemble Predictions
Ensemble (Mean) | ROC-AUC: 0.9551
Ensemble (Median) | ROC-AUC: 0.9550
Ensemble (Weighted) | ROC-AUC: 0.9554

Ensemble Comparison:
Best Ensemble: Weighted (ROC-AUC: 0.9554)

Final Test predictions shape: (270000,)
Prediction range: [0.0001, 1.0000]


# Submission 

In [16]:
submission = submission_df.copy()
submission["Heart Disease"] = y_test_final

submission.to_csv('submission.csv', index=False)
print("\n Submission saved to 'submission.csv'")


 Submission saved to 'submission.csv'


In [17]:
submission.head()

Unnamed: 0,id,Heart Disease
0,630000,0.956141
1,630001,0.009233
2,630002,0.98889
3,630003,0.005891
4,630004,0.200445
