<a href="https://www.kaggle.com/code/samithsachidanandan/predict-heart-disease-catboost-single-model?scriptVersionId=296795395" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Importing Libraries and Loading the Data

In [13]:


import pandas as pd
import numpy as np
from scipy import stats
from sklearn.preprocessing import LabelEncoder
from scipy import stats as scipy_stats
import warnings


warnings.filterwarnings("ignore")
np.random.seed(42)


train_file = "/kaggle/input/playground-series-s6e2/train.csv"
test_file = "/kaggle/input/playground-series-s6e2/test.csv"
original_file = "/kaggle/input/heartdisease/Heart_Disease_Prediction.csv"

train_df = pd.read_csv(train_file)
test_df = pd.read_csv(test_file)
original_df = pd.read_csv(original_file)
submission_df = pd.read_csv("/kaggle/input/playground-series-s6e2/sample_submission.csv")

TARGET = "Heart Disease"
ID_COL = "id"

train_df.shape, test_df.shape, original_df.shape

((630000, 15), (270000, 14), (270, 14))

# Base features

In [14]:
le = LabelEncoder()
train_df['Heart Disease'] = le.fit_transform(train_df['Heart Disease'])
original_df['Heart Disease'] = le.fit_transform(original_df['Heart Disease'])

base_features = [col for col in train_df.columns if col not in ['Heart Disease', 'id']] 


In [15]:
train_df.shape

(630000, 15)

# Feature Engineering 

In [16]:
def preprocess(df):
    
    df = df.copy()

    # Acknowledgement https://www.kaggle.com/code/omidbaghchehsaraei/the-best-solo-model-so-far-realmlp-lb-0-95392
    for col in base_features: 
        if col in original_df.columns:
           
            stats_agg = original_df.groupby(col)['Heart Disease'].agg(['mean', 'median', 'std', 'skew', 'count']).reset_index()
         
            stats_agg.columns = [col] + [f"orig_{col}_{s}" for s in ['mean', 'median', 'std', 'skew', 'count']]
     
            df = df.merge(stats_agg, on=col, how='left') 
 
            fill_values = {
                f"orig_{col}_mean": original_df['Heart Disease'].mean(),
                f"orig_{col}_median": original_df['Heart Disease'].median(),
                f"orig_{col}_std": 0,
                f"orig_{col}_skew": 0,
                f"orig_{col}_count": 0
            }
            df = df.fillna(value=fill_values)
 

    df['BP_Category'] = pd.cut(df['BP'], 
                               bins=[0, 120, 130, 140, np.inf],
                               labels=['Normal', 'Elevated', 'Stage1_HTN', 'Stage2_HTN'])
    
    df['Cholesterol_Risk'] = pd.cut(df['Cholesterol'],
                                     bins=[0, 200, 240, np.inf],
                                     labels=['Desirable', 'Borderline', 'High'])
    

    df['Pulse_Pressure'] = df['Max HR'] - df['BP']
    df['MAP'] = (df['BP'] + (2 * (df['Max HR'] * 0.4))) / 3
    
    df['BP_Cholesterol_Interaction'] = df['BP'] * df['Cholesterol'] / 1000
    df['Age_BP_Risk'] = df['Age'] * df['BP'] / 100
    df['Metabolic_Burden'] = (df['BP'] + df['Cholesterol'] / 100 + df['Max HR'] / 100) / 3
    

    df['High_BP_High_Chol'] = ((df['BP'] > 140) & (df['Cholesterol'] > 240)).astype(int)
    df['Age_Risk_Factor'] = (df['Age'] > 50).astype(int) * (df['BP'] > 130).astype(int)

    df['Cholesterol_per_100_BP'] = df['Cholesterol'] / (df['BP'] + 1)
    df['Age_normalized_HR'] = df['Max HR'] / (df['Age'] + 1)
    

    df['HR_BP_Ratio'] = df['Max HR'] / (df['BP'] + 1)
    df['Cardiac_Load_Index'] = (df['Max HR'] * df['BP']) / 10000
    

    df['FBS_Risk'] = (df['FBS over 120'] == 1).astype(int)
    
   
    df['Age_Squared'] = df['Age'] ** 2
    df['Age_Cubed'] = df['Age'] ** 3
    df['BP_Squared'] = df['BP'] ** 2

    numeric_cols = ['Age', 'BP', 'Cholesterol', 'Max HR']
    for col in numeric_cols:
        df[f'{col}_ZScore'] = np.abs(scipy_stats.zscore(df[col]))
    

    df['Abnormal_Count'] = (
        (df['BP'] > 140).astype(int) +
        (df['Cholesterol'] > 240).astype(int) +
        (df['FBS over 120'] == 1).astype(int) +
        (df['EKG results'] > 0).astype(int) +
        (df['Exercise angina'] == 1).astype(int) +
        (df['ST depression'] > 1.0).astype(int)
    )
    

    df['Is_Male'] = df['Sex']
    df['Is_Female'] = 1 - df['Sex']
    

    chest_pain_severity = {1: 'Typical_Angina', 2: 'Atypical_Angina', 
                          3: 'Non_Anginal_Pain', 4: 'Asymptomatic'}
    df['Chest pain type_Encoded'] = df['Chest pain type'].map(chest_pain_severity)
    chest_pain_dummies = pd.get_dummies(df['Chest pain type_Encoded'], prefix='CPT')
    df = pd.concat([df, chest_pain_dummies], axis=1)
    

    df['Risk_Profile'] = 'Low'
    mask_moderate = (df['BP'] > 130) | (df['Cholesterol'] > 200)
    mask_high = (df['BP'] > 140) & (df['Cholesterol'] > 240)
    df.loc[mask_moderate, 'Risk_Profile'] = 'Moderate'
    df.loc[mask_high, 'Risk_Profile'] = 'High'
    
    risk_dummies = pd.get_dummies(df['Risk_Profile'], prefix='Risk')
    df = pd.concat([df, risk_dummies], axis=1)

    df['HR_Reserve'] = 220 - df['Age'] - df['Max HR']
    df['HR_Reserve_Ratio'] = df['Max HR'] / (220 - df['Age'] + 1)
    

    df['Cardiac_Efficiency'] = (220 - df['Age']) / (df['Cardiac_Load_Index'] + 1)
    
    return df

In [17]:
train_df = preprocess(train_df)
test_df = preprocess(test_df)

# Spliting the Target

In [18]:
y_train = train_df['Heart Disease']
X_train = train_df.drop(['id', 'Heart Disease'], axis=1)

X_test = test_df.copy()
X_test = X_test.drop('id', axis=1)

In [19]:
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}\n")

X_train shape: (630000, 114)
y_train shape: (630000,)
X_test shape: (270000, 114)



# One - Hot Endcoding 

In [20]:
def prepare_data(X_train, X_test):
    
    X_train = X_train.copy()
    X_test = X_test.copy()
    

    bool_cols = X_train.select_dtypes(include=['bool']).columns.tolist()
    if bool_cols:

        for col in bool_cols:
            X_train[col] = X_train[col].astype(int)
            X_test[col] = X_test[col].astype(int)


    category_cols = X_train.select_dtypes(include=['category']).columns.tolist()
    if category_cols:
     
        X_train = pd.get_dummies(X_train, columns=category_cols, prefix=None, drop_first=False)
        X_test = pd.get_dummies(X_test, columns=category_cols, prefix=None, drop_first=False)

    

    object_cols = X_train.select_dtypes(include=['object']).columns.tolist()
    if object_cols:

        X_train = pd.get_dummies(X_train, columns=object_cols, prefix=None, drop_first=False)
        X_test = pd.get_dummies(X_test, columns=object_cols, prefix=None, drop_first=False)

    

    
    return X_train, X_test


In [21]:
X_train, X_test = prepare_data(X_train, X_test)

In [22]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 630000 entries, 0 to 629999
Columns: 124 entries, Age to Risk_Profile_Moderate
dtypes: bool(14), float64(72), int64(38)
memory usage: 537.1 MB


# Model Training

In [None]:
!pip install pytorch-tabnet torch

[0m

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score, roc_curve, auc
from catboost import CatBoostClassifier
from pytorch_tabnet.tab_model import TabNetClassifier


In [None]:
cat_params = {
    "objective": "Logloss",
    "eval_metric": "AUC",
    "iterations": 8000,  
    "learning_rate": 0.01,  
    "depth": 7, 
    "subsample": 0.8,  
    "colsample_bylevel": 0.8, 
    "l2_leaf_reg": 3.0,  
    "random_state": 42,
    "verbose": 0,
    "thread_count": -1,
}

tabnet_params = {
    "n_d": 64,
    "n_a": 64,
    "n_steps": 3,
    "gamma": 1.3,
    "lambda_sparse": 1e-3,
    "optimizer_fn": torch.optim.Adam,
    "optimizer_params": {"lr": 2e-2},
    "mask_type": "softmax",
    "scheduler_params": {
        "step_size": 10,
        "gamma": 0.9,
    },
    "seed": 42,
    "verbose": 0,
}

In [None]:





n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)


oof_cat = np.zeros(len(X_train))
oof_tabnet = np.zeros(len(X_train))
oof_ensemble = np.zeros(len(X_train))


test_cat = np.zeros((len(X_test), n_splits))
test_tabnet = np.zeros((len(X_test), n_splits))

print("=" * 80)
print(f"Running {n_splits}-Fold Ensemble (CatBoost + TabNet with Pseudo-Labeling)")
print("=" * 80)

for fold, (tr_idx, val_idx) in enumerate(kf.split(X_train), 1):
    X_tr, X_val = X_train.iloc[tr_idx].copy(), X_train.iloc[val_idx].copy()
    y_tr, y_val = y_train.iloc[tr_idx].copy(), y_train.iloc[val_idx].copy()
    
    print(f"\n{'=' * 80}")
    print(f"Fold {fold}/{n_splits}")
    print("=" * 80)
    

    print(f"\n[{fold}] Training CatBoost...")
    cat_model = CatBoostClassifier(**cat_params)
    cat_model.fit(
        X_tr, y_tr,
        eval_set=[(X_val, y_val)],
        verbose=False,
    )
    
    val_cat = cat_model.predict_proba(X_val)[:, 1]
    test_cat[:, fold-1] = cat_model.predict_proba(X_test)[:, 1]
    oof_cat[val_idx] = val_cat
    roc_auc_cat = roc_auc_score(y_val, val_cat)
    print(f"  CatBoost ROC-AUC: {roc_auc_cat:.4f}")
    

    print(f"[{fold}] Training TabNet...")
    tabnet_model = TabNetClassifier(**tabnet_params)
    tabnet_model.fit(
        X_tr.values, y_tr.values,
        eval_set=[(X_val.values, y_val.values)],
        eval_metric=["auc"],
        max_epochs=200,
        patience=20,
        batch_size=256,
    )
    
    val_tabnet = tabnet_model.predict_proba(X_val.values)[:, 1]
    test_tabnet[:, fold-1] = tabnet_model.predict_proba(X_test.values)[:, 1]
    oof_tabnet[val_idx] = val_tabnet
    roc_auc_tabnet = roc_auc_score(y_val, val_tabnet)
    print(f"  TabNet ROC-AUC: {roc_auc_tabnet:.4f}")
    

    val_ensemble = (val_cat + val_tabnet) / 2
    oof_ensemble[val_idx] = val_ensemble
    roc_auc_ensemble = roc_auc_score(y_val, val_ensemble)
    print(f"Ensemble ROC-AUC: {roc_auc_ensemble:.4f}")
    print(f"CatBoost: {roc_auc_ensemble - roc_auc_cat:+.4f}")
    print(f"TabNet:   {roc_auc_ensemble - roc_auc_tabnet:+.4f}")

In [None]:
print("\n" + "=" * 80)
print("Cross-Validation Results (OOF)")
print("=" * 80)

cv_auc_cat = roc_auc_score(y_train, oof_cat)
cv_auc_tabnet = roc_auc_score(y_train, oof_tabnet)
cv_auc_ensemble = roc_auc_score(y_train, oof_ensemble)

print(f"\nCatBoost    | ROC-AUC: {cv_auc_cat:.4f}")
print(f"TabNet      | ROC-AUC: {cv_auc_tabnet:.4f}")
print(f"Ensemble    | ROC-AUC: {cv_auc_ensemble:.4f}")

In [None]:
print("\n" + "=" * 80)
print("Training final models on full training set...")
print("=" * 80)

final_cat = CatBoostClassifier(**cat_params)
final_cat.fit(X_train, y_train, verbose=False)
y_test_cat_final = final_cat.predict_proba(X_test)[:, 1]
print("CatBoost final model trained")

final_tabnet = TabNetClassifier(**tabnet_params)
final_tabnet.fit(
    X_train.values, y_train.values,
    max_epochs=200,
    patience=20,
    batch_size=256,
)
y_test_tabnet_final = final_tabnet.predict_proba(X_test.values)[:, 1]
print("TabNet final model trained")

In [None]:
y_test_cat_avg = test_cat.mean(axis=1)
y_test_tabnet_avg = test_tabnet.mean(axis=1)
y_test_ensemble_fold = (y_test_cat_avg + y_test_tabnet_avg) / 2


y_test_ensemble_final = (y_test_cat_final + y_test_tabnet_final) / 2

y_test_final = (y_test_ensemble_fold + y_test_ensemble_final) / 2
print(f"y_test_final shape: {y_test_final.shape}")

# Submission 

In [None]:
submission = submission_df.copy()
submission["Heart Disease"] = y_test_final

submission.to_csv('submission.csv', index=False)
print("\n Submission saved to 'submission.csv'")

In [None]:
submission.head()

Acknowledgement https://www.kaggle.com/code/omidbaghchehsaraei/the-best-solo-model-so-far-realmlp-lb-0-95392