In [13]:
import warnings
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, classification_report

warnings.filterwarnings('ignore')

print("Core Libraries: pandas, numpy, warnings")
print("Train/Test Split & CV: train_test_split, GridSearchCV, StratifiedKFold")
print("Model: RandomForestClassifier")
print("Preprocessing: OneHotEncoder, ColumnTransformer, Pipeline")
print("Metrics: accuracy_score, f1_score, classification_report")


Train/Test Split & CV: train_test_split, GridSearchCV, StratifiedKFold
Model: RandomForestClassifier
Preprocessing: OneHotEncoder, ColumnTransformer, Pipeline
Metrics: accuracy_score, f1_score, classification_report


In [14]:
data = pd.read_csv("/kaggle/input/cleaned-data/heart_disease_cleaned.csv")
data['thal'].replace({'fixed defect':'fixed_defect', 'reversable defect':'reversable_defect'}, inplace=True)
data['cp'].replace({'typical angina':'typical_angina', 'atypical angina':'atypical_angina'}, inplace=True)
data['restecg'].replace({'st-t abnormality':'ST-T_wave_abnormality', 'lv hypertrophy':'left_ventricular_hypertrophy'}, inplace=True)

data_1 = data[['age','sex','cp','dataset','trestbps','chol','fbs','restecg','thalch','exang','oldpeak','slope','ca','thal']].copy()
data_1['target'] = (data['num'] > 0).astype(int)
data_1['sex'] = (data_1['sex'] == 'Male').astype(int)
data_1['fbs'] = data_1['fbs'].astype(int)
data_1['exang'] = data_1['exang'].astype(int)

data_1.columns = [
    'age', 'sex', 'chest_pain_type','country','resting_blood_pressure',
    'cholesterol','fasting_blood_sugar','Restecg','max_heart_rate_achieved',
    'exercise_induced_angina','st_depression','st_slope_type','num_major_vessels',
    'thalassemia_type','target'
]

data_1.head()

Unnamed: 0,age,sex,chest_pain_type,country,resting_blood_pressure,cholesterol,fasting_blood_sugar,Restecg,max_heart_rate_achieved,exercise_induced_angina,st_depression,st_slope_type,num_major_vessels,thalassemia_type,target
0,63,1,typical_angina,Cleveland,145.0,233.0,1,left_ventricular_hypertrophy,150.0,0,2.3,downsloping,0.0,fixed_defect,0
1,67,1,asymptomatic,Cleveland,160.0,286.0,0,left_ventricular_hypertrophy,108.0,1,1.5,flat,3.0,normal,1
2,67,1,asymptomatic,Cleveland,120.0,229.0,0,left_ventricular_hypertrophy,129.0,1,2.6,flat,2.0,reversable_defect,1
3,37,1,non-anginal,Cleveland,130.0,250.0,0,normal,187.0,0,3.5,downsloping,0.0,normal,0
4,41,0,atypical_angina,Cleveland,130.0,204.0,0,left_ventricular_hypertrophy,172.0,0,1.4,upsloping,0.0,normal,0


In [15]:
def train_random_forest(data, target):
    # Split data into features and target
    X = data.drop(columns=[target])
    y = data[target]

    # Identify categorical and numerical features
    categorical_features = X.select_dtypes(include=["object", "category"]).columns.tolist()
    numeric_features = X.select_dtypes(exclude=["object", "category"]).columns.tolist()

    # Preprocessor: OneHotEncode categorical, passthrough numeric
    preprocessor = ColumnTransformer(
        transformers=[
            ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
            ("num", "passthrough", numeric_features)
        ]
    )

    # Define Random Forest with balanced class weights
    rf = RandomForestClassifier(random_state=42, class_weight="balanced", n_jobs=-1)

    # Create pipeline: preprocessing + model
    pipeline = Pipeline(steps=[("preprocessor", preprocessor),
                               ("model", rf)])

    # Expanded hyperparameter grid
    param_grid = {
        "model__n_estimators": [100, 200, 500],
        "model__max_depth": [None, 10, 20, 30],
        "model__min_samples_split": [2, 5, 10],
        "model__min_samples_leaf": [1, 2, 4],
        "model__max_features": ["sqrt", "log2"]
    }

    # Stratified CV for class balance
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # GridSearch with accuracy
    grid_search = GridSearchCV(
        pipeline,
        param_grid,
        cv=cv,
        scoring="accuracy",
        n_jobs=-1,
        verbose=2
    )

    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, stratify=y, random_state=42
    )

    # Fit the grid search
    grid_search.fit(X_train, y_train)

    # Best model
    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_

    # Evaluate on test set
    y_pred = best_model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="weighted")

    print("Best Hyperparameters:", best_params)
    print(f"Test Accuracy: {acc:.4f}")
    print(f"Test F1 Score: {f1:.4f}")
    print("\nClassification Report:\n", classification_report(y_test, y_pred))

    return best_model, best_params, {"accuracy": acc, "f1": f1}


In [16]:
target_column = 'target'
best_model, best_params, test_accuracy = train_random_forest(data_1, target=target_column)

# Print summary
print("\n--- Training Summary ---")
print(f"Best Hyperparameters: {best_params}")
print(f"Test Accuracy: {test_accuracy:.4f}")


Fitting 5 folds for each of 216 candidates, totalling 1080 fits
[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=100; total time=   0.4s
[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=200; total time=   0.7s
[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=200; total time=   0.7s
[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=500; total time=   1.7s
[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=100; total time=   0.4s
[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=200; total time=  

TypeError: unsupported format string passed to dict.__format__