In [1]:
import os
import warnings
import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV

from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import (
    RandomForestClassifier, RandomForestRegressor,
    AdaBoostClassifier, AdaBoostRegressor,
    GradientBoostingClassifier, GradientBoostingRegressor
)
from xgboost import XGBClassifier, XGBRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.svm import SVC, SVR

from sklearn.preprocessing import (
    StandardScaler, MinMaxScaler,
    LabelEncoder, OneHotEncoder,
    QuantileTransformer, PowerTransformer
)
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.metrics import (
    mean_squared_error, mean_absolute_error, r2_score,
    classification_report, accuracy_score, f1_score, precision_score
)

# Silence every warning category for the rest of the kernel session.
warnings.filterwarnings('ignore')

# Inventory of the names imported above, one printed line per group.
_import_summary_lines = (
    "Core Libraries: pandas, numpy, matplotlib.pyplot, seaborn, warnings",
    "Train/Test Split: train_test_split, GridSearchCV, RandomizedSearchCV",
    "Models: GaussianNB, BernoulliNB, MultinomialNB, DecisionTreeClassifier, DecisionTreeRegressor, "
    "RandomForestClassifier, RandomForestRegressor, AdaBoostClassifier, AdaBoostRegressor, "
    "GradientBoostingClassifier, GradientBoostingRegressor, XGBClassifier, XGBRegressor, "
    "LogisticRegression, LinearRegression, KNeighborsClassifier, KNeighborsRegressor, SVC, SVR",
    "Preprocessing: StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder, "
    "QuantileTransformer, PowerTransformer, ColumnTransformer, Pipeline",
    "Metrics: mean_squared_error, mean_absolute_error, r2_score, "
    "classification_report, accuracy_score, f1_score, precision_score",
    "Other: pickle",
)
for _summary_line in _import_summary_lines:
    print(_summary_line)


Core Libraries: pandas, numpy, matplotlib.pyplot, seaborn, warnings
Train/Test Split: train_test_split, GridSearchCV, RandomizedSearchCV
Models: GaussianNB, BernoulliNB, MultinomialNB, DecisionTreeClassifier, DecisionTreeRegressor, RandomForestClassifier, RandomForestRegressor, AdaBoostClassifier, AdaBoostRegressor, GradientBoostingClassifier, GradientBoostingRegressor, XGBClassifier, XGBRegressor, LogisticRegression, LinearRegression, KNeighborsClassifier, KNeighborsRegressor, SVC, SVR
Preprocessing: StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder, QuantileTransformer, PowerTransformer, ColumnTransformer, Pipeline
Metrics: mean_squared_error, mean_absolute_error, r2_score, classification_report, accuracy_score, f1_score, precision_score
Other: pickle


In [2]:
# Location of the cleaned heart-disease CSV inside the Kaggle input mount.
heart_disease_csv = "/kaggle/input/cleaned-data/heart_disease_cleaned.csv"
data = pd.read_csv(heart_disease_csv)

In [3]:
# Normalise the multi-word category labels so they contain no spaces.
# The replaced column is assigned back rather than calling
# ``data[col].replace(..., inplace=True)``: an in-place call on a column
# selection is chained assignment, which pandas warns about and which does not
# update ``data`` once Copy-on-Write is active.
label_fixes = {
    'thal': {'fixed defect': 'fixed_defect', 'reversable defect': 'reversable_defect'},
    'cp': {'typical angina': 'typical_angina', 'atypical angina': 'atypical_angina'},
    'restecg': {'st-t abnormality': 'ST-T_wave_abnormality', 'lv hypertrophy': 'left_ventricular_hypertrophy'},
}
for raw_column, fix_map in label_fixes.items():
    data[raw_column] = data[raw_column].replace(fix_map)

# Raw column -> descriptive name. The mapping drives both the selection and the
# rename, so a column can never be paired with the wrong label by position.
descriptive_names = {
    'age': 'age',
    'sex': 'sex',
    'cp': 'chest_pain_type',
    'dataset': 'country',
    'trestbps': 'resting_blood_pressure',
    'chol': 'cholesterol',
    'fbs': 'fasting_blood_sugar',
    'restecg': 'Restecg',
    'thalch': 'max_heart_rate_achieved',
    'exang': 'exercise_induced_angina',
    'oldpeak': 'st_depression',
    'slope': 'st_slope_type',
    'ca': 'num_major_vessels',
    'thal': 'thalassemia_type',
}

data_1 = (
    data[list(descriptive_names)]
    .rename(columns=descriptive_names)
    .assign(
        # 1 for 'Male', 0 for every other value
        sex=lambda frame: (frame['sex'] == 'Male').astype(int),
        fasting_blood_sugar=lambda frame: frame['fasting_blood_sugar'].astype(int),
        exercise_induced_angina=lambda frame: frame['exercise_induced_angina'].astype(int),
        # Binary label: 1 when the raw 'num' value is above 0, otherwise 0
        target=(data['num'] > 0).astype(int),
    )
)

data_1.head()

Unnamed: 0,age,sex,chest_pain_type,country,resting_blood_pressure,cholesterol,fasting_blood_sugar,Restecg,max_heart_rate_achieved,exercise_induced_angina,st_depression,st_slope_type,num_major_vessels,thalassemia_type,target
0,63,1,typical_angina,Cleveland,145.0,233.0,1,left_ventricular_hypertrophy,150.0,0,2.3,downsloping,0.0,fixed_defect,0
1,67,1,asymptomatic,Cleveland,160.0,286.0,0,left_ventricular_hypertrophy,108.0,1,1.5,flat,3.0,normal,1
2,67,1,asymptomatic,Cleveland,120.0,229.0,0,left_ventricular_hypertrophy,129.0,1,2.6,flat,2.0,reversable_defect,1
3,37,1,non-anginal,Cleveland,130.0,250.0,0,normal,187.0,0,3.5,downsloping,0.0,normal,0
4,41,0,atypical_angina,Cleveland,130.0,204.0,0,left_ventricular_hypertrophy,172.0,0,1.4,upsloping,0.0,normal,0


In [4]:
def train_random_forest(data, target, *, param_grid=None, test_size=0.3, cv=5, random_state=0):
    """Grid-search, fit and evaluate a class-balanced random forest.

    Categorical feature columns (``object`` / ``category`` dtype) are
    integer-encoded with one ``LabelEncoder`` each, the rows are split into
    train and test parts, features are min-max scaled using statistics from
    the training part only, and a ``GridSearchCV`` over ``param_grid`` selects
    the hyperparameters by cross-validated accuracy.

    The fitted encoders and scaler are not returned, so the returned model
    expects inputs that were encoded and scaled the same way.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature columns plus the label column. It is not modified.
    target : str
        Name of the label column in ``data``.
    param_grid : dict, optional
        Hyperparameter grid passed to ``GridSearchCV``. Defaults to a grid
        over ``n_estimators``, ``max_depth``, ``min_samples_split`` and
        ``min_samples_leaf``.
    test_size : float, default 0.3
        Fraction of rows held out for the final evaluation.
    cv : int, default 5
        Number of cross-validation folds used during the grid search.
    random_state : int, default 0
        Seed for both the train/test split and the forest.

    Returns
    -------
    tuple
        ``(best_model, best_params, accuracy)``: the refit best estimator,
        the winning hyperparameter dict, and the accuracy on the held-out
        test rows.
    """
    if param_grid is None:
        param_grid = {
            'n_estimators': [50, 100, 150],
            'max_depth': [None, 10, 20],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4]
        }

    # ``drop`` returns a new frame, so encoding below never touches the caller's data
    X = data.drop(columns=target)
    y = data[target]

    # Integer-encode each categorical column with its own encoder
    for col in X.select_dtypes(include=['object', 'category']).columns:
        X[col] = LabelEncoder().fit_transform(X[col])

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Fit the scaler on the training rows only, then reuse it for the test rows
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    rf_model = RandomForestClassifier(random_state=random_state, class_weight='balanced')

    grid_search = GridSearchCV(rf_model, param_grid, cv=cv, scoring='accuracy')
    grid_search.fit(X_train, y_train)

    # With the default refit=True, best_estimator_ has already been refit on
    # the complete training split, so no additional fit call is needed.
    best_rf_model = grid_search.best_estimator_
    best_params = grid_search.best_params_

    print('Best Hyperparameters:')
    print(best_params)

    # Score the tuned model on the held-out rows
    y_pred = best_rf_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    print(f'Accuracy on Test Set: {accuracy:.2f}')

    return best_rf_model, best_params, accuracy


In [5]:
# Label column inside data_1
target_column = 'target'

# Tune, fit and score the forest, then unpack the three results
training_outcome = train_random_forest(data_1, target=target_column)
best_model, best_params, test_accuracy = training_outcome

# Recap of the run
print(
    "\n--- Training Summary ---",
    f"Best Hyperparameters: {best_params}",
    f"Test Accuracy: {test_accuracy:.4f}",
    sep="\n",
)

Best Hyperparameters:
{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 100}
Accuracy on Test Set: 0.86

--- Training Summary ---
Best Hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 100}
Test Accuracy: 0.8551
