In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Load the dataset
heart_disease = pd.read_csv('data/heart.csv')
heart_disease.head()

### Data preprocessing

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Numerical transfomer pipeline
numerical_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'ca']

numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
])

In [None]:
# Categorical transfomer pipeline
categorical_features = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'thal']

categorical_transofrmer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=-1)),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [None]:
# Data preprocessing

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transofrmer, categorical_features),
        ('num', numerical_transformer, numerical_features)
    ]
)

### Model Building

In [None]:
X = heart_disease.drop('target', axis=1)
y = heart_disease['target']

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

classification_models = {
        'Logistic Regression' : Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('classifier', LogisticRegression(max_iter=1000))
        ]),
        'RandomForestClassifier' : Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('classifier', RandomForestClassifier())
        ]),
        'XGBoost' : Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('classifier', XGBClassifier())
        ])
    }

results = {}

In [158]:
np.random.seed(42)

for model_name, model in classification_models.items():
    model.fit(X_train, y_train)
    results[model_name] = model.score(X_test, y_test)

results

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Logistic Regression': 0.8015564202334631,
 'RandomForestClassifier': 0.9766536964980544,
 'XGBoost': 0.9883268482490273}

In [None]:
results_df = pd.DataFrame(results.values(),
                          results.keys(),
                          columns=['Accuracy'])
results_df.plot.bar()

# Notice that XGBoost scored the highest, with ~98.83% accuracy

### Logistic Refression Tuning

In [159]:
# Logistic Regression hyperparamters
log_reg_grid = {"C": np.logspace(-4, 4, 20),
                "solver": ["liblinear"]}

In [160]:
from sklearn.model_selection import RandomizedSearchCV

np.random.seed(42)
# Instance of RandomizedSearchCV with a LogisticRegression() estimator

rs_log_reg = RandomizedSearchCV(estimator=LogisticRegression(),
                                param_distributions=log_reg_grid,
                                cv=5,
                                n_iter=1000,
                                verbose=True)
# Fit the instance of RandomizedSearchCV
rs_log_reg.fit(X_train, y_train)



Fitting 5 folds for each of 20 candidates, totalling 100 fits


0,1,2
,estimator,LogisticRegression()
,param_distributions,"{'C': array([1.0000...00000000e+04]), 'solver': ['liblinear']}"
,n_iter,1000
,scoring,
,n_jobs,
,refit,True
,cv,5
,verbose,True
,pre_dispatch,'2*n_jobs'
,random_state,

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,np.float64(11.288378916846883)
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'liblinear'
,max_iter,100


In [161]:
# Best parameters of the RandomizedSearchCV instance
rs_log_reg.best_params_

{'solver': 'liblinear', 'C': np.float64(11.288378916846883)}

In [162]:
rs_log_reg.score(X_test, y_test)

0.7937743190661478

In [163]:
# LogisticRegression classifier using the best hyperparameters from RandomizedSearchCV

clf = LogisticRegression(solver='liblinear', C=11.288378916846883)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.7937743190661478

### Random Forest Tuning

In [164]:
# Random Forest hyperparameters
from scipy.stats import randint
rf_param_grid = {
    'n_estimators' : [100, 200],
    'max_depth' : [None , 10, 20],
    'min_samples_split' : [2, 5],
    'min_samples_leaf' : [1, 2],
    'max_features' : ['sqrt', 'log2', None],
    'bootstrap' : [True, False]
}

In [191]:
np.random.seed(42)

rs_rf = RandomizedSearchCV(estimator=RandomForestClassifier(),
                                param_distributions=rf_param_grid,
                                cv=5,
                                n_iter=5,
                                verbose=True)
rs_rf.fit(X_train, y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


0,1,2
,estimator,RandomForestClassifier()
,param_distributions,"{'bootstrap': [True, False], 'max_depth': [None, 10, ...], 'max_features': ['sqrt', 'log2', ...], 'min_samples_leaf': [1, 2], ...}"
,n_iter,5
,scoring,
,n_jobs,
,refit,True
,cv,5
,verbose,True
,pre_dispatch,'2*n_jobs'
,random_state,

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,5
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'log2'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,False


In [192]:
rs_rf.best_params_

{'n_estimators': 100,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_features': 'log2',
 'max_depth': None,
 'bootstrap': False}

In [193]:
rs_rf.score(X_test, y_test)

0.9766536964980544

In [181]:
# RandomForest Classifier using the best hyperparameters from RandomizedSearchCV

clf = RandomForestClassifier(n_estimators=100,
                             min_samples_split=5,
                             min_samples_leaf=1,
                             max_features='log2',
                             max_depth=None,
                             bootstrap=False)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.9766536964980544

### XGBoost tuning

In [None]:
# XGBoost hyperparameters

from scipy.stats import uniform
xgb_param_grid = {
    'n_estimators': randint(100, 1000),         
    'learning_rate': uniform(0.01, 0.3),       
    'max_depth': randint(3, 15),                
    'min_child_weight': randint(1, 10),        
    'subsample': uniform(0.5, 0.5),             
    'colsample_bytree': uniform(0.5, 0.5),      
    'gamma': uniform(0, 0.5),                   
    'reg_alpha': uniform(0, 1),
    'reg_lambda' : uniform(0,1)              
}

In [196]:
np.random.seed(42)

xg_rs = RandomizedSearchCV(estimator=XGBClassifier(),
                           param_distributions=xgb_param_grid,
                           cv=5,
                           n_iter=5,
                           verbose=True)
xg_rs.fit(X_train, y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


0,1,2
,estimator,"XGBClassifier...ree=None, ...)"
,param_distributions,"{'colsample_bytree': <scipy.stats....x7f037c4034d0>, 'gamma': <scipy.stats....x7f037c474c30>, 'learning_rate': <scipy.stats....x7f0395d1d160>, 'max_depth': <scipy.stats....x7f037c46f950>, ...}"
,n_iter,5
,scoring,
,n_jobs,
,refit,True
,cv,5
,verbose,True
,pre_dispatch,'2*n_jobs'
,random_state,

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,np.float64(0.5852620618436457)
,device,
,early_stopping_rounds,
,enable_categorical,False


In [201]:
xg_rs.best_params_

{'colsample_bytree': np.float64(0.5852620618436457),
 'gamma': np.float64(0.03252579649263976),
 'learning_rate': np.float64(0.29466566117599996),
 'max_depth': 14,
 'min_child_weight': 2,
 'n_estimators': 876,
 'reg_alpha': np.float64(0.015966252220214194),
 'reg_lambda': np.float64(0.230893825622149),
 'subsample': np.float64(0.6205127330130058)}

In [202]:
xg_rs.score(X_test, y_test)

0.9883268482490273

In [203]:
best_params = {}

clf = XGBClassifier(colsample_bytree=0.5852620618436457,
                    gamma=0.03252579649263976,
                    learning_rate=0.29466566117599996,
                    max_depth=14,
                    min_child_weight=2,
                    n_estimators=876,
                    reg_alpha=0.015966252220214194,
                    reg_lambda=0.230893825622149,
                    subsample=0.6205127330130058)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.9883268482490273