In [1]:
import pandas as pd
import numpy as np

# Read the heart-disease CSV from the relative input path, then preview the
# first five rows as a quick sanity check of the columns.
df_heart = pd.read_csv(filepath_or_buffer='../input/heartdisease/heart_disease.csv')
df_heart.head(n=5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [2]:
df_heart.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    int64  
 12  thal      303 non-null    int64  
 13  target    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


In [3]:
# Column-wise split: every column except the last is a feature (X); the last
# column is the label to predict (y).
X, y = df_heart.iloc[:, :-1], df_heart.iloc[:, -1]

In [4]:
from sklearn.model_selection import train_test_split

# Hold out a test set for the final check of tuned models; random_state pins
# the shuffle so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=2,
)

In [6]:
# Decision Tree Classifier - Initial Model
# Baseline: an untuned tree scored with 5-fold cross-validation on all of X, y.

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

dec_clf = DecisionTreeClassifier(random_state=2)
scores = cross_val_score(dec_clf, X, y, cv=5)

# Per-fold scores first, then their average.
print('Accuracy:', scores.round(2))
print('Accuracy mean: %0.2f' % np.mean(scores))



Accuracy: [0.74 0.85 0.77 0.73 0.7 ]
Accuracy mean: 0.76


In [14]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score

def randomized_search_clf(params, runs=20, clf=None):
    """Tune a classifier with a randomized hyperparameter search and report scores.

    The search is fitted on the notebook-level ``X_train`` / ``y_train`` with
    5-fold cross-validation, and the best estimator is then scored on the
    notebook-level ``X_test`` / ``y_test``.

    Parameters
    ----------
    params : dict
        Hyperparameter name -> list of candidate values (or distribution),
        passed to ``RandomizedSearchCV`` as the search space.
    runs : int, default 20
        Number of parameter settings sampled (``n_iter``).
    clf : estimator, optional
        Estimator to tune. When omitted, a fresh
        ``DecisionTreeClassifier(random_state=2)`` is created for this call.

    Returns
    -------
    estimator
        The best estimator found by the search (``best_estimator_``).

    Notes
    -----
    The value printed as "Training Score" is ``best_score_``, i.e. the mean
    cross-validated score of the best candidate, not accuracy on the
    training set. The label is kept as-is to match the recorded outputs.
    """
    if clf is None:
        # Create the default here instead of in the signature: a default
        # argument is evaluated once at definition time, so every call would
        # otherwise share the same estimator object.
        clf = DecisionTreeClassifier(random_state=2)

    rand_clf = RandomizedSearchCV(clf, params, n_iter=runs, cv=5, n_jobs=-1, random_state=2)
    rand_clf.fit(X_train, y_train)

    best_model = rand_clf.best_estimator_
    best_score = rand_clf.best_score_
    print('Training Score: %0.3f' % best_score)

    # accuracy_score's documented signature is (y_true, y_pred).
    y_preds = best_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_preds)
    print('Test Score: %0.3f' % accuracy)
    return best_model

In [15]:
# Broad first-pass search over the decision tree's hyperparameters.
# 'min_weight_fraction_leaf' is listed once: the original dict literal had the
# key twice, so Python silently kept only the later (longer) list, which is
# the one retained here.
# 'sqrt' stands in for the deprecated 'auto' alias of max_features (for
# classifiers 'auto' meant sqrt(n_features)), so the search space is unchanged.
randomized_search_clf(params={
    'criterion': ['entropy', 'gini'],
    'splitter': ['best', 'random'],
    'min_samples_split': [2, 3, 4, 5, 6, 8, 10],
    'min_samples_leaf': [1, 0.01, 0.02, 0.03, 0.04],
    'min_impurity_decrease': [0.0, 0.0005, 0.005, 0.05, 0.10, 0.15, 0.2],
    'max_leaf_nodes': [10, 15, 20, 25, 30, 35, 40, 45, 50, None],
    'max_features': ['sqrt', 0.95, 0.90, 0.85, 0.80, 0.75, 0.70],
    'max_depth': [None, 2, 4, 6, 8],
    'min_weight_fraction_leaf': [0.0, 0.0025, 0.005, 0.0075, 0.01, 0.05],
})

Training Score: 0.771
Test Score: 0.803


DecisionTreeClassifier(criterion='entropy', max_features=0.85,
                       max_leaf_nodes=30, min_impurity_decrease=0.005,
                       min_samples_split=10, min_weight_fraction_leaf=0.0075,
                       random_state=2, splitter='random')

In [16]:
# Narrowed second-pass search with more sampled candidates (runs=100).
# 'sqrt' stands in for the deprecated 'auto' alias of max_features (for
# classifiers 'auto' meant sqrt(n_features)), so the search space is unchanged.
randomized_search_clf(
    params={
        'max_depth': [None, 6, 7],
        'max_features': ['sqrt', 0.78],
        'max_leaf_nodes': [45, None],
        'min_samples_leaf': [1, 0.035, 0.04, 0.045, 0.05],
        'min_samples_split': [2, 9, 10],
        'min_weight_fraction_leaf': [0.0, 0.05, 0.06, 0.07],
    },
    runs=100,
)

Training Score: 0.802
Test Score: 0.868


DecisionTreeClassifier(max_depth=7, max_features=0.78, max_leaf_nodes=45,
                       min_samples_leaf=0.045, min_samples_split=9,
                       min_weight_fraction_leaf=0.06, random_state=2)

In [19]:
# Rebuild the tree with the tuned hyperparameters and score it with 5-fold
# cross-validation on all of X, y.
model = DecisionTreeClassifier(
    random_state=2,
    max_depth=7,
    max_features=0.78,
    max_leaf_nodes=45,
    min_samples_leaf=0.045,
    min_samples_split=9,
    min_weight_fraction_leaf=0.06,
)
scores = cross_val_score(model, X, y, cv=5)

# Per-fold scores first, then their average.
print('Accuracy: ', scores.round(2))
print('Accuracy mean: %0.3f' % np.mean(scores))

Accuracy:  [0.82 0.9  0.8  0.8  0.78]
Accuracy mean: 0.822


In [21]:
# communicate the most important features of the machine learning model

# feature_importances_

# NOTE(review): these hyperparameters (max_depth=9, max_features=0.8,
# max_leaf_nodes=47, ...) differ from the estimator returned by the second
# randomized search (max_depth=7, max_features=0.78, max_leaf_nodes=45, ...);
# confirm which model is intended here.
#
# min_impurity_split is deliberately not passed: it was removed from
# DecisionTreeClassifier in scikit-learn 1.0, where passing it raises a
# TypeError. The original passed None, which was the default, so omitting it
# leaves the model unchanged on older versions.
best_clf = DecisionTreeClassifier(class_weight=None, criterion='gini',
                                  max_depth=9, max_features=0.8, max_leaf_nodes=47,
                                  min_impurity_decrease=0.0,
                                  min_samples_leaf=1, min_samples_split=8,
                                  min_weight_fraction_leaf=0.05, random_state=2, splitter='best')

# Fit on all of X, y (no hold-out) before reading feature importances.
best_clf.fit(X, y)

DecisionTreeClassifier(max_depth=9, max_features=0.8, max_leaf_nodes=47,
                       min_samples_split=8, min_weight_fraction_leaf=0.05,
                       random_state=2)

In [22]:
# Display the fitted tree's importance score for each feature; the array is
# positional, in the column order of the X passed to fit().
best_clf.feature_importances_

array([0.04830121, 0.04008887, 0.47546568, 0.        , 0.        ,
       0.        , 0.        , 0.00976578, 0.        , 0.02445397,
       0.02316427, 0.1774694 , 0.20129082])

In [24]:
# Pair each feature column name with its importance score.
feature_dict = {
    column: importance
    for column, importance in zip(X.columns, best_clf.feature_importances_)
}

In [27]:
import operator
# Rank (feature, importance) pairs by importance, highest first, and keep the top three.
sorted(feature_dict.items(), key=lambda pair: pair[1], reverse=True)[:3]

[('cp', 0.47546567857183675),
 ('thal', 0.20129082387838435),
 ('ca', 0.1774694042213901)]

In [None]:
You can tell the doctors and nurses that your model predicts whether a patient has heart disease with 82% accuracy, using chest pain type (`cp`),
the thallium stress test result (`thal`), and the number of major vessels colored by fluoroscopy (`ca`) as the three most important characteristics.

You can tell the doctors and nurses that your model predicts whether a patient has heart disease with 82% accuracy, using chest pain type (`cp`),
the thallium stress test result (`thal`), and the number of major vessels colored by fluoroscopy (`ca`) as the three most important characteristics.