In [12]:
import numpy as np
import pandas as pd
import matplotlib as pyplot
import seaborn as sns
import sklearn
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [2]:
# data loading
# Read the dataset from 'heart_v2.csv'; the path has no directory part, so it is
# resolved relative to the notebook's working directory.
data = pd.read_csv('heart_v2.csv')
# Last expression of the cell: displays the (rows, columns) shape of the frame.
data.shape

(270, 5)

In [3]:
# Preview the first rows of the loaded frame to check column names and values.
data.head()

Unnamed: 0,age,sex,BP,cholestrol,heart disease
0,70,1,130,322,1
1,67,0,115,564,0
2,57,1,124,261,1
3,64,1,128,263,0
4,74,0,120,269,0


In [4]:
# Percentage of missing entries in each column (0.0 means the column is complete).
data.isna().mean().mul(100)

age              0.0
sex              0.0
BP               0.0
cholestrol       0.0
heart disease    0.0
dtype: float64

In [5]:
# Hold out 30% of the rows for testing and keep 70% for training.
# The fixed random_state makes the shuffle, and therefore the split, repeatable.
train_data, test_data = train_test_split(
    data,
    test_size=0.3,
    train_size=0.7,
    random_state=42,
)

In [6]:
# Training target: the 'heart disease' column.
y_train = train_data['heart disease']
# Training features: every remaining column (drop returns a new frame,
# train_data itself is left untouched).
X_train = train_data.drop(columns=['heart disease'])

In [7]:
# Test target: the 'heart disease' column.
y_test = test_data['heart disease']
# Test features: every remaining column (drop returns a new frame,
# test_data itself is left untouched).
X_test = test_data.drop(columns=['heart disease'])

In [8]:
# utility to evaluate the model
def evaluateModel(dt_classifier):
    """Print the accuracy of a classifier on the train and test splits.

    The data is not passed in: the function reads the notebook-level globals
    ``X_train``, ``y_train``, ``X_test`` and ``y_test``, so those must be
    defined before it is called.

    Parameters
    ----------
    dt_classifier : object exposing ``predict(X)``
        Model to score. Only ``predict`` is called here, so it is expected to
        have been fitted by the caller beforehand.

    Returns
    -------
    None
        Both accuracy values are printed rather than returned.
    """
    # Both predictions are made up front, before anything is printed.
    report_rows = (
        ('Train accuracy = ', y_train, dt_classifier.predict(X_train)),
        ('Test accuracy = ', y_test, dt_classifier.predict(X_test)),
    )
    for label, actual, predicted in report_rows:
        print(label, accuracy_score(actual, predicted))

In [9]:
# decision tree with max_depth parameter
# random_state is pinned because scikit-learn permutes the features at every
# split, so splits with identical improvement can otherwise be chosen
# differently from one run to the next.
# NOTE: accuracies recorded from an earlier unseeded run may differ slightly
# after re-running this cell.
dt_max_depth = DecisionTreeClassifier(max_depth=3, random_state=42)
dt_max_depth.fit(X_train, y_train)
# Prints train and test accuracy for the fitted tree.
evaluateModel(dt_max_depth)

Train accuracy =  0.7407407407407407
Test accuracy =  0.6049382716049383


In [10]:
# decision tree with min_samples_leaf parameter
# random_state is pinned because scikit-learn permutes the features at every
# split, so splits with identical improvement can otherwise be chosen
# differently from one run to the next.
# NOTE: accuracies recorded from an earlier unseeded run may differ slightly
# after re-running this cell.
dt_min_samples_leaf = DecisionTreeClassifier(min_samples_leaf=20, random_state=42)
dt_min_samples_leaf.fit(X_train, y_train)
# Prints train and test accuracy for the fitted tree.
evaluateModel(dt_min_samples_leaf)

Train accuracy =  0.7037037037037037
Test accuracy =  0.6419753086419753


In [11]:
# decision tree with min_samples_split parameter
# random_state is pinned because scikit-learn permutes the features at every
# split, so splits with identical improvement can otherwise be chosen
# differently from one run to the next.
# NOTE: accuracies recorded from an earlier unseeded run may differ slightly
# after re-running this cell.
dt_min_samples_split = DecisionTreeClassifier(min_samples_split=20, random_state=42)
dt_min_samples_split.fit(X_train, y_train)
# Prints train and test accuracy for the fitted tree.
evaluateModel(dt_min_samples_split)

Train accuracy =  0.8359788359788359
Test accuracy =  0.6419753086419753


In [15]:
# Cross-validation scheme: 5 shuffled folds, seeded so the fold assignment
# is the same on every run.
folds = KFold(n_splits=5, shuffle=True, random_state=100)

# Hyper-parameter grid for the search: 5 x 5 x 5 = 125 combinations.
params = dict(
    max_depth=[2, 3, 5, 7, 11],
    min_samples_leaf=[5, 10, 15, 20, 25],
    min_samples_split=[5, 10, 15, 20, 25],
)

In [18]:
# define GridSearchCV
# The base estimator is seeded so that every candidate is fitted
# deterministically; without it, tied splits inside the trees can make the CV
# scores (and therefore the selected parameters) vary between runs.
# NOTE: results recorded from an earlier unseeded run may differ slightly
# after re-running.
cv = GridSearchCV(estimator=DecisionTreeClassifier(random_state=42),
                  param_grid=params,          # grid of candidate hyper-parameters
                  scoring='accuracy',         # candidates are ranked by accuracy
                  n_jobs=-1,                  # use all available CPU cores
                  cv=folds,                   # the seeded 5-fold splitter
                  return_train_score=True)    # also record per-fold training scores

In [19]:
# cross validation on train data
# Only the training split is passed to the search; the test split is not used here.
cv.fit(X_train, y_train)

In [23]:
# get best estimator from cv results
# Read from the fitted GridSearchCV object `cv`.
# NOTE(review): presumably this model was refit on all of X_train, since
# `refit` is left at its default where `cv` is built; confirm against the
# scikit-learn docs.
dt_best = cv.best_estimator_

In [25]:
# Show the hyper-parameter combination selected by the grid search.
cv.best_params_

{'max_depth': 3, 'min_samples_leaf': 25, 'min_samples_split': 5}

In [24]:
# evaluate best estimator
# Prints train and test accuracy for the model selected by the grid search.
evaluateModel(dt_best)

Train accuracy =  0.7037037037037037
Test accuracy =  0.6419753086419753
