In [92]:
#Import the required libraries
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

In [93]:
#attribute information
#https://archive.ics.uci.edu/ml/datasets/Statlog+%28Heart%29

In [94]:
# -- 1. age 
# -- 2. sex 
# -- 3. chest pain type (4 values) 
# -- 4. resting blood pressure 
# -- 5. serum cholestoral in mg/dl 
# -- 6. fasting blood sugar > 120 mg/dl 
# -- 7. resting electrocardiographic results (values 0,1,2) 
# -- 8. maximum heart rate achieved 
# -- 9. exercise induced angina 
# -- 10. oldpeak = ST depression induced by exercise relative to rest 
# -- 11. the slope of the peak exercise ST segment 
# -- 12. number of major vessels (0-3) colored by flourosopy 
# -- 13. thal: 3 = normal; 6 = fixed defect; 7 = reversable defect 

In [95]:
df = pd.read_csv("heart.dat", sep=' ', names=['age', 'sex', 'chest_pain',\
                                    'resting_bp', 'serum_cholestoral', 'fasting_bs',\
                                    'rer','mhra', 'eia', 'oldpeak', 'slope_of_pes','no_of_mv','thal' ,'heart_disease'])

In [96]:
df.head()

Unnamed: 0,age,sex,chest_pain,resting_bp,serum_cholestoral,fasting_bs,rer,mhra,eia,oldpeak,slope_of_pes,no_of_mv,thal,heart_disease
0,70.0,1.0,4.0,130.0,322.0,0.0,2.0,109.0,0.0,2.4,2.0,3.0,3.0,2
1,67.0,0.0,3.0,115.0,564.0,0.0,2.0,160.0,0.0,1.6,2.0,0.0,7.0,1
2,57.0,1.0,2.0,124.0,261.0,0.0,0.0,141.0,0.0,0.3,1.0,0.0,7.0,2
3,64.0,1.0,4.0,128.0,263.0,0.0,0.0,105.0,1.0,0.2,2.0,1.0,7.0,1
4,74.0,0.0,2.0,120.0,269.0,0.0,2.0,121.0,1.0,0.2,1.0,1.0,3.0,1


In [97]:
df.isna().any().sum()

0

In [98]:
df.shape

(270, 14)

In [117]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 270 entries, 0 to 269
Data columns (total 14 columns):
age                  270 non-null float64
sex                  270 non-null float64
chest_pain           270 non-null float64
resting_bp           270 non-null float64
serum_cholestoral    270 non-null float64
fasting_bs           270 non-null float64
rer                  270 non-null float64
mhra                 270 non-null float64
eia                  270 non-null float64
oldpeak              270 non-null float64
slope_of_pes         270 non-null float64
no_of_mv             270 non-null float64
thal                 270 non-null float64
heart_disease        270 non-null int64
dtypes: float64(13), int64(1)
memory usage: 29.6 KB


In [99]:
# Attributes types 
# ----------------- 

# Real: 1,4,5,8,10,12 
# Ordered:11, 
# Binary: 2,6,9 
# Nominal:7,3,13 

# Variable to be predicted 
# ------------------------ 
# Absence (1) or presence (2) of heart disease 

In [104]:
X = df.drop(columns='heart_disease')
y = df['heart_disease']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, train_size=0.7, random_state = 100)

In [105]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(189, 13)
(81, 13)
(189,)
(81,)


In [106]:
#Create a object and fit using the train data
dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [107]:
#predict the dependent variable using the test data
y_pred = dtc.predict(X_test)
accuracy_score(y_pred=y_pred, y_true=y_test)

In [109]:
#To find out the reliability of our model, lets split the data into different folds and test it
dct = DecisionTreeClassifier()
cv = cross_val_score(dct, X_train, y_train, cv=5)
cv

In [112]:
cv

array([0.76315789, 0.84210526, 0.65789474, 0.76315789, 0.75675676])