In [1042]:
import numpy as np
import pandas as pd
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import graphviz

**Convert the continuous features to discrete bins**

In [1043]:
# Load the heart-disease table and replace each continuous column with an
# ordinal bin code, so the tree splits on a handful of categories.
# NOTE(review): hard-coded Kaggle input path; adjust when running elsewhere.
data = '../input/heartdt/heart(1).csv'

df = pd.read_csv(data)

print('\nBefore convert it : \n')
print(df)

# The explicit edges below are inclusive upper bounds, so the cuts use
# right=True, i.e. intervals of the form (left, right]. With right=False a
# boundary value (e.g. age 40) would land one bin too high, and the largest
# trestbps value (200) would fall outside the last bin and become NaN.

# age: min=29, max=77 -> (0,20]=0, (20,40]=1, (40,60]=2, (60,80]=3, (80,100]=4
df['age'] = pd.cut(df['age'], bins=[0, 20, 40, 60, 80, 100],
                   labels=np.arange(5), right=True)

# trestbps: min=94, max=200 -> (0,119]=0 'normal', (119,129]=1 'elevated',
# (129,139]=2 'high stage 1', (139,200]=3 'high'
df['trestbps'] = pd.cut(df['trestbps'], bins=[0, 119, 129, 139, 200],
                        labels=np.arange(4), right=True)

# chol: min=126, max=564 -> (100,199]=0 'ideal', (199,239]=1 'borderline',
# (239,700]=2 'too high'
df['chol'] = pd.cut(df['chol'], bins=[100, 199, 239, 700],
                    labels=np.arange(3), right=True)

# thalach: min=71, max=202 -> (50,117]=0 'very light', (117,137]=1 'light',
# (137,156]=2 'moderate', (156,176]=3 'hard', (176,210]=4 'maximum'
df['thalach'] = pd.cut(df['thalach'], bins=[50, 117, 137, 156, 176, 210],
                       labels=np.arange(5), right=True)

# oldpeak: min=0, max=6.2 -> three equal-width bins computed by pandas from
# the data range: roughly [0,2.07)=0 'low', [2.07,4.13)=1 'risk',
# [4.13,6.2]=2 'terrible'. right=False keeps the minimum (0) in the first bin.
df['oldpeak'] = pd.cut(df['oldpeak'], bins=3, labels=np.arange(3), right=False)

print('\n After convert it : \n')
print(df)

In [1044]:
# Separate the label column from the predictor columns.
y = df['target']
X = df.drop('target', axis=1)

# Report the dimensions of both pieces (features first, then label).
for part in (X, y):
    print(part.shape)

**Splitting the dataset into train and test sets**

In [1045]:
# Stratify on the label so both splits keep the same class balance, and fix
# the seed so the split (and every score computed from it) is reproducible
# across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=1
)
print(x_train.shape)
print(x_test.shape)

**We will first fit a plain decision tree without any tuning and check the results**

In [1046]:
# Baseline: an unconstrained tree using the Gini index, seeded for
# repeatability, then scored on both splits.
clf = DecisionTreeClassifier(random_state=1).fit(x_train, y_train)

y_train_pred = clf.predict(x_train)
y_test_pred = clf.predict(x_test)

**Visualizing decision tree**

In [1047]:
# Draw the fitted baseline tree.
plt.figure(figsize=(20,20))
# Label nodes with the columns the tree was trained on. df.columns also
# contains 'target', which is the label and not a feature.
features = list(x_train.columns)
# Display names for the two label values, in ascending order of the label.
classes = ['Not heart disease','heart disease']
tree.plot_tree(clf,feature_names=features,class_names=classes,filled=True)
plt.show()

**Print the depth so we can see the difference after pruning**

In [1048]:
# Depth of the unpruned baseline tree.
baseline_depth = clf.get_depth()
print(baseline_depth)

In [1049]:
# helper function
def plot_confusionmatrix(y_train_pred,y_train,dom,class_names=None):
    """Print a heading and draw the confusion matrix as an annotated heatmap.

    Parameters
    ----------
    y_train_pred : array-like
        Labels predicted by the model.
    y_train : array-like
        Ground-truth labels for the same samples.
    dom : str
        Name of the split being shown (e.g. 'Train' or 'Test'); used only in
        the printed heading.
    class_names : list of str, optional
        Tick labels for the classes. Defaults to the notebook-level
        ``classes`` list.
    """
    if class_names is None:
        class_names = classes
    print(f'{dom} Confusion matrix')
    # confusion_matrix expects (y_true, y_pred): rows are the true classes and
    # columns the predicted ones. Passing them the other way round transposes
    # the matrix.
    cf = confusion_matrix(y_train, y_train_pred)
    ax = sns.heatmap(cf,annot=True,yticklabels=class_names
               ,xticklabels=class_names,cmap='Blues', fmt='g')
    # Label the axes so the orientation is unambiguous to the reader.
    ax.set_xlabel('Predicted label')
    ax.set_ylabel('True label')
    plt.tight_layout()
    plt.show()
    

In [1050]:
# Accuracy of the baseline tree on each split, expressed as a percentage.
train_accuracy = accuracy_score(y_train_pred, y_train) * 100
test_accuracy = accuracy_score(y_test_pred, y_test) * 100
print(f'Train score {train_accuracy}')
print(f'Test score {test_accuracy}')

# Confusion matrices for the same two splits.
plot_confusionmatrix(y_train_pred, y_train, dom='Train')
plot_confusionmatrix(y_test_pred, y_test, dom='Test')

**We can see that on the training data we have 100% accuracy (100% precision and recall), but the test score varies with the random train/test split.
Averaged over several runs we measured about 76.4% test accuracy. Our model is clearly overfitting. We will reduce overfitting by applying pre-pruning.**

In [1051]:
# Pre-pruning search space: limit how deep the tree may grow and how small
# its internal nodes and leaves may become.
params = {'max_depth': [2,4,6,8,10,12],
         'min_samples_split': [2,3,4],
         'min_samples_leaf': [1,2]}

# Seed the estimator the same way as the baseline tree so the search is
# reproducible and the comparison with the baseline is like for like.
clf = tree.DecisionTreeClassifier(random_state=1)
gcv = GridSearchCV(estimator=clf,param_grid=params)
gcv.fit(x_train,y_train)

**For now we will tune these parameters:**

**max_depth: the maximum depth of the decision tree.
min_samples_split: the minimum number of samples required to split an internal node.
min_samples_leaf: the minimum number of samples required to be at a leaf node.**

In [1052]:
# GridSearchCV was created with its default refit behaviour, so
# best_estimator_ has already been refitted on the whole training set with the
# winning parameters. Fitting it again is redundant and, for an unseeded tree,
# could yield a different model than the one the search selected.
model = gcv.best_estimator_
y_train_pred = model.predict(x_train)
y_test_pred = model.predict(x_test)

# Report percentages, matching the baseline score cell, so the two results
# can be compared directly.
print(f'Train score {accuracy_score(y_train_pred,y_train)*100}')
print(f'Test score {accuracy_score(y_test_pred,y_test)*100}')
plot_confusionmatrix(y_train_pred,y_train,dom='Train')
plot_confusionmatrix(y_test_pred,y_test,dom='Test')

In [1053]:
#Print tree
plt.figure(figsize=(20,20))
features = df.columns
classes = ['Not heart disease','heart disease']
tree.plot_tree(model,feature_names=features,class_names=classes,filled=True)
plt.show()

In [1054]:
# Depth of the tree chosen by the grid search.
pruned_depth = model.get_depth()
print(pruned_depth)

**We can see that the tree is pruned and that test accuracy has improved, but there is still scope for improvement.**

In [1055]:
# The model was trained on the discretized columns, so a new patient has to be
# encoded with the same bins before predicting; feeding raw measurements
# (age=49, trestbps=130, chol=269, thalach=163) would push the sample past
# every learned threshold.
# Raw sample: [49,0,0,130,269,0,1,163,0,0,2,0,2]
# Bin codes:  age 49 -> 2, trestbps 130 -> 2, chol 269 -> 2, thalach 163 -> 3,
#             oldpeak 0 -> 0; the remaining columns are left unchanged.
# NOTE(review): assumes the column order age, sex, cp, trestbps, chol, fbs,
# restecg, thalach, exang, oldpeak, slope, ca, thal — confirm against
# x_train.columns.
sample = pd.DataFrame([[2,0,0,2,2,0,1,3,0,0,2,0,2]], columns=x_train.columns)
print(model.predict(sample))