In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import tree
import pandas_profiling
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [None]:
# Read the heart-disease CSV from the Kaggle input folder into a DataFrame.
heart_csv = "../input/heart-disease-uci/heart.csv"
data = pd.read_csv(heart_csv)

In [None]:
# Preview the first rows of the loaded table.
data.head()

In [None]:
# Print the column names, dtypes and non-null counts.
data.info()

In [None]:
# Summary statistics for every numeric column.
data.describe()

In [None]:
# Number of missing values in each column.
data.isna().sum()

In [None]:
# Summarise the dataset layout: the last column is the label, every column
# before it is a predictor.
feature_names = data.columns[:-1]
target_classes = data.target.unique()

print("Features : {}".format(feature_names.values))

# Count only the predictors; len(data.columns) would also count the target column.
print("Total number of Features : {}".format(len(feature_names)))

print("Target Variable : {}".format(data.columns[-1]))

print("Total No of samples present in dataset : {}".format(len(data)))

print("Our Decision Tree will classify if the sample belongs to class {} or {}".format(target_classes[0], target_classes[1]))

 Let's visualize our data now

# Age 

In [None]:

# Histogram of patient age with a kernel-density overlay.
plt.figure(figsize=(15,8))

# histplot replaces the deprecated distplot; stat="density" plus kde=True keeps
# the density-scaled histogram and smooth curve that distplot used to draw.
sns.histplot(x=data.age, bins=10, kde=True, stat="density")
plt.xlabel("Age")
plt.ylabel("Density")
plt.title("Age Distribution")
plt.show()


In [None]:
# Rows for patients younger than 30.
data.loc[data.age < 30]

In [None]:
# Per-column row counts for patients aged 30-50 without disease (target == 0).
# All conditions go into one mask: filtering first and then indexing with a
# full-length mask forces pandas to reindex the mask and emits a warning.
data[(data.age > 29) & (data.age <= 50) & (data.target == 0)].count()


In [None]:
# Per-column row counts for patients aged 30-50 with disease (target == 1).
# All conditions go into one mask: filtering first and then indexing with a
# full-length mask forces pandas to reindex the mask and emits a warning.
data[(data.age > 29) & (data.age <= 50) & (data.target == 1)].count()

In [None]:
# Per-column row counts for patients older than 50 without disease (target == 0).
# Both conditions go into one mask: filtering first and then indexing with a
# full-length mask forces pandas to reindex the mask and emits a warning.
data[(data.age > 50) & (data.target == 0)].count()

In [None]:
# Per-column row counts for patients older than 50 with disease (target == 1).
# Both conditions go into one mask: filtering first and then indexing with a
# full-length mask forces pandas to reindex the mask and emits a warning.
data[(data.age > 50) & (data.target == 1)].count()

 Observations

*  Age seems to be normally distributed
*  there is only 1 case having age 29
*  65 cases if age is >29 and <=50
*  99 cases if age>50

Let's analyze gender

0 -> female
1 -> male 

In [None]:
# Count of patients per target class, split by sex (0 = female, 1 = male).
plt.figure(figsize=(15,8))

# Use keyword arguments: current seaborn treats the first positional argument
# as `data`, so a bare Series there is no longer read as the x variable.
sns.countplot(data=data, x="target", hue="sex")
plt.title("Target class counts by sex")
plt.show()

In [None]:
# Number of patients of each sex.
data.groupby(by=["sex"]).target.count()

In [None]:
# Summary statistics of age for female patients (sex == 0).
female = data.loc[data.sex == 0, "age"].describe()
female

In [None]:
# Box plot of the actual ages of female patients (sex == 0).
# `female` holds the describe() summary (count, mean, std, ...), so plotting it
# would draw a box over those eight statistics rather than over the ages.
sns.boxplot(x=data.loc[data.sex == 0, "age"])
plt.title("Age distribution - female")
plt.show()

In [None]:
# Summary statistics of age for male patients (sex == 1).
male = data.loc[data.sex == 1, "age"].describe()
male

In [None]:
# Box plot of the actual ages of male patients (sex == 1).
# `male` holds the describe() summary (count, mean, std, ...), so plotting it
# would draw a box over those eight statistics rather than over the ages.
sns.boxplot(x=data.loc[data.sex == 1, "age"])
plt.title("Age distribution - male")
plt.show()

In [None]:
# Number of female patients (sex == 0) diagnosed positive.
int(((data.target == 1) & (data.sex == 0)).sum())

In [None]:
# Number of male patients (sex == 1) diagnosed positive.
int(((data.target == 1) & (data.sex == 1)).sum())

Observations

* Number of males = 207, females = 96
* Out of 207 males and 96 females, 93 males and 72 females are diagnosed positive
* Risk of having the disease ranges from age 34 to 76 for females and from age 29 to 77 for males

# Chest Pain 

In [None]:
# Number of patients for each chest-pain type.
data.groupby(by="cp").target.count()

In [None]:
# Bar chart: positive-diagnosed female patients, counted per chest-pain type.
data.query("sex == 0 and target == 1").groupby("cp")["target"].count().plot.bar()

In [None]:
# Bar chart: positive-diagnosed male patients, counted per chest-pain type.
data.query("sex == 1 and target == 1").groupby("cp")["target"].count().plot.bar()

In [None]:
# Counts per chest-pain type for positive-diagnosed female patients.
data.loc[(data.sex == 0) & (data.target == 1)].groupby("cp").target.count()

In [None]:
# Counts per chest-pain type for positive-diagnosed male patients.
data.loc[(data.sex == 1) & (data.target == 1)].groupby("cp").target.count()

Observations

* for  72 +ve diagnosed female, 18,16,34,4 have cp of type 0,1,2,3 respectively
* for  93 +ve diagnosed male, 21,25,35,12 have cp of type 0,1,2,3 respectively

# Blood Pressure

In [None]:
# Summary statistics of resting blood pressure.
data["trestbps"].describe()

In [None]:
# Mean resting blood pressure of positive-diagnosed female patients.
data.loc[(data.sex == 0) & (data.target == 1), "trestbps"].mean()

In [None]:
# Mean resting blood pressure of positive-diagnosed male patients.
data.loc[(data.sex == 1) & (data.target == 1), "trestbps"].mean()

In [None]:
# Bar chart: mean resting blood pressure per chest-pain type, female patients.
data.loc[data.sex == 0].groupby("cp")["trestbps"].mean().plot.bar()

In [None]:
# Bar chart: mean resting blood pressure per chest-pain type, male patients.
data.loc[data.sex == 1].groupby("cp")["trestbps"].mean().plot.bar()

 Observations :

*  avg BP for female and male diagnosed +ve is 128.73 and 129.74 resp
*  for female, avg bp for cp type 0,1,2,3 is 138.58, 128.05, 127.88, 147.5 resp
*  for male, avg bp for cp type 0,1,2,3 is 129.55,128.59,132.05,139.47 resp

# Cholesterol

In [None]:
# Cholesterol summary with extra percentiles in the upper half of the range.
chol_percentiles = [0.3, 0.4, 0.5, 0.6, 0.7, 0.75, 0.85, 0.9, 1]
data["chol"].describe(percentiles=chol_percentiles)

In [None]:
# For example, we will consider 130 mg/dl as borderline.
low_chol = data[data.chol < 130]

print(low_chol)
# The original printed this row count under the label "Gender"; label it as a
# count and print the decoded sex values (0 -> female, 1 -> male) separately.
print("\n\n Number of patients :", low_chol.sex.count())
print(" Gender :", low_chol.sex.map({0: "female", 1: "male"}).tolist())
# this shows how many patients have a cholesterol level under 130, and their gender

# Blood Sugar > 120

In [None]:
# Positive-diagnosed patients with high fasting blood sugar, counted per sex.
data.query("target == 1 and fbs == 1").groupby("sex")["target"].count()

In [None]:
# Bar chart: mean resting blood pressure per sex among positive, high-sugar patients.
data.query("target == 1 and fbs == 1").groupby("sex")["trestbps"].mean().plot.bar()

In [None]:
# Bar chart: mean resting blood pressure per (sex, chest-pain type) among positive, high-sugar patients.
data.query("target == 1 and fbs == 1").groupby(["sex", "cp"])["trestbps"].mean().plot.bar()

Observations 

* out of +ve male and female, 6 female and 17 male have high sugar level
* avg BP for these +ve male and female patients with high sugar level is 131.16 and 136.5 resp

# Maximum Heart Rate

In [None]:
# Summary statistics of the thalach (maximum heart rate) column.
data["thalach"].describe()

In [None]:
# Bar chart: mean maximum heart rate per sex among positive-diagnosed patients.
data.loc[data.target == 1].groupby("sex")["thalach"].mean().plot.bar()

In [None]:
# Bar chart: mean maximum heart rate per sex among positive, high-sugar patients.
data.query("target == 1 and fbs == 1").groupby("sex")["thalach"].mean().plot.bar()

In [None]:
# Mean maximum heart rate per (sex, chest-pain type) among positive, high-sugar patients.
data.query("target == 1 and fbs == 1").groupby(["sex", "cp"])["thalach"].mean()

In [None]:
# Bar chart of the per-(sex, chest-pain type) mean maximum heart rate for positive, high-sugar patients.
data.query("target == 1 and fbs == 1").groupby(["sex", "cp"])["thalach"].mean().plot.bar()

Observations

* avg high heart rate for male, female tested +ve is 154.02 161.90
* avg high heart rate for male, female tested +ve and having high blood sugar level is 145.66 161.17

# Now lets split our data

In [None]:
# Separate the label from the predictors, then hold out 30% of the rows for testing.
Y = data["target"]
X = data.drop(columns="target")

x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.3, random_state=42
)

# Model building

In [None]:
# Baseline: a decision tree with no growth limits, seeded for repeatability.
DTREE = DecisionTreeClassifier(random_state=0)
DTREE.fit(X=x_train, y=y_train)

In [None]:
# Draw the fitted baseline tree with readable feature and class labels.
plt.figure(figsize=(35,28))

# plot_tree is given a plain list: some scikit-learn releases reject a pandas Index here.
features = list(x_test.columns)
# Label order follows the sorted class values: 0 -> "No Disease", 1 -> "Disease".
classes = ["No Disease","Disease"]
tree.plot_tree(DTREE,feature_names=features,class_names=classes,filled=True,rounded=True)
plt.show()

In [None]:
# Predictions of the baseline tree on the training and held-out rows.
pred_on_train, pred_on_test = DTREE.predict(x_train), DTREE.predict(x_test)

In [None]:
# Compare accuracy on seen vs. unseen rows.
train_accuracy = accuracy_score(y_train, pred_on_train)
test_accuracy = accuracy_score(y_test, pred_on_test)
print(f"Accuracy on training set : {train_accuracy}")
print(f"Accuracy on testing set : {test_accuracy}")

In [None]:
# Confusion matrices for the baseline tree (rows = true class, columns = predicted class).

cm_1 = confusion_matrix(y_test,pred_on_test)
cm_2 = confusion_matrix(y_train,pred_on_train)

# fmt="d" prints the cell counts as integers; the default format switches to
# scientific notation once a count has three digits.
print("Train Confusion Matrix")
sns.heatmap(cm_2,annot=True,fmt="d",yticklabels=classes,xticklabels=classes,cmap="Blues")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

print("Test Confusion Matrix")
sns.heatmap(cm_1,annot=True,fmt="d",yticklabels=classes,xticklabels=classes,cmap="Blues")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

As we can see, this model has low bias and high variance.
This is an example of Overfitting

In [None]:
# Let's prune the tree to reduce the overfitting

In [None]:
# # Lets first pre-prune it and check the accuracy

# Pre-pruning is nothing but stopping the growth of the decision tree at an early stage. For that we can limit the growth of trees by setting constraints. We can limit parameters like max_depth, min_samples etc.

# An effective way to do this is to grid search those parameters and choose the optimum values that give better performance on test data.

# As of now we will control these parameters

#     max_depth: maximum depth of decision tree
#     min_samples_split: The minimum number of samples required to split an internal node
#     min_samples_leaf: The minimum number of samples required to be at a leaf node

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate growth limits for the pre-pruned tree.
# min_samples_split starts at 2: an integer value of 1 is not a valid setting
# (a node needs at least two samples to be split), so every fit using it fails.
params = {"max_depth":[2,4,6,8,10,12],
         "min_samples_split":[2,3,4],
         "min_samples_leaf":[1,2]}

# Seed the estimator so the search and the selected tree are repeatable,
# matching the seeded baseline tree.
pre_pruning = DecisionTreeClassifier(random_state=0)
gcv = GridSearchCV(estimator=pre_pruning, param_grid=params)

gcv.fit(x_train,y_train)

In [None]:
# GridSearchCV was built with its default refit behaviour, so best_estimator_
# is already fitted on the full training split. Fitting it again is redundant
# and, for an unseeded tree, could silently produce a different model.
model = gcv.best_estimator_
model

In [None]:
# Predictions of the pre-pruned tree on the training and held-out rows.
pred_on_train, pred_on_test = model.predict(x_train), model.predict(x_test)

In [None]:
# Accuracy of the pre-pruned tree on seen vs. unseen rows.
train_accuracy = accuracy_score(y_train, pred_on_train)
test_accuracy = accuracy_score(y_test, pred_on_test)
print(f"Accuracy on  Train data : {train_accuracy}")
print(f"Accuracy on  TEST data : {test_accuracy}")

In [None]:
# Confusion matrices for the pre-pruned tree (rows = true class, columns = predicted class).
cm_train = confusion_matrix(y_train,pred_on_train)
cm_test = confusion_matrix(y_test,pred_on_test)

# fmt="d" prints the cell counts as integers; the default format switches to
# scientific notation once a count has three digits.
print("Train Confusion Matrix")
sns.heatmap(cm_train,annot=True,fmt="d",xticklabels=classes,yticklabels=classes,cmap="Blues")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

# Same colour map as the train matrix so the two plots are directly comparable.
print("Test Confusion Matrix")
sns.heatmap(cm_test,annot=True,fmt="d",xticklabels=classes,yticklabels=classes,cmap="Blues")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

In [None]:
# Draw the pre-pruned tree. Feature names are passed so the split nodes show
# column names instead of positional x[i] labels.
plt.figure(figsize=(25,25))
tree.plot_tree(model,feature_names=list(x_train.columns),class_names=classes,filled=True,rounded=True)
plt.show()

In [None]:
# Now Lets use post pruning
# Cost Complexity Pruning

In [None]:
# # Decision trees can easily overfit.
# One way to avoid it is to limit the growth of trees by setting constraints.
# We can limit parameters like max_depth, min_samples etc.
# But a more effective way is to use post-pruning methods like cost complexity pruning.
# This helps to improve test accuracy and get a better model.

# # Cost complexity pruning is all about finding the right value for alpha. We will get the alpha values for this tree and check the accuracy of the pruned trees.

In [None]:
# Compute the cost-complexity pruning path on the training split.
# The tree is seeded (same seed as the baseline tree) so the candidate alphas
# are the same on every run.
post_pruning = DecisionTreeClassifier(random_state=0)

path = post_pruning.cost_complexity_pruning_path(x_train,y_train)

# Candidate alphas and the total leaf impurity of the subtree at each one.
ccp_alphas, impurities = path.ccp_alphas, path.impurities

In [None]:
# Fit one pruned tree per candidate alpha.
# Each tree is seeded (same seed as the baseline tree) so the fitted trees,
# and the accuracy curves derived from them, are repeatable across runs.
clfs = []

for ccp_alpha in ccp_alphas:
    clf = DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha)
    clf.fit(x_train,y_train)
    clfs.append(clf)

In [None]:
#We will remove the last element in clfs and ccp_alphas, because it is the trivial tree with only one node.

In [None]:
# Drop the last model only when it really is the single-node tree. The guard
# makes this cell safe to re-run: an unconditional slice would remove one more
# useful model (and alpha) on every execution.
if len(clfs) > 0 and clfs[-1].tree_.node_count == 1:
    clfs = clfs[:-1]
    ccp_alphas = ccp_alphas[:-1]

In [None]:
# Record the size of every pruned tree: node count and maximum depth.
nodes, depth = [], []
for fitted_tree in clfs:
    nodes.append(fitted_tree.tree_.node_count)
    depth.append(fitted_tree.tree_.max_depth)

In [None]:
# Show how tree size shrinks as the pruning strength alpha grows.
plt.figure(figsize=(15,8))

# The lines carry labels so plt.legend() has something to show; without them
# the legend is empty and matplotlib warns that no labelled artists were found.
plt.plot(ccp_alphas, nodes,label="no of nodes",drawstyle="steps-post")
plt.plot(ccp_alphas, depth,label="depth",drawstyle="steps-post")
plt.scatter(ccp_alphas, nodes)
plt.scatter(ccp_alphas, depth)
plt.xlabel("ccp_alpha")
plt.ylabel("count")
plt.title("Number of nodes and depth vs alpha")
plt.legend()
plt.show()

In [None]:
# Train and test accuracy of every pruned tree, plotted against its alpha.
plt.figure(figsize=(15,8))

train_acc = []
test_acc = []
for c in clfs:
    y_train_pred = c.predict(x_train)
    y_test_pred = c.predict(x_test)
    # accuracy_score takes (y_true, y_pred), the same order used by the other cells.
    train_acc.append(accuracy_score(y_train,y_train_pred))
    test_acc.append(accuracy_score(y_test,y_test_pred))

plt.scatter(ccp_alphas,train_acc)
plt.scatter(ccp_alphas,test_acc)
plt.plot(ccp_alphas,train_acc,label='train_accuracy',drawstyle="steps-post")
plt.plot(ccp_alphas,test_acc,label='test_accuracy',drawstyle="steps-post")
plt.xlabel('ccp_alpha')
plt.ylabel('accuracy')
plt.legend()
plt.title('Accuracy vs alpha')
plt.show()

In [None]:
# We will select ccp_alpha = 0.02

In [None]:
# Refit a single tree at the chosen pruning strength.
# The tree is seeded (same seed as the baseline tree) so the scores reported
# below are repeatable.
DTREE = DecisionTreeClassifier(ccp_alpha=0.02, random_state=0)
DTREE.fit(x_train,y_train)

pred_on_train = DTREE.predict(x_train)
pred_on_test = DTREE.predict(x_test)

In [None]:
# Accuracy and confusion matrix of the post-pruned tree on both splits.
# fmt="d" prints the cell counts as integers; the default format switches to
# scientific notation once a count has three digits.
print(f'Train score {accuracy_score(y_train,pred_on_train)}')
sns.heatmap(confusion_matrix(y_train,pred_on_train),annot=True,fmt="d",xticklabels=classes,yticklabels=classes,cmap="Blues")
plt.show()
print(f'Test score {accuracy_score(y_test,pred_on_test)}')
sns.heatmap(confusion_matrix(y_test,pred_on_test),annot=True,fmt="d",xticklabels=classes,yticklabels=classes,cmap="Blues")
plt.show()

In [None]:
# Cost Complexity Pruning is giving the better output

In [None]:
# Draw the post-pruned tree. Feature names are passed so the split nodes show
# column names instead of positional x[i] labels.
plt.figure(figsize=(25,25))
tree.plot_tree(DTREE,feature_names=list(x_train.columns),class_names=classes,filled=True,rounded=True)
plt.show()