In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Apply matplotlib's 'ggplot' style sheet to the figures drawn in this notebook.
plt.style.use('ggplot')

In [None]:
# Load the heart-disease table into a DataFrame.
# NOTE(review): the relative '../input/...' path looks like a Kaggle dataset mount —
# adjust it when running the notebook elsewhere.
data = pd.read_csv('../input/heart-disease/heart.csv')

In [None]:
# Preview the first rows to check that the file was parsed as expected.
data.head()

In [None]:
# Number of missing values in each column.
data.isnull().sum()

In [None]:
# Data type of each column.
data.dtypes

## EDA

In [None]:
# Overview: one histogram per column of `data`, laid out as a grid in a single figure.
data.hist(figsize=(10,8))
# Re-space the subplots so their titles and tick labels do not overlap.
plt.tight_layout()
plt.show()

In [None]:
# distribution plot for numeric variables

In [None]:
# Distribution (histogram with a KDE overlay) of each continuous feature,
# one stacked panel per feature.
numeric_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

fig, ax = plt.subplots(len(numeric_cols), 1, figsize=(11,10))
for col, panel in zip(numeric_cols, ax):
    sns.histplot(data[col], ax=panel, kde=True)

# Re-space the stacked panels so their x-labels do not overlap, and end the
# cell with show() so it does not echo an Axes repr.
fig.tight_layout()
plt.show()

In [None]:
# correlation and heatmap

In [None]:
# Pairwise correlations between all columns of `data`.
corr_matrix = data.corr()

# Boolean mask for the heatmap: a True entry hides that cell. Only the strict
# upper triangle (k=1) is hidden, so every pair is drawn once and the diagonal
# stays visible. An explicit boolean mask is used instead of masking with the
# correlation values themselves, which would leave a cell visible whenever its
# correlation is exactly 0.
mask = np.triu(np.ones_like(corr_matrix, dtype=bool), k=1)

In [None]:
# Annotated correlation heatmap; cells flagged in `mask` are left blank.
plt.figure(figsize=(11,9))
sns.heatmap(corr_matrix, mask=mask, annot=True,
            square=True, cmap='viridis')
# Rotate the x tick labels *before* tight_layout so the layout pass takes the
# rotated labels into account.
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Data preprocessing

In [None]:
# Categorical predictors (these are the columns that get one-hot encoded).
cat_variables = ['cp', 'restecg', 'slope', 'thal']

# Split features and label by column name. Dropping 'target' explicitly,
# rather than slicing off the last column by position, keeps the label out of
# X even if the column order of the CSV changes. drop() returns a new frame,
# so X can be modified without touching `data`.
X = data.drop(columns='target')
y = data['target']

In [None]:
# One-hot encode the columns listed in cat_variables; the other columns of X are kept as they are.
X_enc = pd.get_dummies(X, columns=cat_variables)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier, plot_tree

try:
    # plot_confusion_matrix was removed in scikit-learn 1.2;
    # ConfusionMatrixDisplay.from_estimator is its replacement.
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    plot_confusion_matrix = None

In [None]:
# Hold out 33% of the rows as a test set; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X_enc, y, test_size=0.33, random_state=101)

In [None]:
# Baseline model: a decision tree with default settings, fitted on the training split.
# fit() returns the estimator itself, so construction and fitting are chained.
clf = DecisionTreeClassifier(random_state=101).fit(X_train, y_train)

In [None]:
# Confusion matrix of the baseline tree on the held-out test set.
# ConfusionMatrixDisplay.from_estimator replaces plot_confusion_matrix, which
# was removed in scikit-learn 1.2.
cm_display = ConfusionMatrixDisplay.from_estimator(clf, X_test, y_test)
# Turn off the grid lines that the ggplot style would draw across the matrix cells.
cm_display.ax_.grid(False)
plt.show()

In [None]:
# Draw the fitted tree held in `clf`.
plt.figure(figsize=(20,15))
plot_tree(clf,
          rounded=True,
          filled=True,
          # Passed as a plain list: some scikit-learn releases validate this
          # argument strictly and reject a pandas Index.
          feature_names=list(X_enc.columns))
plt.show()

## Pruning trees

In [None]:
# Cost-complexity pruning path of the tree, computed on the training split.
path = clf.cost_complexity_pruning_path(X_train, y_train)

In [None]:
# Candidate alphas taken from the pruning path, without its last entry.
# NOTE(review): presumably the last alpha is the one that prunes the tree down
# to its root node — confirm against the scikit-learn documentation.
ccp_alphas = path['ccp_alphas'][:-1]

In [None]:
# Fit one tree per candidate ccp_alpha, all on the same training split.
# fit() returns the fitted estimator, so each list element is a trained tree.
clfs = [
    DecisionTreeClassifier(random_state=101, ccp_alpha=candidate).fit(X_train, y_train)
    for candidate in ccp_alphas
]

In [None]:
# Score every pruned tree on both the train set and the test set.
train_scores, test_scores = [], []
for tree in clfs:
    train_scores.append(tree.score(X_train, y_train))
    test_scores.append(tree.score(X_test, y_test))

In [None]:
# Train vs. test score of the pruned trees as a function of ccp_alpha.
# Both curves share the same marker / line / step styling.
curve_style = dict(marker='o', ls='--', drawstyle='steps-post')

plt.figure(figsize=(11,7))
plt.plot(ccp_alphas, train_scores, label='train set', color='r', **curve_style)
plt.plot(ccp_alphas, test_scores, label='test set', color='g', **curve_style)
# Label the axes so the figure can be read on its own.
plt.xlabel('ccp_alpha')
plt.ylabel('accuracy')
plt.legend()
plt.show()

In [None]:
# Choosing alpha 0.017 as it leads to higher accuracy

In [None]:
# Using cross validation to test 0.017 (alpha) 

In [None]:
# Unfitted tree using the hand-picked alpha of 0.017; note that this rebinds `clf`.
clf = DecisionTreeClassifier(random_state=101, ccp_alpha=0.017)

In [None]:
# 5-fold cross-validation of `clf` on the training split, tabulated with one
# row per fold (the 'tree' column simply numbers the folds from 1).
scores = cross_val_score(clf, X_train, y_train, cv=5)
df_scores = pd.DataFrame({
    'tree': range(1, len(scores) + 1),
    'accuracy': scores,
})

In [None]:
# Show the per-fold cross-validation scores.
df_scores.head()

In [None]:
# Score per cross-validation fold.
# figsize is passed to DataFrame.plot itself: pandas opens its own figure when
# no `ax` is given, so a separate plt.figure(figsize=...) call would only leave
# an empty extra figure behind and its size would not apply to this plot.
df_scores.plot(x='tree', y='accuracy', marker='o', ls='--', figsize=(10,8))
plt.show()

In [None]:
# The score varies noticeably from one fold to another, so each candidate alpha
# is summarised by the mean and standard deviation of its 5-fold CV scores.
alpha_cross = []

for candidate in ccp_alphas:
    fold_scores = cross_val_score(
        DecisionTreeClassifier(random_state=101, ccp_alpha=candidate),
        X_train,
        y_train,
        cv=5,
    )
    alpha_cross.append([candidate, fold_scores.mean(), fold_scores.std()])

# One row per alpha.
df_cross = pd.DataFrame(
    alpha_cross,
    columns=['ccp_alpha', 'mean_accuracy', 'std_accuracy'],
)

In [None]:
# Preview the per-alpha cross-validation summary.
df_cross.head()

In [None]:
# Mean cross-validated score per alpha, with the per-alpha standard deviation
# drawn as error bars.
# figsize is passed to DataFrame.plot itself: pandas opens its own figure when
# no `ax` is given, so a separate plt.figure(figsize=...) call would only leave
# an empty extra figure behind and its size would not apply to this plot.
df_cross.plot(x='ccp_alpha',
              y='mean_accuracy',
              yerr='std_accuracy',
              marker='o',
              ls='--',
              figsize=(11,9))
plt.show()

In [None]:
# Show the three alphas with the highest mean cross-validated accuracy.
df_cross.sort_values(by='mean_accuracy', ascending=False).head(3)

In [None]:
# Pick the alpha with the highest mean cross-validated accuracy straight from
# the table instead of a hard-coded row position, so the choice stays correct
# when the data or the pruning path changes. On ties idxmax returns the first
# matching row.
final_alpha = df_cross.loc[df_cross['mean_accuracy'].idxmax(), 'ccp_alpha']
print("The ideal alpha for this dt model is %3.5f" % final_alpha)

In [None]:
# Building the final model: a tree pruned with the selected alpha, fitted on
# the training split. fit() returns the estimator itself, so the fitted model
# is bound directly to `final_clf` (no stray, misspelled second variable).
final_clf = DecisionTreeClassifier(random_state=101, ccp_alpha=final_alpha).fit(X_train, y_train)

In [None]:
# Confusion matrix of the final (pruned) tree on the held-out test set.
# ConfusionMatrixDisplay.from_estimator replaces plot_confusion_matrix, which
# was removed in scikit-learn 1.2.
cm_display = ConfusionMatrixDisplay.from_estimator(final_clf, X_test, y_test)
# Turn off the grid lines that the ggplot style would draw across the matrix cells.
cm_display.ax_.grid(False)
plt.show()

In [None]:
# The pruned tree performs better than the previous (unpruned) one!!
# Finished !!
