# Decision Tree Classifier - Heart Disease

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for folder, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree # drawing classification trees
from sklearn.model_selection import train_test_split # split the data into training and testing sets
from sklearn.model_selection import cross_val_score # cross validations
from sklearn.metrics import confusion_matrix # generate the confusion matrix
# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this import fails on newer releases — confirm the installed version.
from sklearn.metrics import plot_confusion_matrix # draws the confusion matrix


## Exploratory Data Analysis

In [None]:
# Load the Kaggle copy of the data set from the notebook's "../input/" directory and preview the first rows
data = pd.read_csv('../input/heart-disease-uci/heart.csv')
data.head()

In [None]:
# Fetch the processed Cleveland file straight from the UCI repository.
# The file is read without a header row, so the columns are numbered for now.
df_uci = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data',
    header=None,
)
df_uci.head()

In [None]:
# Work on a copy so that `data`, as loaded from the CSV, is left unmodified
df = data.copy()
df.shape

In [None]:
# Dimensions of the frame downloaded from the UCI repository
df_uci.shape

In [None]:
# Summarize the columns, non-null counts and dtypes of the Kaggle copy
df.info()

In [None]:
# The UCI file was read with header=None, so the column labels are assigned by position
df_uci.columns = [
    'age', 'gender', 'cp', 'restbp', 'chol',
    'fbs', 'restecg', 'thalach', 'exang', 'oldpeak',
    'slope', 'ca', 'thal', 'hd',
]
df_uci.head()

In [None]:
# Inspect the dtype pandas inferred for each column
df_uci.dtypes

In [None]:
# Print each column label followed by the distinct values found in that column
for label, values in df_uci.items():
    print(label, values.unique())

In [None]:
# Count the rows in which 'ca' or 'thal' holds the '?' placeholder
len(df_uci.loc[(df_uci['ca'] == '?') | (df_uci['thal'] == '?')])

In [None]:
# Display the rows in which 'ca' or 'thal' equals '?'
df_uci.loc[(df_uci['ca'] == '?') | (df_uci['thal'] == '?')]

In [None]:
# Keep only the rows where neither 'ca' nor 'thal' is the '?' placeholder
df2 = df_uci.loc[df_uci['ca'].ne('?') & df_uci['thal'].ne('?')]
df2.head()

In [None]:
# Number of distinct values per column in the filtered frame
df2.nunique()

In [None]:
# The feature matrix holds the independent variables used to make the prediction.
# 'ca' is compared against the string '?' earlier in the notebook, so pandas stores it as
# text; convert it to numbers here so the classifier receives a numeric feature instead of
# relying on implicit string-to-float coercion. ('thal' is one-hot encoded later.)
X = (
    df2
    .drop('hd', axis = 1)
    .assign(ca = lambda features: pd.to_numeric(features['ca']))
)
X.head()

In [None]:
# The target vector: the dependent variable 'hd' that the model will predict
y = df2['hd'].copy()
y.head()

In [None]:
# List the distinct values of 'cp' before one-hot encoding it
X['cp'].unique()

In [None]:
# Preview the one-hot encoding of 'cp'; the result is only displayed, not stored
pd.get_dummies(X, columns = ['cp']).head()

In [None]:
# One-hot encode the four listed columns; the remaining columns are kept as they are
X1 = pd.get_dummies(
    X,
    columns=['cp', 'restecg', 'slope', 'thal'],
)
X1.head()

In [None]:
# Distinct values of the target before it is reduced to two classes
y.unique()

In [None]:
# Binary classification: every target value above zero is treated as
# "patient has heart disease" and is collapsed to 1.
y1 = y.gt(0)
y.loc[y1] = 1
y.unique()

## Build a Classification Tree

In [None]:
# Hold out part of the data for testing (fixed seed so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(
    X1,
    y,
    random_state=1,
)

# Build the decision tree and train it on the training portion in one step
clf = DecisionTreeClassifier(random_state=1).fit(X_train, y_train)

In [None]:
# Draw the fitted tree. feature_names is passed as a plain list because some
# scikit-learn releases validate this parameter strictly and reject a pandas Index.
plt.figure(figsize = (15, 7.5))
plot_tree(
    clf,
    filled = True,
    rounded = True,
    class_names = ["No HD", "Yes HD"],
    feature_names = list(X1.columns),
);

In [None]:
## Plot the confusion matrix for the held-out test set
# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2
# (ConfusionMatrixDisplay.from_estimator is its replacement) — confirm the installed version.
plot_confusion_matrix(clf, X_test, y_test, display_labels = ["No HD", "Yes HD"])

### Cost Complexity Pruning

In [None]:
# Compute the cost-complexity pruning path on the training data and keep every
# candidate alpha except the last one.
path = clf.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas = path.ccp_alphas[:-1]

# Train one tree per remaining alpha, each with random_state = 1
clf_dts = []
for ccp_alpha in ccp_alphas:
    clf_dt = DecisionTreeClassifier(ccp_alpha = ccp_alpha, random_state = 1).fit(X_train, y_train)
    clf_dts.append(clf_dt)

In [None]:
# Score every pruned tree on the training data and on the testing data
train_scores = []
test_scores = []
for fitted_tree in clf_dts:
    train_scores.append(fitted_tree.score(X_train, y_train))
    test_scores.append(fitted_tree.score(X_test, y_test))