In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from sklearn.ensemble import RandomForestClassifier

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it
for folder, _, files_in_folder in os.walk('/kaggle/input'):
    for file_name in files_in_folder:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Shared seed for the randomized steps of this notebook: wherever it is passed
# as `random_state`, the result stays the same from one run to the next
random_state = 0

In [None]:
# Read heart.csv from the Kaggle input directory and preview the first rows
heart_csv_path = '/kaggle/input/health-care-data-set-on-heart-attack-possibility/heart.csv'
raw_data = pd.read_csv(heart_csv_path)
raw_data.head()

# The columns are:

* age
* sex: 1 = male, 0 = female
* cp: chest pain type
    * 1: typical angina
    * 2: atypical angina
    * 3: non-anginal pain
    * 4: asymptomatic
* trestbps: resting blood pressure
* chol: serum cholesterol in mg/dl
* fbs: fasting blood sugar > 120 mg/dl (1 = true, 0 = false)
* restecg: resting electrocardiographic results 
    * 0: normal
    * 1: having ST-T wave abnormality
    * 2: showing probable or definite left ventricular hypertrophy
* thalach: maximum heart rate achieved
* exang: exercise induced angina 0 = no, 1 = yes
* oldpeak: oldpeak = ST depression induced by exercise relative to rest
* slope: the slope of the peak exercise ST segment
    * 1: upsloping
    * 2: flat
    * 3: downsloping
* ca: number of major vessels (0-3) colored by fluoroscopy
* thal: 0 = normal; 1 = fixed defect; 2 = reversible defect
* target: 0 = less chance of heart attack 1 = more chance of heart attack

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column
raw_data.describe()

In [None]:
# Count the missing values in each column
raw_data.isna().sum()

In [None]:
# Print each column name followed by the distinct values found in that column
for column_name, column_values in raw_data.items():
    print(column_name)
    print(column_values.unique())
    print('\n')

# Data preprocessing

As some of the columns are categorical data, we need to create dummies for them. We'll apply the get dummies method to the cp, restecg, slope and thal columns.

In [None]:
# One-hot encode the categorical columns; every other column is carried over as is
categorical_columns = ['cp', 'restecg', 'slope', 'thal']
df = pd.get_dummies(raw_data, columns=categorical_columns)
df.head()

# Model building

Let's try to fit a Decision Tree and a Random Forest to compare each method. We can apply some techniques to improve the models as well.

In [None]:
# Separate the label column from the feature matrix
y = df['target']
X_encoded = df.drop(columns=['target'])

In [None]:
# Hold out part of the data for testing (scikit-learn's default split size);
# the shuffle is seeded so the same rows land in each set on every run
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    random_state=random_state,
)

In [None]:
# Decision Tree
# Seeded so the fitted tree and the scores derived from it are reproducible:
# scikit-learn permutes the features at every split, so ties between equally
# good splits can resolve differently between runs when no random_state is given.
dt = DecisionTreeClassifier(random_state=random_state)
dt.fit(X_train, y_train)

# Cross validation
# cross_val_score fits clones of the estimator, so the fitted `dt` above is left untouched
dt_accuracy = np.mean(cross_val_score(dt, X_train, y_train, cv=5, scoring='accuracy'))
print("Mean accuracy: ", dt_accuracy)

# Plotting the tree
plt.figure(figsize=(28, 18))
# feature_names is passed as a plain list: some scikit-learn releases reject a pandas Index here
plot_tree(dt, filled=True, rounded=True, class_names=['No HD', 'HD'], feature_names=list(X_encoded.columns));

# Let's check the accuracy of its predictions

In [None]:
# Confusion matrix of the tree's predictions on the held-out test set
# NOTE(review): plot_confusion_matrix is deprecated in newer scikit-learn releases
# (ConfusionMatrixDisplay.from_estimator replaces it) - confirm against the installed version
hd_display_labels = ['Does not have HD', 'Has HD']
plot_confusion_matrix(dt, X_test, y_test, display_labels=hd_display_labels)
# The seaborn theme set at the top of the notebook draws a grid; switch it off for this figure
plt.grid(False)

In [None]:
# Overall accuracy of the unpruned tree on the test set
dt.score(X_test, y_test)

We can try to improve the model by pruning. Let's visualize the optimal alpha for our tree.

# Cost complexity pruning: visualizing alpha

## **This part was taken from this [webinar](https://www.youtube.com/watch?v=q90UDEgYqeI)**

In [None]:
# Determine the values for alpha
path = dt.cost_complexity_pruning_path(X_train, y_train)
# Extract the different values for alpha
ccp_alphas = path.ccp_alphas
# Exclude the maximum value for alpha, as this value would produce a tree with only one leaf
ccp_alphas = ccp_alphas[:-1]

# Fit one tree per candidate alpha.
# The trees are built without rebinding `dt`, so the baseline tree fitted earlier
# stays available (re-running the earlier confusion-matrix / score cells still
# reports the unpruned tree), and each tree is seeded so the run is reproducible.
dts = [
    DecisionTreeClassifier(ccp_alpha=ccp_alpha, random_state=random_state).fit(X_train, y_train)
    for ccp_alpha in ccp_alphas
]

Now let's plot the accuracy of the trees using the Training Dataset and the Testing Dataset as a function of alpha

In [None]:
# Accuracy of every pruned tree on the training set and on the testing set
train_scores = [tree.score(X_train, y_train) for tree in dts]
test_scores = [tree.score(X_test, y_test) for tree in dts]

# Draw both curves as step functions of alpha on a single set of axes
fig, ax = plt.subplots(figsize=(12,8))
ax.set(
    xlabel="alpha",
    ylabel="accuracy",
    title="Accuracy vs alpha for training and testing sets",
)
for split_scores, split_label in ((train_scores, "train"), (test_scores, "test")):
    ax.plot(ccp_alphas, split_scores, marker='o', label=split_label, drawstyle='steps-post')
ax.legend()
plt.show()

# Cost complexity pruning: Cross validation for finding the best alpha

A second method that we can apply is cross validation for finding the best alpha. We will run a 5-fold cross validation for each candidate alpha and plot the results.

In [None]:
# One row per candidate alpha: [alpha, mean CV accuracy, std of CV accuracy]
alpha_loop_values = []

# Cross validation
for ccp_alpha in ccp_alphas:
    # Use a dedicated name instead of rebinding `dt`: earlier cells use `dt` for a
    # fitted tree, and replacing it with an unfitted candidate would break them on re-run.
    # The candidate is seeded so the comparison between alphas is reproducible.
    candidate_dt = DecisionTreeClassifier(ccp_alpha=ccp_alpha, random_state=random_state)
    scores = cross_val_score(candidate_dt, X_train, y_train, cv=5, scoring='accuracy')
    alpha_loop_values.append([ccp_alpha, np.mean(scores), np.std(scores)])

# Let's visualize the candidate alphas
alpha_results = pd.DataFrame(alpha_loop_values, columns=['alpha', 'mean_accuracy', 'std'])

# Mean accuracy per alpha, with the fold-to-fold standard deviation as error bars
ax = alpha_results.plot(x='alpha', y='mean_accuracy', yerr='std', marker='o', linestyle='--')
ax.set_ylabel('mean accuracy')
ax.set_title('5-fold cross-validated accuracy vs alpha');

In [None]:
# Let's get the best value
# idxmax has to be *called*: indexing with the bare method only worked by accident
# (pandas invoked it as a callable indexer) and is not reliable across pandas versions.
best_row = alpha_results['mean_accuracy'].idxmax()
# Label-based lookup of the alpha stored on the row with the highest mean accuracy
ideal_ccp_alpha = alpha_results.loc[best_row, 'alpha']
ideal_ccp_alpha

In [None]:
# Pruned Decision Tree
# Built with the selected alpha and seeded so the fitted tree, its plot and its
# test accuracy are the same on every run.
dt_pruned = DecisionTreeClassifier(ccp_alpha=ideal_ccp_alpha, random_state=random_state)
dt_pruned.fit(X_train, y_train)

# Plotting the tree
plt.figure(figsize=(24, 18))
# feature_names is passed as a plain list: some scikit-learn releases reject a pandas Index here
plot_tree(dt_pruned, filled=True, rounded=True, class_names=['No HD', 'HD'], feature_names=list(X_encoded.columns));

In [None]:
# Confusion matrix of the pruned tree's predictions on the held-out test set
# NOTE(review): plot_confusion_matrix is deprecated in newer scikit-learn releases
# (ConfusionMatrixDisplay.from_estimator replaces it) - confirm against the installed version
pruned_display_labels = ['Does not have HD', 'Has HD']
plot_confusion_matrix(
    dt_pruned,
    X_test,
    y_test,
    display_labels=pruned_display_labels,
)
# Turn off the grid that the seaborn theme would otherwise draw over the matrix
plt.grid(False)

In [None]:
# Overall accuracy of the pruned tree on the test set
dt_pruned.score(X_test, y_test)

### **There's a tiny improvement over the original tree, and in this case (for this random state) it seems like we lost time, but after all, we got a higher accuracy with a much smaller tree, and that's good.**

# Random Forest

Let's now try a much better approach to classification: Random Forest.

In [None]:
# Random Forest
# Seeded: bootstrapping and feature sub-sampling are random, so without a
# random_state the cross-validation and test accuracies change on every run.
rf = RandomForestClassifier(n_estimators=50, random_state=random_state)
rf.fit(X_train, y_train)

# Cross validation
# cross_val_score fits clones of the estimator, so the fitted `rf` above is left untouched
rf_accuracy = np.mean(cross_val_score(rf, X_train, y_train, cv=5, scoring='accuracy'))
print('Mean accuracy: ', rf_accuracy)

In [None]:
# Accuracy of the fitted random forest on the held-out test set
rf.score(X_test, y_test)

## Let's tune the model

* n_estimators: use the out-of-bag (OOB) error to find a good value for n_estimators

In [None]:
# Out-of-bag (OOB) error of the forest for each candidate number of trees
error_rate = []
n_est = []

for n in range(20, 200):
    # Reconfigure and refit the same estimator; the fixed seed makes every fit reproducible
    rf.set_params(n_estimators=n, oob_score=True, random_state=random_state)
    rf.fit(X_train, y_train)

    # Record the OOB error for each `n_estimators=n` setting.
    oob_error = 1 - rf.oob_score_
    n_est.append(n)
    error_rate.append(oob_error)

# The curve is labelled so the legend has an entry (a bare plt.legend() with no
# labelled artists only emits a warning), and the axes are named so the figure stands alone.
fig, ax = plt.subplots()
ax.plot(n_est, error_rate, label='OOB error')
ax.set(xlabel='n_estimators', ylabel='OOB error rate', title='OOB error vs n_estimators')
ax.legend()
plt.show()

The value for n_estimators seems to be stable after n=50 (using a random_state)