## UCI Heart Disease Dataset: 

#### Using Decision Trees and Random Forest

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Location of the heart-disease CSV, relative to the notebook's working directory.
DATA_PATH = '../input/heart-disease-uci/heart.csv'

# Read the whole dataset into a DataFrame.
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows of the raw data.
df.head()

In [None]:
# Print the frame summary (columns, dtypes, non-null counts).
df.info()

In [None]:
# Draw missingno's nullity matrix to spot missing entries per column.
msno.matrix(df)
# Observation: no missing values in the raw data.

In [None]:
# Map the terse original column names to descriptive ones.
readable_names = {
    'cp': 'chest_pain',
    'trestbps': 'blood_pressure',
    'chol': 'cholesterol',
    'fbs': 'blood_sugar',
    'restecg': 'rest_ecg',
    'thalach': 'heart_rate',
    'exang': 'exercise_angina',
    'oldpeak': 'st_peak',
    'slope': 'st_slope',
    'ca': 'n_vessels',
    'thal': 'thallium',
    'target': 'heart_disease',
}
# Rename in place so `df` keeps referring to the same frame.
df.rename(columns=readable_names, inplace=True)

In [None]:
# Decode the integer-coded columns into readable labels.
#
# Each column is reassigned explicitly. Calling `replace(..., inplace=True)` on
# `df.<column>` modifies an intermediate Series: under pandas Copy-on-Write that
# never reaches `df`, and recent pandas versions warn about the chained in-place
# call. Plain assignment works on every pandas version.
value_labels = {
    'sex': {0: 'woman', 1: 'man'},
    'chest_pain': {1: 'typical', 2: 'atypical', 3: 'non-anginal', 0: 'none'},
    'blood_sugar': {0: 'normal', 1: 'high'},
    'rest_ecg': {1: 'normal', 2: 'wave-abnormal', 0: 'lv-hyperthrophy'},
    'exercise_angina': {0: 'no', 1: 'yes'},
    'st_slope': {2: 'increasing', 1: 'flat', 0: 'decreasing'},
    'thallium': {
        0: float('nan'),  # 0 is not a valid value; mark it missing so it can be imputed
        2: 'normal',
        1: 'fixed',
        3: 'reversable',
    },
    # NOTE(review): 0 is decoded as 'yes' (disease present) and 1 as 'no';
    # confirm this matches the target encoding of this particular CSV.
    'heart_disease': {0: 'yes', 1: 'no'},
}
for column, mapping in value_labels.items():
    df[column] = df[column].replace(mapping)

In [None]:
# Show the most frequent value(s) of every column.
df.mode()

In [None]:
# First row of the mode table: one most-frequent value per column.
df.mode().iloc[0]

In [None]:
# Fill missing entries in each column with that column's most frequent value.
most_frequent = df.mode().iloc[0]
df.fillna(most_frequent, inplace=True)

In [None]:
# Columns treated as categorical (the target `heart_disease` is included here).
categoricals = [
    'sex',
    'chest_pain',
    'blood_sugar',
    'rest_ecg',
    'exercise_angina',
    'st_slope',
    'thallium',
    'heart_disease',
    'n_vessels',
]

# Columns treated as continuous numeric measurements.
numericals = [
    'age',
    'blood_pressure',
    'cholesterol',
    'heart_rate',
    'st_peak',
]

In [None]:
# Convert every categorical column to pandas' `category` dtype in one call.
df = df.astype({col: 'category' for col in categoricals})


In [None]:
# Keep only the feature columns: the target is not a model input.
feature_categoricals = []
for name in categoricals:
    if name != 'heart_disease':
        feature_categoricals.append(name)
categoricals = feature_categoricals

In [None]:
# Desired column layout: numeric features, categorical features, then the target.
col_order = [*numericals, *categoricals, 'heart_disease']

In [None]:
# Rearrange the frame's columns into `col_order`.
df = df[col_order]

In [None]:
# One bar chart per categorical feature, showing for each of its values the
# percentage of patients labelled 'yes' for heart disease.
fig, axes = plt.subplots(3, 3, sharey=True, figsize=(16,16))
has_disease = df.heart_disease == 'yes'

for ax, col in zip(axes.flat, categoricals):
    levels = df[col].unique()
    # Share of 'yes' patients within each level, expressed as a percentage.
    pct_with_disease = [100 * has_disease[df[col] == level].mean() for level in levels]

    ax.bar([str(level) for level in levels], pct_with_disease)
    ax.set_ylim(0, 100)
    ax.set_xlabel(f"Values for {col}", fontsize=14)
    ax.set_ylabel('Patients with heart disease', fontsize=14)
    ax.set_title(f"Distribution for {col}", fontsize=14)

# The 3x3 grid's last panel is unused with eight categorical features; remove it.
fig.delaxes(axes.flat[-1])
fig.tight_layout();

In [None]:
# Target column name, and every other column as a model feature.
label = 'heart_disease'
features = df.columns[df.columns != label].tolist()

In [None]:
# Separate the feature matrix from the target vector.
X, y = df[features], df[label]

In [None]:
# Display the feature column names.
X.columns

In [None]:
# Encode the target as plain integers: 1 = 'yes' (the positive class used by the
# recall scorer), 0 = 'no'.
# `map` + `astype(int)` is used instead of `replace`, because replacing values
# on a categorical Series is deprecated in recent pandas and would leave `y`
# with a category dtype rather than integers. An unexpected label becomes NaN
# and makes `astype(int)` fail loudly instead of passing through silently.
y = y.map({'no': 0, 'yes': 1}).astype(int)

In [None]:
# Hold out 20% of the rows as a test set; shuffling is seeded for repeatability.
split_options = {'shuffle': True, 'test_size': .2, 'random_state': 0}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline

In [None]:
# Seeded, shuffled 5-fold stratified splitter shared by both grid searches.
kfold = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=0,
)

In [None]:
# Preprocessing: standardise the numeric columns and one-hot encode the
# categorical ones, dropping the first level of each categorical feature.
scale_numericals = ('numerical', StandardScaler(), numericals)
encode_categoricals = ('categorical', OneHotEncoder(drop='first'), categoricals)
transformer = ColumnTransformer(transformers=[scale_numericals, encode_categoricals])

In [None]:
# Hyper-parameter grid explored for the decision tree.
# The float values for the min_samples_* options are fractions of the sample count.
tree_param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [3, 5, 10, 20],
    'min_samples_split': [.001, .01, .1],
    'min_samples_leaf': [.001, .01, .1]
}

# Cross-validated search that selects the tree with the best recall.
tree_search = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=0),
    param_grid=tree_param_grid,
    scoring='recall',
    cv=kfold,
    n_jobs=4,
    verbose=1,
)

# Full model: preprocessing followed by the grid search.
# Step 0 is the transformer and the last step is the fitted search.
tree = make_pipeline(transformer, tree_search)

In [None]:
# Fit the preprocessing step and run the decision-tree grid search on the training set.
tree.fit(X_train,y_train)

In [None]:
# Show the decision tree selected by the grid search (last pipeline step).
tree[-1].best_estimator_

In [None]:
# Predict the held-out test set with the fitted tree pipeline.
prediction = tree.predict(X_test)

In [None]:
# Report the fraction of test samples the tree pipeline classified correctly.
tree_test_accuracy = accuracy_score(y_test, prediction)
print("Accuracy score on the test set:", tree_test_accuracy)

In [None]:
# Confusion matrix of the decision tree on the test set, drawn as a heatmap.
#
# scikit-learn's confusion matrix has the true classes on the rows and the
# predicted classes on the columns, both ordered by the same label list. One
# shared label set (classes seen in either vector) is therefore used for both
# axes. Labelling rows from `prediction` and columns from `y_test` would swap
# them, and would fail to build the frame if one side lacked a class.
labels = np.union1d(y_test, prediction)
data = confusion_matrix(y_test, prediction, labels=labels)
df_cm = pd.DataFrame(data, index=labels, columns=labels)
df_cm.index.name = 'Actual'
df_cm.columns.name = 'Predicted'
plt.figure(figsize = (10,10))
sns.set(font_scale=1.5)  # larger font for tick labels and annotations
# fmt='d' prints the cell counts as integers rather than in the default '.2g' format.
sns.heatmap(df_cm, cmap="Blues", annot=True, fmt='d', cbar = False)

In [None]:
pip install scikit-plot

In [None]:
import scikitplot as skplt

## Trying out the scikit-plot library for the first time; the confusion matrix is the same as the one above, albeit with fewer lines of code.

In [None]:
# Same decision-tree confusion matrix, drawn with scikit-plot's helper.
tree_cm_figsize = (12, 12)
skplt.metrics.plot_confusion_matrix(y_test, prediction, figsize=tree_cm_figsize)

In [None]:
from sklearn.tree import plot_tree

In [None]:
# Fetch the fitted one-hot encoder from the pipeline's preprocessing step by
# the name it was registered under.
ohe = tree[0].named_transformers_['categorical']

In [None]:
# Names of the transformed features, in the order the ColumnTransformer emits
# them: numeric columns first, then the one-hot encoded categorical columns.
#
# `OneHotEncoder.get_feature_names` was deprecated in scikit-learn 1.0 and
# removed in 1.2 in favour of `get_feature_names_out`. Prefer the new method
# and fall back to the old one only on versions that lack it.
if hasattr(ohe, 'get_feature_names_out'):
    encoded_names = ohe.get_feature_names_out(categoricals)
else:
    encoded_names = ohe.get_feature_names(categoricals)
feature_names = numericals + list(encoded_names)

In [None]:
# Draw the top five levels of the selected decision tree.
fig, ax = plt.subplots(figsize=(22,16))
plot_tree(
    tree[-1].best_estimator_,
    max_depth=5,
    feature_names=feature_names,
    class_names=['no', 'yes'],
    label='all',
    filled=True,
    rounded=True,
    proportion=True,
    precision=1,
    fontsize=12,
    ax=ax,
);

In [None]:
# Importance assigned to each transformed feature by the selected tree.
best_tree = tree[-1].best_estimator_
feature_importance = best_tree.feature_importances_

In [None]:
# List the features the tree actually used, most important first.
ranked = sorted(zip(feature_importance, feature_names), reverse=True)
used = [(importance, name) for importance, name in ranked if importance > 0]
for importance, name in used:
    print(f"Feature {name}: importance = {importance:.3f}")

In [None]:
# Random-forest model: the shared preprocessing step followed by a
# cross-validated grid search that selects the forest with the best recall.
#
# The forest is seeded with `random_state=0`, like the decision tree and the
# CV splitter. Without a seed, every run grows different trees, so the selected
# parameters and reported scores are not reproducible.
forest = make_pipeline(
    transformer,
    GridSearchCV(
        estimator=RandomForestClassifier(random_state=0),
        cv=kfold,
        scoring='recall',
        n_jobs=4,
        verbose=True,
        param_grid={
            'n_estimators': [50, 100],
            'max_features': ['sqrt'],
            'criterion': ['entropy'],
            'max_depth': [3, 5],
            # Float values are fractions of the sample count.
            'min_samples_split': [.01, .1],
            'min_samples_leaf': [.01, .1]
        }
    )
)

In [None]:
# Fit the preprocessing step and run the random-forest grid search on the training set.
forest.fit(X_train, y_train)

In [None]:
# Show the random forest selected by the grid search (last pipeline step).
forest[-1].best_estimator_

In [None]:
# Predict the held-out test set with the fitted forest pipeline.
pred = forest.predict(X_test)


In [None]:
# Report the fraction of test samples the forest pipeline classified correctly.
forest_test_accuracy = accuracy_score(y_test, pred)
print("Accuracy score on test set:", forest_test_accuracy)

In [None]:
# Confusion matrix of the random forest on the test set, drawn as a heatmap
# (rows: true class, columns: predicted class).
forest_cm = confusion_matrix(y_test, pred)
fig, ax = plt.subplots(figsize=(8,8))
sns.heatmap(
    forest_cm,
    cmap=plt.cm.jet,
    square=True,
    annot=True,
    cbar=False,
    ax=ax,
)
ax.set_ylabel('True', fontsize=18)
ax.set_xlabel('Predicted', fontsize=18)
ax.set_title('Confusion Matrix', fontsize=18)

In [None]:
# Same random-forest confusion matrix, drawn with scikit-plot's helper.
forest_cm_figsize = (8, 8)
skplt.metrics.plot_confusion_matrix(y_test, pred, figsize=forest_cm_figsize)