In [None]:
import os

# Walk the Kaggle input directory and print the full path of every file found.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

In [None]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt

In [None]:
# Read the heart-disease CSV file into a DataFrame.
# NOTE(review): the relative path assumes the notebook runs from the Kaggle working directory — confirm when running elsewhere.
df = pd.read_csv("../input/heart-disease-uci/heart.csv")

In [None]:
df.head(5)

In [None]:
df.describe()

In [None]:
df.shape

In [None]:
df.columns

In [None]:
df.dtypes

In [None]:
df.info()

In [None]:
# Number of rows for each distinct value of the "target" column.
df_target = df.groupby("target").size()
df_target

In [None]:
# "target 0" - absence,"target 1"- presence
# Pie chart of the class balance; the first wedge is pulled out slightly via `explode`.
target_pie_options = dict(
    labels=["target 0", "target 1"],
    autopct="%1.1f%%",
    radius=1,
    textprops={"fontsize": 16},
    explode=[0.01, 0],
)
plt.pie(df_target.values, **target_pie_options)
plt.show()

In [None]:
# Count the rows for every (sex, target) combination present in the data.
df_sex = df.groupby(['sex', 'target']).size()
print(df_sex)

# Derive the wedge labels from the group index so they always match the order
# and number of wedges, instead of relying on a hand-typed list staying in sync.
# NOTE(review): assumes both columns hold integer codes, so labels read "sex_0,target_0" — confirm dtypes.
sex_target_labels = ["sex_{},target_{}".format(sex, target) for sex, target in df_sex.index]

plt.pie(df_sex.values, labels=sex_target_labels,
        autopct="%1.1f%%", radius=1,
        textprops={"fontsize": 16})
plt.show()

The ratio gets higher over the age of forty. That is, people who are over forty are at high risk of heart disease.

In [None]:
# Overlay the age distributions of the two target groups.
age_by_target = [df[df.target == 0].age, df[df.target == 1].age]
plt.hist(age_by_target, bins=20, alpha=0.5,
         label=["no_heart_disease", "with_heart_disease"])
plt.xlabel("age")
# plt.hist draws raw counts by default (density=False), so the axis shows
# a number of patients, not a percentage.
plt.ylabel("count")
plt.legend(loc="best")
plt.show()

This shows that people within the cholesterol range of 200 - 300 have a high chance of heart disease.

In [None]:
# Overlay the cholesterol distributions of the two target groups.
chol_by_target = [df[df.target == 0].chol, df[df.target == 1].chol]
plt.hist(chol_by_target, bins=20, alpha=0.5,
         label=["no_heart_disease", "with_heart_disease"])
plt.xlabel("chol")
# plt.hist draws raw counts by default (density=False), so the axis shows
# a number of patients, not a percentage.
plt.ylabel("count")
plt.legend(loc="best")
plt.show()

In [None]:
# Count the rows for every (cp, target) combination present in the data.
df_cp = df.groupby(['cp', 'target']).size()
print(df_cp)

# Derive the wedge labels from the group index so they always match the order
# and number of wedges, instead of relying on a hand-typed list staying in sync.
# NOTE(review): assumes both columns hold integer codes, so labels read "cp_0,target_0" — confirm dtypes.
cp_target_labels = ["cp_{},target_{}".format(cp, target) for cp, target in df_cp.index]

plt.pie(df_cp.values, labels=cp_target_labels,
        autopct="%1.1f%%", radius=1,
        textprops={"fontsize": 9})
plt.title("Chest Pain pie chart")
plt.show()

From the pie chart above, people with cp_2 chest pain have a high chance of having heart disease.

In [None]:
# Overlay the age distributions of the four chest-pain categories (cp 0..3).
age_by_cp = [df[df.cp == cp_code].age for cp_code in (0, 1, 2, 3)]
plt.hist(age_by_cp, bins=10, alpha=0.8,
         label=["chest_pain_0", "chest_pain_1", "chest_pain_2", "chest_pain_3"])
plt.xlabel("age")
# plt.hist draws raw counts by default (density=False), so the axis shows
# a number of patients, not a percentage.
plt.ylabel("count")
plt.legend(loc="best")
plt.show()

In [None]:
# Count the rows for every (sex, cp) combination present in the data.
df_cp_sex = df.groupby(['sex', 'cp']).size()
print(df_cp_sex)

# Derive the wedge labels from the group index so they always match the order
# and number of wedges, instead of relying on a hand-typed list staying in sync.
# NOTE(review): assumes both columns hold integer codes, so labels read "sex_0,cp_0" — confirm dtypes.
sex_cp_labels = ["sex_{},cp_{}".format(sex, cp) for sex, cp in df_cp_sex.index]

plt.pie(df_cp_sex.values, labels=sex_cp_labels,
        autopct="%1.1f%%", radius=1,
        textprops={"fontsize": 9})
plt.title("Chest Pain pie chart by sex")
plt.show()

Males have a high percentage of cp_2 chest pain from the pie chart above.

resting blood pressure (in mm Hg on admission to the hospital)

It seems that people whose resting blood pressure is within the range of 120-140 mm Hg have a high percentage of heart disease.

In [None]:
# Overlay the resting-blood-pressure distributions of the two target groups.
trestbps_by_target = [df[df.target == 0].trestbps, df[df.target == 1].trestbps]
plt.hist(trestbps_by_target, bins=20, alpha=0.5,
         label=["no_heart_disease", "with_heart_disease"])
plt.xlabel("trestbps")
# plt.hist draws raw counts by default (density=False), so the axis shows
# a number of patients, not a percentage.
plt.ylabel("count")
plt.legend(loc="best")
plt.show()

Maximum heart rate achieved.
With a mean of about 150, the histogram suggests that people whose maximum heart rate is within the range of 160-180 have an estimated 25% chance of having heart disease.

In [None]:
# Overlay the maximum-heart-rate distributions of the two target groups.
thalach_by_target = [df[df.target == 0].thalach, df[df.target == 1].thalach]
plt.hist(thalach_by_target, bins=20, alpha=0.5,
         label=["no_heart_disease", "with_heart_disease"])
plt.xlabel("thalach")
# plt.hist draws raw counts by default (density=False), so the axis shows
# a number of patients, not a percentage.
plt.ylabel("count")
plt.legend(loc="best")
plt.show()

## Building the model

To determine the best model for this we shall be using a pipeline with a grid search.

In [None]:
# Import relevant libraries

from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler,MinMaxScaler
import xgboost as xgb
from sklearn.tree import export_graphviz as eg
from sklearn.model_selection import train_test_split,GridSearchCV,StratifiedShuffleSplit
import warnings
warnings.filterwarnings("ignore")


## DATA SETUP

In [None]:
# The label is the "target" column; the features are every other column.
y = df.target
X = df.drop(columns=["target"])

# Hold out 20% of the rows for testing, with a fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=.8, test_size=.2, random_state=10
)

# Print the first five rows of each object as a quick sanity check.
previews = [
    ("This is X variable: \n{}\n", X),
    ("This is y variable: \n{}", y),
    ("This is X_train variable: \n{}\n", X_train),
    ("This is X_test variable: \n{}", X_test),
    ("This is y_train variable: \n{}\n", y_train),
    ("This is y_test variable: \n{}", y_test),
]
for template, data in previews:
    print(template.format(data[:5]))

In [None]:
# Creating the pipeline
# Both steps are placeholders: the grid search below substitutes every
# "preprocessing" / "classifier" candidate listed in params_grid.
pipe = Pipeline([("preprocessing", StandardScaler()), ("classifier", KNeighborsClassifier())])

# Creating the parameter grid
# Every stochastic estimator gets a fixed random_state (the same seed used for
# the train/test split and the CV splitter) so that the selected "best model"
# is reproducible across kernel restarts.
params_grid = [
    {
        "preprocessing": [StandardScaler(), None],
        "classifier": [KNeighborsClassifier()],
        "classifier__n_neighbors": [1, 2, 3, 4, 5],
    },
    {
        "classifier": [
            DecisionTreeClassifier(random_state=10),
            RandomForestClassifier(random_state=10),
        ],
        "classifier__max_depth": [3, None],
        "preprocessing": [None],
    },
    {
        "classifier": [
            GradientBoostingClassifier(random_state=10),
            xgb.XGBClassifier(random_state=10),
        ],
        "classifier__max_depth": [3, None],
        "classifier__learning_rate": [0.001, 0.01, 0.1],
        "preprocessing": [None],
    },
]


Stratified_Shuffle_Split = StratifiedShuffleSplit(random_state=10)

In [None]:
params_grid

In [None]:
## Making use of the Grid search and fitting to the training set
# Every combination in params_grid is cross-validated with the stratified
# shuffle splitter; only the training split is used here.
grid = GridSearchCV(pipe,param_grid=params_grid, cv=Stratified_Shuffle_Split)
grid.fit(X_train,y_train)

In [None]:
## Printing the values

# Report the winning configuration, the fitted pipeline and its CV score.
summary_lines = (
    ("Best params:\n {}", grid.best_params_),
    ("Best estimator:\n {}", grid.best_estimator_),
    ("Best score:\n {}", grid.best_score_),
)
for heading, value in summary_lines:
    print(heading.format(value))

# Displaying using the pandas format
# Transposed so that each parameter combination becomes a column.
results = pd.DataFrame(grid.cv_results_)
display(results.T)

## THE MODEL
So the best model is RandomForestClassifier with a max_depth of 3, so we build the model itself and apply it to the test data.

In [None]:
# Rebuild the selected model on the full training split.
# A random forest is stochastic; without a fixed random_state the test score
# and every metric reported below would change on each run.
rf = RandomForestClassifier(max_depth=3, random_state=10)
rf.fit(X_train, y_train)
print("Test data score: \n{}\n".format(rf.score(X_test, y_test)))

In [None]:
# Predicted class label for every row of the held-out test features.
y_rf_pred = rf.predict(X_test)
print("Predictions are \n{}\n".format(y_rf_pred))

### Feature importance
We look into the feature importances of the data and sort them to see which features rank higher than others.

In [None]:
def feature_importances(X,model):
    """Print a model's feature importances and return them ranked by name.

    Parameters
    ----------
    X : iterable of feature names
        Iterating over ``X`` must yield one name per feature, in the same
        order as ``model.feature_importances_`` (as iterating a DataFrame
        yields its column labels).
    model : object
        Any object exposing a NumPy array ``feature_importances_``.

    Returns
    -------
    dict
        Feature name -> importance (as a Python float), inserted from the
        most to the least important feature.
    """
    scores = model.feature_importances_
    # argsort is ascending, so reverse it to rank the largest score first.
    ranking = np.argsort(scores)[::-1]
    print("Feature importances: \n{}".format(scores))
    print("Feature importance sorted indices: \n{}".format(ranking))
    print("Feature importance sorted: \n{}".format(scores[ranking]))
    names = list(X)
    ranked = {names[position]: float(scores[position]) for position in ranking}
    print("The features with their names:")
    return ranked

feature_importances(X,rf)

Chest pain type (cp) is the feature that ranks highest, while fasting blood sugar > 120 mg/dl (fbs) ranks last.

Plotting the feature importance 

In [None]:
def plot_feature_importances(model, feature_names=None):
    """Draw a horizontal bar chart of a model's feature importances.

    Parameters
    ----------
    model : object
        Any fitted estimator exposing ``feature_importances_``.
    feature_names : sequence of str, optional
        One label per feature, in the same order as
        ``model.feature_importances_``. Defaults to the columns of the
        notebook-level feature frame ``X``, which is what this function
        always used before the parameter existed.
    """
    if feature_names is None:
        # Backward-compatible default: fall back to the global feature frame.
        feature_names = X.columns
    n_features = len(feature_names)
    plt.barh(range(n_features), model.feature_importances_, align='center')
    plt.yticks(np.arange(n_features), feature_names)
    plt.xlabel("Feature importance ratio")
    plt.ylabel("Feature")
    plt.title("Feature importance")
plot_feature_importances(rf)

## Further Analysis
For further analysis we shall look into the:
1. Confusion matrix
2. Classification report
3. Precision-recall curve
4. ROC curve (AUC)

In [None]:
# rf classification report
# plot_confusion_matrix was removed in scikit-learn 1.2;
# ConfusionMatrixDisplay.from_estimator (scikit-learn >= 1.0) replaces it.
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

target_names = ["target_0","target_1"]

def class_report(model,y_test,pred,target_names):
    """Print the classification report and show the confusion matrix.

    Parameters
    ----------
    model : fitted classifier
        Used to predict on the notebook-level ``X_test`` for the
        confusion-matrix plot.
    y_test : array-like
        True labels of the test set.
    pred : array-like
        Predicted labels used for the text classification report.
    target_names : list of str
        Display names for the classes in the text report.
    """
    print(classification_report(y_test,pred, target_names=target_names))
    # labels=[1,0] puts the positive class first in both rows and columns.
    disp = ConfusionMatrixDisplay.from_estimator(model, X_test, y_test, labels=[1,0])
    disp.figure_.suptitle("Confusion Matrix")
    print(f"Confusion matrix:\n{disp.confusion_matrix}")
    plt.show()


class_report(rf,y_test,y_rf_pred,target_names)

In [None]:
# With labels=[1,0] the matrix is ordered positive-first in rows and columns,
# so ravel() yields tp, fn, fp, tn in that order.
tp,fn,fp,tn = confusion_matrix(y_test, y_rf_pred, labels=[1,0]).ravel()
print("The tp,tn,fp,fn respectively: \n{} {} {} {}\n".format(tp,tn,fp,fn))
# Pass the ratios themselves: wrapping them in {...} built a one-element set,
# which printed with literal braces (e.g. "{0.74}").
print("TNR (Specificity) is: \n{}\n".format(tn/(tn+fp)))
print("TPR (Sensitivity) is: \n{}\n".format(tp/(tp+fn)))

### Specificity or True Negative Rate (TNR)
TNR (ranges from 0 to 1, higher is better) measures the proportion of negatives that are correctly identified as such (e.g. the percentage of healthy people who are correctly identified as not having the condition).
TNR = TN/(TN+FP)

### Precision, Positive Predictive Value (PPV)
PPV (ranges from 0 to 1, higher is better) is the ratio of true positives over all true and false positives:
Precision = TP/(TP+FP)
High precision means that an algorithm returned substantially more relevant results than irrelevant ones, or in other words the more likely everything it returns is right, but it does not mean it may get all the right results that are out there.


### Recall, Sensitivity, Hit Rate or True Positive Rate (TPR)
TPR (ranges from 0 to 1, higher is better) is the ratio of true positives over the sum of true positives and false negatives:
Recall = TP / (TP+FN)
High recall means that an algorithm returned most of the relevant results, 
Recall is used as performance metric when we need to identify all positive samples; that is, when it is important to avoid false negatives.

target_1 has a high recall of 0.88, while target_0 has a high precision of 0.90.
Moreover, the sensitivity of the algorithm is 0.88 and the specificity is 0.74.
Because correctly predicting true positive cases of heart disease is of high importance, the algorithm can be considered suitable based on its sensitivity.

## Precision-recall curve

In [None]:
from sklearn.metrics import average_precision_score,roc_auc_score, plot_precision_recall_curve,plot_roc_curve

y_predict_proba = rf.predict_proba(X_test)
print("Checking Uncertainties: \n{}\n".format(y_predict_proba[:5]))

average_precision = average_precision_score(y_test,rf.predict_proba(X_test)[:,1])
print("Average Precision: {:.2f}".format(average_precision))
disp = plot_precision_recall_curve(rf,X_test,y_test)
plt.title("2-class Precision-Recall curve:")

## ROC curve

In [None]:
roc_scoring = roc_auc_score(y_test,rf.predict_proba(X_test)[:,1])
print("AUC score: {:.2f}".format(roc_scoring))
disp = plot_roc_curve(rf,X_test,y_test)
plt.title("2-class ROC curve:")
plt.show()

## Conclusion
Based on the feature importance shown by the RandomForestClassifier, chest pain is the main symptom of heart disease. Moreover, males have a higher chance of having chest pain, which can lead to heart disease; also, people with chest_pain_2 have a higher chance of having heart disease.
The algorithm has a sensitivity of 0.88, which enables the prediction of people with heart disease. Precautions should be taken to reduce chest_pain_2.