In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import scale
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Gather the full path of every file below the read-only Kaggle input directory,
# then print them one per line in the order os.walk yields them.
input_paths = [
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Marketing Analytics: Heart Disease Prediction
This database contains 76 attributes, but all published experiments refer to using a subset of 14 of them. In particular, the Cleveland database is the only one that has been used by ML researchers to
date. The "goal" field (the `target` column in this data set) refers to the presence of heart disease in the patient.

## Variables
Because the data set does not come with a description of its columns, the variable explanations below were taken from an external source:
Source: https://towardsdatascience.com/heart-disease-uci-diagnosis-prediction-b1943ee835a7

age = age in years

sex = (1 = male; 0 = female)

cp = chest pain type
— Value 0: asymptomatic
— Value 1: atypical angina
— Value 2: non-anginal pain
— Value 3: typical angina

trestbps = resting blood pressure (in mm Hg on admission to the hospital)

chol = serum cholesterol in mg/dl

fbs = fasting blood sugar > 120 mg/dl

restecg = resting electrocardiographic results (values 0,1,2)

thalach = maximum heart rate achieved

exang = exercise induced angina

oldpeak = ST depression induced by exercise relative to rest

slope = the slope of the peak exercise ST segment

ca = number of major vessels (0-3) colored by fluoroscopy

thal = 3 = normal; 6 = fixed defect; 7 = reversible defect

target = (0 = no heart disease), (1 = heart disease)

# Notes
more description

# Important data characteristics

# 1. Exploratory Data Analysis

In [None]:
# Import CSV File
df = pd.read_csv("/kaggle/input/heart-disease-uci/heart.csv", sep = ",")
# Show dataframe, first 5 rows
print(df.head(5))
# All columns are numerical
# info() prints its report itself and returns None, so it is called directly;
# wrapping it in print() would add a stray "None" line to the output.
df.info()
# Shape of data, 303 rows and 14 columns
print(df.shape)

## 1.1 Check for Missing Values
Every dataset needs to be checked for missing values. If missing values exist, they should be replaced (for example, by the column average) or, in the worst case, the affected rows should be dropped.
Missing values can be identified with the following two functions:

In [None]:
# Approach 1: count the missing entries in each column.
missing_per_column = df.isna().sum()
print(missing_per_column)

# Approach 2: info() lists the non-null count of every column.
df.info()

## 1.2 Assign Variables

In [None]:
# Split the frame into the predictor columns and the label column.
predictors = df.drop(columns=["target"])
X = predictors.values
y = df["target"].values
feature_names = predictors.columns

In [None]:
# Per-column non-null counts for rows with a non-zero chest pain type and target == 1.
has_nonzero_cp = df["cp"].ne(0)
is_target_one = df["target"].eq(1)
df.loc[has_nonzero_cp & is_target_one].count()

## 1.3 Descriptive Statistics
Descriptive statistics can reveal interesting insights into the dataset.

In [None]:
# Summary statistics for every numeric column of the data set.
summary_stats = df.describe()
print(summary_stats)

## 1.4 Heatmap

In [None]:
# Pairwise correlation of all columns, annotated with two-decimal coefficients.
correlations = df.corr()
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(correlations, annot=True, fmt=".2f", cmap="magma", ax=ax)  # look for other color theme

## 1.5 Gender characteristics

In [None]:

# Independent copy of the four columns used for plotting, so relabelling them leaves df untouched.
sns_dataframe = df[["sex", "target", "cp", "age"]].copy()

def diagnosis(infected):
    """Return "Heart Disease" when `infected` equals 1, otherwise "No Heart Disease"."""
    return "Heart Disease" if infected == 1 else "No Heart Disease"
# Replace the numeric target codes with their readable diagnosis labels.
sns_dataframe["target"] = sns_dataframe["target"].map(diagnosis)

def gender(sex):
    """Return "female" when `sex` equals 0, otherwise "male"."""
    return "female" if sex == 0 else "male"
# Replace the numeric sex codes with readable labels.
sns_dataframe["sex"] = sns_dataframe["sex"].apply(gender)

# Pin each diagnosis label to its colour explicitly; a positional colour list would
# assign colours by the order in which the labels happen to appear in the data.
diagnosis_palette = {"Heart Disease": "#c1121f", "No Heart Disease": "#60d394"}
sns.countplot(data=sns_dataframe, x="sex", hue="target", palette=diagnosis_palette)
plt.title("Diseased per Sex v/s target\n")


## 1.6 Heart diseased with respect to different chest pain types

In [None]:
# Patients per chest pain type, split by diagnosis label; the numeric cp codes on the
# x axis are replaced by their descriptions.
ax = sns.countplot(data=sns_dataframe, x="cp", hue="target", palette=["#c1121f", "#60d394"])
cp_labels = ["asymptomatic", "atypical angina", "non-anginal pain", "typical angina"]
ax.set_xticks([0, 1, 2, 3])
ax.set_xticklabels(cp_labels)
ax.set_title('Chest Pain Type v/s target\n')

In [None]:
# Normalised age histogram (15 bins) of the rows with target == 1.
ages_target_one = df.loc[df["target"] == 1, "age"]
plt.hist(ages_target_one, bins=15, density=True)

In [None]:
# Horizontal bars: number of target == 1 rows per age, ordered by ascending count.
age_counts = df.loc[df["target"] == 1, "age"].value_counts()
age_counts.sort_values().plot.barh()

In [None]:
# Number of patients at each age across the whole data set.
plt.figure(figsize=(20, 7))
sns.set()
ages = df["age"]
sns.countplot(x=ages);

In [None]:
# Age distribution of patients WITH heart disease (target == 1), split by sex.
# The title and y-axis label both refer to diseased patients, so the data is filtered
# accordingly instead of plotting every patient.
diseased = df[df["target"] == 1]

# Shared bin edges make the two histograms directly comparable bar by bar.
age_bins = np.linspace(df["age"].min(), df["age"].max(), 31)

plt.figure(figsize=(20,7))
# alpha < 1 keeps the first histogram visible where the two overlap.
plt.hist(x = diseased.loc[diseased["sex"] == 1, "age"], bins=age_bins, color = "#60d394", alpha=0.7, label="Male")
plt.hist(x = diseased.loc[diseased["sex"] == 0, "age"], bins=age_bins, color = "#c1121f", alpha=0.7, label="Female")
plt.title("Heart disease per age and sex", fontsize=20)
plt.xlabel("Age")
plt.ylabel("Count of Diseased")
plt.xticks([20,25,30,35,40,45,50,55,60,65,70,75])
plt.legend()

# 2.0 Data Preparation
Chest Pain (cp) is the most important predictor of heart disease

In [None]:
from yellowbrick.model_selection import FeatureImportances
from sklearn.ensemble import RandomForestClassifier

# Hold out 30% of the rows; the importances are estimated on the training portion.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

model = RandomForestClassifier(n_estimators=19, random_state=42)
# The bar labels come from `feature_names`, which was built from the same columns as X,
# so label order matches column order.
viz = FeatureImportances(model, labels = list(feature_names))
viz.fit(X_train, y_train)
viz.show()

## 2.2 Assessment of our target variable by chest type

In [None]:
# How many rows exist for every combination of chest pain type (0-3) and target (0/1)?
# One loop replaces eight copy-pasted statements; the printed lines are unchanged.
for cp_type in (0, 1, 2, 3):
    for target_value in (0, 1):
        matching_rows = df[(df["cp"] == cp_type) & (df["target"] == target_value)]
        print(f"Values for chest pain type {cp_type} and target {target_value}:", len(matching_rows))

## 2.3 Results
Patients with chest pain show a higher tendency towards target value 1. As chest pain is also an indicator of heart disease, one can assume that a target value of 1 means heart disease.

# 3.0 Model building
The metrics of each model are stored in the dictionary `results`.

In [None]:
# Collects one entry per model: model name -> {"Accuracy": ..., "F1-Score": ...}
results = dict()

## 3.1 KNeighborsClassifier

In [None]:
# Standardise inside the pipeline so the scaler is re-fit on every CV training fold.
pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("knn", KNeighborsClassifier())
])

parameters = {"knn__n_neighbors": np.arange(1,50)}
# Evaluate every candidate on both metrics; model selection and the final refit use
# weighted F1. (With a single "f1_weighted" scorer, cv.score() would return weighted F1
# on the training data, not accuracy.)
cv = GridSearchCV(
    pipeline,
    parameters,
    cv = 5,
    scoring = ["f1_weighted", "accuracy"],
    refit = "f1_weighted"
)
cv.fit(X, y)
# Out-of-fold predictions: every row is predicted by a model that did not see it while
# fitting, so the report and confusion matrix are not inflated by in-sample evaluation.
y_pred = cross_val_predict(cv.best_estimator_, X, y, cv = 5)

best_n_neighbors = cv.best_params_["knn__n_neighbors"]
# Save model results in dictionary: cross-validated means of the selected candidate.
# best_index_ addresses its row in cv_results_ without relying on the grid layout.
results["KNeighborsClassifier"] = {
    "Accuracy" : cv.cv_results_["mean_test_accuracy"][cv.best_index_],
    "F1-Score" : cv.cv_results_["mean_test_f1_weighted"][cv.best_index_]
}
print("Best value for n_neighbors", best_n_neighbors)
print("Classification Report: \n", classification_report(y, y_pred))
print(pd.DataFrame(results))


ConfusionMatrixDisplay(confusion_matrix(y,y_pred), display_labels = ["Not diseased", "Diseased"]).plot()


## 3.2 LogisticRegression

In [None]:
# Standardise inside the pipeline so the scaler is re-fit on every CV training fold.
pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("logreg", LogisticRegression(random_state=0))
])

# Empty grid: a single candidate (default hyperparameters) is cross-validated.
parameters = {}
# Score on both metrics; with a single "f1_weighted" scorer, cv.score() would return
# weighted F1 on the training data, not accuracy.
cv = GridSearchCV(
    pipeline,
    parameters,
    cv = 5,
    scoring = ["f1_weighted", "accuracy"],
    refit = "f1_weighted"
)
cv.fit(X, y)
# Out-of-fold predictions of THIS model. Assigning y_pred here also ensures the
# confusion matrix below is not drawn from predictions left over from an earlier cell.
y_pred = cross_val_predict(cv.best_estimator_, X, y, cv = 5)
print("Classification Report: \n", classification_report(y, y_pred))

# Save model results in dictionary: cross-validated means of the selected candidate.
results["LogisticRegression"] = {
    "Accuracy" : cv.cv_results_["mean_test_accuracy"][cv.best_index_],
    "F1-Score" : cv.cv_results_["mean_test_f1_weighted"][cv.best_index_]
}

print(pd.DataFrame(results))
ConfusionMatrixDisplay(confusion_matrix(y,y_pred), display_labels = ["Not diseased", "Diseased"]).plot()

## 3.3 DecisionTreeClassifier

In [None]:
# Tune hyperparameter max_depth
# Every candidate is scored on both metrics; selection and the final refit use weighted F1.
# (With a single "f1_weighted" scorer, cv.score() would return weighted F1 on the
# training data, not accuracy.)
cv = GridSearchCV(
    Pipeline(
        steps = [
        ("clf", DecisionTreeClassifier(random_state=42))]
    ), 
    {"clf__max_depth": np.arange(1,50)}, 
    cv = 5, 
    scoring = ["f1_weighted", "accuracy"],
    refit = "f1_weighted"
)
cv.fit(X, y)
# Out-of-fold predictions: every row is predicted by a tree that did not see it while
# fitting, so the report and confusion matrix are not inflated by in-sample evaluation.
y_pred = cross_val_predict(cv.best_estimator_, X, y, cv = 5)
best_max_depth = cv.best_params_["clf__max_depth"]
print("Best max_depth value:", best_max_depth)
print("Classification Report: \n", classification_report(y, y_pred))

# Save model results in dictionary: cross-validated means of the selected candidate.
# best_index_ addresses its row in cv_results_ without relying on the grid layout.
results["DecisionTreeClassifier"] = {
    "Accuracy" : cv.cv_results_["mean_test_accuracy"][cv.best_index_],
    "F1-Score" : cv.cv_results_["mean_test_f1_weighted"][cv.best_index_]
}
print(pd.DataFrame(results))
ConfusionMatrixDisplay(confusion_matrix(y,y_pred), display_labels = ["Not diseased", "Diseased"]).plot()

### Use the best parameter in the model

In [None]:
# Fit a single tree with the tuned depth. The fixed seed makes the fitted tree (and
# therefore the plot) reproducible and matches the seed used during tuning.
clf = DecisionTreeClassifier(max_depth = best_max_depth, random_state = 42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=40)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# Mean accuracy on the 20% hold-out split.
score = clf.score(X_test, y_test)
print("Hold-out accuracy:", score)
# cross_val_score clones and refits the estimator per fold, so this is independent of the fit above.
cv_scores = cross_val_score(clf, X, y, cv = 5, scoring = "roc_auc")
print("Average:", np.mean(cv_scores), "Scores: ", cv_scores)

# Plot Tree
plt.figure(figsize=(20,7))
# class_names are matched to the classes in ascending order: 0 = no heart disease, 1 = heart disease.
plot_tree(clf, filled = True, feature_names = list(feature_names), class_names = ["Healthy", "Diseased"])
plt.show()

In [None]:
# Tune hyperparameter n_estimators
# Every candidate is scored on both metrics; selection and the final refit use weighted F1.
# (With a single "f1_weighted" scorer, cv.score() would return weighted F1 on the
# training data, not accuracy.)
cv = GridSearchCV(
    Pipeline(
        steps = [
        ("clf", RandomForestClassifier(random_state=42))]
    ), 
    {"clf__n_estimators": np.arange(1,50)}, 
    cv = 5, 
    scoring = ["f1_weighted", "accuracy"],
    refit = "f1_weighted"
)
cv.fit(X, y)
# Out-of-fold predictions: every row is predicted by a forest that did not see it while
# fitting, so the report is not inflated by in-sample evaluation.
y_pred = cross_val_predict(cv.best_estimator_, X, y, cv = 5)
best_n_estimators = cv.best_params_["clf__n_estimators"]
print("Best n_estimators value:", best_n_estimators)
print("Classification Report: \n", classification_report(y, y_pred))

# Save model results in dictionary: cross-validated means of the selected candidate.
# best_index_ addresses its row in cv_results_ without relying on the grid layout.
results["RandomForestClassifier"] = {
    "Accuracy" : cv.cv_results_["mean_test_accuracy"][cv.best_index_],
    "F1-Score" : cv.cv_results_["mean_test_f1_weighted"][cv.best_index_]
}
print(pd.DataFrame(results))


# 4.0 Model Evaluation

In [None]:
# Side-by-side comparison: one column per model, one row per metric.
results_table = pd.DataFrame(results)
print(results_table)