In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import tree
%matplotlib inline
from sklearn.ensemble import RandomForestClassifier

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<font size=10>Let's read and look at the data set first</font>

In [None]:
# Load the heart-disease table from the Kaggle input directory into a DataFrame.
df = pd.read_csv("/kaggle/input/heart-disease-uci/heart.csv")

In [None]:
# Show the loaded table as this cell's output.
df

<font size = 10>Here are visualizations of the features I find most interesting</font>

In [None]:
# Two views of the sex split: absolute counts (left panel) and share of the whole (right panel).
fig, ax = plt.subplots(1, 2, figsize=(18, 7))
sns.countplot(data=df, x=df.sex.replace([0, 1], ['Female', 'Male']), ax=ax[0])

# Number of rows per sex code. groupby sorts its keys, so row 0 is sex == 0 (Female)
# and row 1 is sex == 1 (Male) -- the order the pie labels below rely on.
age_df = df.groupby('sex', as_index=False).agg({'age': 'count'}).rename(columns={'age': 'number'})

# Draw on the right-hand axes explicitly. plt.pie targets whichever axes is "current",
# which only happens to be the right panel here.
ax[1].pie(age_df.number, labels=['Female', 'Male'], colors=['C1', 'C0'], wedgeprops={'edgecolor': 'black'});

<font size=5>In some discussions of this data set people say that 0 means disease, while others say the opposite. In my code and visualizations I'll assume 1 means disease and 0 means healthy.</font>

In [None]:
# Count of patients per diagnosis (1 = has disease, 0 = healthy under this notebook's convention).
# catplot is a figure-level function and returns a FacetGrid rather than an Axes,
# so the result is named `grid` instead of `ax`.
grid = sns.catplot(data=df, x=df.target.replace([1, 0], ['Has Disease', 'Healthy']), kind='count')
# The x axis holds both categories, so label it by what it shows.
grid.set(xlabel='Diagnosis', ylabel='Number of people');

In [None]:
# 2x2 overview: age, chest-pain type, resting blood pressure and maximum heart rate.
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(18, 10))
age_ax, chest_pain_ax = ax[0]
pressure_ax, heart_rate_ax = ax[1]

# Continuous features get a histogram with a KDE overlay.
for column, panel in (('age', age_ax), ('trestbps', pressure_ax), ('thalach', heart_rate_ax)):
    sns.histplot(data=df, x=column, kde=True, ax=panel)

# Chest-pain type is categorical, so count occurrences instead.
sns.countplot(data=df, x='cp', ax=chest_pain_ax)

# Replace the raw column names with readable axis labels (age keeps its default label).
chest_pain_ax.set_xlabel('Chest pain')
pressure_ax.set_xlabel('Resting Blood Pressure')
heart_rate_ax.set(xlabel='Max Heart Rate')

In [None]:
# Side-by-side counts of the 'slope' and 'exang' columns.
fig, ax = plt.subplots(ncols=2, figsize=(16, 7))
slope_ax, angina_ax = ax
sns.countplot(x='slope', data=df, ax=slope_ax)
sns.countplot(x='exang', data=df, ax=angina_ax)
# 'exang' is cryptic as an axis label, so spell it out.
angina_ax.set_xlabel('Exercise Induced Angina')

<font size=5>Let's see the distribution of disease by age</font>

In [None]:
# Patients per age, split by diagnosis.
# crosstab orders its columns by target value (0, then 1). Under this notebook's
# convention 0 = healthy and 1 = disease, so the columns are named before plotting and
# the legend is taken from the data instead of from a hand-ordered list of labels.
age_by_target = pd.crosstab(df.age, df.target).rename(columns={0: 'Healthy', 1: 'Has Disease'})
age_by_target.plot(kind="bar", figsize=(18, 7))
plt.title('Heart Disease Frequency grouped by age')
plt.xlabel('Age')
plt.ylabel('Disease Frequency')
# No explicit labels: the legend entries come from the renamed columns.
plt.legend(title='Diagnosis');

<font size=5>And by sex too</font>

In [None]:
# Patients per sex, split by diagnosis.
# crosstab orders its columns by target value (0, then 1). Under this notebook's
# convention 0 = healthy and 1 = disease, so the columns are named before plotting and
# the legend is taken from the data instead of from a hand-ordered list of labels.
sex_by_target = pd.crosstab(df.sex.replace([0, 1], ['Female', 'Male']), df.target).rename(
    columns={0: 'Healthy', 1: 'Has Disease'}
)
sex_by_target.plot(kind="bar", figsize=(10, 5))
plt.xticks(rotation=0)  # keep the two category labels horizontal
plt.title('Heart Disease Frequency grouped by sex')
plt.xlabel('Sex')
plt.ylabel('Disease Frequency')
# No explicit labels: the legend entries come from the renamed columns.
plt.legend(title='Diagnosis');

<font size=10>Making Model</font>

<font size=5>Let's extract the features into X and the target into y</font>

In [None]:
# Target vector: the 'target' column on its own.
y = df['target']
# Feature matrix: every remaining column.
X = df.drop(columns=['target'])

<font size=10>Let's look for the best params with Grid Search</font>

In [None]:
from sklearn.model_selection import GridSearchCV

<font size=8>Note: GridSearchCV takes quite a long time to run, so I will create the model using the params that GridSearchCV gave me in my local notebook</font>

In [None]:
# Random forest configured with the hyper-parameters found by the grid search.
# random_state is pinned so the fitted forest -- and every metric computed from it
# below -- is the same on each run; without it the bootstrap samples and feature
# subsets change from run to run.
rf = RandomForestClassifier(
    criterion='entropy',
    max_depth=2,
    min_samples_leaf=5,
    min_samples_split=17,
    n_estimators=19,
    n_jobs=-1,
    random_state=42,
)

In [None]:
# Hyper-parameter grid handed to GridSearchCV: one range of candidate values per parameter.
params = dict(
    n_estimators=range(2, 20),
    max_depth=range(1, 15),
    min_samples_split=range(2, 20, 3),
    min_samples_leaf=range(2, 20, 3),
)

<font size= 7>The commented-out code is what I ran in my local notebook to get the best classifier</font>

In [None]:
# The grid search is left disabled here; uncomment the three lines below to repeat it.
#gs = GridSearchCV(rf, params, cv=3)
#gs.fit(X, y)
#best_clf = gs.best_estimator_
# With the search disabled, the pre-configured forest is used directly as the final classifier.
best_clf = rf
# NOTE(review): the model is fit on every row of X / y -- no rows are held out in this cell.
best_clf.fit(X, y)

<font size=6>Metrics Time</font>

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [None]:
# Share of rows whose predicted label matches the true label,
# scored on the same rows the model was fit on.
accuracy_score(y_true=y, y_pred=best_clf.predict(X))

In [None]:
# Of the rows predicted as class 1, the share that really are class 1,
# scored on the same rows the model was fit on.
precision_score(y_true=y, y_pred=best_clf.predict(X))

<font size=6>Having high recall is very important for diagnosing diseases: we care less about False Positives than about False Negatives, because a False Negative means a patient might die :(</font>

In [None]:
# Of the rows that really are class 1, the share the model predicts as class 1,
# scored on the same rows the model was fit on.
recall_score(y_true=y, y_pred=best_clf.predict(X))

<font size=6>Also let's look at the ROC curve</font>

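<font size=6>An AUC of 0.93 means the model is very good (keep in mind that it is measured on the same data the model was trained on, so it is an optimistic estimate). If we widened the parameter ranges given to GridSearch we could achieve better results, but the grid search would also take longer to complete</font>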

In [None]:
from sklearn.metrics import roc_curve, auc

# Class probabilities for every row; the second column is used as the score for the ROC curve.
y_predict_prob = best_clf.predict_proba(X)
fpr, tpr, thresholds = roc_curve(y, y_predict_prob[:, 1])
roc_auc = auc(fpr, tpr)

# Plot the curve through an explicit Axes instead of the pyplot state machine.
fig, roc_ax = plt.subplots()
roc_ax.plot(fpr, tpr, color='darkorange', label='ROC curve (area = %0.2f)' % roc_auc)
roc_ax.plot([0, 1], [0, 1], color='navy', linestyle='--')  # diagonal reference line
roc_ax.set(
    xlim=(0.0, 1.0),
    ylim=(0.0, 1.05),
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='Receiver operating characteristic',
)
roc_ax.legend(loc="lower right")
plt.show()

<font size=10>Creating Final Data Set of our predictions</font>

<font size=6>As predict_proba returns a 2D array, we will take the values from the second column, which holds the predicted probability of target = 1 (which in my case is disease)</font>

In [None]:
# Score the untouched feature columns taken from df rather than X itself: X gains the
# 'disease_chance' column in this very statement and is reformatted in later cells, so
# feeding X back to the model fails with a feature mismatch when this cell is run again.
# df.drop(columns=['target']) is exactly the frame the model was fit on.
# predict_proba orders its columns by class label, so with a 0/1 target column 1 is P(target = 1).
X['disease_chance'] = best_clf.predict_proba(df.drop(columns=['target']))[:, 1]

<font size=6>Some beautifying of our data set</font>

In [None]:
# Turn the predicted probability into a percentage rounded to two decimals.
X['disease_chance'] = X['disease_chance'].mul(100).round(2)

In [None]:
# Render the percentage as text with a trailing '%' sign.
X['disease_chance'] = X['disease_chance'].astype(str).add('%')

In [None]:
# Swap the numeric sex codes for readable labels.
X['sex'] = X['sex'].replace({0: 'Female', 1: 'Male'})

In [None]:
# Human-readable headers for the final table; columns not listed here keep their names.
readable_names = {
    'cp': 'Chest Pain Type',
    'trestbps': 'Resting blood pressure',
    'chol': 'Serum cholestoral',
    'fbs': 'Fasting blood sugar',
    'restecg': 'Resting electrocardiographic results',
    'thalach': 'Max heart rate',
    'exang': 'Exercise induced angina',
    'oldpeak': 'Oldpeak',
    'slope': 'Slope',
    'ca': 'Number of major vessels',
    'thal': 'Thal',
    'disease_chance': 'Disease chance',
}
X = X.rename(columns=readable_names)

In [None]:
# Append the unit to every cholesterol value, turning the column into text.
cholesterol_text = X['Serum cholestoral'].astype(str)
X['Serum cholestoral'] = cholesterol_text + '(mg/dl)'

In [None]:
# Swap the 0/1 angina flag for readable 'No' / 'Yes' labels.
X['Exercise induced angina'] = X['Exercise induced angina'].replace({0: 'No', 1: 'Yes'})

<font size=6>Final look at the data set</font>

In [None]:
# Show the formatted prediction table as this cell's output.
X

<font size=6>Finally, save the data set so that we can send it somewhere</font>

In [None]:
# Export is left disabled; uncomment the line below to write the table to Predictions.csv.
#X.to_csv('Predictions.csv')

<font size=10>Thanks for your attention! If you have suggestions, please leave them in the comments :)</font>