In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file beneath the Kaggle input directory and print
# its full path, one per line, in the order os.walk visits them.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Loading data and initial overview

In [None]:
# Read the heart-failure clinical records into `df`, then report the number of
# missing values per column followed by the column dtypes / non-null counts.
dataset_path = '/kaggle/input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv'
df = pd.read_csv(dataset_path)
missing_per_column = df.isna().sum()  # isna() is an alias of isnull()
print(missing_per_column)
df.info()

**All columns are numeric and there are no nulls, so little data cleaning or feature engineering is required here.**

# Data Visualization

In [None]:
# Histogram of patient ages in 10-year bins.
# sns.distplot is deprecated (seaborn >= 0.11) and warns on every call;
# sns.histplot is its replacement for a plain count histogram and draws no
# KDE curve unless asked to, which matches the previous kde=False.
bins = [30,40,50,60,70,80,90,100,110]
fig, ax = plt.subplots()
sns.histplot(data=df, x='age', bins=bins, ax=ax)
ax.set_ylabel('Count')
ax.set_title('Distribution of heart disease across the age groups')
plt.show()

In [None]:
# Two stacked pie charts of the `sex` column: the first restricted to rows
# where DEATH_EVENT == 1, the second over every row in the dataset.
# Slice order is [code 0, code 1], which this notebook labels Female / Male.
gender_labels = ['Female', 'Male']
fig, axes = plt.subplots(2, 1)
pie_panels = [
    (df.loc[df['DEATH_EVENT'] == 1, 'sex'], 'Death percentage by gender'),
    (df['sex'], 'Heart patient percentage by gender'),
]
for axis, (sex_values, panel_title) in zip(axes, pie_panels):
    counts = sex_values.value_counts()
    # Label-based lookup: the index of `counts` holds the sex codes themselves.
    axis.pie(
        [counts.loc[0], counts.loc[1]],
        labels=gender_labels,
        shadow=True,
        autopct='%1.1f%%',
    )
    axis.set_title(panel_title)
plt.show()

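**Males account for the larger share of both the heart patients and the deaths in this dataset. Note that these charts show composition only; comparing the death rate within each sex is needed before concluding that males are more prone to dying from heart problems.**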

In [None]:
# Death counts split by three binary risk factors, one count plot per row.
# Each panel is described as: (column, bar order, tick labels, title).
# The tick labels are listed in the same order as `order`, so for blood
# pressure the first bar is code 1 ('high') and the second is code 0 ('low').
fig, ax = plt.subplots(3, 1, figsize=(10, 10))
count_panels = [
    ('high_blood_pressure', [1, 0], ['high', 'low'], 'Blood pressure wise death count'),
    ('smoking', [0, 1], ['non-smoker', 'smoker'], 'Death counts based on smoking'),
    ('diabetes', [0, 1], ['non-diabetic', 'diabetic'], 'Death counts based on diabetes'),
]
for axis, (column, order, tick_labels, panel_title) in zip(ax, count_panels):
    # Pass the variables by keyword: seaborn >= 0.12 no longer accepts the
    # data vector as the first positional argument of countplot.
    sns.countplot(data=df, x=column, hue='DEATH_EVENT',
                  order=order, hue_order=[0, 1], ax=axis)
    # Fix the tick positions before relabelling so matplotlib does not warn
    # about setting labels on a non-fixed locator (bars sit at 0, 1, ...).
    axis.set_xticks(range(len(tick_labels)))
    axis.set_xticklabels(tick_labels)
    # hue_order=[0, 1] guarantees the legend entries are in Alive, Dead order.
    axis.legend(['Alive', 'Dead'])
    axis.set_title(panel_title)
fig.tight_layout()  # keep each title from colliding with the x-label above it
plt.show()


# Machine Learning

In [None]:
# Accumulates one row of mean cross-validation scores per evaluated model.
performance_metrics = pd.DataFrame(columns=['Model','accuracy','precision','recall','f1'])


def model_stats(name,model,df):
    """Cross-validate `model` on `df` and append its mean scores to `performance_metrics`.

    Parameters
    ----------
    name : label stored in the 'Model' column of the results table.
    model : estimator passed unchanged to `cross_validate`.
    df : DataFrame holding the features plus the 'DEATH_EVENT' target column;
        every other column is used as a feature.

    Side effects
    ------------
    Rebinds the module-level `performance_metrics` DataFrame with one extra
    row holding the mean test accuracy, precision, recall and f1 across the
    cross-validation folds. Returns None.
    """
    global performance_metrics
    scoring = {'accuracy': 'accuracy',
               'precision': 'precision',
               'recall': 'recall',
               'f1': 'f1'
               }
    X = df.drop(columns='DEATH_EVENT')
    y = df['DEATH_EVENT']
    # No `cv` argument is given, so cross_validate uses its default splitter.
    score = cross_validate(model, X, y, scoring=scoring, n_jobs=-1)

    # cross_validate reports each scorer under the key 'test_<scorer name>'.
    record = {'Model': name}
    record.update({metric: score['test_' + metric].mean() for metric in scoring})
    row = pd.DataFrame([record])

    # DataFrame.append was removed in pandas 2.0; use pd.concat instead.
    # The first row replaces the empty frame outright so that concat never
    # sees an all-empty operand (which would force object dtype and warn).
    if performance_metrics.empty:
        performance_metrics = row
    else:
        performance_metrics = pd.concat([performance_metrics, row], ignore_index=True)
    

# Score every candidate classifier with model_stats, then display the table.
# The random forest is seeded so the table is identical on every
# Restart & Run All; the other estimators are used with their defaults.
# NOTE(review): the features are fed in unscaled; KNN, SVC and logistic
# regression are scale-sensitive, so a StandardScaler pipeline may be worth
# adding — confirm before comparing these models against the random forest.
candidate_models = {
    'KNN': KNeighborsClassifier(n_neighbors = 5),
    'RF': RandomForestClassifier(n_estimators = 50, random_state = 42),
    'NB': GaussianNB(),
    'LR': LogisticRegression(),
    'SV': SVC(),
}
for model_name, estimator in candidate_models.items():
    model_stats(model_name, estimator, df)
performance_metrics