In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the heart-failure clinical records CSV into the working DataFrame `h`
# and preview its first rows.
h=pd.read_csv('/kaggle/input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv')
h.head()

In [None]:
# (number of rows, number of columns) of the loaded frame.
h.shape

In [None]:
# Per-column dtype and non-null count summary.
h.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
h.describe()

**INSIGHTS**-

* anaemia, diabetes, high_blood_pressure, sex, smoking and DEATH_EVENT are binary variables (0 and 1); the rest of the features are continuous in nature.

* No encoding needed as all features are numeric.

* Target variable is DEATH_EVENT which represents mortality by heart disease.

* In creatinine_phosphokinase and platelets there is a huge difference between the 75th percentile (third quartile) and the maximum value, which indicates the presence of outliers.

* Age has a mean value of approx 61, i.e. the typical patient in the dataset is around 60 years of age.

* No missing values in the dataset.

* According to the dataset given, 1 denotes men and 0 denotes women in the sex column.

In [None]:
# Count missing values in each column.
h.isna().sum()

In [None]:
# Give the target column a lower-case name, matching the style of the other columns.
h = h.rename(columns={'DEATH_EVENT': 'death_event'})

In [None]:
# Confirm the column names after the rename.
h.columns

In [None]:
#converting age into int type
# NOTE(review): if any age is a non-integer float, astype('int64') drops the
# fractional part (truncation, not rounding) — confirm that is acceptable.
h['age']=h['age'].astype('int64')

In [None]:
# Re-check dtypes to verify that `age` is now int64.
h.info()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly
#plotly.offline.init_notebook_mode(connected = True)

## UNIVARIATE ANALYSIS

### Continuous variables

In [None]:
# Distribution of patient age: density-normalised histogram with a KDE overlay.
# `sns.distplot` is deprecated; `histplot(kde=True, stat='density')` is the
# replacement that draws the same kind of figure.
plt.figure(figsize=(10,10))
sns.histplot(h['age'],bins=30,kde=True,stat='density',kde_kws=dict(cut=3))

Most of the people are at or below 60 years of age, and the curve represents a right-skewed distribution.

In [None]:
# Distribution of creatinine phosphokinase (CPK) with x ticks every 500 units.
# `sns.distplot` is deprecated; `histplot(kde=True, stat='density')` is the
# replacement that draws the same kind of figure.
plt.figure(figsize=(10,10))
sns.histplot(h['creatinine_phosphokinase'],kde=True,stat='density',kde_kws=dict(cut=3))
plt.xticks(range(0,10000,500))

* Most of the CPK levels exist below 1000 mcg/L. 
* The curve represents right skewed distribution.

In [None]:
# Distribution of ejection fraction: density-normalised histogram with a KDE overlay.
# `sns.distplot` is deprecated; `histplot(kde=True, stat='density')` replaces it.
plt.figure(figsize=(10,10))
sns.histplot(h['ejection_fraction'],bins=25,kde=True,stat='density',kde_kws=dict(cut=3))

The ejection fraction levels are mostly 40% or less than 40%.

In [None]:
# Distribution of platelet counts: density-normalised histogram with a KDE overlay.
# `sns.distplot` is deprecated; `histplot(kde=True, stat='density')` replaces it.
plt.figure(figsize=(10,10))
sns.histplot(h['platelets'],kde=True,stat='density',kde_kws=dict(cut=3))



* Most of the platelet counts lie between 200000 and 400000 kiloplatelets/mL.

* The curve shows right skewed distribution.

In [None]:
# Distribution of serum sodium: density-normalised histogram with a KDE overlay.
# `sns.distplot` is deprecated; `histplot(kde=True, stat='density')` replaces it.
plt.figure(figsize=(10,10))
sns.histplot(h['serum_sodium'],kde=True,stat='density',kde_kws=dict(cut=3))

* Most of the serum_sodium levels are between 130 to 140 mEq/L.

* The curve shows left skewed distribution.

In [None]:
# Distribution of serum creatinine: density-normalised histogram with a KDE overlay.
# `sns.distplot` is deprecated; `histplot(kde=True, stat='density')` replaces it.
plt.figure(figsize=(10,10))
sns.histplot(h['serum_creatinine'],kde=True,stat='density',kde_kws=dict(cut=3))

* Most of the serum creatinine levels are between 0 to 2 mg/DL.

* The curve shows right skewed distribution.


In [None]:
# Distribution of the follow-up period: density-normalised histogram with a KDE overlay.
# `sns.distplot` is deprecated; `histplot(kde=True, stat='density')` replaces it.
plt.figure(figsize=(10,10))
sns.histplot(h['time'],bins=30,kde=True,stat='density',kde_kws=dict(cut=3))

Most of the heart attacks have a follow up-period below 100 days.

### Discrete Variables

In [None]:
# Percentage share of each anaemia value.
h['anaemia'].value_counts(normalize=True).mul(100)

In [None]:
# Bar chart of the anaemia percentage shares.
anaemia_pct = h['anaemia'].value_counts(normalize=True).mul(100)
anaemia_pct.plot(kind='bar', color=['b', 'r'], rot=0)

Approx 57% people don't have anaemia i.e low level of haemoglobin.

In [None]:
# Percentage share of each diabetes value.
h['diabetes'].value_counts(normalize=True).mul(100)

In [None]:
# Bar chart of the diabetes percentage shares.
diabetes_pct = h['diabetes'].value_counts(normalize=True).mul(100)
diabetes_pct.plot(kind='bar', color=['b', 'r'], rot=0)

Approx 58% people don't have diabetes.

In [None]:
# Percentage share of each high_blood_pressure value.
h['high_blood_pressure'].value_counts(normalize=True).mul(100)

In [None]:
# Bar chart of the high_blood_pressure percentage shares.
hbp_pct = h['high_blood_pressure'].value_counts(normalize=True).mul(100)
hbp_pct.plot(kind='bar', color=['b', 'r'], rot=0)

Approx 65% of people don't have high blood pressure.

In [None]:
# Percentage share of each sex value.
h['sex'].value_counts(normalize=True).mul(100)

Most of the people are men(approx 65%).

In [None]:
# Percentage share of each smoking value.
h['smoking'].value_counts(normalize=True).mul(100)

Approx 68% the people don't smoke. 

In [None]:
# Percentage share of each target (death_event) value.
h['death_event'].value_counts(normalize=True).mul(100)

Deaths during the follow-up period are the minority outcome: approx 68% of the patients survived, so approx 32% died.

## FEATURE SELECTION

In [None]:
# Rank the features by importance using an extra-trees ensemble fitted on all
# predictors against the target.
x=h.drop(columns='death_event')
y=h['death_event']

from sklearn.ensemble import ExtraTreesClassifier
# Extra-trees are randomised: without a fixed seed the importances (and possibly
# the ranking) change on every run. Seed 0 matches the other estimators in this notebook.
model = ExtraTreesClassifier(random_state=0)
model.fit(x,y)
print(model.feature_importances_)
# Label each importance with its column name, then plot the largest ones.
feat_importances = pd.Series(model.feature_importances_, index=x.columns)
feat_importances.nlargest(12).plot(kind='barh')
plt.show()

The most important features are 'time','ejection_fraction' and 'serum_creatinine'.We will use these in model building.

In [None]:
# Pairwise correlation matrix of all columns, drawn as an annotated heatmap.
corr_matrix = h.corr()
plt.figure(figsize=(15, 15))
sns.heatmap(corr_matrix, annot=True)

* Method of correlation is pearson.
* 'age', 'serum_creatinine', 'anaemia', 'creatinine_phosphokinase' and 'high_blood_pressure' show a positive correlation with death_event, of which serum_creatinine has the maximum positive correlation.
* 'diabetes','ejection_fraction','platelets','serum_sodium','time','smoking' shows negative correlation in which time has maximum negative correlation.

In [None]:
# Box plot of the follow-up period to look for outliers.
px.box(h, x='time')

No outliers in time.

In [None]:
# Box plot of ejection fraction to look for outliers.
px.box(h, x='ejection_fraction')

Two outliers where the ejection fraction is 70% or higher.

In [None]:
# Show the rows whose ejection fraction is 70 or higher.
high_ef_mask = h['ejection_fraction'].ge(70)
h[high_ef_mask]

In [None]:
#removing the outliers
# Keep only rows with ejection fraction below 70. `.copy()` makes `h` an
# independent frame, so later column-wise modifications do not act on a slice
# of the original (which triggers pandas' SettingWithCopyWarning).
h=h[h['ejection_fraction']<70].copy()
h.head()

In [None]:
# Box plot of serum creatinine to look for outliers.
px.box(h, x='serum_creatinine')

Outliers present here can be ignored as in reality cases exist where people have their creatinine levels at a severe level.

## BIVARIATE ANALYSIS

In [None]:
#converting discrete values into categorical for analysis
# One whole-frame, non-inplace replace. Calling `.replace(..., inplace=True)` on
# a column selected with h[...] is chained assignment: pandas warns about it and,
# under copy-on-write, it does not update `h` at all.
# The nested dict is {column: {old: new}}, so only the listed columns are touched,
# and re-running the cell is harmless (no 0/1 values remain to replace).
yes_no={1:'Yes',0:'No'}
h=h.replace({'anaemia':yes_no,
             'diabetes':yes_no,
             'high_blood_pressure':yes_no,
             'smoking':yes_no,
             'death_event':yes_no,
             'sex':{1:"Men",0:"women"}})

### Continuous Features

In [None]:
# Scatter of age coloured by outcome; marker size also scales with age.
e = px.scatter(
    data_frame=h,
    x='age',
    color='death_event',
    size='age',
    title="Distribution of death_event on basis of age",
)
e.show()

* Individuals having age more than 59 years have the higher probability of deaths during follow-up period. 
* age and death_event has a linear relationship which means as age increases the probability of death from heart attacks will also increase.

In [None]:
# Scatter of creatinine phosphokinase coloured by outcome; marker size scales with the value.
e = px.scatter(
    data_frame=h,
    x='creatinine_phosphokinase',
    color='death_event',
    size='creatinine_phosphokinase',
    title="Distribution of death_event on basis of Creatinine Phosphokinase (in mcg/L)",
)
e.show()

* Higher level of creatinine_phosphokinase ( > 540 mcg/L) will cause more heart problems resulting in fatality.
* Linear Relationship between these two features.

In [None]:
# Scatter of serum creatinine coloured by outcome; marker size scales with the value.
e = px.scatter(
    data_frame=h,
    x='serum_creatinine',
    color='death_event',
    size='serum_creatinine',
    title="Distribution of death_event on basis of Serum Creatinine(in mg/dL)",
)
e.show()

* People having serum_creatinine levels more than 1.2 mg/dL have higher chance of death.
* Linear Relationship exists between these two features.

In [None]:
# Scatter of serum sodium coloured by outcome; marker size scales with the value.
e = px.scatter(
    data_frame=h,
    x='serum_sodium',
    color='death_event',
    size='serum_sodium',
    title="Distribution of death_event on basis of serum sodium(mEq/L)",
)
e.show()

* When the serum_sodium levels inside a body drops below 135 mEq/L, it increases the chance of deaths.
* Inverse Relationship exists between this two features (negative correlation).

In [None]:
# Scatter of ejection fraction coloured by outcome; marker size scales with the value.
e = px.scatter(
    data_frame=h,
    x='ejection_fraction',
    color='death_event',
    size='ejection_fraction',
    title="Distribution of death_event on basis of Ejection fraction (in %)",
)
e.show()

* When the ejection_fraction levels inside a body drops below 40 %, it increases the chance of heart failure.
* Inverse Relationship exists between this two features (negative correlation).

In [None]:
# Scatter of platelet count coloured by outcome; marker size scales with the value.
e = px.scatter(
    data_frame=h,
    x='platelets',
    color='death_event',
    size='platelets',
    title="Distribution of death_event on basis of Platelets (in kiloplates/ML)",
)
e.show()

* When the platelet levels inside a body drops below 256000 kiloplatelets/ML, it increases the chance of deaths.
* Inverse Relationship exists between this two features (negative correlation).

In [None]:
# Scatter of follow-up time coloured by outcome; marker size scales with the value.
e = px.scatter(
    data_frame=h,
    x='time',
    color='death_event',
    size='time',
    title="Distribution of death_event on basis of Follow-up period (in days)",
)
e.show()

* When follow-up period decreases(< 80 days),the probability of death increases.
* Inverse Relationship exists between this two features (negative correlation).

### Discrete features

In [None]:
# Outcome counts split by smoking status. The x variable is passed by keyword:
# seaborn 0.12+ accepts only `data` positionally, so the positional form clashes
# with `data=h` and raises a TypeError.
sns.countplot(x="death_event",data=h,hue='smoking')

Smoking doesn't largely affect fatality rate.The people who don't smoke can also die of heart attack because of severe levels in other factors.


In [None]:
# Outcome counts split by anaemia status. The x variable is passed by keyword:
# seaborn 0.12+ accepts only `data` positionally, so the positional form clashes
# with `data=h` and raises a TypeError.
sns.countplot(x="death_event",data=h,hue='anaemia')

* Whether anaemic or not the fatality rate is almost same in both cases.
* Anaemia also doesn't largely affect fatality rate.The people who don't have anaemia can also die of heart attack because of severe levels in other factors.

In [None]:
# Outcome counts split by sex. The x variable is passed by keyword:
# seaborn 0.12+ accepts only `data` positionally, so the positional form clashes
# with `data=h` and raises a TypeError.
sns.countplot(x="death_event",data=h,hue='sex')

* Fatality rate of men is more than women.
* Also,survival rate of men is more than women.
* This may simply be because the dataset contains more men (65%) than women.
* We can say that Gender doesn't relate to heart failure.

In [None]:
# Outcome counts split by diabetes status. The x variable is passed by keyword:
# seaborn 0.12+ accepts only `data` positionally, so the positional form clashes
# with `data=h` and raises a TypeError.
sns.countplot(x="death_event",data=h,hue='diabetes')

Diabetes doesn't largely affect fatality rate.The people who aren't diabetic can also die of heart attack because of severe levels in other factors.

In [None]:
# Outcome counts split by high-blood-pressure status. The x variable is passed by
# keyword: seaborn 0.12+ accepts only `data` positionally, so the positional form
# clashes with `data=h` and raises a TypeError.
sns.countplot(x="death_event",data=h,hue='high_blood_pressure')

High BP doesn't largely affect fatality rate.The people who don't have high BP can also die of heart attack because of severe levels in other factors.

In [None]:
# Bar of age per diabetes group, drawn without confidence-interval bars (ci=None).
sns.barplot(data=h, x='diabetes', y='age', ci=None)

Individuals who are below 60 mostly are diabetic.

In [None]:
# Bar of age per high-blood-pressure group, drawn without confidence-interval bars (ci=None).
sns.barplot(data=h, x='high_blood_pressure', y='age', ci=None)

As the age increases there is a risk of high_blood_pressure.

## MULTIVARIATE ANALYSIS

In [None]:
# Continuous variables against the target: serum sodium on the x axis, coloured by
# outcome, with the remaining continuous measurements shown on hover.
hover_columns = ['creatinine_phosphokinase',
                 'serum_creatinine', 'ejection_fraction',
                 'time', 'platelets']
d = px.scatter(
    data_frame=h,
    x='serum_sodium',
    color='death_event',
    hover_data=hover_columns,
)
d.show()

* If an individual has more than one component that affects heart health at a severe level, then the chances of his/her death will be higher.
* If a patient has **more platelets, higher sodium concentration, less creatinine_phosphokinase, higher ejection fraction and less serum_creatinine in his/her blood, and a longer follow-up period**, then they are more likely to survive heart failure.
* If a user has more components in control than those who are not in control,then they will likely survive.
* Therefore this proves that all the **continuous** factors are **independent from each other and are only related with the target variable(death_event).**
* Most percent of the people survived the heart disease.

In [None]:
# Discrete variables against the target: one bar per outcome, with the binary
# features shown on hover.
hover_flags = ['sex', 'diabetes', 'anaemia', 'high_blood_pressure', 'smoking']
w = px.bar(
    data_frame=h,
    x='death_event',
    color='death_event',
    hover_data=hover_flags,
)
w.show()

* If an individual has more than one component that affects heart health at a severe level, then the chances of his/her death will be higher.
* If a user has more components in control than those who are not in control,then they will likely survive.
* Therefore this proves that all the **discrete** factors are **independent from each other and aren't much related with the target variable(death_event) as it was seen in above analysis.**
* Most percent of the people survived the heart disease.

## MODEL BUILDING

In [None]:
#encoding the strings into numbers of needed features
# One whole-frame, non-inplace replace. Calling `.replace(..., inplace=True)` on
# a column selected with h[...] is chained assignment: pandas warns about it and,
# under copy-on-write, it does not update `h` at all.
yes_no_codes={'Yes':1,'No':0}
binary_columns=['death_event','anaemia','diabetes','high_blood_pressure','smoking']
encoding={column:yes_no_codes for column in binary_columns}
encoding['sex']={"Men":1,"women":0}
h=h.replace(encoding)
# Replacing strings with numbers can leave the columns as `object` dtype; cast
# explicitly so the estimators receive a proper integer target and features.
h=h.astype({column:'int64' for column in binary_columns+['sex']})

In [None]:
# Baseline model inputs: every column except the target as features (no selection).
y1 = h['death_event']
x1 = h.drop('death_event', axis=1)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,roc_auc_score,classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.ensemble import VotingClassifier 

In [None]:
# Hold out 20% of the rows for testing, then standardise the features.
# The scaler is fitted on the training split only and reused on the test split.
train_x, test_x, train_y, test_y = train_test_split(
    x1, y1, test_size=0.2, shuffle=True, random_state=0
)
sc = StandardScaler().fit(train_x)
train_x = sc.transform(train_x)
test_x = sc.transform(test_x)

In [None]:
# Base learners for the voting ensemble, listed as (name, estimator) pairs.
lr = LogisticRegression(random_state=0)
knn = KNN()
dt = DecisionTreeClassifier(random_state=0)
classifiers = [
    ('Logistic Regression', lr),
    ('K Nearest Neighbours', knn),
    ('Decision Tree', dt),
]

In [None]:
# Combine the three base learners into one voting ensemble and fit it on the
# scaled training split. No `voting` argument is passed, so the library default applies.
vc=VotingClassifier(estimators=classifiers)
vc.fit(train_x,train_y)

In [None]:
# Predict class labels for the test split and score them against the true labels.
# NOTE(review): roc_auc_score is given predicted class labels rather than
# probabilities or decision scores, so the value describes a single operating
# point instead of a full ROC curve — confirm this is intended.
y_pred=vc.predict(test_x)
roc_auc_score(test_y,y_pred)

The model has a baseline ROC AUC score of 0.83 without feature selection.

In [None]:
#model fitting with feature selection
# Select the three top-ranked features by name rather than by position
# (previously iloc[:,[4,7,11]]), so the selection cannot silently point at
# different columns if the column order ever changes.
x2=h[['ejection_fraction','serum_creatinine','time']]
x2.head()

In [None]:
# Repeat the split / scale / fit / score pipeline on the selected features only.
train_x, test_x, train_y, test_y = train_test_split(
    x2, y1, test_size=0.2, shuffle=True, random_state=0
)
# Fit the scaler on the training split only, then apply it to both splits.
sc = StandardScaler().fit(train_x)
train_x = sc.transform(train_x)
test_x = sc.transform(test_x)
vc = VotingClassifier(estimators=classifiers)
vc.fit(train_x, train_y)
y_pred = vc.predict(test_x)
# NOTE(review): the score is computed from predicted class labels, not from
# probabilities or decision scores — confirm this is intended.
roc_auc_score(test_y, y_pred)

With feature selection the model has a ROC AUC score of 0.885.

In [None]:
# Confusion matrix of the ensemble on the test split (rows = actual, columns = predicted).
cm=confusion_matrix(test_y,y_pred)
plt.figure(figsize=(10,10))
# Pass the tick labels to heatmap so they sit at the cell centres: the previous
# plt.xticks/yticks(range(2), ...) put them at positions 0 and 1, which are cell
# edges on a heatmap. fmt='d' prints the counts as plain integers.
sns.heatmap(cm, cmap=plt.cm.Blues,annot=True,fmt='d',
            xticklabels=["Predicted Heart Not Failed"," Predicted Heart Fail"],
            yticklabels=["Actual Heart Not Failed","Actual Heart Fail"])
plt.title("Ensemble Model - Confusion Matrix")
plt.yticks(fontsize=16)
plt.xticks(fontsize=16)
plt.show()

Taking heart failure (death_event = 1) as the positive class:

* 41 observations are **True negatives** (actual negative, predicted negative). It means people who don't have heart failure are correctly classified as such by the algorithm.
* 2 observations are **False positives** (actual negative, predicted positive). It means people who don't have heart failure are incorrectly classified as people with heart failure.
* 3 observations are **False negatives** (actual positive, predicted negative). It means people who have heart failure are incorrectly classified as people with no heart failure.
* 14 observations are **True positives** (actual positive, predicted positive). It means people who have heart failure are correctly classified as such by the algorithm.

In [None]:
# Text report of per-class metrics for the predictions on the test split.
print(classification_report(test_y,y_pred))

**If you like this notebook do upvote it.**

Do provide your valuable feedback.

Do checkout my other notebooks at https://www.kaggle.com/tmchls