  Feature              &           	Explanation    &                                     	Measurement     
 
1.  Age                             : 	Age of the patient                                  |  	Years                
2.  Anaemia                         :	Decrease of red blood cells or hemoglobin           |	Boolean        
3.  High blood pressure             :	If a patient has hypertension                       |	Boolean
4.  Creatinine phosphokinase (CPK)  :	Level of the CPK enzyme in the blood                |	mcg/L
5.  Diabetes                        :	If the patient has diabetes                         |	Boolean                
6.  Ejection fraction               :	Percentage of blood leaving the heart at each contraction |	Percentage
7.  Sex                             :	Woman or man                                        |	Binary       
8.  Platelets                       : 	Platelets in the blood                              |	kiloplatelets/mL
9.  Serum creatinine                : 	Level of creatinine in the blood                    |	mg/dL         
10.  Serum sodium                    :	Level of sodium in the blood                        | 	mEq/L       
11.  Smoking                         :	If the patient smokes                               |	Boolean                      
12.  Time                            : 	Follow-up period                                    |	Days      
13.  (target) death event            : 	If the patient died during the follow-up period     |	Boolean   
 
**Boolean features**
        * Sex - Gender of patient Male = 1, Female =0
        * Diabetes - 0 = No, 1 = Yes
        * Anaemia - 0 = No, 1 = Yes
        * High_blood_pressure - 0 = No, 1 = Yes
        * Smoking - 0 = No, 1 = Yes
        * DEATH_EVENT - 0 = No, 1 = Yes

**Other information**
    * mcg/L: micrograms per liter. mL: milliliter. mEq/L: milliequivalents per liter
    * The time feature seems to be highly correlated with the death event, but there is no concrete information on how this metric was measured patient by patient,
      which makes it hard to use in the analysis. Feel free to use it anyway.

**Information source :** https://www.kaggle.com/andrewmvd/heart-failure-clinical-data/discussion/193109

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt #Data Visulization
import seaborn as sns #Data Visulization


from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split

from sklearn.metrics import confusion_matrix, accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input
# directory, then print the paths one per line.
input_paths = (
    os.path.join(root, fname)
    for root, _subdirs, fnames in os.walk('/kaggle/input')
    for fname in fnames
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Loading Data

In [None]:
# Load the heart-failure clinical records CSV from the Kaggle input directory into a DataFrame.
df=pd.read_csv("/kaggle/input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv")

In [None]:
# Preview the first rows of the dataset.
df.head()

In [None]:
# Print the column names, dtypes and non-null counts.
df.info()

In [None]:
# Count missing values per column.
df.isnull().sum()

In [None]:
# Summary statistics for the columns.
df.describe()

**EDA**

In [None]:
# One histogram per numerical column on a 2x4 grid.
# Each entry is (column name, bin count); None keeps seaborn's automatic binning.
hist_specs = [
    ("age", 12),
    ("creatinine_phosphokinase", 10),
    ("ejection_fraction", None),
    ("platelets", 10),
    ("serum_creatinine", 10),
    ("serum_sodium", None),
    ("time", None),
]

fig, ax = plt.subplots(2, 4, figsize=(18, 15))
# ax.flat walks the grid row by row; zip stops after the seventh panel.
for panel, (column, n_bins) in zip(ax.flat, hist_specs):
    bin_kwargs = {} if n_bins is None else {"bins": n_bins}
    sns.histplot(x=df[column], ax=panel, **bin_kwargs)

# The eighth panel has no variable to show, so hide its axes.
ax[1, 3].axis('off')
plt.suptitle("Distribution of Numerical Variable")

In [None]:
# Bar chart of value counts for each 0/1 column, laid out on a 2x3 grid.
categorical_columns = [
    "anaemia",
    "diabetes",
    "high_blood_pressure",
    "sex",
    "smoking",
    "DEATH_EVENT",
]

fig, ax = plt.subplots(2, 3, figsize=(15, 10))
# ax.flat walks the grid row by row, matching the column order above.
for panel, column in zip(ax.flat, categorical_columns):
    sns.countplot(x=df[column], ax=panel)

plt.suptitle("Distribution Of Categorical Variable")

In [None]:
# Pie chart of the share of patients with and without a death event.
# value_counts() orders its result by frequency, while the labels below are
# positional. Pin the order to [0, 1] (0 = no death, 1 = death) so each slice
# always gets the right label, whatever the class balance.
death_counts = df.DEATH_EVENT.value_counts().reindex([0, 1], fill_value=0)
plt.pie(
    x=death_counts,
    autopct='%1.2f%%',
    labels=["ALIVE", "NOT ALIVE"],
    shadow=True,
    explode=[0, 0.1],
)
plt.suptitle("DEATH EVENT RATIO")

In [None]:
# Boolean masks for each sex and outcome (sex: 0 = female, 1 = male;
# DEATH_EVENT: 0 = survived, 1 = died).
female, male = df.sex == 0, df.sex == 1
alive, dead = df.DEATH_EVENT == 0, df.DEATH_EVENT == 1

# Size of each sex/outcome group, counted through the age column.
female_survive = df.age[female & alive].count()
male_survive = df.age[male & alive].count()
female_dead = df.age[female & dead].count()
male_dead = df.age[male & dead].count()

# Slice values and their labels, in matching order.
data = [female_survive, female_dead, male_survive, male_dead]
label = ["Female survive", "Female dead", "Male survive", "Male dead"]
explde = [0.1] * 4
colors = ['#ff9999', '#66b3ff', '#99ff99', '#ffcc99']

plt.figure(figsize=(8, 8))
plt.pie(
    x=data,
    labels=label,
    autopct="%1.2f%%",
    shadow=True,
    explode=explde,
    colors=colors,
)
plt.suptitle("Distribution of DEATH EVENT according to GENDER")

In [None]:
# For each binary risk factor, show its counts split by death event on a 2x2 grid.
risk_factors = ["diabetes", "smoking", "high_blood_pressure", "anaemia"]

fig, ax = plt.subplots(2, 2, figsize=(10, 10))
# ax.flat walks the grid row by row, matching the order of risk_factors.
for panel, column in zip(ax.flat, risk_factors):
    sns.countplot(x=df[column], hue=df.DEATH_EVENT, palette="gist_earth", ax=panel)
    # Replace the raw 0/1 legend entries with readable names.
    panel.legend(title='DEATH_EVENT', labels=['No death event', 'Death event'])

plt.suptitle("COMPARING DEATH EVENTS WITH CATEGORICAL VARIABLE")


**Feature Selection**

In [None]:
# Copy the data so the feature-selection steps below do not modify df.
m_df=df.copy()


In [None]:
# Remove the follow-up "time" column from the modelling frame (in place).
m_df.drop(columns="time", inplace=True)

In [None]:
# Check the first three rows after dropping the column.
m_df.head(3)

In [None]:
# Split the frame by position: the first 11 columns are the model inputs and
# the column at index 11 is the label (presumably DEATH_EVENT; confirm
# against m_df.columns).
feature_frame = m_df.iloc[:, :11]
target_series = m_df.iloc[:, 11]

X = feature_frame.values
y = target_series.values

In [None]:
# Positions of the columns to standardise within X (presumably the continuous
# features; confirm against m_df.columns).
numeric_cols = [0, 2, 4, 6, 7, 8]

# Fit the scaler on the training rows only, so no test-set statistics leak
# into preprocessing. train_test_split chooses rows from the sample count and
# random_state alone, so splitting the row indices with the same test_size and
# random_state as the later split of X and y yields exactly the training rows.
# NOTE: keep test_size/random_state here in sync with that split.
train_rows, _test_rows = train_test_split(
    np.arange(len(X)), test_size=0.25, random_state=0
)

sc = StandardScaler()
sc.fit(X[train_rows][:, numeric_cols])
# Apply the training-set mean and variance to every row.
X[:, numeric_cols] = sc.transform(X[:, numeric_cols])

Splitting data (Train and Test)

In [None]:
# Hold out 25% of the rows as a test set; the fixed random_state makes the split reproducible.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=0)

**Modeling**

Logistic Regression

In [None]:
# Fit a logistic-regression classifier on the training split
# (fit returns the estimator itself).
clr = LogisticRegression(random_state=0).fit(X_train, y_train)

# Predicted labels for the held-out test rows.
y_pred_clr = clr.predict(X_test)

In [None]:
# Score the logistic-regression predictions against the true test labels.
acc_clr = accuracy_score(y_test, y_pred_clr)
clr_cm = confusion_matrix(y_test, y_pred_clr)

print("Accuracy =","%.2f" % (acc_clr*100),"%")

# Draw the confusion matrix as an annotated heatmap on its own small figure.
cm_fig, cm_ax = plt.subplots(figsize=(4, 4))
sns.heatmap(clr_cm, annot=True, cmap="icefire_r", ax=cm_ax)
cm_ax.set_title('Confusion Matrix')

Naive Bayes

In [None]:
# Fit a Gaussian naive Bayes classifier on the training split
# (fit returns the estimator itself).
nb = GaussianNB().fit(X_train, y_train)

# Predicted labels for the held-out test rows.
y_pred_nb = nb.predict(X_test)

In [None]:
# Score the naive Bayes predictions against the true test labels.
acc_nb = accuracy_score(y_test, y_pred_nb)
cm_nb = confusion_matrix(y_test, y_pred_nb)

print("Accuracy =","%.1f"%(acc_nb * 100),"%")

# Draw the confusion matrix as an annotated heatmap on its own small figure.
cm_fig, cm_ax = plt.subplots(figsize=(4, 4))
sns.heatmap(cm_nb, annot=True, cmap="icefire_r", ax=cm_ax)
cm_ax.set_title('Confusion Matrix')


KNN

In [None]:
# Fit a 15-nearest-neighbours classifier (Minkowski metric) on the training
# split (fit returns the estimator itself).
knn = KNeighborsClassifier(n_neighbors=15, metric='minkowski').fit(X_train, y_train)

# Predicted labels for the held-out test rows.
y_pred_knn = knn.predict(X_test)

In [None]:
# Score the KNN predictions against the true test labels.
acc_knn = accuracy_score(y_test, y_pred_knn)
cm_knn = confusion_matrix(y_test, y_pred_knn)

print("Accuracy =","%.2f" %(acc_knn*100),"%")

# Draw the confusion matrix as an annotated heatmap on its own small figure.
cm_fig, cm_ax = plt.subplots(figsize=(4, 4))
sns.heatmap(cm_knn, annot=True, cmap="icefire_r", ax=cm_ax)
cm_ax.set_title('Confusion Matrix')

Decision Tree

In [None]:
# Fit an entropy-criterion decision tree on the training split
# (fit returns the estimator itself).
dt = DecisionTreeClassifier(criterion='entropy', random_state=0).fit(X_train, y_train)

# Predicted labels for the held-out test rows.
y_pred_dt = dt.predict(X_test)


In [None]:
# Score the decision-tree predictions against the true test labels.
acc_dt = accuracy_score(y_test, y_pred_dt)
cm_dt = confusion_matrix(y_test, y_pred_dt)

print("Accuracy =","%.2f" % (acc_dt*100),"%")

# Draw the confusion matrix as an annotated heatmap on its own small figure.
cm_fig, cm_ax = plt.subplots(figsize=(4, 4))
sns.heatmap(cm_dt, annot=True, cmap="icefire_r", ax=cm_ax)
cm_ax.set_title('Confusion Matrix')

SVM

In [None]:
# Fit a linear-kernel support vector classifier on the training split
# (fit returns the estimator itself).
svcl = SVC(kernel='linear', random_state=0).fit(X_train, y_train)

# Predicted labels for the held-out test rows.
y_pred_svcl = svcl.predict(X_test)



In [None]:
# Score the SVM predictions against the true test labels.
acc_svcl = accuracy_score(y_test, y_pred_svcl)
cm_svcl = confusion_matrix(y_test, y_pred_svcl)

print("Accuracy =","%.2f" % (acc_svcl*100),"%")

# Draw the confusion matrix as an annotated heatmap on its own small figure.
cm_fig, cm_ax = plt.subplots(figsize=(4, 4))
sns.heatmap(cm_svcl, annot=True, cmap="icefire_r", ax=cm_ax)
cm_ax.set_title('Confusion Matrix')

In [None]:
# Compare the test-set accuracy of all five classifiers in one bar chart.
model_score = [acc_clr, acc_nb, acc_knn, acc_dt, acc_svcl]
model_name = ["Logistic Regression", "Naive Bayes", "K Nearest Neighbors", "Decision Tree", "SVM"]

# accuracy_score returns fractions in [0, 1]. The y-axis label and the bar
# annotations are in percent, so plot the scores scaled to 0-100; otherwise
# an 80% accuracy is drawn and labelled as "0.80%".
model_score_pct = [score * 100 for score in model_score]

plt.subplots(figsize=(9, 8))
splot = sns.barplot(x=model_name, y=model_score_pct, palette="icefire_r")
plt.xlabel('Classifier Models', fontsize=10)
plt.ylabel('% of Accuracy', fontsize=10)
plt.title('Accuracy of different Classifier Models on test set', fontsize=15)
plt.xticks(rotation=-55, fontsize=12)
plt.yticks(fontsize=12)

# Write each bar's value just above its top edge, centred horizontally.
for p in splot.patches:
    bar_top = p.get_height()
    splot.annotate(
        format(bar_top, '.2f') + "%",
        (p.get_x() + p.get_width() / 2, bar_top),
        ha='center',
        xytext=(0, 6),
        textcoords='offset points',
        fontsize=10,
    )

If You Like My work then UPVOTE this Notebook :)
ThankYou!!