In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import warnings
import seaborn as sns
from colorama import Fore, Back, Style 
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from mlxtend.plotting import plot_confusion_matrix
from plotly.offline import plot, iplot, init_notebook_mode
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import plotly.express as px
from statsmodels.formula.api import ols
import plotly.graph_objs as gobj

# Enable plotly rendering inside the notebook.
# NOTE(review): connected=True presumably loads plotly.js from the CDN instead of
# embedding it in the notebook — confirm against the plotly.offline docs.
init_notebook_mode(connected=True)
# Silence every Python warning for the rest of the session. This also hides any
# warnings raised by the model-fitting cells further down.
warnings.filterwarnings("ignore")
import plotly.figure_factory as ff
# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline

import xgboost
import lightgbm
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier

**About the Data**

Cardiovascular diseases (CVDs) are the number 1 cause of death globally, taking an estimated 17.9 million lives each year, which accounts for 31% of all deaths worldwide.

Heart failure is a common event caused by CVDs and this dataset contains 12 features that can be used to predict mortality by heart failure.

In [None]:
# Read the heart-failure clinical records into a DataFrame and preview the first rows.
DATA_PATH = '/kaggle/input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv'
heart_data = pd.read_csv(DATA_PATH)
heart_data.head()

In [None]:
# Display the column index of the loaded frame (rendered as the cell's output).
heart_data.columns 

As we can see, a death due to heart failure is influenced by a number of factors.

**Objectives**: 
* To look for numerous insights in the data in order to improve the accuracy of our prediction.
* Use different models to predict death events and compare the most reliable model for this dataset.

To begin with, lets first take the simplest look at the age factor in the death event due to heart failure. 

In [None]:
# Histogram of patient ages, binned in 2-year steps between 20 and 95.
age_bins = dict(start=20, end=95, size=2)
age_histogram = go.Histogram(x=heart_data['age'], xbins=age_bins)

fig = go.Figure()
fig.add_trace(age_histogram)

# Kept as the last expression so the returned figure is rendered as the cell output.
fig.update_layout(
    title_text='Age distribution',
    xaxis_title_text='Age',
    yaxis_title_text='Count',
    bargap=0.05,
    template='plotly_dark',
)


The above graph shows us the distribution of age groups in our data. (hover over the graph for exact count)
Now lets analyse the death rate in various age groups. 

In [None]:
# Age histogram coloured by outcome, so survivors and deaths can be compared per age bin.
# Every column is passed as hover data, so hovering a bar shows all of its fields.
fig = px.histogram(
    heart_data,
    x='age',
    color="DEATH_EVENT",
    hover_data=heart_data.columns,
    # Fixed the typo ("evevnt") that was shown in the rendered chart title.
    title="Distribution of death event amongst the age groups",
    labels={'age': "AGE"},
)
fig.show()

There is a higher death rate in the age group 58-62 as compared to other age groups. 

Next, lets analyse and find out if the gender factor affects the death rate. 
To do this:
* Lets find the ratio of Men and Women in the dataset
* Lets find the percentage ratio of survival and death event. 

In [None]:
# Split the patients into the four sex/outcome groups.
# The labels below treat sex == 1 as male and sex == 0 as female, and
# DEATH_EVENT == 1 as deceased, matching the order in which the groups are plotted.
d1 = heart_data[(heart_data['DEATH_EVENT']==0) & (heart_data['sex']==1)]  # male, survived
d2 = heart_data[(heart_data['DEATH_EVENT']==1) & (heart_data['sex']==1)]  # male, deceased
d3 = heart_data[(heart_data['DEATH_EVENT']==0) & (heart_data['sex']==0)]  # female, survived
d4 = heart_data[(heart_data['DEATH_EVENT']==1) & (heart_data['sex']==0)]  # female, deceased

# Legend labels: spelling and singular/plural made consistent
# (previously 'Females' and 'Male-decesed').
gender_labels = ['Male','Female']
death_labels = ['Male-survived','Male-deceased','Female-survived','Female-deceased']
gender_value = [(len(d1)+len(d2)),(len(d3)+len(d4))]
death_ratio_value = [len(d1),len(d2),len(d3),len(d4)]

# Two donut charts side by side: overall gender split (left) and gender x outcome (right).
fig = make_subplots(rows=1,cols=2,specs = [[{'type':'domain'},{'type':'domain'}]])
fig.add_trace(go.Pie(labels=gender_labels,values=gender_value,name='Gender ratio'),1,1)
fig.add_trace(go.Pie(labels=death_labels,values=death_ratio_value,name='Gender and death ratio'),1,2)

fig.update_traces(hole=.4, hoverinfo="label+percent")
# Kept as the last expression so the figure is rendered as the cell output.
fig.update_layout(title_text="  GENDER DISTRIBUTION IN THE DATASET                           GENDER VS DEATH_EVENT")



From the above subplot we can conclude that 65.3% of the patients in our dataset are male (44.4% of all patients are men who survived and 20.9% are men who died) and 34.7% are female (23.6% of all patients are women who survived and 11.1% are women who died).

Diabetes has been one of the most sought after factor in relation to heart_failure, so lets analyse that. 

In [None]:
# Split the patients into the four diabetes/outcome groups
# (diabetes == 1 is plotted as diabetic, DEATH_EVENT == 1 as deceased).
d1 = heart_data[(heart_data['DEATH_EVENT']==0) & (heart_data['diabetes']==0)]  # non-diabetic, survived
d2 = heart_data[(heart_data['DEATH_EVENT']==1) & (heart_data['diabetes']==0)]  # non-diabetic, deceased
d3 = heart_data[(heart_data['DEATH_EVENT']==0) & (heart_data['diabetes']==1)]  # diabetic, survived
d4 = heart_data[(heart_data['DEATH_EVENT']==1) & (heart_data['diabetes']==1)]  # diabetic, deceased

# Legend labels: fixed the 'decesaed' typo, dropped the trailing space in
# 'No diabetes ' and made capitalisation consistent.
labels1 = ['No diabetes','Diabetes']
labels2 = ['Non-diabetic survivor','Non-diabetic deceased','Diabetic survivor','Diabetic deceased']
value1 = [(len(d1)+len(d2)),(len(d3)+len(d4))]
value2 = [len(d1),len(d2),len(d3),len(d4)]

# Two donut charts side by side: diabetic share (left) and diabetes x outcome (right).
fig = make_subplots(rows=1,cols=2,specs = [[{'type':'domain'},{'type':'domain'}]])
fig.add_trace(go.Pie(labels=labels1,values=value1,name='diabetic ratio'),1,1)
fig.add_trace(go.Pie(labels=labels2,values=value2,name='diabetic and death ratio'),1,2)

fig.update_traces(hole=.4, hoverinfo="label+percent")
# Kept as the last expression so the figure is rendered as the cell output.
fig.update_layout(title_text="  Diabetic ratio and Diabetic death ratio")

The dataset of diabetic distribution gives us the following insights:
* 41.8% of patients had diabetes; diabetic patients who succumbed to heart failure make up 13.4% of the dataset.
* 58.2% of patients were non-diabetic; non-diabetic patients who succumbed to heart failure make up 18.7% of the dataset.

Lets analyse the effect of smoking in various age groups and link that to heart attack 

In [None]:
# Violin plot of age per smoking status, split by outcome, with every patient drawn as a point.
violin_options = dict(
    x='smoking',
    y='age',
    color="DEATH_EVENT",
    box=False,
    points='all',
    hover_data=heart_data.columns,
)
fig = px.violin(heart_data, **violin_options)
fig.show()

As we can see, smoking affects the rate of survival: people who don't smoke had a better chance of survival in the 55 to 65 age group.
The death rate of smokers is higher than that of non-smokers. 

In [None]:
# Histogram of ejection fraction, coloured by outcome.
ef_hist_options = dict(
    x='ejection_fraction',
    color="DEATH_EVENT",
    hover_data=heart_data.columns,
    title="Ejection fraction vs Death Event",
    labels={'ejection_fraction': "Ejection Fraction"},
)
fig = px.histogram(heart_data, **ef_hist_options)
fig.show()

Ejection fraction is the Percentage of blood leaving the heart at each contraction.
The data in the above graph shows that a lower ejection fraction could be linked to Death event. 


To improve the accuracy of the prediction model, lets try to find the most important features to fit in the training data. 

In [None]:
# Rank all predictors by their importance in an extra-trees ensemble, to decide
# which ones to keep for the prediction models.
plt.rcParams['figure.figsize']=20,12
sns.set_style("darkgrid")

# Select the target by name rather than by position, so the split matches the
# modelling cell and does not depend on DEATH_EVENT being the last column.
x = heart_data.drop(columns="DEATH_EVENT")
y = heart_data["DEATH_EVENT"]

from sklearn.ensemble import ExtraTreesClassifier

# Extra-trees is randomised: without a fixed seed the importance ranking (and
# therefore the features picked from it) can change on every run.
model = ExtraTreesClassifier(random_state=1)
model.fit(x,y)
print(model.feature_importances_) 
feat_importances = pd.Series(model.feature_importances_, index=x.columns)
# nlargest sorts the importances in descending order; 12 covers up to twelve predictors.
feat_importances.nlargest(12).plot(kind='barh')
plt.show()

We calculated the feature importances for this data, and based on them we keep only the most important features, i.e. time, ejection_fraction and serum_creatinine, together with age, which the next cell also includes in the feature list. 

In [None]:
# Keep only the selected predictors and hold out 20% of the rows for testing
# (fixed random_state so the split is the same on every run).
Features = ['time', 'ejection_fraction', 'serum_creatinine', 'age']
x, y = heart_data[Features], heart_data["DEATH_EVENT"]
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=2,
)

In [None]:
# Collects one accuracy score per model, in the order the model cells below are run.
accuracy_list = list()

We have split the data into training and test sets; now let's try different models to predict death events. 

In [None]:
# Logistic regression: fit on the training split, score on the held-out split.

log_reg = LogisticRegression().fit(x_train, y_train)
log_reg_prediction = log_reg.predict(x_test)
log_reg_accuracy = accuracy_score(y_true=y_test, y_pred=log_reg_prediction)
accuracy_list.append(log_reg_accuracy)
print(
    Fore.BLACK + "Accuracy of Logistic Regression is : ",
    f"{100 * log_reg_accuracy:.2f}%",
)

Lets create a confusion matrix to better understand the accuracy

In [None]:
# Confusion matrix of the logistic-regression predictions on the held-out split.
cm = confusion_matrix(y_test, log_reg_prediction)
# plot_confusion_matrix creates its own figure, so the previous plt.figure()
# call only left an empty extra figure in the cell output and has been removed.
plot_confusion_matrix(cm, figsize=(12,8), hide_ticks=True, cmap=plt.cm.Blues)
plt.title("Logistic Regression Model - Confusion Matrix")
# hide_ticks=True removed the default ticks; put back one labelled tick per class.
plt.xticks(range(2), ["Heart Not Failed","Heart Fail"], fontsize=16)
plt.yticks(range(2), ["Heart Not Failed","Heart Fail"], fontsize=16)
plt.show()

Lets try some more models..

In [None]:
# Support-vector classifier with default settings, scored on the held-out split.
sv_clf = SVC().fit(x_train, y_train)
sv_preds = sv_clf.predict(x_test)
sv_accuracy = accuracy_score(y_true=y_test, y_pred=sv_preds)

In [None]:
# Record the SVC score with the other models and report it as a percentage.
accuracy_list.append(sv_accuracy)
print(
    Fore.GREEN + "Accuracy of SVC is : ",
    f"{100 * sv_accuracy:.2f}%",
)

In [None]:
# Confusion matrix of the SVC predictions on the held-out split.
cm = confusion_matrix(y_test,sv_preds)
# plot_confusion_matrix creates its own figure, so the previous plt.figure()
# call only left an empty extra figure in the cell output and has been removed.
plot_confusion_matrix(cm,figsize=(12,8),cmap=plt.cm.Reds)
plt.title('SVC Confusion Matrix')
# One labelled tick per class on both axes.
plt.xticks(range(2),['heart not failed','heart failed'],fontsize=16)
plt.yticks(range(2),['heart not failed','heart failed'],fontsize=16)
plt.show()


In [None]:
# XGBoost random-forest classifier with shallow trees and a fixed seed.
xgb_clf = xgboost.XGBRFClassifier(random_state=1, max_depth=3)
xgb_clf.fit(x_train, y_train)
xgb_pred = xgb_clf.predict(x_test)
xgb_acc = accuracy_score(y_true=y_test, y_pred=xgb_pred)
accuracy_list.append(xgb_acc)

In [None]:
# Report the XGBRFClassifier accuracy as a percentage with two decimals.
print(
    Fore.GREEN + "Accuracy of XGBRFClassifier is : ",
    f"{100 * xgb_acc:.2f}%",
)

In [None]:
# Confusion matrix of the XGBRFClassifier predictions on the held-out split.
cm = confusion_matrix(y_test,xgb_pred)
# plot_confusion_matrix creates its own figure, so the previous plt.figure()
# call only left an empty extra figure in the cell output and has been removed.
plot_confusion_matrix(cm,figsize=(12,8),cmap=plt.cm.Reds)
# The title used to read 'SVC Confusion Matrix' (copied from the SVC cell),
# which mislabelled this plot.
plt.title('XGBRFClassifier Confusion Matrix')
# One labelled tick per class on both axes.
plt.xticks(range(2),['heart not failed','heart failed'],fontsize=16)
plt.yticks(range(2),['heart not failed','heart failed'],fontsize=16)
plt.show()

In [None]:
# LightGBM classifier with depth-2 trees and a fixed seed.
lgb_clf = lightgbm.LGBMClassifier(random_state=4, max_depth=2)
lgb_clf.fit(x_train, y_train)
lgb_pred = lgb_clf.predict(x_test)
lgb_acc = accuracy_score(y_true=y_test, y_pred=lgb_pred)
accuracy_list.append(lgb_acc)

In [None]:
# Report the LGBMClassifier accuracy as a percentage with two decimals.
print(
    Fore.GREEN + "Accuracy of LGBMClassifier is : ",
    f"{100 * lgb_acc:.2f}%",
)

In [None]:
# Confusion matrix of the LGBMClassifier predictions on the held-out split.
cm = confusion_matrix(y_test, lgb_pred)
# plot_confusion_matrix creates its own figure, so the previous plt.figure()
# call only left an empty extra figure in the cell output and has been removed.
plot_confusion_matrix(cm, figsize=(12,8), hide_ticks=True, cmap=plt.cm.Blues)
plt.title("LGBMClassifier Model - Confusion Matrix")
# hide_ticks=True removed the default ticks; put back one labelled tick per class.
plt.xticks(range(2), ["Heart Not Failed","Heart Fail"], fontsize=16)
plt.yticks(range(2), ["Heart Not Failed","Heart Fail"], fontsize=16)
plt.show()

In [None]:
# K-nearest-neighbours classifier voting over the 6 closest training points.

kn_clf = KNeighborsClassifier(n_neighbors=6).fit(x_train, y_train)
kn_pred = kn_clf.predict(x_test)
kn_acc = accuracy_score(y_true=y_test, y_pred=kn_pred)
accuracy_list.append(kn_acc)

In [None]:
# Report the K-neighbours accuracy as a percentage with two decimals.
print(
    Fore.GREEN + "Accuracy of K Neighbors Classifier is : ",
    f"{100 * kn_acc:.2f}%",
)

In [None]:
# Confusion matrix of the K-neighbours predictions on the held-out split.
cm = confusion_matrix(y_test, kn_pred)
# plot_confusion_matrix creates its own figure, so the previous plt.figure()
# call only left an empty extra figure in the cell output and has been removed.
plot_confusion_matrix(cm, figsize=(12,8), hide_ticks=True, cmap=plt.cm.Blues)
plt.title("K Neighbors Model - Confusion Matrix")
# hide_ticks=True removed the default ticks; put back one labelled tick per class.
plt.xticks(range(2), ["Heart Not Failed","Heart Fail"], fontsize=16)
plt.yticks(range(2), ["Heart Not Failed","Heart Fail"], fontsize=16)
plt.show()

In [None]:
# Gradient boosting with depth-2 trees and a fixed seed.

gradientboost_clf = GradientBoostingClassifier(random_state=1, max_depth=2).fit(x_train, y_train)
gradientboost_pred = gradientboost_clf.predict(x_test)
gradientboost_acc = accuracy_score(y_true=y_test, y_pred=gradientboost_pred)
accuracy_list.append(gradientboost_acc)

In [None]:
# Report the gradient-boosting accuracy as a percentage with two decimals.
print(
    Fore.GREEN + "Accuracy of Gradient Boosting is : ",
    f"{100 * gradientboost_acc:.2f}%",
)

In [None]:
# Confusion matrix of the gradient-boosting predictions on the held-out split.
cm = confusion_matrix(y_test, gradientboost_pred)
# plot_confusion_matrix creates its own figure, so the previous plt.figure()
# call only left an empty extra figure in the cell output and has been removed.
plot_confusion_matrix(cm, figsize=(12,8), hide_ticks=True, cmap=plt.cm.Blues)
# Fixed the "Gredient" typo that was shown in the rendered plot title.
plt.title("Gradient Boosting Model - Confusion Matrix")
# hide_ticks=True removed the default ticks; put back one labelled tick per class.
plt.xticks(range(2), ["Heart Not Failed","Heart Fail"], fontsize=16)
plt.yticks(range(2), ["Heart Not Failed","Heart Fail"], fontsize=16)
plt.show()

In [None]:
# Show the accuracies collected so far, one entry per model cell that has been run.
print(accuracy_list)
# Manual workaround, left disabled: drops the last entry of the list.
# accuracy_list = accuracy_list[:len(accuracy_list)-1]
# It was needed once because re-running a model cell appended its accuracy a second time.

In [None]:
# Bar labels for the comparison chart, in the same order the model cells append
# their scores to accuracy_list. Fixed the 'lightgum' typo in the LightGBM label.
models_list = ['Logistic Regressor', 'svc','xgb','lightgbm','K Neighbors','Gradient Boosting']

In [None]:
# Bar chart comparing the held-out accuracy of every model, with the value printed above each bar.
plt.rcParams['figure.figsize']=10,6 
sns.set_style("darkgrid")

# Each model cell appends to accuracy_list, so re-running one adds a duplicate
# score and shifts the labels. Fail with a clear message instead of plotting
# mismatched bars.
if len(models_list) != len(accuracy_list):
    raise ValueError(
        f"Expected {len(models_list)} accuracy values (one per model) but found "
        f"{len(accuracy_list)}; re-run the notebook from the accuracy_list cell."
    )

ax = sns.barplot(x=models_list, y=accuracy_list, palette = "rocket", saturation =1.5)
plt.xlabel("Classifier Models", fontsize = 20 )
plt.ylabel("% of Accuracy", fontsize = 20)
plt.title("Accuracy of different Classifier Models", fontsize = 20)
plt.xticks(fontsize = 12, horizontalalignment = 'center', rotation = 8)
plt.yticks(fontsize = 13)
for p in ax.patches:
    # Dedicated names for the bar geometry: the loop used to rebind the
    # notebook-level x and y (feature matrix and target) to plain floats.
    bar_width, bar_height = p.get_width(), p.get_height()
    bar_x, bar_y = p.get_xy() 
    # Centre the label horizontally and place it slightly above the bar top.
    ax.annotate(f'{bar_height:.2%}', (bar_x + bar_width/2, bar_y + bar_height*1.02), ha='center', fontsize = 'medium')
plt.show()

Heart Failure related death event data analysis and prediction by Farhan khan. 