In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

#Neural network libraries
import tensorflow as tf
from tensorflow import keras
# Layers for our neural networks
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dropout, Flatten, Dense
# A pretrained model for transfer learning
from keras.models import Model
from keras.applications import vgg19

# Helper functions
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
#Models 
from sklearn import tree
from sklearn import datasets

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import GridSearchCV

import statsmodels.api as sm
# Apply the 'fivethirtyeight' matplotlib style sheet to every figure drawn below.
plt.style.use('fivethirtyeight')

import os

# Print the full path of every file found under the Kaggle input directory.
for folder, _subfolders, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the heart-failure clinical records CSV into `df`.
data_path = '/kaggle/input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv'
df = pd.read_csv(data_path)

# Any missing values?

In [None]:
# Number of missing entries in each column.
df.isnull().sum()

## No missing values, but what's the size of the data?

In [None]:
# Dimensions of the loaded dataset as a (rows, columns) tuple.
df.shape

### 299 rows and 13 columns (12 features plus the `DEATH_EVENT` target)

# Basic statistics about the data before any removal of outliers is done

In [None]:
# Summary statistics, transposed so that each column of `df` becomes one row of the table.
df.describe().transpose()

In [None]:
# Rename the target column to 'Died', then chart how many rows fall in each of its classes.
df.rename(columns={'DEATH_EVENT': 'Died'}, inplace=True)
outcome_counts = df['Died'].value_counts()
outcome_counts.plot.bar(legend=True)
plt.title('Amount of heart failures')


## Time to split the columns into categorical and continuous columns

In [None]:
# List the column labels so they can be sorted into continuous and categorical groups.
df.columns

# Continuous Columns

In [None]:
# Columns treated as continuous variables in the plots and outlier checks below.
continous_cols = [
    'age',
    'creatinine_phosphokinase',
    'ejection_fraction',
    'platelets',
    'serum_creatinine',
    'serum_sodium',
]
continous_cols

# Categorical Columns

In [None]:
# Every column that is neither continuous, nor the 'Died' target, nor 'time'.
non_categorical = continous_cols + ['Died', 'time']
categorical_cols = df.columns.drop(non_categorical).tolist()
categorical_cols

# Visualizations of the continuous variables

In [None]:
# Histogram with a KDE overlay for every continuous column, two panels per figure.
# `cnt` is a running panel counter; it is left at the number of panels drawn in case a
# later cell continues counting from it.
cnt = 0
max_in_row = 2
for x in continous_cols:
    # Figure number and panel slot both follow from the running counter.
    fig_num, slot = divmod(cnt, max_in_row)
    plt.figure(fig_num, figsize=(20, 10))
    plt.subplot(1, max_in_row, slot + 1)
    plt.title(x)
    # `kde` is an on/off switch, so pass a boolean (the previous value was the integer 50).
    sns.histplot(df[x], bins=50, kde=True);
    cnt += 1


In [None]:
# KDE of each continuous column split by `Died`, two panels per figure.
# Panel positions come from a local enumerate() index, so the cell does not depend on a
# counter left behind by another cell and draws the same layout every time it is run.
max_in_row = 2
for position, x in enumerate(continous_cols):
    slot = position % max_in_row
    if slot == 0:
        # Start a fresh figure for each row of panels.
        plt.figure(figsize=(25, 4))
    plt.subplot(1, max_in_row, slot + 1)
    plt.title(x)
    # common_norm is a boolean flag; True normalises the two outcome curves jointly.
    sns.kdeplot(data=df, x=x, hue="Died", fill=True, common_norm=True, alpha=.5, linewidth=0);

# Percentages of men and women who lived and died from heart failure

In [None]:
# Split the rows by `sex` (1 -> `male`, 0 -> `female`) and by `Died` (0 -> lived, 1 -> died),
# then chart the size of the four resulting groups.
male = df.loc[df['sex'] == 1]
female = df.loc[df['sex'] == 0]

male_lived, male_died = (male.loc[male['Died'] == flag] for flag in (0, 1))
female_lived, female_died = (female.loc[female['Died'] == flag] for flag in (0, 1))

data = [grp.shape[0] for grp in (male_died, male_lived, female_died, female_lived)]
plt.pie(
    x=data,
    labels=['Male Died', 'Male Lived', 'Female Died', 'Female Lived'],
    wedgeprops={'edgecolor': 'black'},
    autopct='%1.1f%%',
)
plt.tight_layout()
plt.title('Men and women survival rates')

## Percentages of men and women with diabetes who lived and died from heart failure


In [None]:
# Keep only the rows with diabetes == 1 in each sex/outcome group and chart the group sizes.
male_died_diabetes, male_lived_diabetes, female_died_diabetes, female_lived_diabetes = (
    grp.loc[grp['diabetes'] == 1]
    for grp in (male_died, male_lived, female_died, female_lived)
)
data = [
    subset.shape[0]
    for subset in (male_died_diabetes, male_lived_diabetes, female_died_diabetes, female_lived_diabetes)
]
plt.pie(
    x=data,
    labels=['Male Died', 'Male Lived', 'Female Died', 'Female Lived'],
    wedgeprops={'edgecolor': 'black'},
    autopct='%1.1f%%',
)
plt.tight_layout()
plt.title('Diabetes survival rates')

In [None]:
# Keep only the rows with diabetes == 0 in each sex/outcome group and chart the group sizes.
male_died_diabetes, male_lived_diabetes, female_died_diabetes, female_lived_diabetes = (
    grp.loc[grp['diabetes'] == 0]
    for grp in (male_died, male_lived, female_died, female_lived)
)
data = [
    subset.shape[0]
    for subset in (male_died_diabetes, male_lived_diabetes, female_died_diabetes, female_lived_diabetes)
]
plt.pie(
    x=data,
    labels=['Male Died', 'Male Lived', 'Female Died', 'Female Lived'],
    wedgeprops={'edgecolor': 'black'},
    autopct='%1.1f%%',
)
plt.tight_layout()
plt.title('Non-Diabetes survival rates')

## Percentages of survival rate amongst men and women who are anaemic

In [None]:
# Keep only the rows with anaemia == 1 in each sex/outcome group and chart the group sizes.
male_died_anemia, male_lived_anemia, female_died_anemia, female_lived_anemia = (
    grp.loc[grp['anaemia'] == 1]
    for grp in (male_died, male_lived, female_died, female_lived)
)
data = [
    subset.shape[0]
    for subset in (male_died_anemia, male_lived_anemia, female_died_anemia, female_lived_anemia)
]
plt.pie(
    x=data,
    labels=['Male Died', 'Male Lived', 'Female Died', 'Female Lived'],
    wedgeprops={'edgecolor': 'black'},
    autopct='%1.1f%%',
)
plt.tight_layout()
plt.title('Anaemia survival rates')

In [None]:
# Keep only the rows with anaemia == 0 in each sex/outcome group and chart the group sizes.
male_died_anemia, male_lived_anemia, female_died_anemia, female_lived_anemia = (
    grp.loc[grp['anaemia'] == 0]
    for grp in (male_died, male_lived, female_died, female_lived)
)
data = [
    subset.shape[0]
    for subset in (male_died_anemia, male_lived_anemia, female_died_anemia, female_lived_anemia)
]
plt.pie(
    x=data,
    labels=['Male Died', 'Male Lived', 'Female Died', 'Female Lived'],
    wedgeprops={'edgecolor': 'black'},
    autopct='%1.1f%%',
)
plt.tight_layout()
plt.title('Non-Anaemia survival rates')

## Percentages of survival rate amongst men and women who have high blood pressure

In [None]:
# Keep only the rows with high_blood_pressure == 1 in each sex/outcome group and chart the group sizes.
male_died_hbp, male_lived_hbp, female_died_hbp, female_lived_hbp = (
    grp.loc[grp['high_blood_pressure'] == 1]
    for grp in (male_died, male_lived, female_died, female_lived)
)
data = [
    subset.shape[0]
    for subset in (male_died_hbp, male_lived_hbp, female_died_hbp, female_lived_hbp)
]
plt.pie(
    x=data,
    labels=['Male Died', 'Male Lived', 'Female Died', 'Female Lived'],
    wedgeprops={'edgecolor': 'black'},
    autopct='%1.1f%%',
)
plt.tight_layout()
plt.title('High Blood Pressure survival rates')

In [None]:
# Keep only the rows with high_blood_pressure == 0 in each sex/outcome group and chart the group sizes.
male_died_hbp, male_lived_hbp, female_died_hbp, female_lived_hbp = (
    grp.loc[grp['high_blood_pressure'] == 0]
    for grp in (male_died, male_lived, female_died, female_lived)
)
data = [
    subset.shape[0]
    for subset in (male_died_hbp, male_lived_hbp, female_died_hbp, female_lived_hbp)
]
plt.pie(
    x=data,
    labels=['Male Died', 'Male Lived', 'Female Died', 'Female Lived'],
    wedgeprops={'edgecolor': 'black'},
    autopct='%1.1f%%',
)
plt.tight_layout()
plt.title('Non-High Blood Pressure survival rates')

In [None]:
# Keep only the rows with smoking == 1 in each sex/outcome group and chart the group sizes.
male_died_smoke, male_lived_smoke, female_died_smoke, female_lived_smoke = (
    grp.loc[grp['smoking'] == 1]
    for grp in (male_died, male_lived, female_died, female_lived)
)
data = [
    subset.shape[0]
    for subset in (male_died_smoke, male_lived_smoke, female_died_smoke, female_lived_smoke)
]
plt.pie(
    x=data,
    labels=['Male Died', 'Male Lived', 'Female Died', 'Female Lived'],
    wedgeprops={'edgecolor': 'black'},
    autopct='%1.1f%%',
)
plt.tight_layout()
plt.title('Smoking survival rates')

In [None]:
# Keep only the rows with smoking == 0 in each sex/outcome group and chart the group sizes.
male_died_smoke, male_lived_smoke, female_died_smoke, female_lived_smoke = (
    grp.loc[grp['smoking'] == 0]
    for grp in (male_died, male_lived, female_died, female_lived)
)
data = [
    subset.shape[0]
    for subset in (male_died_smoke, male_lived_smoke, female_died_smoke, female_lived_smoke)
]
plt.pie(
    x=data,
    labels=['Male Died', 'Male Lived', 'Female Died', 'Female Lived'],
    wedgeprops={'edgecolor': 'black'},
    autopct='%1.1f%%',
)
plt.tight_layout()

plt.title('Non-Smoker survival rates')

# Visualizations of categorical variables with their probability densities 

In [None]:
# KDE of each categorical column split by `Died`, three panels per figure.
# Panel positions come from a local enumerate() index, so the cell does not depend on a
# counter left behind by another cell and draws the same layout every time it is run.
max_in_row = 3
for position, x in enumerate(categorical_cols):
    slot = position % max_in_row
    if slot == 0:
        # Start a fresh figure for each row of panels.
        plt.figure(figsize=(25, 4))
    plt.subplot(1, max_in_row, slot + 1)
    plt.title(x)
    # common_norm=False normalises each outcome curve on its own.
    sns.kdeplot(data=df, x=x, hue="Died", fill=True, common_norm=False, alpha=.5, linewidth=0);

# Statistical Analysis 

In [None]:
import statsmodels.api as sm
from scipy.stats import shapiro
import scipy.stats as stats
from scipy.stats import anderson
from scipy.stats import norm, skew

import warnings
warnings.filterwarnings("ignore")


In [None]:
# Vertical boxplot of every continuous column, two panels per figure.
max_in_row = 2
for position, x in enumerate(continous_cols):
    slot = position % max_in_row
    if slot == 0:
        # Start a fresh figure for each row of panels.
        plt.figure(figsize=(25, 4))
    plt.subplot(1, max_in_row, slot + 1)
    plt.title(x)
    # Pass the values through `y=` explicitly: a positional Series is interpreted
    # differently across seaborn versions, which changes the box orientation.
    sns.boxplot(y=df[x], orient='v')

In [None]:
# Work on an independent deep copy so `df` itself is left untouched by the outlier handling.
df_no_outliers = df.copy(deep=True)

In [None]:
# Flag creatinine_phosphokinase values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR], inspect the
# values that remain, then blank the flagged values out as NaN in `df_no_outliers`.
col = 'creatinine_phosphokinase'
Q1 = df_no_outliers[col].quantile(.25)
Q3 = df_no_outliers[col].quantile(.75)
IQR = Q3 - Q1
# Named `inlier_mask` rather than `filter` so the built-in filter() is not shadowed.
inlier_mask = (df_no_outliers[col] >= Q1 - 1.5 * IQR) & (df_no_outliers[col] <= Q3 + 1.5 * IQR)
inliers = df_no_outliers.loc[inlier_mask, col]

# `y=` makes the vertical orientation explicit regardless of seaborn version.
sns.boxplot(y=inliers, orient='v')
plt.title('Creatinine Phosphokinase boxplot after removing outliers ')

# Normal probability (Q-Q) plot of the retained values, drawn on its own figure.
fig = plt.figure()
res = stats.probplot(inliers, plot=plt)
plt.show()

# Replace only this column's outliers with NaN, leaving every other column as it was.
# (Assigning the whole filtered DataFrame to the column, as before, does not store the
# filtered column values.)
df_no_outliers[col] = df_no_outliers[col].where(inlier_mask)

# Shapiro-Wilk normality test on the retained values; shown as the cell output.
shapiro(df_no_outliers.loc[inlier_mask, col])

## Creatinine Phosphokinase does not look normal, and its Shapiro–Wilk p-value is too small to retain the null hypothesis that the data are normally distributed

In [None]:
# Flag ejection_fraction values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR], inspect the
# values that remain, then blank the flagged values out as NaN in `df_no_outliers`.
col = 'ejection_fraction'
Q1 = df_no_outliers[col].quantile(.25)
Q3 = df_no_outliers[col].quantile(.75)
IQR = Q3 - Q1
# Named `inlier_mask` rather than `filter` so the built-in filter() is not shadowed.
inlier_mask = (df_no_outliers[col] >= Q1 - 1.5 * IQR) & (df_no_outliers[col] <= Q3 + 1.5 * IQR)
inliers = df_no_outliers.loc[inlier_mask, col]

# `y=` makes the vertical orientation explicit regardless of seaborn version.
sns.boxplot(y=inliers, orient='v')
plt.title('Ejection fraction boxplot after removing outliers ')

# Normal probability (Q-Q) plot of the retained values, drawn on its own figure.
fig = plt.figure()
res = stats.probplot(inliers, plot=plt)
plt.show()

# Replace only this column's outliers with NaN, leaving every other column as it was.
# (Assigning the whole filtered DataFrame to the column, as before, does not store the
# filtered column values.)
df_no_outliers[col] = df_no_outliers[col].where(inlier_mask)

# Shapiro-Wilk normality test on the retained values; shown as the cell output.
shapiro(df_no_outliers.loc[inlier_mask, col])

## Ejection Fraction does not look normal, and its Shapiro–Wilk p-value is too small to retain the null hypothesis that the data are normally distributed

In [None]:
# Flag platelets values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR], inspect the values that
# remain, then blank the flagged values out as NaN in `df_no_outliers`.
col = 'platelets'
Q1 = df_no_outliers[col].quantile(.25)
Q3 = df_no_outliers[col].quantile(.75)
IQR = Q3 - Q1
# Named `inlier_mask` rather than `filter` so the built-in filter() is not shadowed.
inlier_mask = (df_no_outliers[col] >= Q1 - 1.5 * IQR) & (df_no_outliers[col] <= Q3 + 1.5 * IQR)
inliers = df_no_outliers.loc[inlier_mask, col]

# `y=` makes the vertical orientation explicit regardless of seaborn version.
sns.boxplot(y=inliers, orient='v')
plt.title('Platelets boxplot after removing outliers ')
plt.tight_layout()

# Normal probability (Q-Q) plot of the retained values, drawn on its own figure.
fig = plt.figure()
res = stats.probplot(inliers, plot=plt)
plt.show()

# Replace only this column's outliers with NaN, leaving every other column as it was.
# (Assigning the whole filtered DataFrame to the column, as before, does not store the
# filtered column values.)
df_no_outliers[col] = df_no_outliers[col].where(inlier_mask)

# Shapiro-Wilk normality test on the retained values; shown as the cell output.
shapiro(df_no_outliers.loc[inlier_mask, col])

## Platelets does not look normal, and its Shapiro–Wilk p-value is too small to retain the null hypothesis that the data are normally distributed

In [None]:
# Flag serum_creatinine values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR], inspect the values
# that remain, then blank the flagged values out as NaN in `df_no_outliers`.
col = 'serum_creatinine'
Q1 = df_no_outliers[col].quantile(.25)
Q3 = df_no_outliers[col].quantile(.75)
IQR = Q3 - Q1
# Named `inlier_mask` rather than `filter` so the built-in filter() is not shadowed.
inlier_mask = (df_no_outliers[col] >= Q1 - 1.5 * IQR) & (df_no_outliers[col] <= Q3 + 1.5 * IQR)
inliers = df_no_outliers.loc[inlier_mask, col]

# `y=` makes the vertical orientation explicit regardless of seaborn version.
sns.boxplot(y=inliers, orient='v')
plt.title( col + ' boxplot after removing outliers ')

# Normal probability (Q-Q) plot of the retained values, drawn on its own figure.
fig = plt.figure()
res = stats.probplot(inliers, plot=plt)
plt.show()

# Replace only this column's outliers with NaN, leaving every other column as it was.
# (Assigning the whole filtered DataFrame to the column, as before, does not store the
# filtered column values.)
df_no_outliers[col] = df_no_outliers[col].where(inlier_mask)

# Shapiro-Wilk normality test on the retained values; shown as the cell output.
shapiro(df_no_outliers.loc[inlier_mask, col])

## Serum Creatinine does not look normal, and its Shapiro–Wilk p-value is too small to retain the null hypothesis that the data are normally distributed

In [None]:
# Flag serum_sodium values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR], inspect the values that
# remain, then blank the flagged values out as NaN in `df_no_outliers`.
col = 'serum_sodium'
Q1 = df_no_outliers[col].quantile(.25)
Q3 = df_no_outliers[col].quantile(.75)
IQR = Q3 - Q1
# Named `inlier_mask` rather than `filter` so the built-in filter() is not shadowed.
inlier_mask = (df_no_outliers[col] >= Q1 - 1.5 * IQR) & (df_no_outliers[col] <= Q3 + 1.5 * IQR)
inliers = df_no_outliers.loc[inlier_mask, col]

# `y=` makes the vertical orientation explicit regardless of seaborn version.
sns.boxplot(y=inliers, orient='v')
# The title previously read 'Platelets ...' although this cell plots serum_sodium.
plt.title('Serum Sodium boxplot after removing outliers ')

# Normal probability (Q-Q) plot of the retained values, drawn on its own figure.
fig = plt.figure()
res = stats.probplot(inliers, plot=plt)
plt.show()

# Replace only this column's outliers with NaN, leaving every other column as it was.
# (Assigning the whole filtered DataFrame to the column, as before, does not store the
# filtered column values.)
df_no_outliers[col] = df_no_outliers[col].where(inlier_mask)

# Shapiro-Wilk normality test on the retained values; shown as the cell output.
shapiro(df_no_outliers.loc[inlier_mask, col])

## Serum Sodium does not look normal, and its Shapiro–Wilk p-value is too small to retain the null hypothesis that the data are normally distributed

In [None]:
# Drop every row that still contains a NaN, then show the per-column count of missing values.
df_no_outliers = df_no_outliers.dropna(axis=0, how='any')
df_no_outliers.isnull().sum()

# Time to create and train some machine learning models to see how well we can predict heart failure

## Let's say what our target variable is

In [None]:
# One-hot encode the categorical columns; every level is kept (drop_first=False).
df_no_outliers = pd.get_dummies(df_no_outliers, columns = categorical_cols, drop_first = False)

# Target vector and unscaled feature matrix (every column except 'Died').
# NOTE(review): `time` stays in the feature matrix; earlier drafts of this cell experimented
# with dropping it -- confirm it is a legitimate predictor for this task.
y = df_no_outliers['Died']
X_unscaled = df_no_outliers[df_no_outliers.columns.drop('Died')]

# Split BEFORE scaling so that the scaler never sees the held-out rows.
X_train_unscaled, X_test_unscaled, y_train, y_test = train_test_split(
    X_unscaled, y, test_size = 0.2, random_state = 42
)

# Learn the min/max of each feature from the training rows only, then apply that same
# transform to the test rows. Test values may therefore fall slightly outside [0, 1].
scalerX = MinMaxScaler(feature_range=(0, 1))
X_train = pd.DataFrame(
    scalerX.fit_transform(X_train_unscaled),
    columns=X_train_unscaled.columns,
    index=X_train_unscaled.index,
)
X_test = pd.DataFrame(
    scalerX.transform(X_test_unscaled),
    columns=X_test_unscaled.columns,
    index=X_test_unscaled.index,
)

# Full scaled feature matrix, built as a new frame (no in-place write into a slice) and
# kept under the name `X` for cells that inspect its columns.
X = pd.DataFrame(
    scalerX.transform(X_unscaled),
    columns=X_unscaled.columns,
    index=X_unscaled.index,
)

In [None]:
# Column labels of the feature matrix: the one-hot encoded frame without 'Died'.
X.columns

# Logistic Regression 80% accuracy!

In [None]:
# Fit a logistic regression (capped at 50 solver iterations) and report test-set accuracy.
model = LogisticRegression(max_iter=50).fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Logistic regression wrapped in RFE, scored on the test split.
# NOTE(review): n_features_to_select equals the full column count of X, so RFE presumably
# eliminates nothing here -- confirm that is intended.
n_features = len(X.columns)
mod = RFE(
    estimator=LogisticRegression(),
    n_features_to_select=n_features,
    verbose=True,
).fit(X=X_train, y=y_train)
mod.score(X_test, y_test)

# Neural Network accuracy 82%

In [None]:
# Fully connected binary classifier: three ReLU layers (256, 64, 16 units) and one sigmoid output unit.
hidden_units = (256, 64, 16)
dense_stack = [tf.keras.layers.Dense(units, activation='relu') for units in hidden_units]
dense_stack.append(tf.keras.layers.Dense(1, activation='sigmoid'))
model = tf.keras.models.Sequential(dense_stack)

model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss='binary_crossentropy',
    metrics=['acc'],
)

# Train silently for 100 epochs, holding out 10% of the training rows for validation,
# then report loss and accuracy on the test split.
epochs = 100
model.fit(X_train, y_train, epochs=epochs, validation_split=0.1, verbose=0)
model.evaluate(X_test, y_test)

# Decision Tree accuracy 73%

In [None]:
# Decision tree wrapped in RFE, evaluated on the test split.
# random_state is fixed so the fitted tree, and therefore the reported accuracy, is the
# same on every run; 42 matches the seed used for the train/test split.
# NOTE(review): n_features_to_select equals the full column count of X, so RFE presumably
# eliminates nothing here -- confirm that is intended.
model = RFE(
    estimator=DecisionTreeClassifier(random_state=42),
    n_features_to_select=len(X.columns),
    verbose=True,
)
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
accuracy_score(y_test,y_pred)

# Random Forest accuracy 78%

In [None]:
# Random forest wrapped in RFE, evaluated on the test split.
# random_state is fixed so the fitted forest, and therefore the reported accuracy, is the
# same on every run; 42 matches the seed used for the train/test split.
# NOTE(review): n_features_to_select equals the full column count of X, so RFE presumably
# eliminates nothing here -- confirm that is intended.
model = RFE(
    estimator=RandomForestClassifier(random_state=42),
    n_features_to_select=len(X.columns),
    verbose=True,
)
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
accuracy_score(y_test,y_pred)

# K Nearest Neighbors classifier accuracy 76%

In [None]:
# 12-nearest-neighbours classifier wrapped in RFE; prints its accuracy on the test split.
knn = KNeighborsClassifier(n_neighbors=12)
model = RFE(estimator=knn, n_features_to_select=len(X.columns), verbose=True)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Accuracy is symmetric in its two arguments, so naming them does not change the value.
print(accuracy_score(y_true=y_test, y_pred=y_pred))

# Support Vector Machine accuracy 87%

In [None]:
# Hyper-parameter grid for the SVC search; -1 in max_iter means no iteration limit.
params = dict(
    max_iter=[5, 7, 9, 10, 12, -1],
    degree=[2, 3, 4, 5, 6],
    kernel=['poly', 'sigmoid', 'rbf', 'linear'],
    gamma=['scale', 'auto'],
)

# Exhaustive cross-validated search scored by accuracy, then evaluation of the best
# estimator on the test split.
grid_search_cv = GridSearchCV(estimator=SVC(), param_grid=params, scoring='accuracy')
grid_search_cv.fit(X_train, y_train)
y_pred = grid_search_cv.predict(X_test)
print(grid_search_cv.best_estimator_)
accuracy_score(y_true=y_test, y_pred=y_pred)