In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

## statistical tests
from scipy.stats import ttest_ind

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, files_in_folder in os.walk('/kaggle/input'):
    for file_name in files_in_folder:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the heart-failure clinical records and work on a copy so the raw frame stays untouched.
data = pd.read_csv("/kaggle/input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv")
my_data = data.copy()
## our target variable is "DEATH_EVENT". we store it in a variable
target_variable = "DEATH_EVENT"
# The preview must be the last expression of the cell: Jupyter only renders the
# value of the final expression, so a `head()` in the middle is silently discarded.
my_data.head()

# Data exploratory analysis

First, we check the target variable.
The name of our target variable is **DEATH_EVENT** and it has 2 classes: 1 for **DEAD** and 0 for **ALIVE**.

In [None]:
# Dimensions of the working frame as a (rows, columns) tuple.
my_data.shape

We have 299 rows and 12 features (excluding **DEATH_EVENT**, which is our target variable).
Now we need to check the types of our features. To do so, we use the `dtypes` attribute of the pandas DataFrame.

In [None]:
# Data type of every column in the working frame.
my_data.dtypes

After getting an idea about the types in our dataset, we need to check for missing values.
One way is to use a *heatmap* along with *isna()*.

In [None]:
# Heatmap of the boolean missing-value mask (one cell per row/column pair), drawn without a colour bar.
sns.heatmap(my_data.isna(), cbar=False)

In [None]:
# Percentage of non-missing values per column (100 means the column is complete).
my_data.notna().sum() / len(my_data) * 100

The heatmap and the previous table show that there are no missing values (every column is 100% filled).

Based on the description of the dataset, `time` captures the time of the event, that is, the time at which the patient died or was censored. It therefore cannot be included as a feature in our analysis: no end user will be able to provide the value of `time`, since they do not know when in the future the patient will die or be censored!

In [None]:
# `time` is the follow-up/censoring time, which is unknown at prediction time, so it is removed.
# errors="ignore" keeps the cell idempotent: re-running it after the column is already gone
# would otherwise raise a KeyError.
my_data = my_data.drop(columns=["time"], errors="ignore")

## explore the target variable

Now we need to explore the target variable and find out how many deaths were caused by heart failure.

In [None]:
# Count each class directly from a boolean mask. The previous
# filter -> value_counts() -> tolist()[0] chain raised IndexError when a class was absent;
# a mask sum simply yields 0 in that case.
not_dead = int((my_data[target_variable] == 0).sum())
dead = int((my_data[target_variable] == 1).sum())

# Share of each class over all rows of the working frame.
dead_percentage = dead / my_data.shape[0] * 100
not_dead_percentage = not_dead / my_data.shape[0] * 100

print("the percentage of dead people from heart failure is {:.2f}%".format(dead_percentage))
print("the percentage of people that didn't die from heart failure is {:.2f}%".format(not_dead_percentage))

now, lets explore our features.

## float features 
 
We first begin with our float variables (the continuous ones) and draw histograms to learn more about their distributions.

In [None]:
# Sub-frame holding only the float64 columns; iterating over it yields the column names.
float_features = my_data.select_dtypes("float64")
print("float features are : {}".format(float_features.columns.tolist()))
for float_feature in float_features:
    # One figure per feature. `sns.distplot` is deprecated; `histplot` with a KDE overlay and
    # density normalisation is its documented replacement (requires seaborn >= 0.11).
    fig, ax = plt.subplots()
    sns.histplot(my_data[float_feature], kde=True, stat="density", ax=ax)
    ax.set_title("Distribution of {}".format(float_feature))

In [None]:
# Report the sample skewness of every float column, rounded to two decimals.
for column_name in float_features.columns:
    skew_value = my_data[column_name].skew()
    print(f'skewness of the feature "{column_name}" is {skew_value:.2f}')

As we can see, the distribution of the age variable is close to a normal distribution, while the others are skewed!

## integer features

lets explore our integer features.

In [None]:
# Sub-frame of the integer columns, followed by the number of distinct values in each.
int_features = my_data.select_dtypes("int")
for column_name in int_features.columns:
    distinct_count = my_data[column_name].nunique()
    print(f"{column_name} #{distinct_count}")

as we see here, we have 2 groups of features.
lets first see in detail every feature that has only 2 unique values.

In [None]:
# Integer columns with at most two distinct values, leaving out the target column.
int_features_with_two_values = [
    column_name
    for column_name in int_features
    if column_name != target_variable and my_data[column_name].nunique() <= 2
]
int_features_with_two_values

let see each value of these features.

In [None]:
# Show the distinct values observed for each two-valued feature, name left-aligned in 20 columns.
for binary_feature in int_features_with_two_values:
    observed_values = my_data[binary_feature].unique()
    print("{:<20} {} ".format(binary_feature, observed_values))

As we can see here, the only values are 0 and 1. *anaemia*, *diabetes*, *high blood pressure* and *smoking* are clearly boolean variables, where 0 represents **FALSE/NO** and 1 represents **TRUE/YES**.
For the *sex* feature, 0 represents a **woman** and 1 represents a **man**.

Let's check the distribution of the categories by printing the count and percentage of each one.

In [None]:
def show_percentage(label,list,indexes):
    """Print the count and percentage share of each entry of ``list``.

    Parameters
    ----------
    label : name of the variable, inserted verbatim into the message.
    list : counts, read as ``list[i]`` for ``i`` in ``range(len(list))``. With a plain
        Python list this is positional; with a pandas Series that has an integer index
        it is a label lookup, so ``indexes[i]`` then describes the entry labelled ``i``.
    indexes : human-readable category names, one per entry.

    Raises
    ------
    ZeroDivisionError
        If the entries are plain Python numbers that sum to zero and ``list`` is not empty.
    """
    # The total does not change inside the loop, so compute it once instead of per iteration.
    total = sum(list)
    for i in range(len(list)):
        count = list[i]
        percentage = count / total * 100
        print("the number of {} in \"{}\" variable is {}  [ {:.2f}% ]".format(indexes[i],label,count,percentage))

In [None]:
# Print the count and share of each category of every two-valued feature.
for int_feature_with_two_values in int_features_with_two_values:
    # Category names are listed in the order of the encoded values: position 0 <-> 0, position 1 <-> 1.
    if int_feature_with_two_values == "sex":
        category_names = ["woman", "man"]
    else:
        category_names = ["no", "yes"]
    # value_counts() orders by frequency, not by value. Re-index on [0, 1] so the counts are
    # explicitly aligned with `category_names`, and so a category with no rows shows up as 0
    # instead of raising KeyError.
    category_counts = (
        my_data[int_feature_with_two_values]
        .value_counts()
        .reindex([0, 1], fill_value=0)
        .tolist()
    )
    show_percentage(int_feature_with_two_values, category_counts, category_names)
    print("-----------")

now let's check other integer values that are not categorical.

In [None]:
# Integer columns that take more than two distinct values (i.e. not binary flags).
int_features_with_multiple_values = [
    column_name
    for column_name in int_features
    if my_data[column_name].nunique() > 2
]
int_features_with_multiple_values

what we're gonna do now is show the summary and the histogram for these variables.

In [None]:
# Histogram (with KDE overlay) of every multi-valued integer feature, one figure each.
# `sns.distplot` is deprecated; `histplot(..., kde=True, stat="density")` is its documented
# replacement (requires seaborn >= 0.11).
for int_feature_with_multiple_values in int_features_with_multiple_values:
    fig, ax = plt.subplots()
    sns.histplot(my_data[int_feature_with_multiple_values], kde=True, stat="density", ax=ax)
    ax.set_title("Distribution of {}".format(int_feature_with_multiple_values))

# Sample skewness of the same features, rounded to two decimals.
for int_feature_with_multiple_values in int_features_with_multiple_values:
    print("skewness of the feature \"{}\" is {:.2f}".format(int_feature_with_multiple_values,my_data[int_feature_with_multiple_values].skew()))

In [None]:
# Descriptive statistics of each multi-valued integer feature, separated by a dashed line.
for column_name in int_features_with_multiple_values:
    summary = my_data[column_name].describe()
    print(summary)
    print("---------------------------------")

we got an idea about every variable in our dataset. Now let's check the relationship between the target variable and other variables.

## relation between target variable & other variables:

first, we split our data into 2 groups ( based on the classes we have on the *target* variable ). so we will have 2 groups.

In [None]:
# Partition the rows by outcome: target == 1 -> dead, target == 0 -> alive.
dead_people_data = my_data.loc[my_data[target_variable] == 1]
alive_people_data = my_data.loc[my_data[target_variable] == 0]

let's print again every variable we have.

In [None]:
# Recap of the three feature groups: float columns, multi-valued integers, binary integers.
feature_groups = (
    float_features.columns.tolist(),
    int_features_with_multiple_values,
    int_features_with_two_values,
)
for feature_group in feature_groups:
    print(feature_group)

Let's start with diabetes, smoking, high blood pressure and anaemia. We check whether they contribute to heart failure.

In [None]:
# One annotated contingency table (target class x feature value) per binary feature,
# each drawn in its own figure.
for binary_feature in int_features_with_two_values:
    plt.figure()
    contingency_table = pd.crosstab(my_data[target_variable], my_data[binary_feature])
    sns.heatmap(contingency_table, annot=True, cbar=False, fmt='d')

Since our dataset is small and the counts in the cross tables are close to each other, the previous variables don't tell us whether they contribute to heart failure or not.

now, let's check the relation of our target variable with *creatinine_phosphokinase*  , *ejection_fraction* and *serum_sodium*

In [None]:
# Compare the density of each feature between dead and alive patients, one figure per feature.
# `sns.distplot(..., hist=False)` is deprecated; `sns.kdeplot` draws the same KDE curve.
for compared_feature in ["ejection_fraction", "creatinine_phosphokinase", "serum_sodium"]:
    fig, ax = plt.subplots()
    sns.kdeplot(dead_people_data[compared_feature], label='dead people', ax=ax)
    sns.kdeplot(alive_people_data[compared_feature], label='alive people', ax=ax)
    ax.set_xlabel(compared_feature)
    ax.legend()

From our plots, we see that the distributions of **serum sodium** and **ejection fraction** are quite different between the two classes. Maybe they are linked to heart failure; we need to **test this hypothesis**.

let's plot other float variables and check if there's a difference between dead and alive people.

In [None]:
# Compare the density of each float feature between dead and alive patients, one figure per
# feature (age, platelets, serum_creatinine). The third plot was previously mislabeled "age".
# `sns.distplot(..., hist=False)` is deprecated; `sns.kdeplot` draws the same KDE curve.
for compared_feature in ["age", "platelets", "serum_creatinine"]:
    fig, ax = plt.subplots()
    sns.kdeplot(dead_people_data[compared_feature], label='dead people', ax=ax)
    sns.kdeplot(alive_people_data[compared_feature], label='alive people', ax=ax)
    ax.set_xlabel(compared_feature)
    ax.legend()

There's a slight difference between dead and alive people in **serum creatinine**. We will test this hypothesis.

## deep analysis

what we will do now is to find relations between variables themselves.

first we check if **serum creatinine** , **serum sodium** and **ejection fraction** are correlated.

secondly, we see if these variables are related to age, sex.

Finally, we check whether smoking, diabetes, high blood pressure or anaemia affect the variables that we think contribute to heart failure (serum creatinine, serum sodium, ejection fraction).

In [None]:
# Features suspected of being linked to the outcome.
sus_variables = ["serum_creatinine","ejection_fraction","serum_sodium"]
# Pairwise Pearson correlations. The coefficients are annotated and the colour scale is pinned
# to [-1, 1]: with the default auto-scaled colours and no numbers, weak correlations cannot be
# told apart from strong ones by eye.
sns.heatmap(my_data[sus_variables].corr(), annot=True, fmt=".2f", vmin=-1, vmax=1, center=0)

As we can see from the heatmap, the variables are not correlated.

In [None]:
# Correlation of age with each suspected feature, computed once up front instead of
# rebuilding the correlation matrix of the whole frame on every loop iteration.
age_correlations = my_data[["age"] + sus_variables].corr()["age"]
for sus_variable in sus_variables:
    # Scatter plot with one regression line per outcome class.
    sns.lmplot(x="age",y=sus_variable,hue=target_variable,data=my_data)
    print("correlation is {}".format(age_correlations[sus_variable]))

There's no correlation between age and the variables we suspected of contributing.
Let's see if sex is related.

In [None]:
## we split our data with sex.
man_data = my_data[(my_data["sex"] == 1)]
woman_data = my_data[(my_data["sex"] == 0)]
# Density of each suspected feature for men vs women, one figure per feature.
# `sns.distplot(..., hist=False)` is deprecated; `sns.kdeplot` draws the same KDE curve.
for sus_variable in sus_variables:
    fig, ax = plt.subplots()
    sns.kdeplot(man_data[sus_variable], label='man', ax=ax)
    sns.kdeplot(woman_data[sus_variable], label='woman', ax=ax)
    ax.set_xlabel(sus_variable)
    ax.legend()

As we can see, there's a slight difference between men and women when it comes to serum sodium. We need to **test** whether sex is related to the level of that variable.

In [None]:
# smokers and non smokers
people_true_data = my_data[my_data["smoking"] == 1]
people_false_data = my_data[my_data["smoking"] == 0]
# Density of each suspected feature for both groups, one figure per feature.
# `sns.distplot(..., hist=False)` is deprecated; `sns.kdeplot` draws the same KDE curve.
for sus_variable in sus_variables:
    fig, ax = plt.subplots()
    sns.kdeplot(people_true_data[sus_variable], label='smokers', ax=ax)
    sns.kdeplot(people_false_data[sus_variable], label='non_smokers', ax=ax)
    ax.set_xlabel(sus_variable)
    ax.legend()

In [None]:
# diabetes and non diabetes
people_true_data = my_data[(my_data["diabetes"] == 1)]
people_false_data = my_data[my_data["diabetes"] == 0]
# Density of each suspected feature for both groups, one figure per feature.
# `sns.distplot(..., hist=False)` is deprecated; `sns.kdeplot` draws the same KDE curve.
for sus_variable in sus_variables:
    fig, ax = plt.subplots()
    sns.kdeplot(people_true_data[sus_variable], label='diabetes', ax=ax)
    sns.kdeplot(people_false_data[sus_variable], label='non diabetes', ax=ax)
    ax.set_xlabel(sus_variable)
    ax.legend()

In [None]:
# high blood pressure and non high blood pressure
people_true_data = my_data[my_data["high_blood_pressure"] == 1]
people_false_data = my_data[my_data["high_blood_pressure"] == 0]
# Density of each suspected feature for both groups, one figure per feature.
# `sns.distplot(..., hist=False)` is deprecated; `sns.kdeplot` draws the same KDE curve.
for sus_variable in sus_variables:
    fig, ax = plt.subplots()
    sns.kdeplot(people_true_data[sus_variable], label='high blood pressure ', ax=ax)
    sns.kdeplot(people_false_data[sus_variable], label='non high blood pressure ', ax=ax)
    ax.set_xlabel(sus_variable)
    ax.legend()

In [None]:
# anaemia and non anaemia
people_true_data = my_data[my_data["anaemia"] == 1]
people_false_data = my_data[my_data["anaemia"] == 0]
# Density of each suspected feature for both groups, one figure per feature.
# `sns.distplot(..., hist=False)` is deprecated; `sns.kdeplot` draws the same KDE curve.
for sus_variable in sus_variables:
    fig, ax = plt.subplots()
    sns.kdeplot(people_true_data[sus_variable], label='anaemia', ax=ax)
    sns.kdeplot(people_false_data[sus_variable], label='non anaemia', ax=ax)
    ax.set_xlabel(sus_variable)
    ax.legend()

Based on the distributions, we conclude that those variables don't have any visible effect on our suspected variables.

# Summary

what we concluded from our analysis is:

- serum sodium, ejection fraction and serum creatinine may contribute to heart failure.
- serum sodium levels can differ from man to woman

Time to test these hypotheses:
- dead people had significantly different levels of serum sodium, ejection fraction and serum creatinine.
    H0 = means are equal for dead and alive people.
- women & men have significantly different means of serum sodium.

In [None]:
## we need to balance our datas for tests.
# Show the (rows, columns) shape of the dead group, then of the alive group.
for outcome_group in (dead_people_data, alive_people_data):
    print(outcome_group.shape)

In [None]:
# Down-sample the alive group to the size of the dead group. The sample is seeded so the
# t-test results below are the same on every "Restart & Run All".
balanced_alive_people_data = alive_people_data.sample(dead_people_data.shape[0], random_state=42)

In [None]:
def t_test(data1,data2,alpha):
    """Run scipy's independent two-sample t-test and compare its p-value with ``alpha``.

    Parameters
    ----------
    data1, data2 : the two samples to compare.
    alpha : significance level.

    Returns
    -------
    True when the p-value is strictly below ``alpha``, False otherwise.
    """
    p_value = ttest_ind(data1, data2).pvalue
    return p_value < alpha

In [None]:
# For each suspected feature, print whether the alive and dead groups differ at the 5% level.
for tested_feature in ("serum_sodium", "ejection_fraction", "serum_creatinine"):
    is_significant = t_test(
        balanced_alive_people_data[tested_feature],
        dead_people_data[tested_feature],
        0.05,
    )
    print(is_significant)

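Thanks to the t-test, we confirmed that the means of the 3 variables differ significantly between dead and alive patients, which supports the idea that they contribute to heart failure.
Let's test our last hypothesis.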

In [None]:
# Down-sample the men to the number of women before testing. The sample is seeded so the
# printed result is reproducible across runs.
balanced_man_data = man_data.sample(woman_data.shape[0], random_state=42)
print(t_test(balanced_man_data["serum_sodium"],woman_data["serum_sodium"],0.05))

From the t-test, sex doesn't really affect the level of serum sodium.

# Data preprocessing & training

In [None]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV

# Features retained by the exploratory analysis.
important_features = ["serum_sodium","ejection_fraction","serum_creatinine"]

data_for_processing = data.copy()
data_for_training = data_for_processing[important_features+[target_variable]]

# Split BEFORE scaling so no statistic of the test rows influences the training features.
# The split is seeded (reproducible) and stratified (the target classes are imbalanced, so
# both parts keep the same class proportions).
train_part, test_part = train_test_split(
    data_for_training,
    test_size=0.25,
    random_state=42,
    stratify=data_for_training[target_variable],
)

train_set_y = train_part[target_variable]
test_set_y = test_part[target_variable]

# Standardise each feature (zero mean, unit variance) with statistics learned on the training
# rows only. The previous `preprocessing.normalize(column, axis=0)` divided each column by its
# L2 norm over the whole dataset: that is not standardisation, shrinks values with the number
# of rows, and used test rows before the split.
scaler = preprocessing.StandardScaler().fit(train_part[important_features])
train_set = pd.DataFrame(
    scaler.transform(train_part[important_features]),
    columns=important_features,
    index=train_part.index,
)
test_set = pd.DataFrame(
    scaler.transform(test_part[important_features]),
    columns=important_features,
    index=test_part.index,
)

In [None]:
## logistic regression
# Grid over solver and inverse regularisation strength C, with an L2 penalty throughout.
lr_parameters = {'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],'C': [0.001, 0.01, 0.1, 1,10],'penalty': ['l2']}
# 'liblinear', 'sag' and 'saga' shuffle the data internally; a fixed random_state keeps the
# selected parameters and the score reproducible across runs.
lr = GridSearchCV(LogisticRegression(random_state=42),param_grid=lr_parameters,cv=5,refit=True)
lr.fit(train_set,train_set_y)
print(lr.best_params_)
# Score of the refitted best estimator on the held-out test set.
lr_score = lr.score(test_set, test_set_y)

In [None]:
## gaussian naive bayesian
# Fit on the training split (fit returns the estimator itself), then score on the test split.
gnb = GaussianNB().fit(train_set, train_set_y)
gnb_score = gnb.score(test_set, test_set_y)

In [None]:
## MLP
# Grid: 2 solvers x 2 iteration caps x 3 L2 penalties (1e-2, 1e-3, 1e-4) x 6 hidden-layer widths (3..8).
parameters = {'solver': ['lbfgs','adam'], 'max_iter': [1500,2000], 'alpha': 10.0 ** -np.arange(2, 5), 'hidden_layer_sizes':np.arange(3,9)}
# MLP weights are initialised randomly: without a fixed random_state the selected parameters
# and the final score change on every run. n_jobs=-1 runs the 72 x 5 fits in parallel; it does
# not affect the results.
MLP_gs = GridSearchCV(MLPClassifier(random_state=42),parameters,cv=5,n_jobs=-1)
MLP_gs.fit(train_set,train_set_y)
print(MLP_gs.best_params_)

In [None]:
# Score of the grid-searched MLP on the held-out test set.
MLP_score = MLP_gs.score(test_set, test_set_y)

In [None]:
# Candidate values shared by the SVM sub-grids.
svm_gammas = 10.0 ** -np.arange(1, 7)
# The original list contained 0.10 and 0.1 (the same value twice, so every such candidate was
# fitted twice). Between 0.001 and 0.1 the intended value was presumably 0.01 -- TODO confirm.
svm_costs = [0.001, 0.01, 0.1, 1, 10, 25, 50, 100, 1000]
# `gamma` only affects the rbf and sigmoid kernels; sweeping it for the linear kernel just
# repeats identical fits, so the linear kernel gets its own gamma-free sub-grid.
SVM_parameters = [
    {'kernel': ['rbf', 'sigmoid'], 'gamma': svm_gammas, 'C': svm_costs},
    {'kernel': ['linear'], 'C': svm_costs},
]
SVM_gs = GridSearchCV(svm.SVC(),SVM_parameters,cv=5)
SVM_gs.fit(train_set,train_set_y)
print(SVM_gs.best_params_)

In [None]:
# Score of the grid-searched SVM on the held-out test set.
SVM_score = SVM_gs.score(test_set, test_set_y)

let's plot the values now.

In [None]:
# Collect the test score of every model in one frame (one row per model, in plotting order).
model_scores = [
    ('SVM', SVM_score),
    ('Logistic Regression', lr_score),
    ('Naive Bayesian', gnb_score),
    ('Neural Network', MLP_score),
]
global_metrics = pd.DataFrame(
    [{"model": model_name, "score": model_score} for model_name, model_score in model_scores]
)

# Bar chart of the scores, with each bar's value written just above it (3 decimals).
plt.figure(figsize=(7,7))
splot = sns.barplot(x="model", y="score", data=global_metrics)
for bar in splot.patches:
    bar_height = bar.get_height()
    bar_center_x = bar.get_x() + bar.get_width() / 2.
    splot.annotate(
        format(bar_height, '.3f'),
        (bar_center_x, bar_height),
        ha='center',
        va='center',
        xytext=(0, 9),
        textcoords='offset points',
    )