In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")


In [None]:
# Load the heart-failure clinical records CSV into the working dataframe `df`.
df=pd.read_csv("../input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv").copy()

In this work we will predict death in heart-failure patients. Below is the dataframe that we will work on. The target column is 1 if the patient died during the follow-up period and 0 otherwise.

In [None]:
# Preview the first rows of the dataset.
df.head()

If we look at the column information, we see that there are no null values (there are 299 samples and every column has 299 non-null entries).

In [None]:
# Show each column's dtype and non-null count.
df.info()

## DATA ANALYSIS

### Target Variable

The target variable "DEATH_EVENT" has more observations with value 0 than with value 1, i.e. the classes are imbalanced.

In [None]:
# Print how many observations fall into each target class (first 0, then 1).
for label in (0, 1):
    is_label = df["DEATH_EVENT"] == label
    print(is_label.sum())

In [None]:
# Bar chart of the target class frequencies.
# `x` must be passed by keyword: recent seaborn releases accept only `data`
# positionally, so the old positional form fails with a TypeError.
sns.countplot(x="DEATH_EVENT", data=df, palette="Set3")
plt.show()

### Categorical Variables

As we can see in the dataframe, the columns "anaemia", "diabetes", "high_blood_pressure", "sex" and "smoking" are categorical variables. We will now apply a chi-square test of independence to check the association between each of these variables and the target variable. If the resulting p-value is greater than 0.05 (alpha = 5%), we have no evidence of an association between the target and that categorical variable, so we treat them as independent. Such a variable can be dropped, because a feature that has no effect on the target is not useful for the model.

In [None]:
# Names of the binary/categorical feature columns that are tested against the target below.
cat_columns=["anaemia","diabetes","high_blood_pressure","sex","smoking"]

In [None]:
import scipy.stats

# Chi-square test of independence between the target and every categorical
# feature: build the contingency table, run the test, print statistic and p-value.
for feature in cat_columns:
    print("Chi-Square Test Between", "\033[4m", "DEATH_EVENT", "\033[0m", "and", "\033[4m", feature, '\033[0m', "\n")
    contingency = pd.crosstab(index=df["DEATH_EVENT"], columns=df[feature])
    chi2, p, dof, expected = scipy.stats.chi2_contingency(contingency)
    print("Chi=", chi2, "\033[1m", "p_value=", p, "\033[0m", "\n")
 

All categorical variables have a p-value greater than 0.05, so we will not use any of them.
We can illustrate this with an example: take the "diabetes" variable, whose cross table is shown below.
Looking at the numbers, if a patient is not diabetic (0), the probability of death is 56/174 = 0.3218.
On the other hand, if the patient is diabetic (1), the probability of death is 40/125 = 0.32.
So, whether or not the patient is diabetic, the probability of death is practically the same; diabetes therefore shows no effect on death in this data set.

In [None]:
# Contingency table of the target (rows) against diabetes status (columns).
diabetes_table = pd.crosstab(df["DEATH_EVENT"], df["diabetes"])
print(diabetes_table)

In [None]:
# Counts of diabetic / non-diabetic patients, split by outcome.
# `x` is passed by keyword because recent seaborn releases accept only `data`
# positionally; the old positional form fails with a TypeError.
sns.countplot(x="diabetes", hue="DEATH_EVENT", data=df, palette="Set3")
plt.show()

### Numeric Variables

Because none of the categorical variables shows a significant association with the target, we will continue with the numeric data only.

In [None]:
# Keep only the numeric columns (plus the target), then separate the target
# from the numeric predictors used for the correlation analysis.
num_data = df.drop(columns=cat_columns)
target = num_data["DEATH_EVENT"]
num_corr = num_data.drop(columns="DEATH_EVENT")

#### Correlation Matrix

First, we compute the correlations between the numeric variables. If two variables are highly correlated (negatively or positively), we will keep only one of them.
As you can see in the correlation table, there is no high correlation between any pair of variables, so we can use all of them.

In [None]:
# Annotated heat map of the pairwise correlations between the numeric predictors.
fig, ax = plt.subplots(figsize=(7, 7))
corr = num_corr.corr()
sns.heatmap(corr, annot=True, fmt='.2f', linewidths=0.05, cmap="GnBu", ax=ax)
plt.show()

#### t-test

Now we will run a two-sample t-test between each numeric variable and the target. The result shows whether the variable depends on the target.
In the two-sample t-test, the null hypothesis H0 states that "the means of the two samples are equal". If the p-value is below 0.05, H0 is rejected: the two groups have different means, so there is a meaningful difference between them and the variable is related to the target. We will eliminate the variables with a p-value greater than 0.05, because they show no dependence on the target.

In [None]:
def two_sample_t_test(cl1, cl2, data, alpha=0.05):
    """Compare the mean of ``cl1`` between the two groups defined by ``cl2``.

    The rows of ``data`` are split into the group where ``data[cl2] == 1`` and
    the group where ``data[cl2] == 0``. Levene's test decides whether the two
    groups may be treated as having equal variances; an independent two-sample
    t-test is then run accordingly and its verdict is printed.

    Parameters
    ----------
    cl1 : str
        Name of the numeric column whose group means are compared.
    cl2 : str
        Name of the binary (0/1) grouping column.
    data : pandas.DataFrame
        Frame containing both columns.
    alpha : float, default 0.05
        Significance level used for both Levene's test and the t-test.

    Returns
    -------
    tuple
        ``(statistic, p_value)`` of the t-test, so callers can reuse the
        result instead of parsing the printed output.
    """
    import scipy.stats as stats

    # Missing values are dropped so they cannot turn the statistics into NaN.
    group_one = data.loc[data[cl2] == 1, cl1].dropna()
    group_zero = data.loc[data[cl2] == 0, cl1].dropna()
    print("\033[1m", "'{}' - '{}'".format(cl1, cl2), "\033[0m")

    # Levene's test: a small p-value means the variances differ, in which case
    # the t-test is run without the equal-variance assumption.
    _, levene_p = stats.levene(group_one, group_zero)
    equal_var = not (levene_p < alpha)

    print("RESULT:")
    stat, p = stats.ttest_ind(group_one, group_zero, equal_var=equal_var)
    print("\033[1m", "p-val=", p, "\033[0m")
    if p < alpha:
        print("\033[1m", "H0--> Reject. They have different mean\n", "\033[0m")
    else:
        print("\033[1m", "H0--> Fail to Reject.\n", "\033[0m")
    return stat, p

In [None]:
# Run the t-test for every numeric predictor against the target.
# The target column itself is excluded: comparing "DEATH_EVENT" with itself
# only produces two constant groups and a meaningless test result.
for col in num_data.columns.drop("DEATH_EVENT"):
    two_sample_t_test(col, "DEATH_EVENT", num_data)

According to the t-test results, the variables "creatinine_phosphokinase" and "platelets" have p-values greater than 0.05. As the box plot below illustrates for "platelets", the central value of the variable is about the same whether or not the patient died. These variables are independent of the target, so we will not use them in the model.

In [None]:
# Distribution of platelets for each outcome class, drawn on a 5x5 inch figure.
fig, ax = plt.subplots(figsize=(5, 5))
sns.boxplot(data=df, x="DEATH_EVENT", y="platelets", palette=["lightblue", "pink"], ax=ax)

In contrast, if we look at the association between age and "DEATH_EVENT", we can see that the patients with "DEATH_EVENT" equal to 1 are older on average.

In [None]:
# Distribution of age for each outcome class.
sns.boxplot(x="DEATH_EVENT",y="age", data=df, palette=["lightblue", "pink"])

In [None]:
# Numeric features kept after the t-tests, i.e. everything except
# "creatinine_phosphokinase" and "platelets".
selected_features = ["age", "ejection_fraction", "serum_creatinine", "serum_sodium", "time"]
num_selected = num_data[selected_features].copy()

#### Outliers

Now we will look at the outliers. "serum_creatinine" and "serum_sodium" seem to have many outliers, but if we look into these variables, their outlying values are not impossible measurements. For example, a serum creatinine value of up to 10 mg/dL is still a plausible measurement. The same applies to the serum_sodium variable. So, we will not delete them.

In [None]:
# One horizontal box plot per selected feature, to inspect outliers.
# The series is passed as `x=` explicitly: a bare positional argument is read
# as `x` by old seaborn releases but as `data` by recent ones, which changes
# the plot. `color` replaces `palette`, which is deprecated without `hue`.
for i in num_selected.columns:
    sns.boxplot(x=num_selected[i], color="lightblue")
    plt.show()

#### Train-Test Split

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the samples for testing; the fixed random_state makes the split reproducible.
# NOTE(review): the split is not stratified although the target classes are imbalanced —
# consider stratify=target; left unchanged here because it would alter the reported results.
X_train, X_test, y_train, y_test= train_test_split(num_selected,target,test_size=0.30, random_state=42)

#### Scaling 

Looking at the data, the features are measured in different units and value ranges, so we will scale them.

In [None]:
# Preview the unscaled training features.
X_train.head()

In [None]:
# Work on copies so the unscaled train/test frames stay available.
X_train_scaled=X_train.copy()
X_test_scaled=X_test.copy()

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature with a scaler fitted on the training split only,
# then apply that same scaler to the test split so no test-set statistics
# leak into the transformation.
for column in X_train_scaled.columns:
    column_scaler = StandardScaler().fit(X_train[[column]])
    X_train_scaled[column] = column_scaler.transform(X_train[[column]])
    X_test_scaled[column] = column_scaler.transform(X_test[[column]])

In [None]:
# Preview the training features after scaling.
X_train_scaled.head()

After scaling, the features are on a comparable scale.

## MODELS

First of all, I will write a function that reports all the results of a model in one place. It prints the train and test scores and the cross-validation results, and plots the ROC curves.

In [None]:
def _plot_roc(model, X, y, title):
    """Plot the ROC curve of ``model`` on ``(X, y)`` with its AUC in the legend."""
    # The positive-class probabilities feed both the curve and the AUC, so the
    # number in the legend describes the curve that is actually drawn (an AUC
    # computed from hard 0/1 predictions would not).
    scores = model.predict_proba(X)[:, 1]
    auc = roc_auc_score(y, scores)
    fpr, tpr, _ = roc_curve(y, scores)
    plt.figure()
    plt.plot(fpr, tpr, label='AUC (area = %0.2f)' % auc)
    plt.plot([0, 1], [0, 1], 'r--')  # chance-level diagonal for reference
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive ')
    plt.ylabel('True Positive ')
    plt.title(title)
    plt.legend(loc="lower right")  # without a legend the AUC label is never shown
    plt.show()


def result(model,X_train,X_test,y_train,y_test):
    """Print evaluation metrics for a fitted classifier and plot its ROC curves.

    Parameters
    ----------
    model : estimator
        Already fitted classifier exposing ``predict`` and ``predict_proba``.
    X_train, X_test : array-like
        Feature matrices of the training and test splits.
    y_train, y_test : array-like
        Labels of the training and test splits.

    Output, in order: accuracy and classification report on the test split,
    the same on the training split, the mean 5-fold cross-validation score on
    each split, then the ROC curve (with AUC) for the test and training splits.
    Nothing is returned.
    """
    print( "\033[1m","         ****** RESULT ****** ", "\033[0;0m")

    test_pred = model.predict(X_test)
    print("\033[1m","TEST Accuracy=","\033[0m",accuracy_score(y_test, test_pred))
    print("\033[1m","TEST Report=\n","\033[0m",classification_report(y_test, test_pred),"\n")

    train_pred = model.predict(X_train)
    print("\033[1m","TRAIN Accuracy=","\033[0m",accuracy_score(y_train, train_pred))
    print("\033[1m","TRAIN Report=\n","\033[0m",classification_report(y_train, train_pred),"\n")

    # cross_val_score is handed each split separately with cv=5, so these
    # numbers come from its own fold-wise runs on that split rather than from
    # the predictions of the fitted `model` reported above.
    print("\033[1m","Cross Validation TEST:\n","\033[0m",cross_val_score(model, X_test, y_test, cv = 5).mean())
    print("\033[1m","Cross Validation TRAIN:\n","\033[0m",cross_val_score(model, X_train, y_train, cv = 5).mean(),"\n")

    print("\033[1m","ROC CURVES","\033[0m")
    _plot_roc(model, X_test, y_test, 'ROC-TEST')
    _plot_roc(model, X_train, y_train, 'ROC-TRAIN')
   

#### LOGISTIC REGRESSION

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, roc_auc_score,roc_curve
from sklearn.model_selection import GridSearchCV, cross_val_score

In [None]:
# Fit a logistic regression with default settings on the scaled training data.
# `fit` is called on `logreg`, which is the object passed to `result` below.
logreg=LogisticRegression()
model=logreg.fit(X_train_scaled,y_train)

In [None]:
# Report metrics and ROC curves for the logistic regression model.
result(logreg,X_train_scaled,X_test_scaled,y_train,y_test)

#### Linear SVC

LinearSVC has no `predict_proba` method, so we wrap it in `CalibratedClassifierCV` to obtain probabilities.

In [None]:
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV

# LinearSVC offers no predict_proba, so it is wrapped in CalibratedClassifierCV,
# which provides calibrated class probabilities.
# random_state is fixed (same seed as the train/test split) so the fitted
# model is reproducible across runs.
svm = LinearSVC(random_state=42)
clf = CalibratedClassifierCV(svm)
clf.fit(X_train_scaled, y_train)

In [None]:
# Report metrics and ROC curves for the calibrated linear SVC.
result(clf,X_train_scaled,X_test_scaled,y_train,y_test)

#### NAIVE BAYES

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
# Fit a Gaussian Naive Bayes classifier on the scaled training data.
nb=GaussianNB().fit(X_train_scaled,y_train)

In [None]:
# Report metrics and ROC curves for the Naive Bayes model.
result(nb,X_train_scaled,X_test_scaled,y_train,y_test)

#### SUPPORT VECTOR MACHINES

In [None]:
from sklearn.svm import SVC

# Linear-kernel SVC with probability estimates enabled (needed for the ROC curve).
# The probability estimates use internally shuffled data, so random_state is
# fixed (same seed as the train/test split) to keep them reproducible.
svm_model = SVC(kernel="linear", probability=True, random_state=42).fit(X_train_scaled, y_train)
svm_model

In [None]:
# Report metrics and ROC curves for the linear-kernel SVC.
result(svm_model,X_train_scaled,X_test_scaled,y_train,y_test)

#### TUNING

Grid search was applied to logistic regression, linear SVC and SVC, but it did not change the results, so that process is not shown. When we compare the train and test accuracy scores, we see gaps of about 0.08 to 0.10. These gaps indicate that the models do not generalize well enough. This may be caused by the size of our data set: logistic regression, linear SVC and SVC perform well when the data set is large, and Naive Bayes performs well on high-dimensional data. (After feature elimination we had 5 features, and a sample size of 299 is not a large data set.)