In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively print the full path of every file below the Kaggle input directory.
for folder, _, files in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session


In [None]:
import numpy as np
import pandas as pd
# Show up to 105 columns when displaying DataFrames.
# The fully qualified option name is required: the bare 'max_columns' pattern also
# matches 'styler.render.max_columns' in recent pandas releases and raises OptionError.
pd.set_option('display.max_columns', 105)
import matplotlib.pyplot as plt
import seaborn as sns
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's default theme to all subsequent plots.
sns.set()

# sklearn
from sklearn.preprocessing import StandardScaler 
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
#import xgboost as xgb
from xgboost import XGBClassifier

from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

import joblib

import warnings
# Silence library warnings to keep cell output readable.
# NOTE(review): the last call ignores every warning category, which makes the two
# category-specific filters before it redundant and also hides problems that are
# only reported as warnings (e.g. failed or non-converged model fits).
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore")

from subprocess import check_output
# Show the contents of the input directory as reported by `ls`.
input_listing = check_output(["ls", "../input"])
print(input_listing.decode("utf8"))


This notebook explores and analyzes heart failure. Please check out the feature explanations by RachidYZ:

https://www.kaggle.com/andrewmvd/heart-failure-clinical-data/discussion/193109

In [None]:
# Load the heart-failure clinical records into a DataFrame and preview the first rows.
csv_path = '../input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# (number of rows, number of columns) of the loaded dataset.
df.shape

The target variable is DEATH_EVENT. There is a class imbalance when the value counts are reviewed.

In [None]:
# Count how many rows fall into each class of the target column.
target = df['DEATH_EVENT']
target.value_counts()

Let's explore it further with Pandas Profiling.

In [None]:
import pandas_profiling as pp
from pandas_profiling import ProfileReport

# Build the profiling report; as the cell's last expression it is rendered inline.
profile_options = {'style': {'full_width': True}}
ProfileReport(
    df,
    title='Pandas Profiling Report for training dataset',
    html=profile_options,
)

We can see that there are no missing values that need handling. We also see a couple of high-correlation relationships between a few features. Let's explore this in more detail.

First we will plot the time feature against the target.


In [None]:
# Distribution of the follow-up time for each value of the target.
ax = sns.boxplot(data=df, x="DEATH_EVENT", y="time")

In [None]:
# Average follow-up time, computed separately for each value of the target.
no_death_rows = df['DEATH_EVENT'] == 0
death_rows = df['DEATH_EVENT'] == 1

mean_time_death_0 = df.loc[no_death_rows, 'time'].mean()
mean_time_death_1 = df.loc[death_rows, 'time'].mean()

print(f'Average followup time of a healthy person is {mean_time_death_0}.')
print(f'Average followup time of an un-healthy person is {mean_time_death_1}.')

This raises some flags. Note that the 25th-percentile value of the left box is nearly as high as the 75th-percentile value of the right one! Searching through the forums, I found the following post:

https://www.kaggle.com/andrewmvd/heart-failure-clinical-data/discussion/178372

That makes sense: healthy patients will not need to follow up shortly, while those with complications might check back into the hospital soon. Since the time feature will not be available for new examples, let's leave it out for now.

In [None]:
# Remove the follow-up time feature, which is not available for new patients.
# Rebinding (instead of an in-place drop) with errors='ignore' keeps this cell
# idempotent: re-running it after 'time' is already gone no longer raises KeyError.
df = df.drop(columns=['time'], errors='ignore')

Another feature flagged by the Pandas Profile report is ejection_fraction. Let's look at it against the target.

In [None]:
# Distribution of ejection fraction for each value of the target.
ax = sns.boxplot(data=df, x="DEATH_EVENT", y="ejection_fraction")

Ejection fraction, which is the percentage of blood leaving the heart from a contraction, is higher in healthy patients than in unhealthy patients. Reviewing the Pandas Profiling report, the max value is much higher than the others. This one super-efficient heart is pumping out 80% of the blood, which is much higher than even the 95th-percentile value of 60%. Let's remove this outlier.

In [None]:
# Display the row(s) whose ejection fraction equals the outlying value of 80.
df.loc[df['ejection_fraction'] == 80]

In [None]:
# Drop, in place, every row whose ejection fraction equals 80.
outlier_rows = df.loc[df['ejection_fraction'] == 80].index
df.drop(index=outlier_rows, inplace=True)

In [None]:
# Preview the frame after the column and outlier removal above.
df.head()

Now we can proceed with modeling. Using `GridSearchCV` to do some light hyperparameter tuning, we will compare the results of three models.

In [None]:
from sklearn.model_selection import train_test_split
# Separate the features from the target column, then hold out 30% of the rows
# as a test set (fixed random_state so the split is repeatable).
X = df.drop(columns='DEATH_EVENT')
y = df['DEATH_EVENT']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

In [None]:
# Preprocessing shared by all models: a single standardisation step named 'scaler'.
# No imputation step is included.
scaling_steps = [('scaler', StandardScaler())]
numerical_transformer = Pipeline(steps=scaling_steps)

In [None]:
# One pipeline per candidate classifier (logistic regression, SVC, XGBoost).
# Each runs the shared scaling step first and names its estimator step, so that
# grid-search parameters can be addressed as "<step name>__<parameter>".
pipe_Logistic = Pipeline(steps=[
    ('numerical_transformer', numerical_transformer),
    ('Logistic', LogisticRegression()),
])

pipe_SVC = Pipeline(steps=[
    ('numerical_transformer', numerical_transformer),
    ('SVC', SVC()),
])

pipe_XGB = Pipeline(steps=[
    ('numerical_transformer', numerical_transformer),
    ('XGB', XGBClassifier()),
])



In [None]:
# Candidate pipelines to compare, in the order they are reported below.
list_pipelines = [pipe_Logistic, pipe_SVC, pipe_XGB]

Cross validation for preliminary results:

In [None]:
# Print a small table of 5-fold cross-validation scores (mean, std, min) per pipeline.
header_cells = ["model", "\t", "mean", "\t", "std", "\t", "\t", "min"]
print(" ".join(header_cells))
print("-+" * 30)
for pipe in list_pipelines:
    # The second pipeline step is the estimator; its step name labels the row.
    model_name = pipe.steps[1][0]
    scores = cross_val_score(pipe, X, y, cv=5)
    print(f"{model_name} \t {scores.mean():08.6f} \t {scores.std():08.6f} \t {scores.min():08.6f}")

In [None]:
# Hyperparameter grid for the logistic-regression pipeline: 10 values of C on a
# log scale from 1e-3 to 1e3, combined with both l1 and l2 penalties.
# The solver is pinned to 'liblinear' because LogisticRegression's default solver
# ('lbfgs') does not support the l1 penalty: every l1 candidate would fail to fit
# and be scored as NaN, and with warnings silenced that failure goes unnoticed.
parameters_Logistic = {"Logistic__C": np.logspace(-3, 3, 10),
                       "Logistic__penalty": ["l1", "l2"],
                       "Logistic__solver": ["liblinear"]}

# 5-fold cross-validated grid search on the training split, using all CPU cores.
gscv_Logistic = GridSearchCV(pipe_Logistic, parameters_Logistic, n_jobs=-1, verbose=0, cv=5)
gscv_Logistic.fit(X_train, y_train)


In [None]:
# Report the best cross-validated score and the parameters that achieved it.
print(
    f'Best Logistic score is {gscv_Logistic.best_score_}',
    f'Best Logistic params are {gscv_Logistic.best_params_}',
    sep='\n',
)

In [None]:
# Accuracy of the tuned logistic-regression pipeline on the held-out test split.
logistic_test_pred = gscv_Logistic.predict(X_test)
print(accuracy_score(y_test, logistic_test_pred))

In [None]:
# Persist the best pipeline found by the logistic-regression grid search to
# 'Logistic_HF.pkl' (relative to the current working directory).
joblib.dump(gscv_Logistic.best_estimator_, 'Logistic_HF.pkl')

In [None]:
# Hyperparameter grid for the SVC pipeline: regularisation strength, kernel
# coefficient and kernel type.
parameters_SVC = dict(
    SVC__C=[1, 10, 100],
    SVC__gamma=[0.1, 0.001, 0.0001],
    SVC__kernel=['linear', 'rbf'],
)

# 5-fold cross-validated grid search on the training split, using all CPU cores.
gscv_SVC = GridSearchCV(estimator=pipe_SVC, param_grid=parameters_SVC,
                        cv=5, n_jobs=-1, verbose=0)
gscv_SVC.fit(X_train, y_train)

In [None]:
# Report the best cross-validated score and parameters from the SVC grid search.
print(f'Best SVC score is {gscv_SVC.best_score_}')
print(f'Best SVC params are {gscv_SVC.best_params_}')

In [None]:
# Accuracy of the tuned SVC pipeline on the held-out test split.
svc_test_pred = gscv_SVC.predict(X_test)
print(accuracy_score(y_test, svc_test_pred))

In [None]:
# Persist the best pipeline found by the SVC grid search to 'SVC_HF.pkl'
# (relative to the current working directory).
joblib.dump(gscv_SVC.best_estimator_, 'SVC_HF.pkl')

In [None]:
# Reload the SVC pipeline written to 'SVC_HF.pkl' and predict on the full feature
# matrix X (which includes the rows the model was trained on).
# NOTE(review): presumably a round-trip check that the saved artifact is usable;
# `prediction` is not used in the cells visible here.
load_clf = joblib.load('SVC_HF.pkl')
prediction = load_clf.predict(X)

In [None]:
import pickle

# Baseline model: an untuned logistic regression fitted on the full, unscaled dataset.
Logi = LogisticRegression()
Logi.fit(X, y)

# NOTE(review): this writes to the same 'Logistic_HF.pkl' path the tuned
# grid-search pipeline was dumped to in an earlier cell, replacing that artifact.
# Confirm the overwrite is intended.
# The context manager ensures the file handle is flushed and closed; the original
# passed a bare open() result to pickle.dump and never closed it.
with open('Logistic_HF.pkl', 'wb') as model_file:
    pickle.dump(Logi, model_file)

In [None]:
# Hyperparameter grid for the XGBoost pipeline: tree depth and minimum child
# weight, with the number of estimators fixed at 300.
parameters_XGB = dict(
    XGB__max_depth=[2, 3, 5],
    XGB__min_child_weight=[1, 6],
    XGB__n_estimators=[300],
)

# 5-fold cross-validated grid search on the training split, using all CPU cores.
gscv_XGB = GridSearchCV(estimator=pipe_XGB, param_grid=parameters_XGB,
                        cv=5, n_jobs=-1, verbose=0)
gscv_XGB.fit(X_train, y_train)

In [None]:
# Report the best cross-validated score and parameters from the XGBoost grid search.
print(f'Best XGB score is {gscv_XGB.best_score_}')
print(f'Best XGB params are {gscv_XGB.best_params_}')

In [None]:
# Accuracy of the tuned XGBoost pipeline on the held-out test split.
xgb_test_pred = gscv_XGB.predict(X_test)
print(accuracy_score(y_test, xgb_test_pred))

In [None]:
# Persist the best pipeline found by the XGBoost grid search to 'XGB_HF.pkl'
# (relative to the current working directory).
joblib.dump(gscv_XGB.best_estimator_, 'XGB_HF.pkl')