In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
pip install pycaret

In [None]:
import pycaret
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import scipy

#Suppressing all warnings
warnings.filterwarnings("ignore")

%matplotlib inline

In [None]:
# Load the heart-failure clinical records from the Kaggle dataset mount.
df = pd.read_csv(
    "../input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv"
)
# Preview the first five rows as a quick sanity check of columns and values.
df.head(n=5)

 #  Data Review

In [None]:
# (number of rows, number of columns) of the loaded dataset.
df.shape


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
df.describe()

# Dataset's Null Values

In [None]:
# Number of missing values in each column.
df.isna().sum()

# Death Event Distribution

In [None]:
import plotly.express as px

# Interactive pie chart showing the share of each DEATH_EVENT class.
fig = px.pie(
    df,
    names='DEATH_EVENT',
    title='Distribution of Death Events in Patients',
    width=700,
    height=500,
)
fig.show()

# Correlation Heatmap (isolates important variables)

In [None]:
# Pairwise correlation matrix of the columns of df; reused by the next cell.
corr = df.corr()

# plt.subplots returns (figure, axes) in that order, so unpack accordingly.
fig, ax = plt.subplots(figsize=(15, 15))

# Draw on the axes created above instead of relying on the implicit current
# axes; vmin=-1 anchors the colour scale at the lowest possible correlation.
sns.heatmap(corr, vmin=-1, cmap='coolwarm', annot=True, ax=ax)
plt.show()

In [None]:
# Keep only the variables whose absolute correlation with DEATH_EVENT exceeds
# 0.1. Keep in mind that correlation does not imply causation.
corr.loc[corr['DEATH_EVENT'].abs() > 0.1, 'DEATH_EVENT']

 The filter above keeps the variables whose absolute correlation with `DEATH_EVENT` exceeds 0.1, i.e. the features most likely to be useful when predicting it. Keep in mind that correlation is only a rough proxy for variable importance and does not imply causation. `age` was excluded after trial and error.

# Pycaret Data Preprocessing

In [None]:
from pycaret.classification import *

# Initialise the PyCaret classification experiment on df with DEATH_EVENT as
# the target.
exp_reg101 = setup(
    data=df,
    target='DEATH_EVENT',
    session_id=123,
    train_size=.8,
    # Columns left out of modelling.
    ignore_features=[
        'age',
        'anaemia',
        'diabetes',
        'creatinine_phosphokinase',
        'high_blood_pressure',
        'platelets',
        'sex',
        'smoking',
    ],
    # Explicitly declare ejection_fraction as a numeric feature.
    numeric_features=['ejection_fraction'],
)

# Model Creation with Pycaret

In [None]:
# Train and cross-validate the available classifiers and keep the single
# best-scoring model.
best = compare_models()

In [None]:
# Run the same comparison again, this time keeping the five best models
# (n_select=5 makes compare_models return a list).
top5 = compare_models(n_select = 5)

In [None]:
# Hyper-parameter tune each of the five selected models, preserving their order.
tuned_top5 = list(map(tune_model, top5))

In [None]:
# Wrap each tuned model in an ensemble, preserving order.
# Author's observation: this combined version of the top 5 models gave the
# highest Accuracy, AUC and Precision.
bagged_top5 = list(map(ensemble_model, tuned_top5))

In [None]:
# Blend the five selected models into a single voting ensemble.
# NOTE(review): this uses the untuned top5, not tuned_top5 -- confirm intended.
blender = blend_models(estimator_list = top5)

In [None]:
# Hyper-parameter tune the single best model found by compare_models().
tuned_best = tune_model(best)

In [None]:
# Stack the five selected models, leaving the meta-model at PyCaret's default.
stack_soft = stack_models(top5)

In [None]:
# Stack the same five models, this time using the single best model as the
# meta-model that combines their predictions.
stack_soft2 = stack_models(top5, meta_model=best)

In [None]:
# Train three models by their PyCaret IDs ('rf', 'et', 'catboost'), in that
# order, with the per-model score grid suppressed.
rf, et, catboost = (
    create_model(model_id, verbose=False)
    for model_id in ('rf', 'et', 'catboost')
)

In [None]:
# Blend the three models using method='soft'.
blend_soft = blend_models(estimator_list = [rf, et, catboost], method = 'soft')

In [None]:
# Blend the same three models using method='hard', for comparison with blend_soft.
blend_hard = blend_models(estimator_list = [rf, et, catboost], method = 'hard')

In [None]:
# Print the list of bagged models built from the tuned top five.
print(bagged_top5)

# Rupesh Deshmukh's code Analysis below

In [None]:
from sklearn.model_selection import train_test_split

# Feature matrix (four selected predictors) and binary target.
x = df.loc[:, ['ejection_fraction', 'serum_creatinine', 'serum_sodium', 'time']]
y = df['DEATH_EVENT']

# Hold out 20% of the rows as a test set; the seed is fixed for repeatability.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=1
)


In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, plot_confusion_matrix #Accuracy is the only metric !?
svm=SVC()
svm.fit(x_train,y_train)
p_1=svm.predict(x_test)
s_1=accuracy_score(y_test,p_1)
print("Support Vector Machine Success Rate :", "{:.2f}%".format(100*s_1))
plot_confusion_matrix(svm, x_test, y_test)
plt.show()

In [None]:
from xgboost import XGBClassifier
from bayes_opt import BayesianOptimization
from sklearn.metrics import roc_auc_score # AUC score is the only metric !?
from sklearn.model_selection import StratifiedKFold, GridSearchCV

params = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5]
        }

xgb = XGBClassifier(learning_rate=0.01, n_estimators=1000, objective='binary:logistic')

skf = StratifiedKFold(n_splits=5, shuffle = True, random_state = 0)

grid = GridSearchCV(estimator=xgb, param_grid=params, n_jobs=4, 
                    cv=skf.split(x_train,y_train), verbose=0 )

grid.fit(x_train,y_train,early_stopping_rounds=30,eval_set=[(x_test, y_test)])
p2x = grid.best_estimator_.predict(x_test)
s2x=accuracy_score(y_test,p2x)
print("Extra Gradient Booster Classifier Success Rate :", "{:.2f}%".format(100*s2x))
plot_confusion_matrix(grid.best_estimator_, x_test, y_test)
plt.show()