In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import LabelEncoder, RobustScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, KFold, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, roc_auc_score, roc_curve, confusion_matrix, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.impute import KNNImputer

from xgboost import XGBClassifier
import lightgbm as lgbm

In [None]:
# Location of the heart-failure clinical records CSV inside the Kaggle input directory.
path = '/kaggle/input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv'

# Load the records into a DataFrame and preview the first rows.
df = pd.read_csv(path)
df.head()

In [None]:
# (rows, columns) of the loaded dataset.
df.shape

In [None]:
# Count missing values per column and show them as a one-column annotated heatmap.
missing_counts = df.isna().sum().to_frame()

fig, ax = plt.subplots(figsize=(9, 7))
ax.set_title('Missing values', fontweight='bold')
sns.heatmap(missing_counts, annot=True, fmt='d', cmap='turbo', ax=ax)
ax.set_xlabel('Ammount missing')
plt.show()

In [None]:
# Class balance of the target: number of rows per DEATH_EVENT value.
sns.countplot(data=df, x='DEATH_EVENT')
plt.show()

In [None]:
# Print the DataFrame summary (columns, dtypes, non-null counts).
df.info()

In [None]:
# Stacked histograms of every float-typed column, coloured by DEATH_EVENT,
# laid out on a 2x2 grid.
float_cols = df.select_dtypes('float')
plt.figure(figsize=(19, 12))

for position, feature in enumerate(float_cols.columns, start=1):
    plt.subplot(2, 2, position)
    sns.histplot(
        data=df,
        x=feature,
        hue='DEATH_EVENT',
        multiple='stack',
        edgecolor='black',
        alpha=0.6,
    )
    plt.title(f'Histplot of DEATH EVENT by {feature}')
    sns.despine()

plt.tight_layout()
plt.show()

In [None]:
# KDE plots of every integer-typed feature (target excluded), split by DEATH_EVENT.
# np.integer matches every integer width, so the selection does not depend on the
# platform's default `int` size the way the 'int' alias can.
int_cols = df.drop('DEATH_EVENT', axis=1).select_dtypes(include=np.integer).columns

# Size the grid from the number of features instead of hard-coding 7 rows, which
# left several empty rows and would fail with more than 14 features.
n_grid_cols = 2
n_grid_rows = max(1, (len(int_cols) + n_grid_cols - 1) // n_grid_cols)
plt.figure(figsize=(15, 3.4 * n_grid_rows))

for position, feature in enumerate(int_cols, start=1):
    plt.subplot(n_grid_rows, n_grid_cols, position)
    sns.kdeplot(x=feature, hue='DEATH_EVENT', data=df)
    sns.despine()
    # These are density estimates, not histograms, so the title says so.
    plt.title(f'KDE plot of DEATH EVENT by {feature}')
plt.tight_layout()
plt.show()

In [None]:
# Pairwise correlation matrix of all columns (target included).
# A diverging colormap centred on 0 and pinned to [-1, 1] makes sign and strength
# readable; the previous qualitative 'Accent' palette binned continuous values
# into unrelated colours.
plt.figure(figsize=(19, 7))
plt.title('Feature correlation matrix')
sns.heatmap(df.corr(), annot=True, fmt='.2f', cmap='coolwarm', vmin=-1, vmax=1, center=0)
plt.show()

In [None]:
# Separate the features from the DEATH_EVENT target and hold out 25% for testing.
mms = MinMaxScaler()
x = df.drop('DEATH_EVENT', axis=1)
y = df['DEATH_EVENT']
X_train, X_test, y_train, y_test = train_test_split(x, y, train_size=0.75, random_state=42)

# Learn the min/max on the training split only, then apply that same mapping to
# the test split. Calling fit_transform on the test split would re-fit the scaler
# on test data: the two splits would be scaled differently and test-set
# statistics would leak into preprocessing.
X_train = mms.fit_transform(X_train)
X_test = mms.transform(X_test)

In [None]:
# prepare configuration for cross validation test harness
seed = 42

# (short name, estimator) pairs compared in the cross-validation loop.
# Estimators that accept a seed receive `seed` so that a fresh
# Restart & Run All reproduces the same scores.
models = [
    ('LR', LogisticRegression(max_iter=300)),
    ('DTC', DecisionTreeClassifier(criterion='entropy', random_state=seed)),
    ('KNC', KNeighborsClassifier()),
    ('LDA', LinearDiscriminantAnalysis()),
    ('GNB', GaussianNB()),
    ('RFC', RandomForestClassifier(random_state=seed)),
    ('SVM', SVC()),
    ('XGB', XGBClassifier(
        use_label_encoder=False,
        objective="binary:logistic",
        learning_rate=1.0e-3,
        n_estimators=800,
        n_jobs=4,
        eval_metric='error',
        random_state=seed,
    )),
    ('LightGbm', lgbm.LGBMClassifier(
        is_unbalance=True,
        seed=seed,
        boosting_type='goss',
        device_type='cpu',
        learning_rate=1.0e-3,
        max_depth=4,
        n_estimators=800,
        n_jobs=4,
        num_leaves=31,
        reg_alpha=0.0,
        reg_lambda=0.0,
    )),
]

In [None]:
# Cross-validate every candidate model on the training split and compare accuracy.
results = []
names = []
scoring = 'accuracy'

# One shared splitter for all models. It is stratified because DEATH_EVENT is
# imbalanced, and shuffled with a fixed seed so folds do not depend on row order
# and every model is scored on the same folds.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)

for name, model in models:
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    print(f'name: {name}, mean_cv_result: {cv_results.mean()}, std_cv_result: {cv_results.std()}')

# boxplot algorithm comparison: one box of per-fold scores per model
fig, ax = plt.subplots()
fig.suptitle('Algorithm Comparison')
ax.boxplot(results)
ax.set_xticklabels(names)
ax.set_ylabel(scoring)
plt.show()

In [None]:
# Fit a 1000-tree random forest on the training split and score it on the test split.
RFC_clf = RandomForestClassifier(n_estimators=1000, random_state=seed)
RFC_clf.fit(X_train, y_train)
RFC_preds = RFC_clf.predict(X_test)

# ROC AUC ranks samples by a continuous score, so use the predicted probability
# of the positive class (column 1) rather than hard 0/1 labels.
RFC_proba = RFC_clf.predict_proba(X_test)[:, 1]

# sklearn metrics take (y_true, y_score/y_pred) in that order; roc_auc_score is
# not symmetric, so the previous swapped call computed a different quantity.
print(f'ROC_AUC_score = {roc_auc_score(y_test, RFC_proba)}')
print(f'Accuracy score = {accuracy_score(y_test, RFC_preds)}')

In [None]:
# Classification report for the random-forest predictions on the held-out test split.
print(classification_report(y_test, RFC_preds))

In [None]:
# Confusion matrix (true labels first, predictions second) for the same predictions.
print(confusion_matrix(y_test, RFC_preds))

In [None]:
def plot_roc_auc(actual, predict):
    """Draw a ROC curve on the current matplotlib axes, titled with its ROC AUC.

    Parameters
    ----------
    actual
        True labels, forwarded to ``roc_curve`` and ``roc_auc_score``.
    predict
        Predicted values for the same samples, forwarded to the same functions.

    The curve is drawn in blue with a red diagonal from (0, 0) to (1, 1) as a
    reference line. Nothing is returned and ``plt.show()`` is not called.
    """
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(actual, predict)

    # Model curve first, then the diagonal reference line.
    plt.plot(false_pos_rate, true_pos_rate, color='b')
    plt.plot([0.0, 1.0], [0.0, 1.0], color='r')

    # Fixed axes; a little headroom above TPR = 1 keeps the top of the curve visible.
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])

    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')

    auc_value = roc_auc_score(actual, predict)
    plt.title(f'ROC AUC = {auc_value:.3f}')

# A ROC curve needs a continuous score: hard 0/1 predictions give a single
# operating point, so pass the predicted probability of the positive class.
plot_roc_auc(y_test, RFC_clf.predict_proba(X_test)[:, 1])

In [None]:
# Number of rows per DEATH_EVENT class in the full dataset.
df['DEATH_EVENT'].value_counts()

In [None]:
# Null accuracy is the accuracy of always predicting the majority class; its
# complement is the share of all remaining rows. Both are derived from the data
# so they stay correct if the dataset changes, instead of hard-coding 96 and 203.
class_counts = df['DEATH_EVENT'].value_counts()
majority_count = int(class_counts.max())
total_count = int(class_counts.sum())

# Inverse of Null Accuracy
print('Inverse of Null Accuracy: ', (total_count - majority_count) / total_count)
print('Null Accuracy: ', majority_count / total_count)

In [None]:
# Models

# Put the scaler inside each pipeline so it is re-fit on every CV training fold (the train/test split was already made above)

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Three scaler + classifier pipelines; each owns a separate MinMaxScaler step.
rf_pipeline = Pipeline(steps=[
    ('scale', MinMaxScaler()),
    ('RF', RandomForestClassifier(random_state=42)),
])
svm_pipeline = Pipeline(steps=[
    ('scale', MinMaxScaler()),
    ('SVM', SVC(random_state=42)),
])
logreg_pipeline = Pipeline(steps=[
    ('scale', MinMaxScaler()),
    ('LR', LogisticRegression(random_state=42)),
])

# 10-fold cross-validated F1 on the training split, one score array per pipeline.
cv_options = dict(cv=10, scoring='f1')
rf_cv = cross_val_score(rf_pipeline, X_train, y_train, **cv_options)
svm_cv = cross_val_score(svm_pipeline, X_train, y_train, **cv_options)
logreg_cv = cross_val_score(logreg_pipeline, X_train, y_train, **cv_options)

print('Mean f1 scores: ')
for label, fold_scores in (
    ('Random Forest mean: ', rf_cv),
    ('SVM mean: ', svm_cv),
    ('Logistic Regression mean: ', logreg_cv),
):
    print(label, fold_scores.mean())

In [None]:
%timeit

rf_pipeline.fit(X_train, y_train)
svm_pipeline.fit(X_train, y_train)
logreg_pipeline.fit(X_train, y_train)

rf_pred = rf_pipeline.predict(X_test)
svm_pred = svm_pipeline.predict(X_test)
logreg_pred = logreg_pipeline.predict(X_test)

rf_cm  = confusion_matrix(y_test,rf_pred )
svm_cm = confusion_matrix(y_test,svm_pred)
logreg_cm  = confusion_matrix(y_test,logreg_pred )

rf_f1  = f1_score(y_test,rf_pred)
svm_f1 = f1_score(y_test,svm_pred)
logreg_f1  = f1_score(y_test,logreg_pred)

print('Mean f1 scores:')

print('RF mean :',rf_f1)
print('SVM mean :',svm_f1)
print('LR mean :',logreg_f1)

In [None]:
# Pretty good accuracy, but poor recall!
# Grid-search test without upsampling (note: X_train was already MinMax-scaled above, so the data is not actually unscaled)

from sklearn.model_selection import GridSearchCV

# Candidate hyper-parameter values for the random forest.
n_estimators = [64, 100, 128, 200]
max_features = [2, 3, 5, 7]
bootstrap = [True, False]

param_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'bootstrap': bootstrap,
}

# Seed the forest so the selected parameters are reproducible between runs, and
# select on F1, the metric this notebook reports for the candidate pipelines and
# the tuned model, instead of GridSearchCV's default accuracy.
rfc = RandomForestClassifier(random_state=seed)
grid = GridSearchCV(rfc, param_grid, scoring='f1')
grid.fit(X_train, y_train)
grid.best_params_

In [None]:
# Let's use those params now

# Refit a forest with the parameters chosen by the grid search. The seed makes
# the metrics below reproducible; without it every run trains a different forest.
rfc = RandomForestClassifier(random_state=seed, **grid.best_params_)
rfc.fit(X_train, y_train)

rfc_tuned_pred = rfc.predict(X_test)

# Test-split metrics (true labels first, predictions second).
print(classification_report(y_test, rfc_tuned_pred))

print('Accuracy Score: ', accuracy_score(y_test, rfc_tuned_pred))
print('F1 Score: ', f1_score(y_test, rfc_tuned_pred))

In [None]:
# Confusion matrix (true labels first, predictions second) for the tuned random forest.
print(confusion_matrix(y_test, rfc_tuned_pred))

In [None]:
# A ROC curve needs a continuous score: hard 0/1 predictions give a single
# operating point, so pass the predicted probability of the positive class.
plot_roc_auc(y_test, rfc.predict_proba(X_test)[:, 1])

In [None]:
# `rfc` was trained on MinMax-scaled features, so the raw feature frame `x` has
# to go through the fitted `mms` scaler before prediction; feeding it unscaled
# values puts every feature on a different range from the one the model saw.
predictions = rfc.predict(mms.transform(x))

# Re-read the original records (reusing `path` rather than repeating the
# literal) and attach the predictions as a new column.
submission = pd.read_csv(path)
submission['Prediction'] = predictions
submission.to_csv("submission_RFC_21032021.csv", index=False)

# Preview instead of a bare mid-cell `submission`, which displayed nothing.
submission.head()