In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it,
# so the exact dataset location can be read off the cell output.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the heart-failure clinical records CSV into a DataFrame and preview its first row.
dataset = pd.read_csv(
    '/kaggle/input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv',
    sep=',',
)
dataset.head(1)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier #library for random forest classifier
from sklearn.model_selection import RandomizedSearchCV # library for random search hyper param tuning
from sklearn import metrics #library for metrics to measure model performance
#x = dataset[['age','anaemia','creatinine_phosphokinase','diabetes','ejection_fraction','high_blood_pressure','platelets','serum_creatinine','serum_sodium','sex','smoking','time']]
# Separate the target from the predictors: DEATH_EVENT is the label and every
# other column of the frame is kept as a feature.
# NOTE(review): the commented-out column list in this cell includes 'time'; if
# that is the follow-up duration it may leak the outcome -- confirm it belongs here.
y = dataset['DEATH_EVENT']
x = dataset.drop(columns='DEATH_EVENT')
x.head(1)


In [None]:
# Hold out 20% of the rows as a test set.
# random_state pins the shuffle so the split (and every metric computed from it
# further down) is identical on each Restart & Run All; stratify=y keeps the
# DEATH_EVENT class proportions the same in the train and test parts.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Search space sampled by the randomized hyper-parameter search.
random_grid = {
    'bootstrap': [True, False],
    'max_depth': [10, 20, 30, 40, 50, 60, 70],
    'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000],
    # 'auto' is intentionally not listed: for classifiers it was only an alias
    # of 'sqrt' (so the grid tested the same setting twice), it was deprecated
    # in scikit-learn 1.1 and removed in 1.3, where it makes those fits fail.
    'max_features': ['sqrt', 'log2'],
}

# Base estimator; seeded so that repeated runs build the same forests.
rf = RandomForestClassifier(random_state=42)

# Randomized search: 100 sampled settings, each scored with 5-fold
# cross-validation on the training data, using all available cores.
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=100,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)
rf_random.fit(x_train, y_train)

# Display the parameter setting that achieved the best mean cross-validation score.
rf_random.best_params_

In [None]:
# Build the final classifier from the hyper-parameters the randomized search
# actually selected, rather than hard-coded values that can silently drift away
# from the search result. Seeded so the fitted forest is reproducible.
best_rf = RandomForestClassifier(**rf_random.best_params_, random_state=42)
best_rf.fit(x_train, y_train)

# Per-feature importance scores, in the same order as the columns of x_train.
importance = best_rf.feature_importances_


In [None]:
# Print each training column name next to its importance score (5 decimals).
flist = x_train.columns
for col, v in zip(flist, importance):
    print('Feature: ', col, ' Score: %.5f' % (v))

In [None]:
from matplotlib import pyplot
# Bar chart of the feature importances. The bars are labelled with the training
# column names (importance follows the column order of x_train) so the figure
# can be read on its own instead of by integer position.
positions = list(range(len(importance)))
pyplot.bar(positions, importance)
pyplot.xticks(positions, x_train.columns, rotation=90)
pyplot.ylabel('Importance')
pyplot.title('Random forest feature importances')
pyplot.tight_layout()
pyplot.show()

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# Class predictions for the held-out rows.
y_pred = best_rf.predict(x_test)

# labels=[1, 0] puts the positive class first on both axes, so the matrix
# reads [[TP, FN], [FP, TN]] (rows = true label, columns = predicted label).
matrix = confusion_matrix(y_test, y_pred, labels=[1, 0])
print('Confusion matrix: \n', matrix)

# Flatten the 2x2 matrix row by row to name the four cells individually.
tp, fn, fp, tn = matrix.ravel()
print('Outcome values: \n', tp, fn, fp, tn)

# Per-class precision / recall / F1 summary, positive class listed first.
clfreport = classification_report(y_test, y_pred, labels=[1, 0])
print('Classification report: \n', clfreport)

In [None]:
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
# Predicted class probabilities for the test rows; column 1 is used as the
# score for the positive class (label 1).
y_pred_prob = best_rf.predict_proba(x_test)
positive_scores = y_pred_prob[:, 1]

# ROC points of the tuned classifier across all score thresholds.
fpr, tpr, thresh = roc_curve(y_test, positive_scores, pos_label=1)

# Reference line: giving every row the same constant score yields the
# tpr = fpr diagonal.
random_probs = [0] * len(y_test)
p_fpr, p_tpr, _ = roc_curve(y_test, random_probs, pos_label=1)

# Area under the classifier's ROC curve.
auc_score = roc_auc_score(y_test, positive_scores)
print('auc for the model is : {:0.4f}'.format(auc_score))


In [None]:
import matplotlib.pyplot as plt #plot the roc curve
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8*' and
# later removed the old names, so plain 'seaborn' fails on current versions.
# Use the new name when it exists and fall back to the old one otherwise.
roc_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(roc_style)

# ROC curve of the tuned random forest plus the tpr = fpr reference diagonal;
# both curves are labelled so the legend identifies each line.
plt.plot(fpr, tpr, linestyle='--', color='orange', label='Best_RF_classifier')
plt.plot(p_fpr, p_tpr, linestyle='--', color='blue', label='Chance (tpr = fpr)')

plt.title('ROC curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive rate')

plt.legend(loc='best')
# Save before show(): the figure is written to the working directory as 'ROC'
# (matplotlib appends its default image extension) at 300 dpi.
plt.savefig('ROC', dpi=300)
plt.show()