### Import Data and Libraries

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file under the Kaggle input directory,
# then print the paths one per line in walk order.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Plotting stack used by every chart below: matplotlib for figures, seaborn on top of it
import matplotlib.pyplot as plt
import seaborn as sns
# Render matplotlib figures inline in the notebook output
%matplotlib inline
# Use seaborn's 'darkgrid' style for all subsequent plots
sns.set_style('darkgrid')

In [None]:
# Read the heart-failure clinical records into a DataFrame and preview the first rows
csv_path = '../input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv'
data = pd.read_csv(csv_path)
data.head()

### Data Analysis

In [None]:
# Heatmap of the boolean missing-value mask (one row per record, one column per feature)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(data.isna(), yticklabels=False, cmap='plasma', ax=ax)
plt.show()

In [None]:
# Annotated heatmap of the pairwise correlations between all columns
corr_matrix = data.corr()
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
plt.show()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column
data.describe()

In [None]:
# Column names, non-null counts and dtypes of the loaded frame
data.info()

In [None]:
# Number of columns in the dataset (second entry of the frame's shape)
data.shape[1]

In [None]:
# Number of distinct values per column; low counts point to categorical features
data.nunique()

In [None]:
# Absolute number of records in each DEATH_EVENT class
data.DEATH_EVENT.value_counts()

In [None]:
# Share of all records that falls into each DEATH_EVENT class
data['DEATH_EVENT'].value_counts().div(data.shape[0])

The data has in total 299 instances and 13 features.
Out of 13, 6 are categorical features and 7 are numeric features.
There are no NULL values in the dataset.
This can be considered a reasonably balanced dataset, as the class ratio is roughly 1:2 (96 vs. 203) and there are enough instances of both classes to train the model.
0 : False (203)
1 : True  (096)
The dependent (target) variable is 'DEATH_EVENT', indicating whether the patient expired or not.

In [None]:
# Categorical columns whose relationship with DEATH_EVENT is examined below
cat_features = ['anaemia','diabetes','high_blood_pressure','sex','smoking']
for cat in cat_features:
    # Columns are passed by keyword: seaborn >= 0.12 no longer accepts the x
    # vector as the first positional argument, and this form also works on
    # older releases.
    sns.countplot(x=cat, hue='DEATH_EVENT', data=data)
    plt.title(cat.upper()+' (w.r.t. DEATH_EVENT)')
    plt.show()
    # Numeric counterpart of the chart: DEATH_EVENT counts within each level of the feature
    print(data.groupby(cat)['DEATH_EVENT'].value_counts())

Anaemia : The risk of death from heart failure is higher if the person has anaemia. There is approximately a 33% chance of death if the person has anaemia.

Diabetes : Like anaemia, diabetes is also responsible for heart failure. About 33% of all people with diabetes may die of heart failure.

High BP : More than 33% of people with high blood pressure may die of heart failure.

Sex : There is no relation between the sex of a person and heart failure, as both sexes have almost the same risk of having heart failure.

Smoking : A smoker has approximately a 33% chance of dying of heart failure.

So, the share of deaths is almost the same across all of the categorical features.

In [None]:
# Histogram with density curve of patient age, using 15 bins.
# NOTE(review): distplot is deprecated in recent seaborn releases; kept here so the
# cell behaves the same on the seaborn version this notebook was written against.
sns.distplot(data['age'], bins=15)

In [None]:
# For each categorical feature, draw the age distribution of the rows where the
# feature equals 0 (left panel, blue) next to the rows where it equals 1
# (right panel, red).
for cat in cat_features:
    fig, axes = plt.subplots(1, 2, figsize=(15, 6))
    for ax, level, shade in zip(axes, (0, 1), ('blue', 'red')):
        ages = data.loc[data[cat] == level, 'age']
        sns.distplot(ages, label=level, color=shade, bins=15, ax=ax)
        ax.legend()
        ax.set_title(cat.upper())
    plt.show()

Most people with anaemia are in the 47 to 66 age range.

Most diabetic patients are in the 43 to 72 age range.

Most people with high blood pressure are in the 43 to 78 age range.

Most males are in the 54 to 64 age range.

Most smokers are in the 50 to 72 age range.

In [None]:
# Continuous columns to compare between the two DEATH_EVENT classes
cont_features = ['age','creatinine_phosphokinase','ejection_fraction','platelets','serum_creatinine','serum_sodium','time']
# One figure per feature: DEATH_EVENT == 0 on the left in blue,
# DEATH_EVENT == 1 on the right in red, 10 bins each.
for col in cont_features:
    fig, axes = plt.subplots(1, 2, figsize=(15, 6))
    for ax, outcome, shade in zip(axes, (0, 1), ('blue', 'red')):
        values = data.loc[data['DEATH_EVENT'] == outcome, col]
        sns.distplot(values, label=outcome, color=shade, bins=10, ax=ax)
        ax.legend()
        ax.set_title(col.upper())
    plt.show()

Age-wise, most of the deaths occur between the ages of 40 and 80.

Most of the patients who died of heart failure had a low level of creatinine phosphokinase (about 0-800 mcg/L).

Most of the patients who died of heart failure had an ejection fraction of about 14-60%.

Platelet counts are approximately normally distributed between 0 and 600000, and most of the patients who died of heart failure had counts of 150000-350000.

Most of the patients who died of heart failure had a serum creatinine level of about 0.8 to 2.3 mg/dL.


In [None]:
# One box plot per continuous feature, to make outliers visible
for col in cont_features:
    fig, ax = plt.subplots()
    ax.boxplot(data[col])
    ax.set_title(col.upper())
    plt.show()

All the continuous value features have outliers.

The features other than age and time have the most outliers.

### Model Prediction

In [None]:
# Feature matrix: every column except the last one; target vector: the last column
feature_cols = data.columns[:-1]
target_col = data.columns[-1]
X = data[feature_cols].values
y = data[target_col].values

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable
split_parts = train_test_split(X, y, test_size=0.25, random_state=0)
X_train, X_test, y_train, y_test = split_parts

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Baseline model: a 100-tree random forest with a fixed seed, fitted on the training split
rfc = RandomForestClassifier(n_estimators=100, random_state=0)
rfc.fit(X_train, y_train)
pred = rfc.predict(X_test)

# Print the confusion matrix, the accuracy and the per-class report for the test split
for metric in (confusion_matrix, accuracy_score, classification_report):
    print(metric(y_test, pred))

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the baseline forest on the training split, run on
# all available cores; the array of per-fold scores is shown as the cell output
val_score = cross_val_score(rfc, X_train, y_train, cv=10, n_jobs=-1)
val_score

### Hyper Parameter Tuning

In [None]:
# Fresh estimator for the hyper-parameter search; the fixed seed makes the tuned
# model repeatable across runs (an unseeded forest gives a different result each time)
rfc = RandomForestClassifier(random_state=0)

# Candidate values the randomized search samples from
rf_params = {
    'n_estimators': list(range(100, 1000, 10)),
    'criterion': ['gini', 'entropy'],
    # 'auto' is left out: for classifiers it was only an alias of 'sqrt', and it
    # was removed in scikit-learn 1.3, where candidates using it fail to fit
    'max_features': ['sqrt', 'log2'],
    'max_depth': list(range(10, 1000, 10)),
    'min_samples_split': [2, 4, 6, 8, 10],
    'min_samples_leaf': list(range(1, 11)),
}

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Randomized search over rf_params: 100 sampled parameter sets, each scored with
# 10-fold cross-validation on the training split, using all available cores
rs = RandomizedSearchCV(
    estimator=rfc,
    param_distributions=rf_params,
    n_iter=100,
    cv=10,
    random_state=0,
    n_jobs=-1,
    verbose=5,
)
rs.fit(X_train, y_train)

In [None]:
# Display the fitted search object and its configuration
rs

In [None]:
# Parameter combination that achieved the best cross-validated score
rs.best_params_

In [None]:
# Estimator built with the best parameter combination found by the search
rs.best_estimator_

### Tuned Model Prediction

In [None]:
# Score the best estimator from the search on the held-out test split and print
# the confusion matrix, the accuracy and the per-class report
best = rs.best_estimator_
y_pred = best.predict(X_test)
for metric in (confusion_matrix, accuracy_score, classification_report):
    print(metric(y_test, y_pred))

### So the default model (85.33% test accuracy) scored slightly higher than the hyperparameter-tuned model (84%), a difference of a single sample on the 75-row test set.