In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Importing Required Libraries
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msn

In [None]:
# Load the heart-failure clinical records into a DataFrame.
DATA_PATH = '/kaggle/input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv'
heart = pd.read_csv(DATA_PATH)

# Preview the first ten rows (rendered as the cell output).
heart.head(10)

In [None]:
# Print a concise summary of the DataFrame: one line per column with its
# non-null count and dtype, plus the overall memory usage.
heart.info()

In [None]:
# Generate descriptive statistics (count, mean, std, min, quartiles, max)
# for the numeric columns; the resulting table is shown as the cell output.
heart.describe()

# Missing Values

In [None]:
# Draw missingno's nullity matrix for the frame; gaps in a column's bar would
# indicate missing values in that column.
msn.matrix(heart)

* No missing values are present in the data.

In [None]:
# Per-column check: True for a column that contains at least one null value.
heart.isnull().any()

# Outliers Or Variance Detection
* Box plots are drawn for creatinine_phosphokinase, ejection_fraction, platelets, serum_creatinine, and serum_sodium to check these features for outliers.

In [None]:
# One box plot per continuous feature, laid out on a 3x2 grid.
fig, axes_grid = plt.subplots(nrows = 3, ncols = 2, figsize = (8, 10))
ax = axes_grid.ravel()  # row-major order: top-left, top-right, middle-left, ...

feature = ['creatinine_phosphokinase', 'ejection_fraction', 'platelets', 'serum_creatinine', 'serum_sodium']

for column, panel in zip(feature, ax):
    # Pass the series explicitly as `x`: the meaning of seaborn's first
    # positional argument changed between releases (x -> data), which silently
    # flips the box orientation. The keyword keeps the plot horizontal everywhere.
    sns.boxplot(x = heart[column], ax = panel)

    panel.set_title(column, fontsize = 15)
    panel.set_xticks([])
    # The column name is already shown as the title, so drop the axis label.
    panel.set_xlabel('')

# Only five features are plotted; hide the unused sixth panel.
ax[-1].set_visible(False)
plt.tight_layout()
plt.show()

* There are many outliers in creatinine_phosphokinase, platelets, and serum_creatinine.
* Values outside the 1.5 × IQR fences are treated as outliers and replaced with the mean of the corresponding feature.

In [None]:
# Correcting outliers using the IQR score.
#
# The rule is applied only to the continuous lab measurements. Running it over
# the whole frame would also process the binary flags and the DEATH_EVENT
# target: they would be cast to float by the NaN masking, and any binary column
# whose IQR is 0 would have its minority class overwritten.
# NOTE(review): the quartiles and means are computed on the full data set,
# i.e. before the train/test split performed later in the notebook.
feature = ['creatinine_phosphokinase', 'ejection_fraction', 'platelets', 'serum_creatinine', 'serum_sodium']

Q1 = heart[feature].quantile(0.25)
Q3 = heart[feature].quantile(0.75)
IQR = Q3 - Q1
print('IQR Score : \n\n{}'.format(IQR))

# Anything further than 1.5 * IQR outside the quartiles counts as an outlier.
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
is_outlier = (heart[feature] < lower_fence) | (heart[feature] > upper_fence)

# Blank out the outliers, then fill each gap with the mean of the remaining
# (non-outlier) values of the same feature.
corrected = heart[feature].mask(is_outlier)
corrected = corrected.fillna(corrected.mean())

# Rebind `heart` to a copy so the corrected columns are written to a fresh frame.
heart = heart.copy()
heart[feature] = corrected

# Re-draw the box plots to show the effect of the correction.
fig, axes_grid = plt.subplots(nrows = 3, ncols = 2, figsize = (8, 10))
fig.suptitle('After Correcting outliers', fontsize = 20)

ax = axes_grid.ravel()  # row-major order, matching the order of `feature`

for column, panel in zip(feature, ax):
    # Explicit `x=` keeps the box horizontal regardless of seaborn version.
    sns.boxplot(x = heart[column], ax = panel)

    panel.set_title(column, fontsize = 15)
    panel.set_xticks([])
    panel.set_xlabel('')

# Only five features are plotted; hide the unused sixth panel.
ax[-1].set_visible(False)

# Layout adjustments must happen before plt.show(); once the figure has been
# rendered they no longer affect the displayed output.
plt.tight_layout()
fig.subplots_adjust(top=0.888)  # leave room for the suptitle
plt.show()

# Correlation between features
   * Computing pairwise correlation of columns, excluding NA/null values.

In [None]:
# Annotated heatmap of the pairwise column correlations on a 10x10 inch figure.
corr_matrix = heart.corr()
fig, corr_ax = plt.subplots(figsize = (10, 10))
sns.heatmap(corr_matrix, annot = True, ax = corr_ax)

# Visualization

In [None]:
# Overlay, for each feature, the distribution among deceased patients and the
# distribution among the others, one feature per panel of a 3x2 grid.
fig, axes_grid = plt.subplots(nrows = 3, ncols = 2, figsize = (8, 10))

feature = ['age', 'creatinine_phosphokinase', 'ejection_fraction', 'platelets', 'serum_creatinine', 'serum_sodium']

# Row masks for the two outcome groups, computed once instead of per feature.
is_deceased = heart.DEATH_EVENT == 1
is_not_deceased = heart.DEATH_EVENT == 0

# NOTE(review): sns.distplot is deprecated in recent seaborn releases; confirm
# the installed version before upgrading the environment.
for column, panel in zip(feature, axes_grid.ravel()):
    sns.distplot(heart.loc[is_deceased, column], ax = panel, label = 'Deceased')
    sns.distplot(heart.loc[is_not_deceased, column], ax = panel, label = 'Not Deceased')

    panel.set_title(column, fontsize = 15)
    # The feature name is already the title, so the x label is cleared.
    panel.set_xlabel('')
    panel.legend()

plt.tight_layout()
plt.show()

# Model Training

In [None]:
# Columns used as model inputs.
feature_columns = [
    'age',
    'anaemia',
    'creatinine_phosphokinase',
    'diabetes',
    'ejection_fraction',
    'high_blood_pressure',
    'platelets',
    'serum_creatinine',
    'serum_sodium',
    'sex',
    'smoking',
    'time',
]

X = heart[feature_columns]

# Target, kept as a single-column DataFrame (flattened later at split time).
target_columns = ['DEATH_EVENT']
y = heart[target_columns]

In [None]:
from sklearn.model_selection import train_test_split

# Flatten the single-column target frame into a 1-D label array.
labels = y.values.ravel()

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    labels,
    test_size=0.2,
    random_state=1,
)

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, precision_score, recall_score

# Fit a liblinear logistic regression on the training split.
Logistic = LogisticRegression(solver = 'liblinear', max_iter=10000)
Logistic.fit(X_train, y_train)

# Predicted labels for the held-out rows.
Log_pred = Logistic.predict(X_test)

# Report accuracy as a percentage, followed by the remaining metrics.
accuracy_pct = Logistic.score(X_test, y_test)*100
print('Accuracy Score : {:.2f}%'.format(accuracy_pct))

for label, metric in (
    ('\nf1 Score : ', f1_score),
    ('\nPrecision Score : ', precision_score),
    ('\nRecall Score : ', recall_score),
):
    print(label, metric(y_test, Log_pred))

### RandomForestClassifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score

# A random forest is stochastic (bootstrap samples and random feature subsets),
# so the seed is fixed to make the reported scores reproducible across runs.
# The value 1 matches the seed used for the train/test split.
RndClf = RandomForestClassifier(
    n_estimators = 100,
    max_depth = 15,
    max_leaf_nodes = 40,
    random_state = 1,
).fit(X_train, y_train)

# Predicted labels for the held-out rows.
Rnd_pred = RndClf.predict(X_test)

print('Accuracy Score : {:.2f}%'.format(RndClf.score(X_test, y_test)*100))
print('\nf1 Score : ', f1_score(y_test, Rnd_pred))
print('\nPrecision Score : ', precision_score(y_test, Rnd_pred))
print('\nRecall Score : ', recall_score(y_test, Rnd_pred))

### Support Vector Machine

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score

# Fit a support vector classifier with scikit-learn's default settings.
# NOTE(review): the features are passed in unscaled; SVMs are usually sensitive
# to feature scale, so consider standardising the inputs - confirm intent.
svc = SVC().fit(X_train, y_train)

# Predicted labels for the held-out rows.
svc_pred = svc.predict(X_test)

# Report the same four metrics as the other model cells so the models can be
# compared like for like (previously only accuracy was printed here, leaving
# the imported metrics and `svc_pred` unused).
print('Accuracy Score : {:.2f}%'.format(svc.score(X_test, y_test)*100))
print('\nf1 Score : ', f1_score(y_test, svc_pred))
print('\nPrecision Score : ', precision_score(y_test, svc_pred))
print('\nRecall Score : ', recall_score(y_test, svc_pred))

### Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score

# With max_features = 10 the tree considers a random subset of the features at
# each split, so the seed is fixed to make the reported scores reproducible.
# The value 1 matches the seed used for the train/test split.
tree = DecisionTreeClassifier(
    max_depth = 15,
    max_features = 10,
    max_leaf_nodes = 45,
    random_state = 1,
).fit(X_train, y_train)

# Predicted labels for the held-out rows.
tree_pred = tree.predict(X_test)

print('Accuracy Score : {:.2f}%'.format(tree.score(X_test, y_test)*100))
print('\nf1 Score : ', f1_score(y_test, tree_pred))
print('\nPrecision Score : ', precision_score(y_test, tree_pred))
print('\nRecall Score : ', recall_score(y_test, tree_pred))