In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

## Reading and Understanding the Data

In [None]:
# Load the heart-failure clinical records and preview the first ten rows.
h_df = pd.read_csv(
    '../input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv'
)
h_df.head(n=10)

In [None]:
# Dimensions of the frame as (rows, columns).
h_df.shape

In [None]:
# Column labels of the loaded frame.
h_df.columns

In [None]:
# Print each column's dtype and non-null count.
h_df.info()

In [None]:
# Summary statistics, reporting the quartiles explicitly.
h_df.describe(percentiles=[0.25, 0.5, 0.75])

In [None]:
# Count missing values per column.
h_df.isnull().sum()

In [None]:
# Ages sorted from highest to lowest.
h_df['age'].sort_values(ascending = False)

In [None]:
# Bucket ages into four 16-year bands; each label is "<lower edge>-<upper edge>".
bins = [32, 48, 64, 80, 96]
labels = ['{}-{}'.format(low, high) for low, high in zip(bins[:-1], bins[1:])]
h_df['agegroup'] = pd.cut(h_df['age'], bins=bins, labels=labels)

In [None]:
# Number of rows falling into each age band.
h_df['agegroup'].value_counts()

### Outlier Detection and Removal

In [None]:
def plot_Outlier(var_list):
    """Draw one horizontal box plot per column of ``h_df`` on a shared figure.

    Parameters
    ----------
    var_list : iterable of str
        Column names of the notebook-level ``h_df`` frame to plot.

    The grid is four columns wide with at least four rows, and grows by
    whole rows when more than 16 columns are requested.
    """
    var_list = list(var_list)
    n_cols = 4
    # Ceiling division without importing math; never fewer than the original 4 rows.
    n_rows = max(4, -(-len(var_list) // n_cols))

    plt.figure(figsize=(20, 15))
    # enumerate gives each entry its own slot; list.index() would send
    # duplicate names to the same subplot and rescans the list every time.
    for position, var in enumerate(var_list, start=1):
        plt.subplot(n_rows, n_cols, position)
        sns.boxplot(x=h_df[var])
    plt.show()

In [None]:
# Box-plot each listed column to look for outliers.
plot_Outlier([
    'age',
    'anaemia',
    'creatinine_phosphokinase',
    'diabetes',
    'ejection_fraction',
    'high_blood_pressure',
    'platelets',
    'serum_creatinine',
    'serum_sodium',
    'sex',
    'smoking',
    'time',
    'DEATH_EVENT',
])

**The box plots show outliers in the following columns: creatinine_phosphokinase, ejection_fraction, platelets, serum_creatinine, serum_sodium**

In [None]:
# Keep rows whose creatinine_phosphokinase lies within 1.5*IQR of the quartiles (bounds inclusive).
Q1, Q3 = h_df['creatinine_phosphokinase'].quantile([0.25, 0.75])
IQR = Q3 - Q1
h_df = h_df[h_df['creatinine_phosphokinase'].between(Q1 - 1.5 * IQR, Q3 + 1.5 * IQR)]
h_df.shape

In [None]:
# Keep rows whose platelets value lies within 1.5*IQR of the quartiles (bounds inclusive).
Q1, Q3 = h_df['platelets'].quantile([0.25, 0.75])
IQR = Q3 - Q1
h_df = h_df[h_df['platelets'].between(Q1 - 1.5 * IQR, Q3 + 1.5 * IQR)]
h_df.shape

In [None]:
# Keep rows whose serum_creatinine lies within 1.5*IQR of the quartiles (bounds inclusive).
Q1, Q3 = h_df['serum_creatinine'].quantile([0.25, 0.75])
IQR = Q3 - Q1
h_df = h_df[h_df['serum_creatinine'].between(Q1 - 1.5 * IQR, Q3 + 1.5 * IQR)]
h_df.shape

In [None]:
# Keep rows whose serum_sodium lies within 1.5*IQR of the quartiles (bounds inclusive).
Q1, Q3 = h_df['serum_sodium'].quantile([0.25, 0.75])
IQR = Q3 - Q1
h_df = h_df[h_df['serum_sodium'].between(Q1 - 1.5 * IQR, Q3 + 1.5 * IQR)]
h_df.shape

In [None]:
# Keep rows whose ejection_fraction lies within 1.5*IQR of the quartiles (bounds inclusive).
Q1, Q3 = h_df['ejection_fraction'].quantile([0.25, 0.75])
IQR = Q3 - Q1
h_df = h_df[h_df['ejection_fraction'].between(Q1 - 1.5 * IQR, Q3 + 1.5 * IQR)]
h_df.shape

In [None]:
# Box-plot the listed columns again to inspect the data after the IQR filtering.
plot_Outlier([
    'age',
    'anaemia',
    'creatinine_phosphokinase',
    'diabetes',
    'ejection_fraction',
    'high_blood_pressure',
    'platelets',
    'serum_creatinine',
    'serum_sodium',
    'sex',
    'smoking',
    'time',
    'DEATH_EVENT',
])

## Data Visualization and Data Analysis

In [None]:
# Column labels available for plotting.
h_df.columns

In [None]:
# Number of patients in each age band.
plt.figure(figsize=(10, 5))
# Name the column with x=...: seaborn >= 0.12 accepts only `data` positionally,
# so the positional form raises TypeError there (0.11 merely warned).
sns.countplot(x='agegroup', data=h_df)

In [None]:
# Anaemia status counts, split by age band.
plt.figure(figsize=(10, 5))
# Name the column with x=...: seaborn >= 0.12 accepts only `data` positionally,
# so the positional form raises TypeError there (0.11 merely warned).
sns.countplot(x='anaemia', hue='agegroup', data=h_df)

In [None]:
# Age band counts, split by anaemia status.
plt.figure(figsize=(10, 5))
# Name the column with x=...: seaborn >= 0.12 accepts only `data` positionally,
# so the positional form raises TypeError there (0.11 merely warned).
sns.countplot(x='agegroup', hue='anaemia', data=h_df)

**The 48-64 age group has a high number of positive anaemia cases.** **The 80-96 age group has few patients overall, but anaemia cases are also comparatively high there.**

In [None]:
# Age band counts, split by DEATH_EVENT.
plt.figure(figsize=(10, 5))
# Name the column with x=...: seaborn >= 0.12 accepts only `data` positionally,
# so the positional form raises TypeError there (0.11 merely warned).
sns.countplot(x='agegroup', hue='DEATH_EVENT', data=h_df)

In [None]:
# Anaemia status counts, split by DEATH_EVENT.
plt.figure(figsize=(10, 5))
# Name the column with x=...: seaborn >= 0.12 accepts only `data` positionally,
# so the positional form raises TypeError there (0.11 merely warned).
sns.countplot(x='anaemia', hue='DEATH_EVENT', data=h_df)

In [None]:
# High-blood-pressure status counts, split by DEATH_EVENT.
plt.figure(figsize=(10, 5))
# Name the column with x=...: seaborn >= 0.12 accepts only `data` positionally,
# so the positional form raises TypeError there (0.11 merely warned).
sns.countplot(x='high_blood_pressure', hue='DEATH_EVENT', data=h_df)

In [None]:
# Annotated correlation heatmap of the numeric columns.
plt.figure(figsize=(15, 10))
# Restrict to numeric columns first: h_df also carries the categorical 'agegroup'
# column, and DataFrame.corr() raises on non-numeric data in pandas >= 2.0.
sns.heatmap(h_df.select_dtypes(include='number').corr(), annot=True)

In [None]:
# Correlate numeric columns only: the categorical 'agegroup' column makes
# DataFrame.corr() raise in pandas >= 2.0.
corr = h_df.select_dtypes(include='number').corr()
# Columns whose absolute correlation with DEATH_EVENT exceeds 0.1, selected
# in a single .loc call instead of chained indexing.
corr.loc[corr['DEATH_EVENT'].abs() > 0.1, 'DEATH_EVENT']

**This shows that DEATH_EVENT has a considerable correlation with the following columns: ejection_fraction, serum_creatinine, time, serum_sodium**

In [None]:
# Mean ejection_fraction for each DEATH_EVENT value.
plt.figure(figsize=(10, 5))
sns.barplot(x='DEATH_EVENT', y='ejection_fraction', data=h_df)

In [None]:
def bar_plot(var_list):
    """Draw one bar plot per column of ``h_df``, grouped by ``DEATH_EVENT``.

    Parameters
    ----------
    var_list : iterable of str
        Column names of the notebook-level ``h_df`` frame to put on the y axis.

    The grid is three columns wide with at least two rows, and grows by
    whole rows when more than six columns are requested.
    """
    var_list = list(var_list)
    n_cols = 3
    # Ceiling division without importing math; never fewer than the original 2 rows.
    n_rows = max(2, -(-len(var_list) // n_cols))

    plt.figure(figsize=(20, 20))
    # enumerate gives each entry its own slot; list.index() would send
    # duplicate names to the same subplot and rescans the list every time.
    for position, var in enumerate(var_list, start=1):
        plt.subplot(n_rows, n_cols, position)
        # Refer to the column by name, since `data` already supplies the frame.
        sns.barplot(x='DEATH_EVENT', y=var, data=h_df)
    plt.show()

In [None]:
# Bar plots of the listed columns against DEATH_EVENT.
bar_plot(['age','ejection_fraction', 'serum_creatinine', 'time', 'serum_sodium'])

## Model Training and Prediction

In [None]:
# Columns used as model inputs.
corr_df = h_df.loc[:, [
    'age',
    'ejection_fraction',
    'serum_creatinine',
    'serum_sodium',
    'time',
]]

In [None]:
# Feature matrix and target column.
x, y = corr_df, h_df['DEATH_EVENT']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=1
)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Learn each feature's min/max from the training split only, then apply the
# same scaling to both splits.
scaler = MinMaxScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn import metrics

# Fit a default logistic regression and score it on the held-out split.
lr = LogisticRegression().fit(x_train, y_train)
y_pred = lr.predict(x_test)
print("Accuracy {}".format(metrics.accuracy_score(y_test, y_pred)))
print("Recall/Sensitivity {}".format(metrics.recall_score(y_test, y_pred)))

### Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest so the reported accuracy/recall and the feature importances
# are the same on every Restart & Run All; an unseeded forest varies between runs.
rfc = RandomForestClassifier(random_state=1)
rfc.fit(x_train, y_train)
y_pred = rfc.predict(x_test)
print("Accuracy {}".format(metrics.accuracy_score(y_test, y_pred)))
print("Recall/Sensitivity {}".format(metrics.recall_score(y_test, y_pred)))

### Support Vector Machine

In [None]:
from sklearn.svm import SVC

# Fit a default support vector classifier and score it on the held-out split.
svm = SVC().fit(x_train, y_train)
y_pred = svm.predict(x_test)
print("Accuracy {}".format(metrics.accuracy_score(y_test, y_pred)))
print("Recall/Sensitivity {}".format(metrics.recall_score(y_test, y_pred)))

### Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Entropy-split tree capped at 10 leaves, with a fixed seed; scored on the held-out split.
dt_clf = DecisionTreeClassifier(
    criterion='entropy',
    max_leaf_nodes=10,
    random_state=30,
).fit(x_train, y_train)
y_pred = dt_clf.predict(x_test)
print("Accuracy {}".format(metrics.accuracy_score(y_test, y_pred)))
print("Recall/Sensitivity {}".format(metrics.recall_score(y_test, y_pred)))

**Accuracy for the following models:**
1. Logistic Regression: 82.22%
2. RandomForest Classifier: 82.22%
3. SVM: 88.88%
4. Decision Tree Classifier: 84.44%

In [None]:
# Rank the model inputs by the random forest's feature importances, highest first.
pd.DataFrame({
    'variable': x.columns,
    'importance': rfc.feature_importances_,
}).sort_values(by='importance', ascending=False)