# In-Hospital Mortality Prediction for Heart Failure Patients

## Loading appropriate libraries 
    * We will load other libraries as & when needed

In [7]:
import scipy.stats as stats

In [None]:
import scipy.stats as stats
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
from scipy.stats import chi2

In [8]:
## Suppress the warnings
# NOTE(review): filterwarnings('ignore') silences every warning category for the
# rest of the session, including deprecation warnings raised by the library calls
# in later cells. Consider narrowing it once the notebook runs cleanly.
import warnings
warnings.filterwarnings('ignore')

In [9]:
pd.set_option('display.max_rows', None)

## Import Data

In [10]:
masterData = pd.read_excel("Healthcare_cat_dataset.xlsx")

In [11]:
mdf = masterData.copy()

In [12]:
mdf.shape

(1177, 53)

In [13]:
mdf.head(5)

Unnamed: 0,group,ID,outcome,age,gendera,BMI_cat,hypertensive,atrialfibrillation,CHD with no MI,diabetes,...,cal_cat,chloride_cat,anion_cat,Mag_cat,ph_cat,Biccarbon_cat,metcat,lactic_cat,pco2_cat,ef_cat
0,1,125047,0.0,72,1,0,0,0,0,1,...,0,0,1,0,1,0,0,0,1,1
1,1,139812,0.0,75,2,0,0,0,0,0,...,0,1,1,1,1,0,0,0,0,1
2,1,109787,0.0,83,2,1,0,0,0,0,...,0,1,1,1,1,0,0,0,0,0
3,1,130587,0.0,43,2,0,0,0,0,0,...,1,0,1,1,1,0,0,0,0,1
4,1,138290,0.0,75,2,0,1,0,0,0,...,1,1,1,0,1,0,0,0,0,1


Data Description

Rows: 1177 (1176 once the single row with a missing `outcome` is removed)

Columns: 53

Dependent Variable: outcome

Apart from `age` (and the `ID` / `group` identifiers), all variables are binary categories.

## EDA

### Dependent Variable: outcome

In [14]:
# Keep only the rows for which the target is known.
mdf = mdf.dropna(subset=['outcome'])

In [15]:
# The target is read as float; store it as an integer class label.
mdf = mdf.astype({'outcome': int})

### Encoding gender and dropping non-significant variables

In [15]:
# One-hot encode the gender column; the original 'gendera' column is replaced by
# one indicator column per level (they appear as 'gendera_1' and 'gendera_2' in
# the column listing further down).
mdf = pd.get_dummies(mdf, columns = ['gendera'])

In [17]:
# Identifier / bookkeeping columns are not used as predictors.
mdf = mdf.drop(columns=['ID', 'group'])

### Checking for Null Data

In [18]:
# Number of missing values per column; display only the columns that have any.
dd = mdf.isna().sum()
dd.loc[dd.gt(0)]

outcome            1
Pulse rate cat    16
dtype: int64

### Imputing data using mode as data is binary

In [19]:
# Fill the missing pulse-rate categories with the most frequent category.
# The result is assigned back to the column instead of calling fillna(inplace=True)
# on the selected Series: that chained in-place form operates on a temporary object
# and is not guaranteed to update mdf (it does not under pandas copy-on-write).
mdf["Pulse rate cat"] = mdf["Pulse rate cat"].fillna(mdf["Pulse rate cat"].mode()[0])

In [20]:
mdf['outcome'].value_counts()

0.0    1017
1.0     159
Name: outcome, dtype: int64

In [21]:
mdf.columns

Index(['outcome', 'age', 'BMI_cat', 'hypertensive', 'atrialfibrillation',
       'CHD with no MI', 'diabetes', 'deficiencyanemias', 'depression',
       'Hyperlipemia', 'Renal failure', 'COPD', 'heart rate at',
       'Pulse rate cat', 'Sys_cat', 'Diastolic', 'respiratory cat', 'temp_cat',
       'SP O2 ', 'urine_cat', 'hemocrit_cat', 'RBC_Cat', 'mch_cat', 'mchc_Cat',
       'mcv_cta', 'rdw_cat', 'leukocytes_cat', 'platelets_cat',
       'neutriphil_cat', 'Basophil_cat', 'Lympho_cat', 'PT_cat(sec)',
       'INR_cat', 'NT_cat', 'CK_cat', 'Creatinine_cat', 'UN_cat', 'Glu_cat',
       'potas_cat', 'sodium_cat', 'cal_cat', 'chloride_cat', 'anion_cat',
       'Mag_cat', 'ph_cat', 'Biccarbon_cat', 'metcat', 'lactic_cat',
       'pco2_cat', 'ef_cat', 'gendera_1', 'gendera_2'],
      dtype='object')

# User Defined Functions

In [22]:
def PerformHypothesisTest(dependentVariable, independentVariables, data=None, alpha=0.05):
    """Chi-square test of independence between each feature and the target.

    For every column in ``independentVariables`` a contingency table against
    ``dependentVariable`` is built and the Pearson chi-square test is run.

    Parameters
    ----------
    dependentVariable : str
        Name of the target column.
    independentVariables : iterable of str
        Names of the feature columns to test.
    data : pandas.DataFrame, optional
        Frame holding the columns. When omitted, the notebook-level ``mdf`` is
        used, which is what the function always did before.
    alpha : float, default 0.05
        Significance level used for the ``significant_Chi`` flag.

    Returns
    -------
    pandas.DataFrame
        One row per feature with the columns ``Feature``, ``p_Value_Chi`` and
        ``significant_Chi`` ("Yes" when ``p_Value_Chi <= alpha``, else "No").
    """
    if data is None:
        data = mdf

    records = []
    for col in independentVariables:
        contingency = pd.crosstab(data[col], data[dependentVariable])

        # correction=False gives the plain Pearson statistic, i.e. the same
        # sum((O - E)^2 / E) that used to be computed by hand here. Taking the
        # p-value from scipy also uses the table's real degrees of freedom,
        # (rows - 1) * (cols - 1), instead of assuming a 2x2 table, and avoids
        # relying on a module-level ``chi2`` name that other imports can rebind.
        _, p_value_Chi, _, _ = stats.chi2_contingency(contingency, correction=False)

        records.append({
            'Feature': col,
            'p_Value_Chi': float(p_value_Chi),
            'significant_Chi': "Yes" if p_value_Chi <= alpha else "No",
        })

    # Build the frame once; DataFrame.append was removed in pandas 2.0.
    return pd.DataFrame(records, columns=['Feature', 'p_Value_Chi', 'significant_Chi'])

In [23]:
def with_hue(ax, feature, Number_of_categories, hue_categories):
    """Write, above every bar of a hue-split count plot, its share of its category.

    Parameters
    ----------
    ax : axes object
        Axes holding the bars in ``ax.patches``. The bars are read hue by hue:
        the bar of category ``i`` and hue ``j`` is taken from position
        ``j * Number_of_categories + i``.
    feature : pandas.Series
        The categorical column plotted on the x axis; supplies the per-category
        totals used as denominators.
    Number_of_categories : int
        Number of distinct x-axis categories.
    hue_categories : int
        Number of hue levels.
    """
    bars = list(ax.patches)

    # value_counts() orders by frequency, whereas the bars are expected in sorted
    # category order; sort by label so that total i belongs to bar group i.
    # NOTE(review): assumes the plotting call lays categories out in sorted order.
    category_totals = feature.value_counts().sort_index().values

    for i in range(Number_of_categories):
        total = category_totals[i]
        for j in range(hue_categories):
            bar = bars[j * Number_of_categories + i]
            height = bar.get_height()
            percentage = '{:.1f}%'.format(100 * height / total)
            # Anchor the label at the top centre of the bar, nudged 12 points up.
            ax.annotate(percentage, (bar.get_x() + bar.get_width() / 2., height),
                        ha = 'center', va = 'top', xytext = (0, 12), textcoords = 'offset points')

In [24]:
def CalculateValues(df, variable):
    """Build the text panel shown above the charts for one feature.

    The text contains the alive / dead percentages among the rows where
    ``variable`` equals 0, the hypotheses being tested, the chi-square p-value
    looked up in the notebook-level ``evaluationResult`` table, and the
    resulting conclusion at the 0.05 level.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``variable`` and an ``outcome`` column (0 / 1).
    variable : str
        Feature name; must be present in ``evaluationResult['Feature']``.

    Returns
    -------
    str
        The assembled multi-line text.

    Raises
    ------
    KeyError
        If ``evaluationResult`` holds no p-value for ``variable``.
    """
    results = df.groupby([variable]).outcome.value_counts()
    reference_counts = results[0]
    reference_total = reference_counts.sum()

    # value_counts() orders by frequency, so positions 0 / 1 are not reliably
    # "alive" / "dead" and one of them may be missing altogether. Look the counts
    # up by outcome label instead (0 = alive, 1 = dead), defaulting to 0.
    by_outcome = reference_counts.reindex([0, 1], fill_value=0)
    y = [f"{count/reference_total*100:.0f}" for count in by_outcome]

    text = f"Alive = {y[0]} % \nDead = {y[1]} %\n"
    text1 = f"\nH0: There is no impact of {variable} on output \nH1: There is impact of {variable} on output\n"

    matches = evaluationResult.loc[evaluationResult['Feature'] == variable, 'p_Value_Chi']
    if matches.empty:
        raise KeyError(f"no p-value available for feature {variable!r}")
    pValue = matches.iloc[0]

    pValueText = f"\np_value for {variable} is {pValue}\n"

    if pValue > 0.05:
        conclusionText = f"\nHence we fail to reject the H0"
    else:
        conclusionText = f"\nHence we reject the H0"

    return text + text1 + pValueText + conclusionText

In [25]:
def SetPValues(ax, df, variable):
    """Render the hypothesis-test summary for ``variable`` as bold text on ``ax``.

    The axes frame and ticks are switched off so that only the text produced by
    ``CalculateValues(df, variable)`` is visible. Always returns True.
    """
    summary = CalculateValues(df, variable)
    ax.set_axis_off()
    ax.text(0, 0.5, summary, weight="bold", fontsize=14)
    return True

In [26]:
def SetFeaturesPie(ax, featureCount, variable):
    """Draw the share of rows having / not having ``variable`` as a pie chart.

    Parameters
    ----------
    ax : axes object
        Target axes.
    featureCount : pandas.Series or sequence
        Row counts per feature level. When it is a Series (e.g. the result of
        ``value_counts()``), its index labels decide the wedge labels: level 1
        is shown as 'AN' (having the feature), anything else as 'N'.
    variable : str
        Feature name, used in the legend text.
    """
    levels = getattr(featureCount, 'index', None)
    if levels is not None:
        # value_counts() orders by frequency, so the first wedge is whichever
        # level is more common; label each wedge from its own level.
        wedge_labels = ['AN' if level == 1 else 'N' for level in levels]
    else:
        # No level information available: keep the historical fixed labelling.
        wedge_labels = ['AN', 'N'][:len(featureCount)]

    ax.pie(featureCount,
           autopct='%1.1f%%',
           # one offset per wedge, so a feature with a single level still plots
           explode=(0.025,) * len(wedge_labels),
           colors=['#4F6272', 'navajowhite'],
           labels=wedge_labels,
           rotatelabels=True,
           startangle=80,)
    ax.set_title('Feature Ratio', pad=25)
    labels = f"AN = Having {variable} \nN = Not Having {variable}"
    ax.annotate(labels, xy=(0.9,0.9),xycoords='axes fraction', fontsize=12)

In [27]:
def SetOutcomeCountPie(ax, outcomeCount, variable):
    """Draw the alive / dead split among the patients having ``variable``.

    Parameters
    ----------
    ax : axes object
        Target axes.
    outcomeCount : pandas.Series or sequence
        Outcome counts. When it is a Series (e.g. the result of
        ``value_counts()``), its index labels decide the wedge labels: outcome 1
        is shown as 'D' (dead), anything else as 'A' (alive).
    variable : str
        Feature name, used in the legend text.
    """
    outcomes = getattr(outcomeCount, 'index', None)
    if outcomes is not None:
        # Label each wedge from its own outcome value rather than by position.
        wedge_labels = ['D' if outcome == 1 else 'A' for outcome in outcomes]
    else:
        wedge_labels = ['A', 'D'][:len(outcomeCount)]

    ax.pie(outcomeCount,
           autopct='%1.1f%%',
           # one offset per wedge, so a single-outcome subgroup still plots
           explode=(0.025,) * len(wedge_labels),
           colors=['skyblue', '#B7C3F3'],
           labels=wedge_labels,
           rotatelabels=True,
           startangle=80,)
    # The wedges are outcomes, not feature levels, so title and legend say so.
    ax.set_title('Outcome Ratio', pad=25)
    labels = f"A = Alive \nD = Dead \n(patients having {variable})"
    ax.annotate(labels, xy=(0.9,0.9),xycoords='axes fraction', fontsize=12)

In [28]:
def SetOutcomeCountSNSPlot(ax, df, variable, outcome):
    """Draw a count plot of ``variable`` split by ``outcome`` and annotate the bars.

    Parameters
    ----------
    ax : matplotlib axes
        Target axes.
    df : pandas.DataFrame
        Frame containing both columns.
    variable : str
        Column shown on the x axis.
    outcome : str
        Column used for the hue split.
    """
    # The column is passed as x= explicitly: newer seaborn releases treat the
    # first positional argument as ``data``, not as the x variable.
    sb.countplot(x=df[variable], hue=df[outcome], palette="PuBu", ax=ax)
    # Size the annotation grid from the data (2 x 2 for the binary columns here).
    with_hue(ax, df[variable], df[variable].nunique(), df[outcome].nunique())
    ax.set_title('Outcome Bar', pad=25)

In [29]:
def SetresultsPie(ax, results):
    """Draw the joint feature / outcome split as a pie chart.

    Parameters
    ----------
    ax : axes object
        Target axes.
    results : pandas.Series or sequence
        Counts per (feature level, outcome) combination. When it is a Series
        with a multi-level index whose first two levels are the feature level
        and the outcome, every wedge is labelled from its own key: feature 1 is
        'AN', otherwise 'N'; outcome 1 is 'D', otherwise 'A'.
    """
    index = getattr(results, 'index', None)
    if index is not None and getattr(index, 'nlevels', 1) >= 2:
        # A groupby result lists feature level 0 first, and a combination may
        # be absent, so labels and offsets are derived per wedge instead of
        # assuming four wedges in a fixed order.
        keys = [(key[0], key[1]) for key in index]
        wedge_labels = [('AN' if level == 1 else 'N') + '+' + ('D' if outcome == 1 else 'A')
                        for level, outcome in keys]
        wedge_explode = tuple(0.05 if outcome == 1 else 0.025 for _, outcome in keys)
    else:
        # No key information available: keep the historical fixed labelling.
        wedge_labels = ['AN+A', 'AN+D', 'N+A', 'N+D']
        wedge_explode = (0.025,0.05,0.025,0.05)

    ax.pie(results,
           autopct='%1.1f%%',
           labels=wedge_labels,
           explode=wedge_explode,
           colors=['#3880A8', '#B7C3F3','skyblue','navajowhite'],
           rotatelabels=True,
           startangle=180,)
    ax.set_title('Outcome Pie', pad=25)
    labels = f"AN = Abnormal \nN = Normal \nA = Alive \nD = Dead"
    ax.annotate(labels, xy=(0.9,0.9),xycoords='axes fraction', fontsize=12)

In [30]:
def plot_viz(feature='outcome'):
    """Show the significance dashboard for one feature of the notebook-level ``mdf``.

    The figure has a text panel on top (hypothesis-test summary) and four charts
    below it: feature-level pie, outcome pie for rows where the feature equals 1,
    outcome count plot, and joint feature / outcome pie.

    Parameters
    ----------
    feature : str
        Column of ``mdf`` to analyse.
    """
    # Imported here so the function does not depend on a later cell having run.
    from matplotlib.gridspec import GridSpec

    featureCount = mdf[feature].value_counts()
    outcomeCount = mdf[mdf[feature] == 1].outcome.value_counts()
    results = mdf.groupby([feature, 'outcome']).outcome.value_counts()

    fig = plt.figure(constrained_layout=True, figsize=(16, 16))
    # Short first row for the text panel, two taller rows for the charts.
    gs = GridSpec(3, 2, height_ratios=[1, 3, 3], figure=fig)

    # create sub plots as grid
    ax1 = fig.add_subplot(gs[0, :])
    ax2 = fig.add_subplot(gs[1, 0])
    ax3 = fig.add_subplot(gs[1, 1])
    ax4 = fig.add_subplot(gs[2, 0])
    ax5 = fig.add_subplot(gs[2, 1])

    fig.suptitle("Features Significance Analysis")

    SetPValues(ax1, mdf, feature)
    SetFeaturesPie(ax2, featureCount, feature)
    SetOutcomeCountPie(ax3, outcomeCount, feature)
    SetOutcomeCountSNSPlot(ax4, mdf, feature, 'outcome')
    SetresultsPie(ax5, results)

    plt.show()

# Data Visualization w.r.t. Dependent Variable

In [31]:
import ipywidgets as widgets
from matplotlib.gridspec import GridSpec

In [33]:
## Display the charts with Dropdown
# Every column except the target and 'age' is offered in the dropdown.

xDataset = mdf.drop(['outcome','age'], axis=1)

independentVariables = xDataset.columns

# plot_viz looks the p-values up in this notebook-level table (via CalculateValues),
# so it has to be computed before the widget is created.
evaluationResult = PerformHypothesisTest('outcome', independentVariables)

# The trailing ';' suppresses the repr of the value returned by interact.
widgets.interact(plot_viz, feature=independentVariables);

interactive(children=(Dropdown(description='feature', options=('BMI_cat', 'hypertensive', 'atrialfibrillation'…

## Feature Engineering

### Derive new Features if need be 

In [None]:
# Derived indicator columns (1 when both source flags equal 1, otherwise 0):
#   derivedAnemia       - deficiencyanemias and RBC_Cat
#   derivedInflammation - neutriphil_cat and Lympho_cat
anemia_flag = mdf['deficiencyanemias'].eq(1) & mdf['RBC_Cat'].eq(1)
mdf['derivedAnemia'] = np.where(anemia_flag, 1, 0)

inflammation_flag = mdf['neutriphil_cat'].eq(1) & mdf['Lympho_cat'].eq(1)
mdf['derivedInflammation'] = np.where(inflammation_flag, 1, 0)

# Derived columns together with their source columns, for the test in the next cell.
independentVariables = [
    'derivedAnemia', 'deficiencyanemias', 'RBC_Cat',
    'derivedInflammation', 'neutriphil_cat', 'Lympho_cat',
]

In [None]:
evaluationResult = PerformHypothesisTest('outcome', independentVariables)
evaluationResult

In [None]:
## Plot p-Values
# 'age' is excluded here, as it is in the dropdown cell: it is the only
# non-categorical column and a contingency-table chi-square test does not apply to it.
xDataset = mdf.drop(['outcome', 'age'], axis=1)

evaluationResult = PerformHypothesisTest('outcome', xDataset.columns)

evaluationResult = evaluationResult.sort_values(['p_Value_Chi'], ascending=True)

plt.rcParams['figure.figsize'] = [8,10]

# Red: not significant at the 0.05 level; green: significant.
colors = ["red" if i > 0.05 else "#40A944" for i in evaluationResult.p_Value_Chi]

plt.barh(evaluationResult.Feature, evaluationResult.p_Value_Chi, color = colors)

# setting label of y-axis
plt.ylabel("Features")

# setting label of x-axis
plt.xlabel("p-Values")
plt.title("Horizontal bar graph")
plt.show()

In [None]:
evaluationResult

## Treating Imbalanced Data

In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

In [None]:
mdf['outcome'].value_counts()

In [None]:
# Resampling pipeline: SMOTE over-sampling followed by random under-sampling,
# both with sampling_strategy=1 and a fixed seed.
# NOTE(review): with the over-sampler already targeting a 1:1 ratio, the
# under-sampling step presumably has nothing left to remove - confirm.
# NOTE(review): the features are binary flags; SMOTE presumably creates synthetic
# rows by interpolation, which would yield non-binary values - verify the output.
over = SMOTE(sampling_strategy=1, random_state=42)
under = RandomUnderSampler(sampling_strategy=1, random_state=42)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)
# transform the dataset
# NOTE(review): the whole data set is resampled here, before any train/test split.
X, y = pipeline.fit_resample(mdf.drop('outcome', axis=1), mdf['outcome'])
# Re-assemble target and features into one frame, target column first.
mdf_upsampled = pd.concat([pd.DataFrame(y), pd.DataFrame(X)], axis=1)

In [None]:
mdf_upsampled['outcome'].value_counts()

## Modeling with different Algos

## Logistic Regression

In [None]:
from sklearn.datasets import make_classification 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, roc_auc_score, precision_score, f1_score
from sklearn.feature_selection import SelectFromModel

In [None]:
# To detect overfitting we separate the data into train and test sets.
#
# The split is made on the original (imbalanced) data and only the training part
# is resampled afterwards. Splitting the already over-sampled frame would put
# synthetic rows derived from test patients into the training set (data leakage)
# and would evaluate the models on an artificial 1:1 class ratio.

# Fully resampled data, kept under the same names because later cells use X and y.
y = mdf_upsampled['outcome']
X = mdf_upsampled.drop(columns = ["outcome"])

features_all = mdf.drop(columns = ["outcome"])
target_all = mdf['outcome']

# stratify keeps the rare positive class represented in both parts.
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    features_all, target_all, test_size = 0.3, random_state = 9, stratify = target_all)

# Balance the training part only; the test part stays untouched real data.
X_train, y_train = pipeline.fit_resample(X_train_raw, y_train_raw)


In [None]:
# Logistic regression with default settings, fitted on the training split.
model_logReg = LogisticRegression()
# res holds the value returned by fit(); it is not referenced in the cells shown here.
res = model_logReg.fit(X_train, y_train)
# Predicted class labels for the test split.
pred= model_logReg.predict(X_test)
# Predicted class probabilities; column 1 is reused for thresholding and the ROC curve.
pred_logi = model_logReg.predict_proba(X_test)

In [None]:
# Probability cut-off above which a patient is predicted as class 1.
THRESHOLD = 0.5
positive_probability = model_logReg.predict_proba(X_test)[:, 1]
y_pred_logi = np.where(positive_probability > THRESHOLD, 1, 0)

In [None]:
# Hold-out metrics for the logistic regression model.
# ROC AUC measures ranking quality, so it is computed from the predicted
# probability of class 1 (pred_logi[:, 1]); with thresholded 0/1 labels it would
# collapse to a single operating point.
logiDF = pd.DataFrame(data=[accuracy_score(y_test, y_pred_logi), recall_score(y_test, y_pred_logi),
                   precision_score(y_test, y_pred_logi), f1_score(y_test, y_pred_logi, average='binary'),
                   roc_auc_score(y_test, pred_logi[:, 1])],
             index=["accuracy", "recall", "precision", "f1_score", "roc_auc_score"])

logiDF

## KNN Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# 3-nearest-neighbour classifier fitted on the training split.
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)
pred_knn = knn.predict_proba(X_test)

# ROC AUC is computed from the class-1 probabilities, not from the hard labels.
pd.DataFrame(data=[accuracy_score(y_test, y_pred_knn), recall_score(y_test, y_pred_knn),
                   precision_score(y_test, y_pred_knn),  f1_score(y_test, y_pred_knn, average='binary'),
                   roc_auc_score(y_test, pred_knn[:, 1])],
             index=["accuracy", "recall", "precision", "f1_score", "roc_auc_score"])


## Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Decision tree; the seed makes the fitted tree, and so the metrics, reproducible.
dtree = DecisionTreeClassifier(random_state=42)
dtree.fit(X_train,y_train)

y_pred_dtree = dtree.predict(X_test)
pred_dtree = dtree.predict_proba(X_test)

# ROC AUC is computed from the class-1 probabilities, not from the hard labels.
pd.DataFrame(data=[accuracy_score(y_test, y_pred_dtree), recall_score(y_test, y_pred_dtree),
                   precision_score(y_test, y_pred_dtree),  f1_score(y_test, y_pred_dtree, average='binary'),
                   roc_auc_score(y_test, pred_dtree[:, 1])],
             index=["accuracy", "recall", "precision", "f1_score", "roc_auc_score"])


## SVM ( Support Vector Machine)


In [None]:
from sklearn.svm import SVC

In [None]:
# Linear SVM with probability estimates enabled (needed for predict_proba).
# The seed keeps the probability calibration, and so the ROC curve, reproducible.
svclassifier = SVC(kernel='linear', probability=True, random_state=42)
svclassifier.fit(X_train, y_train)

In [None]:
y_pred_svm = svclassifier.predict(X_test)
pred_svm = svclassifier.predict_proba(X_test)

# ROC AUC is computed from the class-1 probabilities, not from the hard labels.
pd.DataFrame(data=[accuracy_score(y_test, y_pred_svm), recall_score(y_test, y_pred_svm),
                   precision_score(y_test, y_pred_svm),  f1_score(y_test, y_pred_svm, average='binary'),
                   roc_auc_score(y_test, pred_svm[:, 1])],
             index=["accuracy", "recall", "precision", "f1_score", "roc_auc_score"])


## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest of 100 trees; the seed makes the fitted forest reproducible.
rforestClassifier = RandomForestClassifier(n_estimators = 100, random_state = 42)
rforestClassifier.fit(X_train, y_train)

In [None]:
y_pred_rf = rforestClassifier.predict(X_test)
pred_rf = rforestClassifier.predict_proba(X_test)

# ROC AUC is computed from the class-1 probabilities, not from the hard labels.
pd.DataFrame(data=[accuracy_score(y_test, y_pred_rf), recall_score(y_test, y_pred_rf),
                   precision_score(y_test, y_pred_rf),  f1_score(y_test, y_pred_rf, average='binary'),
                   roc_auc_score(y_test, pred_rf[:, 1])],
             index=["accuracy", "recall", "precision", "f1_score", "roc_auc_score"])

## XG Boost Classifier

In [None]:
from xgboost import XGBClassifier

In [None]:
# Gradient-boosted trees with the library's default hyper-parameters,
# fitted on the same training split as the other models.
xgbClassifier = XGBClassifier()
xgbClassifier.fit(X_train, y_train)


In [None]:
y_pred_xgb = xgbClassifier.predict(X_test)
pred_xgb = xgbClassifier.predict_proba(X_test)

# ROC AUC is computed from the class-1 probabilities, not from the hard labels.
pd.DataFrame(data=[accuracy_score(y_test, y_pred_xgb), recall_score(y_test, y_pred_xgb),
                   precision_score(y_test, y_pred_xgb),  f1_score(y_test, y_pred_xgb, average='binary'),
                   roc_auc_score(y_test, pred_xgb[:, 1])],
             index=["accuracy", "recall", "precision", "f1_score", "roc_auc_score"])

## Model Evaluation using ROC Curves

In [None]:
from sklearn.metrics import roc_curve

# ROC curve (false-positive rates, true-positive rates, thresholds) of every model,
# each computed from the model's predicted probability of class 1.
(
    (fpr1, tpr1, thresh1),
    (fpr2, tpr2, thresh2),
    (fpr3, tpr3, thresh3),
    (fpr4, tpr4, thresh4),
    (fpr5, tpr5, thresh5),
    (fpr6, tpr6, thresh6),
) = [
    roc_curve(y_test, proba[:, 1], pos_label=1)
    for proba in (pred_logi, pred_knn, pred_dtree, pred_svm, pred_rf, pred_xgb)
]

# Reference curve: the same constant score for every test sample.
random_probs = [0] * len(y_test)
p_fpr, p_tpr, _ = roc_curve(y_test, random_probs, pos_label=1)

In [None]:
# matplotlib
import matplotlib.pyplot as plt
# Newer matplotlib releases ship the seaborn look under the name 'seaborn-v0_8'
# and no longer accept the bare 'seaborn'; use whichever name is available.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

# plot roc curves
plt.plot(fpr1, tpr1, linestyle='--',color='orange', label='Logistic Regression')
plt.plot(fpr2, tpr2, linestyle='solid',color='green', label='KNN')
plt.plot(fpr3, tpr3, linestyle='dashed',color='red', label='DTree')
plt.plot(fpr4, tpr4, linestyle='solid',color='brown', label='SVM')
plt.plot(fpr5, tpr5, linestyle='dashdot',color='black', label='RF')
plt.plot(fpr6, tpr6, linestyle='-.',color='blue', label='XGB')
# reference curve of the constant-score baseline
plt.plot(p_fpr, p_tpr, linestyle='-', color='pink')
# title
plt.title('ROC curve')
# x label
plt.xlabel('False Positive Rate')
# y label
plt.ylabel('True Positive rate')

plt.legend(loc='best')
# written to the working directory before show() so the saved figure is not empty
plt.savefig('ROC',dpi=300)
plt.show();

# Feature Selection

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import mutual_info_classif
from matplotlib import pyplot 

### Chi2 Scores

In [None]:
from scipy.stats import chi2

In [None]:
# Chi-square p-values for the model's feature columns. Only the column names are
# taken from X_train; PerformHypothesisTest reads the values from the
# notebook-level mdf frame.
evaluationResult = PerformHypothesisTest('outcome', X_train.columns)
# The 15 features with the smallest p-values.
# NOTE: ns_df_sorted is reassigned by the K-best cell that follows.
ns_df_sorted = evaluationResult.sort_values(['p_Value_Chi'], ascending = True).head(15)
ns_df_sorted

### K Best Features

In [None]:
# Keep the 15 features that score highest under mutual_info_classif.
sel_significant_columns = SelectKBest(mutual_info_classif,k= 15)
sel_significant_columns.fit(X_train,y_train)

# get_support() is used as a mask, so names and scores stay aligned.
names = X_train.columns.values[sel_significant_columns.get_support()]
scores = sel_significant_columns.scores_[sel_significant_columns.get_support()]
names_scores = list(zip(names, scores))
# NOTE: the 'F_Scores' column holds the scores of the scoring function above
# (mutual_info_classif), despite the column name.
ns_df = pd.DataFrame(data = names_scores, columns=['Feat_names', 'F_Scores'])
#Sort the dataframe for better visualization
ns_df_sorted = ns_df.sort_values(['F_Scores', 'Feat_names'], ascending = [False, True])
ns_df_sorted

In [None]:
# DataFrame.plot opens its own figure, so the size is passed to it directly
# (a preceding plt.figure(...) call would only leave an empty figure behind).
# x='Feat_names' puts the feature names on the axis instead of the row numbers.
score_ax = ns_df_sorted.plot(kind='bar', x='Feat_names', y='F_Scores', figsize=(20, 8), legend=False)
score_ax.set_xlabel('Feature')
score_ax.set_ylabel('Score');


### ExtraTree Classifier

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

In [None]:
# Extremely randomised trees; the seed makes the importances reproducible.
extra_tree_forest = ExtraTreesClassifier(n_estimators = 100, criterion ='gini', max_features = 15,
                                         random_state = 42)
extra_tree_forest.fit(X, y)

# Mean importance of every feature across the trees of the forest.
feature_importance = extra_tree_forest.feature_importances_

# Spread of the per-tree importances: a measure of how much the trees disagree.
# It is kept for reference but is not an importance score in itself.
feature_importance_std = np.std([tree.feature_importances_
                                 for tree in extra_tree_forest.estimators_],
                                axis = 0)

# Rank features by the importance itself (not by its standard deviation).
# NOTE(review): per the scikit-learn documentation feature_importances_ is
# already normalised to sum to 1, so no further scaling is applied.
feature_importance_normalized = feature_importance
features = pd.Series(feature_importance_normalized, index=X.columns).nlargest(15)

etf_features = pd.DataFrame(features)

etf_features

## Recursive Feature Elimination (RFE)

### Using Logistic Regression

In [None]:
from sklearn.feature_selection import RFE

In [None]:
# Recursive feature elimination around a logistic regression, keeping 15 features.
logreg = LogisticRegression()
logreg_rfe_model = RFE(estimator=logreg, n_features_to_select=15)
logreg_model_fit = logreg_rfe_model.fit(X_train, y_train)

# One rank per column; the columns with rank 1 are the ones to keep.
logreg_feat_index = pd.Series(logreg_model_fit.ranking_, index=X_train.columns)
logreg_feat_rfe = logreg_feat_index.loc[logreg_feat_index.eq(1)].index

logreg_selected_features = pd.DataFrame(logreg_feat_rfe)
logreg_selected_features

### Using SVM

In [None]:
# Recursive feature elimination around a linear SVM, keeping 15 features.
svm_lin = SVC(kernel='linear')
svm_rfe_model = RFE(estimator=svm_lin, n_features_to_select=15)
svm_rfe_model_fit = svm_rfe_model.fit(X_train, y_train)

# One rank per column; the columns with rank 1 are the ones to keep.
feat_index = pd.Series(svm_rfe_model_fit.ranking_, index=X_train.columns)
signi_feat_rfe = feat_index.loc[feat_index.eq(1)].index

svm_selected_features = pd.DataFrame(signi_feat_rfe)
svm_selected_features

### Using Random Forest

In [None]:
# Recursive feature elimination around a random forest, keeping 15 features.
# The seed makes the forest, and therefore the selected feature set, reproducible.
clf = RandomForestClassifier(n_estimators = 100, random_state = 42)
clf_rfe_model=RFE(estimator=clf,n_features_to_select=15)
clf_model_fit=clf_rfe_model.fit(X_train,y_train)
# One rank per column; the columns with rank 1 are the ones to keep.
feat_index = pd.Series(data = clf_model_fit.ranking_, index = X_train.columns)
signi_feat_rfe = feat_index[feat_index==1].index

rf_selected_features = pd.DataFrame(signi_feat_rfe)
rf_selected_features