### Importing commonly used packages

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib notebook
%matplotlib inline
from matplotlib.gridspec import GridSpec

### Importing Dataset

In [None]:
# Load the heart-failure clinical records CSV from the Kaggle input directory into `df`.
df = pd.read_csv("/kaggle/input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv")

### Column description

Age - Age of respondent

anaemia - Decrease of red blood cells or hemoglobin (boolean)

creatinine_phosphokinase - Level of the CPK enzyme in the blood (mcg/L)

diabetes - If the patient has diabetes (boolean)

ejection_fraction - Percentage of blood leaving the heart at each contraction (percentage)

high_blood_pressure - If the patient has hypertension (boolean)

platelets - Platelets in the blood (kiloplatelets/mL)

serum_creatinine - Level of serum creatinine in the blood (mg/dL)

serum_sodium - Level of serum sodium in the blood (mEq/L)

sex - Woman or man (binary)

smoking - If the patient smokes or not (boolean)

time - Follow-up period (days)

DEATH_EVENT - If the patient deceased during the follow-up period (boolean)

### Viewing the data

In [None]:
# Preview the first rows of the dataset.
df.head()

In [None]:
# Summary statistics for every numeric column.
df.describe()

In [None]:
# Summary statistics restricted to the rows where DEATH_EVENT is 0.
df.loc[df["DEATH_EVENT"] == 0].describe()

In [None]:
# Summary statistics restricted to the rows where DEATH_EVENT is 1.
df.loc[df["DEATH_EVENT"] == 1].describe()

In [None]:
# Column names, non-null counts and dtypes.
df.info()

In [None]:
# Names of all columns whose dtype is int64.
int_col = [name for name in df.columns if np.dtype(df[name]) == "int64"]
int_col

In [None]:
# Names of all columns whose dtype is float64.
float_col = [name for name in df.columns if np.dtype(df[name]) == "float64"]
float_col

In [None]:
# Unique values

In [None]:
# Report the number of distinct values in every column except the target.
for column in df.columns:
    if column == "DEATH_EVENT":
        continue
    print(column, "has", df[column].nunique(), "unique values")

In [None]:
# Columns that take exactly two distinct values.
binary_cat_col = [name for name in df.columns if len(df[name].unique()) == 2]
binary_cat_col

In [None]:
# Columns that take more than two distinct values.
multi_val_col = [name for name in df.columns if len(df[name].unique()) > 2]

In [None]:
# Display the list of multi-valued columns.
multi_val_col

In [None]:
# Number of nulls

In [None]:
# Report the number of missing values in every column except the target.
# Series.isna() is used rather than np.isnan: np.isnan raises a TypeError on
# non-numeric columns (such as the string column Life_Status that this notebook
# adds to df), which would break this cell when it is re-run.
for column in df.columns:
    if column != "DEATH_EVENT":
        print(column, "has", df[column].isna().sum(), "nan")

# Exploring the data through visuals

In [None]:
# String copy of the target column; the labels are made readable in the next step.
df["Life_Status"] = df["DEATH_EVENT"].map(str)

In [None]:
# Turn the "0"/"1" strings into human-readable labels.
df["Life_Status"] = df["Life_Status"].replace(to_replace={"0": "Alive", "1": "Dead"})

In [None]:
# Multi-valued feature columns plus the two outcome columns.
multi_val_df = df[
    [
        "age",
        "creatinine_phosphokinase",
        "ejection_fraction",
        "platelets",
        "serum_creatinine",
        "serum_sodium",
        "time",
        "Life_Status",
        "DEATH_EVENT",
    ]
]

In [None]:
# Histograms of the first six columns of multi_val_df on a 3x2 grid.
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(9, 6))
for ax, column in zip(axes.flat, multi_val_df.columns[:6]):
    ax.hist(multi_val_df[column], bins=15, edgecolor="black")
    ax.set_xlabel(column)
    if column == "platelets":
        # Platelet counts are long numbers; tilt the tick labels so they do not overlap.
        ax.tick_params(axis="x", labelrotation=20)
# `pad` must be passed by keyword (current Matplotlib rejects a positional
# argument), and the layout only needs adjusting once, after every panel is drawn.
fig.tight_layout(pad=1)

From the above plots, we can gather an idea of the various features of the respondents - 
1. The respondents from whom data was collected are middle-aged to elderly people, with the majority between 50 and 70 years of age. 
2. The creatinine phosphokinase levels were around 500 mcg/L for close to 200 people and close to 1000 mcg/L for around 80 people. 
3. The most frequent ejection fraction levels observed were 25-40 %. However, a significant number of people recorded levels between 45-60 %.
4. The most frequent platelet level appears to be approximately between the 250000-300000 (kiloplatelets/mL) mark. 
5. In terms of serum levels, serum creatinine was most frequently below about 2 mg/dL, while serum sodium was most frequently around 135-140 mEq/L. 

In [None]:
# Pie charts showing the share of each category for the binary columns.
# Each entry: (column, slice labels in sorted group order, caption under the pie).
pie_specs = [
    ("smoking", ["Non-Smoker", "Smoker"], "Smoking habit"),
    ("anaemia", ["Non Anaemic", "Anaemic"], "Anaemia Status"),
    ("diabetes", ["Non Diabetic", "Diabetic"], "Diabetes Status"),
    # The second label was previously an empty string, leaving that slice unlabelled.
    ("high_blood_pressure", ["No High BP ", "High BP"], "High BP condition"),
    ("sex", ["Female", "Male"], "Gender"),
    ("Life_Status", ["Alive", "Deceased"], "Present Status of Person"),
]
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(15, 7))
# A figure-level title replaces the old plt.title("Bar plots") call, which
# mislabelled the pies and created an extra axes before the subplots were added.
fig.suptitle("Pie charts")
for ax, (column, slice_labels, caption) in zip(axes.flat, pie_specs):
    group_sizes = df.groupby(column)[column].count().to_numpy()
    ax.pie(group_sizes, explode=[0.1, 0], labels=slice_labels, autopct='%1.1f%%', shadow=True)
    ax.set_xlabel(caption, size=15)

From the above plots, we can arrive at the following conclusions about the people from whom the data was collected -
1. About 1/3 of the people from whom data was collected passed away later on.
2. Almost 2/3 of the people from whom data was taken were male. 
3. The majority of the people from whom data was collected appear to have had a healthy lifestyle.
4. Approximately 2/3 of the people were non-smokers. 
5. Despite all the people being above 40, an age from which many diseases begin to appear, almost 2/3 of the people didn't have high blood pressure. 
6. Approximately 60% of the people didn't have diabetes or anaemia. 

In [None]:
# One box plot of serum_sodium per Life_Status value, side by side.
fg = sns.FacetGrid(df, col="Life_Status").map(sns.boxplot, "serum_sodium")

When comparing the levels of sodium in people who passed away and those who survived, it can be noticed that (with the exception of outliers) the majority of the people who are alive have sodium levels between approx 130-145 mEq/L, with a median sodium level close to 137 mEq/L, whereas those who passed away recorded sodium levels between 127-145 mEq/L, with a median sodium level close to 135 mEq/L.   

In [None]:
# One box plot of serum_creatinine per Life_Status value, side by side.
fg = sns.FacetGrid(df, col="Life_Status").map(sns.boxplot, "serum_creatinine")

When comparing the levels of serum_creatinine in people who passed away and those who survived, it can be noticed that (with the exception of outliers) the majority of the people who are alive have creatinine levels between approx 0.5-3 (mg/dL), with a median serum_creatinine level close to 1.3 (mg/dL), whereas those who passed away recorded serum_creatinine levels between 0.5-1.8 (mg/dL), with a median serum_creatinine level close to 1 (mg/dL). However, a significant number of extremely high serum creatinine levels were recorded both for those who are deceased and for those who are alive.

In [None]:
# One box plot of age per Life_Status value, side by side.
fg = sns.FacetGrid(df, col="Life_Status").map(sns.boxplot, "age")

In terms of age, it can be seen that among those deceased, the median age was close to 65, whereas for those alive the median age is around 60. From the distribution, it can also be noticed that among those alive, half were between 50 and 65, whereas among those deceased, many were between 55 and 75. This suggests that older people are slightly more susceptible to the risk of heart failure.

In [None]:
# One box plot of the follow-up time per Life_Status value, side by side.
fg = sns.FacetGrid(df, col="Life_Status").map(sns.boxplot, "time")

From the above plot, we can see that the follow-up period varies significantly between those who died and those who were alive. While the median number of follow-up days for those who died was around 44, the median follow-up period for those who lived stretched to approximately 175 days.  

In [None]:
# One box plot of platelets per Life_Status value, side by side.
# `height` is the current name of the facet-size argument; the former `size`
# keyword was renamed in seaborn 0.9 and is no longer accepted.
fg = sns.FacetGrid(data=df, col="Life_Status", height=4)
fg = fg.map(sns.boxplot, "platelets")

The range of platelets in those who deceased and those alive appears more or less the same with median values appearing around 250000 (kiloplatelets/mL).

In [None]:
# One box plot of ejection_fraction per Life_Status value, side by side.
fg = sns.FacetGrid(df, col="Life_Status").map(sns.boxplot, "ejection_fraction")

In terms of ejection_fraction levels, those alive recorded higher levels of ejection fraction, up to around 50 % and mainly around 35-45 %, whereas amongst those deceased, many recorded ejection fraction levels between 25-40 %. The median value of ejection_fraction appears higher (38) for those alive than for those who are deceased (30).

In [None]:
# One box plot of creatinine_phosphokinase per Life_Status value, side by side.
fg = sns.FacetGrid(df, col="Life_Status").map(sns.boxplot, "creatinine_phosphokinase")

In terms of levels of creatinine phosphokinase, there appears to not be much of a difference amongst those who are deceased and those alive.

In [None]:
# Pairwise scatter plots of the multi-valued features, coloured by Life_Status.
# The numeric target is dropped so that it is not plotted as a feature.
sns.pairplot(data=multi_val_df.drop("DEATH_EVENT", axis=1), hue="Life_Status")

From the above scatter plots to understand the relationship between different numerical variables, there appears to be no apparent significant relation between pairs of variables. Age & time and ejection fraction & sodium levels do seem to have some kind of correlation but very weak. 

The histograms give us a faint idea of how certain variables vary in terms of whether the person is living or has passed away. We can see that time, age and ejection fraction do seem to vary based on life status.

In [None]:

# Distribution of creatinine_phosphokinase, one panel per Life_Status value.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases - confirm the installed version.
g = sns.FacetGrid(multi_val_df, col="Life_Status").map(sns.distplot, "creatinine_phosphokinase")

In [None]:
# Distribution of age, one panel per Life_Status value.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases - confirm the installed version.
g = sns.FacetGrid(multi_val_df, col="Life_Status").map(sns.distplot, "age")

In [None]:
# Distribution of the follow-up time, one panel per Life_Status value.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases - confirm the installed version.
g = sns.FacetGrid(multi_val_df, col="Life_Status").map(sns.distplot, "time")

In [None]:
# Distribution of ejection_fraction, one panel per Life_Status value.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases - confirm the installed version.
g = sns.FacetGrid(multi_val_df, col="Life_Status").map(sns.distplot, "ejection_fraction")

In [None]:
# Distribution of platelets, one panel per Life_Status value.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases - confirm the installed version.
g = sns.FacetGrid(multi_val_df, col="Life_Status").map(sns.distplot, "platelets")

In [None]:
# Distribution of serum_creatinine, one panel per Life_Status value.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases - confirm the installed version.
g = sns.FacetGrid(multi_val_df, col="Life_Status").map(sns.distplot, "serum_creatinine")

In [None]:
# Count plots of each binary condition, split by Life_Status, on a 3x2 grid.
# Each entry: (column, panel title, title pad, tick labels for the values 0 and 1).
count_specs = [
    ("diabetes", "diabetes", 2, ["Non-Diabetic", "Diabetic"]),
    ("anaemia", "anaemia", 2, ["No", "Yes"]),
    ("high_blood_pressure", "High blood pressure", 2, ["No", "Yes"]),
    ("sex", "Gender", 2, ["Female", "Male"]),
    ("smoking", "smoking", 4, ["Non smoker", "smoker"]),
]
fig = plt.figure(figsize=(10, 10))
for position, (column, title, title_pad, tick_labels) in enumerate(count_specs, start=1):
    ax = fig.add_subplot(3, 2, position)
    sns.countplot(data=df, x=column, hue="Life_Status", ax=ax)
    ax.set_xlabel("")
    ax.set_ylabel("No. of people")
    ax.set_title(title, pad=title_pad)
    ax.set_xticks([0, 1])
    ax.set_xticklabels(tick_labels)
    if column == "sex":
        # A single enlarged legend on the Gender panel serves every panel,
        # because all panels use the same Life_Status hue.
        ax.legend(bbox_to_anchor=(0.5, 0.7, 1, 0.5), prop={"size": 15})
    elif ax.get_legend() is not None:
        # Remove the per-panel legend outright instead of shrinking its font
        # to size 0, which still drew an empty legend box.
        ax.get_legend().remove()
# Adjust the layout after all axes exist; calling it on an empty figure has no effect.
fig.tight_layout(h_pad=2)

There appears to be no significant trend in conditions and habits between people who have passed away from heart failure and those who are still alive. 



# Feature identification

In [None]:
from sklearn.feature_selection import SelectKBest , chi2 , f_classif
from sklearn.ensemble import ExtraTreesClassifier

In [None]:
# Heat map of pairwise correlations between the numeric columns of multi_val_df.
# Life_Status holds strings; it is dropped explicitly because DataFrame.corr()
# raises on non-numeric columns in pandas 2.x instead of silently skipping them.
plt.figure(figsize=(15, 7))
sns.heatmap(multi_val_df.drop(columns=["Life_Status"]).corr(), annot=True)
plt.xticks(rotation=20)
plt.title("CORRELATION MATRIX FOR NUMERICAL / DISCRETE VARIABLES USING HEATMAP ", pad=6)

In [None]:
# Correlation table for the numeric columns. The string column Life_Status is
# dropped first because DataFrame.corr() raises on non-numeric data in pandas 2.x.
multi_val_df.drop(columns=["Life_Status"]).corr()

From the above correlation heat map, we can see that no pair of numerical variables is significantly correlated. Age and time, serum sodium and serum creatinine, and serum sodium and ejection fraction are pairs that have very weak correlation coefficients. Hence we can't discard any one of them. 

Also, when the correlation coefficients of variables with death_event are assessed, time seems to have the most significant relation with death_event, followed by variables like age, ejection_fraction and serum creatinine. 

In [None]:
# Heat map of pairwise correlations between the two-valued columns.
fig, ax = plt.subplots(figsize=(15, 7))
binary_corr = df[binary_cat_col].corr()
sns.heatmap(binary_corr, annot=True, ax=ax)
plt.setp(ax.get_xticklabels(), rotation=20)
ax.set_title("CORRELATION MATRIX FOR BINARY CLASS VARIABLES USING HEATMAP ", pad=6)

In [None]:
# Correlation table for the columns listed in binary_cat_col.
df[binary_cat_col].corr()

From the above heatmap / correlation matrix, we can see that diabetes & smoking, diabetes & sex and smoking & anaemia have extremely weak correlation coefficients. Smoking and sex appear to be slightly to moderately correlated with each other; however, the correlation is not strong enough to justify removing one of the two variables from the potential model. 

Apart from them, no other pair of numerical variables appear to be correlated. Also when correlation of variables with the Death_event is assessed, no variable seems to have a significant correlation. 

In [None]:

# Selector that keeps the 3 best binary features, scored with the chi-squared statistic.
Bin_Best_Features = SelectKBest(chi2, k=3)

In [None]:
# Selector that keeps the 4 best multi-valued features, scored with the ANOVA F-statistic.
Multi_Best_Features = SelectKBest(f_classif, k=4)

In [None]:
# Score the binary features against the target.
Binary_fit = Bin_Best_Features.fit(
    X=df[["anaemia", "diabetes", "high_blood_pressure", "sex", "smoking"]],
    y=df["DEATH_EVENT"],
)

In [None]:
# Score the multi-valued features against the target.
Multi_Val_fit = Multi_Best_Features.fit(
    X=df[
        [
            "age",
            "creatinine_phosphokinase",
            "ejection_fraction",
            "platelets",
            "serum_creatinine",
            "serum_sodium",
            "time",
        ]
    ],
    y=df["DEATH_EVENT"],
)

In [None]:
# Table of chi-squared scores and the selected/not-selected flag per binary feature.
pd.DataFrame(
    {"important": Binary_fit.get_support(), "scores": Binary_fit.scores_},
    index=["anaemia", "diabetes", "high_blood_pressure", "sex", "smoking"],
)

From the above scores based on chi_square, we can conclude that the top 3 variables appear to be anaemia , high blood pressure and smoking. These variables will be looked into for final feature selection

In [None]:
# Table of F-statistic scores and the selected/not-selected flag per multi-valued feature.
# The index is a flat list of names; wrapping it in a second list would build a
# MultiIndex rather than a plain index, unlike the table for the binary features.
pd.DataFrame(
    list(zip(Multi_Val_fit.get_support(), Multi_Val_fit.scores_)),
    index=['age', 'creatinine_phosphokinase', 'ejection_fraction', 'platelets', 'serum_creatinine', 'serum_sodium', 'time'],
    columns=["Important", "F_statistic_score"],
)

From the above F-statistic scores, we can say that  age, time, serum creatinine and ejection_fraction appear to be the top 4 important features. The variables will be looked into for final feature selection

In [None]:
# Extra-trees ensemble used only to rank feature importances.
# The seed is fixed so that the importances are reproducible between runs;
# 16 matches the random_state used for the train/test split in this notebook.
model = ExtraTreesClassifier(random_state=16)

In [None]:
# Fit the ensemble on the features short-listed by the two SelectKBest runs.
model.fit(
    X=df[
        [
            "anaemia",
            "high_blood_pressure",
            "smoking",
            "age",
            "ejection_fraction",
            "serum_creatinine",
            "time",
        ]
    ],
    y=df["DEATH_EVENT"],
)

In [None]:
# Feature importances from the fitted ensemble, labelled with the feature names
# in the same order the model was trained on.
Numercial_feat_imp = pd.Series(
    data=model.feature_importances_,
    index=[
        "anaemia",
        "high_blood_pressure",
        "smoking",
        "age",
        "ejection_fraction",
        "serum_creatinine",
        "time",
    ],
)

In [None]:
# Horizontal bar chart of the feature importances.
Numercial_feat_imp.plot.barh(label="")

From the above bar plot, we can see that time, serum_creatinine, ejection_fraction and age happen to have considerable influence on the classification. Hence we will consider them for the final model

# Model Building

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [None]:
from sklearn.metrics import precision_score, recall_score, accuracy_score , f1_score, classification_report ,confusion_matrix , \
    roc_curve , auc , roc_auc_score

In [None]:
# Selected features together with the target column.
# NOTE(review): `Data` is not referenced anywhere else in the visible notebook - confirm it is still needed.
Data = df[["age","time", "ejection_fraction", "serum_creatinine","DEATH_EVENT" ]]

In [None]:
# Feature matrix (the four selected features) and target vector.
X_data = df.loc[:, ["age", "time", "ejection_fraction", "serum_creatinine"]]
y_data = df.loc[:, "DEATH_EVENT"]

In [None]:
# 75/25 split, stratified on the target so both parts keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y_data,
    train_size=0.75,
    test_size=0.25,
    random_state=16,
    stratify=y_data,
)

In [None]:
# Fit the min-max scaler on the training features only (cast to float64).
scale = MinMaxScaler().fit(X=np.asarray(X_train, dtype=np.float64))

In [None]:
# Apply the fitted scaler to the training features.
X_train_scaled = scale.transform(np.asarray(X_train, dtype=np.float64))

In [None]:
# Apply the same (training-fitted) scaler to the test features.
X_test_scaled = scale.transform(np.asarray(X_test, dtype=np.float64))

In [None]:
# K-nearest-neighbours classifier with default parameters, fitted on the scaled training data.
knn = KNeighborsClassifier().fit(X=X_train_scaled, y=y_train)

# Evaluation

In [None]:
# Accuracy and per-class report for the default KNN model on the test split.
knn_test_pred = knn.predict(X_test_scaled)
print("Accuracy of the K nearest neighbors model with default parameters -", accuracy_score(y_test, knn_test_pred))
print(classification_report(y_test, knn_test_pred))

In [None]:
# Confusion matrix of the KNN model on the test split.
# The result is stored under the name `confusion_matrix`, which hides the
# sklearn function of the same name. Importing the function under an alias
# inside this cell keeps the cell re-runnable; calling the bare name a second
# time would try to call the array and raise a TypeError.
from sklearn.metrics import confusion_matrix as _compute_confusion_matrix
confusion_matrix = _compute_confusion_matrix(y_test, knn.predict(X_test_scaled))

In [None]:
# Annotated heat map of the confusion matrix with readable class labels.
fig, ax = plt.subplots()
sns.heatmap(confusion_matrix, annot=True, cbar=False, ax=ax)
ax.set_xlabel("Predicted Values", labelpad=10)
ax.set_ylabel("True Values")
class_names = ["Alive", "Deceased"]
ax.set_xticklabels(class_names)
ax.set_yticklabels(class_names)
ax.set_title("CONFUSION MATRIX")


In [None]:
# Sweep n_neighbors from 2 to 10 and compare precision, recall and accuracy
# on the test split.
neighbors = [2, 3, 4, 5, 6, 7, 8, 9, 10]
precision_scores = []
recall_scores = []
accuracy_scores = []
for n_neighbors in neighbors:
    # The candidate gets its own name so that `knn` (the default-parameter
    # model fitted earlier) is not overwritten. Rebinding `knn` here left it
    # pointing at the 10-neighbour model, which the ROC cells then evaluated.
    candidate = KNeighborsClassifier(n_neighbors=n_neighbors).fit(X_train_scaled, y_train)
    # Predict once per candidate and reuse the result for all three metrics.
    candidate_pred = candidate.predict(X_test_scaled)
    precision_scores.append(round(precision_score(y_test, candidate_pred), 2))
    recall_scores.append(round(recall_score(y_test, candidate_pred), 2))
    accuracy_scores.append(round(accuracy_score(y_test, candidate_pred), 2))
plt.figure(figsize=(10, 5))
plt.plot(neighbors, precision_scores, label="precision")
plt.plot(neighbors, recall_scores, label="recall")
plt.plot(neighbors, accuracy_scores, label="accuracy")
plt.legend(bbox_to_anchor=(1, 1, 0, 0))
plt.xticks(neighbors)
plt.title("Optimum number of neighbors")
plt.xlabel("no. of neighbours")
plt.ylabel("score")
    
    

From the above plot, it appears that the model achieves the best combination of precision, recall and accuracy when using 5 as the number of neighbours. It appears to offer slightly better recall and accuracy than using 9 neighbours. Since 5 is the default value of n_neighbors, our initial model is the best obtainable KNN model.

In [None]:
# Predicted class probabilities from `knn` for the scaled test set.
knn_predict_prob = knn.predict_proba(X_test_scaled)

In [None]:
# Keep only the second probability column (index 1), used as the ROC score.
# It is recomputed from the model rather than sliced from the variable itself,
# so re-running this cell does not fail on an already-sliced 1-D array.
knn_predict_prob = knn.predict_proba(X_test_scaled)[:, 1]

In [None]:
# False-positive rates, true-positive rates and the thresholds that produce them.
fpr, tpr, threshold = roc_curve(y_true=y_test, y_score=knn_predict_prob)

In [None]:
# ROC curve with the area underneath shaded, followed by the AUC score.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(fpr, tpr, linestyle="--")
ax.fill_between(fpr, tpr, alpha=0.1)
ax.set_title("ROC for K nearest neighbors")
ax.set_xlabel("False Positive rate")
ax.set_ylabel("True Positive rate")
print("ROC_AUC Score for KNN model :", roc_auc_score(y_test, knn_predict_prob))


#### Note - 
Hey guys, this was the first ever machine learning task that I have worked on. I am new to this field and have spent a few months learning about Machine Learning. Any feedback or suggestions for improving the model or related to any of the processes I did in this task like model evaluation, data exploration, feature selection,etc would really be appreciated as it would help me build better models in the future. 