email : febrianasulistyap@gmail.com

HEART FAILURE PREDICTION
-----
### Modeling with Classification (KNN, LogReg, SVM)

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import sklearn
import matplotlib.pyplot as plt

In [None]:
# defines dataframe
df = pd.read_csv('../input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv')

# displays the top 5 rows of dataframe
df.head()

In [None]:
# defines the numeric features in the dataset
df_num=df[['age','creatinine_phosphokinase','ejection_fraction','platelets','serum_creatinine','serum_sodium','time']]

In [None]:
# displays the top 5 rows of the numeric features
df_num.head()

In [None]:
# check the data type of each features in dataframe
df.info()

In [None]:
# displays statistical summary of numeric features
df_num.describe().transpose()

In [None]:
# displays the dimensions of the data
print('number of rows:', df.shape[0])
print('number of columns:', df.shape[1])

Data Preparation
------

In [None]:
# check missing value in each features
print('Checking missing data:')
df.isnull().any()

Exploratory Data Analysis
-------

In [None]:
# defines mean of the numerical features in each categorical features
round(df.groupby(['DEATH_EVENT','sex','anaemia','diabetes','high_blood_pressure','smoking'],as_index=False).mean(),2)

### Numeric Features

In [None]:
# defines death event as label and the numeric features in data
df_DE_and_num=df[['DEATH_EVENT','age','creatinine_phosphokinase','ejection_fraction','platelets','serum_creatinine','serum_sodium','time']]

In [None]:
# defines mean of the numerical features in each death event labels
df_group_DE = round(df_DE_and_num.groupby(['DEATH_EVENT'],as_index=False).mean(),2)
df_group_DE

In [None]:
# Bar charts of the mean of each numeric feature, one bar per DEATH_EVENT label

f, axes = plt.subplots(ncols=3, figsize=(26, 8))

# (column, plot title, vertical offset of the value label above its bar)
bar_specs = [
    ('age', 'Mean of Age in each category Death Event', .5),
    ('creatinine_phosphokinase', 'Mean of Creatinine Phosphokinase in each category Death Event', 2.5),
    ('ejection_fraction', 'Mean of Ejection Fraction in each category Death Event', .5),
]

for ax, (column, title, y_offset) in zip(axes, bar_specs):
    df_group_DE[column].plot(kind="bar", color=['darkcyan', 'grey'], ax=ax)
    ax.set_title(title, fontsize=16)
    ax.set_xlabel('Category Death Event', fontsize=14)
    # the bars show group means (df_group_DE holds .mean()), not counts
    ax.set_ylabel('Mean', fontsize=12)
    # write each bar's value just above it
    for bar in ax.patches:
        ax.text(bar.get_x() + .16, bar.get_height() + y_offset,
                str(bar.get_height()), fontsize=16, color='black')

plt.show()

In [None]:
# Bar charts of the mean of each numeric feature, one bar per DEATH_EVENT label

f, axes = plt.subplots(ncols=2, figsize=(26, 8))

# (column, plot title, horizontal offset, vertical offset of the value label)
bar_specs = [
    ('platelets', 'Mean of Platelets in each category Death Event', .12, 1.5),
    ('serum_creatinine', 'Mean of Serum Creatinine in each category Death Event', .16, .02),
]

for ax, (column, title, x_offset, y_offset) in zip(axes, bar_specs):
    df_group_DE[column].plot(kind="bar", color=['darkcyan', 'grey'], ax=ax)
    ax.set_title(title, fontsize=18)
    ax.set_xlabel('Category Death Event', fontsize=14)
    # the bars show group means (df_group_DE holds .mean()), not counts
    ax.set_ylabel('Mean', fontsize=12)
    # write each bar's value just above it
    for bar in ax.patches:
        ax.text(bar.get_x() + x_offset, bar.get_height() + y_offset,
                str(bar.get_height()), fontsize=16, color='black')

plt.show()

In [None]:
# Bar charts of the mean of each numeric feature, one bar per DEATH_EVENT label

f, axes = plt.subplots(ncols=2, figsize=(26, 8))

# (column, plot title) for each panel
bar_specs = [
    ('serum_sodium', 'Mean of Serum Sodium in each category Death Event'),
    ('time', 'Mean of Time in each category Death Event'),
]

for ax, (column, title) in zip(axes, bar_specs):
    df_group_DE[column].plot(kind="bar", color=['darkcyan', 'grey'], ax=ax)
    ax.set_title(title, fontsize=18)
    ax.set_xlabel('Category Death Event', fontsize=14)
    # the bars show group means (df_group_DE holds .mean()), not counts
    ax.set_ylabel('Mean', fontsize=12)
    # write each bar's value just above it
    for bar in ax.patches:
        ax.text(bar.get_x() + .16, bar.get_height() + 1.5,
                str(bar.get_height()), fontsize=16, color='black')

plt.show()

In [None]:
# Histograms (with a KDE overlay) of four of the numeric features

f, axes = plt.subplots(ncols=4, figsize=(24, 6))

# (column, plot title) for each panel, left to right
hist_specs = [
    ('age', 'Histogram of Age'),
    ('creatinine_phosphokinase', 'Histogram of Creatinine Phosphokinase'),
    ('ejection_fraction', 'Histogram of Ejection Fraction'),
    ('platelets', 'Histogram of Platelets'),
]

for ax, (column, title) in zip(axes, hist_specs):
    sns.histplot(x=column, color='darkred', kde=True, data=df, ax=ax)
    ax.set_title(title, fontsize=14)

plt.show()

In [None]:
# Histograms (with a KDE overlay) of the remaining numeric features

f, axes = plt.subplots(ncols=3, figsize=(24, 6))

# (column, plot title) for each panel, left to right
hist_specs = [
    ('serum_creatinine', 'Histogram of Serum Creatinine'),
    ('serum_sodium', 'Histogram of Serum Sodium'),
    ('time', 'Histogram of Time'),
]

for ax, (column, title) in zip(axes, hist_specs):
    sns.histplot(x=column, color='darkred', kde=True, data=df, ax=ax)
    ax.set_title(title, fontsize=14)

plt.show()

### Categorical Features

In [None]:
f, axes = plt.subplots(ncols=3, figsize=(24, 8))

# (column, plot title, x-axis label) for each categorical feature vs DEATH_EVENT
count_specs = [
    ('anaemia', 'Bar Plot Anaemia berdasarkan Death Event', 'Category Anaemia'),
    ('diabetes', 'Bar Plot Diabetes berdasarkan Death Event', 'Category Diabetes'),
    ('high_blood_pressure', 'Bar Plot High Blood Pressure berdasarkan Death Event',
     'Category High Blood Pressure'),
]

for ax, (column, title, xlabel) in zip(axes, count_specs):
    sns.countplot(x=column, hue='DEATH_EVENT', data=df, ax=ax)
    ax.set_title(title, fontsize=18)
    ax.set_xlabel(xlabel, fontsize=15)
    ax.set_ylabel('Count', fontsize=12)
    # sum of all bar heights in this panel, used as the percentage base
    total = sum(bar.get_height() for bar in ax.patches)
    # annotate every bar with its share of that total
    for bar in ax.patches:
        share = round((bar.get_height() / total) * 100, 2)
        ax.text(bar.get_x() + .1, bar.get_height() + .5,
                str(share) + '%', fontsize=14, color='black')

plt.show()

In [None]:
f, axes = plt.subplots(ncols=2, figsize=(24, 8))

# (column, plot title, x-axis label) for each categorical feature vs DEATH_EVENT
count_specs = [
    ('sex', 'Bar Plot Sex berdasarkan Death Event', 'Category Sex'),
    ('smoking', 'Bar Plot Smoking Habits berdasarkan Death Event', 'Category Smoking'),
]

for ax, (column, title, xlabel) in zip(axes, count_specs):
    sns.countplot(x=column, hue='DEATH_EVENT', data=df, ax=ax)
    ax.set_title(title, fontsize=18)
    ax.set_xlabel(xlabel, fontsize=15)
    ax.set_ylabel('Count', fontsize=12)
    # sum of all bar heights in this panel, used as the percentage base
    total = sum(bar.get_height() for bar in ax.patches)
    # annotate every bar with its share of that total
    for bar in ax.patches:
        share = round((bar.get_height() / total) * 100, 2)
        ax.text(bar.get_x() + .1, bar.get_height() + .5,
                str(share) + '%', fontsize=16, color='black')

plt.show()

Outlier Detection
-----

In [None]:
# Boxplot visualization for outlier detection: one panel per numeric feature

fig, axs = plt.subplots(ncols=4, nrows=2, figsize=(14, 8))
axs = axs.flatten()

for ax, column in zip(axs, df_num.columns):
    sns.boxplot(y=column, color='grey', data=df_num, ax=ax)

# the 2x4 grid can hold more panels than there are features; hide the spares
for ax in axs[len(df_num.columns):]:
    ax.set_visible(False)

# lay the figure out once, after every panel has been drawn
plt.tight_layout(pad=1, w_pad=2, h_pad=5.0)

In [None]:
# defines outlier
def detect_outliers(df, x):
    """Return the rows of ``df`` whose value in column ``x`` is an IQR outlier.

    A value counts as an outlier when it lies more than 1.5 * IQR below the
    first quartile or more than 1.5 * IQR above the third quartile.

    Args:
        df: DataFrame to inspect.
        x: Name of the numeric column to test.

    Returns:
        The subset of ``df`` (all columns, original index) whose ``x`` value
        falls outside the two fences; empty when there are no outliers.
    """
    q1, q3 = df[x].quantile([0.25, 0.75])
    iqr = q3 - q1
    too_low = df[x] < q1 - 1.5 * iqr
    too_high = df[x] > q3 + 1.5 * iqr
    # select from the frame that was passed in rather than a module-level global
    return df[too_low | too_high]

In [None]:
# displays data based on outliers in the Creatinine Phosphokinase
detect_outliers(df_num,'creatinine_phosphokinase')[['creatinine_phosphokinase']]

In [None]:
# displays data based on outliers in the Ejection Fraction
detect_outliers(df_num,'ejection_fraction')[['ejection_fraction']]

In [None]:
# displays data based on outliers in the Platelets
detect_outliers(df_num,'platelets')[['platelets']]

In [None]:
# displays data based on outliers in the Serum Creatinine
detect_outliers(df_num,'serum_creatinine')[['serum_creatinine']]

In [None]:
# displays data based on outliers in the Serum Sodium
detect_outliers(df_num,'serum_sodium')[['serum_sodium']]

There are various ways of dealing with outliers: imputing them, removing them, or keeping them. Deciding which step to take requires some research into the outliers and the dataset, and possibly some domain knowledge; in particular, we need to understand the range of values that is possible for each feature. After a little research, I found that all of the outlier values were within the possible range of values. So the step taken in this case is to keep those values in the dataset.

Analyzing relationships between variables
-----

#### Relationship between Numerical Features

In [None]:
# kendall correlation of each numerical feature and death event label
corr = df_DE_and_num.corr(method='kendall')
plt.figure(figsize=(10,10))
sns.heatmap(corr, vmin=-1, cmap='coolwarm', annot=True)
plt.show()

In [None]:
# numerical feature with higher correlation to death event
corr[abs(corr['DEATH_EVENT']) > 0.24]['DEATH_EVENT']

We can conclude that the numerical features with the highest correlation to the death event are ejection fraction, serum creatinine, and time.

#### Relationship between Categorical Features

In [None]:
from sklearn.feature_selection import chi2

In [None]:
df_cat=df[['sex','anaemia','diabetes','high_blood_pressure','smoking']]

In [None]:
df_cat.head()

In [None]:
# Chi-squared test of each categorical feature against DEATH_EVENT.
# The result is indexed as a pair further down: element [1] is used as the
# per-feature p-values. Element [0] is presumably the chi-squared statistics
# (not F scores, despite the variable name) - confirm against the sklearn docs.
f_score=chi2(df_cat,df[['DEATH_EVENT']])   # pair of arrays, one entry per column of df_cat
f_score

In [None]:
# printing p values for each categorical features
p_value = pd.Series(f_score[1],index=df_cat.columns)
p_value.sort_values(ascending=True,inplace=True)
p_value

In [None]:
p_value.plot(kind="bar")
plt.xlabel("Features",fontsize=12)
plt.ylabel("p-values",fontsize=12)
plt.title("Chi-squared test base on p-value", fontsize=15)
plt.show()

From the plot above, we can conclude that all categorical features in df_cat have a p-value > 0.05; hence none of the categorical features has a statistically significant association with the target variable (death event).

Normalization
---

In [None]:
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

# Keep the fitted instance under its own name so the imported StandardScaler
# class is not shadowed by an instance (which would break any later
# StandardScaler() call).
scaler = StandardScaler()
columns_to_scale=['age','creatinine_phosphokinase','ejection_fraction','platelets','serum_creatinine','serum_sodium','time']
# NOTE(review): the scaler is fitted on the full dataset here, before the
# train/test split further down, so test-set statistics influence the scaling.
# Consider fitting on the training split only.
df[columns_to_scale] = scaler.fit_transform(df[columns_to_scale])

In [None]:
# displays the top 5 rows of data after normalization
df.head()

Modeling
---

In [None]:
# defines X and y
X=df.drop(['DEATH_EVENT'], axis = 1)
y=df['DEATH_EVENT']

In [None]:
# defines Training set and Testing set
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=4)
print ('Train set:', X_train.shape,  y_train.shape)
print ('Test set:', X_test.shape,  y_test.shape)

KNN (K Nearest Neighbor)
----
(with all features)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

# Try every k in 1..Ks-1 and record, per k, the train accuracy, the test
# accuracy, and the standard error of the test accuracy.
Ks = 10
mean_acc_train = np.zeros(Ks - 1)
mean_acc_test = np.zeros(Ks - 1)
std_acc = np.zeros(Ks - 1)
ConfustionMx = []
for n in range(1, Ks):

    # fit a classifier with n neighbours and predict on both splits
    neigh = KNeighborsClassifier(n_neighbors=n)
    neigh = neigh.fit(X_train, y_train)
    yhat_train = neigh.predict(X_train)
    yhat_test = neigh.predict(X_test)

    # results for k = n live at position n-1
    mean_acc_train[n - 1] = metrics.accuracy_score(y_train, yhat_train)
    mean_acc_test[n - 1] = metrics.accuracy_score(y_test, yhat_test)
    # std of the per-sample hit/miss indicator, divided by sqrt(test size)
    std_acc[n - 1] = np.std(yhat_test == y_test) / np.sqrt(yhat_test.shape[0])

print("Akurasi Training", np.round(mean_acc_train, 3))
print("Akurasi Testing", np.round(mean_acc_test, 3))

In [None]:
# Test accuracy for each k, with a shaded band of one std_acc either side
plt.plot(range(1,Ks),mean_acc_test,'g')
plt.fill_between(range(1,Ks),mean_acc_test - 1 * std_acc,mean_acc_test + 1 * std_acc, alpha=0.10)
# the band drawn above is +/- 1 x std_acc, so the legend must say so
plt.legend(('Accuracy ', '+/- 1xstd'))
plt.ylabel('Accuracy ')
plt.xlabel('Number of Nabors (K)')
plt.tight_layout()
plt.show()

In [None]:
print( "The best accuracy value is", np.round(mean_acc_test.max(),3), "with k=", mean_acc_test.argmax()+1) 

In [None]:
# use the best k
k = 6

# Refit with the chosen k: after the search loop `neigh` is the model from the
# loop's final iteration (k = Ks-1), not the model for the selected k.
neigh = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)

# predict
yhat = neigh.predict(X_test)

# confusion matrix
from sklearn.metrics import confusion_matrix
cm_knn = confusion_matrix(y_test,yhat)

f, ax = plt.subplots(figsize=(6,4))
sns.heatmap(cm_knn, annot = True, fmt='.0f', ax = ax)
plt.title('Confusion Matrix KNN (with all features)')
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()

In [None]:
from sklearn.metrics import classification_report

print(classification_report(y_test, yhat))

With all the features, we get the KNN model with an accuracy of 0.75

Logistic Regression
---
(with all features)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

LR = LogisticRegression(C=0.01, solver='liblinear').fit(X_train,y_train)
# predict
yhat_test = LR.predict(X_test)

In [None]:
# probability of the predict
yhat_prob = LR.predict_proba(X_test)
yhat_prob

In [None]:
# matrix confusion
cm_logreg = confusion_matrix(y_test,yhat_test)

f, ax = plt.subplots(figsize=(6,4))
sns.heatmap(cm_logreg, annot = True, fmt='.0f', ax = ax)
plt.title('Confusion Matrix Logistic Regression (with all features)')
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()

In [None]:
from sklearn.metrics import accuracy_score

# Training predictions must come from the logistic-regression model itself;
# the global `yhat_train` still holds the predictions of the last KNN model
# fitted in the k-search loop.
yhat_train_lr = LR.predict(X_train)

mylist = []
ac_train = accuracy_score(y_train, yhat_train_lr)
ac_test = accuracy_score(y_test, yhat_test)
mylist.append(ac_train)
mylist.append(ac_test)
print('Accuracy training:', np.round(ac_train,3))
print('Accuracy testing:', np.round(ac_test,3))

In [None]:
print(classification_report(y_test, yhat_test))

With all the features, we get the Logistic Regression model with an accuracy of 0.87

SVM (Support Vector Machine)
---
(with all features)

In [None]:
from sklearn import svm

clf = svm.SVC(kernel='rbf')
clf.fit(X_train, y_train)
# predict
yhat_test = clf.predict(X_test)

In [None]:
# matrix confusion
cm_svm = confusion_matrix(y_test,yhat_test)

f, ax = plt.subplots(figsize=(6,4))
sns.heatmap(cm_svm, annot = True, fmt='.0f', ax = ax)
plt.title('Confusion Matrix SVM (with all features)')
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()

In [None]:
from sklearn.metrics import accuracy_score

# Training predictions must come from the SVM itself; the global `yhat_train`
# still holds the predictions of the last KNN model fitted in the k-search loop.
yhat_train_svm = clf.predict(X_train)

mylist = []
ac_train = accuracy_score(y_train, yhat_train_svm)
ac_test = accuracy_score(y_test, yhat_test)
mylist.append(ac_train)
mylist.append(ac_test)
print('Accuracy training:', np.round(ac_train,3))
print('Accuracy testing:', np.round(ac_test,3))

In [None]:
print(classification_report(y_test, yhat_test))

With all the features, we get the SVM model with an accuracy of 0.83

Modeling with Feature Selection
----

In [None]:
# defines X1

X1=df[['ejection_fraction', 'serum_creatinine', 'time']]

In [None]:
# defines Training set and Testing set

X_train1, X_test1, y_train1, y_test1 = train_test_split( X1, y, test_size=0.2, random_state=4)
print ('Train set:', X_train1.shape,  y_train1.shape)
print ('Test set:', X_test1.shape,  y_test1.shape)

KNN (K Nearest Neighbor)
---
(with feature selection)

In [None]:
# Try every k in 1..Ks-1 on the selected features and record, per k, the train
# accuracy, the test accuracy, and the standard error of the test accuracy.
Ks = 10
mean_acc_train1 = np.zeros(Ks - 1)
mean_acc_test1 = np.zeros(Ks - 1)
std_acc1 = np.zeros(Ks - 1)
ConfustionMx1 = []
for n in range(1, Ks):

    # fit a classifier with n neighbours and predict on both splits
    neigh1 = KNeighborsClassifier(n_neighbors=n)
    neigh1 = neigh1.fit(X_train1, y_train1)
    yhat_train1 = neigh1.predict(X_train1)
    yhat_test1 = neigh1.predict(X_test1)

    # results for k = n live at position n-1
    mean_acc_train1[n - 1] = metrics.accuracy_score(y_train1, yhat_train1)
    mean_acc_test1[n - 1] = metrics.accuracy_score(y_test1, yhat_test1)
    # std of the per-sample hit/miss indicator, divided by sqrt(test size)
    std_acc1[n - 1] = np.std(yhat_test1 == y_test1) / np.sqrt(yhat_test1.shape[0])

print("Akurasi Training", np.round(mean_acc_train1, 3))
print("Akurasi Testing", np.round(mean_acc_test1, 3))

In [None]:
# Test accuracy for each k, with a shaded band of one std_acc1 either side
plt.plot(range(1,Ks),mean_acc_test1,'g')
plt.fill_between(range(1,Ks),mean_acc_test1 - 1 * std_acc1,mean_acc_test1 + 1 * std_acc1, alpha=0.10)
# the band drawn above is +/- 1 x std_acc1, so the legend must say so
plt.legend(('Accuracy ', '+/- 1xstd'))
plt.ylabel('Accuracy ')
plt.xlabel('Number of Nabors (K)')
plt.tight_layout()
plt.show()

In [None]:
print("The best accuracy value is", np.round(mean_acc_test1.max(),3), "with k=", mean_acc_test1.argmax()+1) 

In [None]:
# use the best k
k = 3

# Refit with the chosen k: after the search loop `neigh1` is the model from the
# loop's final iteration (k = Ks-1), not the model for the selected k.
neigh1 = KNeighborsClassifier(n_neighbors=k).fit(X_train1, y_train1)

# Predict
yhat1 = neigh1.predict(X_test1)

# confusion matrix
cm_knn1 = confusion_matrix(y_test1,yhat1)

f, ax = plt.subplots(figsize=(6,4))
sns.heatmap(cm_knn1, annot = True, fmt='.0f', ax = ax)
plt.title('Confusion Matrix KNN (with features selection)')
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()

In [None]:
print(classification_report(y_test1, yhat1))

With features ejection fraction, serum creatinine, and time, we get the KNN model with an accuracy of 0.82

Logistic Regression
---
(with features selection)

In [None]:
LR = LogisticRegression(C=0.01, solver='liblinear').fit(X_train1,y_train1)
# predict
yhat_test1 = LR.predict(X_test1)

In [None]:
# probability of the predict
yhat1_prob = LR.predict_proba(X_test1)
yhat1_prob

In [None]:
# matrix confusion
cm_logreg1 = confusion_matrix(y_test1,yhat_test1)

f, ax = plt.subplots(figsize=(6,4))
sns.heatmap(cm_logreg1, annot = True, fmt='.0f', ax = ax)
plt.title('Confusion Matrix Logistic Regression (with features selection)')
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()

In [None]:
# Training predictions must come from the logistic-regression model itself;
# the global `yhat_train1` still holds the predictions of the last KNN model
# fitted in the k-search loop.
yhat_train1_lr = LR.predict(X_train1)

mylist = []
ac_train = accuracy_score(y_train1, yhat_train1_lr)
ac_test = accuracy_score(y_test1, yhat_test1)
mylist.append(ac_train)
mylist.append(ac_test)
print('Accuracy training:', np.round(ac_train,3))
print('Accuracy testing:', np.round(ac_test,3))

In [None]:
print(classification_report(y_test1, yhat_test1))

With features ejection fraction, serum creatinine, and time, we get the Logistic Regression model with an accuracy of 0.90

SVM (Support Vector Machine)
----
(with feature selection)

In [None]:
clf = svm.SVC(kernel='rbf')
clf.fit(X_train1, y_train1)
# predict
yhat_test1 = clf.predict(X_test1)

In [None]:
# matrix confusion
cm_svm1 = confusion_matrix(y_test1,yhat_test1)

f, ax = plt.subplots(figsize=(6,4))
sns.heatmap(cm_svm1, annot = True, fmt='.0f', ax = ax)
plt.title('Confusion Matrix SVM (with features selection)')
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()

In [None]:
# Training predictions must come from the SVM itself; the global `yhat_train1`
# still holds the predictions of the last KNN model fitted in the k-search loop.
yhat_train1_svm = clf.predict(X_train1)

mylist = []
ac_train = accuracy_score(y_train1, yhat_train1_svm)
ac_test = accuracy_score(y_test1, yhat_test1)
mylist.append(ac_train)
mylist.append(ac_test)
print('Accuracy training:', np.round(ac_train,3))
print('Accuracy testing:', np.round(ac_test,3))

In [None]:
print(classification_report(y_test1, yhat_test1))

With features ejection fraction, serum creatinine, and time, we get the SVM model with an accuracy of 0.85

##### So, it can be concluded that the classification models built with feature selection achieve higher accuracy. In this dataset, the best model for predicting heart failure is Logistic Regression, which has an accuracy of 90%. Feature selection was applied based on the Kendall correlation, and the model was built using three features: ejection fraction, serum creatinine, and time.