In [27]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [28]:
df = pd.read_csv('heart_failure_clinical_records_dataset.csv')
print('Shape:', df.shape)
df

Shape: (299, 13)


Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.00,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.00,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.00,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.00,2.7,116,0,0,8,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
294,62.0,0,61,1,38,1,155000.00,1.1,143,1,1,270,0
295,55.0,0,1820,0,38,0,270000.00,1.2,139,0,0,271,0
296,45.0,0,2060,1,60,0,742000.00,0.8,138,0,0,278,0
297,45.0,0,2413,0,38,0,140000.00,1.4,140,1,1,280,0


In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   age                       299 non-null    float64
 1   anaemia                   299 non-null    int64  
 2   creatinine_phosphokinase  299 non-null    int64  
 3   diabetes                  299 non-null    int64  
 4   ejection_fraction         299 non-null    int64  
 5   high_blood_pressure       299 non-null    int64  
 6   platelets                 299 non-null    float64
 7   serum_creatinine          299 non-null    float64
 8   serum_sodium              299 non-null    int64  
 9   sex                       299 non-null    int64  
 10  smoking                   299 non-null    int64  
 11  time                      299 non-null    int64  
 12  DEATH_EVENT               299 non-null    int64  
dtypes: float64(3), int64(10)
memory usage: 30.5 KB


In [30]:
df['age'] = df['age'].astype(int)

In [31]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [32]:
X = df.drop(columns= ['time','DEATH_EVENT'])
y = df['DEATH_EVENT']

In [33]:
seed = 42
test_size = 0.2
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= test_size, random_state= seed)

In [34]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [42]:
models = {
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'K-Nearest Neighbors': KNeighborsClassifier()
}

kfolds = 8
skf = StratifiedKFold(n_splits=kfolds, shuffle=True, random_state=seed)
for name, model in models.items():
    np.random.seed(seed)
    scores = cross_val_score(model, X_train_scaled, y_train, cv=skf)
    print(f'Model Name: {name}')
    print(f'Cross Validation Accuracy Mean: {scores.mean()}')
    print(f'Cross Validation Accuracy Standard Deviation: {scores.std()}\n')

Model Name: Decision Tree
Cross Validation Accuracy Mean: 0.6821839080459771
Cross Validation Accuracy Standard Deviation: 0.03170830856122695

Model Name: Random Forest
Cross Validation Accuracy Mean: 0.7616379310344827
Cross Validation Accuracy Standard Deviation: 0.0447941870125755

Model Name: K-Nearest Neighbors
Cross Validation Accuracy Mean: 0.6695402298850575
Cross Validation Accuracy Standard Deviation: 0.05325015809148736



In [43]:
for name, model in models.items():
    np.random.seed(seed)
    model.fit(X_train_scaled, y_train)

for name, model in models.items():
    y_pred = model.predict(X_test_scaled)
    print(f'Model Name: {name}')
    print(f'Accuracy Score: {accuracy_score(y_test, y_pred):.3f}\n')

Model Name: Decision Tree
Accuracy Score: 0.717

Model Name: Random Forest
Accuracy Score: 0.733

Model Name: K-Nearest Neighbors
Accuracy Score: 0.667



In [53]:
importances = RandomForestClassifier().fit(X_train_scaled, y_train).feature_importances_
feature_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': importances})
feature_df = feature_df.sort_values(by='Importance', ascending=False)
feature_df

Unnamed: 0,Feature,Importance
7,serum_creatinine,0.231418
4,ejection_fraction,0.182146
0,age,0.155047
2,creatinine_phosphokinase,0.122145
6,platelets,0.113525
8,serum_sodium,0.100351
5,high_blood_pressure,0.021128
1,anaemia,0.019027
10,smoking,0.018812
9,sex,0.018621
