### Import libraries

In [30]:
import pandas as pd
import numpy as np

### Read CSV file

In [31]:
data = pd.read_csv("heart-disease.csv")
data.head(10)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
5,57,1,0,140,192,0,1,148,0,0.4,1,0,1,1
6,56,0,1,140,294,0,0,153,0,1.3,1,0,2,1
7,44,1,1,120,263,0,1,173,0,0.0,2,0,3,1
8,52,1,2,172,199,1,1,162,0,0.5,2,0,3,1
9,57,1,2,150,168,0,1,174,0,1.6,2,0,2,1


### Data Info

1. Check info 

In [32]:
# Info of data
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    int64  
 12  thal      303 non-null    int64  
 13  target    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


2. Check NaN

In [33]:
# Check NaN data
data.isna().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

### Split data

In [48]:
# Import libraries
from sklearn.model_selection import train_test_split

# Set random seed
np.random.seed(42)

# Create X and y
X = data.drop("target", axis=1)
y = data["target"]

# Split data to train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Shape of splitted data
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((242, 13), (61, 13), (242,), (61,))

### Choose a model and fit it

In [47]:
# import libraries
from sklearn.ensemble import RandomForestClassifier

# Set random seed
np.random.seed(42)

# Create an object of model
clf = RandomForestClassifier()

# Fit model
clf.fit(X_train, y_train)

### Evaluating model

1. Score function(return accuracy score)

In [49]:
clf.score(X_test, y_test)

0.9344262295081968

2. Accuracy score, precision score, recall score and f1 score

In [50]:
# Import libraries
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Create a function to calculate all metrics
def evaluating_model(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1_scr = f1_score(y_true, y_pred)

    print(f'accuracy score: {round(accuracy, 2)}')
    print(f'precision score: {round(precision, 2)}')
    print(f'recall score: {round(recall, 2)}')
    print(f'f1 score: {round(f1_scr, 2)}')

    return {
        "accuracy": accuracy, 
        "precision": precision,
        "recall": recall,
        "f1-score": f1_scr
    }

3. Prediction and evaluating by above function

In [51]:
# Predict test data
y_preds = clf.predict(X_test)

# evaluate model
evaluating_model(y_preds, y_test)

accuracy score: 0.93
precision score: 0.91
recall score: 0.97
f1 score: 0.94


{'accuracy': 0.9344262295081968,
 'precision': 0.90625,
 'recall': 0.9666666666666667,
 'f1-score': 0.9354838709677419}

### Save and load model

1. Import library and save model

In [52]:
# Import libraries
from joblib import dump, load

# Save object
dump(clf, filename="clf_heart_disease.joblib")

['clf_heart_disease.joblib']

2. Load model

In [54]:
# Load object
loaded_clf_object = load(filename="clf_heart_disease.joblib")

0.5081967213114754