In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the heart-disease dataset; the path is relative to the notebook's working directory.
heart_data = pd.read_csv(filepath_or_buffer='Data/heart-disease.csv')

In [4]:
# Peek at the first five rows to sanity-check the columns that were loaded.
heart_data.head(n=5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


Several performance metrics are commonly used to evaluate classification models in machine learning:
1. Accuracy
2. Area under ROC curve
3. Confusion Matrix
4. Classification report

### Accuracy

In [15]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

# Features are every column except the label; `target` is the label.
# NOTE(review): the head() output above lists an "Unnamed: 0" column. If that is a
# real CSV column (a saved index) rather than a display artifact, it ends up in X
# as a feature -- confirm and drop it if so.
X = heart_data.drop(columns="target")
y = heart_data["target"]

# Seed the forest so the cross-validated score is reproducible across kernel restarts.
clf = RandomForestClassifier(random_state=42)

# 5-fold cross-validation; with no `scoring` argument the classifier's own
# `score` method (accuracy) is used for each fold.
cross_values = cross_val_score(clf, X, y, cv=5)

In [16]:
# Average the per-fold scores into a single number.
cross_values.mean()

0.8183060109289617

In [17]:
# Report the mean cross-validated score as a percentage with two decimals.
print(f"Accuracy of the classifier based on the cross value score is {cross_values.mean() * 100:.2f}%")

Accuracy of the classifier based on the cross value score is 81.83%


### Area under curve

**It is also called the area under the receiver operating characteristic curve (AUC/ROC).**

* Area under curve (AUC)
* ROC curve

A ROC curve compares a model's true positive rate (TPR) with its false positive rate (FPR) at different classification thresholds.

1. True Positive : Predicted 1 when truth is 1
2. False Positive : Predicted 1 when truth is 0
3. True Negative : Predicted 0 when truth is 0
4. False Negative : Predicted 0 when truth is 1

In [19]:
# Fit a classifier on a train split and get the predicted class probabilities
# for the held-out test split.
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Seed the split so the same rows land in the test set on every run; without it
# the probabilities (and the ROC curve built from them) change after each restart.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Seed the forest as well, for the same reproducibility reason.
clf = RandomForestClassifier(random_state=42)

clf.fit(X_train, y_train)

# One row per test sample, one column per class (columns follow `clf.classes_`).
y_proba = clf.predict_proba(X_test)

In [21]:
# Show the class probabilities for the first ten test samples.
y_proba[:10]

array([[0.46, 0.54],
       [0.3 , 0.7 ],
       [0.07, 0.93],
       [0.83, 0.17],
       [0.21, 0.79],
       [0.94, 0.06],
       [0.26, 0.74],
       [0.52, 0.48],
       [0.18, 0.82],
       [0.28, 0.72]])

In [22]:
# Keep only the second column of y_proba for every row: the predicted probability
# of the positive class (assumes the labels are 0/1, so column 1 is class 1 -- TODO confirm).
positive_class_column = 1

positive_val = y_proba[:, positive_class_column]

In [23]:
# Show the positive-class probabilities for the first ten test samples.
positive_val[0:10]

array([0.54, 0.7 , 0.93, 0.17, 0.79, 0.06, 0.74, 0.48, 0.82, 0.72])

In [25]:
from sklearn.metrics import roc_curve

# Compute the false positive rates, true positive rates and the decision
# thresholds they correspond to, from the true labels and the positive-class scores.
# (The `threshhold` spelling is kept as-is; cells outside this excerpt may reference it.)
fpr, tpr, threshhold = roc_curve(y_true=y_test, y_score=positive_val)