In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler


In [2]:
## import data
set_1_path = 'data/heart.csv'
heart_df = pd.read_csv(set_1_path)
heart_df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [3]:
# there are four different chest pain types
heart_df['ChestPainType'].value_counts()

ASY    496
NAP    203
ATA    173
TA      46
Name: ChestPainType, dtype: int64

In [4]:
# three different resting ecg responses
heart_df['RestingECG'].value_counts()

Normal    552
LVH       188
ST        178
Name: RestingECG, dtype: int64

In [5]:
# three st_slope values
heart_df['ST_Slope'].value_counts()

Flat    460
Up      395
Down     63
Name: ST_Slope, dtype: int64

In [6]:
# change "ExerciseAngina" to 1 or 0 for y/n
# same for "Sex" column
heart_df.loc[heart_df['ExerciseAngina'] == 'Y', 'ExerciseAngina'] = 1
heart_df.loc[heart_df['ExerciseAngina'] == 'N', 'ExerciseAngina'] = 0
heart_df.loc[heart_df['Sex'] == 'F', 'Sex'] = 1
heart_df.loc[heart_df['Sex'] == 'M', 'Sex'] = 0
heart_df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,0,ATA,140,289,0,Normal,172,0,0.0,Up,0
1,49,1,NAP,160,180,0,Normal,156,0,1.0,Flat,1
2,37,0,ATA,130,283,0,ST,98,0,0.0,Up,0
3,48,1,ASY,138,214,0,Normal,108,1,1.5,Flat,1
4,54,0,NAP,150,195,0,Normal,122,0,0.0,Up,0


In [7]:
# rename some headers
heart_df = heart_df.rename(columns={'Sex': 'Female?'})
heart_df.head()

Unnamed: 0,Age,Female?,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,0,ATA,140,289,0,Normal,172,0,0.0,Up,0
1,49,1,NAP,160,180,0,Normal,156,0,1.0,Flat,1
2,37,0,ATA,130,283,0,ST,98,0,0.0,Up,0
3,48,1,ASY,138,214,0,Normal,108,1,1.5,Flat,1
4,54,0,NAP,150,195,0,Normal,122,0,0.0,Up,0


In [8]:
# Convert categorical data to numeric & separate  feature for testing data
heart_df=pd.get_dummies(heart_df)
heart_df

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,Female?_0,Female?_1,ChestPainType_ASY,...,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ExerciseAngina_0,ExerciseAngina_1,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,40,140,289,0,172,0.0,0,1,0,0,...,0,0,0,1,0,1,0,0,0,1
1,49,160,180,0,156,1.0,1,0,1,0,...,1,0,0,1,0,1,0,0,1,0
2,37,130,283,0,98,0.0,0,1,0,0,...,0,0,0,0,1,1,0,0,0,1
3,48,138,214,0,108,1.5,1,0,1,1,...,0,0,0,1,0,0,1,0,1,0
4,54,150,195,0,122,0.0,0,1,0,0,...,1,0,0,1,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,110,264,0,132,1.2,1,1,0,0,...,0,1,0,1,0,1,0,0,1,0
914,68,144,193,1,141,3.4,1,1,0,1,...,0,0,0,1,0,1,0,0,1,0
915,57,130,131,0,115,1.2,1,1,0,1,...,0,0,0,1,0,0,1,0,1,0
916,57,130,236,0,174,0.0,1,0,1,0,...,0,0,1,0,0,1,0,0,1,0


In [9]:
X = heart_df.drop('HeartDisease', axis=1)
y = heart_df['HeartDisease']

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [11]:
# Apply standard scaling:

X_scaler = StandardScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)



In [12]:
# View scaled data:
X_train_scaled_df = pd.DataFrame(X_train_scaled)
X_train_scaled_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,-1.515055,-0.751145,0.20027,-0.559517,0.098938,0.288846,0.525853,-0.525853,0.928948,-0.476891,-0.536395,-0.240077,-0.511046,0.815107,-0.487621,0.82907,-0.82907,-0.284608,1.019259,-0.876741
1,-1.199012,0.83819,0.429253,-0.559517,1.610211,-0.089074,0.525853,-0.525853,-1.076487,-0.476891,-0.536395,4.165333,1.95677,-1.226833,-0.487621,0.82907,-0.82907,-0.284608,-0.981105,1.140588
2,0.275856,-0.645189,0.74983,-0.559517,0.496642,0.099886,-1.901671,1.901671,-1.076487,2.096917,-0.536395,-0.240077,-0.511046,0.815107,-0.487621,0.82907,-0.82907,-0.284608,1.019259,-0.876741
3,0.802595,0.308412,-0.285175,-0.559517,-0.497617,2.556362,0.525853,-0.525853,0.928948,-0.476891,-0.536395,-0.240077,1.95677,-1.226833,-0.487621,-1.206171,1.206171,-0.284608,1.019259,-0.876741
4,0.697247,0.414368,0.768149,-0.559517,-0.776009,0.572285,0.525853,-0.525853,0.928948,-0.476891,-0.536395,-0.240077,-0.511046,-1.226833,2.050774,-1.206171,1.206171,-0.284608,1.019259,-0.876741


In [13]:
# Train a Random Forest Classifier model and print the model score
rfc= RandomForestClassifier()
rfc.fit(X_train,y_train)
rfc.score(X_test,y_test)

0.9130434782608695

In [14]:
rfc.fit(X_train, y_train)

RandomForestClassifier()

In [15]:
# Display the confusion matrix:
from sklearn.metrics import confusion_matrix
y_true = y_test
y_pred = rfc.predict(X_test)
confusion_matrix(y_true, y_pred)

array([[ 67,  10],
       [  6, 101]], dtype=int64)

In [16]:
TN,FP,FN,TP = confusion_matrix(y_true, y_pred).ravel()
Accuracy = (TP + TN) / (TP + FP + TN + FN) # (98+54)/(98+19+54+13)
print(F"Accuracy:{Accuracy}")

Accuracy:0.9130434782608695


In [17]:
TN,FP,FN,TP = confusion_matrix(y_true, y_pred).ravel()
Sensitivity = TP/(TP + FN) # (98)/(98+13)
print(F"Sensitivity:{Sensitivity}")


Sensitivity:0.9439252336448598


In [18]:
TN,FP,FN,TP = confusion_matrix(y_true, y_pred).ravel()
Specificity = TN/(FP + TN) # (54)/(19+54)
print(F"Specificity:{Specificity}")


Specificity:0.8701298701298701


In [19]:
Sensitivity = TP/(TP + FN) 

Specificity = TN/(FP + TN)

Accuracy = (TP + TN) / (TP + FP + TN + FN) 