In [197]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

In [108]:
# Read the dataset from the relative path 'heart.csv' and echo the frame so
# the raw columns (features plus the HeartDisease label) can be inspected.
df = pd.read_csv(filepath_or_buffer='heart.csv')
df

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


In [109]:
# Separate the label from the predictors: `y` is the HeartDisease column,
# `x` is every remaining column of `df` (`df` itself is left unmodified).
y = df['HeartDisease']
x = df.drop(columns=['HeartDisease'])
x

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up
...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat


In [110]:
# Hold out 20% of the rows as a test set. The split is shuffled, stratified on
# `y`, and seeded with random_state=1 so that it is repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
    shuffle=True,
    stratify=y,
)
x_test

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
417,44,M,ASY,130,209,0,ST,127,N,0.0,Up
395,38,M,ASY,135,0,1,Normal,150,N,0.0,Flat
901,58,F,ASY,170,225,1,LVH,146,Y,2.8,Flat
624,63,F,ASY,150,407,0,LVH,154,N,4.0,Flat
414,54,M,ASY,130,0,1,Normal,110,Y,3.0,Flat
...,...,...,...,...,...,...,...,...,...,...,...
684,47,M,NAP,108,243,0,Normal,152,N,0.0,Up
673,59,F,ASY,174,249,0,Normal,143,Y,0.0,Flat
349,36,M,ASY,110,0,1,Normal,125,Y,1.0,Flat
848,52,M,ASY,128,255,0,Normal,161,Y,0.0,Up


In [111]:
# Fixed column layout. Later cells slice by position (`iloc[:, :6]` and
# `iloc[:, 6:]`), so the six numeric columns come first and the five
# categorical (object-typed) columns come last.
order = [
    'Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak', 'FastingBS',
    'Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope',
]

# Apply the same ordering to both splits.
x_train = x_train.loc[:, order]
x_test = x_test.loc[:, order]
x_train

Unnamed: 0,Age,RestingBP,Cholesterol,MaxHR,Oldpeak,FastingBS,Sex,ChestPainType,RestingECG,ExerciseAngina,ST_Slope
209,54,125,216,140,0.0,0,M,ASY,Normal,N,Flat
656,62,124,209,163,0.0,0,F,ASY,Normal,N,Up
451,64,144,0,122,1.0,0,M,ASY,ST,Y,Flat
294,32,95,0,127,0.7,1,M,TA,Normal,N,Up
545,48,132,272,139,0.2,0,M,ASY,ST,N,Up
...,...,...,...,...,...,...,...,...,...,...,...
23,44,150,288,150,3.0,0,M,ATA,Normal,Y,Flat
61,43,150,254,175,0.0,0,F,NAP,Normal,N,Up
477,61,110,0,108,2.0,1,M,ASY,Normal,Y,Down
871,61,150,243,137,1.0,1,M,NAP,Normal,Y,Flat


In [112]:
# Show the dtype of every training column; the object-typed columns are the
# ones that get one-hot encoded in the next step.
x_train.dtypes

Age                 int64
RestingBP           int64
Cholesterol         int64
MaxHR               int64
Oldpeak           float64
FastingBS           int64
Sex                object
ChestPainType      object
RestingECG         object
ExerciseAngina     object
ST_Slope           object
dtype: object

In [113]:
# One-hot encode the categorical columns (position 6 onward in the reordered
# frames) as 0/1 integers.
x_train_onehot = pd.get_dummies(x_train.iloc[:, 6:], dtype=int)

# The test split is encoded separately, so a category that is absent from one
# split would otherwise give the two frames different columns. Align the test
# dummies to the training columns: categories missing from the test split are
# filled with 0, and categories never seen in training are dropped.
x_test_onehot = (
    pd.get_dummies(x_test.iloc[:, 6:], dtype=int)
    .reindex(columns=x_train_onehot.columns, fill_value=0)
)
x_train_onehot

Unnamed: 0,Sex_F,Sex_M,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ExerciseAngina_N,ExerciseAngina_Y,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
209,0,1,1,0,0,0,0,1,0,1,0,0,1,0
656,1,0,1,0,0,0,0,1,0,1,0,0,0,1
451,0,1,1,0,0,0,0,0,1,0,1,0,1,0
294,0,1,0,0,0,1,0,1,0,1,0,0,0,1
545,0,1,1,0,0,0,0,0,1,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23,0,1,0,1,0,0,0,1,0,0,1,0,1,0
61,1,0,0,0,1,0,0,1,0,1,0,0,0,1
477,0,1,1,0,0,0,0,1,0,0,1,1,0,0
871,0,1,0,0,1,0,0,1,0,0,1,0,1,0


In [114]:
# Remove one indicator column per categorical feature from both splits,
# leaving the remaining levels as the encoded representation.
drop_cols = [
    'Sex_F',
    'ChestPainType_ASY',
    'RestingECG_LVH',
    'ExerciseAngina_N',
    'ST_Slope_Down',
]
x_train_onehot = x_train_onehot.drop(columns=drop_cols)
x_test_onehot = x_test_onehot.drop(columns=drop_cols)

In [115]:
# Display the training indicator columns that remain after the drop.
x_train_onehot

Unnamed: 0,Sex_M,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_Normal,RestingECG_ST,ExerciseAngina_Y,ST_Slope_Flat,ST_Slope_Up
209,1,0,0,0,1,0,0,1,0
656,0,0,0,0,1,0,0,0,1
451,1,0,0,0,0,1,1,1,0
294,1,0,0,1,1,0,0,0,1
545,1,0,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...
23,1,1,0,0,1,0,1,1,0
61,0,0,1,0,1,0,0,0,1
477,1,0,0,0,1,0,1,0,0
871,1,0,1,0,1,0,1,1,0


In [118]:
# Display the test indicator columns; they should match the training columns.
x_test_onehot

Unnamed: 0,Sex_M,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_Normal,RestingECG_ST,ExerciseAngina_Y,ST_Slope_Flat,ST_Slope_Up
417,1,0,0,0,0,1,0,0,1
395,1,0,0,0,1,0,0,1,0
901,0,0,0,0,0,0,1,1,0
624,0,0,0,0,0,0,0,1,0
414,1,0,0,0,1,0,1,1,0
...,...,...,...,...,...,...,...,...,...
684,1,0,1,0,1,0,0,0,1
673,0,0,0,0,1,0,1,1,0
349,1,0,0,0,1,0,1,1,0
848,1,0,0,0,1,0,1,0,1


In [117]:
# Assemble the training design matrix: the first six (numeric) columns placed
# side by side with the one-hot indicator columns, matched on the row index.
x_train_final = pd.concat(
    objs=[x_train.iloc[:, :6], x_train_onehot],
    axis='columns',
)
x_train_final

Unnamed: 0,Age,RestingBP,Cholesterol,MaxHR,Oldpeak,FastingBS,Sex_M,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_Normal,RestingECG_ST,ExerciseAngina_Y,ST_Slope_Flat,ST_Slope_Up
209,54,125,216,140,0.0,0,1,0,0,0,1,0,0,1,0
656,62,124,209,163,0.0,0,0,0,0,0,1,0,0,0,1
451,64,144,0,122,1.0,0,1,0,0,0,0,1,1,1,0
294,32,95,0,127,0.7,1,1,0,0,1,1,0,0,0,1
545,48,132,272,139,0.2,0,1,0,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23,44,150,288,150,3.0,0,1,1,0,0,1,0,1,1,0
61,43,150,254,175,0.0,0,0,0,1,0,1,0,0,0,1
477,61,110,0,108,2.0,1,1,0,0,0,1,0,1,0,0
871,61,150,243,137,1.0,1,1,0,1,0,1,0,1,1,0


In [119]:
# Assemble the test design matrix the same way as the training one: the first
# six (numeric) columns followed by the one-hot indicator columns.
x_test_final = pd.concat(
    objs=[x_test.iloc[:, :6], x_test_onehot],
    axis='columns',
)
x_test_final

Unnamed: 0,Age,RestingBP,Cholesterol,MaxHR,Oldpeak,FastingBS,Sex_M,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_Normal,RestingECG_ST,ExerciseAngina_Y,ST_Slope_Flat,ST_Slope_Up
417,44,130,209,127,0.0,0,1,0,0,0,0,1,0,0,1
395,38,135,0,150,0.0,1,1,0,0,0,1,0,0,1,0
901,58,170,225,146,2.8,1,0,0,0,0,0,0,1,1,0
624,63,150,407,154,4.0,0,0,0,0,0,0,0,0,1,0
414,54,130,0,110,3.0,1,1,0,0,0,1,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
684,47,108,243,152,0.0,0,1,0,1,0,1,0,0,0,1
673,59,174,249,143,0.0,0,0,0,0,0,1,0,1,1,0
349,36,110,0,125,1.0,1,1,0,0,0,1,0,1,1,0
848,52,128,255,161,0.0,0,1,0,0,0,1,0,1,0,1


In [121]:
# Convert the feature frames and label series to plain NumPy arrays for the
# estimators below.
#
# np.asarray is used rather than .to_numpy() because this cell rebinds the same
# names: on a second run the inputs are already ndarrays, which have no
# .to_numpy() method. np.asarray returns an ndarray unchanged, so the cell can
# be re-run safely.
x_train_final = np.asarray(x_train_final)
x_test_final = np.asarray(x_test_final)
y_train, y_test = np.asarray(y_train), np.asarray(y_test)

In [134]:
# Random forest baseline trained on the unscaled feature matrix.
rf = RandomForestClassifier(
    n_estimators=100,
    max_features=0.5,  # float => fraction of the features considered per split
    random_state=1,    # fixed seed for repeatable trees
)
rf.fit(x_train_final, y_train)

In [135]:
# Accuracy of the random forest: training split first, then the held-out split.
print(rf.score(x_train_final, y_train), rf.score(x_test_final, y_test), sep='\n')

1.0
0.8586956521739131


In [138]:
# Min-max scale the first five feature columns. The scaler is fitted on the
# training rows only and the same fitted transform is applied to both splits.
# Note: despite the `_std` suffix, these hold min-max scaled values, not
# standardised (zero-mean/unit-variance) ones.
scaler = MinMaxScaler().fit(x_train_final[:, :5])
train_std = scaler.transform(x_train_final[:, :5])
test_std = scaler.transform(x_test_final[:, :5])

In [140]:
# Rebuild the full feature matrices: the five scaled columns followed by the
# remaining columns (index 5 onward) copied through unchanged.
x_train_std = np.hstack((train_std, x_train_final[:, 5:]))
x_test_std = np.hstack((test_std, x_test_final[:, 5:]))

In [151]:
# Logistic regression on the scaled features.
lr = LogisticRegression(C=50, random_state=1)
lr.fit(x_train_std, y_train)

# Accuracy on the training split, then on the held-out split.
print(lr.score(x_train_std, y_train), lr.score(x_test_std, y_test), sep='\n')

0.8801089918256131
0.8532608695652174


In [158]:
# k-nearest-neighbours (k=5) on the scaled features.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(x_train_std, y_train)

# Accuracy on the training split, then on the held-out split.
print(knn.score(x_train_std, y_train), knn.score(x_test_std, y_test), sep='\n')

0.8814713896457765
0.8586956521739131


In [172]:
# AdaBoost over depth-1 decision trees (stumps), trained on the unscaled
# feature matrix.
# NOTE(review): algorithm='SAMME.R' is deprecated in recent scikit-learn
# releases -- confirm against the installed version before upgrading.
adb = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1),
    n_estimators=100,
    learning_rate=0.1,
    algorithm='SAMME.R',
    random_state=1,
)
adb.fit(x_train_final, y_train)

# Accuracy on the training split, then on the held-out split.
print(adb.score(x_train_final, y_train), adb.score(x_test_final, y_test), sep='\n')

0.888283378746594
0.842391304347826


In [176]:
# Gradient-boosted trees trained on the unscaled feature matrix.
# random_state is fixed so the fit is reproducible across runs and so this
# stand-alone model uses the same configuration as the 'gbc' member of the
# voting ensemble defined later in the notebook.
gbc = GradientBoostingClassifier(learning_rate=0.1,
                                 n_estimators=100,
                                 random_state=1)

gbc.fit(x_train_final, y_train)
# Accuracy on the training split, then on the held-out split.
print(gbc.score(x_train_final, y_train))
print(gbc.score(x_test_final, y_test))

0.9495912806539509
0.8532608695652174


In [198]:
# RBF-kernel support vector classifier on the scaled features.
svm = SVC(C=10, kernel='rbf', gamma=0.5, random_state=1)
svm.fit(x_train_std, y_train)

# Accuracy on the training split, then on the held-out split.
print(svm.score(x_train_std, y_train), svm.score(x_test_std, y_test), sep='\n')

0.9237057220708447
0.8532608695652174


In [208]:
# Majority-vote ensemble of five classifiers, each configured with the same
# hyperparameters used for the individual models earlier in the notebook.
# All members are fitted on the scaled feature matrix.
vc = VotingClassifier(
    estimators=[
        ('lr', LogisticRegression(C=50, random_state=1)),
        ('rf', RandomForestClassifier(n_estimators=100,
                                      max_features=0.5,
                                      random_state=1)),
        ('knn', KNeighborsClassifier(n_neighbors=5)),
        ('gbc', GradientBoostingClassifier(n_estimators=100,
                                           learning_rate=0.1,
                                           random_state=1)),
        ('svm', SVC(C=10, kernel='rbf', gamma=0.5, probability=True, random_state=1)),
    ],
    voting='hard',  # predicted label = majority of the members' predicted labels
)
vc.fit(x_train_std, y_train)

# Accuracy on the training split, then on the held-out split.
print(vc.score(x_train_std, y_train), vc.score(x_test_std, y_test), sep='\n')

0.9250681198910081
0.8695652173913043
