## Import Packages

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold
import category_encoders as ce
from sklearn import svm as svm
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

## Import Data

In [2]:
heart = pd.read_csv('heart.csv')
print(heart.shape)
heart.head()

(918, 12)


Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


## Data Exploration and Feature Selection

In [3]:
### Examine correlation between variables
heart.corr(numeric_only = True)
# There was not a strong correlation among any of the variables to be used

### Encode categorical variables
encoder = ce.OneHotEncoder(return_df = True)
heart_encoded = encoder.fit_transform(heart.iloc[:, 0:11], heart.iloc[:, 11])
#print(heart_encoded.head())
### Check for low variance variables

thresh = VarianceThreshold(0.15)
high_variance = thresh.fit(heart_encoded)

cols = thresh.get_support(indices = True)
selected_columns = heart_encoded.iloc[:, cols].columns.tolist()
#print(selected_columns)
# ST_Slope_3 and ChestPainType_4 have low variance, but will be kept in since they are encoded alongside other variables
# I determined this set of features should all prove to be useful in the model, so I will move on from feature selection


## Split Data

In [4]:
# Data will be split 70/30
x_train, x_test, y_train, y_test = train_test_split(heart_encoded, heart.iloc[:, 11], test_size=0.3, random_state = 1)

## Build Model

In [5]:
# Use SVM, including 5 fold cross validation for hyperparameters

svm = svm.SVC(random_state = 1)
params = {'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001], 'kernel': ['rbf', 'linear', 'sigmoid']} 
grid = GridSearchCV(svm, params, cv = 5)
grid.fit(x_train, y_train)

## Predict on Test Set

In [6]:
y_prediction = grid.predict(x_test)

## Evaluate Model

In [7]:
print("Model Accuracy:",metrics.accuracy_score(y_prediction, y_test))
print("Model Sensitivity:",metrics.recall_score(y_prediction, y_test))
print("Model Specificity:",metrics.precision_score(y_prediction, y_test, pos_label = 0))
print("Parameters used:", grid.best_params_)

Model Accuracy: 0.8731884057971014
Model Sensitivity: 0.9074074074074074
Model Specificity: 0.8623853211009175
Parameters used: {'C': 0.1, 'gamma': 1, 'kernel': 'linear'}
