In [40]:
import numpy as np # linear algebra
import pandas as pd # data processing
import random
import sklearn.linear_model
import matplotlib.pyplot as plt #visualization
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import train_test_split #split data
from sklearn.model_selection import GridSearchCV #get best parameters
from sklearn.model_selection import learning_curve
from sklearn.metrics import classification_report
from sklearn.model_selection import validation_curve
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
# Seed passed as `random_state` to train_test_split and to the LogisticRegression,
# MLPClassifier and SVC estimators so that a re-run reproduces the same results.
RANDOM_SEED = 690
# Number of cross-validation folds handed to every GridSearchCV via `cv=`.
CV = 10

In [41]:
# Read the heart-disease records; the path is relative to the working directory.
HEART_CSV_PATH = "heart.csv"
data = pd.read_csv(HEART_CSV_PATH)

In [42]:
# Preview the first rows (head() defaults to 5) to sanity-check the parsed columns.
data.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [43]:
# Summarise dtypes and non-null counts: shows which columns are text and whether
# any values are missing.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [44]:
# Encode the two binary text columns as 0/1 integers, in place on `data`.
#
# The numeric-dtype guard makes this cell safe to re-run: without it, a second
# execution would compare the already-encoded integers against 'M'/'Y', find no
# matches, and silently overwrite both columns with zeros.
for column, positive_label in (('Sex', 'M'), ('ExerciseAngina', 'Y')):
    if not pd.api.types.is_numeric_dtype(data[column]):
        # 1 where the value equals the positive label, 0 for anything else.
        data[column] = np.where(data[column] == positive_label, 1, 0)

In [45]:
# Separate the label column (y) from the predictor columns.
TARGET_COLUMN = 'HeartDisease'
y = data[TARGET_COLUMN]
features = data.drop(columns=[TARGET_COLUMN])

# One-hot encode the remaining text columns; numeric columns pass through unchanged.
X = pd.get_dummies(features)

# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=RANDOM_SEED,
)

In [46]:
# Baseline: always predict the most frequent class seen in the training labels.
# zero_division=0 reports 0 for the never-predicted class without emitting a warning.
dummy = DummyClassifier(strategy="most_frequent").fit(X_train, y_train)
preds = dummy.predict(X_test)
baseline_report = classification_report(y_test, preds, zero_division=0)
print("Performance of Dummy Classifier:")
print(baseline_report)

Performance of Dummy Classifier:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       112
           1       0.51      1.00      0.68       118

    accuracy                           0.51       230
   macro avg       0.26      0.50      0.34       230
weighted avg       0.26      0.51      0.35       230



In [47]:
# Base logistic-regression estimator for the grid search.
# The liblinear solver supports both the l1 and l2 penalties.
logreg = LogisticRegression(
    solver='liblinear',
    random_state=RANDOM_SEED,
)

In [48]:
# Logistic-regression search space: inverse regularisation strength C crossed
# with the penalty type.
param_grid = dict(
    C=[0.1, 0.5, 1, 10],
    penalty=['l1', 'l2'],
)
# Exhaustive cross-validated search over param_grid, ranking candidates by F1.
# NOTE(review): the MLP and SVM searches in this notebook rank by
# balanced_accuracy instead - confirm the metric mismatch is intentional.
lg = GridSearchCV(
    estimator=logreg,
    param_grid=param_grid,
    scoring="f1",
    cv=CV,
    n_jobs=-1,
    verbose=True,
)
lg.fit(X_train, y_train)

# Predict on the held-out split with the fitted search object.
preds = lg.predict(X_test)
logreg_report = classification_report(y_test, preds)

print("Best parameters for Logistic Regression:")
print(lg.best_params_)
print("Performance of Logistic Regression: ")
print(logreg_report)

Fitting 10 folds for each of 8 candidates, totalling 80 fits
Best parameters for Logistic Regression:
{'C': 1, 'penalty': 'l1'}
Performance of Logistic Regression: 
              precision    recall  f1-score   support

           0       0.88      0.79      0.83       112
           1       0.82      0.90      0.85       118

    accuracy                           0.84       230
   macro avg       0.85      0.84      0.84       230
weighted avg       0.85      0.84      0.84       230



In [52]:
# Hyper-parameter grid for the MLP search.
# Single-hidden-layer sizes are written as one-element tuples such as (50,):
# `(50)` is just the integer 50 (parentheses alone do not make a tuple) and
# would only work because scikit-learn also tolerates a bare int here.
# NOTE(review): per the scikit-learn docs, `learning_rate` is only used when
# solver='sgd', so the three learning_rate values are redundant for 'adam'.
param_grid = {
    'learning_rate': ["constant", "adaptive", "invscaling"],
    'hidden_layer_sizes': [(50, 50), (50,), (100,)],
    'alpha': [1e-2],
    'solver': ['adam', 'sgd'],
    'activation': ["relu", "tanh", "logistic"],
}

# Candidates are ranked by balanced accuracy; the author found this gave far
# better results than scoring by recall or accuracy.
# NOTE(review): the features reach the network unscaled; MLPs are usually
# sensitive to feature scale - consider whether standardisation is needed.
mlp_estimator = MLPClassifier(random_state=RANDOM_SEED, max_iter=20000)
mlp = GridSearchCV(
    estimator=mlp_estimator,
    param_grid=param_grid,
    scoring="balanced_accuracy",
    cv=CV,
    n_jobs=-1,
    verbose=True,
)
mlp.fit(X_train, y_train)

print("Best parameters for MLP:")
print(mlp.best_params_)

Fitting 10 folds for each of 54 candidates, totalling 540 fits
Best parameters for MLP:
{'activation': 'logistic', 'alpha': 0.01, 'hidden_layer_sizes': (50, 50), 'learning_rate': 'constant', 'solver': 'adam'}


In [53]:
# Score the tuned MLP on the held-out test split.
preds = mlp.predict(X_test)
mlp_report = classification_report(y_test, preds)
print("Performance of MLP: ")
print(mlp_report)

Performance of MLP: 
              precision    recall  f1-score   support

           0       0.92      0.77      0.84       112
           1       0.81      0.94      0.87       118

    accuracy                           0.86       230
   macro avg       0.87      0.85      0.85       230
weighted avg       0.87      0.86      0.86       230



In [12]:
# Two sub-grids for the SVM search: `degree` is only meaningful for the
# polynomial kernel, so it is varied there and omitted for linear and sigmoid.
poly_grid = {'kernel': ['poly'], 'degree': [2, 3], 'C': [0.5, 0.1, 10]}
other_kernels_grid = {'kernel': ['linear', 'sigmoid'], 'C': [0.5, 0.1, 10]}
param_grid = [poly_grid, other_kernels_grid]

# Cross-validated search over the SVM grids, ranked by balanced accuracy.
# class_weight='balanced' reweights classes inversely to their frequency.
svm_estimator = SVC(class_weight='balanced', random_state=RANDOM_SEED)
svm = GridSearchCV(
    estimator=svm_estimator,
    param_grid=param_grid,
    scoring="balanced_accuracy",
    cv=CV,
    n_jobs=-1,
    verbose=True,
)

# np.ravel hands the labels to fit() as a flat 1-D array.
svm.fit(X_train, np.ravel(y_train))

print("Best parameters are for SVM:")
print(svm.best_params_)

Fitting 10 folds for each of 12 candidates, totalling 120 fits
Best parameters are for SVM:
{'C': 0.5, 'kernel': 'linear'}


In [13]:
# Score the tuned SVM on the held-out test split.
preds = svm.predict(X_test)
svm_report = classification_report(y_test, preds)
print("Performance of SVM: ")
print(svm_report)

Performance of SVM: 
              precision    recall  f1-score   support

           0       0.89      0.83      0.86       112
           1       0.85      0.90      0.87       118

    accuracy                           0.87       230
   macro avg       0.87      0.86      0.86       230
weighted avg       0.87      0.87      0.86       230

