In [2]:
import csv
import urllib3
from bs4 import BeautifulSoup
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, Normalizer, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import zero_one_loss, accuracy_score, classification_report

In [20]:
# download data & clean
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data'
http = urllib3.PoolManager()
response = http.request('GET', url)
soup = BeautifulSoup(response.data.decode('utf-8'))
# split and remove last empty list entry
data = str(soup).split("\n")[:-1]
# split by line
data = [string.split(",") for string in data]
df = pd.DataFrame(data, columns=["age","sex","cp","trestbps","chol","fbs","restecg","thalach","exang","oldpeak","slope","ca","thal","num"])

# Clean Data, by inspection we notice '?' for missing values
error = df.loc[(df['thal'] == '?') | (df['ca'] == '?')]
df_clean = pd.concat([df,error]).drop_duplicates(keep=False)
df_clean = df_clean.astype(float)

# split training data and target
df_X = df_clean.drop('num', axis=1)
df_y = df_clean['num']
# all categories >= 1.0 indicate heart disease
df_y = df_y.replace([2.0,3.0,4.0], 1.0)

# Transform non-binary features by scaling each feature to a given range, other options MinMaxScaler, Normalizer
scaler = StandardScaler()
scaler.fit(df_X)
df_X = scaler.transform(df_X)


In [21]:
# Basic examples for Classification

X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, test_size = 0.2, random_state = 0)

# KNN
classifier = KNeighborsClassifier(n_neighbors = 15)
classifier.fit(X_train, y_train)
y_predict = classifier.predict(X_test)
error_rate = zero_one_loss(y_test, y_predict, normalize=True)
accuracy = 1 - error_rate
print("Error Rate KNN: ", error_rate)
print("Accuracy KNN: ", accuracy)

# Random Forest
classifier = RandomForestClassifier()
classifier.fit(X_train, y_train)
y_predict = classifier.predict(X_test)
error_rate = zero_one_loss(y_test, y_predict, normalize=True)
accuracy = 1 - error_rate
print("Error Rate RF: ", error_rate)
print("Accuracy RF: ", accuracy)

# Multi Layer Perceptron
classifier = MLPClassifier(solver="lbfgs",max_iter=100000)
classifier.fit(X_train, y_train)
y_predict = classifier.predict(X_test)
error_rate = zero_one_loss(y_test, y_predict, normalize=True)
accuracy = 1 - error_rate
print("Error Rate MLP: ", error_rate)
print("Accuracy MLP: ", accuracy)

Error Rate KNN:  0.16666666666666663
Accuracy KNN:  0.8333333333333334
Error Rate RF:  0.19999999999999996
Accuracy RF:  0.8
Error Rate MLP:  0.21666666666666667
Accuracy MLP:  0.7833333333333333


In [24]:
# Hyperparameter Optimization on MLP
# ATTENTION! Running the GridSearch might take some time
# example results: Best parameters found:
# {'activation': 'tanh', 'alpha': 0.005, 'hidden_layer_sizes': (250,), 'learning_rate': 'adaptive', 'solver': 'lbfgs'}

classifier = MLPClassifier(max_iter=100000)

# some parameters to try
parameter_space = {
    'hidden_layer_sizes': [(100,), (150,), (200,), (250,), (300)],
    'activation': ['tanh', 'relu'],
    'solver': ['lbfgs'],
    'alpha': [0.001, 0.005],
    'learning_rate': ['constant', 'adaptive'],
}

# Grid Search : Exhaustive search over specified parameter values for an estimator
clf = GridSearchCV(classifier, parameter_space, n_jobs=-1, cv=5)
clf.fit(df_X, df_y) 
print('Best parameters found:\n', clf.best_params_)
y_predict = clf.predict(X_test)
print('Results on the test set:')
print(classification_report(y_test, y_predict))
error_rate = zero_one_loss(y_test, y_predict, normalize=True)
accuracy = 1 - error_rate
print("Error Rate MLP hyp: ", error_rate)
print("Accuracy MLP hyp: ", accuracy)

Best parameters found:
 {'activation': 'tanh', 'alpha': 0.005, 'hidden_layer_sizes': (250,), 'learning_rate': 'adaptive', 'solver': 'lbfgs'}
Results on the test set:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00        30
         1.0       1.00      1.00      1.00        30

    accuracy                           1.00        60
   macro avg       1.00      1.00      1.00        60
weighted avg       1.00      1.00      1.00        60

Error Rate MLP hyp:  0.0
Accuracy MLP hyp:  1.0


In [40]:
classifier = MLPClassifier(max_iter=100000, activation= 'tanh', alpha= 0.005, hidden_layer_sizes= (250,), learning_rate = 'adaptive', solver = 'lbfgs')
classifier.fit(X_train, y_train)
y_predict = classifier.predict(X_test)
error_rate = zero_one_loss(y_test, y_predict, normalize=True)
accuracy = 1 - error_rate
print("Error Rate MLP: ", error_rate)
print("Accuracy MLP: ", accuracy)

Error Rate MLP:  0.21666666666666667
Accuracy MLP:  0.7833333333333333
