In [None]:
### Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats as st

from sklearn.model_selection import train_test_split
from sklearn import model_selection
from sklearn.metrics import accuracy_score

from sklearn import preprocessing

from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

import warnings
# Suppress every warning for the rest of the session: no category, module or
# message filter is given, so the "ignore" action applies to all of them.
# NOTE(review): this presumably also hides solver/convergence warnings raised
# while the models below are fitted -- confirm that silencing them is intended.
warnings.filterwarnings("ignore")



In [2]:
### Read the raw data set from disk

pathname = 'Data/SAheart.data.txt'

# Comma-separated file; the first line supplies the column names and the
# 'row.names' column is used as the DataFrame index.
data = pd.read_csv(
    pathname,
    sep=",",
    header=0,
    index_col='row.names',
)

In [3]:
### Separate the predictors (X) from the response (y)

# Columns used as model inputs
feature_cols = [
    'sbp', 'tobacco', 'ldl', 'adiposity',
    'typea', 'obesity', 'alcohol', 'age',
]

y = data['chd']          # response: the 'chd' column
X = data[feature_cols]   # predictor matrix with every row (not yet split into train/test)

In [6]:
### Nested cross-validation: majority-class baseline vs. logistic regression vs. MLP
#
# Outer loop: K-fold estimate of the test accuracy of each model.
# Inner loop: K-fold grid search over hyperparameters, run on the outer
#             training part only.

K = 10
SEED = 42  # fixed seed so a fresh "Restart & Run All" reproduces the same folds and scores
outerCV = model_selection.KFold(n_splits=K, shuffle=True, random_state=SEED)

# Outer-fold test accuracy of every model, one entry per fold
scores = {'Baseline': [], 'LR': [], 'MLP': []}

# Outer CV to compare the models
for train_idx, test_idx in outerCV.split(X):

    # Extract training and test data for the current outer fold
    X_train, X_test = X.iloc[train_idx, :], X.iloc[test_idx, :]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # Preprocessing: standardise using statistics of the outer training part
    # only, so the outer test fold never influences the scaling.
    # NOTE(review): the inner CV below runs on X_train_scaled, i.e. its
    # validation folds already contributed to the scaler's statistics. The
    # outer test fold is still untouched, but the inner estimates ("est") are
    # not fully clean; wrapping scaler + model in a sklearn Pipeline would
    # remove this.
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Inner CV to find the best hyperparameters. With an integer random_state
    # every call to split() yields the same folds, so both grid searches are
    # tuned on identical inner splits.
    innerCV = model_selection.KFold(n_splits=K, shuffle=True, random_state=SEED)

    # Define models
    # Baseline: most frequent class in the training part. Series.mode() returns
    # the modes sorted ascending, so .iloc[0] is always a scalar (smallest value
    # on ties), independent of the installed SciPy version.
    Baseline = y_train.mode().iloc[0]
    LR = LogisticRegression()   # Logistic regression
    MLP = MLPClassifier(random_state=SEED)   # ANN; seeded weight init / shuffling

    # Define search space for hyperparameters
    LR_grid = {'C': [0.01, 0.1, 1.0], 'max_iter': [100, 200, 500]}
    MLP_grid = {'alpha': [0.0001, 0.001, 0.01], 'learning_rate_init': [0.0001, 0.001, 0.01]}

    # GridSearchCV to find the best set of hyperparameters
    LR_search = model_selection.GridSearchCV(estimator=LR, param_grid=LR_grid, scoring='accuracy', cv=innerCV)
    MLP_search = model_selection.GridSearchCV(estimator=MLP, param_grid=MLP_grid, scoring='accuracy', cv=innerCV)

    LR_results = LR_search.fit(X_train_scaled, y_train)
    MLP_results = MLP_search.fit(X_train_scaled, y_train)

    # Best configuration of each model (best_estimator_ of the fitted search)
    LR_best_model = LR_results.best_estimator_
    MLP_best_model = MLP_results.best_estimator_

    # Make predictions on the outer test fold
    Baseline_y_preds = np.full(y_test.shape[0], Baseline)
    LR_y_preds = LR_best_model.predict(X_test_scaled)
    MLP_y_preds = MLP_best_model.predict(X_test_scaled)

    # Evaluate predictions
    Baseline_score = accuracy_score(y_test.values, Baseline_y_preds)
    LR_score = accuracy_score(y_test.values, LR_y_preds)
    MLP_score = accuracy_score(y_test.values, MLP_y_preds)

    # Add scores to the per-model lists
    scores['Baseline'].append(Baseline_score)
    scores['LR'].append(LR_score)
    scores['MLP'].append(MLP_score)

    # Report progress: "acc" is the outer test accuracy, "est" the inner-CV
    # accuracy of the selected configuration "cfg"
    print('> Baseline acc=%.3f, mode=%s' % (Baseline_score, Baseline))
    print('> LR acc=%.3f, est=%.3f, cfg=%s' % (LR_score, LR_results.best_score_, LR_results.best_params_))
    print('> MLP acc=%.3f, est=%.3f, cfg=%s \n' % (MLP_score, MLP_results.best_score_, MLP_results.best_params_))

# Summarize the estimated performance: mean (standard deviation) over the outer folds
print('Baseline Accuracy: %.3f (%.3f)' % (np.mean(scores['Baseline']), np.std(scores['Baseline'])))
print('LR Accuracy: %.3f (%.3f)' % (np.mean(scores['LR']), np.std(scores['LR'])))
print('MLP Accuracy: %.3f (%.3f) \n' % (np.mean(scores['MLP']), np.std(scores['MLP'])))

    



> Baseline acc=0.681, mode=0
> LR acc=0.745, est=0.699, cfg={'C': 0.1, 'max_iter': 100}
> MLP acc=0.766, est=0.696, cfg={'alpha': 0.001, 'learning_rate_init': 0.001} 

> Baseline acc=0.660, mode=0
> LR acc=0.660, est=0.713, cfg={'C': 1.0, 'max_iter': 100}
> MLP acc=0.702, est=0.721, cfg={'alpha': 0.01, 'learning_rate_init': 0.001} 

> Baseline acc=0.587, mode=0
> LR acc=0.804, est=0.695, cfg={'C': 1.0, 'max_iter': 100}
> MLP acc=0.739, est=0.706, cfg={'alpha': 0.01, 'learning_rate_init': 0.0001} 

> Baseline acc=0.652, mode=0
> LR acc=0.674, est=0.707, cfg={'C': 1.0, 'max_iter': 100}
> MLP acc=0.652, est=0.709, cfg={'alpha': 0.0001, 'learning_rate_init': 0.001} 

> Baseline acc=0.674, mode=0
> LR acc=0.630, est=0.716, cfg={'C': 1.0, 'max_iter': 100}
> MLP acc=0.696, est=0.731, cfg={'alpha': 0.01, 'learning_rate_init': 0.0001} 

> Baseline acc=0.543, mode=0
> LR acc=0.587, est=0.731, cfg={'C': 0.1, 'max_iter': 100}
> MLP acc=0.609, est=0.714, cfg={'alpha': 0.001, 'learning_rate_init': 0