# Week 3. Kfold Binary classifier (drafty notes)
Cognitive Systems for Health Technology Applications<br>
1.2.2019, Sakari Lukkarinen<br>
Helsinki Metropolia University of Applied Sciences

Inspired by: https://machinelearningmastery.com/binary-classification-tutorial-with-the-keras-deep-learning-library/

This example code shows how to use K-folding calculation from the scikit-learn library to solve the Case 1. See the Inspired by link above for more details.

In [3]:
# Import libraries
%pylab inline

import time

import pandas as pd

from sklearn.preprocessing import normalize
from sklearn.metrics import confusion_matrix
from sklearn.utils import shuffle
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from keras import models, layers
from keras.wrappers.scikit_learn import KerasClassifier

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


In [4]:
# Import data
url = r'http://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data'
dataframe = pd.read_csv(url, 
                        sep = ',', 
                        header = None, 
                        index_col = None,
                        na_values = '?')

# Data column names
name_list = ['age', 'sex', 'cp','trestbps', 'chol', 'fbs','restecg',
             'thalac','exang','oldpeak','slope','ca','thal','num']
dataframe.columns = name_list

# Filling missing data with columnwise median values
dataframe = dataframe.fillna(dataframe.median())

# fix random seed for reproducibility
seed = 7
numpy.random.seed(seed)

# Randomize rows
dataframe = shuffle(dataframe)

# Select the data (input) columns
data_list = ['age', 'sex', 'cp','trestbps', 'chol', 'fbs','restecg',
             'thalac','exang','oldpeak','slope','ca','thal']
data = dataframe[data_list]

# Normalize data
data_min = data.min()
data_max = data.max()
data_norm = (data - data_min)/(data_max - data_min)

# Select labels (output)
labels = dataframe['num']

# Convert the labels as binary
labels = 1.0*(labels > 0.0)

# Check the shapes of the normalized data and categorical output
print(data_norm.shape, labels.shape)

# Plot the histogram (if True)
if False:
  # histogram of the labels (severity of the disease)
  plt.hist(labels, bins = [-0.5, 0.5, 1.5, 2.5, 3.5, 4.5])
  plt.xlabel('Severity of the disease')
  plt.ylabel('Frequency')
  plt.title('Histogram of labels')
  show()


(303, 13) (303,)


In [0]:
# Define the model
def baseline():
  model = models.Sequential()
  model.add(layers.Dense(9, activation='relu', input_shape=(13,)))
  model.add(layers.Dense(5, activation='relu'))
  model.add(layers.Dense(1, activation='sigmoid'))
  
  
  model.compile(optimizer = 'adam',
                loss = 'binary_crossentropy',
                metrics = ['accuracy'])

  return model

In [6]:
# Evaluate the model
t_start = time.time()
estimator = KerasClassifier(build_fn=baseline, epochs=100, batch_size=16, verbose=0)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
results = cross_val_score(estimator, data, labels, cv=kfold)
t_end = time.time()

print('Elapsed time: {:.2f} seconds'.format(t_end - t_start))

np.set_printoptions(formatter={'float': '{: 0.2f}'.format})

print('Results: ',results*100, "%")
print("Average (std): %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Elapsed time: 27.60 seconds
Results:  [ 45.16  45.16  54.84  77.42  53.33  86.67  63.33  66.67  93.33  72.41] %
Average (std): 65.83% (15.83%)


In [7]:
# Print the results (second time)

print('Results', results*100, "%")
print("Results: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Results [ 45.16  45.16  54.84  77.42  53.33  86.67  63.33  66.67  93.33  72.41] %
Results: 65.83% (15.83%)


In [8]:
# Try another cross validator
from sklearn.model_selection import cross_validate

cv_results = cross_validate(estimator, data, labels, cv=10,
                           return_train_score=True)
sorted(cv_results.keys())                         

print('Test score:', cv_results['test_score'])

Test score: [ 0.52  0.77  0.58  0.30  0.50  0.80  0.83  0.53  0.57  0.77]


In [9]:
cv_results

{'fit_time': array([ 3.12,  3.00,  3.05,  3.25,  3.56,  3.45,  3.50,  3.58,  3.63,
         3.99]),
 'score_time': array([ 0.28,  0.32,  0.33,  0.37,  0.40,  0.42,  0.45,  0.46,  0.49,
         0.52]),
 'test_score': array([ 0.52,  0.77,  0.58,  0.30,  0.50,  0.80,  0.83,  0.53,  0.57,
         0.77]),
 'train_score': array([ 0.54,  0.81,  0.65,  0.48,  0.55,  0.81,  0.79,  0.54,  0.54,
         0.81])}

More details, see: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_validate.html#sklearn.model_selection.cross_validate 