In [1]:
# -*- coding: utf-8 -*-
"""
Created on Sun Aug 16 17:10:53 2020

@author: wanxiang.shen@u.nus.edu
"""

import warnings
warnings.filterwarnings("ignore")


import pandas as pd
import numpy as np

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import confusion_matrix, precision_recall_curve, roc_auc_score
from sklearn.metrics import auc as calculate_auc

import matplotlib.pyplot as plt
import seaborn as sns

from aggmap import AggMap, AggModel

np.random.seed(666) #just for reaptable results

def prc_auc_score(y_true, y_score):
    precision, recall, threshold  = precision_recall_curve(y_true, y_score) #PRC_AUC
    auc = calculate_auc(recall, precision)
    return auc

In [2]:
dfx  = pd.read_excel('./COVID19.xlsx', sheet_name='data')
dfy  = pd.read_excel('./COVID19.xlsx', sheet_name='sample_info')

dfx = dfx[dfx.columns[1:]]
cols = ["p-%s" % c for c in dfx.columns]
dfx.columns = cols

X = dfx.values
Y = pd.get_dummies(dfy['class']).values.astype(float)

mp = AggMap(dfx, metric = 'correlation')
mp.fit(cluster_channels = 5, verbose = 0)

2020-08-18 14:10:42,726 - INFO - [bidd-aggmap] - Calculating distance ...
2020-08-18 14:10:42,734 - INFO - [bidd-aggmap] - the number of process is 16


100%|##########| 3828/3828 [00:02<00:00, 1362.76it/s]
100%|##########| 3828/3828 [00:00<00:00, 1217270.33it/s]
100%|##########| 88/88 [00:00<00:00, 1031.89it/s]


2020-08-18 14:10:46,009 - INFO - [bidd-aggmap] - applying hierarchical clustering to obtain group information ...
2020-08-18 14:10:51,376 - INFO - [bidd-aggmap] - Applying grid feature map(assignment), this may take several minutes(1~30 min)
2020-08-18 14:10:51,406 - INFO - [bidd-aggmap] - Finished


<aggmap.map.AggMap at 0x7f46b665d290>

## 4-fold cross validation and repeat 5 times 

In [3]:
outer_fold = 4

run_all = []
for repeat_seed in [8, 16, 32, 64, 128]: #5 repeats random seeds
    
    outer = KFold(n_splits = outer_fold, shuffle = True, random_state = repeat_seed)
    outer_idx = outer.split(range(len(Y)), Y[:,0])

    for i, idx in enumerate(outer_idx):
        
        fold_num = "fold_%s" % str(i).zfill(2) 
        
        train_idx, test_idx = idx
        
        testY = Y[test_idx]
        testx = X[test_idx]
        
        trainx = X[train_idx]
        trainY = Y[train_idx]

        trainX = mp.batch_transform(trainx, scale_method = 'standard')
        testX = mp.batch_transform(testx, scale_method = 'standard')
        
        print("\n input train and test X shape is %s, %s " % (trainX.shape,  testX.shape))
        
        clf = AggModel.MultiClassEstimator(epochs = 100,  verbose = 0)
        clf.fit(trainX, trainY)

        y_true = testY[:,0]
        y_pred = clf.predict(testX)[:,0]
        y_score = clf.predict_proba(testX)[:,0]

        tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
        
        acc = (tp + tn) / sum([tn, fp, fn, tp])
        
        sensitivity = tp / sum([tp, fn])
        specificity = tn / sum([tn, fp])
        
        prc_auc = prc_auc_score(y_true, y_score)
        roc_auc = roc_auc_score(y_true, y_score)
        
        precision = tp / sum([tp, fp])
        recall =  tp / sum([tp, fn]) #equals to sensitivity
        
        
        res = {'fold': fold_num,
               'repeat_seed':repeat_seed,
               
               'accuracy':acc, 
               
               'prc_auc':prc_auc, 
               'roc_auc':roc_auc,

               'sensitivity': sensitivity, 
               'specificity': specificity,
               
               'precision':precision,
               'recall':recall,
              }
        
        run_all.append(res)

100%|##########| 271/271 [00:05<00:00, 51.41it/s]
100%|##########| 91/91 [00:00<00:00, 979.53it/s]



 input train and test X shape is (271, 10, 9, 5), (91, 10, 9, 5) 
MultiClassEstimator(batch_size=128, conv1_kernel_size=11, dense_avf='relu',
                    dense_layers=[128], epochs=100, lr=0.0001, metric='ROC',
                    monitor='val_loss', name='AggMap MultiClass Estimator',
                    patience=10000, random_state=32, verbose=0)


100%|##########| 271/271 [00:00<00:00, 784.95it/s]
100%|##########| 91/91 [00:00<00:00, 1075.71it/s]



 input train and test X shape is (271, 10, 9, 5), (91, 10, 9, 5) 
MultiClassEstimator(batch_size=128, conv1_kernel_size=11, dense_avf='relu',
                    dense_layers=[128], epochs=100, lr=0.0001, metric='ROC',
                    monitor='val_loss', name='AggMap MultiClass Estimator',
                    patience=10000, random_state=32, verbose=0)


100%|##########| 272/272 [00:00<00:00, 1033.77it/s]
100%|##########| 90/90 [00:00<00:00, 1182.65it/s]



 input train and test X shape is (272, 10, 9, 5), (90, 10, 9, 5) 
MultiClassEstimator(batch_size=128, conv1_kernel_size=11, dense_avf='relu',
                    dense_layers=[128], epochs=100, lr=0.0001, metric='ROC',
                    monitor='val_loss', name='AggMap MultiClass Estimator',
                    patience=10000, random_state=32, verbose=0)


100%|##########| 272/272 [00:00<00:00, 595.22it/s] 
100%|##########| 90/90 [00:00<00:00, 813.79it/s]



 input train and test X shape is (272, 10, 9, 5), (90, 10, 9, 5) 
MultiClassEstimator(batch_size=128, conv1_kernel_size=11, dense_avf='relu',
                    dense_layers=[128], epochs=100, lr=0.0001, metric='ROC',
                    monitor='val_loss', name='AggMap MultiClass Estimator',
                    patience=10000, random_state=32, verbose=0)


100%|##########| 271/271 [00:00<00:00, 1265.77it/s]
100%|##########| 91/91 [00:00<00:00, 1271.46it/s]



 input train and test X shape is (271, 10, 9, 5), (91, 10, 9, 5) 
MultiClassEstimator(batch_size=128, conv1_kernel_size=11, dense_avf='relu',
                    dense_layers=[128], epochs=100, lr=0.0001, metric='ROC',
                    monitor='val_loss', name='AggMap MultiClass Estimator',
                    patience=10000, random_state=32, verbose=0)


100%|##########| 271/271 [00:00<00:00, 1171.60it/s]
100%|##########| 91/91 [00:00<00:00, 1001.49it/s]



 input train and test X shape is (271, 10, 9, 5), (91, 10, 9, 5) 
MultiClassEstimator(batch_size=128, conv1_kernel_size=11, dense_avf='relu',
                    dense_layers=[128], epochs=100, lr=0.0001, metric='ROC',
                    monitor='val_loss', name='AggMap MultiClass Estimator',
                    patience=10000, random_state=32, verbose=0)


100%|##########| 272/272 [00:00<00:00, 1017.59it/s]
100%|##########| 90/90 [00:00<00:00, 930.37it/s]



 input train and test X shape is (272, 10, 9, 5), (90, 10, 9, 5) 
MultiClassEstimator(batch_size=128, conv1_kernel_size=11, dense_avf='relu',
                    dense_layers=[128], epochs=100, lr=0.0001, metric='ROC',
                    monitor='val_loss', name='AggMap MultiClass Estimator',
                    patience=10000, random_state=32, verbose=0)


100%|##########| 272/272 [00:00<00:00, 1384.33it/s]
100%|##########| 90/90 [00:00<00:00, 1102.71it/s]



 input train and test X shape is (272, 10, 9, 5), (90, 10, 9, 5) 
MultiClassEstimator(batch_size=128, conv1_kernel_size=11, dense_avf='relu',
                    dense_layers=[128], epochs=100, lr=0.0001, metric='ROC',
                    monitor='val_loss', name='AggMap MultiClass Estimator',
                    patience=10000, random_state=32, verbose=0)


100%|##########| 271/271 [00:00<00:00, 1142.92it/s]
100%|##########| 91/91 [00:00<00:00, 1176.11it/s]



 input train and test X shape is (271, 10, 9, 5), (91, 10, 9, 5) 
MultiClassEstimator(batch_size=128, conv1_kernel_size=11, dense_avf='relu',
                    dense_layers=[128], epochs=100, lr=0.0001, metric='ROC',
                    monitor='val_loss', name='AggMap MultiClass Estimator',
                    patience=10000, random_state=32, verbose=0)


100%|##########| 271/271 [00:00<00:00, 1299.00it/s]
100%|##########| 91/91 [00:00<00:00, 1652.30it/s]



 input train and test X shape is (271, 10, 9, 5), (91, 10, 9, 5) 
MultiClassEstimator(batch_size=128, conv1_kernel_size=11, dense_avf='relu',
                    dense_layers=[128], epochs=100, lr=0.0001, metric='ROC',
                    monitor='val_loss', name='AggMap MultiClass Estimator',
                    patience=10000, random_state=32, verbose=0)


100%|##########| 272/272 [00:00<00:00, 1016.68it/s]
100%|##########| 90/90 [00:00<00:00, 990.59it/s]



 input train and test X shape is (272, 10, 9, 5), (90, 10, 9, 5) 
MultiClassEstimator(batch_size=128, conv1_kernel_size=11, dense_avf='relu',
                    dense_layers=[128], epochs=100, lr=0.0001, metric='ROC',
                    monitor='val_loss', name='AggMap MultiClass Estimator',
                    patience=10000, random_state=32, verbose=0)


100%|##########| 272/272 [00:00<00:00, 1091.68it/s]
100%|##########| 90/90 [00:00<00:00, 991.72it/s]



 input train and test X shape is (272, 10, 9, 5), (90, 10, 9, 5) 
MultiClassEstimator(batch_size=128, conv1_kernel_size=11, dense_avf='relu',
                    dense_layers=[128], epochs=100, lr=0.0001, metric='ROC',
                    monitor='val_loss', name='AggMap MultiClass Estimator',
                    patience=10000, random_state=32, verbose=0)


100%|##########| 271/271 [00:00<00:00, 1153.12it/s]
100%|##########| 91/91 [00:00<00:00, 776.18it/s]



 input train and test X shape is (271, 10, 9, 5), (91, 10, 9, 5) 
MultiClassEstimator(batch_size=128, conv1_kernel_size=11, dense_avf='relu',
                    dense_layers=[128], epochs=100, lr=0.0001, metric='ROC',
                    monitor='val_loss', name='AggMap MultiClass Estimator',
                    patience=10000, random_state=32, verbose=0)


100%|##########| 271/271 [00:00<00:00, 1245.54it/s]
100%|##########| 91/91 [00:00<00:00, 1058.30it/s]



 input train and test X shape is (271, 10, 9, 5), (91, 10, 9, 5) 
MultiClassEstimator(batch_size=128, conv1_kernel_size=11, dense_avf='relu',
                    dense_layers=[128], epochs=100, lr=0.0001, metric='ROC',
                    monitor='val_loss', name='AggMap MultiClass Estimator',
                    patience=10000, random_state=32, verbose=0)


100%|##########| 272/272 [00:00<00:00, 1301.00it/s]
100%|##########| 90/90 [00:00<00:00, 546.83it/s]



 input train and test X shape is (272, 10, 9, 5), (90, 10, 9, 5) 
MultiClassEstimator(batch_size=128, conv1_kernel_size=11, dense_avf='relu',
                    dense_layers=[128], epochs=100, lr=0.0001, metric='ROC',
                    monitor='val_loss', name='AggMap MultiClass Estimator',
                    patience=10000, random_state=32, verbose=0)


100%|##########| 272/272 [00:00<00:00, 782.08it/s] 
100%|##########| 90/90 [00:00<00:00, 1204.20it/s]



 input train and test X shape is (272, 10, 9, 5), (90, 10, 9, 5) 
MultiClassEstimator(batch_size=128, conv1_kernel_size=11, dense_avf='relu',
                    dense_layers=[128], epochs=100, lr=0.0001, metric='ROC',
                    monitor='val_loss', name='AggMap MultiClass Estimator',
                    patience=10000, random_state=32, verbose=0)


100%|##########| 271/271 [00:00<00:00, 1247.27it/s]
100%|##########| 91/91 [00:00<00:00, 1206.96it/s]



 input train and test X shape is (271, 10, 9, 5), (91, 10, 9, 5) 
MultiClassEstimator(batch_size=128, conv1_kernel_size=11, dense_avf='relu',
                    dense_layers=[128], epochs=100, lr=0.0001, metric='ROC',
                    monitor='val_loss', name='AggMap MultiClass Estimator',
                    patience=10000, random_state=32, verbose=0)


100%|##########| 271/271 [00:00<00:00, 1014.71it/s]
100%|##########| 91/91 [00:00<00:00, 1077.83it/s]



 input train and test X shape is (271, 10, 9, 5), (91, 10, 9, 5) 
MultiClassEstimator(batch_size=128, conv1_kernel_size=11, dense_avf='relu',
                    dense_layers=[128], epochs=100, lr=0.0001, metric='ROC',
                    monitor='val_loss', name='AggMap MultiClass Estimator',
                    patience=10000, random_state=32, verbose=0)


100%|##########| 272/272 [00:00<00:00, 765.03it/s]
100%|##########| 90/90 [00:00<00:00, 958.79it/s]



 input train and test X shape is (272, 10, 9, 5), (90, 10, 9, 5) 
MultiClassEstimator(batch_size=128, conv1_kernel_size=11, dense_avf='relu',
                    dense_layers=[128], epochs=100, lr=0.0001, metric='ROC',
                    monitor='val_loss', name='AggMap MultiClass Estimator',
                    patience=10000, random_state=32, verbose=0)


100%|##########| 272/272 [00:00<00:00, 1024.81it/s]
100%|##########| 90/90 [00:00<00:00, 958.51it/s]



 input train and test X shape is (272, 10, 9, 5), (90, 10, 9, 5) 
MultiClassEstimator(batch_size=128, conv1_kernel_size=11, dense_avf='relu',
                    dense_layers=[128], epochs=100, lr=0.0001, metric='ROC',
                    monitor='val_loss', name='AggMap MultiClass Estimator',
                    patience=10000, random_state=32, verbose=0)


In [4]:
clf._model.count_params()

323138

In [6]:
df = pd.DataFrame(run_all)
df.to_excel('results.xlsx')
df

Unnamed: 0,fold,repeat_seed,accuracy,prc_auc,roc_auc,sensitivity,specificity,precision,recall
0,fold_00,8,0.934066,0.993149,0.990138,0.961538,0.897436,0.925926,0.961538
1,fold_01,8,0.934066,0.990188,0.983488,0.929825,0.941176,0.963636,0.929825
2,fold_02,8,0.9,0.970073,0.969877,0.933333,0.866667,0.875,0.933333
3,fold_03,8,0.977778,0.994337,0.991494,0.982456,0.969697,0.982456,0.982456
4,fold_00,16,0.967033,0.996818,0.994388,0.946429,1.0,1.0,0.946429
5,fold_01,16,0.956044,0.99118,0.985036,0.964912,0.941176,0.964912,0.964912
6,fold_02,16,0.944444,0.992035,0.990575,1.0,0.880952,0.90566,1.0
7,fold_03,16,0.933333,0.980059,0.98,0.98,0.875,0.907407,0.98
8,fold_00,32,0.89011,0.980737,0.966837,0.946429,0.8,0.883333,0.946429
9,fold_01,32,0.912088,0.978132,0.97598,0.980392,0.825,0.877193,0.980392


In [10]:
results = df.groupby(['repeat_seed']).apply(np.mean)
results

Unnamed: 0_level_0,repeat_seed,accuracy,prc_auc,roc_auc,sensitivity,specificity,precision,recall
repeat_seed,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
8,8.0,0.936477,0.986937,0.983749,0.951788,0.918744,0.936755,0.951788
16,16.0,0.950214,0.990023,0.9875,0.972835,0.924282,0.944495,0.972835
32,32.0,0.931105,0.988684,0.984451,0.967816,0.88125,0.921613,0.967816
64,64.0,0.931013,0.98609,0.979564,0.938416,0.917196,0.944409,0.938416
128,128.0,0.939225,0.992766,0.989289,0.957418,0.913389,0.939659,0.957418


In [11]:
results.mean().round(3)

repeat_seed    49.600
accuracy        0.938
prc_auc         0.989
roc_auc         0.985
sensitivity     0.958
specificity     0.911
precision       0.937
recall          0.958
dtype: float64

In [12]:
results.std().round(3)

repeat_seed    48.793
accuracy        0.008
prc_auc         0.003
roc_auc         0.004
sensitivity     0.014
specificity     0.017
precision       0.009
recall          0.014
dtype: float64