In [8]:
# -*- coding: utf-8 -*-
"""
Created on Sun Aug 16 17:10:53 2020

@author: wanxiang.shen@u.nus.edu
"""

import warnings
warnings.filterwarnings("ignore")


import pandas as pd
import numpy as np

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import confusion_matrix, precision_recall_curve, roc_auc_score
from sklearn.metrics import auc as calculate_auc

import matplotlib.pyplot as plt
import seaborn as sns

from aggmap import AggMap, AggMapNet, loadmap

np.random.seed(666) #just for reaptable results

def prc_auc_score(y_true, y_score):
    precision, recall, threshold  = precision_recall_curve(y_true, y_score) #PRC_AUC
    auc = calculate_auc(recall, precision)
    return auc

In [53]:
dfx  = pd.read_excel('./COVID19.xlsx', sheet_name='data')
dfy  = pd.read_excel('./COVID19.xlsx', sheet_name='sample_info')

dfx = dfx[dfx.columns[1:]]
cols = ["p-%s" % c for c in dfx.columns]
dfx.columns = cols

X = dfx.values
Y = pd.get_dummies(dfy['class']).values.astype(float)

mp = AggMap(dfx, metric = 'correlation')
mp.fit(cluster_channels = 10, verbose = 0)
mp.save('./saved_model/aggmap.mp')
#mp = loadmap('./aggmap.mp')

2021-11-16 12:58:04,252 - [32mINFO[0m - [bidd-aggmap][0m - Calculating distance ...[0m
2021-11-16 12:58:04,253 - [32mINFO[0m - [bidd-aggmap][0m - the number of process is 16[0m


100%|##########| 3828/3828 [00:00<00:00, 8574.06it/s] 
100%|##########| 3828/3828 [00:00<00:00, 599029.80it/s]
100%|##########| 88/88 [00:00<00:00, 1125.91it/s]


2021-11-16 12:58:05,243 - [32mINFO[0m - [bidd-aggmap][0m - applying hierarchical clustering to obtain group information ...[0m
2021-11-16 12:58:05,395 - [32mINFO[0m - [bidd-aggmap][0m - Applying grid assignment of feature points, this may take several minutes(1~30 min)[0m
2021-11-16 12:58:05,407 - [32mINFO[0m - [bidd-aggmap][0m - Finished[0m


['./saved_model/aggmap.mp']

## 4-fold cross validation and repeat 5 times 

In [54]:
outer_fold = 4

run_all = []
for repeat_seed in [8, 16, 32, 64, 128]: #5 repeats random seeds
    
    outer = KFold(n_splits = outer_fold, shuffle = True, random_state = repeat_seed)
    outer_idx = outer.split(range(len(Y)), Y[:,0])

    for i, idx in enumerate(outer_idx):
        
        fold_num = "fold_%s" % str(i).zfill(2) 
        
        train_idx, test_idx = idx
        
        testY = Y[test_idx]
        testx = X[test_idx]
        
        trainx = X[train_idx]
        trainY = Y[train_idx]

        trainX = mp.batch_transform(trainx, scale_method = 'standard')
        testX = mp.batch_transform(testx, scale_method = 'standard')
        
        print("\n input train and test X shape is %s, %s " % (trainX.shape,  testX.shape))
        
        # fit the model
        clf = AggMapNet.MultiClassEstimator(epochs = 50, conv1_kernel_size=11,
                                            batch_size = 4, gpuid=0, verbose = 0)
        clf.fit(trainX, trainY)
        
        # save the model for explaination
        clf._model.save( './saved_model/seed_%s-%s.h5' % (repeat_seed, fold_num))
        
        # make prediction
        y_true = testY[:,0]
        y_pred = 1-clf.predict(testX)
        y_score = clf.predict_proba(testX)[:,0]

        tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
        
        acc = (tp + tn) / sum([tn, fp, fn, tp])
        
        sensitivity = tp / sum([tp, fn])
        specificity = tn / sum([tn, fp])
        
        prc_auc = prc_auc_score(y_true, y_score)
        roc_auc = roc_auc_score(y_true, y_score)
        
        precision = tp / sum([tp, fp])
        recall =  tp / sum([tp, fn]) #equals to sensitivity
        
        
        res = {'fold': fold_num,
               'repeat_seed':repeat_seed,
               
               'accuracy':acc, 
               
               'prc_auc':prc_auc, 
               'roc_auc':roc_auc,

               'sensitivity': sensitivity, 
               'specificity': specificity,
               
               'precision':precision,
               'recall':recall,
               
               'F1': 2*precision*sensitivity/(precision+sensitivity)
              }
        
        run_all.append(res)

100%|##########| 271/271 [00:02<00:00, 95.68it/s]
100%|##########| 91/91 [00:00<00:00, 960.49it/s]



 input train and test X shape is (271, 10, 9, 10), (91, 10, 9, 10) 
MultiClassEstimator(batch_norm=False, batch_size=4, conv1_kernel_size=11,
                    dense_avf='relu', dense_layers=[128], dropout=0.0,
                    epochs=50, gpuid='0', last_avf='softmax',
                    loss='categorical_crossentropy', lr=0.0001, metric='ACC',
                    monitor='val_loss', n_inception=2,
                    name='AggMap MultiClass Estimator', patience=10000,
                    random_state=32, verbose=0)


100%|##########| 271/271 [00:00<00:00, 1000.26it/s]
100%|##########| 91/91 [00:00<00:00, 695.32it/s]



 input train and test X shape is (271, 10, 9, 10), (91, 10, 9, 10) 
MultiClassEstimator(batch_norm=False, batch_size=4, conv1_kernel_size=11,
                    dense_avf='relu', dense_layers=[128], dropout=0.0,
                    epochs=50, gpuid='0', last_avf='softmax',
                    loss='categorical_crossentropy', lr=0.0001, metric='ACC',
                    monitor='val_loss', n_inception=2,
                    name='AggMap MultiClass Estimator', patience=10000,
                    random_state=32, verbose=0)


100%|##########| 272/272 [00:00<00:00, 1041.22it/s]
100%|##########| 90/90 [00:00<00:00, 597.11it/s]



 input train and test X shape is (272, 10, 9, 10), (90, 10, 9, 10) 
MultiClassEstimator(batch_norm=False, batch_size=4, conv1_kernel_size=11,
                    dense_avf='relu', dense_layers=[128], dropout=0.0,
                    epochs=50, gpuid='0', last_avf='softmax',
                    loss='categorical_crossentropy', lr=0.0001, metric='ACC',
                    monitor='val_loss', n_inception=2,
                    name='AggMap MultiClass Estimator', patience=10000,
                    random_state=32, verbose=0)


100%|##########| 272/272 [00:00<00:00, 1152.91it/s]
100%|##########| 90/90 [00:00<00:00, 650.94it/s]



 input train and test X shape is (272, 10, 9, 10), (90, 10, 9, 10) 
MultiClassEstimator(batch_norm=False, batch_size=4, conv1_kernel_size=11,
                    dense_avf='relu', dense_layers=[128], dropout=0.0,
                    epochs=50, gpuid='0', last_avf='softmax',
                    loss='categorical_crossentropy', lr=0.0001, metric='ACC',
                    monitor='val_loss', n_inception=2,
                    name='AggMap MultiClass Estimator', patience=10000,
                    random_state=32, verbose=0)


100%|##########| 271/271 [00:00<00:00, 1149.39it/s]
100%|##########| 91/91 [00:00<00:00, 756.82it/s]



 input train and test X shape is (271, 10, 9, 10), (91, 10, 9, 10) 
MultiClassEstimator(batch_norm=False, batch_size=4, conv1_kernel_size=11,
                    dense_avf='relu', dense_layers=[128], dropout=0.0,
                    epochs=50, gpuid='0', last_avf='softmax',
                    loss='categorical_crossentropy', lr=0.0001, metric='ACC',
                    monitor='val_loss', n_inception=2,
                    name='AggMap MultiClass Estimator', patience=10000,
                    random_state=32, verbose=0)


100%|##########| 271/271 [00:00<00:00, 1087.79it/s]
100%|##########| 91/91 [00:00<00:00, 653.68it/s]



 input train and test X shape is (271, 10, 9, 10), (91, 10, 9, 10) 
MultiClassEstimator(batch_norm=False, batch_size=4, conv1_kernel_size=11,
                    dense_avf='relu', dense_layers=[128], dropout=0.0,
                    epochs=50, gpuid='0', last_avf='softmax',
                    loss='categorical_crossentropy', lr=0.0001, metric='ACC',
                    monitor='val_loss', n_inception=2,
                    name='AggMap MultiClass Estimator', patience=10000,
                    random_state=32, verbose=0)


100%|##########| 272/272 [00:00<00:00, 1009.65it/s]
100%|##########| 90/90 [00:00<00:00, 738.31it/s]



 input train and test X shape is (272, 10, 9, 10), (90, 10, 9, 10) 
MultiClassEstimator(batch_norm=False, batch_size=4, conv1_kernel_size=11,
                    dense_avf='relu', dense_layers=[128], dropout=0.0,
                    epochs=50, gpuid='0', last_avf='softmax',
                    loss='categorical_crossentropy', lr=0.0001, metric='ACC',
                    monitor='val_loss', n_inception=2,
                    name='AggMap MultiClass Estimator', patience=10000,
                    random_state=32, verbose=0)


100%|##########| 272/272 [00:00<00:00, 1013.17it/s]
100%|##########| 90/90 [00:00<00:00, 727.41it/s]



 input train and test X shape is (272, 10, 9, 10), (90, 10, 9, 10) 
MultiClassEstimator(batch_norm=False, batch_size=4, conv1_kernel_size=11,
                    dense_avf='relu', dense_layers=[128], dropout=0.0,
                    epochs=50, gpuid='0', last_avf='softmax',
                    loss='categorical_crossentropy', lr=0.0001, metric='ACC',
                    monitor='val_loss', n_inception=2,
                    name='AggMap MultiClass Estimator', patience=10000,
                    random_state=32, verbose=0)


100%|##########| 271/271 [00:00<00:00, 1092.84it/s]
100%|##########| 91/91 [00:00<00:00, 657.90it/s]



 input train and test X shape is (271, 10, 9, 10), (91, 10, 9, 10) 
MultiClassEstimator(batch_norm=False, batch_size=4, conv1_kernel_size=11,
                    dense_avf='relu', dense_layers=[128], dropout=0.0,
                    epochs=50, gpuid='0', last_avf='softmax',
                    loss='categorical_crossentropy', lr=0.0001, metric='ACC',
                    monitor='val_loss', n_inception=2,
                    name='AggMap MultiClass Estimator', patience=10000,
                    random_state=32, verbose=0)


100%|##########| 271/271 [00:00<00:00, 1019.78it/s]
100%|##########| 91/91 [00:00<00:00, 646.78it/s]



 input train and test X shape is (271, 10, 9, 10), (91, 10, 9, 10) 
MultiClassEstimator(batch_norm=False, batch_size=4, conv1_kernel_size=11,
                    dense_avf='relu', dense_layers=[128], dropout=0.0,
                    epochs=50, gpuid='0', last_avf='softmax',
                    loss='categorical_crossentropy', lr=0.0001, metric='ACC',
                    monitor='val_loss', n_inception=2,
                    name='AggMap MultiClass Estimator', patience=10000,
                    random_state=32, verbose=0)


100%|##########| 272/272 [00:00<00:00, 1019.86it/s]
100%|##########| 90/90 [00:00<00:00, 740.29it/s]



 input train and test X shape is (272, 10, 9, 10), (90, 10, 9, 10) 
MultiClassEstimator(batch_norm=False, batch_size=4, conv1_kernel_size=11,
                    dense_avf='relu', dense_layers=[128], dropout=0.0,
                    epochs=50, gpuid='0', last_avf='softmax',
                    loss='categorical_crossentropy', lr=0.0001, metric='ACC',
                    monitor='val_loss', n_inception=2,
                    name='AggMap MultiClass Estimator', patience=10000,
                    random_state=32, verbose=0)


100%|##########| 272/272 [00:00<00:00, 992.44it/s]
100%|##########| 90/90 [00:00<00:00, 686.55it/s]



 input train and test X shape is (272, 10, 9, 10), (90, 10, 9, 10) 
MultiClassEstimator(batch_norm=False, batch_size=4, conv1_kernel_size=11,
                    dense_avf='relu', dense_layers=[128], dropout=0.0,
                    epochs=50, gpuid='0', last_avf='softmax',
                    loss='categorical_crossentropy', lr=0.0001, metric='ACC',
                    monitor='val_loss', n_inception=2,
                    name='AggMap MultiClass Estimator', patience=10000,
                    random_state=32, verbose=0)


100%|##########| 271/271 [00:00<00:00, 1014.65it/s]
100%|##########| 91/91 [00:00<00:00, 671.59it/s]



 input train and test X shape is (271, 10, 9, 10), (91, 10, 9, 10) 
MultiClassEstimator(batch_norm=False, batch_size=4, conv1_kernel_size=11,
                    dense_avf='relu', dense_layers=[128], dropout=0.0,
                    epochs=50, gpuid='0', last_avf='softmax',
                    loss='categorical_crossentropy', lr=0.0001, metric='ACC',
                    monitor='val_loss', n_inception=2,
                    name='AggMap MultiClass Estimator', patience=10000,
                    random_state=32, verbose=0)


100%|##########| 271/271 [00:00<00:00, 976.02it/s]
100%|##########| 91/91 [00:00<00:00, 677.21it/s]



 input train and test X shape is (271, 10, 9, 10), (91, 10, 9, 10) 
MultiClassEstimator(batch_norm=False, batch_size=4, conv1_kernel_size=11,
                    dense_avf='relu', dense_layers=[128], dropout=0.0,
                    epochs=50, gpuid='0', last_avf='softmax',
                    loss='categorical_crossentropy', lr=0.0001, metric='ACC',
                    monitor='val_loss', n_inception=2,
                    name='AggMap MultiClass Estimator', patience=10000,
                    random_state=32, verbose=0)


100%|##########| 272/272 [00:00<00:00, 1018.80it/s]
100%|##########| 90/90 [00:00<00:00, 743.82it/s]



 input train and test X shape is (272, 10, 9, 10), (90, 10, 9, 10) 
MultiClassEstimator(batch_norm=False, batch_size=4, conv1_kernel_size=11,
                    dense_avf='relu', dense_layers=[128], dropout=0.0,
                    epochs=50, gpuid='0', last_avf='softmax',
                    loss='categorical_crossentropy', lr=0.0001, metric='ACC',
                    monitor='val_loss', n_inception=2,
                    name='AggMap MultiClass Estimator', patience=10000,
                    random_state=32, verbose=0)


100%|##########| 272/272 [00:00<00:00, 1104.03it/s]
100%|##########| 90/90 [00:00<00:00, 736.50it/s]



 input train and test X shape is (272, 10, 9, 10), (90, 10, 9, 10) 
MultiClassEstimator(batch_norm=False, batch_size=4, conv1_kernel_size=11,
                    dense_avf='relu', dense_layers=[128], dropout=0.0,
                    epochs=50, gpuid='0', last_avf='softmax',
                    loss='categorical_crossentropy', lr=0.0001, metric='ACC',
                    monitor='val_loss', n_inception=2,
                    name='AggMap MultiClass Estimator', patience=10000,
                    random_state=32, verbose=0)


100%|##########| 271/271 [00:00<00:00, 981.88it/s]
100%|##########| 91/91 [00:00<00:00, 691.56it/s]



 input train and test X shape is (271, 10, 9, 10), (91, 10, 9, 10) 
MultiClassEstimator(batch_norm=False, batch_size=4, conv1_kernel_size=11,
                    dense_avf='relu', dense_layers=[128], dropout=0.0,
                    epochs=50, gpuid='0', last_avf='softmax',
                    loss='categorical_crossentropy', lr=0.0001, metric='ACC',
                    monitor='val_loss', n_inception=2,
                    name='AggMap MultiClass Estimator', patience=10000,
                    random_state=32, verbose=0)


100%|##########| 271/271 [00:00<00:00, 1025.32it/s]
100%|##########| 91/91 [00:00<00:00, 757.39it/s]



 input train and test X shape is (271, 10, 9, 10), (91, 10, 9, 10) 
MultiClassEstimator(batch_norm=False, batch_size=4, conv1_kernel_size=11,
                    dense_avf='relu', dense_layers=[128], dropout=0.0,
                    epochs=50, gpuid='0', last_avf='softmax',
                    loss='categorical_crossentropy', lr=0.0001, metric='ACC',
                    monitor='val_loss', n_inception=2,
                    name='AggMap MultiClass Estimator', patience=10000,
                    random_state=32, verbose=0)


100%|##########| 272/272 [00:00<00:00, 1013.61it/s]
100%|##########| 90/90 [00:00<00:00, 688.75it/s]



 input train and test X shape is (272, 10, 9, 10), (90, 10, 9, 10) 
MultiClassEstimator(batch_norm=False, batch_size=4, conv1_kernel_size=11,
                    dense_avf='relu', dense_layers=[128], dropout=0.0,
                    epochs=50, gpuid='0', last_avf='softmax',
                    loss='categorical_crossentropy', lr=0.0001, metric='ACC',
                    monitor='val_loss', n_inception=2,
                    name='AggMap MultiClass Estimator', patience=10000,
                    random_state=32, verbose=0)


100%|##########| 272/272 [00:00<00:00, 1075.89it/s]
100%|##########| 90/90 [00:00<00:00, 741.75it/s]



 input train and test X shape is (272, 10, 9, 10), (90, 10, 9, 10) 
MultiClassEstimator(batch_norm=False, batch_size=4, conv1_kernel_size=11,
                    dense_avf='relu', dense_layers=[128], dropout=0.0,
                    epochs=50, gpuid='0', last_avf='softmax',
                    loss='categorical_crossentropy', lr=0.0001, metric='ACC',
                    monitor='val_loss', n_inception=2,
                    name='AggMap MultiClass Estimator', patience=10000,
                    random_state=32, verbose=0)


In [55]:
df = pd.DataFrame(run_all)
df.to_excel('results.xlsx')
df

Unnamed: 0,fold,repeat_seed,accuracy,prc_auc,roc_auc,sensitivity,specificity,precision,recall,F1
0,fold_00,8,0.956044,0.998491,0.998028,1.0,0.897436,0.928571,1.0,0.962963
1,fold_01,8,0.956044,0.992434,0.985552,0.964912,0.941176,0.964912,0.964912,0.964912
2,fold_02,8,0.911111,0.967566,0.969383,0.955556,0.866667,0.877551,0.955556,0.914894
3,fold_03,8,0.977778,0.996747,0.994684,0.982456,0.969697,0.982456,0.982456,0.982456
4,fold_00,16,0.978022,0.994168,0.988776,0.964286,1.0,1.0,0.964286,0.981818
5,fold_01,16,0.901099,0.990433,0.982972,0.877193,0.941176,0.961538,0.877193,0.917431
6,fold_02,16,0.933333,0.994449,0.993552,1.0,0.857143,0.888889,1.0,0.941176
7,fold_03,16,0.955556,0.976956,0.9815,0.98,0.925,0.942308,0.98,0.960784
8,fold_00,32,0.901099,0.988024,0.976531,0.946429,0.828571,0.898305,0.946429,0.921739
9,fold_01,32,0.934066,0.985338,0.982843,0.980392,0.875,0.909091,0.980392,0.943396


In [56]:
results = df.groupby(['repeat_seed']).apply(np.mean)
results

Unnamed: 0_level_0,repeat_seed,accuracy,prc_auc,roc_auc,sensitivity,specificity,precision,recall,F1
repeat_seed,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
8,8.0,0.950244,0.98881,0.986912,0.975731,0.918744,0.938373,0.975731,0.956306
16,16.0,0.942002,0.989001,0.9867,0.95537,0.93083,0.948184,0.95537,0.950303
32,32.0,0.931013,0.991987,0.98794,0.958557,0.894643,0.929122,0.958557,0.942243
64,64.0,0.942002,0.983267,0.970117,0.948416,0.928054,0.95625,0.948416,0.950377
128,128.0,0.936477,0.991562,0.987052,0.952614,0.913577,0.939117,0.952614,0.945644


In [57]:
df.groupby('repeat_seed').apply(np.mean).mean().round(3)

repeat_seed    49.600
accuracy        0.940
prc_auc         0.989
roc_auc         0.984
sensitivity     0.958
specificity     0.917
precision       0.942
recall          0.958
F1              0.949
dtype: float64

In [58]:
df.groupby('repeat_seed').apply(np.std).mean().round(3)

repeat_seed    0.000
accuracy       0.023
prc_auc        0.008
roc_auc        0.011
sensitivity    0.037
specificity    0.048
precision      0.033
recall         0.037
F1             0.019
dtype: float64