In [18]:
import warnings
warnings.filterwarnings("ignore")


import pandas as pd
import numpy as np

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import confusion_matrix, precision_recall_curve, roc_auc_score
from sklearn.metrics import auc as calculate_auc
from sklearn.datasets import load_breast_cancer

import matplotlib.pyplot as plt
import seaborn as sns

from aggmap import AggMap, AggModel

np.random.seed(666) #just for reaptable results

def prc_auc_score(y_true, y_score):
    precision, recall, threshold  = precision_recall_curve(y_true, y_score) #PRC_AUC
    auc = calculate_auc(recall, precision)
    return auc

In [19]:
data = load_breast_cancer()
dfx = pd.DataFrame(data.data, columns= data.feature_names)
y = pd.Series(data.target).map({0:'malignant', 1:'benign'})
dfy = pd.get_dummies(y) 
X = dfx.values.astype(float)
Y = dfy.values.astype(float)

In [20]:
mp = AggMap(dfx, metric = 'correlation')
mp.fit(cluster_channels = 5, verbose = 0)

2020-10-18 17:08:58,496 - [32mINFO[0m - [bidd-aggmap][0m - Calculating distance ...[0m
2020-10-18 17:08:58,497 - [32mINFO[0m - [bidd-aggmap][0m - the number of process is 16[0m


100%|##########| 435/435 [00:00<00:00, 2331.26it/s]
100%|##########| 435/435 [00:00<00:00, 1239485.22it/s]
100%|##########| 30/30 [00:00<00:00, 643.72it/s]


2020-10-18 17:09:00,122 - [32mINFO[0m - [bidd-aggmap][0m - applying hierarchical clustering to obtain group information ...[0m
2020-10-18 17:09:00,193 - [32mINFO[0m - [bidd-aggmap][0m - Applying grid feature map(assignment), this may take several minutes(1~30 min)[0m
2020-10-18 17:09:00,202 - [32mINFO[0m - [bidd-aggmap][0m - Finished[0m


<aggmap.map.AggMap at 0x7f4530628a58>

## 5-fold cross validation and repeat 5 times 

In [21]:
outer_fold = 5

run_all = []
for repeat_seed in [8, 16, 32, 64, 128]: #5 repeats random seeds
    
    outer = KFold(n_splits = outer_fold, shuffle = True, random_state = repeat_seed)
    outer_idx = outer.split(range(len(Y)), Y[:,0])

    for i, idx in enumerate(outer_idx):
        
        fold_num = "fold_%s" % str(i).zfill(2) 
        
        train_idx, test_idx = idx
        
        testY = Y[test_idx]
        testx = X[test_idx]
        
        trainx = X[train_idx]
        trainY = Y[train_idx]

        trainX = mp.batch_transform(trainx, scale_method = 'standard')
        testX = mp.batch_transform(testx, scale_method = 'standard')
        
        print("\n input train and test X shape is %s, %s " % (trainX.shape,  testX.shape))
        
        clf = AggModel.MultiClassEstimator(epochs = 100, gpuid = 3, conv1_kernel_size = 11, batch_size = 8, verbose = 0)
        clf.fit(trainX, trainY)

        y_true = testY[:,0]
        y_pred = clf.predict(testX)[:,0]
        y_score = clf.predict_proba(testX)[:,0]

        tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
        
        acc = (tp + tn) / sum([tn, fp, fn, tp])
        
        sensitivity = tp / sum([tp, fn])
        specificity = tn / sum([tn, fp])
        
        prc_auc = prc_auc_score(y_true, y_score)
        roc_auc = roc_auc_score(y_true, y_score)
        
        precision = tp / sum([tp, fp])
        recall =  tp / sum([tp, fn]) #equals to sensitivity
        
        
        res = {'fold': fold_num,
               'repeat_seed':repeat_seed,
               
               'accuracy':acc, 
               
               'prc_auc':prc_auc, 
               'roc_auc':roc_auc,

               'sensitivity': sensitivity, 
               'specificity': specificity,
               
               'precision':precision,
               'recall':recall,
              }
        
        run_all.append(res)

100%|##########| 455/455 [00:02<00:00, 178.64it/s]
100%|##########| 114/114 [00:00<00:00, 1173.89it/s]



 input train and test X shape is (455, 6, 5, 5), (114, 6, 5, 5) 
MultiClassEstimator(batch_size=8, epochs=100, gpuid='3')


100%|##########| 455/455 [00:00<00:00, 1868.59it/s]
100%|##########| 114/114 [00:00<00:00, 1157.82it/s]



 input train and test X shape is (455, 6, 5, 5), (114, 6, 5, 5) 
MultiClassEstimator(batch_size=8, epochs=100, gpuid='3')


100%|##########| 455/455 [00:00<00:00, 2233.28it/s]
100%|##########| 114/114 [00:00<00:00, 1075.66it/s]



 input train and test X shape is (455, 6, 5, 5), (114, 6, 5, 5) 
MultiClassEstimator(batch_size=8, epochs=100, gpuid='3')


100%|##########| 455/455 [00:00<00:00, 2279.89it/s]
100%|##########| 114/114 [00:00<00:00, 2030.62it/s]



 input train and test X shape is (455, 6, 5, 5), (114, 6, 5, 5) 
MultiClassEstimator(batch_size=8, epochs=100, gpuid='3')


100%|##########| 456/456 [00:00<00:00, 2511.44it/s]
100%|##########| 113/113 [00:00<00:00, 1053.18it/s]



 input train and test X shape is (456, 6, 5, 5), (113, 6, 5, 5) 
MultiClassEstimator(batch_size=8, epochs=100, gpuid='3')


100%|##########| 455/455 [00:00<00:00, 2388.93it/s]
100%|##########| 114/114 [00:00<00:00, 1242.74it/s]



 input train and test X shape is (455, 6, 5, 5), (114, 6, 5, 5) 
MultiClassEstimator(batch_size=8, epochs=100, gpuid='3')


100%|##########| 455/455 [00:00<00:00, 2363.47it/s]
100%|##########| 114/114 [00:00<00:00, 1449.91it/s]



 input train and test X shape is (455, 6, 5, 5), (114, 6, 5, 5) 
MultiClassEstimator(batch_size=8, epochs=100, gpuid='3')


100%|##########| 455/455 [00:00<00:00, 2028.96it/s]
100%|##########| 114/114 [00:00<00:00, 1114.57it/s]



 input train and test X shape is (455, 6, 5, 5), (114, 6, 5, 5) 
MultiClassEstimator(batch_size=8, epochs=100, gpuid='3')


100%|##########| 455/455 [00:00<00:00, 2243.35it/s]
100%|##########| 114/114 [00:00<00:00, 2213.99it/s]



 input train and test X shape is (455, 6, 5, 5), (114, 6, 5, 5) 
MultiClassEstimator(batch_size=8, epochs=100, gpuid='3')


100%|##########| 456/456 [00:00<00:00, 2174.13it/s]
100%|##########| 113/113 [00:00<00:00, 1052.04it/s]



 input train and test X shape is (456, 6, 5, 5), (113, 6, 5, 5) 
MultiClassEstimator(batch_size=8, epochs=100, gpuid='3')


100%|##########| 455/455 [00:00<00:00, 2327.33it/s]
100%|##########| 114/114 [00:00<00:00, 1396.02it/s]



 input train and test X shape is (455, 6, 5, 5), (114, 6, 5, 5) 
MultiClassEstimator(batch_size=8, epochs=100, gpuid='3')


100%|##########| 455/455 [00:00<00:00, 2761.98it/s]
100%|##########| 114/114 [00:00<00:00, 1203.31it/s]



 input train and test X shape is (455, 6, 5, 5), (114, 6, 5, 5) 
MultiClassEstimator(batch_size=8, epochs=100, gpuid='3')


100%|##########| 455/455 [00:00<00:00, 2487.96it/s]
100%|##########| 114/114 [00:00<00:00, 1264.28it/s]



 input train and test X shape is (455, 6, 5, 5), (114, 6, 5, 5) 
MultiClassEstimator(batch_size=8, epochs=100, gpuid='3')


100%|##########| 455/455 [00:00<00:00, 2535.23it/s]
100%|##########| 114/114 [00:00<00:00, 1244.76it/s]



 input train and test X shape is (455, 6, 5, 5), (114, 6, 5, 5) 
MultiClassEstimator(batch_size=8, epochs=100, gpuid='3')


100%|##########| 456/456 [00:00<00:00, 2106.77it/s]
100%|##########| 113/113 [00:00<00:00, 1327.68it/s]



 input train and test X shape is (456, 6, 5, 5), (113, 6, 5, 5) 
MultiClassEstimator(batch_size=8, epochs=100, gpuid='3')


100%|##########| 455/455 [00:00<00:00, 1881.95it/s]
100%|##########| 114/114 [00:00<00:00, 1085.10it/s]



 input train and test X shape is (455, 6, 5, 5), (114, 6, 5, 5) 
MultiClassEstimator(batch_size=8, epochs=100, gpuid='3')


100%|##########| 455/455 [00:00<00:00, 2314.39it/s]
100%|##########| 114/114 [00:00<00:00, 1226.59it/s]



 input train and test X shape is (455, 6, 5, 5), (114, 6, 5, 5) 
MultiClassEstimator(batch_size=8, epochs=100, gpuid='3')


100%|##########| 455/455 [00:00<00:00, 2224.12it/s]
100%|##########| 114/114 [00:00<00:00, 1492.31it/s]



 input train and test X shape is (455, 6, 5, 5), (114, 6, 5, 5) 
MultiClassEstimator(batch_size=8, epochs=100, gpuid='3')


100%|##########| 455/455 [00:00<00:00, 1906.39it/s]
100%|##########| 114/114 [00:00<00:00, 1647.64it/s]



 input train and test X shape is (455, 6, 5, 5), (114, 6, 5, 5) 
MultiClassEstimator(batch_size=8, epochs=100, gpuid='3')


100%|##########| 456/456 [00:00<00:00, 1978.81it/s]
100%|##########| 113/113 [00:00<00:00, 1101.21it/s]



 input train and test X shape is (456, 6, 5, 5), (113, 6, 5, 5) 
MultiClassEstimator(batch_size=8, epochs=100, gpuid='3')


100%|##########| 455/455 [00:00<00:00, 2067.13it/s]
100%|##########| 114/114 [00:00<00:00, 1401.61it/s]



 input train and test X shape is (455, 6, 5, 5), (114, 6, 5, 5) 
MultiClassEstimator(batch_size=8, epochs=100, gpuid='3')


100%|##########| 455/455 [00:00<00:00, 2030.36it/s]
100%|##########| 114/114 [00:00<00:00, 1475.68it/s]



 input train and test X shape is (455, 6, 5, 5), (114, 6, 5, 5) 
MultiClassEstimator(batch_size=8, epochs=100, gpuid='3')


100%|##########| 455/455 [00:00<00:00, 2195.03it/s]
100%|##########| 114/114 [00:00<00:00, 1621.40it/s]



 input train and test X shape is (455, 6, 5, 5), (114, 6, 5, 5) 
MultiClassEstimator(batch_size=8, epochs=100, gpuid='3')


100%|##########| 455/455 [00:00<00:00, 2439.38it/s]
100%|##########| 114/114 [00:00<00:00, 1296.54it/s]



 input train and test X shape is (455, 6, 5, 5), (114, 6, 5, 5) 
MultiClassEstimator(batch_size=8, epochs=100, gpuid='3')


100%|##########| 456/456 [00:00<00:00, 2966.46it/s]
100%|##########| 113/113 [00:00<00:00, 1105.69it/s]



 input train and test X shape is (456, 6, 5, 5), (113, 6, 5, 5) 
MultiClassEstimator(batch_size=8, epochs=100, gpuid='3')


In [22]:
clf._model.count_params()

323138

In [23]:
df = pd.DataFrame(run_all)
df.to_excel('results.xlsx')
df

Unnamed: 0,fold,repeat_seed,accuracy,prc_auc,roc_auc,sensitivity,specificity,precision,recall
0,fold_00,8,0.973684,0.998427,0.997762,0.970588,0.978261,0.985075,0.970588
1,fold_01,8,0.964912,0.998809,0.998016,0.958333,0.97619,0.985714,0.958333
2,fold_02,8,0.982456,0.987934,0.985075,1.0,0.957447,0.971014,1.0
3,fold_03,8,0.973684,0.987289,0.983696,0.985294,0.956522,0.971014,0.985294
4,fold_04,8,0.964602,0.999112,0.99764,0.987805,0.903226,0.964286,0.987805
5,fold_00,16,0.95614,0.994579,0.990812,0.972603,0.926829,0.959459,0.972603
6,fold_01,16,0.991228,1.0,1.0,0.985714,1.0,1.0,0.985714
7,fold_02,16,0.973684,0.998249,0.996537,0.986842,0.947368,0.974026,0.986842
8,fold_03,16,0.982456,0.99886,0.998412,0.985075,0.978723,0.985075,0.985075
9,fold_04,16,0.955752,0.987508,0.981556,0.957746,0.952381,0.971429,0.957746


In [24]:
results = df.groupby(['repeat_seed']).apply(np.mean)
results

Unnamed: 0_level_0,repeat_seed,accuracy,prc_auc,roc_auc,sensitivity,specificity,precision,recall
repeat_seed,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
8,8.0,0.971868,0.994314,0.992438,0.980404,0.954329,0.975421,0.980404
16,16.0,0.971852,0.995839,0.993463,0.977596,0.96106,0.977998,0.977596
32,32.0,0.977177,0.996553,0.994668,0.982761,0.968482,0.980029,0.982761
64,64.0,0.97016,0.996466,0.99399,0.977979,0.95345,0.97622,0.977979
128,128.0,0.968406,0.995928,0.992478,0.980313,0.946251,0.969657,0.980313


In [25]:
results.mean().round(3)

repeat_seed    49.600
accuracy        0.972
prc_auc         0.996
roc_auc         0.993
sensitivity     0.980
specificity     0.957
precision       0.976
recall          0.980
dtype: float64

In [26]:
results.mean().round(3)

repeat_seed    49.600
accuracy        0.972
prc_auc         0.996
roc_auc         0.993
sensitivity     0.980
specificity     0.957
precision       0.976
recall          0.980
dtype: float64

In [27]:
results.std().round(3)

repeat_seed    48.793
accuracy        0.003
prc_auc         0.001
roc_auc         0.001
sensitivity     0.002
specificity     0.008
precision       0.004
recall          0.002
dtype: float64