In [1]:
import warnings
warnings.filterwarnings("ignore")


import pandas as pd
import numpy as np

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import confusion_matrix, precision_recall_curve, roc_auc_score
from sklearn.metrics import auc as calculate_auc
from sklearn.datasets import load_breast_cancer

import matplotlib.pyplot as plt
import seaborn as sns

from aggmap import AggMap, AggModel

np.random.seed(666) #just for reaptable results

def prc_auc_score(y_true, y_score):
    precision, recall, threshold  = precision_recall_curve(y_true, y_score) #PRC_AUC
    auc = calculate_auc(recall, precision)
    return auc

In [2]:
data = load_breast_cancer()
dfx = pd.DataFrame(data.data, columns= data.feature_names)
y = pd.Series(data.target).map({0:'malignant', 1:'benign'})
dfy = pd.get_dummies(y) 
X = dfx.values.astype(float)
Y = dfy.values.astype(float)

In [3]:
mp = AggMap(dfx, metric = 'correlation')
mp.fit(cluster_channels = 5, verbose = 0)

2020-10-25 13:21:53,302 - [32mINFO[0m - [bidd-aggmap][0m - Calculating distance ...[0m
2020-10-25 13:21:53,303 - [32mINFO[0m - [bidd-aggmap][0m - the number of process is 16[0m


100%|##########| 435/435 [00:00<00:00, 2033.91it/s]
100%|##########| 435/435 [00:00<00:00, 1042584.14it/s]
100%|##########| 30/30 [00:00<00:00, 697.30it/s]


2020-10-25 13:21:53,758 - [32mINFO[0m - [bidd-aggmap][0m - applying hierarchical clustering to obtain group information ...[0m
2020-10-25 13:21:56,955 - [32mINFO[0m - [bidd-aggmap][0m - Applying grid feature map(assignment), this may take several minutes(1~30 min)[0m
2020-10-25 13:21:56,965 - [32mINFO[0m - [bidd-aggmap][0m - Finished[0m


<aggmap.map.AggMap at 0x7fd0a0131828>

## 5-fold cross validation and repeat 5 times 

In [4]:
outer_fold = 5

run_all = []
for repeat_seed in [8, 16, 32, 64, 128]: #5 repeats random seeds
    
    outer = KFold(n_splits = outer_fold, shuffle = True, random_state = repeat_seed)
    outer_idx = outer.split(range(len(Y)), Y[:,0])

    for i, idx in enumerate(outer_idx):
        
        fold_num = "fold_%s" % str(i).zfill(2) 
        
        train_idx, test_idx = idx
        
        testY = Y[test_idx]
        testx = X[test_idx]
        
        trainx = X[train_idx]
        trainY = Y[train_idx]

        trainX = mp.batch_transform(trainx, scale_method = 'standard')
        testX = mp.batch_transform(testx, scale_method = 'standard')
        
        print("\n input train and test X shape is %s, %s " % (trainX.shape,  testX.shape))
        
        clf = AggModel.MultiClassEstimator(epochs = 50, gpuid = 3, 
                                           batch_size = 8, verbose = 0) #conv1_kernel_size = 5,
        clf.fit(trainX, trainY)

        y_true = testY[:,0]
        y_pred = clf.predict(testX)[:,0]
        y_score = clf.predict_proba(testX)[:,0]

        tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
        
        acc = (tp + tn) / sum([tn, fp, fn, tp])
        
        sensitivity = tp / sum([tp, fn])
        specificity = tn / sum([tn, fp])
        
        prc_auc = prc_auc_score(y_true, y_score)
        roc_auc = roc_auc_score(y_true, y_score)
        
        precision = tp / sum([tp, fp])
        recall =  tp / sum([tp, fn]) #equals to sensitivity
        
        
        res = {'fold': fold_num,
               'repeat_seed':repeat_seed,
               
               'accuracy':acc, 
               
               'prc_auc':prc_auc, 
               'roc_auc':roc_auc,

               'sensitivity': sensitivity, 
               'specificity': specificity,
               
               'precision':precision,
               'recall':recall,
               'F1': 2*precision*sensitivity/(precision+sensitivity)
              }
        
        run_all.append(res)

100%|##########| 455/455 [00:02<00:00, 189.11it/s]
100%|##########| 114/114 [00:00<00:00, 1502.05it/s]



 input train and test X shape is (455, 6, 5, 5), (114, 6, 5, 5) 
MultiClassEstimator(batch_size=8, epochs=50, gpuid='3')


100%|##########| 455/455 [00:00<00:00, 1812.91it/s]
100%|##########| 114/114 [00:00<00:00, 991.65it/s]



 input train and test X shape is (455, 6, 5, 5), (114, 6, 5, 5) 
MultiClassEstimator(batch_size=8, epochs=50, gpuid='3')


100%|##########| 455/455 [00:00<00:00, 2113.32it/s]
100%|##########| 114/114 [00:00<00:00, 1519.00it/s]



 input train and test X shape is (455, 6, 5, 5), (114, 6, 5, 5) 
MultiClassEstimator(batch_size=8, epochs=50, gpuid='3')


100%|##########| 455/455 [00:00<00:00, 2101.09it/s]
100%|##########| 114/114 [00:00<00:00, 1320.83it/s]



 input train and test X shape is (455, 6, 5, 5), (114, 6, 5, 5) 
MultiClassEstimator(batch_size=8, epochs=50, gpuid='3')


100%|##########| 456/456 [00:00<00:00, 1952.04it/s]
100%|##########| 113/113 [00:00<00:00, 1431.82it/s]



 input train and test X shape is (456, 6, 5, 5), (113, 6, 5, 5) 
MultiClassEstimator(batch_size=8, epochs=50, gpuid='3')


100%|##########| 455/455 [00:00<00:00, 2459.70it/s]
100%|##########| 114/114 [00:00<00:00, 1511.48it/s]



 input train and test X shape is (455, 6, 5, 5), (114, 6, 5, 5) 
MultiClassEstimator(batch_size=8, epochs=50, gpuid='3')


100%|##########| 455/455 [00:00<00:00, 2173.62it/s]
100%|##########| 114/114 [00:00<00:00, 1095.20it/s]



 input train and test X shape is (455, 6, 5, 5), (114, 6, 5, 5) 
MultiClassEstimator(batch_size=8, epochs=50, gpuid='3')


100%|##########| 455/455 [00:00<00:00, 2240.99it/s]
100%|##########| 114/114 [00:00<00:00, 1281.32it/s]



 input train and test X shape is (455, 6, 5, 5), (114, 6, 5, 5) 
MultiClassEstimator(batch_size=8, epochs=50, gpuid='3')


100%|##########| 455/455 [00:00<00:00, 2087.39it/s]
100%|##########| 114/114 [00:00<00:00, 1350.97it/s]



 input train and test X shape is (455, 6, 5, 5), (114, 6, 5, 5) 
MultiClassEstimator(batch_size=8, epochs=50, gpuid='3')


100%|##########| 456/456 [00:00<00:00, 2328.73it/s]
100%|##########| 113/113 [00:00<00:00, 1261.46it/s]



 input train and test X shape is (456, 6, 5, 5), (113, 6, 5, 5) 
MultiClassEstimator(batch_size=8, epochs=50, gpuid='3')


100%|##########| 455/455 [00:00<00:00, 1842.14it/s]
100%|##########| 114/114 [00:00<00:00, 1493.52it/s]



 input train and test X shape is (455, 6, 5, 5), (114, 6, 5, 5) 
MultiClassEstimator(batch_size=8, epochs=50, gpuid='3')


100%|##########| 455/455 [00:00<00:00, 2174.25it/s]
100%|##########| 114/114 [00:00<00:00, 979.39it/s]



 input train and test X shape is (455, 6, 5, 5), (114, 6, 5, 5) 
MultiClassEstimator(batch_size=8, epochs=50, gpuid='3')


100%|##########| 455/455 [00:00<00:00, 2050.65it/s]
100%|##########| 114/114 [00:00<00:00, 1499.98it/s]



 input train and test X shape is (455, 6, 5, 5), (114, 6, 5, 5) 
MultiClassEstimator(batch_size=8, epochs=50, gpuid='3')


100%|##########| 455/455 [00:00<00:00, 2142.64it/s]
100%|##########| 114/114 [00:00<00:00, 1167.48it/s]



 input train and test X shape is (455, 6, 5, 5), (114, 6, 5, 5) 
MultiClassEstimator(batch_size=8, epochs=50, gpuid='3')


100%|##########| 456/456 [00:00<00:00, 2016.07it/s]
100%|##########| 113/113 [00:00<00:00, 1181.95it/s]



 input train and test X shape is (456, 6, 5, 5), (113, 6, 5, 5) 
MultiClassEstimator(batch_size=8, epochs=50, gpuid='3')


100%|##########| 455/455 [00:00<00:00, 2247.61it/s]
100%|##########| 114/114 [00:00<00:00, 1133.97it/s]



 input train and test X shape is (455, 6, 5, 5), (114, 6, 5, 5) 
MultiClassEstimator(batch_size=8, epochs=50, gpuid='3')


100%|##########| 455/455 [00:00<00:00, 2583.43it/s]
100%|##########| 114/114 [00:00<00:00, 1121.67it/s]



 input train and test X shape is (455, 6, 5, 5), (114, 6, 5, 5) 
MultiClassEstimator(batch_size=8, epochs=50, gpuid='3')


100%|##########| 455/455 [00:00<00:00, 2230.31it/s]
100%|##########| 114/114 [00:00<00:00, 1496.58it/s]



 input train and test X shape is (455, 6, 5, 5), (114, 6, 5, 5) 
MultiClassEstimator(batch_size=8, epochs=50, gpuid='3')


100%|##########| 455/455 [00:00<00:00, 2408.96it/s]
100%|##########| 114/114 [00:00<00:00, 1240.13it/s]



 input train and test X shape is (455, 6, 5, 5), (114, 6, 5, 5) 
MultiClassEstimator(batch_size=8, epochs=50, gpuid='3')


100%|##########| 456/456 [00:00<00:00, 2302.78it/s]
100%|##########| 113/113 [00:00<00:00, 1487.66it/s]



 input train and test X shape is (456, 6, 5, 5), (113, 6, 5, 5) 
MultiClassEstimator(batch_size=8, epochs=50, gpuid='3')


100%|##########| 455/455 [00:00<00:00, 2320.78it/s]
100%|##########| 114/114 [00:00<00:00, 1235.80it/s]



 input train and test X shape is (455, 6, 5, 5), (114, 6, 5, 5) 
MultiClassEstimator(batch_size=8, epochs=50, gpuid='3')


100%|##########| 455/455 [00:00<00:00, 2172.20it/s]
100%|##########| 114/114 [00:00<00:00, 1310.78it/s]



 input train and test X shape is (455, 6, 5, 5), (114, 6, 5, 5) 
MultiClassEstimator(batch_size=8, epochs=50, gpuid='3')


100%|##########| 455/455 [00:00<00:00, 2043.14it/s]
100%|##########| 114/114 [00:00<00:00, 1226.62it/s]



 input train and test X shape is (455, 6, 5, 5), (114, 6, 5, 5) 
MultiClassEstimator(batch_size=8, epochs=50, gpuid='3')


100%|##########| 455/455 [00:00<00:00, 2131.37it/s]
100%|##########| 114/114 [00:00<00:00, 1137.82it/s]



 input train and test X shape is (455, 6, 5, 5), (114, 6, 5, 5) 
MultiClassEstimator(batch_size=8, epochs=50, gpuid='3')


100%|##########| 456/456 [00:00<00:00, 2417.40it/s]
100%|##########| 113/113 [00:00<00:00, 1447.45it/s]



 input train and test X shape is (456, 6, 5, 5), (113, 6, 5, 5) 
MultiClassEstimator(batch_size=8, epochs=50, gpuid='3')


In [5]:
clf._model.count_params()

323138

In [6]:
df = pd.DataFrame(run_all)
df.to_excel('results.xlsx')


In [7]:
df.groupby('repeat_seed').apply(np.mean).mean().round(3)

repeat_seed    49.600
accuracy        0.972
prc_auc         0.996
roc_auc         0.994
sensitivity     0.980
specificity     0.955
precision       0.975
recall          0.980
F1              0.978
dtype: float64

In [8]:
df.groupby('repeat_seed').apply(np.std).mean().round(3)

repeat_seed    0.000
accuracy       0.014
prc_auc        0.005
roc_auc        0.007
sensitivity    0.013
specificity    0.032
precision      0.015
recall         0.013
F1             0.011
dtype: float64