In [1]:
import warnings
warnings.filterwarnings("ignore")


import pandas as pd
import numpy as np

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import confusion_matrix, precision_recall_curve
from sklearn.metrics import auc as calculate_auc
from sklearn.datasets import load_breast_cancer

import matplotlib.pyplot as plt
import seaborn as sns

from aggmap import AggMap, AggModel

# Fix NumPy's global random state so repeated runs are comparable.
np.random.seed(888)

# Six-colour reversed-rainbow palette, kept in `color` for the plots
# further down; palplot renders a preview strip of it.
color = sns.color_palette("rainbow_r", n_colors=6)
sns.palplot(color)

In [2]:
# Load the breast-cancer dataset and wrap the feature matrix in a
# DataFrame whose columns carry the feature names.
data = load_breast_cancer()
dfx = pd.DataFrame(data=data.data, columns=data.feature_names)

# Replace the integer targets with readable class names, then one-hot
# encode them (one indicator column per class name).
label_names = {0: 'malignant', 1: 'benign'}
y = pd.Series(data.target).map(label_names)
dfy = pd.get_dummies(y)

In [3]:
# Plain float arrays of the features and the one-hot labels; the
# cross-validation loop slices these by row index.
X = dfx.to_numpy().astype(float)
Y = dfy.to_numpy().astype(float)

# Construct the AggMap object from the full feature table, using the
# 'correlation' metric.
mp = AggMap(dfx, metric='correlation')

2020-10-14 10:26:02,978 - [32mINFO[0m - [bidd-aggmap][0m - Calculating distance ...[0m
2020-10-14 10:26:02,979 - [32mINFO[0m - [bidd-aggmap][0m - the number of process is 16[0m


100%|##########| 435/435 [00:00<00:00, 2613.26it/s]
100%|##########| 435/435 [00:00<00:00, 1006355.34it/s]
100%|##########| 30/30 [00:00<00:00, 701.19it/s]


In [None]:
n_splits = 5  # 5-fold cross-validation, repeated once per seed below
run_all = []  # one record per (repeat_seed, c, fold): training history plus run metadata

for repeat_seed in [8, 16, 32, 64, 128]:  # 5 repeats, each with its own shuffle seed

    # The fold indices are computed once per repeat and reused for every
    # value of `c`, so the channel settings are compared on identical splits.
    outer = KFold(n_splits = n_splits, shuffle = True, random_state = repeat_seed)
    outer_idx = list(outer.split(range(len(Y))))

    for c in [1, 3, 5]:

        # Refit the feature map with `c` cluster channels. In the recorded
        # output the last axis of the transformed arrays equals `c`.
        mp = mp.fit(cluster_channels = c, verbose = 0)

        for i, idx in enumerate(outer_idx):

            train_idx, valid_idx = idx
            fold_num = "fold_%s" % str(i+1).zfill(2)  # "fold_01", "fold_02", ...

            validY = Y[valid_idx]
            validx = X[valid_idx]

            trainx = X[train_idx]
            trainY = Y[train_idx]

            # NOTE(review): `mp` was constructed from the full `dfx`; if the
            # 'standard' scaling uses statistics gathered there, validation
            # rows influence the scaling of the training data -- confirm.
            trainX = mp.batch_transform(trainx, scale_method = 'standard')
            validX = mp.batch_transform(validx, scale_method = 'standard')
            print("\n input train and test X shape is %s, %s " % (trainX.shape,  validX.shape))
            clf = AggModel.MultiClassEstimator(epochs = 50, batch_size = 8, conv1_kernel_size = 5, metric = 'ACC',verbose = 0)
            clf.fit(trainX, trainY, X_valid = validX, y_valid = validY)

            # Copy the history mapping before annotating it, so the metadata
            # keys are not written into the estimator's own history object.
            history = dict(clf.history.history)
            history['fold'] = fold_num
            history['c'] = c
            history['repeat_seed'] = repeat_seed

            run_all.append(history)

  0%|          | 0/455 [00:00<?, ?it/s]

2020-10-14 21:31:06,870 - [32mINFO[0m - [bidd-aggmap][0m - applying hierarchical clustering to obtain group information ...[0m
2020-10-14 21:31:06,942 - [32mINFO[0m - [bidd-aggmap][0m - Applying grid feature map(assignment), this may take several minutes(1~30 min)[0m
2020-10-14 21:31:06,948 - [32mINFO[0m - [bidd-aggmap][0m - Finished[0m


100%|##########| 455/455 [00:02<00:00, 178.22it/s]
100%|##########| 114/114 [00:00<00:00, 1552.29it/s]



 input train and test X shape is (455, 6, 5, 1), (114, 6, 5, 1) 
MultiClassEstimator(batch_size=8, conv1_kernel_size=5, epochs=50, gpuid='0',
                    metric='ACC')


100%|##########| 455/455 [00:00<00:00, 2130.70it/s]
100%|##########| 114/114 [00:00<00:00, 1400.41it/s]



 input train and test X shape is (455, 6, 5, 1), (114, 6, 5, 1) 
MultiClassEstimator(batch_size=8, conv1_kernel_size=5, epochs=50, gpuid='0',
                    metric='ACC')


100%|##########| 455/455 [00:00<00:00, 2107.07it/s]
100%|##########| 114/114 [00:00<00:00, 1486.39it/s]



 input train and test X shape is (455, 6, 5, 1), (114, 6, 5, 1) 
MultiClassEstimator(batch_size=8, conv1_kernel_size=5, epochs=50, gpuid='0',
                    metric='ACC')


100%|##########| 455/455 [00:00<00:00, 2099.46it/s]
100%|##########| 114/114 [00:00<00:00, 1905.80it/s]



 input train and test X shape is (455, 6, 5, 1), (114, 6, 5, 1) 
MultiClassEstimator(batch_size=8, conv1_kernel_size=5, epochs=50, gpuid='0',
                    metric='ACC')


100%|##########| 456/456 [00:00<00:00, 2719.12it/s]
100%|##########| 113/113 [00:00<00:00, 1491.43it/s]



 input train and test X shape is (456, 6, 5, 1), (113, 6, 5, 1) 
MultiClassEstimator(batch_size=8, conv1_kernel_size=5, epochs=50, gpuid='0',
                    metric='ACC')


 33%|###2      | 148/455 [00:00<00:00, 1368.78it/s]

2020-10-14 21:32:53,367 - [32mINFO[0m - [bidd-aggmap][0m - applying hierarchical clustering to obtain group information ...[0m
2020-10-14 21:32:53,434 - [32mINFO[0m - [bidd-aggmap][0m - Applying grid feature map(assignment), this may take several minutes(1~30 min)[0m
2020-10-14 21:32:53,442 - [32mINFO[0m - [bidd-aggmap][0m - Finished[0m


100%|##########| 455/455 [00:00<00:00, 2647.71it/s]
100%|##########| 114/114 [00:00<00:00, 1231.26it/s]



 input train and test X shape is (455, 6, 5, 3), (114, 6, 5, 3) 
MultiClassEstimator(batch_size=8, conv1_kernel_size=5, epochs=50, gpuid='0',
                    metric='ACC')


100%|##########| 455/455 [00:00<00:00, 2271.11it/s]
100%|##########| 114/114 [00:00<00:00, 1307.53it/s]



 input train and test X shape is (455, 6, 5, 3), (114, 6, 5, 3) 
MultiClassEstimator(batch_size=8, conv1_kernel_size=5, epochs=50, gpuid='0',
                    metric='ACC')


100%|##########| 455/455 [00:00<00:00, 2351.29it/s]
100%|##########| 114/114 [00:00<00:00, 1298.43it/s]



 input train and test X shape is (455, 6, 5, 3), (114, 6, 5, 3) 
MultiClassEstimator(batch_size=8, conv1_kernel_size=5, epochs=50, gpuid='0',
                    metric='ACC')


100%|##########| 455/455 [00:00<00:00, 2143.01it/s]
100%|##########| 114/114 [00:00<00:00, 1209.39it/s]



 input train and test X shape is (455, 6, 5, 3), (114, 6, 5, 3) 
MultiClassEstimator(batch_size=8, conv1_kernel_size=5, epochs=50, gpuid='0',
                    metric='ACC')


100%|##########| 456/456 [00:00<00:00, 2759.59it/s]
100%|##########| 113/113 [00:00<00:00, 1553.46it/s]



 input train and test X shape is (456, 6, 5, 3), (113, 6, 5, 3) 
MultiClassEstimator(batch_size=8, conv1_kernel_size=5, epochs=50, gpuid='0',
                    metric='ACC')


 24%|##3       | 108/455 [00:00<00:00, 1010.82it/s]

2020-10-14 21:34:37,651 - [32mINFO[0m - [bidd-aggmap][0m - applying hierarchical clustering to obtain group information ...[0m
2020-10-14 21:34:37,719 - [32mINFO[0m - [bidd-aggmap][0m - Applying grid feature map(assignment), this may take several minutes(1~30 min)[0m
2020-10-14 21:34:37,728 - [32mINFO[0m - [bidd-aggmap][0m - Finished[0m


100%|##########| 455/455 [00:00<00:00, 2301.04it/s]
100%|##########| 114/114 [00:00<00:00, 1110.43it/s]



 input train and test X shape is (455, 6, 5, 5), (114, 6, 5, 5) 
MultiClassEstimator(batch_size=8, conv1_kernel_size=5, epochs=50, gpuid='0',
                    metric='ACC')


100%|##########| 455/455 [00:00<00:00, 2256.27it/s]
100%|##########| 114/114 [00:00<00:00, 1138.25it/s]



 input train and test X shape is (455, 6, 5, 5), (114, 6, 5, 5) 
MultiClassEstimator(batch_size=8, conv1_kernel_size=5, epochs=50, gpuid='0',
                    metric='ACC')


100%|##########| 455/455 [00:00<00:00, 2389.14it/s]
100%|##########| 114/114 [00:00<00:00, 1110.09it/s]



 input train and test X shape is (455, 6, 5, 5), (114, 6, 5, 5) 
MultiClassEstimator(batch_size=8, conv1_kernel_size=5, epochs=50, gpuid='0',
                    metric='ACC')


In [None]:
# Gather the per-run records into one table (one row per training run)
# and write it to disk next to the notebook.
dfr = pd.DataFrame(data=run_all)
dfr.to_csv(path_or_buf='./5FCV_valid.csv')

In [None]:
# Global seaborn look for the figures below: white background, fonts scaled by 1.2.
sns.set(font_scale=1.2, style='white')

In [1]:
# Plot the validation loss and accuracy curves, averaged over all runs
# that share the same number of cluster channels `c`.
#
# This cell needs names from earlier cells in the same kernel session:
# the imports (`plt`, `np`, `pd`), the palette `color`, and the results
# table `dfr`. Running it in a fresh kernel raises NameError.


def _epoch_stats_by_channel(history_frame, column):
    """Return per-epoch mean and std of `column`, grouped by `c`.

    Each cell of `history_frame[column]` is expected to hold one run's
    per-epoch list of values. Runs sharing the same `c` are stacked and
    reduced along the run axis.

    Returns a ``(mean, std)`` pair of DataFrames with one row per epoch
    and one column per value of `c`.
    """
    # Group once and reuse the per-channel list of runs for both statistics.
    runs_per_channel = history_frame.groupby(['c']).agg({column: lambda x: x.tolist()})[column]
    mean = runs_per_channel.apply(lambda runs: np.array(runs).mean(axis=0)).apply(pd.Series).T
    std = runs_per_channel.apply(lambda runs: np.array(runs).std(axis=0)).apply(pd.Series).T
    return mean, std


max_epochs_shown = 50  # matches the `epochs = 50` used for training

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(18,6), sharex=False, sharey=False)
ax1, ax2 = axes

# Left panel: mean validation loss per epoch, one line per `c`.
col = 'val_loss'
loss_mean, loss_std = _epoch_stats_by_channel(dfr, col)
loss_mean.head(max_epochs_shown).plot(ax=ax1, lw = 2, color = color)
ax1.set_ylabel('Validate loss')
ax1.set_xlabel('Epochs')

# Right panel: mean validation accuracy per epoch, one line per `c`.
col = 'val_accuracy'
acc_mean, acc_std = _epoch_stats_by_channel(dfr, col)
acc_mean.head(max_epochs_shown).plot(ax=ax2, lw = 2, color = color)
ax2.set_ylabel('Validate accuracy')
ax2.set_xlabel('Epochs')

# Save through the explicit figure handle rather than the pyplot
# "current figure" state.
fig.savefig('5FCV_valid.png', bbox_inches='tight', dpi=300)

NameError: name 'plt' is not defined