In [1]:
# -*- coding: utf-8 -*-
"""
Created on Sun Aug 16 17:10:53 2020

@author: wanxiang.shen@u.nus.edu
"""

import warnings, os
warnings.filterwarnings("ignore")


import pandas as pd
import numpy as np
from glob import glob

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import confusion_matrix, precision_recall_curve, roc_auc_score
from sklearn.metrics import auc as calculate_auc

import matplotlib.pyplot as plt
import seaborn as sns

from aggmap import AggMap, AggModel, loadmap
from aggmap import show
np.random.seed(666)  # fixed NumPy seed, just for repeatable results

In [2]:
def score(dfr):
    """Print accuracy, ROC-AUC and balanced error rate for one set of predictions.

    Parameters
    ----------
    dfr : object exposing ``y_true`` and ``y_score`` attributes
        In this notebook a DataFrame with the binary ground truth and the
        predicted score of the positive class.

    Returns
    -------
    (acc, auc) : tuple
        Accuracy of the rounded scores and the ROC-AUC of the raw scores.
        The balanced error rate is only printed, not returned.
    """
    labels = dfr.y_true
    probs = dfr.y_score
    # Hard class calls are obtained by rounding the score to the nearest integer.
    hard_calls = dfr.y_score.round()

    # Unpacking into four counts requires a 2x2 confusion matrix.
    tn, fp, fn, tp = confusion_matrix(labels, hard_calls).ravel()

    n_total = sum([tn, fp, fn, tp])
    acc = (tp + tn) / n_total
    auc = roc_auc_score(labels, probs)

    # Balanced error rate: mean of the false-positive and false-negative rates.
    false_pos_rate = fp / (fp + tn)
    false_neg_rate = fn / (tp + fn)
    ber = (false_pos_rate + false_neg_rate) * 0.5

    print('acc: %.3f, roc-auc: %.3f, BER: %.3f' % (acc,auc,ber))

    return acc, auc

def get_best_epochs(X, y, batch_size):
    """Estimate a number of training epochs by 5-fold stratified cross-validation.

    A fresh ``AggModel.MultiClassEstimator`` (``patience=20``,
    ``monitor='val_loss'``) is fitted on each training split, with the
    held-out split passed as the third and fourth arguments of ``fit``
    (presumably the validation set that ``val_loss`` is computed on). The
    estimator's ``_performance.best_epoch`` is collected for every fold.

    Parameters
    ----------
    X : array indexable by integer index arrays along its first axis
    y : 2-D array; column 1 is used as the stratification label
    batch_size : forwarded to the estimator

    Returns
    -------
    int
        Mean of the per-fold best epochs, truncated by ``int``.

    Notes
    -----
    Reads the module-level ``gpuid``, which must be defined before calling.
    """
    splitter = StratifiedKFold(n_splits = 5, shuffle = True, random_state=1)

    epochs = []
    for fit_idx, val_idx in splitter.split(X, y[:,1]):

        fitX, fitY = X[fit_idx], y[fit_idx]
        valX, valY = X[val_idx], y[val_idx]

        estimator = AggModel.MultiClassEstimator(gpuid = gpuid,
                                                 batch_size = batch_size,
                                                 patience = 20,
                                                 monitor = 'val_loss')
        estimator.fit(fitX, fitY, valX, valY)

        epochs.append(estimator._performance.best_epoch)

    best_epoch = int(np.mean(epochs))
    print('Best Avg. Epochs: %s' % best_epoch)
    return best_epoch

In [3]:
# GPU id passed to every AggModel.MultiClassEstimator in this notebook; it is
# read as a global by get_best_epochs and by the cross-validation loop.
gpuid = 6

In [4]:
# Load the previously saved AggMap object that transforms each task's
# expression matrix in the cross-validation loop.
# NOTE(review): absolute, machine-specific path; adjust before re-running elsewhere.
mp = loadmap('/raid/shenwanxiang/agg_mp_object/RSPC-S.mp')

## 5-fold cross-validation

In [5]:
# Collect the "stage" task files. glob() gives no ordering guarantee, so the
# list is sorted to keep the task order (log output, rows of `res`)
# reproducible from run to run and from machine to machine.
lst = sorted(path for path in glob('../tasks_O_z-score/*.h5') if 'stage' in path)

n_fold = 5
batch_size = 16  # same for every task and fold, so set once outside the loops

# One row per (task, outer fold): [auc, fold, task, best_epoch, batch_size].
res = []
for p in lst:

    print('#'*50 + ' %s ' % p + '#'*50 )

    dfx = pd.read_hdf(p, key = 'expression')
    dfy = pd.read_hdf(p, key = 'labels')

    print(p, dfy.value_counts().to_dict())

    # Transform the expression matrix with the loaded AggMap object and
    # one-hot encode the labels; column 1 of Y is treated as the positive class.
    X = mp.batch_transform(dfx.values)
    Y = pd.get_dummies(dfy).values

    # NOTE(review): KFold ignores the labels given to split(), so these outer
    # folds are shuffled but not stratified. Kept as-is so the reported numbers
    # stay comparable; consider StratifiedKFold for the imbalanced tasks.
    outer = KFold(n_splits = n_fold, shuffle = True, random_state = 123)

    fold_all = []
    for i, (train_idx, test_idx) in enumerate(outer.split(X, Y[:,1])):

        trainX, trainY = X[train_idx], Y[train_idx]
        testX, testY = X[test_idx], Y[test_idx]

        ## training
        print("\nInput train and test X shape is %s, %s  \n" % (trainX.shape,  testX.shape))
        print("Getting the best number of epochs \n")

        # The epoch count is re-estimated in every outer fold from that fold's
        # training data only (inner CV inside get_best_epochs), so the outer
        # test fold never influences it.
        best_epochs = get_best_epochs(trainX, trainY, batch_size = batch_size)

        # Final model for this fold: fixed number of epochs, no validation data.
        clf = AggModel.MultiClassEstimator(epochs = best_epochs,  gpuid = gpuid,
                                           batch_size = batch_size, verbose=0)
        clf.fit(trainX, trainY)

        ## evaluate
        y_true = testY[:,1]
        y_score = clf.predict_proba(testX)[:,1]

        # Keep the sample ids of the test fold as the index of the prediction table.
        dfr = pd.DataFrame([y_true, y_score]).T
        dfr.columns = ['y_true', 'y_score']
        dfr.index = dfy.iloc[test_idx].index
        _, auc = score(dfr)  # accuracy is printed by score(); only AUC is recorded

        fold_all.append(auc)
        res.append([auc, i, p, best_epochs,  batch_size])

    print('The auc score for %s is %s.' % (p, np.mean(fold_all)))

  0%|          | 0/504 [00:00<?, ?it/s]

################################################## ../tasks_O_z-score/O_z-score_validate_LUSC_stage.h5 ##################################################
../tasks_O_z-score/O_z-score_validate_LUSC_stage.h5 {1: 255, 0: 249}


100%|##########| 504/504 [02:13<00:00,  3.78it/s]



Input train and test X shape is (403, 135, 134, 5), (101, 135, 134, 5)  

Getting the best number of epochs 

{'epochs': 200, 'lr': 0.0001, 'conv1_kernel_size': 13, 'dense_layers': [128], 'dense_avf': 'relu', 'batch_size': 16, 'dropout': 0.0, 'batch_norm': False, 'n_inception': 2, 'monitor': 'val_loss', 'patience': 20, 'random_state': 32, 'verbose': 0, 'name': 'AggMap MultiClass Estimator', 'gpuid': '6'}

Restoring model weights from the end of the best epoch.

Epoch 00073: early stopping
{'epochs': 200, 'lr': 0.0001, 'conv1_kernel_size': 13, 'dense_layers': [128], 'dense_avf': 'relu', 'batch_size': 16, 'dropout': 0.0, 'batch_norm': False, 'n_inception': 2, 'monitor': 'val_loss', 'patience': 20, 'random_state': 32, 'verbose': 0, 'name': 'AggMap MultiClass Estimator', 'gpuid': '6'}

Restoring model weights from the end of the best epoch.

Epoch 00069: early stopping
{'epochs': 200, 'lr': 0.0001, 'conv1_kernel_size': 13, 'dense_layers': [128], 'dense_avf': 'relu', 'batch_size': 16, 'dro

  0%|          | 0/1134 [00:00<?, ?it/s]

acc: 0.590, roc-auc: 0.616, BER: 0.394
The auc score for ../tasks_O_z-score/O_z-score_validate_LUSC_stage.h5 is 0.6244620132923214.
################################################## ../tasks_O_z-score/O_z-score_test_BRCA_stage.h5 ##################################################
../tasks_O_z-score/O_z-score_test_BRCA_stage.h5 {0: 861, 1: 273}


100%|##########| 1134/1134 [05:05<00:00,  3.71it/s]



Input train and test X shape is (907, 135, 134, 5), (227, 135, 134, 5)  

Getting the best number of epochs 

{'epochs': 200, 'lr': 0.0001, 'conv1_kernel_size': 13, 'dense_layers': [128], 'dense_avf': 'relu', 'batch_size': 16, 'dropout': 0.0, 'batch_norm': False, 'n_inception': 2, 'monitor': 'val_loss', 'patience': 20, 'random_state': 32, 'verbose': 0, 'name': 'AggMap MultiClass Estimator', 'gpuid': '6'}

Restoring model weights from the end of the best epoch.

Epoch 00053: early stopping
{'epochs': 200, 'lr': 0.0001, 'conv1_kernel_size': 13, 'dense_layers': [128], 'dense_avf': 'relu', 'batch_size': 16, 'dropout': 0.0, 'batch_norm': False, 'n_inception': 2, 'monitor': 'val_loss', 'patience': 20, 'random_state': 32, 'verbose': 0, 'name': 'AggMap MultiClass Estimator', 'gpuid': '6'}

Restoring model weights from the end of the best epoch.

Epoch 00050: early stopping
{'epochs': 200, 'lr': 0.0001, 'conv1_kernel_size': 13, 'dense_layers': [128], 'dense_avf': 'relu', 'batch_size': 16, 'dro

  0%|          | 0/374 [00:00<?, ?it/s]

acc: 0.774, roc-auc: 0.650, BER: 0.500
The auc score for ../tasks_O_z-score/O_z-score_test_BRCA_stage.h5 is 0.6286139467619619.
################################################## ../tasks_O_z-score/O_z-score_train_LIHC_stage.h5 ##################################################
../tasks_O_z-score/O_z-score_train_LIHC_stage.h5 {0: 197, 1: 177}


100%|##########| 374/374 [02:13<00:00,  2.80it/s]



Input train and test X shape is (299, 135, 134, 5), (75, 135, 134, 5)  

Getting the best number of epochs 

{'epochs': 200, 'lr': 0.0001, 'conv1_kernel_size': 13, 'dense_layers': [128], 'dense_avf': 'relu', 'batch_size': 16, 'dropout': 0.0, 'batch_norm': False, 'n_inception': 2, 'monitor': 'val_loss', 'patience': 20, 'random_state': 32, 'verbose': 0, 'name': 'AggMap MultiClass Estimator', 'gpuid': '6'}

Restoring model weights from the end of the best epoch.

Epoch 00116: early stopping
{'epochs': 200, 'lr': 0.0001, 'conv1_kernel_size': 13, 'dense_layers': [128], 'dense_avf': 'relu', 'batch_size': 16, 'dropout': 0.0, 'batch_norm': False, 'n_inception': 2, 'monitor': 'val_loss', 'patience': 20, 'random_state': 32, 'verbose': 0, 'name': 'AggMap MultiClass Estimator', 'gpuid': '6'}

Restoring model weights from the end of the best epoch.

Epoch 00063: early stopping
{'epochs': 200, 'lr': 0.0001, 'conv1_kernel_size': 13, 'dense_layers': [128], 'dense_avf': 'relu', 'batch_size': 16, 'drop

  0%|          | 0/554 [00:00<?, ?it/s]

acc: 0.581, roc-auc: 0.679, BER: 0.417
The auc score for ../tasks_O_z-score/O_z-score_train_LIHC_stage.h5 is 0.6816130670295297.
################################################## ../tasks_O_z-score/O_z-score_train_UCEC_stage.h5 ##################################################
../tasks_O_z-score/O_z-score_train_UCEC_stage.h5 {0: 342, 1: 212}


100%|##########| 554/554 [02:47<00:00,  3.31it/s]



Input train and test X shape is (443, 135, 134, 5), (111, 135, 134, 5)  

Getting the best number of epochs 

{'epochs': 200, 'lr': 0.0001, 'conv1_kernel_size': 13, 'dense_layers': [128], 'dense_avf': 'relu', 'batch_size': 16, 'dropout': 0.0, 'batch_norm': False, 'n_inception': 2, 'monitor': 'val_loss', 'patience': 20, 'random_state': 32, 'verbose': 0, 'name': 'AggMap MultiClass Estimator', 'gpuid': '6'}

Restoring model weights from the end of the best epoch.

Epoch 00062: early stopping
{'epochs': 200, 'lr': 0.0001, 'conv1_kernel_size': 13, 'dense_layers': [128], 'dense_avf': 'relu', 'batch_size': 16, 'dropout': 0.0, 'batch_norm': False, 'n_inception': 2, 'monitor': 'val_loss', 'patience': 20, 'random_state': 32, 'verbose': 0, 'name': 'AggMap MultiClass Estimator', 'gpuid': '6'}

Restoring model weights from the end of the best epoch.

Epoch 00068: early stopping
{'epochs': 200, 'lr': 0.0001, 'conv1_kernel_size': 13, 'dense_layers': [128], 'dense_avf': 'relu', 'batch_size': 16, 'dro

  0%|          | 0/513 [00:00<?, ?it/s]

acc: 0.627, roc-auc: 0.678, BER: 0.404
The auc score for ../tasks_O_z-score/O_z-score_train_UCEC_stage.h5 is 0.7067611479423046.
################################################## ../tasks_O_z-score/O_z-score_train_THCA_stage.h5 ##################################################
../tasks_O_z-score/O_z-score_train_THCA_stage.h5 {0: 291, 1: 222}


100%|##########| 513/513 [02:58<00:00,  2.88it/s]



Input train and test X shape is (410, 135, 134, 5), (103, 135, 134, 5)  

Getting the best number of epochs 

{'epochs': 200, 'lr': 0.0001, 'conv1_kernel_size': 13, 'dense_layers': [128], 'dense_avf': 'relu', 'batch_size': 16, 'dropout': 0.0, 'batch_norm': False, 'n_inception': 2, 'monitor': 'val_loss', 'patience': 20, 'random_state': 32, 'verbose': 0, 'name': 'AggMap MultiClass Estimator', 'gpuid': '6'}

Restoring model weights from the end of the best epoch.

Epoch 00074: early stopping
{'epochs': 200, 'lr': 0.0001, 'conv1_kernel_size': 13, 'dense_layers': [128], 'dense_avf': 'relu', 'batch_size': 16, 'dropout': 0.0, 'batch_norm': False, 'n_inception': 2, 'monitor': 'val_loss', 'patience': 20, 'random_state': 32, 'verbose': 0, 'name': 'AggMap MultiClass Estimator', 'gpuid': '6'}

Restoring model weights from the end of the best epoch.

Epoch 00065: early stopping
{'epochs': 200, 'lr': 0.0001, 'conv1_kernel_size': 13, 'dense_layers': [128], 'dense_avf': 'relu', 'batch_size': 16, 'dro

  0%|          | 0/505 [00:00<?, ?it/s]

acc: 0.657, roc-auc: 0.609, BER: 0.406
The auc score for ../tasks_O_z-score/O_z-score_train_THCA_stage.h5 is 0.678543821238921.
################################################## ../tasks_O_z-score/O_z-score_train_COAD_stage.h5 ##################################################
../tasks_O_z-score/O_z-score_train_COAD_stage.h5 {0: 296, 1: 209}


100%|##########| 505/505 [02:54<00:00,  2.90it/s]



Input train and test X shape is (404, 135, 134, 5), (101, 135, 134, 5)  

Getting the best number of epochs 

{'epochs': 200, 'lr': 0.0001, 'conv1_kernel_size': 13, 'dense_layers': [128], 'dense_avf': 'relu', 'batch_size': 16, 'dropout': 0.0, 'batch_norm': False, 'n_inception': 2, 'monitor': 'val_loss', 'patience': 20, 'random_state': 32, 'verbose': 0, 'name': 'AggMap MultiClass Estimator', 'gpuid': '6'}

Restoring model weights from the end of the best epoch.

Epoch 00093: early stopping
{'epochs': 200, 'lr': 0.0001, 'conv1_kernel_size': 13, 'dense_layers': [128], 'dense_avf': 'relu', 'batch_size': 16, 'dropout': 0.0, 'batch_norm': False, 'n_inception': 2, 'monitor': 'val_loss', 'patience': 20, 'random_state': 32, 'verbose': 0, 'name': 'AggMap MultiClass Estimator', 'gpuid': '6'}

Restoring model weights from the end of the best epoch.

Epoch 00069: early stopping
{'epochs': 200, 'lr': 0.0001, 'conv1_kernel_size': 13, 'dense_layers': [128], 'dense_avf': 'relu', 'batch_size': 16, 'dro

  0%|          | 0/544 [00:00<?, ?it/s]

acc: 0.723, roc-auc: 0.797, BER: 0.330
The auc score for ../tasks_O_z-score/O_z-score_train_COAD_stage.h5 is 0.7244282496365837.
################################################## ../tasks_O_z-score/O_z-score_train_KIRC_stage.h5 ##################################################
../tasks_O_z-score/O_z-score_train_KIRC_stage.h5 {0: 338, 1: 206}


100%|##########| 544/544 [02:56<00:00,  3.09it/s]



Input train and test X shape is (435, 135, 134, 5), (109, 135, 134, 5)  

Getting the best number of epochs 

{'epochs': 200, 'lr': 0.0001, 'conv1_kernel_size': 13, 'dense_layers': [128], 'dense_avf': 'relu', 'batch_size': 16, 'dropout': 0.0, 'batch_norm': False, 'n_inception': 2, 'monitor': 'val_loss', 'patience': 20, 'random_state': 32, 'verbose': 0, 'name': 'AggMap MultiClass Estimator', 'gpuid': '6'}

Restoring model weights from the end of the best epoch.

Epoch 00073: early stopping
{'epochs': 200, 'lr': 0.0001, 'conv1_kernel_size': 13, 'dense_layers': [128], 'dense_avf': 'relu', 'batch_size': 16, 'dropout': 0.0, 'batch_norm': False, 'n_inception': 2, 'monitor': 'val_loss', 'patience': 20, 'random_state': 32, 'verbose': 0, 'name': 'AggMap MultiClass Estimator', 'gpuid': '6'}

Restoring model weights from the end of the best epoch.

Epoch 00076: early stopping
{'epochs': 200, 'lr': 0.0001, 'conv1_kernel_size': 13, 'dense_layers': [128], 'dense_avf': 'relu', 'batch_size': 16, 'dro

  0%|          | 0/249 [00:00<?, ?it/s]

acc: 0.741, roc-auc: 0.766, BER: 0.304
The auc score for ../tasks_O_z-score/O_z-score_train_KIRC_stage.h5 is 0.7746025301007846.
################################################## ../tasks_O_z-score/O_z-score_train_SKCM_stage.h5 ##################################################
../tasks_O_z-score/O_z-score_train_SKCM_stage.h5 {0: 157, 1: 92}


100%|##########| 249/249 [01:13<00:00,  3.39it/s]



Input train and test X shape is (199, 135, 134, 5), (50, 135, 134, 5)  

Getting the best number of epochs 

{'epochs': 200, 'lr': 0.0001, 'conv1_kernel_size': 13, 'dense_layers': [128], 'dense_avf': 'relu', 'batch_size': 16, 'dropout': 0.0, 'batch_norm': False, 'n_inception': 2, 'monitor': 'val_loss', 'patience': 20, 'random_state': 32, 'verbose': 0, 'name': 'AggMap MultiClass Estimator', 'gpuid': '6'}

Restoring model weights from the end of the best epoch.

Epoch 00109: early stopping
{'epochs': 200, 'lr': 0.0001, 'conv1_kernel_size': 13, 'dense_layers': [128], 'dense_avf': 'relu', 'batch_size': 16, 'dropout': 0.0, 'batch_norm': False, 'n_inception': 2, 'monitor': 'val_loss', 'patience': 20, 'random_state': 32, 'verbose': 0, 'name': 'AggMap MultiClass Estimator', 'gpuid': '6'}

Restoring model weights from the end of the best epoch.

Epoch 00052: early stopping
{'epochs': 200, 'lr': 0.0001, 'conv1_kernel_size': 13, 'dense_layers': [128], 'dense_avf': 'relu', 'batch_size': 16, 'drop

  0%|          | 0/416 [00:00<?, ?it/s]

acc: 0.633, roc-auc: 0.601, BER: 0.516
The auc score for ../tasks_O_z-score/O_z-score_train_SKCM_stage.h5 is 0.6614024601351576.
################################################## ../tasks_O_z-score/O_z-score_train_STAD_stage.h5 ##################################################
../tasks_O_z-score/O_z-score_train_STAD_stage.h5 {1: 210, 0: 206}


100%|##########| 416/416 [02:00<00:00,  3.44it/s]



Input train and test X shape is (332, 135, 134, 5), (84, 135, 134, 5)  

Getting the best number of epochs 

{'epochs': 200, 'lr': 0.0001, 'conv1_kernel_size': 13, 'dense_layers': [128], 'dense_avf': 'relu', 'batch_size': 16, 'dropout': 0.0, 'batch_norm': False, 'n_inception': 2, 'monitor': 'val_loss', 'patience': 20, 'random_state': 32, 'verbose': 0, 'name': 'AggMap MultiClass Estimator', 'gpuid': '6'}

Restoring model weights from the end of the best epoch.

Epoch 00067: early stopping
{'epochs': 200, 'lr': 0.0001, 'conv1_kernel_size': 13, 'dense_layers': [128], 'dense_avf': 'relu', 'batch_size': 16, 'dropout': 0.0, 'batch_norm': False, 'n_inception': 2, 'monitor': 'val_loss', 'patience': 20, 'random_state': 32, 'verbose': 0, 'name': 'AggMap MultiClass Estimator', 'gpuid': '6'}

Restoring model weights from the end of the best epoch.

Epoch 00067: early stopping
{'epochs': 200, 'lr': 0.0001, 'conv1_kernel_size': 13, 'dense_layers': [128], 'dense_avf': 'relu', 'batch_size': 16, 'drop

  0%|          | 0/542 [00:00<?, ?it/s]

acc: 0.482, roc-auc: 0.467, BER: 0.515
The auc score for ../tasks_O_z-score/O_z-score_train_STAD_stage.h5 is 0.6177831681939514.
################################################## ../tasks_O_z-score/O_z-score_train_LUAD_stage.h5 ##################################################
../tasks_O_z-score/O_z-score_train_LUAD_stage.h5 {0: 306, 1: 236}


100%|##########| 542/542 [02:35<00:00,  3.48it/s]



Input train and test X shape is (433, 135, 134, 5), (109, 135, 134, 5)  

Getting the best number of epochs 

{'epochs': 200, 'lr': 0.0001, 'conv1_kernel_size': 13, 'dense_layers': [128], 'dense_avf': 'relu', 'batch_size': 16, 'dropout': 0.0, 'batch_norm': False, 'n_inception': 2, 'monitor': 'val_loss', 'patience': 20, 'random_state': 32, 'verbose': 0, 'name': 'AggMap MultiClass Estimator', 'gpuid': '6'}

Restoring model weights from the end of the best epoch.

Epoch 00062: early stopping
{'epochs': 200, 'lr': 0.0001, 'conv1_kernel_size': 13, 'dense_layers': [128], 'dense_avf': 'relu', 'batch_size': 16, 'dropout': 0.0, 'batch_norm': False, 'n_inception': 2, 'monitor': 'val_loss', 'patience': 20, 'random_state': 32, 'verbose': 0, 'name': 'AggMap MultiClass Estimator', 'gpuid': '6'}

Restoring model weights from the end of the best epoch.

Epoch 00086: early stopping
{'epochs': 200, 'lr': 0.0001, 'conv1_kernel_size': 13, 'dense_layers': [128], 'dense_avf': 'relu', 'batch_size': 16, 'dro

In [19]:
# One row per (task, outer fold); `score` holds that fold's ROC-AUC.
dfres = pd.DataFrame(res, columns = ['score', 'fold', 'task', 'best_epoch',  'batch_size'])
# Persist the per-fold results next to the notebook.
dfres.to_csv('./5FCV_results_stage.csv')

In [20]:
# Average every column over the folds of each task and rank tasks by mean score (ROC-AUC).
dfres.groupby('task').mean().sort_values('score',ascending=False)

Unnamed: 0_level_0,score,fold,best_epoch,batch_size
task,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
../tasks_O_z-score/O_z-score_train_KIRC_stage.h5,0.774603,2.0,46.6,16.0
../tasks_O_z-score/O_z-score_train_COAD_stage.h5,0.724428,2.0,60.0,16.0
../tasks_O_z-score/O_z-score_train_UCEC_stage.h5,0.706761,2.0,34.4,16.0
../tasks_O_z-score/O_z-score_train_LIHC_stage.h5,0.681613,2.0,43.6,16.0
../tasks_O_z-score/O_z-score_train_THCA_stage.h5,0.678544,2.0,47.6,16.0
../tasks_O_z-score/O_z-score_train_SKCM_stage.h5,0.661402,2.0,65.0,16.0
../tasks_O_z-score/O_z-score_train_LUAD_stage.h5,0.655976,2.0,36.6,16.0
../tasks_O_z-score/O_z-score_test_BRCA_stage.h5,0.628614,2.0,29.6,16.0
../tasks_O_z-score/O_z-score_validate_LUSC_stage.h5,0.624462,2.0,39.2,16.0
../tasks_O_z-score/O_z-score_train_STAD_stage.h5,0.617783,2.0,30.2,16.0


In [21]:
# Mean of the per-task means, so every task carries equal weight.
dfres.groupby('task').mean().mean()

score          0.675419
fold           2.000000
best_epoch    43.280000
batch_size    16.000000
dtype: float64

In [22]:
# Standard deviation of the per-task means across tasks.
dfres.groupby('task').mean().std()

score          0.049389
fold           0.000000
best_epoch    11.923161
batch_size     0.000000
dtype: float64