In [1]:
# -*- coding: utf-8 -*-
"""
Created on Sun Aug 16 17:10:53 2020

@author: wanxiang.shen@u.nus.edu
"""

import warnings, os
warnings.filterwarnings("ignore")


import pandas as pd
import numpy as np
from glob import glob

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import confusion_matrix, precision_recall_curve, roc_auc_score
from sklearn.metrics import auc as calculate_auc

import matplotlib.pyplot as plt
import seaborn as sns

from aggmap import AggMap, AggModel, loadmap
from aggmap import show
np.random.seed(666) #just for reaptable results

In [2]:
def score(dfr):
    y_true = dfr.y_true
    y_score = dfr.y_score
    y_pred = dfr.y_score.round()

    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()

    acc = (tp + tn) / sum([tn, fp, fn, tp])
    auc = roc_auc_score(y_true, y_score)
    ber =  (fp / (fp + tn) + fn / (tp + fn)) * 0.5

    print('acc: %.3f, roc-auc: %.3f, BER: %.3f' % (acc,auc,ber))

    return acc, auc

def get_best_epochs(X, y, batch_size):
    
    ## get best epochs
    kf = StratifiedKFold(n_splits = 5, shuffle = True, random_state=1)
    split_idx = kf.split(X, y[:,1])
    
    epochs = []
    for i, idx in enumerate(split_idx):

        train_idx, test_idx = idx

        testY = y[test_idx]
        testX = X[test_idx]

        trainX = X[train_idx]
        trainY = y[train_idx]
        
        clf = AggModel.MultiClassEstimator(gpuid = gpuid,
                                           batch_size = batch_size,
                                           patience = 20, 
                                           monitor = 'val_loss') #
        
        clf.fit(trainX, trainY, testX, testY)  

        epochs.append(clf._performance.best_epoch)
    
    best_epoch = int(np.mean(epochs))
    print('Best Avg. Epochs: %s' % best_epoch)
    return best_epoch

In [3]:
gpuid = 2

In [4]:
mp = loadmap('/raid/shenwanxiang/agg_mp_object/RSPC-S.mp')

## 5 fold CV

In [5]:
lst = glob('../tasks_O_z-score/*.h5')
lst = [i for i in lst if 'grade' in i]

n_fold = 5


res = []
for p in lst:
    
    print('#'*50 + ' %s ' % p + '#'*50 )

    dfx = pd.read_hdf(p, key = 'expression')
    dfy = pd.read_hdf(p, key = 'labels')

    print(p, dfy.value_counts().to_dict())
    
    X = mp.batch_transform(dfx.values)
    Y = pd.get_dummies(dfy).values

    outer = KFold(n_splits = n_fold, shuffle = True, random_state = 123)
    outer_idx = outer.split(X, Y[:,1])

    fold_all = []
    for i, idx in enumerate(outer_idx):

        fold_num = "fold_%s" % str(i).zfill(2) 

        train_idx, test_idx = idx

        testY = Y[test_idx]
        testX = X[test_idx]

        trainX = X[train_idx]
        trainY = Y[train_idx]

        ## training
        print("\nInput train and test X shape is %s, %s  \n" % (trainX.shape,  testX.shape))
        print("Getting the best number of epochs \n")
        
        #optimizied the first fold HPs only

        batch_size = 16
        best_epochs = get_best_epochs(trainX, trainY, batch_size = batch_size)

        clf = AggModel.MultiClassEstimator(epochs = best_epochs,  gpuid = gpuid, 
                                           batch_size = batch_size, verbose=0) #
        clf.fit(trainX, trainY)  #
        
        ## evaluate
        y_true = testY[:,1] 
        y_score = clf.predict_proba(testX)[:,1]

        dfr = pd.DataFrame([y_true, y_score]).T
        dfr.columns = ['y_true', 'y_score']
        dfr.index = dfy.iloc[test_idx].index
        acc, auc = score(dfr)
        
        fold_all.append(auc)
        
        res.append([auc, i, p, best_epochs,  batch_size])
        
    print('The auc score for %s is %s.' % (p, np.mean(fold_all)))

################################################## ../tasks_O_z-score/O_z-score_train_UCEC_grade.h5 ##################################################


  0%|          | 0/554 [00:00<?, ?it/s]

../tasks_O_z-score/O_z-score_train_UCEC_grade.h5 {1: 324, 0: 230}


100%|##########| 554/554 [02:45<00:00,  3.34it/s]



Input train and test X shape is (443, 135, 134, 5), (111, 135, 134, 5)  

Getting the best number of epochs 

{'epochs': 200, 'lr': 0.0001, 'conv1_kernel_size': 13, 'dense_layers': [128], 'dense_avf': 'relu', 'batch_size': 16, 'dropout': 0.0, 'batch_norm': False, 'n_inception': 2, 'monitor': 'val_loss', 'patience': 20, 'random_state': 32, 'verbose': 0, 'name': 'AggMap MultiClass Estimator', 'gpuid': '2'}

Restoring model weights from the end of the best epoch.

Epoch 00063: early stopping
{'epochs': 200, 'lr': 0.0001, 'conv1_kernel_size': 13, 'dense_layers': [128], 'dense_avf': 'relu', 'batch_size': 16, 'dropout': 0.0, 'batch_norm': False, 'n_inception': 2, 'monitor': 'val_loss', 'patience': 20, 'random_state': 32, 'verbose': 0, 'name': 'AggMap MultiClass Estimator', 'gpuid': '2'}

Restoring model weights from the end of the best epoch.

Epoch 00081: early stopping
{'epochs': 200, 'lr': 0.0001, 'conv1_kernel_size': 13, 'dense_layers': [128], 'dense_avf': 'relu', 'batch_size': 16, 'dro

  0%|          | 0/374 [00:00<?, ?it/s]

acc: 0.836, roc-auc: 0.906, BER: 0.150
The auc score for ../tasks_O_z-score/O_z-score_train_UCEC_grade.h5 is 0.9027706449579567.
################################################## ../tasks_O_z-score/O_z-score_train_LIHC_grade.h5 ##################################################
../tasks_O_z-score/O_z-score_train_LIHC_grade.h5 {0: 250, 1: 124}


100%|##########| 374/374 [02:00<00:00,  3.10it/s]



Input train and test X shape is (299, 135, 134, 5), (75, 135, 134, 5)  

Getting the best number of epochs 

{'epochs': 200, 'lr': 0.0001, 'conv1_kernel_size': 13, 'dense_layers': [128], 'dense_avf': 'relu', 'batch_size': 16, 'dropout': 0.0, 'batch_norm': False, 'n_inception': 2, 'monitor': 'val_loss', 'patience': 20, 'random_state': 32, 'verbose': 0, 'name': 'AggMap MultiClass Estimator', 'gpuid': '2'}

Restoring model weights from the end of the best epoch.

Epoch 00063: early stopping
{'epochs': 200, 'lr': 0.0001, 'conv1_kernel_size': 13, 'dense_layers': [128], 'dense_avf': 'relu', 'batch_size': 16, 'dropout': 0.0, 'batch_norm': False, 'n_inception': 2, 'monitor': 'val_loss', 'patience': 20, 'random_state': 32, 'verbose': 0, 'name': 'AggMap MultiClass Estimator', 'gpuid': '2'}

Restoring model weights from the end of the best epoch.

Epoch 00107: early stopping
{'epochs': 200, 'lr': 0.0001, 'conv1_kernel_size': 13, 'dense_layers': [128], 'dense_avf': 'relu', 'batch_size': 16, 'drop

  0%|          | 0/416 [00:00<?, ?it/s]

acc: 0.662, roc-auc: 0.777, BER: 0.446
The auc score for ../tasks_O_z-score/O_z-score_train_LIHC_grade.h5 is 0.6890299406666223.
################################################## ../tasks_O_z-score/O_z-score_train_STAD_grade.h5 ##################################################
../tasks_O_z-score/O_z-score_train_STAD_grade.h5 {1: 246, 0: 170}


100%|##########| 416/416 [02:05<00:00,  3.31it/s]



Input train and test X shape is (332, 135, 134, 5), (84, 135, 134, 5)  

Getting the best number of epochs 

{'epochs': 200, 'lr': 0.0001, 'conv1_kernel_size': 13, 'dense_layers': [128], 'dense_avf': 'relu', 'batch_size': 16, 'dropout': 0.0, 'batch_norm': False, 'n_inception': 2, 'monitor': 'val_loss', 'patience': 20, 'random_state': 32, 'verbose': 0, 'name': 'AggMap MultiClass Estimator', 'gpuid': '2'}

Restoring model weights from the end of the best epoch.

Epoch 00078: early stopping
{'epochs': 200, 'lr': 0.0001, 'conv1_kernel_size': 13, 'dense_layers': [128], 'dense_avf': 'relu', 'batch_size': 16, 'dropout': 0.0, 'batch_norm': False, 'n_inception': 2, 'monitor': 'val_loss', 'patience': 20, 'random_state': 32, 'verbose': 0, 'name': 'AggMap MultiClass Estimator', 'gpuid': '2'}

Restoring model weights from the end of the best epoch.

Epoch 00070: early stopping
{'epochs': 200, 'lr': 0.0001, 'conv1_kernel_size': 13, 'dense_layers': [128], 'dense_avf': 'relu', 'batch_size': 16, 'drop

  0%|          | 0/544 [00:00<?, ?it/s]

acc: 0.783, roc-auc: 0.807, BER: 0.230
The auc score for ../tasks_O_z-score/O_z-score_train_STAD_grade.h5 is 0.7544023003842244.
################################################## ../tasks_O_z-score/O_z-score_train_KIRC_grade.h5 ##################################################
../tasks_O_z-score/O_z-score_train_KIRC_grade.h5 {0: 337, 1: 207}


100%|##########| 544/544 [02:57<00:00,  3.07it/s]



Input train and test X shape is (435, 135, 134, 5), (109, 135, 134, 5)  

Getting the best number of epochs 

{'epochs': 200, 'lr': 0.0001, 'conv1_kernel_size': 13, 'dense_layers': [128], 'dense_avf': 'relu', 'batch_size': 16, 'dropout': 0.0, 'batch_norm': False, 'n_inception': 2, 'monitor': 'val_loss', 'patience': 20, 'random_state': 32, 'verbose': 0, 'name': 'AggMap MultiClass Estimator', 'gpuid': '2'}

Restoring model weights from the end of the best epoch.

Epoch 00075: early stopping
{'epochs': 200, 'lr': 0.0001, 'conv1_kernel_size': 13, 'dense_layers': [128], 'dense_avf': 'relu', 'batch_size': 16, 'dropout': 0.0, 'batch_norm': False, 'n_inception': 2, 'monitor': 'val_loss', 'patience': 20, 'random_state': 32, 'verbose': 0, 'name': 'AggMap MultiClass Estimator', 'gpuid': '2'}

Restoring model weights from the end of the best epoch.

Epoch 00040: early stopping
{'epochs': 200, 'lr': 0.0001, 'conv1_kernel_size': 13, 'dense_layers': [128], 'dense_avf': 'relu', 'batch_size': 16, 'dro

  0%|          | 0/504 [00:00<?, ?it/s]

acc: 0.565, roc-auc: 0.546, BER: 0.491
The auc score for ../tasks_O_z-score/O_z-score_train_KIRC_grade.h5 is 0.6315262010907865.
################################################## ../tasks_O_z-score/O_z-score_test_HNSC_grade.h5 ##################################################
../tasks_O_z-score/O_z-score_test_HNSC_grade.h5 {0: 385, 1: 119}


100%|##########| 504/504 [02:56<00:00,  2.85it/s]



Input train and test X shape is (403, 135, 134, 5), (101, 135, 134, 5)  

Getting the best number of epochs 

{'epochs': 200, 'lr': 0.0001, 'conv1_kernel_size': 13, 'dense_layers': [128], 'dense_avf': 'relu', 'batch_size': 16, 'dropout': 0.0, 'batch_norm': False, 'n_inception': 2, 'monitor': 'val_loss', 'patience': 20, 'random_state': 32, 'verbose': 0, 'name': 'AggMap MultiClass Estimator', 'gpuid': '2'}

Restoring model weights from the end of the best epoch.

Epoch 00090: early stopping
{'epochs': 200, 'lr': 0.0001, 'conv1_kernel_size': 13, 'dense_layers': [128], 'dense_avf': 'relu', 'batch_size': 16, 'dropout': 0.0, 'batch_norm': False, 'n_inception': 2, 'monitor': 'val_loss', 'patience': 20, 'random_state': 32, 'verbose': 0, 'name': 'AggMap MultiClass Estimator', 'gpuid': '2'}

Restoring model weights from the end of the best epoch.

Epoch 00058: early stopping
{'epochs': 200, 'lr': 0.0001, 'conv1_kernel_size': 13, 'dense_layers': [128], 'dense_avf': 'relu', 'batch_size': 16, 'dro

  0%|          | 0/306 [00:00<?, ?it/s]

acc: 0.720, roc-auc: 0.811, BER: 0.425
The auc score for ../tasks_O_z-score/O_z-score_test_HNSC_grade.h5 is 0.758214691671114.
################################################## ../tasks_O_z-score/O_z-score_train_CESC_grade.h5 ##################################################
../tasks_O_z-score/O_z-score_train_CESC_grade.h5 {0: 187, 1: 119}


100%|##########| 306/306 [01:29<00:00,  3.42it/s]



Input train and test X shape is (244, 135, 134, 5), (62, 135, 134, 5)  

Getting the best number of epochs 

{'epochs': 200, 'lr': 0.0001, 'conv1_kernel_size': 13, 'dense_layers': [128], 'dense_avf': 'relu', 'batch_size': 16, 'dropout': 0.0, 'batch_norm': False, 'n_inception': 2, 'monitor': 'val_loss', 'patience': 20, 'random_state': 32, 'verbose': 0, 'name': 'AggMap MultiClass Estimator', 'gpuid': '2'}

Restoring model weights from the end of the best epoch.

Epoch 00073: early stopping
{'epochs': 200, 'lr': 0.0001, 'conv1_kernel_size': 13, 'dense_layers': [128], 'dense_avf': 'relu', 'batch_size': 16, 'dropout': 0.0, 'batch_norm': False, 'n_inception': 2, 'monitor': 'val_loss', 'patience': 20, 'random_state': 32, 'verbose': 0, 'name': 'AggMap MultiClass Estimator', 'gpuid': '2'}

Restoring model weights from the end of the best epoch.

Epoch 00081: early stopping
{'epochs': 200, 'lr': 0.0001, 'conv1_kernel_size': 13, 'dense_layers': [128], 'dense_avf': 'relu', 'batch_size': 16, 'drop

  0%|          | 0/179 [00:00<?, ?it/s]

acc: 0.607, roc-auc: 0.599, BER: 0.405
The auc score for ../tasks_O_z-score/O_z-score_train_CESC_grade.h5 is 0.6680213731684319.
################################################## ../tasks_O_z-score/O_z-score_train_PAAD_grade.h5 ##################################################
../tasks_O_z-score/O_z-score_train_PAAD_grade.h5 {0: 131, 1: 48}


100%|##########| 179/179 [00:58<00:00,  3.08it/s]



Input train and test X shape is (143, 135, 134, 5), (36, 135, 134, 5)  

Getting the best number of epochs 

{'epochs': 200, 'lr': 0.0001, 'conv1_kernel_size': 13, 'dense_layers': [128], 'dense_avf': 'relu', 'batch_size': 16, 'dropout': 0.0, 'batch_norm': False, 'n_inception': 2, 'monitor': 'val_loss', 'patience': 20, 'random_state': 32, 'verbose': 0, 'name': 'AggMap MultiClass Estimator', 'gpuid': '2'}

Restoring model weights from the end of the best epoch.

Epoch 00112: early stopping
{'epochs': 200, 'lr': 0.0001, 'conv1_kernel_size': 13, 'dense_layers': [128], 'dense_avf': 'relu', 'batch_size': 16, 'dropout': 0.0, 'batch_norm': False, 'n_inception': 2, 'monitor': 'val_loss', 'patience': 20, 'random_state': 32, 'verbose': 0, 'name': 'AggMap MultiClass Estimator', 'gpuid': '2'}
{'epochs': 200, 'lr': 0.0001, 'conv1_kernel_size': 13, 'dense_layers': [128], 'dense_avf': 'relu', 'batch_size': 16, 'dropout': 0.0, 'batch_norm': False, 'n_inception': 2, 'monitor': 'val_loss', 'patience': 2

  0%|          | 0/532 [00:00<?, ?it/s]

acc: 0.571, roc-auc: 0.707, BER: 0.500
The auc score for ../tasks_O_z-score/O_z-score_train_PAAD_grade.h5 is 0.6310336711265518.
################################################## ../tasks_O_z-score/O_z-score_train_LGG_grade.h5 ##################################################
../tasks_O_z-score/O_z-score_train_LGG_grade.h5 {1: 269, 0: 263}


100%|##########| 532/532 [02:54<00:00,  3.05it/s]



Input train and test X shape is (425, 135, 134, 5), (107, 135, 134, 5)  

Getting the best number of epochs 

{'epochs': 200, 'lr': 0.0001, 'conv1_kernel_size': 13, 'dense_layers': [128], 'dense_avf': 'relu', 'batch_size': 16, 'dropout': 0.0, 'batch_norm': False, 'n_inception': 2, 'monitor': 'val_loss', 'patience': 20, 'random_state': 32, 'verbose': 0, 'name': 'AggMap MultiClass Estimator', 'gpuid': '2'}

Restoring model weights from the end of the best epoch.

Epoch 00072: early stopping
{'epochs': 200, 'lr': 0.0001, 'conv1_kernel_size': 13, 'dense_layers': [128], 'dense_avf': 'relu', 'batch_size': 16, 'dropout': 0.0, 'batch_norm': False, 'n_inception': 2, 'monitor': 'val_loss', 'patience': 20, 'random_state': 32, 'verbose': 0, 'name': 'AggMap MultiClass Estimator', 'gpuid': '2'}

Restoring model weights from the end of the best epoch.

Epoch 00094: early stopping
{'epochs': 200, 'lr': 0.0001, 'conv1_kernel_size': 13, 'dense_layers': [128], 'dense_avf': 'relu', 'batch_size': 16, 'dro

In [13]:
dfres = pd.DataFrame(res, columns = ['score', 'fold', 'task', 'best_epoch',  'batch_size'])
dfres.to_csv('./5FCV_results_grade.csv')

In [12]:
dfres.groupby('task').mean().sort_values('score',ascending=False)

Unnamed: 0_level_0,score,fold,best_epoch,batch_size
task,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
../tasks_O_z-score/O_z-score_train_UCEC_grade.h5,0.902771,2.0,50.6,16.0
../tasks_O_z-score/O_z-score_train_LGG_grade.h5,0.773789,2.0,43.6,16.0
../tasks_O_z-score/O_z-score_test_HNSC_grade.h5,0.758215,2.0,49.2,16.0
../tasks_O_z-score/O_z-score_train_STAD_grade.h5,0.754402,2.0,48.2,16.0
../tasks_O_z-score/O_z-score_train_LIHC_grade.h5,0.68903,2.0,45.6,16.0
../tasks_O_z-score/O_z-score_train_CESC_grade.h5,0.668021,2.0,62.6,16.0
../tasks_O_z-score/O_z-score_train_KIRC_grade.h5,0.631526,2.0,37.0,16.0
../tasks_O_z-score/O_z-score_train_PAAD_grade.h5,0.631034,2.0,84.0,16.0


In [8]:
dfres.groupby('task').mean().mean()

score          0.726098
fold           2.000000
best_epoch    52.600000
batch_size    16.000000
dtype: float64

In [9]:
dfres.groupby('task').mean().std()

score          0.091064
fold           0.000000
best_epoch    14.610368
batch_size     0.000000
dtype: float64