In [1]:
# -*- coding: utf-8 -*-
"""
Created on Sun Aug 16 17:10:53 2020

@author: wanxiang.shen@u.nus.edu
"""

import warnings, os
warnings.filterwarnings("ignore")


import pandas as pd
import numpy as np
from glob import glob

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import confusion_matrix, precision_recall_curve, roc_auc_score
from sklearn.metrics import auc as calculate_auc

import matplotlib.pyplot as plt
import seaborn as sns

from aggmap import AggMap, loadmap, show

from aggmap.AggModel import RegressionEstimator, MultiClassEstimator, MultiLabelEstimator


np.random.seed(666)  # fixed NumPy seed, just for repeatable results

In [2]:
# Value forwarded to the estimators through their `gpuid` argument (presumably a GPU device index).
# NOTE(review): a later cell reassigns `gpuid` to 5 before any estimator is built,
# so this value is never the one used by the training cells in this notebook.
gpuid = 6

In [3]:
def score(dfr):
    """Print and return binary classification metrics for one set of predictions.

    Parameters
    ----------
    dfr : pandas.DataFrame
        Needs a ``y_true`` column holding the 0/1 ground-truth labels and a
        ``y_score`` column holding the predicted score of the positive class.
        Scores are assumed to lie in [0, 1] so that rounding yields 0/1 labels.

    Returns
    -------
    tuple
        ``(acc, auc)``: accuracy of the rounded scores and ROC-AUC of the raw
        scores. The error rate labelled "BER" is printed but not returned.
    """
    y_true = dfr.y_true
    y_score = dfr.y_score
    # Hard predictions are the scores rounded to the nearest integer.
    y_pred = y_score.round()

    # Pin the label set: without it the matrix is only 2x2 when both classes
    # show up in y_true/y_pred, and the 4-way unpacking below fails otherwise.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

    acc = (tp + tn) / sum([tn, fp, fn, tp])
    auc = roc_auc_score(y_true, y_score)
    # Mean of the false-positive rate and the false-negative rate.
    ber = (fp / (fp + tn) + fn / (tp + fn)) * 0.5

    print('acc: %.3f, roc-auc: %.3f, BER: %.3f' % (acc,auc,ber))

    return acc, auc

def get_best_epochs(X, y, batch_size):
    """Average the best epoch found over an inner 5-fold stratified split.

    For every fold a fresh MultiClassEstimator (patience = 20, monitoring
    'val_loss') is fitted on the training part, with the held-out part passed
    as the third and fourth arguments of ``fit``. The ``best_epoch`` recorded
    by each fit is collected; their mean, truncated to an int, is printed and
    returned.

    X and y are indexed with the split indices, and the second column of y
    (``y[:,1]``) is the target used for stratification.
    """
    splitter = StratifiedKFold(n_splits = 5, shuffle = True, random_state=1)

    fold_best_epochs = []
    for fit_rows, held_out_rows in splitter.split(X, y[:,1]):

        # `gpuid` is read from the notebook's global namespace.
        estimator = MultiClassEstimator(gpuid = gpuid,
                                        batch_norm = True,
                                        conv1_kernel_size = 13,
                                        batch_size = batch_size,
                                        n_inception = 3,
                                        patience = 20,
                                        monitor = 'val_loss')

        estimator.fit(X[fit_rows], y[fit_rows],
                      X[held_out_rows], y[held_out_rows])

        fold_best_epochs.append(estimator._performance.best_epoch)

    best_epoch = int(np.mean(fold_best_epochs))
    print('Best Avg. Epochs: %s' % best_epoch)
    return best_epoch

In [4]:
# Overrides the earlier `gpuid = 6`; this is the value the estimators receive
# (the recorded training output reports 'gpuid': '5').
gpuid = 5

In [5]:
# Load the saved AggMap object that is later used to transform the expression
# matrices (`mp.batch_transform`). The location can be overridden with the
# AGGMAP_MP_PATH environment variable; the original absolute path stays the default.
mp = loadmap(os.environ.get('AGGMAP_MP_PATH', '/raid/shenwanxiang/agg_mp_object/RSPC-S.mp'))

## 5 fold CV

In [None]:
# Task files. Sorted because glob returns matches in arbitrary,
# filesystem-dependent order, which would change the row order of `res`
# from one machine to the next.
lst = sorted(glob('./tasks_O_z-score/*.h5'))
if not lst:
    # Fail here with a clear message rather than later on an empty result
    # table or an undefined `clf`.
    raise FileNotFoundError('No task files match ./tasks_O_z-score/*.h5')


n_fold = 5

# Training settings shared by every task and fold (loop-invariant, set once).
batch_size = 32
# Fixed epoch budget. Per-fold tuning is available through
# get_best_epochs(trainX, trainY, batch_size = batch_size) but is not used here.
best_epochs = 80


# One record per (task, fold): [auc, fold index, task path, epochs, batch size].
res = []
for p in lst:

    print('#'*50 + ' %s ' % p + '#'*50 )

    dfx = pd.read_hdf(p, key = 'expression')
    dfy = pd.read_hdf(p, key = 'labels')

    print(p, dfy.value_counts().to_dict())

    X = mp.batch_transform(dfx.values)
    # One-hot labels. Cast to uint8 explicitly: newer pandas returns bool
    # dummies, and bool labels mixed with float scores end up as object
    # columns, which breaks the rounding done in score().
    Y = pd.get_dummies(dfy).values.astype('uint8')

    # NOTE(review): KFold does not use the labels passed to split(), so these
    # outer folds are shuffled but not stratified (get_best_epochs uses
    # StratifiedKFold). Left unchanged to keep the reported splits; confirm intent.
    outer = KFold(n_splits = n_fold, shuffle = True, random_state = 42)
    outer_idx = outer.split(X, Y[:,1])

    fold_all = []
    for i, idx in enumerate(outer_idx):

        train_idx, test_idx = idx

        testY = Y[test_idx]
        testX = X[test_idx]

        trainX = X[train_idx]
        trainY = Y[train_idx]

        ## training
        print("\nInput train and test X shape is %s, %s  \n" % (trainX.shape,  testX.shape))
        # The epoch count is fixed, so say so instead of announcing a search.
        print("Using a fixed number of epochs: %s \n" % best_epochs)

        clf = MultiClassEstimator(epochs = best_epochs,
                                  gpuid = gpuid,
                                  conv1_kernel_size = 15,
                                  batch_norm = False,
                                  n_inception = 2,
                                  batch_size = batch_size,
                                  verbose=0)
        clf.fit(trainX, trainY)

        ## evaluate
        # Column 1 of the one-hot labels / predicted probabilities is the positive class.
        y_true = testY[:,1]
        y_score = clf.predict_proba(testX)[:,1]

        # Build the frame column-wise so each column keeps its own numeric
        # dtype; rows are labelled with the sample index of the test fold.
        dfr = pd.DataFrame({'y_true': y_true, 'y_score': y_score},
                           index = dfy.iloc[test_idx].index)
        acc, auc = score(dfr)

        fold_all.append(auc)

        res.append([auc, i, p, best_epochs,  batch_size])

    print('The auc score for %s is %s.' % (p, np.mean(fold_all)))

  0%|          | 0/554 [00:00<?, ?it/s]

################################################## ./tasks_O_z-score/O_z-score_train_UCEC_grade.h5 ##################################################
./tasks_O_z-score/O_z-score_train_UCEC_grade.h5 {1: 324, 0: 230}


100%|##########| 554/554 [02:07<00:00,  4.36it/s]



Input train and test X shape is (443, 135, 134, 5), (111, 135, 134, 5)  

Getting the best number of epochs 

{'epochs': 80, 'lr': 0.0001, 'conv1_kernel_size': 15, 'dense_layers': [128], 'dense_avf': 'relu', 'batch_size': 32, 'dropout': 0.0, 'batch_norm': False, 'n_inception': 2, 'monitor': 'val_loss', 'patience': 10000, 'random_state': 32, 'verbose': 0, 'name': 'AggMap MultiClass Estimator', 'gpuid': '5'}
acc: 0.820, roc-auc: 0.865, BER: 0.177

Input train and test X shape is (443, 135, 134, 5), (111, 135, 134, 5)  

Getting the best number of epochs 

{'epochs': 80, 'lr': 0.0001, 'conv1_kernel_size': 15, 'dense_layers': [128], 'dense_avf': 'relu', 'batch_size': 32, 'dropout': 0.0, 'batch_norm': False, 'n_inception': 2, 'monitor': 'val_loss', 'patience': 10000, 'random_state': 32, 'verbose': 0, 'name': 'AggMap MultiClass Estimator', 'gpuid': '5'}
acc: 0.793, roc-auc: 0.871, BER: 0.218

Input train and test X shape is (443, 135, 134, 5), (111, 135, 134, 5)  

Getting the best number o

  0%|          | 0/374 [00:00<?, ?it/s]

acc: 0.855, roc-auc: 0.902, BER: 0.167
The auc score for ./tasks_O_z-score/O_z-score_train_UCEC_grade.h5 is 0.8963286905453302.
################################################## ./tasks_O_z-score/O_z-score_train_LIHC_grade.h5 ##################################################
./tasks_O_z-score/O_z-score_train_LIHC_grade.h5 {0: 250, 1: 124}


100%|##########| 374/374 [01:29<00:00,  4.16it/s]



Input train and test X shape is (299, 135, 134, 5), (75, 135, 134, 5)  

Getting the best number of epochs 

{'epochs': 80, 'lr': 0.0001, 'conv1_kernel_size': 15, 'dense_layers': [128], 'dense_avf': 'relu', 'batch_size': 32, 'dropout': 0.0, 'batch_norm': False, 'n_inception': 2, 'monitor': 'val_loss', 'patience': 10000, 'random_state': 32, 'verbose': 0, 'name': 'AggMap MultiClass Estimator', 'gpuid': '5'}
acc: 0.667, roc-auc: 0.762, BER: 0.435

Input train and test X shape is (299, 135, 134, 5), (75, 135, 134, 5)  

Getting the best number of epochs 

{'epochs': 80, 'lr': 0.0001, 'conv1_kernel_size': 15, 'dense_layers': [128], 'dense_avf': 'relu', 'batch_size': 32, 'dropout': 0.0, 'batch_norm': False, 'n_inception': 2, 'monitor': 'val_loss', 'patience': 10000, 'random_state': 32, 'verbose': 0, 'name': 'AggMap MultiClass Estimator', 'gpuid': '5'}
acc: 0.613, roc-auc: 0.656, BER: 0.453

Input train and test X shape is (299, 135, 134, 5), (75, 135, 134, 5)  

Getting the best number of e

  0%|          | 0/416 [00:00<?, ?it/s]

acc: 0.676, roc-auc: 0.611, BER: 0.401
The auc score for ./tasks_O_z-score/O_z-score_train_LIHC_grade.h5 is 0.694137425708688.
################################################## ./tasks_O_z-score/O_z-score_train_STAD_grade.h5 ##################################################
./tasks_O_z-score/O_z-score_train_STAD_grade.h5 {1: 246, 0: 170}


100%|##########| 416/416 [01:38<00:00,  4.21it/s]



Input train and test X shape is (332, 135, 134, 5), (84, 135, 134, 5)  

Getting the best number of epochs 

{'epochs': 80, 'lr': 0.0001, 'conv1_kernel_size': 15, 'dense_layers': [128], 'dense_avf': 'relu', 'batch_size': 32, 'dropout': 0.0, 'batch_norm': False, 'n_inception': 2, 'monitor': 'val_loss', 'patience': 10000, 'random_state': 32, 'verbose': 0, 'name': 'AggMap MultiClass Estimator', 'gpuid': '5'}
acc: 0.726, roc-auc: 0.745, BER: 0.331

Input train and test X shape is (333, 135, 134, 5), (83, 135, 134, 5)  

Getting the best number of epochs 

{'epochs': 80, 'lr': 0.0001, 'conv1_kernel_size': 15, 'dense_layers': [128], 'dense_avf': 'relu', 'batch_size': 32, 'dropout': 0.0, 'batch_norm': False, 'n_inception': 2, 'monitor': 'val_loss', 'patience': 10000, 'random_state': 32, 'verbose': 0, 'name': 'AggMap MultiClass Estimator', 'gpuid': '5'}
acc: 0.747, roc-auc: 0.804, BER: 0.292

Input train and test X shape is (333, 135, 134, 5), (83, 135, 134, 5)  

Getting the best number of e

  0%|          | 0/544 [00:00<?, ?it/s]

acc: 0.687, roc-auc: 0.716, BER: 0.329
The auc score for ./tasks_O_z-score/O_z-score_train_STAD_grade.h5 is 0.742433809557923.
################################################## ./tasks_O_z-score/O_z-score_train_KIRC_grade.h5 ##################################################
./tasks_O_z-score/O_z-score_train_KIRC_grade.h5 {0: 337, 1: 207}


100%|##########| 544/544 [02:18<00:00,  3.93it/s]



Input train and test X shape is (435, 135, 134, 5), (109, 135, 134, 5)  

Getting the best number of epochs 

{'epochs': 80, 'lr': 0.0001, 'conv1_kernel_size': 15, 'dense_layers': [128], 'dense_avf': 'relu', 'batch_size': 32, 'dropout': 0.0, 'batch_norm': False, 'n_inception': 2, 'monitor': 'val_loss', 'patience': 10000, 'random_state': 32, 'verbose': 0, 'name': 'AggMap MultiClass Estimator', 'gpuid': '5'}
acc: 0.633, roc-auc: 0.607, BER: 0.459

Input train and test X shape is (435, 135, 134, 5), (109, 135, 134, 5)  

Getting the best number of epochs 

{'epochs': 80, 'lr': 0.0001, 'conv1_kernel_size': 15, 'dense_layers': [128], 'dense_avf': 'relu', 'batch_size': 32, 'dropout': 0.0, 'batch_norm': False, 'n_inception': 2, 'monitor': 'val_loss', 'patience': 10000, 'random_state': 32, 'verbose': 0, 'name': 'AggMap MultiClass Estimator', 'gpuid': '5'}
acc: 0.651, roc-auc: 0.533, BER: 0.450

Input train and test X shape is (435, 135, 134, 5), (109, 135, 134, 5)  

Getting the best number o

In [None]:
# `clf` is the estimator most recently assigned by the cross-validation loop,
# i.e. the model of the last fold that was trained, not a model refit on all data.
clf.plot_model()

In [None]:
# Turn the per-fold records collected in `res` into a table and save it as CSV.
dfres = pd.DataFrame(
    data = res,
    columns = ['score', 'fold', 'task', 'best_epoch',  'batch_size'],
)
dfres.to_csv(path_or_buf = './5FCV_results_opt.csv')

In [None]:
# Per-task column means, listed from the highest to the lowest mean score.
task_means = dfres.groupby('task').mean()
task_means.sort_values(by = 'score', ascending = False)

In [None]:
# Average of the per-task means, so every task counts equally.
per_task_mean = dfres.groupby('task').mean()
per_task_mean.mean()

In [None]:
# Standard deviation of the per-task means across tasks.
per_task_avg = dfres.groupby('task').mean()
per_task_avg.std()