In [1]:
import warnings, os
warnings.filterwarnings("ignore")

from copy import copy
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import confusion_matrix, precision_recall_curve, accuracy_score
from sklearn.metrics import roc_auc_score, matthews_corrcoef, precision_score, recall_score, f1_score


import seaborn as sns
import matplotlib.pyplot as plt
from joblib import dump, load

from aggmap import AggMap, loadmap
from aggmap import AggMapNet as AggModel

from aggmap.AggMapNet import load_model, save_model
from aggmap import show


np.random.seed(666) # fix NumPy's global seed, just for repeatable results


def score(dfr):
    """Compute and print classification metrics for one set of predictions.

    The metric choices are taken from the original paper,
    "Meta-Signer: Metagenomic Signature Identifier based on Rank Aggregation
    of Features":
    https://github.com/YDaiLab/Meta-Signer/blob/bd6a1cd98d1035f848ecb6e53d9ee67a85871db2/src/utils/metasigner_io.py#L34

    Parameters
    ----------
    dfr : pandas.DataFrame
        Must expose three columns: ``y_true`` (ground-truth labels),
        ``y_score`` (the score handed to ``roc_auc_score``) and ``y_pred``
        (hard class predictions).

    Returns
    -------
    tuple
        ``(acc, auc, mcc, pres, recall, f1)``. Precision, recall, F1 and
        ROC-AUC are requested with ``average='weighted'``.

    Notes
    -----
    Accuracy is returned but is not part of the printed summary line.
    """
    y_true = dfr.y_true
    y_score = dfr.y_score
    y_pred = dfr.y_pred

    auc = roc_auc_score(y_true, y_score, average='weighted')
    mcc = matthews_corrcoef(y_true, y_pred)
    pres = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')
    acc = accuracy_score(y_true, y_pred)

    print('roc-auc: %.3f, mcc: %.3f, pres: %.3f, recall: %.3f, f1: %.3f' % (auc, mcc, pres, recall, f1))

    return acc, auc, mcc, pres, recall, f1

2022-10-03 16:18:24.568817: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


# read data

In [2]:
# Which cohort to analyse, where its inputs live, and where results are written.
task = 'T2D'
data_path = '../../../01_data/species_level/%s/' % (task)
save_dir = '%s_results' % task
# exist_ok=True replaces the check-then-create pattern, so the cell can be
# re-run safely and does not race with another process creating the folder.
os.makedirs(save_dir, exist_ok=True)

# abundance.tsv is read without a header row and with its first column as the
# index; the transpose below turns the file's columns into the rows of dfx.
dfa = pd.read_csv(os.path.join(data_path, 'abundance.tsv'),sep='\t', header=None, index_col=0)
dfy = pd.read_csv(os.path.join(data_path, 'labels.txt'),sep='\t', header=None)
dfx = dfa.T

# Encode the labels ('n' -> 0, 't2d' -> 1). Series.map turns every other value
# into NaN, and get_dummies would then silently emit an all-zero row for it,
# so unexpected labels are rejected here instead.
label_codes = dfy[0].map({'t2d':1, 'n':0})
if label_codes.isna().any():
    raise ValueError(
        'labels.txt contains values other than "t2d"/"n": %r'
        % sorted(set(dfy[0][label_codes.isna()].astype(str)))
    )
# The cross-validation cell indexes features and labels with the same
# positional indices, so both tables must have the same number of rows.
if len(dfx) != len(label_codes):
    raise ValueError(
        'abundance.tsv has %d samples but labels.txt has %d labels'
        % (len(dfx), len(label_codes))
    )

# One-hot label table (column 0 = 'n', column 1 = 't2d') and its array form.
dfy = pd.get_dummies(label_codes)
Y = dfy.values

# 10-fold cross-validation (10FCV)

In [3]:
# Repeated stratified 10-fold cross-validation.
#
# Hyper-parameters are tuned once, on the training split of the very first
# fold, and the AggMap feature map is fitted once; both are then reused for
# every fold of every repeat. Per-fold predictions and metrics are collected
# in `run_all_res` and `each_fold_results`.
gpuid = 1

outer_fold = 10
repeat_seeds = [8, 16, 32, 64, 128, 256, 1024, 2048, 4096, 8192] # one shuffle seed per repeat (10 repeats)

each_fold_results = []  # one metrics dict per (repeat, fold)
run_all_res = []        # one concatenated prediction frame per repeat

for i, repeat_seed in enumerate(repeat_seeds): 
    outer = StratifiedKFold(n_splits = outer_fold, shuffle = True, random_state = repeat_seed)
    # Stratify on the class id recovered from the one-hot label table.
    outer_idx = outer.split(range(len(dfy)), dfy.idxmax(axis=1))
    run_one_res = []
    for j, idx in enumerate(outer_idx):
        fold_num = "fold_%s" % str(j).zfill(2) 
        print('#'*50 + ' repeat_seed: %s; %s ' % (repeat_seed, fold_num) + '#'*50 )
        
        train_idx, test_idx = idx
        
        ## get best parameters (first fold of the first repeat only)
        if i == 0 and j == 0:
            from tune import finetune_HPs
            # The raw training frames are only needed by the tuner, so they are
            # sliced here rather than on every fold.
            dfx_train = dfx.iloc[train_idx]
            dfy_train = dfy.iloc[train_idx]
            best_fill, best_scale_method, best_channel_number, best_epochs, batch_size = finetune_HPs(dfx_train, dfy_train, gpuid=gpuid)
            featHPs = {"best_fill":best_fill, "best_scale_method":best_scale_method, "best_channel_number":best_channel_number}
            # Keep the raw table in `dfx` untouched and store the transform under
            # a new name: overwriting `dfx` made this cell non-idempotent (a
            # re-run would take the log of already log-transformed values).
            dfx_log = np.log(dfx + best_fill)
            # NOTE(review): the feature map is fitted and the data transformed
            # using ALL samples, including those later held out as test folds.
            # Left unchanged to keep results comparable; confirm this is intended.
            mp = AggMap(dfx_log, metric = 'correlation')
            mp = mp.fit(cluster_channels = best_channel_number, verbose = 0, var_thr = 0)
            mp.plot_grid(save_dir)
            mp.plot_scatter(save_dir)
            mp.save(os.path.join(save_dir, 'agg.mp'))
            X = mp.batch_transform(dfx_log.values, scale = best_scale_method) # NaN values should be the lowest value
            
        testY = Y[test_idx]
        testX = X[test_idx]
        
        trainX = X[train_idx]
        trainY = Y[train_idx]

        print("\n input train and test X shape is %s, %s " % (trainX.shape,  testX.shape))

        clf = AggModel.MultiClassEstimator(epochs = best_epochs,  batch_size = batch_size, verbose = 0, gpuid=gpuid)
        clf.fit(trainX, trainY)
        
        ## save the first repeat's fold models (and the HPs) for later explanation
        if i == 0:
            clf.save_model(os.path.join(save_dir, '%s.model' % fold_num))
            paras = clf.get_params()
            paras.update({'featHPs':featHPs})
            pd.Series(paras).to_json(os.path.join(save_dir, 'HPs.json'))

        pred_proba = clf.predict_proba(testX)
        # Column 1 of the one-hot labels / probabilities is the 't2d' class.
        y_true = testY[:,1] 
        y_score = pred_proba[:,1]
        y_pred = np.argmax(pred_proba, axis=1)
        
        dfr = pd.DataFrame([y_true, y_score, y_pred]).T
        dfr.columns = ['y_true', 'y_score', 'y_pred']
        dfr.index = dfy.iloc[test_idx].index

        acc, auc, mcc, pres, recall, f1  = score(dfr)
        run_one_res.append(dfr)
        ts = pd.Series([acc, auc, mcc, pres, recall, f1, i, repeat_seed]).round(3)
        ts.index = ['acc','auc', 'mcc', 'pres', 'recall', 'f1', 'i', 'repeat_seed']

        print(ts.to_dict())
        each_fold_results.append(ts.to_dict())
    run_all_res.append(pd.concat(run_one_res))

################################################## repeat_seed: 8; fold_00 ##################################################
2022-10-03 16:18:25,254 - [32mINFO[0m - [bidd-aggmap][0m - Calculating distance ...[0m
2022-10-03 16:18:25,264 - [32mINFO[0m - [bidd-aggmap][0m - the number of process is 16[0m


100%|#############################################################################################################################| 183315/183315 [00:09<00:00, 18841.24it/s]
100%|###########################################################################################################################| 183315/183315 [00:00<00:00, 5035554.64it/s]
100%|####################################################################################################################################| 606/606 [00:00<00:00, 5311.62it/s]


2022-10-03 16:18:35,308 - [32mINFO[0m - [bidd-aggmap][0m - applying hierarchical clustering to obtain group information ...[0m
2022-10-03 16:18:37,723 - [32mINFO[0m - [bidd-aggmap][0m - Applying grid assignment of feature points, this may take several minutes(1~30 min)[0m
2022-10-03 16:18:38,069 - [32mINFO[0m - [bidd-aggmap][0m - Finished[0m


100%|#####################################################################################################################################| 396/396 [00:01<00:00, 213.92it/s]
2022-10-03 16:18:40.063902: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-10-03 16:18:40.080191: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-10-03 16:18:40.080299: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-10-03 16:18:40.080913: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) 

MultiClassEstimator(batch_norm=False, batch_size=2, conv1_kernel_size=13,
                    dense_avf='relu', dense_layers=[128], dropout=0.0,
                    epochs=30, gpuid='1', last_avf='softmax',
                    loss='categorical_crossentropy', lr=0.0001, metric='ACC',
                    monitor='val_loss', n_inception=2,
                    name='AggMap MultiClass Estimator', patience=1000,
                    random_state=32, verbose=0)


2022-10-03 16:18:42.130773: I tensorflow/stream_executor/cuda/cuda_dnn.cc:384] Loaded cuDNN version 8302
2022-10-03 16:18:43.399840: I tensorflow/core/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2022-10-03 16:18:43.468494: I tensorflow/stream_executor/cuda/cuda_blas.cc:1786] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


{'best_loss': 0.626, 'best_epoch': 7, 'fill': 0.01, 'fold_num': 'fold_01'}
MultiClassEstimator(batch_norm=False, batch_size=2, conv1_kernel_size=13,
                    dense_avf='relu', dense_layers=[128], dropout=0.0,
                    epochs=30, gpuid='1', last_avf='softmax',
                    loss='categorical_crossentropy', lr=0.0001, metric='ACC',
                    monitor='val_loss', n_inception=2,
                    name='AggMap MultiClass Estimator', patience=1000,
                    random_state=32, verbose=0)
{'best_loss': 0.625, 'best_epoch': 7, 'fill': 0.01, 'fold_num': 'fold_03'}
MultiClassEstimator(batch_norm=False, batch_size=2, conv1_kernel_size=13,
                    dense_avf='relu', dense_layers=[128], dropout=0.0,
                    epochs=30, gpuid='1', last_avf='softmax',
                    loss='categorical_crossentropy', lr=0.0001, metric='ACC',
                    monitor='val_loss', n_inception=2,
                    name='AggMap MultiClass Estimat

100%|#############################################################################################################################| 183315/183315 [00:08<00:00, 21004.48it/s]
100%|###########################################################################################################################| 183315/183315 [00:00<00:00, 4648574.30it/s]
100%|####################################################################################################################################| 606/606 [00:00<00:00, 5279.14it/s]


2022-10-03 16:21:19,063 - [32mINFO[0m - [bidd-aggmap][0m - applying hierarchical clustering to obtain group information ...[0m
2022-10-03 16:21:19,569 - [32mINFO[0m - [bidd-aggmap][0m - Applying grid assignment of feature points, this may take several minutes(1~30 min)[0m
2022-10-03 16:21:19,939 - [32mINFO[0m - [bidd-aggmap][0m - Finished[0m


100%|####################################################################################################################################| 396/396 [00:00<00:00, 3054.07it/s]


MultiClassEstimator(batch_norm=False, batch_size=2, conv1_kernel_size=13,
                    dense_avf='relu', dense_layers=[128], dropout=0.0,
                    epochs=30, gpuid='1', last_avf='softmax',
                    loss='categorical_crossentropy', lr=0.0001, metric='ACC',
                    monitor='val_loss', n_inception=2,
                    name='AggMap MultiClass Estimator', patience=1000,
                    random_state=32, verbose=0)
{'best_loss': 0.551, 'best_epoch': 7, 'fill': 1e-05, 'fold_num': 'fold_01'}
MultiClassEstimator(batch_norm=False, batch_size=2, conv1_kernel_size=13,
                    dense_avf='relu', dense_layers=[128], dropout=0.0,
                    epochs=30, gpuid='1', last_avf='softmax',
                    loss='categorical_crossentropy', lr=0.0001, metric='ACC',
                    monitor='val_loss', n_inception=2,
                    name='AggMap MultiClass Estimator', patience=1000,
                    random_state=32, verbose=0)
{'best

100%|#############################################################################################################################| 183315/183315 [00:09<00:00, 20023.99it/s]
100%|###########################################################################################################################| 183315/183315 [00:00<00:00, 4640326.13it/s]
100%|####################################################################################################################################| 606/606 [00:00<00:00, 4882.13it/s]


2022-10-03 16:23:56,426 - [32mINFO[0m - [bidd-aggmap][0m - applying hierarchical clustering to obtain group information ...[0m
2022-10-03 16:23:56,941 - [32mINFO[0m - [bidd-aggmap][0m - Applying grid assignment of feature points, this may take several minutes(1~30 min)[0m
2022-10-03 16:23:57,453 - [32mINFO[0m - [bidd-aggmap][0m - Finished[0m


100%|####################################################################################################################################| 396/396 [00:00<00:00, 3961.86it/s]


MultiClassEstimator(batch_norm=False, batch_size=2, conv1_kernel_size=13,
                    dense_avf='relu', dense_layers=[128], dropout=0.0,
                    epochs=30, gpuid='1', last_avf='softmax',
                    loss='categorical_crossentropy', lr=0.0001, metric='ACC',
                    monitor='val_loss', n_inception=2,
                    name='AggMap MultiClass Estimator', patience=1000,
                    random_state=32, verbose=0)
{'best_loss': 0.546, 'best_epoch': 7, 'fill': 1e-08, 'fold_num': 'fold_01'}
MultiClassEstimator(batch_norm=False, batch_size=2, conv1_kernel_size=13,
                    dense_avf='relu', dense_layers=[128], dropout=0.0,
                    epochs=30, gpuid='1', last_avf='softmax',
                    loss='categorical_crossentropy', lr=0.0001, metric='ACC',
                    monitor='val_loss', n_inception=2,
                    name='AggMap MultiClass Estimator', patience=1000,
                    random_state=32, verbose=0)
{'best

100%|#############################################################################################################################| 183315/183315 [00:08<00:00, 22307.12it/s]
100%|###########################################################################################################################| 183315/183315 [00:00<00:00, 4851858.31it/s]
100%|####################################################################################################################################| 606/606 [00:00<00:00, 5208.22it/s]


2022-10-03 16:26:29,945 - [32mINFO[0m - [bidd-aggmap][0m - applying hierarchical clustering to obtain group information ...[0m
2022-10-03 16:26:30,460 - [32mINFO[0m - [bidd-aggmap][0m - Applying grid assignment of feature points, this may take several minutes(1~30 min)[0m
2022-10-03 16:26:30,839 - [32mINFO[0m - [bidd-aggmap][0m - Finished[0m


100%|####################################################################################################################################| 396/396 [00:00<00:00, 3817.29it/s]


MultiClassEstimator(batch_norm=False, batch_size=2, conv1_kernel_size=13,
                    dense_avf='relu', dense_layers=[128], dropout=0.0,
                    epochs=26, gpuid='1', last_avf='softmax',
                    loss='categorical_crossentropy', lr=0.0001, metric='ACC',
                    monitor='val_loss', n_inception=2,
                    name='AggMap MultiClass Estimator', patience=1000,
                    random_state=32, verbose=0)
{'best_loss': 0.545, 'best_epoch': 7, 'scale_method': 'minmax', 'fold_num': 'fold_01'}
MultiClassEstimator(batch_norm=False, batch_size=2, conv1_kernel_size=13,
                    dense_avf='relu', dense_layers=[128], dropout=0.0,
                    epochs=26, gpuid='1', last_avf='softmax',
                    loss='categorical_crossentropy', lr=0.0001, metric='ACC',
                    monitor='val_loss', n_inception=2,
                    name='AggMap MultiClass Estimator', patience=1000,
                    random_state=32, verbos

100%|####################################################################################################################################| 396/396 [00:00<00:00, 3862.91it/s]


MultiClassEstimator(batch_norm=False, batch_size=2, conv1_kernel_size=13,
                    dense_avf='relu', dense_layers=[128], dropout=0.0,
                    epochs=26, gpuid='1', last_avf='softmax',
                    loss='categorical_crossentropy', lr=0.0001, metric='ACC',
                    monitor='val_loss', n_inception=2,
                    name='AggMap MultiClass Estimator', patience=1000,
                    random_state=32, verbose=0)
{'best_loss': 0.591, 'best_epoch': 10, 'scale_method': 'standard', 'fold_num': 'fold_01'}
MultiClassEstimator(batch_norm=False, batch_size=2, conv1_kernel_size=13,
                    dense_avf='relu', dense_layers=[128], dropout=0.0,
                    epochs=26, gpuid='1', last_avf='softmax',
                    loss='categorical_crossentropy', lr=0.0001, metric='ACC',
                    monitor='val_loss', n_inception=2,
                    name='AggMap MultiClass Estimator', patience=1000,
                    random_state=32, ver

100%|#############################################################################################################################| 183315/183315 [00:08<00:00, 21306.74it/s]
100%|###########################################################################################################################| 183315/183315 [00:00<00:00, 5187031.31it/s]
100%|####################################################################################################################################| 606/606 [00:00<00:00, 5148.55it/s]


2022-10-03 16:30:35,252 - [32mINFO[0m - [bidd-aggmap][0m - applying hierarchical clustering to obtain group information ...[0m
2022-10-03 16:30:35,758 - [32mINFO[0m - [bidd-aggmap][0m - Applying grid assignment of feature points, this may take several minutes(1~30 min)[0m
2022-10-03 16:30:36,137 - [32mINFO[0m - [bidd-aggmap][0m - Finished[0m


100%|####################################################################################################################################| 396/396 [00:00<00:00, 4214.91it/s]


MultiClassEstimator(batch_norm=False, batch_size=2, conv1_kernel_size=13,
                    dense_avf='relu', dense_layers=[128], dropout=0.0, epochs=5,
                    gpuid='1', last_avf='softmax',
                    loss='categorical_crossentropy', lr=0.0001, metric='ACC',
                    monitor='val_loss', n_inception=2,
                    name='AggMap MultiClass Estimator', patience=1000,
                    random_state=32, verbose=0)
{'best_loss': 0.659, 'best_epoch': 5, 'cluster_channels': 1, 'fold_num': 'fold_01'}
MultiClassEstimator(batch_norm=False, batch_size=2, conv1_kernel_size=13,
                    dense_avf='relu', dense_layers=[128], dropout=0.0, epochs=5,
                    gpuid='1', last_avf='softmax',
                    loss='categorical_crossentropy', lr=0.0001, metric='ACC',
                    monitor='val_loss', n_inception=2,
                    name='AggMap MultiClass Estimator', patience=1000,
                    random_state=32, verbose=0)


100%|####################################################################################################################################| 396/396 [00:00<00:00, 3677.80it/s]


MultiClassEstimator(batch_norm=False, batch_size=2, conv1_kernel_size=13,
                    dense_avf='relu', dense_layers=[128], dropout=0.0, epochs=5,
                    gpuid='1', last_avf='softmax',
                    loss='categorical_crossentropy', lr=0.0001, metric='ACC',
                    monitor='val_loss', n_inception=2,
                    name='AggMap MultiClass Estimator', patience=1000,
                    random_state=32, verbose=0)
{'best_loss': 0.617, 'best_epoch': 5, 'cluster_channels': 5, 'fold_num': 'fold_01'}
MultiClassEstimator(batch_norm=False, batch_size=2, conv1_kernel_size=13,
                    dense_avf='relu', dense_layers=[128], dropout=0.0, epochs=5,
                    gpuid='1', last_avf='softmax',
                    loss='categorical_crossentropy', lr=0.0001, metric='ACC',
                    monitor='val_loss', n_inception=2,
                    name='AggMap MultiClass Estimator', patience=1000,
                    random_state=32, verbose=0)


100%|####################################################################################################################################| 396/396 [00:00<00:00, 3456.43it/s]


MultiClassEstimator(batch_norm=False, batch_size=2, conv1_kernel_size=13,
                    dense_avf='relu', dense_layers=[128], dropout=0.0, epochs=5,
                    gpuid='1', last_avf='softmax',
                    loss='categorical_crossentropy', lr=0.0001, metric='ACC',
                    monitor='val_loss', n_inception=2,
                    name='AggMap MultiClass Estimator', patience=1000,
                    random_state=32, verbose=0)
{'best_loss': 0.597, 'best_epoch': 5, 'cluster_channels': 9, 'fold_num': 'fold_01'}
MultiClassEstimator(batch_norm=False, batch_size=2, conv1_kernel_size=13,
                    dense_avf='relu', dense_layers=[128], dropout=0.0, epochs=5,
                    gpuid='1', last_avf='softmax',
                    loss='categorical_crossentropy', lr=0.0001, metric='ACC',
                    monitor='val_loss', n_inception=2,
                    name='AggMap MultiClass Estimator', patience=1000,
                    random_state=32, verbose=0)


100%|####################################################################################################################################| 396/396 [00:00<00:00, 3444.37it/s]


MultiClassEstimator(batch_norm=False, batch_size=2, conv1_kernel_size=13,
                    dense_avf='relu', dense_layers=[128], dropout=0.0, epochs=5,
                    gpuid='1', last_avf='softmax',
                    loss='categorical_crossentropy', lr=0.0001, metric='ACC',
                    monitor='val_loss', n_inception=2,
                    name='AggMap MultiClass Estimator', patience=1000,
                    random_state=32, verbose=0)
{'best_loss': 0.615, 'best_epoch': 5, 'cluster_channels': 13, 'fold_num': 'fold_01'}
MultiClassEstimator(batch_norm=False, batch_size=2, conv1_kernel_size=13,
                    dense_avf='relu', dense_layers=[128], dropout=0.0, epochs=5,
                    gpuid='1', last_avf='softmax',
                    loss='categorical_crossentropy', lr=0.0001, metric='ACC',
                    monitor='val_loss', n_inception=2,
                    name='AggMap MultiClass Estimator', patience=1000,
                    random_state=32, verbose=0)

100%|####################################################################################################################################| 396/396 [00:00<00:00, 3513.90it/s]


MultiClassEstimator(batch_norm=False, batch_size=2, conv1_kernel_size=13,
                    dense_avf='relu', dense_layers=[128], dropout=0.0, epochs=5,
                    gpuid='1', last_avf='softmax',
                    loss='categorical_crossentropy', lr=0.0001, metric='ACC',
                    monitor='val_loss', n_inception=2,
                    name='AggMap MultiClass Estimator', patience=1000,
                    random_state=32, verbose=0)
{'best_loss': 0.593, 'best_epoch': 5, 'cluster_channels': 17, 'fold_num': 'fold_01'}
MultiClassEstimator(batch_norm=False, batch_size=2, conv1_kernel_size=13,
                    dense_avf='relu', dense_layers=[128], dropout=0.0, epochs=5,
                    gpuid='1', last_avf='softmax',
                    loss='categorical_crossentropy', lr=0.0001, metric='ACC',
                    monitor='val_loss', n_inception=2,
                    name='AggMap MultiClass Estimator', patience=1000,
                    random_state=32, verbose=0)

100%|####################################################################################################################################| 396/396 [00:00<00:00, 3451.25it/s]


MultiClassEstimator(batch_norm=False, batch_size=2, conv1_kernel_size=13,
                    dense_avf='relu', dense_layers=[128], dropout=0.0, epochs=5,
                    gpuid='1', last_avf='softmax',
                    loss='categorical_crossentropy', lr=0.0001, metric='ACC',
                    monitor='val_loss', n_inception=2,
                    name='AggMap MultiClass Estimator', patience=1000,
                    random_state=32, verbose=0)
{'best_loss': 0.602, 'best_epoch': 5, 'cluster_channels': 21, 'fold_num': 'fold_01'}
MultiClassEstimator(batch_norm=False, batch_size=2, conv1_kernel_size=13,
                    dense_avf='relu', dense_layers=[128], dropout=0.0, epochs=5,
                    gpuid='1', last_avf='softmax',
                    loss='categorical_crossentropy', lr=0.0001, metric='ACC',
                    monitor='val_loss', n_inception=2,
                    name='AggMap MultiClass Estimator', patience=1000,
                    random_state=32, verbose=0)

100%|#############################################################################################################################| 183315/183315 [00:09<00:00, 18443.74it/s]
100%|###########################################################################################################################| 183315/183315 [00:00<00:00, 5297497.85it/s]
100%|####################################################################################################################################| 606/606 [00:00<00:00, 5265.62it/s]


2022-10-03 16:33:53,670 - [32mINFO[0m - [bidd-aggmap][0m - applying hierarchical clustering to obtain group information ...[0m
2022-10-03 16:33:54,145 - [32mINFO[0m - [bidd-aggmap][0m - Applying grid assignment of feature points, this may take several minutes(1~30 min)[0m
2022-10-03 16:33:54,387 - [32mINFO[0m - [bidd-aggmap][0m - Finished[0m
2022-10-03 16:33:54,393 - [32mINFO[0m - [bidd-aggmap][0m - generate file: T2D_results/feature points_606_correlation_umap_mp[0m
2022-10-03 16:33:54,404 - [32mINFO[0m - [bidd-aggmap][0m - save html file to T2D_results/feature points_606_correlation_umap_mp[0m
2022-10-03 16:33:54,405 - [32mINFO[0m - [bidd-aggmap][0m - generate file: T2D_results/feature points_606_correlation_umap_scatter[0m
2022-10-03 16:33:54,414 - [32mINFO[0m - [bidd-aggmap][0m - save html file to T2D_results/feature points_606_correlation_umap_scatter[0m


100%|####################################################################################################################################| 440/440 [00:00<00:00, 3595.59it/s]



 input train and test X shape is (396, 25, 25, 21), (44, 25, 25, 21) 
MultiClassEstimator(batch_norm=False, batch_size=2, conv1_kernel_size=13,
                    dense_avf='relu', dense_layers=[128], dropout=0.0,
                    epochs=14, gpuid='1', last_avf='softmax',
                    loss='categorical_crossentropy', lr=0.0001, metric='ACC',
                    monitor='val_loss', n_inception=2,
                    name='AggMap MultiClass Estimator', patience=10000,
                    random_state=32, verbose=0)
saving model to T2D_results/fold_00.model
roc-auc: 0.729, mcc: 0.306, pres: 0.672, recall: 0.636, f1: 0.617
{'acc': 0.636, 'auc': 0.729, 'mcc': 0.306, 'pres': 0.672, 'recall': 0.636, 'f1': 0.617, 'i': 0.0, 'repeat_seed': 8.0}
################################################## repeat_seed: 8; fold_01 ##################################################

 input train and test X shape is (396, 25, 25, 21), (44, 25, 25, 21) 
MultiClassEstimator(batch_norm=False, batch_si

In [4]:
# Average every metric over the folds of each repeat, then over the repeats.
(
    pd.DataFrame(each_fold_results)
    .groupby('repeat_seed')
    .mean()
    .mean()
)

acc       0.68413
auc       0.73851
mcc       0.37184
pres      0.68798
recall    0.68413
f1        0.68221
i         4.50000
dtype: float64

In [5]:
# Fold-to-fold standard deviation within each repeat, averaged over the repeats.
(
    pd.DataFrame(each_fold_results)
    .groupby('repeat_seed')
    .std()
    .mean()
)

acc       0.070120
auc       0.073962
mcc       0.140035
pres      0.069929
recall    0.070120
f1        0.071253
i         0.000000
dtype: float64

In [7]:
# Write the per-fold metrics of all repeats into the results folder.
pd.DataFrame(each_fold_results).to_csv(
    os.path.join(save_dir, 'performance_results.csv')
)

# feature importance

In [None]:
# Explain every fold model that was saved during the first CV repeat and keep
# the class-1 importance column of each as a one-column frame.
# The loop is driven by `outer_fold` (the number of folds that were saved)
# instead of a duplicated literal 10, so it stays in sync with the CV setup.
all_imps = []
for i in range(outer_fold):
    fold_name = 'fold_%s' % str(i).zfill(2)
    clf = load_model(os.path.join(save_dir, '%s.model' % fold_name))
    # The model is explained on the data stored on the estimator (clf.X_, clf.y_).
    dfe = clf.explain_model(mp, clf.X_, clf.y_, binary_task=True, apply_logrithm=False)
    df_imp = dfe.col_1_importance.to_frame(name = '%s_imp' % fold_name)
    all_imps.append(df_imp)

  1%|1         | 8/625 [00:00<00:08, 70.78it/s]

calculating feature importance for column 1 ...


100%|##########| 625/625 [00:59<00:00, 10.46it/s]
  1%|1         | 7/625 [00:00<00:09, 62.91it/s]

calculating feature importance for column 1 ...


100%|##########| 625/625 [00:56<00:00, 10.98it/s]
  1%|1         | 8/625 [00:00<00:07, 78.82it/s]

calculating feature importance for column 1 ...


100%|##########| 625/625 [00:57<00:00, 10.95it/s]
  1%|1         | 7/625 [00:00<00:10, 61.35it/s]

calculating feature importance for column 1 ...


 51%|#####     | 317/625 [00:27<00:23, 13.26it/s]

In [None]:
# Build the final feature-importance table.
# Start from the explanation frame left by the last loop iteration, without
# its final column (presumably that fold's own importance -- TODO confirm).
# .copy() makes dfi an independent frame, so the column assignments below do
# not write into a slice of dfe (the SettingWithCopy pattern).
dfi = dfe[dfe.columns[:-1]].copy()
# Look up each row's `v` value in the embedding table of the fitted map.
dfi['scatter_x'] = dfi.v.map(mp.df_embedding.x)
dfi['scatter_y'] = dfi.v.map(mp.df_embedding.y)

# Per-fold importances side by side, plus their row-wise mean as `avg_imp`.
dfimp_all = pd.concat(all_imps, axis=1)
dfi = dfi.join(dfimp_all.mean(axis=1).to_frame(name = 'avg_imp'))
dfi = dfi.join(dfimp_all)
dfi.to_csv(os.path.join(save_dir, 'feature_imp_results.csv'))

In [None]:
# Display the assembled feature-importance table as the cell output.
dfi