In [1]:
import warnings, os
warnings.filterwarnings("ignore")

from copy import copy
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import confusion_matrix, precision_recall_curve
from sklearn.metrics import roc_auc_score, matthews_corrcoef, precision_score, recall_score, f1_score


import seaborn as sns
import matplotlib.pyplot as plt
from joblib import dump, load

from aggmap import AggMap, AggModel, loadmap
from aggmap.AggModel import load_model, save_model
from aggmap import show


np.random.seed(666)  # fixed NumPy seed, just for repeatable results


def score(dfr):
    """Compute, print and return the classification metrics for one set of predictions.

    The metrics are taken from the original paper:
    Meta-Signer: Metagenomic Signature Identifier based on Rank Aggregation of Features
    https://github.com/YDaiLab/Meta-Signer/blob/bd6a1cd98d1035f848ecb6e53d9ee67a85871db2/src/utils/metasigner_io.py#L34

    Parameters
    ----------
    dfr : object exposing ``y_true``, ``y_score`` and ``y_pred`` attributes
        The notebook passes a pandas DataFrame with those three columns.
        ``y_score`` is handed to ``roc_auc_score``; ``y_pred`` is handed to
        the label-based metrics.

    Returns
    -------
    tuple
        ``(auc, mcc, pres, recall, f1)`` in that order. The same five values
        are also printed on one line as a side effect.
    """
    y_true = dfr.y_true
    y_score = dfr.y_score
    y_pred = dfr.y_pred

    # Score-based metric uses y_score; the remaining metrics use hard labels.
    auc = roc_auc_score(y_true, y_score, average='weighted')
    mcc = matthews_corrcoef(y_true, y_pred)
    pres = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')

    print('roc-auc: %.3f, mcc: %.3f, pres: %.3f, recall: %.3f, f1: %.3f' % (auc, mcc, pres, recall, f1))

    return auc, mcc, pres, recall, f1

# read data

In [2]:
# Dataset selection and the directory every later cell writes its results into.
task = 'Cirrhosis'
data_path = '../01_data/species_level/%s/' % (task)
save_dir = '%s_results' % task
# exist_ok=True replaces the exists()-then-makedirs() pair and its check/create race.
os.makedirs(save_dir, exist_ok=True)

# Both files are read without a header row; the abundance table uses its first
# column as the index and is transposed to obtain dfx.
dfa = pd.read_csv(os.path.join(data_path, 'abundance.tsv'),sep='\t', header=None, index_col=0)
dfy = pd.read_csv(os.path.join(data_path, 'labels.txt'),sep='\t', header=None)
dfx = dfa.T

# Encode the label strings as 0/1. A label missing from the mapping becomes NaN,
# which get_dummies would silently turn into an all-zero row, so fail loudly instead.
label_codes = dfy[0].map({'cirrhosis':1, 'n':0})
if label_codes.isna().any():
    unknown_labels = sorted(set(dfy[0][label_codes.isna()].astype(str)))
    raise ValueError('unexpected label(s) in labels.txt: %s' % unknown_labels)

# Pin an integer dtype: pandas >= 2.0 defaults get_dummies to bool, whereas the
# downstream code treats Y as a numeric one-hot matrix.
dfy = pd.get_dummies(label_codes, dtype=np.uint8)
Y = dfy.values

# 10-fold cross-validation (10FCV), repeated with 10 random seeds

In [3]:
# Repeated stratified k-fold evaluation: for every seed in `repeat_seeds` the
# samples are split into `outer_fold` folds, a fresh estimator is trained on
# each training split and scored on the held-out split.
# GPU index forwarded to the tuning helper and to every estimator below.
gpuid = 7

outer_fold = 10
repeat_seeds = [8, 16, 32, 64, 128, 256, 1024, 2048, 4096, 8192] # one StratifiedKFold shuffle seed per repeat (10 repeats)

# each_fold_results: one metrics dict per (repeat, fold).
# run_all_res: one concatenated prediction frame per repeat.
each_fold_results = []
run_all_res = []

for i, repeat_seed in enumerate(repeat_seeds): 
    outer = StratifiedKFold(n_splits = outer_fold, shuffle = True, random_state = repeat_seed)
    # Stratify on the column label holding the row-wise maximum of the one-hot dfy.
    outer_idx = outer.split(range(len(dfy)), dfy.idxmax(axis=1))
    run_one_res = []
    for j, idx in enumerate(outer_idx):
        fold_num = "fold_%s" % str(j).zfill(2) 
        print('#'*50 + ' repeat_seed: %s; %s ' % (repeat_seed, fold_num) + '#'*50 )
        
        train_idx, test_idx = idx
        # These two frames are only consumed by the tuning call below.
        dfx_train = dfx.iloc[train_idx]
        dfy_train = dfy.iloc[train_idx]
        
        ## get best parameters
        # Runs exactly once, on the first fold of the first repeat; the values it
        # sets (best_epochs, batch_size, featHPs, mp, X) are reused by every later fold.
        if (i == 0) & (j == 0):
            # finetune_HPs comes from the project-local `tune` module; its search
            # procedure is not visible here.
            from tune import finetune_HPs
            best_fill, best_scale_method, best_channel_number, best_epochs, batch_size = finetune_HPs(dfx_train, dfy_train, gpuid=gpuid)
            featHPs = {"best_fill":best_fill, "best_scale_method":best_scale_method, "best_channel_number":best_channel_number}
            # NOTE(review): this rebinds `dfx` to its log-transformed version, so
            # re-running this cell without re-running the data-loading cell would
            # apply the log a second time.
            dfx = np.log(dfx + best_fill)
            # NOTE(review): the map is fitted on, and X is transformed from, all
            # rows of dfx (train and test indices alike) - confirm this is intended.
            mp = AggMap(dfx, metric = 'correlation')
            mp = mp.fit(cluster_channels = best_channel_number, verbose = 0, var_thr = 0)
            X = mp.batch_transform(dfx.values, scale = best_scale_method) # NaN values should be the lowest value
            mp.save(os.path.join(save_dir, 'agg.mp'))
            
        testY = Y[test_idx]
        testX = X[test_idx]
        
        trainX = X[train_idx]
        trainY = Y[train_idx]

        print("\n input train and test X shape is %s, %s " % (trainX.shape,  testX.shape))

        # A new estimator per fold, using the epochs / batch size found by the tuning step.
        clf = AggModel.MultiClassEstimator(epochs = best_epochs,  batch_size = batch_size, verbose = 0, gpuid=gpuid)
        clf.fit(trainX, trainY)
        
        ## save model for explanation
        # Only the models of the first repeat are stored (one file per fold).
        # NOTE(review): HPs.json is rewritten on every fold of that repeat.
        if i == 0:
            clf.save_model(os.path.join(save_dir, '%s.model' % fold_num))
            paras = clf.get_params()
            paras.update({'featHPs':featHPs})
            pd.Series(paras).to_json(os.path.join(save_dir, 'HPs.json'))

        pred_proba = clf.predict_proba(testX)
        # Column 1 of the one-hot labels / predicted probabilities is used as the scored class.
        y_true = testY[:,1] 
        y_score = pred_proba[:,1]
        y_pred = np.argmax(pred_proba, axis=1)
        
        # Three row vectors transposed into columns, indexed like the held-out rows of dfy.
        dfr = pd.DataFrame([y_true, y_score, y_pred]).T
        dfr.columns = ['y_true', 'y_score', 'y_pred']
        dfr.index = dfy.iloc[test_idx].index
        auc, mcc, pres, recall, f1  = score(dfr)
        run_one_res.append(dfr)
        # i and repeat_seed share a Series with the float metrics, so they are
        # stored as floats too (e.g. 'i': 0.0 in the printed dict).
        ts = pd.Series([auc, mcc, pres, recall, f1, i, repeat_seed]).round(3)
        ts.index = ['auc', 'mcc', 'pres', 'recall', 'f1', 'i', 'repeat_seed']
        print(ts.to_dict())
        each_fold_results.append(ts.to_dict())
    run_all_res.append(pd.concat(run_one_res))

################################################## repeat_seed: 8; fold_00 ##################################################
2021-08-17 15:41:07,255 - [32mINFO[0m - [bidd-aggmap][0m - Calculating distance ...[0m
2021-08-17 15:41:07,273 - [32mINFO[0m - [bidd-aggmap][0m - the number of process is 16[0m


100%|##########| 146611/146611 [00:04<00:00, 33225.20it/s]
100%|##########| 146611/146611 [00:00<00:00, 1936235.93it/s]
100%|##########| 542/542 [00:00<00:00, 827.20it/s]


2021-08-17 15:41:12,641 - [32mINFO[0m - [bidd-aggmap][0m - applying hierarchical clustering to obtain group information ...[0m
2021-08-17 15:41:16,863 - [32mINFO[0m - [bidd-aggmap][0m - Applying grid feature map(assignment), this may take several minutes(1~30 min)[0m


  0%|          | 0/208 [00:00<?, ?it/s]

2021-08-17 15:41:17,372 - [32mINFO[0m - [bidd-aggmap][0m - Finished[0m


100%|##########| 208/208 [00:02<00:00, 78.19it/s]


{'best_loss': 0.54, 'best_epoch': 2, 'fill': 0.01, 'fold_num': 'fold_01'}
{'best_loss': 0.224, 'best_epoch': 16, 'fill': 0.01, 'fold_num': 'fold_03'}
{'best_loss': 0.373, 'best_epoch': 11, 'fill': 0.01, 'fold_num': 'fold_05'}
{'best_loss': 0.354, 'best_epoch': 15, 'fill': 0.01, 'fold_num': 'fold_07'}
{'best_loss': 0.102, 'best_epoch': 11, 'fill': 0.01, 'fold_num': 'fold_09'}
2021-08-17 15:42:48,357 - [32mINFO[0m - [bidd-aggmap][0m - Calculating distance ...[0m
2021-08-17 15:42:48,374 - [32mINFO[0m - [bidd-aggmap][0m - the number of process is 16[0m


100%|##########| 146611/146611 [00:04<00:00, 29734.56it/s]
100%|##########| 146611/146611 [00:00<00:00, 1753509.13it/s]
100%|##########| 542/542 [00:00<00:00, 786.81it/s]


2021-08-17 15:42:55,659 - [32mINFO[0m - [bidd-aggmap][0m - applying hierarchical clustering to obtain group information ...[0m
2021-08-17 15:42:56,832 - [32mINFO[0m - [bidd-aggmap][0m - Applying grid feature map(assignment), this may take several minutes(1~30 min)[0m


 23%|##3       | 48/208 [00:00<00:00, 453.73it/s]

2021-08-17 15:42:57,181 - [32mINFO[0m - [bidd-aggmap][0m - Finished[0m


100%|##########| 208/208 [00:00<00:00, 846.29it/s]


{'best_loss': 0.547, 'best_epoch': 2, 'fill': 1e-05, 'fold_num': 'fold_01'}
{'best_loss': 0.209, 'best_epoch': 15, 'fill': 1e-05, 'fold_num': 'fold_03'}
{'best_loss': 0.27, 'best_epoch': 11, 'fill': 1e-05, 'fold_num': 'fold_05'}
{'best_loss': 0.422, 'best_epoch': 2, 'fill': 1e-05, 'fold_num': 'fold_07'}
{'best_loss': 0.029, 'best_epoch': 30, 'fill': 1e-05, 'fold_num': 'fold_09'}
2021-08-17 15:44:23,844 - [32mINFO[0m - [bidd-aggmap][0m - Calculating distance ...[0m
2021-08-17 15:44:23,861 - [32mINFO[0m - [bidd-aggmap][0m - the number of process is 16[0m


100%|##########| 146611/146611 [00:05<00:00, 28682.20it/s]
100%|##########| 146611/146611 [00:00<00:00, 1732029.53it/s]
100%|##########| 542/542 [00:00<00:00, 833.83it/s]


2021-08-17 15:44:31,305 - [32mINFO[0m - [bidd-aggmap][0m - applying hierarchical clustering to obtain group information ...[0m
2021-08-17 15:44:32,426 - [32mINFO[0m - [bidd-aggmap][0m - Applying grid feature map(assignment), this may take several minutes(1~30 min)[0m


 27%|##6       | 56/208 [00:00<00:00, 496.60it/s]

2021-08-17 15:44:32,837 - [32mINFO[0m - [bidd-aggmap][0m - Finished[0m


100%|##########| 208/208 [00:00<00:00, 858.81it/s]


{'best_loss': 0.556, 'best_epoch': 2, 'fill': 1e-08, 'fold_num': 'fold_01'}
{'best_loss': 0.248, 'best_epoch': 15, 'fill': 1e-08, 'fold_num': 'fold_03'}
{'best_loss': 0.227, 'best_epoch': 11, 'fill': 1e-08, 'fold_num': 'fold_05'}
{'best_loss': 0.442, 'best_epoch': 2, 'fill': 1e-08, 'fold_num': 'fold_07'}
{'best_loss': 0.067, 'best_epoch': 21, 'fill': 1e-08, 'fold_num': 'fold_09'}
2021-08-17 15:45:59,231 - [32mINFO[0m - [bidd-aggmap][0m - Calculating distance ...[0m
2021-08-17 15:45:59,248 - [32mINFO[0m - [bidd-aggmap][0m - the number of process is 16[0m


100%|##########| 146611/146611 [00:04<00:00, 31555.89it/s]
100%|##########| 146611/146611 [00:00<00:00, 1729968.39it/s]
100%|##########| 542/542 [00:00<00:00, 804.08it/s]


2021-08-17 15:46:06,269 - [32mINFO[0m - [bidd-aggmap][0m - applying hierarchical clustering to obtain group information ...[0m
2021-08-17 15:46:07,382 - [32mINFO[0m - [bidd-aggmap][0m - Applying grid feature map(assignment), this may take several minutes(1~30 min)[0m


 23%|##3       | 48/208 [00:00<00:00, 452.47it/s]

2021-08-17 15:46:07,726 - [32mINFO[0m - [bidd-aggmap][0m - Finished[0m


100%|##########| 208/208 [00:00<00:00, 853.19it/s]


{'best_loss': 0.546, 'best_epoch': 2, 'scale_method': 'minmax', 'fold_num': 'fold_01'}
{'best_loss': 0.234, 'best_epoch': 15, 'scale_method': 'minmax', 'fold_num': 'fold_03'}
{'best_loss': 0.259, 'best_epoch': 11, 'scale_method': 'minmax', 'fold_num': 'fold_05'}
{'best_loss': 0.423, 'best_epoch': 2, 'scale_method': 'minmax', 'fold_num': 'fold_07'}


 27%|##6       | 56/208 [00:00<00:00, 488.92it/s]

{'best_loss': 0.04, 'best_epoch': 30, 'scale_method': 'minmax', 'fold_num': 'fold_09'}


100%|##########| 208/208 [00:00<00:00, 796.83it/s]


{'best_loss': 0.603, 'best_epoch': 1, 'scale_method': 'standard', 'fold_num': 'fold_01'}
{'best_loss': 0.379, 'best_epoch': 5, 'scale_method': 'standard', 'fold_num': 'fold_03'}
{'best_loss': 0.395, 'best_epoch': 8, 'scale_method': 'standard', 'fold_num': 'fold_05'}
{'best_loss': 0.46, 'best_epoch': 3, 'scale_method': 'standard', 'fold_num': 'fold_07'}
{'best_loss': 0.225, 'best_epoch': 10, 'scale_method': 'standard', 'fold_num': 'fold_09'}


best fill value: 1e-05, best_scale_method:minmax, best avg. best_epochs: 11
2021-08-17 15:49:04,847 - [32mINFO[0m - [bidd-aggmap][0m - Calculating distance ...[0m
2021-08-17 15:49:04,865 - [32mINFO[0m - [bidd-aggmap][0m - the number of process is 16[0m


100%|##########| 146611/146611 [00:04<00:00, 29811.05it/s]
100%|##########| 146611/146611 [00:00<00:00, 1829395.29it/s]
100%|##########| 542/542 [00:00<00:00, 826.34it/s]


2021-08-17 15:49:12,235 - [32mINFO[0m - [bidd-aggmap][0m - applying hierarchical clustering to obtain group information ...[0m
2021-08-17 15:49:13,394 - [32mINFO[0m - [bidd-aggmap][0m - Applying grid feature map(assignment), this may take several minutes(1~30 min)[0m


 27%|##6       | 56/208 [00:00<00:00, 496.91it/s]

2021-08-17 15:49:13,720 - [32mINFO[0m - [bidd-aggmap][0m - Finished[0m


100%|##########| 208/208 [00:00<00:00, 784.24it/s]


{'best_loss': 0.561, 'best_epoch': 3, 'cluster_channels': 1, 'fold_num': 'fold_01'}
{'best_loss': 0.315, 'best_epoch': 11, 'cluster_channels': 1, 'fold_num': 'fold_03'}
{'best_loss': 0.309, 'best_epoch': 11, 'cluster_channels': 1, 'fold_num': 'fold_05'}
{'best_loss': 0.423, 'best_epoch': 6, 'cluster_channels': 1, 'fold_num': 'fold_07'}
{'best_loss': 0.091, 'best_epoch': 11, 'cluster_channels': 1, 'fold_num': 'fold_09'}
2021-08-17 15:49:48,520 - [32mINFO[0m - [bidd-aggmap][0m - applying hierarchical clustering to obtain group information ...[0m
2021-08-17 15:49:49,628 - [32mINFO[0m - [bidd-aggmap][0m - Applying grid feature map(assignment), this may take several minutes(1~30 min)[0m


 27%|##6       | 56/208 [00:00<00:00, 459.02it/s]

2021-08-17 15:49:49,969 - [32mINFO[0m - [bidd-aggmap][0m - Finished[0m


100%|##########| 208/208 [00:00<00:00, 774.50it/s]


{'best_loss': 0.544, 'best_epoch': 2, 'cluster_channels': 5, 'fold_num': 'fold_01'}
{'best_loss': 0.257, 'best_epoch': 8, 'cluster_channels': 5, 'fold_num': 'fold_03'}
{'best_loss': 0.269, 'best_epoch': 11, 'cluster_channels': 5, 'fold_num': 'fold_05'}
{'best_loss': 0.424, 'best_epoch': 2, 'cluster_channels': 5, 'fold_num': 'fold_07'}
{'best_loss': 0.075, 'best_epoch': 10, 'cluster_channels': 5, 'fold_num': 'fold_09'}
2021-08-17 15:50:25,140 - [32mINFO[0m - [bidd-aggmap][0m - applying hierarchical clustering to obtain group information ...[0m
2021-08-17 15:50:26,250 - [32mINFO[0m - [bidd-aggmap][0m - Applying grid feature map(assignment), this may take several minutes(1~30 min)[0m


 31%|###       | 64/208 [00:00<00:00, 495.55it/s]

2021-08-17 15:50:26,579 - [32mINFO[0m - [bidd-aggmap][0m - Finished[0m


100%|##########| 208/208 [00:00<00:00, 786.26it/s]


{'best_loss': 0.545, 'best_epoch': 2, 'cluster_channels': 9, 'fold_num': 'fold_01'}
{'best_loss': 0.231, 'best_epoch': 8, 'cluster_channels': 9, 'fold_num': 'fold_03'}
{'best_loss': 0.291, 'best_epoch': 8, 'cluster_channels': 9, 'fold_num': 'fold_05'}
{'best_loss': 0.42, 'best_epoch': 3, 'cluster_channels': 9, 'fold_num': 'fold_07'}
{'best_loss': 0.075, 'best_epoch': 11, 'cluster_channels': 9, 'fold_num': 'fold_09'}
2021-08-17 15:51:02,106 - [32mINFO[0m - [bidd-aggmap][0m - applying hierarchical clustering to obtain group information ...[0m
2021-08-17 15:51:03,226 - [32mINFO[0m - [bidd-aggmap][0m - Applying grid feature map(assignment), this may take several minutes(1~30 min)[0m


 27%|##6       | 56/208 [00:00<00:00, 456.91it/s]

2021-08-17 15:51:03,563 - [32mINFO[0m - [bidd-aggmap][0m - Finished[0m


100%|##########| 208/208 [00:00<00:00, 807.87it/s]


{'best_loss': 0.522, 'best_epoch': 2, 'cluster_channels': 13, 'fold_num': 'fold_01'}
{'best_loss': 0.253, 'best_epoch': 8, 'cluster_channels': 13, 'fold_num': 'fold_03'}
{'best_loss': 0.262, 'best_epoch': 8, 'cluster_channels': 13, 'fold_num': 'fold_05'}
{'best_loss': 0.415, 'best_epoch': 2, 'cluster_channels': 13, 'fold_num': 'fold_07'}
{'best_loss': 0.083, 'best_epoch': 11, 'cluster_channels': 13, 'fold_num': 'fold_09'}
2021-08-17 15:51:40,023 - [32mINFO[0m - [bidd-aggmap][0m - applying hierarchical clustering to obtain group information ...[0m
2021-08-17 15:51:41,151 - [32mINFO[0m - [bidd-aggmap][0m - Applying grid feature map(assignment), this may take several minutes(1~30 min)[0m


 27%|##6       | 56/208 [00:00<00:00, 532.31it/s]

2021-08-17 15:51:41,500 - [32mINFO[0m - [bidd-aggmap][0m - Finished[0m


100%|##########| 208/208 [00:00<00:00, 856.80it/s]


{'best_loss': 0.526, 'best_epoch': 2, 'cluster_channels': 17, 'fold_num': 'fold_01'}
{'best_loss': 0.261, 'best_epoch': 9, 'cluster_channels': 17, 'fold_num': 'fold_03'}
{'best_loss': 0.258, 'best_epoch': 8, 'cluster_channels': 17, 'fold_num': 'fold_05'}
{'best_loss': 0.415, 'best_epoch': 2, 'cluster_channels': 17, 'fold_num': 'fold_07'}
{'best_loss': 0.078, 'best_epoch': 11, 'cluster_channels': 17, 'fold_num': 'fold_09'}
2021-08-17 15:52:17,969 - [32mINFO[0m - [bidd-aggmap][0m - applying hierarchical clustering to obtain group information ...[0m
2021-08-17 15:52:19,086 - [32mINFO[0m - [bidd-aggmap][0m - Applying grid feature map(assignment), this may take several minutes(1~30 min)[0m


 27%|##6       | 56/208 [00:00<00:00, 419.53it/s]

2021-08-17 15:52:19,440 - [32mINFO[0m - [bidd-aggmap][0m - Finished[0m


100%|##########| 208/208 [00:00<00:00, 716.54it/s]


{'best_loss': 0.533, 'best_epoch': 2, 'cluster_channels': 21, 'fold_num': 'fold_01'}
{'best_loss': 0.27, 'best_epoch': 5, 'cluster_channels': 21, 'fold_num': 'fold_03'}
{'best_loss': 0.254, 'best_epoch': 8, 'cluster_channels': 21, 'fold_num': 'fold_05'}
{'best_loss': 0.42, 'best_epoch': 2, 'cluster_channels': 21, 'fold_num': 'fold_07'}
{'best_loss': 0.07, 'best_epoch': 11, 'cluster_channels': 21, 'fold_num': 'fold_09'}


best channel number: 13
2021-08-17 15:52:56,999 - [32mINFO[0m - [bidd-aggmap][0m - Calculating distance ...[0m
2021-08-17 15:52:57,015 - [32mINFO[0m - [bidd-aggmap][0m - the number of process is 16[0m


100%|##########| 146611/146611 [00:04<00:00, 30478.06it/s]
100%|##########| 146611/146611 [00:00<00:00, 1884927.18it/s]
100%|##########| 542/542 [00:00<00:00, 825.91it/s]


2021-08-17 15:53:04,592 - [32mINFO[0m - [bidd-aggmap][0m - applying hierarchical clustering to obtain group information ...[0m
2021-08-17 15:53:05,689 - [32mINFO[0m - [bidd-aggmap][0m - Applying grid feature map(assignment), this may take several minutes(1~30 min)[0m


 21%|##        | 48/232 [00:00<00:00, 464.40it/s]

2021-08-17 15:53:05,994 - [32mINFO[0m - [bidd-aggmap][0m - Finished[0m


100%|##########| 232/232 [00:00<00:00, 822.33it/s]



 input train and test X shape is (208, 24, 23, 13), (24, 24, 23, 13) 
saving model to Cirrhosis_results/fold_00.model
roc-auc: 0.993, mcc: 0.833, pres: 0.917, recall: 0.917, f1: 0.917
{'auc': 0.993, 'mcc': 0.833, 'pres': 0.917, 'recall': 0.917, 'f1': 0.917, 'i': 0.0, 'repeat_seed': 8.0}
################################################## repeat_seed: 8; fold_01 ##################################################

 input train and test X shape is (208, 24, 23, 13), (24, 24, 23, 13) 
saving model to Cirrhosis_results/fold_01.model
roc-auc: 0.812, mcc: 0.676, pres: 0.843, recall: 0.833, f1: 0.832
{'auc': 0.812, 'mcc': 0.676, 'pres': 0.843, 'recall': 0.833, 'f1': 0.832, 'i': 0.0, 'repeat_seed': 8.0}
################################################## repeat_seed: 8; fold_02 ##################################################

 input train and test X shape is (209, 24, 23, 13), (23, 24, 23, 13) 
saving model to Cirrhosis_results/fold_02.model
roc-auc: 0.970, mcc: 0.740, pres: 0.872, recall: 0.

In [9]:
# Average every metric over the folds of each repeat, then over the repeats.
per_repeat_mean = pd.DataFrame(each_fold_results).groupby('repeat_seed').mean()
per_repeat_mean.mean()

auc       0.94881
mcc       0.78696
pres      0.89737
recall    0.88997
f1        0.88928
i         4.50000
dtype: float64

In [10]:
# Standard deviation of every metric across the folds of each repeat, averaged over the repeats.
per_repeat_std = pd.DataFrame(each_fold_results).groupby('repeat_seed').std()
per_repeat_std.mean()

auc       0.051292
mcc       0.126348
pres      0.061892
recall    0.064679
f1        0.065222
i         0.000000
dtype: float64

In [12]:
# Persist the raw per-fold metrics (one row per repeat/fold combination).
performance_path = os.path.join(save_dir, 'performance_results.csv')
pd.DataFrame(each_fold_results).to_csv(performance_path)

# feature importance

In [29]:
# Reload the ten per-fold models saved by the training loop and collect the
# `col_1_importance` column of each model's explanation frame.
all_imps = []
for i in range(10):
    fold_id = str(i).zfill(2)
    model_path = os.path.join(save_dir, 'fold_%s.model' % fold_id)
    clf = load_model(model_path)
    # `dfe` stays bound after the loop; the next cell reads the last fold's frame.
    dfe = clf.explain_model(mp, clf.X_, clf.y_, binary_task=True, apply_logrithm=False)
    df_imp = dfe.col_1_importance.to_frame(name = 'fold_%s_imp' % fold_id)
    all_imps.append(df_imp)

  0%|          | 0/552 [00:00<?, ?it/s]

calculating feature importance for column 1 ...


100%|##########| 552/552 [00:18<00:00, 30.46it/s]
  0%|          | 0/552 [00:00<?, ?it/s]

calculating feature importance for column 1 ...


100%|##########| 552/552 [00:17<00:00, 31.72it/s]
  0%|          | 0/552 [00:00<?, ?it/s]

calculating feature importance for column 1 ...


100%|##########| 552/552 [00:17<00:00, 31.24it/s]
  0%|          | 0/552 [00:00<?, ?it/s]

calculating feature importance for column 1 ...


100%|##########| 552/552 [00:18<00:00, 30.57it/s]
  0%|          | 0/552 [00:00<?, ?it/s]

calculating feature importance for column 1 ...


100%|##########| 552/552 [00:17<00:00, 31.10it/s]
  0%|          | 0/552 [00:00<?, ?it/s]

calculating feature importance for column 1 ...


100%|##########| 552/552 [00:18<00:00, 30.57it/s]
  0%|          | 0/552 [00:00<?, ?it/s]

calculating feature importance for column 1 ...


100%|##########| 552/552 [00:17<00:00, 30.90it/s]
  0%|          | 0/552 [00:00<?, ?it/s]

calculating feature importance for column 1 ...


100%|##########| 552/552 [00:17<00:00, 30.71it/s]
  0%|          | 0/552 [00:00<?, ?it/s]

calculating feature importance for column 1 ...


100%|##########| 552/552 [00:17<00:00, 31.34it/s]
  0%|          | 0/552 [00:00<?, ?it/s]

calculating feature importance for column 1 ...


100%|##########| 552/552 [00:17<00:00, 31.71it/s]


In [49]:
# Start from every column of the last explanation frame except its final one.
# .copy() makes dfi an independent frame, so the column assignments below do
# not write into a slice of `dfe` (a SettingWithCopy hazard that the notebook's
# global warnings filter would otherwise hide).
dfi = dfe[dfe.columns[:-1]].copy()
# Look up each feature name in the AggMap embedding to get its 2-D scatter coordinates.
dfi['scatter_x'] = dfi.v.map(mp.df_embedding.x)
dfi['scatter_y'] = dfi.v.map(mp.df_embedding.y)

# One importance column per fold, plus their row-wise mean as `avg_imp`.
dfimp_all = pd.concat(all_imps, axis=1)
dfi = dfi.join(dfimp_all.mean(axis=1).to_frame(name = 'avg_imp'))
dfi = dfi.join(dfimp_all)
dfi.to_csv(os.path.join(save_dir, 'feature_imp_results.csv'))

In [50]:
# Display the combined per-feature importance table (the frame written to feature_imp_results.csv).
dfi

Unnamed: 0,x,y,v,Subtypes,colors,scatter_x,scatter_y,avg_imp,fold_00_imp,fold_01_imp,fold_02_imp,fold_03_imp,fold_04_imp,fold_05_imp,fold_06_imp,fold_07_imp,fold_08_imp,fold_09_imp
0,0,0,k__Bacteria|p__Firmicutes|c__Negativicutes|o__...,cluster_07,#00fff6,-2.435016,-7.996033,2.020425,2.636166,2.422558,1.480931,2.304214,1.639054,2.490149,2.467577,1.172250,2.032199,1.559150
1,1,0,k__Bacteria|p__Firmicutes|c__Negativicutes|o__...,cluster_07,#00fff6,-2.415546,-8.048627,3.423417,4.224251,4.586573,2.836215,3.500902,2.881480,4.132021,3.808488,2.303154,3.140248,2.820839
2,2,0,k__Bacteria|p__Firmicutes|c__Negativicutes|o__...,cluster_07,#00fff6,-2.361040,-8.083551,4.228039,5.347902,5.490348,2.512368,4.742337,3.766255,5.282240,4.638997,2.324160,4.553426,3.622361
3,3,0,k__Bacteria|p__Proteobacteria|c__Gammaproteoba...,cluster_07,#00fff6,-2.322250,-8.112451,7.086851,8.235827,8.364681,5.941798,7.976813,6.055808,7.650206,7.116522,4.983071,7.361613,7.182173
4,4,0,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,cluster_07,#00fff6,-2.155409,-7.984416,2.906158,2.939558,3.276735,2.188092,2.944056,2.273452,3.370101,3.779549,2.006394,3.652944,2.630703
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
547,18,23,0,,#000000,,,-0.552890,-0.536272,-0.492827,-0.612344,-0.472785,-0.608114,-0.506613,-0.558567,-0.578415,-0.585439,-0.577526
548,19,23,0,,#000000,,,-0.552892,-0.536272,-0.492827,-0.612345,-0.472785,-0.608117,-0.506613,-0.558569,-0.578415,-0.585442,-0.577529
549,20,23,0,,#000000,,,-0.552890,-0.536272,-0.492827,-0.612344,-0.472785,-0.608114,-0.506613,-0.558567,-0.578415,-0.585439,-0.577526
550,21,23,0,,#000000,,,-0.552892,-0.536272,-0.492827,-0.612345,-0.472785,-0.608117,-0.506613,-0.558569,-0.578415,-0.585442,-0.577529
