In [1]:
import warnings, os
warnings.filterwarnings("ignore")

from copy import copy
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import confusion_matrix, precision_recall_curve
from sklearn.metrics import roc_auc_score, matthews_corrcoef, precision_score, recall_score, f1_score


import seaborn as sns
import matplotlib.pyplot as plt
from joblib import dump, load

from aggmap import AggMap, AggModel, loadmap
from aggmap.AggModel import load_model, save_model
from aggmap import show


np.random.seed(666) # seed NumPy's global RNG, just for repeatable results


def score(dfr):
    '''
    Compute and print the classification metrics for one set of predictions.

    The metrics are taken from the original paper:
    Meta-Signer: Metagenomic Signature Identifier based on Rank Aggregation of Features
    https://github.com/YDaiLab/Meta-Signer/blob/bd6a1cd98d1035f848ecb6e53d9ee67a85871db2/src/utils/metasigner_io.py#L34

    Parameters
    ----------
    dfr : object exposing the attributes ``y_true``, ``y_score`` and ``y_pred``
        In this notebook a pandas DataFrame with those three columns.
        ``y_score`` is passed to ``roc_auc_score``; ``y_pred`` is passed to
        the remaining metrics.

    Returns
    -------
    tuple
        ``(auc, mcc, pres, recall, f1)``. ROC-AUC, precision, recall and F1
        are requested with ``average='weighted'``.
    '''
    y_true = dfr.y_true
    y_score = dfr.y_score
    y_pred = dfr.y_pred

    auc = roc_auc_score(y_true, y_score, average='weighted')
    mcc = matthews_corrcoef(y_true, y_pred)
    pres = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')

    # One-line summary so each fold's metrics show up in the notebook output.
    print('roc-auc: %.3f, mcc: %.3f, pres: %.3f, recall: %.3f, f1: %.3f' % (auc, mcc, pres, recall, f1))

    return auc, mcc, pres, recall, f1

# read data

In [2]:
task = 'IBD'
data_path = '../01_data/species_level/%s/' % (task)
save_dir = '%s_results' % task
# exist_ok avoids the check-then-create race and keeps the cell safe to re-run.
os.makedirs(save_dir, exist_ok=True)

# abundance.tsv is read with its first column as the index and then transposed;
# labels.txt is read as a single unnamed column (column 0).
dfa = pd.read_csv(os.path.join(data_path, 'abundance.tsv'),sep='\t', header=None, index_col=0)
dfy = pd.read_csv(os.path.join(data_path, 'labels.txt'),sep='\t', header=None)
dfx = dfa.T

# Encode the labels as 1 ('ibd') / 0 ('n'). Any other label would map to NaN,
# produce an all-zero one-hot row and later be read as class 0 by idxmax,
# so fail loudly instead of silently mislabelling samples.
label_codes = dfy[0].map({'ibd':1, 'n':0})
unmapped = label_codes.isna()
if unmapped.any():
    raise ValueError('unexpected labels in labels.txt: %s' % dfy.loc[unmapped, 0].unique().tolist())

# One-hot encode: column 0 -> 'n', column 1 -> 'ibd'.
dfy = pd.get_dummies(label_codes)
Y = dfy.values

# generate Fmaps

In [3]:
#could be optimized
featHPs = {"best_fill":1e-2, "best_scale_method":'minmax', "best_channel_number":5}
dfx = np.log(dfx + featHPs.get('best_fill'))
mp = AggMap(dfx, metric = 'correlation')
mp = mp.fit(cluster_channels = featHPs.get('best_channel_number'), verbose = 0, var_thr = 0)
X = mp.batch_transform(dfx.values, scale = featHPs.get('best_scale_method')) # NaN values should be the lowest value
mp.plot_grid(save_dir)
mp.plot_scatter(save_dir)
mp.save(os.path.join(save_dir, 'agg.mp'))

2021-08-18 10:52:49,925 - [32mINFO[0m - [bidd-aggmap][0m - Calculating distance ...[0m
2021-08-18 10:52:49,937 - [32mINFO[0m - [bidd-aggmap][0m - the number of process is 16[0m


100%|##########| 97903/97903 [00:03<00:00, 27138.16it/s]
100%|##########| 97903/97903 [00:00<00:00, 1925594.46it/s]
100%|##########| 443/443 [00:00<00:00, 811.25it/s]


2021-08-18 10:52:54,364 - [32mINFO[0m - [bidd-aggmap][0m - applying hierarchical clustering to obtain group information ...[0m
2021-08-18 10:52:58,966 - [32mINFO[0m - [bidd-aggmap][0m - Applying grid feature map(assignment), this may take several minutes(1~30 min)[0m


  0%|          | 0/110 [00:00<?, ?it/s]

2021-08-18 10:52:59,236 - [32mINFO[0m - [bidd-aggmap][0m - Finished[0m


100%|##########| 110/110 [00:03<00:00, 32.48it/s]


2021-08-18 10:53:02,782 - [32mINFO[0m - [bidd-aggmap][0m - generate file: IBD_results/feature points_443_correlation_umap_mp[0m
2021-08-18 10:53:02,809 - [32mINFO[0m - [bidd-aggmap][0m - save html file to IBD_results/feature points_443_correlation_umap_mp[0m
2021-08-18 10:53:02,809 - [32mINFO[0m - [bidd-aggmap][0m - generate file: IBD_results/feature points_443_correlation_umap_scatter[0m
2021-08-18 10:53:02,830 - [32mINFO[0m - [bidd-aggmap][0m - save html file to IBD_results/feature points_443_correlation_umap_scatter[0m


['IBD_results/agg.mp']

# 10-fold cross-validation (10FCV), repeated with 10 random seeds

In [4]:
gpuid = 5  # GPU index handed to the estimator; adjust to the local machine

outer_fold = 10
repeat_seeds = [8, 16, 32, 64, 128, 256, 1024, 2048, 4096, 8192] # one random seed per CV repeat (10 repeats)

each_fold_results = []  # one dict of metrics per (repeat, fold)
run_all_res = []        # one DataFrame of held-out predictions per repeat

# NOTE(review): X was produced by a map fitted/scaled on all samples before
# splitting; confirm that this is acceptable for the reported estimates.
for i, repeat_seed in enumerate(repeat_seeds):
    outer = StratifiedKFold(n_splits = outer_fold, shuffle = True, random_state = repeat_seed)
    # Stratify on the class label recovered from the one-hot columns of dfy.
    outer_idx = outer.split(range(len(dfy)), dfy.idxmax(axis=1))
    run_one_res = []
    for j, idx in enumerate(outer_idx):
        fold_num = "fold_%s" % str(j).zfill(2)
        print('#'*50 + ' repeat_seed: %s; %s ' % (repeat_seed, fold_num) + '#'*50 )

        train_idx, test_idx = idx

        testY = Y[test_idx]
        testX = X[test_idx]

        trainX = X[train_idx]
        trainY = Y[train_idx]

        print("\n input train and test X shape is %s, %s " % (trainX.shape,  testX.shape))

        clf = AggModel.MultiClassEstimator(epochs = 50,  batch_size = 2, verbose = 0, gpuid=gpuid)
        clf.fit(trainX, trainY)

        ## save the models of the first repeat for explanation
        if i == 0:
            clf.save_model(os.path.join(save_dir, '%s.model' % fold_num))
            # The estimator is built with the same arguments in every fold, so
            # the hyper-parameter file only needs to be written once rather
            # than being rewritten on each of the folds.
            if j == 0:
                paras = clf.get_params()
                paras.update({'featHPs':featHPs})
                pd.Series(paras).to_json(os.path.join(save_dir, 'HPs.json'))

        pred_proba = clf.predict_proba(testX)
        y_true = testY[:,1]                      # one-hot column 1 is the positive class
        y_score = pred_proba[:,1]                # predicted probability of the positive class
        y_pred = np.argmax(pred_proba, axis=1)   # hard label = most probable class

        dfr = pd.DataFrame([y_true, y_score, y_pred]).T
        dfr.columns = ['y_true', 'y_score', 'y_pred']
        dfr.index = dfy.iloc[test_idx].index
        auc, mcc, pres, recall, f1  = score(dfr)
        run_one_res.append(dfr)

        # Round only the metrics. Keeping the repeat index and the seed out of
        # the float Series stops them being coerced to floats (0.0, 8.0, ...).
        ts = pd.Series([auc, mcc, pres, recall, f1],
                       index = ['auc', 'mcc', 'pres', 'recall', 'f1']).round(3)
        fold_res = ts.to_dict()
        fold_res.update({'i': i, 'repeat_seed': repeat_seed})
        print(fold_res)
        each_fold_results.append(fold_res)
    run_all_res.append(pd.concat(run_one_res))

################################################## repeat_seed: 8; fold_00 ##################################################

 input train and test X shape is (99, 22, 21, 5), (11, 22, 21, 5) 
saving model to IBD_results/fold_00.model
roc-auc: 0.944, mcc: 0.671, pres: 0.918, recall: 0.909, f1: 0.896
{'auc': 0.944, 'mcc': 0.671, 'pres': 0.918, 'recall': 0.909, 'f1': 0.896, 'i': 0.0, 'repeat_seed': 8.0}
################################################## repeat_seed: 8; fold_01 ##################################################

 input train and test X shape is (99, 22, 21, 5), (11, 22, 21, 5) 
saving model to IBD_results/fold_01.model
roc-auc: 1.000, mcc: 1.000, pres: 1.000, recall: 1.000, f1: 1.000
{'auc': 1.0, 'mcc': 1.0, 'pres': 1.0, 'recall': 1.0, 'f1': 1.0, 'i': 0.0, 'repeat_seed': 8.0}
################################################## repeat_seed: 8; fold_02 ##################################################

 input train and test X shape is (99, 22, 21, 5), (11, 22, 21, 5) 
savi

In [5]:
# Average each metric over the folds of a repeat, then over the repeats.
(
    pd.DataFrame(each_fold_results)
    .groupby('repeat_seed')
    .mean()
    .mean()
)

auc       0.92971
mcc       0.60023
pres      0.86486
recall    0.87351
f1        0.85548
i         4.50000
dtype: float64

In [6]:
# Standard deviation of each metric across the folds of a repeat, averaged over the repeats.
(
    pd.DataFrame(each_fold_results)
    .groupby('repeat_seed')
    .std()
    .mean()
)

auc       0.099104
mcc       0.316995
pres      0.124852
recall    0.093136
f1        0.110754
i         0.000000
dtype: float64

In [7]:
# Write the per-fold metrics of every repeat into the results directory.
performance_csv = os.path.join(save_dir, 'performance_results.csv')
performance_table = pd.DataFrame(each_fold_results)
performance_table.to_csv(performance_csv)

# feature importance

In [8]:
# Collect one importance column per saved fold model (fold_00 ... fold_09).
all_imps = []
for i in range(10):
    fold_tag = 'fold_%s' % str(i).zfill(2)
    # Reload the model saved for this fold and explain it on the data it stores.
    clf = load_model(os.path.join(save_dir, fold_tag + '.model'))
    dfe = clf.explain_model(mp, clf.X_, clf.y_, binary_task=True, apply_logrithm=False)
    all_imps.append(dfe.col_1_importance.to_frame(name = fold_tag + '_imp'))

  0%|          | 0/462 [00:00<?, ?it/s]

calculating feature importance for column 1 ...


100%|##########| 462/462 [00:11<00:00, 39.23it/s]
  0%|          | 0/462 [00:00<?, ?it/s]

calculating feature importance for column 1 ...


100%|##########| 462/462 [00:11<00:00, 41.43it/s]
  0%|          | 0/462 [00:00<?, ?it/s]

calculating feature importance for column 1 ...


100%|##########| 462/462 [00:11<00:00, 40.86it/s]
  0%|          | 0/462 [00:00<?, ?it/s]

calculating feature importance for column 1 ...


100%|##########| 462/462 [00:11<00:00, 41.61it/s]
  0%|          | 0/462 [00:00<?, ?it/s]

calculating feature importance for column 1 ...


100%|##########| 462/462 [00:12<00:00, 37.45it/s]
  0%|          | 0/462 [00:00<?, ?it/s]

calculating feature importance for column 1 ...


100%|##########| 462/462 [00:11<00:00, 41.47it/s]
  0%|          | 0/462 [00:00<?, ?it/s]

calculating feature importance for column 1 ...


100%|##########| 462/462 [00:11<00:00, 40.91it/s]
  0%|          | 0/462 [00:00<?, ?it/s]

calculating feature importance for column 1 ...


100%|##########| 462/462 [00:10<00:00, 44.32it/s]
  0%|          | 0/462 [00:00<?, ?it/s]

calculating feature importance for column 1 ...


100%|##########| 462/462 [00:07<00:00, 62.11it/s]
  0%|          | 0/462 [00:00<?, ?it/s]

calculating feature importance for column 1 ...


100%|##########| 462/462 [00:05<00:00, 85.58it/s]


In [9]:
# Start from every column of dfe except the last one. dfe is the table left
# over from the final iteration of the feature-importance loop; the dropped
# column is presumably its per-fold importance (verify against explain_model).
# .copy() yields an independent frame, so the column assignments below do not
# write into a slice of dfe (chained-assignment hazard).
dfi = dfe[dfe.columns[:-1]].copy()

# Attach the 2D embedding coordinates of each feature, looked up by name (v).
dfi['scatter_x'] = dfi.v.map(mp.df_embedding.x)
dfi['scatter_y'] = dfi.v.map(mp.df_embedding.y)

# Put the per-fold importances side by side and add their row-wise mean.
dfimp_all = pd.concat(all_imps, axis=1)
dfi = dfi.join(dfimp_all.mean(axis=1).to_frame(name = 'avg_imp'))
dfi = dfi.join(dfimp_all)
dfi.to_csv(os.path.join(save_dir, 'feature_imp_results.csv'))

In [10]:
# Display the combined feature-importance table built in the previous cell.
dfi

Unnamed: 0,x,y,v,Subtypes,colors,scatter_x,scatter_y,avg_imp,fold_00_imp,fold_01_imp,fold_02_imp,fold_03_imp,fold_04_imp,fold_05_imp,fold_06_imp,fold_07_imp,fold_08_imp,fold_09_imp
0,0,0,k__Bacteria|p__Actinobacteria|c__Actinobacteri...,cluster_04,#0010ff,1.784434,4.860157,-0.357609,-0.374296,-0.343841,-0.351617,-0.350883,-0.387416,-0.343952,-0.304403,-0.392817,-0.395181,-0.331685
1,1,0,k__Bacteria|p__Actinobacteria|c__Actinobacteri...,cluster_04,#0010ff,1.806937,4.877081,-0.357183,-0.373071,-0.344270,-0.349475,-0.350875,-0.389079,-0.344588,-0.300549,-0.392917,-0.396853,-0.330151
2,2,0,k__Eukaryota|p__Ascomycota|c__Saccharomycetes|...,cluster_04,#0010ff,1.986317,4.854336,-0.355977,-0.374523,-0.343834,-0.352622,-0.346933,-0.381744,-0.338795,-0.298943,-0.395344,-0.396957,-0.330078
3,3,0,k__Bacteria|p__Actinobacteria|c__Actinobacteri...,cluster_03,#00fff6,3.758513,3.404750,-0.142430,-0.048156,-0.117301,-0.159859,-0.118160,-0.123102,-0.155910,-0.212354,-0.094496,-0.175595,-0.219367
4,4,0,k__Bacteria|p__Actinobacteria|c__Actinobacteri...,cluster_03,#00fff6,3.803871,3.383943,-0.034745,0.052014,0.081924,-0.084332,0.024735,0.016212,0.045006,-0.319133,0.063063,-0.124851,-0.102087
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
457,16,21,0,,#000000,,,-0.375298,-0.397235,-0.359437,-0.375073,-0.350883,-0.412874,-0.360478,-0.319133,-0.427728,-0.414231,-0.335906
458,17,21,0,,#000000,,,-0.375298,-0.397236,-0.359437,-0.375073,-0.350883,-0.412874,-0.360478,-0.319133,-0.427728,-0.414231,-0.335906
459,18,21,0,,#000000,,,-0.375298,-0.397235,-0.359437,-0.375073,-0.350883,-0.412874,-0.360478,-0.319133,-0.427728,-0.414231,-0.335906
460,19,21,0,,#000000,,,-0.375298,-0.397235,-0.359437,-0.375073,-0.350883,-0.412874,-0.360478,-0.319133,-0.427728,-0.414231,-0.335906
