In [10]:
import warnings, os
warnings.filterwarnings("ignore")

from copy import copy
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import confusion_matrix, precision_recall_curve
from sklearn.metrics import roc_auc_score, matthews_corrcoef, precision_score, recall_score, f1_score


import seaborn as sns
import matplotlib.pyplot as plt
from joblib import dump, load

from aggmap import AggMap, AggModel, loadmap
from aggmap.AggModel import load_model, save_model
from aggmap import show


np.random.seed(666) # seed NumPy's global RNG, just for repeatable results


def score(dfr):
    """Compute, print and return the evaluation metrics for one set of predictions.

    The metric definitions are taken from the original paper's code:
    https://github.com/YDaiLab/Meta-Signer/blob/bd6a1cd98d1035f848ecb6e53d9ee67a85871db2/src/utils/metasigner_io.py#L34

    Parameters
    ----------
    dfr : object with ``y_true``, ``y_score`` and ``y_pred`` attributes
        ``y_true`` and ``y_pred`` are passed to the label-based metrics,
        ``y_score`` is passed to ``roc_auc_score``.

    Returns
    -------
    tuple
        ``(auc, mcc, pres, recall, f1)``; every metric except MCC is computed
        with ``average='weighted'``.
    """
    truth, scores, preds = dfr.y_true, dfr.y_score, dfr.y_pred

    # Shared keyword for every metric that accepts an averaging mode.
    weighted = {'average': 'weighted'}

    auc = roc_auc_score(truth, scores, **weighted)
    mcc = matthews_corrcoef(truth, preds)
    pres = precision_score(truth, preds, **weighted)
    recall = recall_score(truth, preds, **weighted)
    f1 = f1_score(truth, preds, **weighted)

    metrics = (auc, mcc, pres, recall, f1)
    print('roc-auc: %.3f, mcc: %.3f, pres: %.3f, recall: %.3f, f1: %.3f' % metrics)

    return metrics

In [44]:
# --- Configuration -----------------------------------------------------------
data_path = '../IBD/../../benchmark_data/IBD/'
dataset = 'IBD'
save_dir = 'results'

# --- Load the abundance table and the class labels ---------------------------
# Both files are header-less and tab-separated; the first column becomes the index.
dfx = pd.read_csv(os.path.join(data_path, 'abundance.tsv'), sep='\t', header=None, index_col=0)
dfy = pd.read_csv(os.path.join(data_path, 'labels.txt'), sep='\t', header=None, index_col=0)

# Transpose so that each row of dfx lines up with one entry of dfy
# (later cells index the transformed features and the targets by the same row positions).
dfx = dfx.T
if len(dfx) != len(dfy):
    raise ValueError(
        "abundance.tsv provides %d rows after transposing but labels.txt has %d entries"
        % (len(dfx), len(dfy))
    )

# Log-transform; the 1e-2 offset keeps zero abundances finite.
dfx = np.log(dfx + 1e-2)

# The labels live in dfy's index. Index.map returns NaN for any label missing
# from the mapping, and get_dummies would then silently emit an all-zero target
# row, so fail loudly instead.
label_codes = dfy.index.map({'ibd':1, 'n':0})
unmapped = label_codes.isna()
if unmapped.any():
    raise ValueError(
        "labels.txt contains labels other than 'ibd'/'n': %s"
        % dfy.index[unmapped].unique().tolist()
    )

# One-hot targets: column 0 -> 'n', column 1 -> 'ibd'.
y = pd.get_dummies(label_codes)

In [45]:
# Build the AggMap object on the log-transformed abundance table with
# correlation as the feature distance, then fit it in the same expression.
mp = AggMap(dfx, metric='correlation').fit(
    cluster_channels=3,
    verbose=0,
    var_thr=0,
)

2021-08-11 11:39:48,013 - [32mINFO[0m - [bidd-aggmap][0m - Calculating distance ...[0m
2021-08-11 11:39:48,025 - [32mINFO[0m - [bidd-aggmap][0m - the number of process is 16[0m


100%|##########| 97903/97903 [00:03<00:00, 30331.37it/s]
100%|##########| 97903/97903 [00:00<00:00, 1540531.92it/s]
100%|##########| 443/443 [00:00<00:00, 810.88it/s]


2021-08-11 11:39:53,469 - [32mINFO[0m - [bidd-aggmap][0m - applying hierarchical clustering to obtain group information ...[0m
2021-08-11 11:39:54,411 - [32mINFO[0m - [bidd-aggmap][0m - Applying grid feature map(assignment), this may take several minutes(1~30 min)[0m
2021-08-11 11:39:54,622 - [32mINFO[0m - [bidd-aggmap][0m - Finished[0m


In [46]:
# Targets: the one-hot label frame as a float array.
Y = y.values.astype(float)

# Features: every row of dfx mapped through the fitted AggMap with min-max scaling.
# NaN values should be the lowest value
X = mp.batch_transform(dfx.values, scale='minmax')

100%|##########| 110/110 [00:02<00:00, 41.47it/s]


In [47]:
import tensorflow as tf

In [None]:
# Repeated stratified k-fold cross-validation of the AggMap classifier.
# For every (seed, fold) pair a fresh model is trained on the training split and
# scored on the held-out split.
outer_fold = 10
repeat_seeds = [8]#5 repeats random seeds 8, 16, 32, 64, 128

each_fold_results = []  # one metrics dict per (seed, fold)
run_all_res = []        # one DataFrame of held-out predictions per seed

for repeat_seed in repeat_seeds:

    # dfy.index holds the class labels, so it doubles as the stratification target.
    outer = StratifiedKFold(n_splits = outer_fold, shuffle = True, random_state = repeat_seed)
    outer_idx = outer.split(range(len(dfy)), dfy.index)

    print('#'*50 + ' %s ' % repeat_seed + '#'*50 )
    run_one_res = []
    for i, idx in enumerate(outer_idx):

        # Zero-padded fold tag; not consumed by the cells shown here, kept for later use.
        fold_num = "fold_%s" % str(i).zfill(2)

        train_idx, test_idx = idx

        testY = Y[test_idx]
        testX = X[test_idx]

        trainX = X[train_idx]
        trainY = Y[train_idx]

        print("\n input train and test X shape is %s, %s " % (trainX.shape,  testX.shape))

        # Disabled alternative: class-weighted loss.
        #pos_weight = trainY[:,0].sum() / trainY[:,1].sum()
        #loss = lambda labels, logits: tf.nn.weighted_cross_entropy_with_logits(labels=labels, logits=logits, pos_weight = 10)

        # NOTE(review): gpuid is hard-coded to device 6; adjust for the host machine.
        clf = AggModel.MultiClassEstimator(epochs = 50, conv1_kernel_size = 13, #loss = loss,
                                           batch_size = 2, verbose = 0, gpuid=6,) #

        clf.fit(trainX, trainY)  #,  class_weight={0:1, 1:10}
        pred_proba = clf.predict_proba(testX)

        # Column 1 of the one-hot targets / predicted probabilities is the 'ibd' class.
        y_true = testY[:,1]
        y_score = pred_proba[:,1]
        y_pred = np.argmax(pred_proba, axis=1)

        dfr = pd.DataFrame([y_true, y_score, y_pred]).T
        dfr.columns = ['y_true', 'y_score', 'y_pred']
        dfr.index = dfy.iloc[test_idx].index
        auc, mcc, pres, recall, f1  = score(dfr)
        run_one_res.append(dfr)

        # Round only the metrics. Putting the fold index and seed into the same
        # Series would coerce them to floats (0.0, 8.0), so they are added
        # afterwards as plain ints.
        ts = pd.Series([auc, mcc, pres, recall, f1]).round(3)
        ts.index = ['auc', 'mcc', 'pres', 'recall', 'f1']
        fold_record = ts.to_dict()
        fold_record['i'] = i
        fold_record['repeat_seed'] = repeat_seed
        print(fold_record)

        each_fold_results.append(fold_record)

    # All held-out predictions of this repeat, stacked in fold order.
    run_all_res.append(pd.concat(run_one_res))

################################################## 8 ##################################################

 input train and test X shape is (99, 22, 21, 3), (11, 22, 21, 3) 
{'epochs': 50, 'lr': 0.0001, 'loss': 'categorical_crossentropy', 'conv1_kernel_size': 13, 'dense_layers': [128], 'dense_avf': 'relu', 'last_avf': 'softmax', 'batch_size': 2, 'dropout': 0.0, 'batch_norm': False, 'n_inception': 2, 'monitor': 'val_loss', 'patience': 10000, 'random_state': 32, 'verbose': 0, 'name': 'AggMap MultiClass Estimator', 'gpuid': '6'}
roc-auc: 0.889, mcc: 0.671, pres: 0.918, recall: 0.909, f1: 0.896
{'auc': 0.889, 'mcc': 0.671, 'pres': 0.918, 'recall': 0.909, 'f1': 0.896, 'i': 0.0, 'repeat_seed': 8.0}

 input train and test X shape is (99, 22, 21, 3), (11, 22, 21, 3) 
{'epochs': 50, 'lr': 0.0001, 'loss': 'categorical_crossentropy', 'conv1_kernel_size': 13, 'dense_layers': [128], 'dense_avf': 'relu', 'last_avf': 'softmax', 'batch_size': 2, 'dropout': 0.0, 'batch_norm': False, 'n_inception': 2, 'mo

In [52]:
# Mean of every column across the recorded folds. `i` and `repeat_seed` are the
# fold index and seed stored with each record, so their rows are not metrics.
pd.DataFrame(each_fold_results).mean()

auc            0.9236
mcc            0.6640
pres           0.8922
recall         0.8817
f1             0.8751
i              4.5000
repeat_seed    8.0000
dtype: float64

In [53]:
# Standard deviation of every column across the recorded folds. `i` and
# `repeat_seed` are the fold index and seed, so their rows are not metrics.
pd.DataFrame(each_fold_results).std()

auc            0.112564
mcc            0.277826
pres           0.105792
recall         0.105515
f1             0.107608
i              3.027650
repeat_seed    0.000000
dtype: float64

In [54]:
# Per-fold table: one row per (repeat_seed, fold i) with the rounded metrics.
pd.DataFrame(each_fold_results)

Unnamed: 0,auc,mcc,pres,recall,f1,i,repeat_seed
0,0.889,0.671,0.918,0.909,0.896,0.0,8.0
1,1.0,1.0,1.0,1.0,1.0,1.0,8.0
2,0.889,0.389,0.818,0.818,0.818,2.0,8.0
3,1.0,0.77,0.939,0.909,0.916,3.0,8.0
4,1.0,0.671,0.918,0.909,0.896,4.0,8.0
5,1.0,0.77,0.919,0.909,0.903,5.0,8.0
6,0.708,0.083,0.636,0.636,0.636,6.0,8.0
7,1.0,0.77,0.919,0.909,0.903,7.0,8.0
8,1.0,1.0,1.0,1.0,1.0,8.0,8.0
9,0.75,0.516,0.855,0.818,0.783,9.0,8.0
