In [1]:
# -*- coding: utf-8 -*-
"""
Created on Sun Aug 16 17:10:53 2020

@author: wanxiang.shen@u.nus.edu
"""

import warnings, os
warnings.filterwarnings("ignore")


import pandas as pd
import numpy as np

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import confusion_matrix, precision_recall_curve, roc_auc_score
from sklearn.metrics import auc as calculate_auc

import matplotlib.pyplot as plt
import seaborn as sns

from aggmap import AggMap, AggModel, loadmap
from aggmap import show
# Seeds NumPy's global random generator only.
np.random.seed(666) # fixed seed, just for repeatable results

def prc_auc_score(y_true, y_score):
    """Area under the precision-recall curve (PRC-AUC).

    Parameters
    ----------
    y_true : array-like
        Ground-truth binary labels, handed unchanged to
        ``precision_recall_curve``.
    y_score : array-like
        Scores for the positive class, handed unchanged to
        ``precision_recall_curve``.

    Returns
    -------
    float
        Area under the (recall, precision) curve as integrated by
        ``sklearn.metrics.auc`` (imported in this file as ``calculate_auc``).
    """
    # The thresholds (third item) are not needed to integrate the curve.
    precisions, recalls, _ = precision_recall_curve(y_true, y_score)
    return calculate_auc(recalls, precisions)


def score(dfr):
    """Print and return accuracy and ROC-AUC for one table of predictions.

    Parameters
    ----------
    dfr : pandas.DataFrame
        Must expose a ``y_true`` column (binary labels) and a ``y_score``
        column; the scores are rounded to obtain the hard prediction.

    Returns
    -------
    tuple
        ``(acc, auc)``: accuracy of the rounded scores and ROC-AUC of the
        raw scores.
    """
    labels = dfr.y_true
    scores = dfr.y_score
    hard_calls = dfr.y_score.round()

    # Unpacking into four names requires a 2x2 matrix, i.e. both classes
    # must be present across labels and predictions.
    tn, fp, fn, tp = confusion_matrix(labels, hard_calls).ravel()

    n_total = sum([tn, fp, fn, tp])
    acc = (tp + tn) / n_total
    auc = roc_auc_score(labels, scores)

    print('acc: %.3f, roc-auc: %.3f' % (acc, auc))
    return acc, auc

In [2]:
# Read the two Excel workbooks used by the rest of the notebook.
# `label` supplies the Group and Subject_ID columns used in the cells below.
label = pd.read_excel('./subject_info.xlsx')
# The first column of the sheet becomes the row index; rows are later looked up
# with the Subject_ID index of the label table (data.loc[...]).
data = pd.read_excel('./data_species.xlsx', index_col = 0)

In [3]:
# Number of rows in the label table for each Group value.
label.groupby('Group').size()

Group
HS               30
Healthy         127
MP               40
Stage_0          27
Stage_III_IV     54
Stage_I_II       69
dtype: int64

## Model-1: Healthy vs. Stage_0:

In [18]:
# Disease stage compared against 'Healthy' in this section.
stage = 'Stage_0'

# Keep only the Healthy subjects and the subjects of the chosen stage,
# indexed by Subject_ID, with the Group column as the label.
dfy = label[label.Group.isin(['Healthy', stage])].set_index('Subject_ID')[['Group']]

# Subjects belonging to the chosen stage. Use `stage` rather than repeating
# the literal so this selection cannot drift from the filter above.
idx = dfy[dfy['Group'] == stage].index

# Per-feature mean over the stage subjects; keep features whose mean is above 1e-7.
# NOTE(review): the mean is taken over ALL stage subjects, including the ones
# that end up in held-out folds during cross-validation below — confirm this
# selection-before-splitting is acceptable for the reported metrics.
s = data.loc[idx].mean()
flist = s[s > 1e-7].index # the same as the paper used
data1 = data[flist]
data1.shape

(347, 4173)

In [19]:
# One-hot label matrix: one column per Group value present in dfy.
Y = pd.get_dummies(dfy['Group']).values
# Align the feature rows with the label rows, then log-transform;
# the 1e-8 offset keeps zero entries finite under the log.
dfx = np.log(data1.loc[dfy.index] + 1e-8)

In [20]:
# Sanity check: the label table and the feature table should have the same number of rows.
dfy.shape, dfx.shape

((154, 1), (154, 4173))

In [21]:
# Build the AggMap object from the log-transformed feature table with the
# 'correlation' metric, write it to ./agg.mp, and read it straight back.
# NOTE(review): this save/load round-trip happens before mp.fit() is called in
# the following cell — confirm that persisting the unfitted object is intended.
mp = AggMap(dfx, metric = 'correlation')
mp.save('./agg.mp')
mp = loadmap('./agg.mp')

2020-12-13 11:55:30,588 - [32mINFO[0m - [bidd-aggmap][0m - Calculating distance ...[0m
2020-12-13 11:55:31,735 - [32mINFO[0m - [bidd-aggmap][0m - the number of process is 16[0m


100%|##########| 8704878/8704878 [04:29<00:00, 32338.14it/s]
100%|##########| 8704878/8704878 [00:04<00:00, 1891630.18it/s]
100%|##########| 4173/4173 [00:05<00:00, 808.74it/s]


In [22]:
# Fit the map with 5 cluster channels; the transformed arrays printed later
# have a trailing dimension of 5.
# NOTE(review): var_thr = 0 presumably disables variance-based feature filtering — confirm against the aggmap docs.
mp.fit(cluster_channels = 5, verbose = 0, var_thr = 0)

2020-12-13 12:00:15,029 - [32mINFO[0m - [bidd-aggmap][0m - applying hierarchical clustering to obtain group information ...[0m
2020-12-13 12:00:25,234 - [32mINFO[0m - [bidd-aggmap][0m - Applying grid feature map(assignment), this may take several minutes(1~30 min)[0m
2020-12-13 12:01:11,325 - [32mINFO[0m - [bidd-aggmap][0m - Finished[0m


<aggmap.map.AggMap at 0x7f94977ce5c0>

In [23]:
# Inspect the 2-D shape of the fitted feature map.
mp.fmap_shape

(65, 65)

In [24]:
# Scatter plot of the feature points; per the recorded log it is also saved as an HTML file in the working directory.
mp.plot_scatter()

2020-12-13 12:01:11,350 - [32mINFO[0m - [bidd-aggmap][0m - generate file: ./feature points_4173_correlation_umap_scatter[0m
2020-12-13 12:01:11,427 - [32mINFO[0m - [bidd-aggmap][0m - save html file to ./feature points_4173_correlation_umap_scatter[0m


In [25]:
# Transform every row of dfx through the fitted map with 'standard' scaling.
# X is the array that the cross-validation loop below slices into train/test parts.
X = mp.batch_transform(dfx.values, scale_method = 'standard')

100%|##########| 154/154 [00:12<00:00, 12.03it/s]


In [26]:
# Column names of the one-hot encoding, i.e. which Group each column of Y stands for.
class_ = pd.get_dummies(dfy['Group']).columns
class_

Index(['Healthy', 'Stage_0'], dtype='object')

In [27]:
outer_fold = 10
repeat_seeds = [8, 16, 32, 64, 128]  # 5 repeats of the 10-fold CV, one per random seed

# Folder that receives one model file per (seed, fold). It is created up front
# because saving into a directory that does not exist fails on a fresh checkout.
model_dir = './Stage_0_model'
os.makedirs(model_dir, exist_ok=True)

run_all_res = []  # one DataFrame per repeat: the concatenated held-out predictions of all folds
run_all = []      # [repeat_seed, fold index, fold ROC-AUC] for every fold of every repeat
for repeat_seed in repeat_seeds:

    outer = StratifiedKFold(n_splits=outer_fold, shuffle=True, random_state=repeat_seed)

    run_one_res = []
    # Stratify on the first one-hot column of Y so every fold keeps the class ratio.
    for i, (train_idx, test_idx) in enumerate(outer.split(X, Y[:, 0])):

        trainX, trainY = X[train_idx], Y[train_idx]
        testX, testY = X[test_idx], Y[test_idx]

        print("\n input train and test X shape is %s, %s " % (trainX.shape,  testX.shape))

        # NOTE(review): gpuid = 3 is machine-specific — confirm a device with
        # that id exists wherever this notebook is re-run.
        clf = AggModel.MultiClassEstimator(epochs=50, gpuid=3, conv1_kernel_size=3,
                                           batch_size=2, verbose=0)
        clf.fit(trainX, trainY)

        ## save model
        clf._model.save(os.path.join(model_dir, 'seed_%s_fold_%s.h5' % (repeat_seed, i)))

        # Column 1 of the one-hot labels / predicted probabilities is the Stage_0 class.
        y_true = testY[:, 1]
        y_score = clf.predict_proba(testX)[:, 1]

        # Held-out predictions of this fold, indexed by the subjects in the test split.
        dfr = pd.DataFrame([y_true, y_score]).T
        dfr.columns = ['y_true', 'y_score']
        dfr.index = dfy.iloc[test_idx].index

        # score() prints the fold accuracy / ROC-AUC and returns both.
        acc, auc = score(dfr)

        run_one_res.append(dfr)
        run_all.append([repeat_seed, i, auc])

    # Pool the held-out predictions of all folds for this repeat.
    run_all_res.append(pd.concat(run_one_res))


 input train and test X shape is (138, 65, 65, 5), (16, 65, 65, 5) 
MultiClassEstimator(batch_size=2, conv1_kernel_size=3, epochs=50, gpuid='3')
acc: 0.938, roc-auc: 0.923

 input train and test X shape is (138, 65, 65, 5), (16, 65, 65, 5) 
MultiClassEstimator(batch_size=2, conv1_kernel_size=3, epochs=50, gpuid='3')
acc: 0.938, roc-auc: 0.846

 input train and test X shape is (138, 65, 65, 5), (16, 65, 65, 5) 
MultiClassEstimator(batch_size=2, conv1_kernel_size=3, epochs=50, gpuid='3')
acc: 0.938, roc-auc: 0.923

 input train and test X shape is (138, 65, 65, 5), (16, 65, 65, 5) 
MultiClassEstimator(batch_size=2, conv1_kernel_size=3, epochs=50, gpuid='3')
acc: 0.938, roc-auc: 0.974

 input train and test X shape is (139, 65, 65, 5), (15, 65, 65, 5) 
MultiClassEstimator(batch_size=2, conv1_kernel_size=3, epochs=50, gpuid='3')
acc: 1.000, roc-auc: 1.000

 input train and test X shape is (139, 65, 65, 5), (15, 65, 65, 5) 
MultiClassEstimator(batch_size=2, conv1_kernel_size=3, epochs=50, 

In [28]:
# One metrics record per repeat, computed on that repeat's concatenated
# held-out predictions (all folds together).
results = []
for dfr, repeat_seed in zip(run_all_res, repeat_seeds):

    y_true, y_score = dfr.y_true, dfr.y_score
    y_pred = dfr.y_score.round()  # hard call from the rounded score

    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()

    acc = (tp + tn) / (tn + fp + fn + tp)

    sensitivity = tp / (tp + fn)
    specificity = tn / (tn + fp)

    prc_auc = prc_auc_score(y_true, y_score)
    roc_auc = roc_auc_score(y_true, y_score)

    precision = tp / (tp + fp)
    recall = tp / (tp + fn)  # equals to sensitivity

    # F1 is the harmonic mean of precision and sensitivity (= recall).
    f1 = 2*precision*sensitivity/(precision+sensitivity)

    res = {
        'repeat_seed': repeat_seed,
        'accuracy': acc,
        'prc_auc': prc_auc,
        'roc_auc': roc_auc,
        'sensitivity': sensitivity,
        'specificity': specificity,
        'precision': precision,
        'recall': recall,
        'F1': f1,
    }

    results.append(res)

In [29]:
# Mean ROC-AUC over the repeats (one value per entry of results).
pd.DataFrame(results).roc_auc.mean()

0.8575094779819189

In [30]:
# Metric table with one row per repeat seed.
pd.DataFrame(results)

Unnamed: 0,repeat_seed,accuracy,prc_auc,roc_auc,sensitivity,specificity,precision,recall,F1
0,8,0.922078,0.699627,0.870225,0.592593,0.992126,0.941176,0.592593,0.727273
1,16,0.902597,0.693383,0.844561,0.481481,0.992126,0.928571,0.481481,0.634146
2,32,0.915584,0.694986,0.86585,0.555556,0.992126,0.9375,0.555556,0.697674
3,64,0.915584,0.653532,0.852435,0.555556,0.992126,0.9375,0.555556,0.697674
4,128,0.915584,0.725371,0.854477,0.555556,0.992126,0.9375,0.555556,0.697674


In [16]:
# outer_fold = 10

# run_all = []
# run_all_res = []
# for repeat_seed in [8, 16, 32, 64, 128]: #5 repeats random seeds 8, 16, 32, 64, 128
    
#     outer = StratifiedKFold(n_splits = outer_fold, shuffle = True, random_state = repeat_seed)
#     outer_idx = outer.split(X, Y[:,0])

#     for i, idx in enumerate(outer_idx):
        
#         fold_num = "fold_%s" % str(i).zfill(2) 
        
#         train_idx, test_idx = idx
        
#         testY = Y[test_idx]
#         testX = X[test_idx]
        
#         trainX = X[train_idx]
#         trainY = Y[train_idx]

#         print("\n input train and test X shape is %s, %s " % (trainX.shape,  testX.shape))
        
#         clf = AggModel.MultiClassEstimator(epochs = 50, gpuid = 3, conv1_kernel_size = 3,
#                                            batch_size = 2, verbose = 1) #

#         clf.fit(trainX, trainY ) 
        
#         run_one = []
#         run_one_res = []
#         for j in range(Y.shape[1]):
            
#             y_true = testY[:,j]
#             y_pred = clf.predict(testX)[:,j]
#             y_score = clf.predict_proba(testX)[:,j]
            
#             run_one_res.append([y_true, y_score])
            
#             tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()

#             acc = (tp + tn) / sum([tn, fp, fn, tp])

#             sensitivity = tp / sum([tp, fn])
#             specificity = tn / sum([tn, fp])

#             prc_auc = prc_auc_score(y_true, y_score)
#             roc_auc = roc_auc_score(y_true, y_score)

#             precision = tp / sum([tp, fp])
#             recall =  tp / sum([tp, fn]) #equals to sensitivity

#             res = {'fold': fold_num,
#                    'repeat_seed':repeat_seed,

#                    'accuracy':acc, 

#                    'prc_auc':prc_auc, 
#                    'roc_auc':roc_auc,

#                    'sensitivity': sensitivity, 
#                    'specificity': specificity,

#                    'precision':precision,
#                    'recall':recall,
#                    'F1': 2*precision*sensitivity/(precision+sensitivity)
#                   }
            
#             print(roc_auc)
#             run_one.append(res)
        
#         run_all.append(pd.DataFrame(run_one, index = class_))
#         run_all_res.append(run_one_res)

# f = []
# for res in run_all_res:
#     f.append(pd.DataFrame(res[0]).T)
#     dfr = pd.concat(f).reset_index(drop=True)
# roc_auc_score(dfr[0], dfr[1])
# len(dfr)
# pd.concat(run_all).loc['Stage_0'].roc_auc.mean()
# pd.concat(run_all)

##  Healthy vs. Stage_III_IV     