In [2]:
import warnings, os
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from glob import glob


import matplotlib.pyplot as plt
import matplotlib.image as mpimg

from aggmap import AggMap, loadmap
import seaborn as sns


import xgboost as xgb
import lightgbm as lgb

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import confusion_matrix, precision_recall_curve, roc_auc_score
from sklearn.metrics import auc as calculate_auc

def score(dfr):
    y_true = dfr.y_true
    y_score = dfr.y_score
    y_pred = dfr.y_score.round()

    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()

    acc = (tp + tn) / sum([tn, fp, fn, tp])
    auc = roc_auc_score(y_true, y_score)
    ber =  (fp / (fp + tn) + fn / (tp + fn)) * 0.5

    print('acc: %.3f, roc-auc: %.3f, BER: %.3f' % (acc,auc,ber))

    return acc, auc


def get_best_params(X, y):
    clf = lgb.LGBMClassifier(verbose=-1, force_col_wise=True)
    parameters = {'n_estimators': [ 50, 100, 200],
                  'max_depth': [4,  6, 10]}

    grid = GridSearchCV(clf, parameters, scoring='roc_auc', n_jobs=9, iid=False, cv=5, verbose=0)
    gres = grid.fit(X, y)
    return grid.best_params_

In [3]:
mp = loadmap('/home/shenwanxiang/Research/AggMapNet_dataset/TCGA-S-G.mp')


In [4]:
lst = glob('/home/shenwanxiang/Research/AggMapNet_dataset/TCGA-S/*.h5')

n_fold = 5

res = []
for p in lst:
    
    print('#'*50 + ' %s ' % p + '#'*50 )
    dfx = pd.read_hdf(p, key = 'expression')
    dfy = pd.read_hdf(p, key = 'labels')
    print(p, dfy.value_counts().to_dict())


    outer = KFold(n_splits = n_fold, shuffle = True, random_state = 123)
    outer_idx = list(outer.split(dfx, dfy))

    fold_all = []
    for i, idx in enumerate(outer_idx):

        fold_num = "fold_%s" % str(i).zfill(2) 

        train_idx, test_idx = idx

        trainX = dfx.iloc[train_idx].values
        trainY = dfy.iloc[train_idx].values

        testX = dfx.iloc[test_idx].values
        testY = dfy.iloc[test_idx].values

        ## training
        print("\nInput train and test X shape is %s, %s  \n" % (trainX.shape,  testX.shape))
        print("Getting the best parameters by gridsearch \n")
        best_params = get_best_params(trainX, trainY)
        clf = lgb.LGBMClassifier(**best_params, verbose=-1, n_jobs=-1, force_col_wise=True)
        clf.fit(trainX, trainY)

        ## evaluation
        y_true = testY
        y_score = clf.predict_proba(testX)[:,1]
        dfr = pd.DataFrame([y_true, y_score]).T
        dfr.columns = ['y_true', 'y_score']
        dfr.index = dfy.iloc[test_idx].index
        acc, auc = score(dfr)
        
        fold_all.append(auc)
        res.append([auc, i, p, best_params])
        
    print('The auc score for %s is %s.' % (p, np.mean(fold_all)))


################################################## /home/shenwanxiang/Research/AggMapNet_dataset/TCGA-S/O_z-score_train_THCA_stage.h5 ##################################################
/home/shenwanxiang/Research/AggMapNet_dataset/TCGA-S/O_z-score_train_THCA_stage.h5 {0: 291, 1: 222}


100%|##########| 513/513 [01:58<00:00,  4.34it/s]



Input train and test X shape is (410, 17970), (103, 17970)  

Getting the best parameters by gridsearch 

acc: 0.583, roc-auc: 0.620, BER: 0.433

Input train and test X shape is (410, 17970), (103, 17970)  

Getting the best parameters by gridsearch 

acc: 0.621, roc-auc: 0.682, BER: 0.391

Input train and test X shape is (410, 17970), (103, 17970)  

Getting the best parameters by gridsearch 

acc: 0.621, roc-auc: 0.627, BER: 0.391

Input train and test X shape is (411, 17970), (102, 17970)  

Getting the best parameters by gridsearch 

acc: 0.676, roc-auc: 0.722, BER: 0.346

Input train and test X shape is (411, 17970), (102, 17970)  

Getting the best parameters by gridsearch 

acc: 0.627, roc-auc: 0.688, BER: 0.399
The auc score for /home/shenwanxiang/Research/AggMapNet_dataset/TCGA-S/O_z-score_train_THCA_stage.h5 is 0.668005598286905.
################################################## /home/shenwanxiang/Research/AggMapNet_dataset/TCGA-S/O_z-score_train_UCEC_stage.h5 #############

100%|##########| 554/554 [02:32<00:00,  3.63it/s]



Input train and test X shape is (443, 17970), (111, 17970)  

Getting the best parameters by gridsearch 

acc: 0.658, roc-auc: 0.621, BER: 0.393

Input train and test X shape is (443, 17970), (111, 17970)  

Getting the best parameters by gridsearch 

acc: 0.676, roc-auc: 0.673, BER: 0.397

Input train and test X shape is (443, 17970), (111, 17970)  

Getting the best parameters by gridsearch 

acc: 0.694, roc-auc: 0.740, BER: 0.365

Input train and test X shape is (443, 17970), (111, 17970)  

Getting the best parameters by gridsearch 

acc: 0.631, roc-auc: 0.621, BER: 0.413

Input train and test X shape is (444, 17970), (110, 17970)  

Getting the best parameters by gridsearch 

acc: 0.627, roc-auc: 0.651, BER: 0.408
The auc score for /home/shenwanxiang/Research/AggMapNet_dataset/TCGA-S/O_z-score_train_UCEC_stage.h5 is 0.661155967651837.
################################################## /home/shenwanxiang/Research/AggMapNet_dataset/TCGA-S/O_z-score_train_COAD_stage.h5 #############

100%|##########| 505/505 [02:14<00:00,  3.76it/s]



Input train and test X shape is (404, 17970), (101, 17970)  

Getting the best parameters by gridsearch 

acc: 0.614, roc-auc: 0.667, BER: 0.399

Input train and test X shape is (404, 17970), (101, 17970)  

Getting the best parameters by gridsearch 

acc: 0.653, roc-auc: 0.705, BER: 0.371

Input train and test X shape is (404, 17970), (101, 17970)  

Getting the best parameters by gridsearch 

acc: 0.683, roc-auc: 0.726, BER: 0.363

Input train and test X shape is (404, 17970), (101, 17970)  

Getting the best parameters by gridsearch 

acc: 0.693, roc-auc: 0.732, BER: 0.313

Input train and test X shape is (404, 17970), (101, 17970)  

Getting the best parameters by gridsearch 

acc: 0.663, roc-auc: 0.701, BER: 0.374
The auc score for /home/shenwanxiang/Research/AggMapNet_dataset/TCGA-S/O_z-score_train_COAD_stage.h5 is 0.7064490319811292.
################################################## /home/shenwanxiang/Research/AggMapNet_dataset/TCGA-S/O_z-score_validate_LUSC_stage.h5 #########

100%|##########| 504/504 [02:29<00:00,  3.38it/s]



Input train and test X shape is (403, 17970), (101, 17970)  

Getting the best parameters by gridsearch 

acc: 0.554, roc-auc: 0.630, BER: 0.445

Input train and test X shape is (403, 17970), (101, 17970)  

Getting the best parameters by gridsearch 

acc: 0.535, roc-auc: 0.594, BER: 0.450

Input train and test X shape is (403, 17970), (101, 17970)  

Getting the best parameters by gridsearch 

acc: 0.653, roc-auc: 0.704, BER: 0.346

Input train and test X shape is (403, 17970), (101, 17970)  

Getting the best parameters by gridsearch 

acc: 0.535, roc-auc: 0.581, BER: 0.467

Input train and test X shape is (404, 17970), (100, 17970)  

Getting the best parameters by gridsearch 

acc: 0.560, roc-auc: 0.593, BER: 0.446
The auc score for /home/shenwanxiang/Research/AggMapNet_dataset/TCGA-S/O_z-score_validate_LUSC_stage.h5 is 0.6202340495167118.
################################################## /home/shenwanxiang/Research/AggMapNet_dataset/TCGA-S/O_z-score_train_KIRC_stage.h5 #########

100%|##########| 544/544 [02:28<00:00,  3.67it/s]



Input train and test X shape is (435, 17970), (109, 17970)  

Getting the best parameters by gridsearch 

acc: 0.706, roc-auc: 0.788, BER: 0.336

Input train and test X shape is (435, 17970), (109, 17970)  

Getting the best parameters by gridsearch 

acc: 0.789, roc-auc: 0.846, BER: 0.242

Input train and test X shape is (435, 17970), (109, 17970)  

Getting the best parameters by gridsearch 

acc: 0.697, roc-auc: 0.743, BER: 0.343

Input train and test X shape is (435, 17970), (109, 17970)  

Getting the best parameters by gridsearch 

acc: 0.706, roc-auc: 0.770, BER: 0.321

Input train and test X shape is (436, 17970), (108, 17970)  

Getting the best parameters by gridsearch 

acc: 0.741, roc-auc: 0.821, BER: 0.313
The auc score for /home/shenwanxiang/Research/AggMapNet_dataset/TCGA-S/O_z-score_train_KIRC_stage.h5 is 0.7935926336768012.
################################################## /home/shenwanxiang/Research/AggMapNet_dataset/TCGA-S/O_z-score_test_BRCA_stage.h5 #############

100%|##########| 1134/1134 [04:42<00:00,  4.01it/s]



Input train and test X shape is (907, 17970), (227, 17970)  

Getting the best parameters by gridsearch 

acc: 0.771, roc-auc: 0.704, BER: 0.467

Input train and test X shape is (907, 17970), (227, 17970)  

Getting the best parameters by gridsearch 

acc: 0.758, roc-auc: 0.643, BER: 0.446

Input train and test X shape is (907, 17970), (227, 17970)  

Getting the best parameters by gridsearch 

acc: 0.775, roc-auc: 0.621, BER: 0.477

Input train and test X shape is (907, 17970), (227, 17970)  

Getting the best parameters by gridsearch 

acc: 0.736, roc-auc: 0.586, BER: 0.497

Input train and test X shape is (908, 17970), (226, 17970)  

Getting the best parameters by gridsearch 

acc: 0.779, roc-auc: 0.693, BER: 0.476
The auc score for /home/shenwanxiang/Research/AggMapNet_dataset/TCGA-S/O_z-score_test_BRCA_stage.h5 is 0.6495766267957043.
################################################## /home/shenwanxiang/Research/AggMapNet_dataset/TCGA-S/O_z-score_train_SKCM_stage.h5 #############

100%|##########| 249/249 [00:49<00:00,  5.00it/s]



Input train and test X shape is (199, 17970), (50, 17970)  

Getting the best parameters by gridsearch 

acc: 0.680, roc-auc: 0.715, BER: 0.392

Input train and test X shape is (199, 17970), (50, 17970)  

Getting the best parameters by gridsearch 

acc: 0.620, roc-auc: 0.664, BER: 0.459

Input train and test X shape is (199, 17970), (50, 17970)  

Getting the best parameters by gridsearch 

acc: 0.680, roc-auc: 0.711, BER: 0.384

Input train and test X shape is (199, 17970), (50, 17970)  

Getting the best parameters by gridsearch 

acc: 0.540, roc-auc: 0.577, BER: 0.500

Input train and test X shape is (200, 17970), (49, 17970)  

Getting the best parameters by gridsearch 

acc: 0.571, roc-auc: 0.564, BER: 0.494
The auc score for /home/shenwanxiang/Research/AggMapNet_dataset/TCGA-S/O_z-score_train_SKCM_stage.h5 is 0.6462477945337728.
################################################## /home/shenwanxiang/Research/AggMapNet_dataset/TCGA-S/O_z-score_train_LIHC_stage.h5 #################

100%|##########| 374/374 [01:13<00:00,  5.10it/s]



Input train and test X shape is (299, 17970), (75, 17970)  

Getting the best parameters by gridsearch 

acc: 0.667, roc-auc: 0.744, BER: 0.334

Input train and test X shape is (299, 17970), (75, 17970)  

Getting the best parameters by gridsearch 

acc: 0.600, roc-auc: 0.623, BER: 0.403

Input train and test X shape is (299, 17970), (75, 17970)  

Getting the best parameters by gridsearch 

acc: 0.520, roc-auc: 0.533, BER: 0.482

Input train and test X shape is (299, 17970), (75, 17970)  

Getting the best parameters by gridsearch 

acc: 0.667, roc-auc: 0.709, BER: 0.343

Input train and test X shape is (300, 17970), (74, 17970)  

Getting the best parameters by gridsearch 

acc: 0.608, roc-auc: 0.640, BER: 0.387
The auc score for /home/shenwanxiang/Research/AggMapNet_dataset/TCGA-S/O_z-score_train_LIHC_stage.h5 is 0.649552858492883.
################################################## /home/shenwanxiang/Research/AggMapNet_dataset/TCGA-S/O_z-score_train_LUAD_stage.h5 ##################

100%|##########| 542/542 [01:46<00:00,  5.11it/s]



Input train and test X shape is (433, 17970), (109, 17970)  

Getting the best parameters by gridsearch 

acc: 0.569, roc-auc: 0.640, BER: 0.432

Input train and test X shape is (433, 17970), (109, 17970)  

Getting the best parameters by gridsearch 

acc: 0.587, roc-auc: 0.685, BER: 0.423

Input train and test X shape is (434, 17970), (108, 17970)  

Getting the best parameters by gridsearch 

acc: 0.648, roc-auc: 0.705, BER: 0.358

Input train and test X shape is (434, 17970), (108, 17970)  

Getting the best parameters by gridsearch 

acc: 0.574, roc-auc: 0.628, BER: 0.452

Input train and test X shape is (434, 17970), (108, 17970)  

Getting the best parameters by gridsearch 

acc: 0.565, roc-auc: 0.553, BER: 0.486
The auc score for /home/shenwanxiang/Research/AggMapNet_dataset/TCGA-S/O_z-score_train_LUAD_stage.h5 is 0.6421858985921486.
################################################## /home/shenwanxiang/Research/AggMapNet_dataset/TCGA-S/O_z-score_train_STAD_stage.h5 ############

100%|##########| 416/416 [01:20<00:00,  5.17it/s]



Input train and test X shape is (332, 17970), (84, 17970)  

Getting the best parameters by gridsearch 

acc: 0.560, roc-auc: 0.574, BER: 0.441

Input train and test X shape is (333, 17970), (83, 17970)  

Getting the best parameters by gridsearch 

acc: 0.590, roc-auc: 0.632, BER: 0.402

Input train and test X shape is (333, 17970), (83, 17970)  

Getting the best parameters by gridsearch 

acc: 0.530, roc-auc: 0.600, BER: 0.464

Input train and test X shape is (333, 17970), (83, 17970)  

Getting the best parameters by gridsearch 

acc: 0.482, roc-auc: 0.514, BER: 0.521

Input train and test X shape is (333, 17970), (83, 17970)  

Getting the best parameters by gridsearch 

acc: 0.530, roc-auc: 0.513, BER: 0.470
The auc score for /home/shenwanxiang/Research/AggMapNet_dataset/TCGA-S/O_z-score_train_STAD_stage.h5 is 0.5664507026990981.


In [5]:
dfres = pd.DataFrame(res, columns = ['score', 'fold', 'task', 'best_params'])
dfres.to_csv('./LGB_5FCV_results_stage.csv')
dfres.groupby('task').mean().sort_values('score',ascending=False)

Unnamed: 0_level_0,score,fold
task,Unnamed: 1_level_1,Unnamed: 2_level_1
/home/shenwanxiang/Research/AggMapNet_dataset/TCGA-S/O_z-score_train_KIRC_stage.h5,0.793593,2
/home/shenwanxiang/Research/AggMapNet_dataset/TCGA-S/O_z-score_train_COAD_stage.h5,0.706449,2
/home/shenwanxiang/Research/AggMapNet_dataset/TCGA-S/O_z-score_train_THCA_stage.h5,0.668006,2
/home/shenwanxiang/Research/AggMapNet_dataset/TCGA-S/O_z-score_train_UCEC_stage.h5,0.661156,2
/home/shenwanxiang/Research/AggMapNet_dataset/TCGA-S/O_z-score_test_BRCA_stage.h5,0.649577,2
/home/shenwanxiang/Research/AggMapNet_dataset/TCGA-S/O_z-score_train_LIHC_stage.h5,0.649553,2
/home/shenwanxiang/Research/AggMapNet_dataset/TCGA-S/O_z-score_train_SKCM_stage.h5,0.646248,2
/home/shenwanxiang/Research/AggMapNet_dataset/TCGA-S/O_z-score_train_LUAD_stage.h5,0.642186,2
/home/shenwanxiang/Research/AggMapNet_dataset/TCGA-S/O_z-score_validate_LUSC_stage.h5,0.620234,2
/home/shenwanxiang/Research/AggMapNet_dataset/TCGA-S/O_z-score_train_STAD_stage.h5,0.566451,2


In [6]:
dfr = dfres.groupby('task').mean().sort_values('score',ascending=False)
order = ['COAD','KIRC','LIHC','LUAD','SKCM','STAD','THCA','UCEC','LUSC','BRCA']

dfr.index = dfr.index.map(lambda x:x.split('_')[-2])
dfr = dfr.loc[order]

In [7]:
dfr

Unnamed: 0_level_0,score,fold
task,Unnamed: 1_level_1,Unnamed: 2_level_1
COAD,0.706449,2
KIRC,0.793593,2
LIHC,0.649553,2
LUAD,0.642186,2
SKCM,0.646248,2
STAD,0.566451,2
THCA,0.668006,2
UCEC,0.661156,2
LUSC,0.620234,2
BRCA,0.649577,2
