In [4]:
import warnings, os
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from glob import glob


import matplotlib.pyplot as plt
import matplotlib.image as mpimg

from aggmap import AggMap, loadmap
import seaborn as sns


import xgboost as xgb
import lightgbm as lgb

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import confusion_matrix, precision_recall_curve, roc_auc_score
from sklearn.metrics import auc as calculate_auc

def score(dfr):
    """Print accuracy, ROC-AUC and BER for one set of predictions.

    Parameters
    ----------
    dfr : object exposing ``y_true`` and ``y_score`` attributes
        ``y_true`` holds the reference labels and ``y_score`` the predicted
        scores; hard predictions are obtained by rounding ``y_score``.

    Returns
    -------
    tuple
        ``(acc, auc)``. The BER value (mean of the false-positive rate and
        the false-negative rate) is printed but not returned.
    """
    labels = dfr.y_true
    scores = dfr.y_score
    hard_calls = dfr.y_score.round()

    # The flattened confusion matrix is unpacked as tn, fp, fn, tp.
    cells = confusion_matrix(labels, hard_calls).ravel()
    tn, fp, fn, tp = cells

    n_total = sum([tn, fp, fn, tp])
    acc = (tp + tn) / n_total

    auc = roc_auc_score(labels, scores)

    false_pos_rate = fp / (fp + tn)
    false_neg_rate = fn / (tp + fn)
    ber = (false_pos_rate + false_neg_rate) * 0.5

    print('acc: %.3f, roc-auc: %.3f, BER: %.3f' % (acc,auc,ber))

    return acc, auc


def get_best_params(X, y):
    """Grid-search LightGBM hyper-parameters and return the best combination.

    Runs a 5-fold cross-validated ``GridSearchCV`` over ``n_estimators`` and
    ``max_depth`` of an ``LGBMClassifier``, ranking candidates by ROC-AUC.

    Parameters
    ----------
    X : feature matrix handed to ``GridSearchCV.fit``.
    y : target labels handed to ``GridSearchCV.fit``.

    Returns
    -------
    dict
        ``best_params_`` of the fitted search, i.e. the winning values for
        ``'n_estimators'`` and ``'max_depth'``.
    """
    import inspect  # local import: only needed for the version check below

    clf = lgb.LGBMClassifier(verbose=-1, force_col_wise=True)
    parameters = {'n_estimators': [ 50, 100, 200],
                  'max_depth': [4,  6, 10]}

    search_kwargs = dict(scoring='roc_auc', n_jobs=9, cv=5, verbose=0)
    # `iid` was deprecated in scikit-learn 0.22 and removed in 0.24; passing it
    # to a newer release raises TypeError. Forward it only where it is still
    # accepted so that older environments keep the original iid=False setting.
    if 'iid' in inspect.signature(GridSearchCV.__init__).parameters:
        search_kwargs['iid'] = False

    grid = GridSearchCV(clf, parameters, **search_kwargs)
    grid.fit(X, y)
    return grid.best_params_

In [5]:
# Load a saved AggMap object from disk via aggmap.loadmap.
# NOTE(review): `mp` is not referenced by any of the later cells visible here,
# and the path is an absolute, machine-specific location — confirm it is still needed.
mp = loadmap('/home/shenwanxiang/Research/AggMapNet_dataset/TCGA-S-G.mp')


In [6]:
# Benchmark LightGBM with 5-fold cross-validation on every TCGA-G dataset file.
# glob() returns matches in arbitrary, filesystem-dependent order; sorting makes
# the task order in the log output and in `res` reproducible across machines.
lst = sorted(glob('/home/shenwanxiang/Research/AggMapNet_dataset/TCGA-G/*.h5'))

n_fold = 5

# One row per (task, fold): [roc-auc, fold index, file path, best grid-search params].
res = []
for p in lst:
    
    print('#'*50 + ' %s ' % p + '#'*50 )
    # Each HDF5 file stores the feature table under 'expression' and the
    # targets under 'labels'.
    dfx = pd.read_hdf(p, key = 'expression')
    dfy = pd.read_hdf(p, key = 'labels')
    print(p, dfy.value_counts().to_dict())


    # Fixed random_state so the outer folds are identical on every run.
    # NOTE(review): plain KFold is not stratified; kept as-is so the splits
    # (and therefore the reported scores) stay unchanged.
    outer = KFold(n_splits = n_fold, shuffle = True, random_state = 123)
    outer_idx = list(outer.split(dfx, dfy))

    fold_all = []
    for i, idx in enumerate(outer_idx):

        # NOTE(review): fold_num is not used in the cells visible here.
        fold_num = "fold_%s" % str(i).zfill(2) 

        train_idx, test_idx = idx

        trainX = dfx.iloc[train_idx].values
        trainY = dfy.iloc[train_idx].values

        testX = dfx.iloc[test_idx].values
        testY = dfy.iloc[test_idx].values

        ## training
        print("\nInput train and test X shape is %s, %s  \n" % (trainX.shape,  testX.shape))
        print("Getting the best parameters by gridsearch \n")
        # Hyper-parameters are tuned on the training part of the fold only,
        # then a fresh model is refit on that same training part.
        best_params = get_best_params(trainX, trainY)
        clf = lgb.LGBMClassifier(**best_params, verbose=-1, n_jobs=-1, force_col_wise=True)
        clf.fit(trainX, trainY)

        ## evaluation
        y_true = testY
        # Column 1 of predict_proba is used as the score for the ROC-AUC.
        y_score = clf.predict_proba(testX)[:,1]
        dfr = pd.DataFrame([y_true, y_score]).T
        dfr.columns = ['y_true', 'y_score']
        dfr.index = dfy.iloc[test_idx].index
        acc, auc = score(dfr)
        
        fold_all.append(auc)
        res.append([auc, i, p, best_params])
        
    print('The auc score for %s is %s.' % (p, np.mean(fold_all)))


################################################## /home/shenwanxiang/Research/AggMapNet_dataset/TCGA-G/O_z-score_train_LIHC_grade.h5 ##################################################
/home/shenwanxiang/Research/AggMapNet_dataset/TCGA-G/O_z-score_train_LIHC_grade.h5 {0: 250, 1: 124}


100%|##########| 374/374 [01:13<00:00,  5.06it/s]



Input train and test X shape is (299, 17970), (75, 17970)  

Getting the best parameters by gridsearch 

acc: 0.733, roc-auc: 0.724, BER: 0.338

Input train and test X shape is (299, 17970), (75, 17970)  

Getting the best parameters by gridsearch 

acc: 0.640, roc-auc: 0.511, BER: 0.483

Input train and test X shape is (299, 17970), (75, 17970)  

Getting the best parameters by gridsearch 

acc: 0.680, roc-auc: 0.722, BER: 0.413

Input train and test X shape is (299, 17970), (75, 17970)  

Getting the best parameters by gridsearch 

acc: 0.653, roc-auc: 0.695, BER: 0.423

Input train and test X shape is (300, 17970), (74, 17970)  

Getting the best parameters by gridsearch 

acc: 0.635, roc-auc: 0.734, BER: 0.454
The auc score for /home/shenwanxiang/Research/AggMapNet_dataset/TCGA-G/O_z-score_train_LIHC_grade.h5 is 0.6772981234250599.
################################################## /home/shenwanxiang/Research/AggMapNet_dataset/TCGA-G/O_z-score_train_KIRC_grade.h5 #################

100%|##########| 544/544 [02:41<00:00,  3.37it/s]



Input train and test X shape is (435, 17970), (109, 17970)  

Getting the best parameters by gridsearch 

acc: 0.587, roc-auc: 0.593, BER: 0.450

Input train and test X shape is (435, 17970), (109, 17970)  

Getting the best parameters by gridsearch 

acc: 0.688, roc-auc: 0.635, BER: 0.394

Input train and test X shape is (435, 17970), (109, 17970)  

Getting the best parameters by gridsearch 

acc: 0.624, roc-auc: 0.619, BER: 0.429

Input train and test X shape is (435, 17970), (109, 17970)  

Getting the best parameters by gridsearch 

acc: 0.541, roc-auc: 0.604, BER: 0.525

Input train and test X shape is (436, 17970), (108, 17970)  

Getting the best parameters by gridsearch 

acc: 0.620, roc-auc: 0.520, BER: 0.464
The auc score for /home/shenwanxiang/Research/AggMapNet_dataset/TCGA-G/O_z-score_train_KIRC_grade.h5 is 0.5941510330906186.
################################################## /home/shenwanxiang/Research/AggMapNet_dataset/TCGA-G/O_z-score_train_STAD_grade.h5 ############

100%|##########| 416/416 [01:37<00:00,  4.25it/s]



Input train and test X shape is (332, 17970), (84, 17970)  

Getting the best parameters by gridsearch 

acc: 0.643, roc-auc: 0.708, BER: 0.389

Input train and test X shape is (333, 17970), (83, 17970)  

Getting the best parameters by gridsearch 

acc: 0.699, roc-auc: 0.821, BER: 0.303

Input train and test X shape is (333, 17970), (83, 17970)  

Getting the best parameters by gridsearch 

acc: 0.602, roc-auc: 0.693, BER: 0.411

Input train and test X shape is (333, 17970), (83, 17970)  

Getting the best parameters by gridsearch 

acc: 0.747, roc-auc: 0.804, BER: 0.299

Input train and test X shape is (333, 17970), (83, 17970)  

Getting the best parameters by gridsearch 

acc: 0.687, roc-auc: 0.803, BER: 0.328
The auc score for /home/shenwanxiang/Research/AggMapNet_dataset/TCGA-G/O_z-score_train_STAD_grade.h5 is 0.7657232877479399.
################################################## /home/shenwanxiang/Research/AggMapNet_dataset/TCGA-G/O_z-score_train_UCEC_grade.h5 #################

100%|##########| 554/554 [02:45<00:00,  3.35it/s]



Input train and test X shape is (443, 17970), (111, 17970)  

Getting the best parameters by gridsearch 

acc: 0.838, roc-auc: 0.906, BER: 0.171

Input train and test X shape is (443, 17970), (111, 17970)  

Getting the best parameters by gridsearch 

acc: 0.847, roc-auc: 0.922, BER: 0.158

Input train and test X shape is (443, 17970), (111, 17970)  

Getting the best parameters by gridsearch 

acc: 0.829, roc-auc: 0.884, BER: 0.183

Input train and test X shape is (443, 17970), (111, 17970)  

Getting the best parameters by gridsearch 

acc: 0.811, roc-auc: 0.849, BER: 0.204

Input train and test X shape is (444, 17970), (110, 17970)  

Getting the best parameters by gridsearch 

acc: 0.818, roc-auc: 0.897, BER: 0.185
The auc score for /home/shenwanxiang/Research/AggMapNet_dataset/TCGA-G/O_z-score_train_UCEC_grade.h5 is 0.8915693165033953.
################################################## /home/shenwanxiang/Research/AggMapNet_dataset/TCGA-G/O_z-score_train_PAAD_grade.h5 ############

100%|##########| 179/179 [00:39<00:00,  4.56it/s]



Input train and test X shape is (143, 17970), (36, 17970)  

Getting the best parameters by gridsearch 

acc: 0.778, roc-auc: 0.594, BER: 0.453

Input train and test X shape is (143, 17970), (36, 17970)  

Getting the best parameters by gridsearch 

acc: 0.639, roc-auc: 0.569, BER: 0.500

Input train and test X shape is (143, 17970), (36, 17970)  

Getting the best parameters by gridsearch 

acc: 0.722, roc-auc: 0.708, BER: 0.519

Input train and test X shape is (143, 17970), (36, 17970)  

Getting the best parameters by gridsearch 

acc: 0.694, roc-auc: 0.478, BER: 0.569

Input train and test X shape is (144, 17970), (35, 17970)  

Getting the best parameters by gridsearch 

acc: 0.600, roc-auc: 0.767, BER: 0.467
The auc score for /home/shenwanxiang/Research/AggMapNet_dataset/TCGA-G/O_z-score_train_PAAD_grade.h5 is 0.6229259963865578.
################################################## /home/shenwanxiang/Research/AggMapNet_dataset/TCGA-G/O_z-score_train_CESC_grade.h5 #################

100%|##########| 306/306 [01:23<00:00,  3.68it/s]



Input train and test X shape is (244, 17970), (62, 17970)  

Getting the best parameters by gridsearch 

acc: 0.629, roc-auc: 0.566, BER: 0.410

Input train and test X shape is (245, 17970), (61, 17970)  

Getting the best parameters by gridsearch 

acc: 0.541, roc-auc: 0.563, BER: 0.486

Input train and test X shape is (245, 17970), (61, 17970)  

Getting the best parameters by gridsearch 

acc: 0.656, roc-auc: 0.691, BER: 0.437

Input train and test X shape is (245, 17970), (61, 17970)  

Getting the best parameters by gridsearch 

acc: 0.607, roc-auc: 0.654, BER: 0.415

Input train and test X shape is (245, 17970), (61, 17970)  

Getting the best parameters by gridsearch 

acc: 0.574, roc-auc: 0.560, BER: 0.447
The auc score for /home/shenwanxiang/Research/AggMapNet_dataset/TCGA-G/O_z-score_train_CESC_grade.h5 is 0.6066440925264456.
################################################## /home/shenwanxiang/Research/AggMapNet_dataset/TCGA-G/O_z-score_test_HNSC_grade.h5 ##################

100%|##########| 504/504 [02:00<00:00,  4.19it/s]



Input train and test X shape is (403, 17970), (101, 17970)  

Getting the best parameters by gridsearch 

acc: 0.772, roc-auc: 0.775, BER: 0.424

Input train and test X shape is (403, 17970), (101, 17970)  

Getting the best parameters by gridsearch 

acc: 0.762, roc-auc: 0.634, BER: 0.443

Input train and test X shape is (403, 17970), (101, 17970)  

Getting the best parameters by gridsearch 

acc: 0.792, roc-auc: 0.784, BER: 0.383

Input train and test X shape is (403, 17970), (101, 17970)  

Getting the best parameters by gridsearch 

acc: 0.782, roc-auc: 0.737, BER: 0.413

Input train and test X shape is (404, 17970), (100, 17970)  

Getting the best parameters by gridsearch 

acc: 0.710, roc-auc: 0.774, BER: 0.441
The auc score for /home/shenwanxiang/Research/AggMapNet_dataset/TCGA-G/O_z-score_test_HNSC_grade.h5 is 0.7408072312996229.
################################################## /home/shenwanxiang/Research/AggMapNet_dataset/TCGA-G/O_z-score_train_LGG_grade.h5 ##############

100%|##########| 532/532 [02:37<00:00,  3.37it/s]



Input train and test X shape is (425, 17970), (107, 17970)  

Getting the best parameters by gridsearch 

acc: 0.757, roc-auc: 0.823, BER: 0.247

Input train and test X shape is (425, 17970), (107, 17970)  

Getting the best parameters by gridsearch 

acc: 0.636, roc-auc: 0.711, BER: 0.364

Input train and test X shape is (426, 17970), (106, 17970)  

Getting the best parameters by gridsearch 

acc: 0.670, roc-auc: 0.716, BER: 0.332

Input train and test X shape is (426, 17970), (106, 17970)  

Getting the best parameters by gridsearch 

acc: 0.717, roc-auc: 0.762, BER: 0.267

Input train and test X shape is (426, 17970), (106, 17970)  

Getting the best parameters by gridsearch 

acc: 0.670, roc-auc: 0.710, BER: 0.330
The auc score for /home/shenwanxiang/Research/AggMapNet_dataset/TCGA-G/O_z-score_train_LGG_grade.h5 is 0.7445717087544969.


In [7]:
# One row per (task, fold) with its ROC-AUC and the grid-search winner;
# persisted to CSV so the raw per-fold results are kept alongside the summary.
dfres = pd.DataFrame(res, columns = ['score', 'fold', 'task', 'best_params'])
dfres.to_csv('./LGB_5FCV_results_grade.csv')
# Average only the numeric columns: `best_params` holds dicts, and
# groupby(...).mean() on pandas >= 2.0 raises TypeError for such columns
# instead of silently dropping them.
dfres.groupby('task')[['score', 'fold']].mean().sort_values('score',ascending=False)

Unnamed: 0_level_0,score,fold
task,Unnamed: 1_level_1,Unnamed: 2_level_1
/home/shenwanxiang/Research/AggMapNet_dataset/TCGA-G/O_z-score_train_UCEC_grade.h5,0.891569,2
/home/shenwanxiang/Research/AggMapNet_dataset/TCGA-G/O_z-score_train_STAD_grade.h5,0.765723,2
/home/shenwanxiang/Research/AggMapNet_dataset/TCGA-G/O_z-score_train_LGG_grade.h5,0.744572,2
/home/shenwanxiang/Research/AggMapNet_dataset/TCGA-G/O_z-score_test_HNSC_grade.h5,0.740807,2
/home/shenwanxiang/Research/AggMapNet_dataset/TCGA-G/O_z-score_train_LIHC_grade.h5,0.677298,2
/home/shenwanxiang/Research/AggMapNet_dataset/TCGA-G/O_z-score_train_PAAD_grade.h5,0.622926,2
/home/shenwanxiang/Research/AggMapNet_dataset/TCGA-G/O_z-score_train_CESC_grade.h5,0.606644,2
/home/shenwanxiang/Research/AggMapNet_dataset/TCGA-G/O_z-score_train_KIRC_grade.h5,0.594151,2


In [8]:
# Per-task mean over the folds, restricted to the numeric columns:
# `best_params` holds dicts, and groupby(...).mean() on pandas >= 2.0 raises
# TypeError for such columns instead of silently dropping them.
dfr = dfres.groupby('task')[['score', 'fold']].mean().sort_values('score',ascending=False)
# Fixed row order for the final table.
order = ['CESC','KIRC','LGG','LIHC','PAAD','STAD','UCEC','HNSC']

# Task paths end in "..._<CODE>_grade.h5"; the second-to-last '_'-separated
# token is the code used in `order`.
dfr.index = dfr.index.map(lambda x:x.split('_')[-2])
dfr = dfr.loc[order]

In [9]:
# Display the per-task mean table, rows in the fixed `order` defined above.
dfr

Unnamed: 0_level_0,score,fold
task,Unnamed: 1_level_1,Unnamed: 2_level_1
CESC,0.606644,2
KIRC,0.594151,2
LGG,0.744572,2
LIHC,0.677298,2
PAAD,0.622926,2
STAD,0.765723,2
UCEC,0.891569,2
HNSC,0.740807,2
