In [1]:
# -*- coding: utf-8 -*-
"""
Created on Sun Aug 16 17:10:53 2020

@author: wanxiang.shen@u.nus.edu
"""

import warnings, os
warnings.filterwarnings("ignore")


import pandas as pd
import numpy as np
from glob import glob
from joblib import load, dump
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import confusion_matrix, precision_recall_curve, roc_auc_score
from sklearn.metrics import auc as calculate_auc

import matplotlib.pyplot as plt
import seaborn as sns

from aggmap import AggMap, AggModel, loadmap
from aggmap import show
# NOTE(review): this seeds only NumPy's global RNG; no other library's RNG is seeded in this cell.
np.random.seed(666)  # fixed seed, just for repeatable results


In [2]:
# Load every gzip-compressed CSV under ./data and stack them into one frame.
# The first CSV column becomes the row index, and the file's base name
# (the text before '.csv') is stored in a trailing 'class' column.
flist = sorted(glob('./data/*.csv.gzip'))  # sorted so the row order is deterministic
if not flist:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError("no files match './data/*.csv.gzip'")

fall = []
for csv_path in flist:
    df1 = pd.read_csv(csv_path, compression='gzip', index_col=0)
    # os.path.basename copes with the platform's path separator; split('/') does not.
    df1['class'] = os.path.basename(csv_path).split('.csv')[0]
    fall.append(df1)

df = pd.concat(fall, axis=0)

In [None]:
# Split the stacked table into features (dfx) and label (dfy).
# The label is selected by name rather than by position, so the split stays
# correct even if 'class' does not end up as the last column after concatenation.
dfx = df.drop(columns=['class'])
# .copy() gives dfy its own data, so adding columns to it later does not act on a slice of df.
dfy = df[['class']].copy()
dfx = np.log2(dfx + 1)  # apply log2(x + 1) to every feature value

In [4]:
# Load the saved map object and the pre-computed input arrays.
# NOTE(review): both locations are absolute, machine-specific paths; adjust them when running elsewhere.
data_save_folder = '/raid/shenwanxiang/transcriptome/pan-cancer'
mp = loadmap('/raid/shenwanxiang/agg_mp_object/pan-cancer.mp')

# NOTE(review): joblib.load unpickles the file, so only load artefacts from a trusted source.
X1 = load(os.path.join(data_save_folder, 'Agg1.data'))
X5 = load(os.path.join(data_save_folder, 'Agg5.data'))

# One-hot encode the class labels. The dtype is pinned because pandas >= 2.0
# returns bool dummies by default, whereas earlier versions returned uint8.
Y = pd.get_dummies(dfy['class'], dtype=np.uint8).values

In [5]:
# Show the loaded map object's `fmap_shape` attribute.
mp.fmap_shape

(102, 102)

In [7]:
# Sanity check: label frame, feature frame, X5 and the one-hot targets should all have the same number of rows.
dfy.shape, dfx.shape, X5.shape, Y.shape

((10446, 1), (10446, 10381), (10446, 102, 102, 5), (10446, 33))

In [8]:
# Class names, in the column order that get_dummies produces for dfy['class'].
class_ = pd.get_dummies(dfy['class']).columns
class_

Index(['01_ACC', '02_BLCA', '03_BRCA', '04_CESC', '05_CHOL', '06_COAD',
       '07_DLBC', '08_ESCA', '09_GBM', '10_HNSC', '11_KICH', '12_KIRC',
       '13_KIRP', '14_LAML', '15_LGG', '16_LIHC', '17_LUAD', '18_LUSC',
       '19_MESO', '20_OV', '21_PAAD', '22_PCPG', '23_PRAD', '24_READ',
       '25_SARC', '26_SKCM', '27_STAD', '28_TGCT', '29_THCA', '30_THYM',
       '31_UCEC', '32_UCS', '33_UVM'],
      dtype='object')

## 10-fold CV performance

In [18]:
# Cross-validation on X5.
# The folds are not re-drawn here: each fold's test samples are read back from
# the 'test_true_label.csv' stored under ./fold_results_c5/<fold>, and every
# remaining sample is used for training. Per fold, the true labels, trained
# model, training history and predicted labels are written to
# ./c1_c5_history/c5/<fold>.
outer_fold = 10

X = X5
each_fold_results = []  # test accuracy of each fold, in fold order
# NOTE(review): this splitter and its generator are never consumed below.
outer = StratifiedKFold(n_splits = outer_fold, shuffle = True)
outer_idx = outer.split(X, dfy.values)

# Positional row number of every sample; used to index the arrays X and Y.
dfy['idx'] = range(len(dfy))

print('#'*50  )
run_one_res = []  # NOTE(review): not used in this cell
for i in range(outer_fold):  # follow outer_fold instead of a second hard-coded 10

    fold_num = "fold_%s" % str(i).zfill(2)
    save_path = './c1_c5_history/c5/%s' % fold_num
    # exist_ok avoids the check-then-create race of os.path.exists + os.makedirs.
    os.makedirs(save_path, exist_ok=True)

    # Recover this fold's test samples from the stored label file.
    old_save_path = './fold_results_c5/%s' % fold_num
    test_index = pd.read_csv(os.path.join(old_save_path, 'test_true_label.csv'), index_col=0).index
    is_test = dfy.index.isin(test_index)
    test_idx = dfy.loc[is_test, 'idx'].tolist()
    train_idx = dfy.loc[~is_test, 'idx'].tolist()
    if not test_idx or not train_idx:
        # A stored index that matches none (or all) of the samples would otherwise
        # only fail later, inside model fitting.
        raise ValueError('%s: stored test index selects %d test and %d train samples'
                         % (fold_num, len(test_idx), len(train_idx)))

    testX, testY = X[test_idx], Y[test_idx]
    trainX, trainY = X[train_idx], Y[train_idx]

    # Save the true class position (y_true) next to the sample's dfy columns.
    test_rows = dfy.iloc[test_idx]
    test_true_label = pd.DataFrame(testY).idxmax(axis=1).to_frame(name = 'y_true')
    test_true_label.index = test_rows.index
    test_true_label = test_true_label.join(test_rows)
    test_true_label.to_csv(os.path.join(save_path, 'test_true_label.csv'))

    print("\n input train and test X shape is %s, %s " % (trainX.shape,  testX.shape))

    # NOTE(review): gpuid is hard-coded to device 4.
    clf = AggModel.MultiClassEstimator(epochs = 100, batch_size = 64, lr = 1e-3,  monitor = 'val_auc',
                                       gpuid = 4, verbose = 1, metric = 'ACC', ) #
    # NOTE(review): the test fold is also passed to fit() as the validation data that
    # 'val_auc' is monitored on -- confirm this does not influence model selection.
    clf.fit(trainX, trainY, testX, testY)

    clf._model.save(os.path.join(save_path, 'model.h5'))

    pd.DataFrame(clf.history.history).to_csv(os.path.join(save_path, 'history_%s.csv' % fold_num))

    # Predicted class position per test sample, aligned with the true-label index.
    test_pred_label = pd.DataFrame(clf.predict(testX)).idxmax(axis=1).to_frame(name = 'y_pred')
    test_pred_label.index = test_true_label.index
    test_pred_label.to_csv(os.path.join(save_path, 'test_pred_label.csv'))

    test_avg_loss, test_avg_acc = clf._model.evaluate(testX, testY, verbose=0)
    print('test_avg_loss: %.3f, test_avg_acc: %.3f, ' % (test_avg_loss, test_avg_acc))

    each_fold_results.append(test_avg_acc)


##################################################

 input train and test X shape is (9401, 102, 102, 5), (1045, 102, 102, 5) 
{'epochs': 100, 'lr': 0.001, 'conv1_kernel_size': 13, 'dense_layers': [128], 'dense_avf': 'relu', 'batch_size': 64, 'dropout': 0.0, 'batch_norm': False, 'n_inception': 2, 'monitor': 'val_auc', 'patience': 10000, 'random_state': 32, 'verbose': 1, 'name': 'AggMap MultiClass Estimator', 'gpuid': '4'}
Train on 9401 samples, validate on 1045 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Ep

In [19]:
# Cross-validation on X1.
# The folds are not re-drawn here: each fold's test samples are read back from
# the 'test_true_label.csv' stored under ./fold_results_c5/<fold>, and every
# remaining sample is used for training. Per fold, the true labels, trained
# model, training history and predicted labels are written to
# ./c1_c5_history/c1/<fold>.
outer_fold = 10

X = X1
each_fold_results = []  # test accuracy of each fold, in fold order
# NOTE(review): this splitter and its generator are never consumed below.
outer = StratifiedKFold(n_splits = outer_fold, shuffle = True)
outer_idx = outer.split(X, dfy.values)

# Positional row number of every sample; used to index the arrays X and Y.
dfy['idx'] = range(len(dfy))

print('#'*50  )
run_one_res = []  # NOTE(review): not used in this cell
for i in range(outer_fold):  # follow outer_fold instead of a second hard-coded 10

    fold_num = "fold_%s" % str(i).zfill(2)
    save_path = './c1_c5_history/c1/%s' % fold_num
    # exist_ok avoids the check-then-create race of os.path.exists + os.makedirs.
    os.makedirs(save_path, exist_ok=True)

    # Recover this fold's test samples from the stored label file.
    # NOTE(review): the folds are read from ./fold_results_c5 (not a c1 folder) --
    # presumably so that this run reuses the same splits; confirm.
    old_save_path = './fold_results_c5/%s' % fold_num
    test_index = pd.read_csv(os.path.join(old_save_path, 'test_true_label.csv'), index_col=0).index
    is_test = dfy.index.isin(test_index)
    test_idx = dfy.loc[is_test, 'idx'].tolist()
    train_idx = dfy.loc[~is_test, 'idx'].tolist()
    if not test_idx or not train_idx:
        # A stored index that matches none (or all) of the samples would otherwise
        # only fail later, inside model fitting.
        raise ValueError('%s: stored test index selects %d test and %d train samples'
                         % (fold_num, len(test_idx), len(train_idx)))

    testX, testY = X[test_idx], Y[test_idx]
    trainX, trainY = X[train_idx], Y[train_idx]

    # Save the true class position (y_true) next to the sample's dfy columns.
    test_rows = dfy.iloc[test_idx]
    test_true_label = pd.DataFrame(testY).idxmax(axis=1).to_frame(name = 'y_true')
    test_true_label.index = test_rows.index
    test_true_label = test_true_label.join(test_rows)
    test_true_label.to_csv(os.path.join(save_path, 'test_true_label.csv'))

    print("\n input train and test X shape is %s, %s " % (trainX.shape,  testX.shape))

    # NOTE(review): gpuid is hard-coded to device 4.
    clf = AggModel.MultiClassEstimator(epochs = 100, batch_size = 64, lr = 1e-3,  monitor = 'val_auc',
                                       gpuid = 4, verbose = 1, metric = 'ACC', ) #
    # NOTE(review): the test fold is also passed to fit() as the validation data that
    # 'val_auc' is monitored on -- confirm this does not influence model selection.
    clf.fit(trainX, trainY, testX, testY)

    clf._model.save(os.path.join(save_path, 'model.h5'))

    pd.DataFrame(clf.history.history).to_csv(os.path.join(save_path, 'history_%s.csv' % fold_num))

    # Predicted class position per test sample, aligned with the true-label index.
    test_pred_label = pd.DataFrame(clf.predict(testX)).idxmax(axis=1).to_frame(name = 'y_pred')
    test_pred_label.index = test_true_label.index
    test_pred_label.to_csv(os.path.join(save_path, 'test_pred_label.csv'))

    test_avg_loss, test_avg_acc = clf._model.evaluate(testX, testY, verbose=0)
    print('test_avg_loss: %.3f, test_avg_acc: %.3f, ' % (test_avg_loss, test_avg_acc))

    each_fold_results.append(test_avg_acc)


##################################################

 input train and test X shape is (9401, 102, 102, 1), (1045, 102, 102, 1) 
{'epochs': 100, 'lr': 0.001, 'conv1_kernel_size': 13, 'dense_layers': [128], 'dense_avf': 'relu', 'batch_size': 64, 'dropout': 0.0, 'batch_norm': False, 'n_inception': 2, 'monitor': 'val_auc', 'patience': 10000, 'random_state': 32, 'verbose': 1, 'name': 'AggMap MultiClass Estimator', 'gpuid': '4'}
Train on 9401 samples, validate on 1045 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Ep