In [1]:
# -*- coding: utf-8 -*-
"""
Created on Sun Aug 16 17:10:53 2020

@author: wanxiang.shen@u.nus.edu
"""

import warnings, os
warnings.filterwarnings("ignore")


import pandas as pd
import numpy as np
from glob import glob

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import confusion_matrix, precision_recall_curve, roc_auc_score
from sklearn.metrics import auc as calculate_auc

import matplotlib.pyplot as plt
import seaborn as sns

from aggmap import AggMap, loadmap
from aggmap import show
np.random.seed(666) # seed NumPy's global RNG, just for repeatable results

from aggmap.AggModel import RegressionEstimator, MultiClassEstimator, MultiLabelEstimator



In [2]:
# Collect the per-class tables. Sorting keeps the row order of the combined
# frame independent of the order in which glob returns the files.
flist = sorted(glob('./data/*.csv.gzip'))
if not flist:
    # Fail with an explicit message instead of pd.concat's generic
    # "No objects to concatenate" error.
    raise FileNotFoundError("no files match './data/*.csv.gzip'; check the working directory")

fall = []
for i in flist:
    df1 = pd.read_csv(i, compression='gzip', index_col = 0)
    # The class label is the file name up to '.csv'. os.path.basename is used
    # instead of splitting on '/' so Windows-style separators are handled too.
    df1['class'] = os.path.basename(i).split('.csv')[0]
    fall.append(df1)

# Stack all classes row-wise; 'class' was added last, so it is the last column.
df = pd.concat(fall, axis=0)

In [3]:
# Split the combined table by position: every column except the last one is
# treated as a feature, the last column is kept (as a one-column frame) as label.
label_cols = df.columns[-1:]
feature_cols = df.columns[:-1]

dfy = df[label_cols]
dfx = np.log2(df[feature_cols] + 1)  # log2(x + 1) transform of the feature values

In [None]:
# NOTE(review): absolute, machine-specific path to a previously saved map
# object; it has to exist on the host that runs this notebook.
mp_path = '/raid/shenwanxiang/agg_mp_object/pan-cancer.mp'
mp = loadmap(mp_path)

# Transform every row of dfx with the loaded map using the chosen scaling.
# Presumably X comes back as (n_samples, width, height, channels) - TODO confirm.
scale_method = 'minmax'
X = mp.batch_transform(dfx.values, scale_method = scale_method)

# One-hot encode the class names, one column per class.
Y = pd.get_dummies(dfy['class']).values

 10%|9         | 1018/10446 [03:16<33:38,  4.67it/s]

In [None]:
# Display the feature-map shape stored on the loaded map object.
mp.fmap_shape

In [None]:
# Quick check: shapes of the label frame and of the log-transformed feature frame.
dfy.shape, dfx.shape

In [None]:
# Class names in the column order produced by pd.get_dummies, i.e. the same
# encoding call that was used to build Y.
class_ = pd.get_dummies(dfy['class']).columns
class_

## Feature maps: first sample of each class

In [None]:
fontsize = 20
channel_colors = pd.Series(mp.colormaps).tolist()

# Positional row numbers of the samples of every class; reset_index() makes the
# index positional, so the numbers can be used directly on X.
idx = dfy.reset_index().groupby('class').apply(lambda grp: grp.index.tolist())
rows = idx.index.tolist()

fig, axes = plt.subplots(nrows=3, ncols=11, figsize=(25, 8), sharex=True, sharey=True)

# One panel per class (at most 3 x 11 = 33 panels), each showing the first
# sample of that class rendered with the map's own colour list.
for ax, row in zip(axes.ravel(), rows):
    first_pos = idx.loc[row][0]
    fmap = X[first_pos]
    show.imshow(fmap, ax=ax, mode='dark', color_list=channel_colors,
                x_max=fmap.max() - 0.05, vmin=-0.1)
    # Label the panel with the part of the class name after the last '_'.
    ax.set_xlabel(row.split('_')[-1], fontsize=fontsize)

fig.tight_layout()
fig.savefig('./fmp-5.png', bbox_inches='tight', dpi=400)

In [None]:
fontsize = 20

# Positional row numbers of the samples of every class; reset_index() makes the
# index positional, so the numbers can be used directly on X.
idx = dfy.reset_index().groupby('class').apply(lambda grp: grp.index.tolist())
rows = idx.index.tolist()

fig, axes = plt.subplots(nrows=3, ncols=11, figsize=(25, 8), sharex=True, sharey=True)

# One panel per class (at most 3 x 11 = 33 panels), each showing the first
# sample of that class.
for ax, row in zip(axes.ravel(), rows):
    fmap = X[idx.loc[row][0]]
    # Sum over the last axis (presumably the channels) to draw one gray image.
    ax.imshow(fmap.sum(axis=-1), cmap='gray')
    ax.set_xticks([])
    ax.set_yticks([])
    # Label the panel with the part of the class name after the last '_'.
    ax.set_xlabel(row.split('_')[-1], fontsize=fontsize)

fig.tight_layout()
fig.savefig('./fmp-1.png', bbox_inches='tight', dpi=400)

## 10-fold cross-validation performance

In [None]:
def get_best_epochs(X, y, n_splits = 5, gpuid = 7, random_state = 1):
    """Estimate how many epochs to train for, using an inner stratified CV.

    For every inner fold a MultiClassEstimator is fitted on the training part
    with the held-out part passed as validation data (monitor='val_loss',
    patience=20), and the best epoch recorded by the estimator is collected.

    Parameters
    ----------
    X : array indexable by integer index arrays
        Samples; X[i] is the input of sample i.
    y : array-like
        Labels. A 2-D array is treated as one-hot/indicator rows and reduced
        with argmax for stratification; a 1-D array is used as class ids.
    n_splits : int, default 5
        Number of inner folds.
    gpuid : int, default 7
        GPU id forwarded to MultiClassEstimator.
    random_state : int, default 1
        Seed for shuffling the inner folds.

    Returns
    -------
    int
        Mean of the per-fold best epochs, truncated to int and never below 1.
    """
    y = np.asarray(y)

    # Stratify on the real class id. Using a single one-hot column (y[:, 1])
    # would only balance "class 1 vs. rest" and ignore every other class.
    strat_labels = y.argmax(axis=1) if y.ndim > 1 else y

    kf = StratifiedKFold(n_splits = n_splits, shuffle = True, random_state = random_state)

    epochs = []
    for train_idx, test_idx in kf.split(X, strat_labels):
        clf = MultiClassEstimator(batch_size = 64,
                                  lr = 1e-3,
                                  batch_norm = True,
                                  n_inception = 3,
                                  gpuid = gpuid,
                                  verbose = 0,
                                  patience = 20,
                                  monitor = 'val_loss')

        # The held-out part of the fold is passed as validation data.
        clf.fit(X[train_idx], y[train_idx], X[test_idx], y[test_idx])
        epochs.append(clf._performance.best_epoch)

    # int() truncates; keep at least one epoch so the caller never ends up
    # training a model for zero epochs.
    best_epoch = max(1, int(np.mean(epochs)))
    print('Best Avg. Epochs: %s' % best_epoch)
    return best_epoch

# Outer cross-validation: for every fold the number of epochs is chosen on the
# training part only (get_best_epochs), a fresh model is trained on the whole
# training part and evaluated on the held-out part. Labels, predictions and the
# model of each fold are written to ./new_res/fold_XX/.
outer_fold = 10

each_fold_results = []
# NOTE(review): no random_state is passed here, so the shuffling presumably
# depends on the global NumPy seed set in the first cell - TODO confirm.
outer = StratifiedKFold(n_splits = outer_fold, shuffle = True)
outer_idx = outer.split(X, dfy.values)

print('#'*50  )
run_one_res = []  # not filled anywhere in the visible cells
for i, idx in enumerate(outer_idx):

    fold_num = "fold_%s" % str(i).zfill(2)

    save_path = './new_res/%s' % fold_num
    os.makedirs(save_path, exist_ok = True)

    train_idx, test_idx = idx

    testY = Y[test_idx]
    testX = X[test_idx]

    trainX = X[train_idx]
    trainY = Y[train_idx]

    # Integer class id (column position in the one-hot matrix) per test sample.
    test_samples = dfy.iloc[test_idx]
    test_true_label = pd.DataFrame(testY).idxmax(axis=1).to_frame(name = 'y_true')
    test_true_label.index = test_samples.index
    # Attach the class names by position. An index-based join would duplicate
    # or misalign rows whenever the sample index is not unique.
    test_true_label['class'] = test_samples['class'].values
    test_true_label.to_csv(os.path.join(save_path, 'test_true_label.csv'))

    best_epochs = get_best_epochs(trainX, trainY)

    print("\n input train and test X shape is %s, %s " % (trainX.shape,  testX.shape))

    # Final model of this fold: trained on the full training part for the
    # number of epochs selected above.
    clf = MultiClassEstimator(epochs = best_epochs, batch_size = 64, lr = 1e-3,
                              batch_norm = True, n_inception = 3,
                              gpuid = 7, verbose = 1, metric = 'ACC', ) #
    clf.fit(trainX, trainY)

    clf._model.save(os.path.join(save_path, 'model.h5'))

    # Predicted class id = position of the largest value in each prediction row.
    test_pred_label = pd.DataFrame(clf.predict(testX)).idxmax(axis=1).to_frame(name = 'y_pred')
    test_pred_label.index = test_true_label.index
    test_pred_label.to_csv(os.path.join(save_path, 'test_pred_label.csv'))

    test_avg_loss, test_avg_acc = clf._model.evaluate(testX, testY, verbose=0)
    print('test_avg_loss: %.3f, test_avg_acc: %.3f, ' % (test_avg_loss, test_avg_acc))

    each_fold_results.append(test_avg_acc)

In [None]:
# Show the parameters of the estimator left over from the last outer CV fold.
clf.get_params()