In [1]:
import os
import random
import numpy as np
import pandas as pd

from sklearn.metrics import auc, roc_curve, precision_recall_curve

import torch




In [5]:
def fill_mask(y_trn, m_trn):
  """ Impute masked-out labels with the per-column majority label.

  For every column, the number of positive labels is compared with the number
  of observed negatives (entries where the label is 0 and the mask is 1).
  Masked entries (mask == 0) of columns where positives outnumber observed
  negatives are incremented by 1; all other entries are left unchanged.

  Parameters
  ----------
  y_trn: array of shape (num_sample, num_column)
    label matrix; masked entries are expected to hold 0 so that adding 1
    yields a positive label.
  m_trn: array of the same shape as y_trn
    mask, 1 -> label observed, 0 -> label missing.

  Returns
  -------
  y_trn, m_trn: the filled label matrix and a new all-ones mask (float dtype,
    as produced by np.ones) of the same shape as the input mask.

  """

  y_trn = np.asarray(y_trn)
  m_trn = np.asarray(m_trn)

  y_pos = y_trn.sum(axis=0)
  y_neg = ((1 - y_trn) * m_trn).sum(axis=0)

  # Column-wise decision, broadcast over rows. Doing this with array ops
  # (instead of a Python loop per element) also keeps the (0, num_column)
  # shape intact when there are no samples.
  majority_positive = y_pos > y_neg
  y_add = ((m_trn == 0) & majority_positive).astype(int)

  y_filled = y_trn + y_add

  # After imputation every entry counts as observed.
  m_filled = np.ones(m_trn.shape)

  return y_filled, m_filled




def bin2idx(omic_bin):
  """ Transfer a binarized matrix into a index matrix (for input of embedding layer).

  Each output row lists, in increasing order, the 1-based column positions at
  which the corresponding input row equals 1, right-padded with 0.

  omic_bin: (num_sample, num_feature), each value in {0,1}
  omic_idx: (num_sample, max number of ones in a single row), dtype int.
    0 is used for padding, and therefore meaningful index starts from 1.

  """

  omic_bin = np.asarray(omic_bin)

  # Size the output from the same predicate that fills it. Summing the raw
  # values instead gives a float width for float-typed 0/1 matrices (which
  # np.zeros rejects) and a wrong width if any value is not exactly 0 or 1.
  is_set = omic_bin == 1
  num_per_sample = is_set.sum(axis=1)

  # max num of set features in a single sample; 0 when there are no samples
  num_max_omic = int(num_per_sample.max()) if num_per_sample.size else 0

  omic_idx = np.zeros((len(omic_bin), num_max_omic), dtype=int)
  for idx, row in enumerate(is_set):
    positions = np.flatnonzero(row) + 1  # shift so that 0 stays reserved for padding
    omic_idx[idx, :len(positions)] = positions

  return omic_idx

def split_dataset(dataset, ratio=0.8):
  """ Cut every entry of the dataset into a leading train part and a trailing test part.

  The split is positional: no shuffling is performed, so the first
  int(num_sample*ratio) samples become the training set and the rest the test set.

  Parameters
  ----------
  dataset: dict
    dict of sliceable sequences (lists/arrays) aligned on the sample axis;
    must contain the key "tmr", whose length defines the number of samples.
  ratio: float
    size(train_set)/size(train_set+test_set)

  Returns
  -------
  train_set, test_set: dict
    two dicts with the same keys as dataset.

  """

  cut = int(len(dataset["tmr"])*ratio)

  train_set, test_set = {}, {}
  for key, values in dataset.items():
    train_set[key] = values[:cut]
    test_set[key] = values[cut:]

  return train_set, test_set


def get_ptw_ids(drug_info, tgt):
  """ Map every target column (drug) to an integer id of its target pathway.

  Parameters
  ----------
  drug_info: DataFrame
    indexed by drug id, with a 'Target pathway' column.
  tgt: DataFrame
    its column labels must be convertible with int() and are looked up in
    drug_info.index; drugs that are not found get the pathway 'Unknown'.

  Returns
  -------
  ptw_ids: list of int
    one id per column of tgt. Ids are assigned in order of first appearance
    along tgt.columns, so the result is reproducible across interpreter runs
    (enumerating a set of strings is not, because of hash randomization).

  """

  drug2pw = dict(zip(drug_info.index, drug_info['Target pathway']))
  pws = [drug2pw.get(int(col), 'Unknown') for col in tgt.columns]

  # dict.fromkeys de-duplicates while keeping first-appearance order.
  pw2id = {pw: pw_id for pw_id, pw in enumerate(dict.fromkeys(pws))}

  return [pw2id[pw] for pw in pws]

def load_dataset(input_dir="data/input", drug_id=-1, shuffle_feature=False):
    """Load the target table and the four omic tables and align them on samples.

    Reads ``gdsc.csv`` (targets), ``drug_info_gdsc.csv`` (drug annotations) and
    ``mut_gdsc.csv``, ``cnv_gdsc.csv``, ``exp_gdsc.csv``, ``met_gdsc.csv`` from
    ``input_dir``, each with its first column as index. Only samples present in
    the index of every table are kept.

    Parameters
    ----------
    input_dir: str
        directory containing the CSV files listed above.
    drug_id: int
        currently unused; kept so existing callers keep working.
    shuffle_feature: bool
        currently unused; kept so existing callers keep working.

    Returns
    -------
    omics_data: dict
        ``<omic>_bin`` (values of the omic table, assumed to be 0/1) and
        ``<omic>_idx`` (output of ``bin2idx``) for each of mut/cnv/exp/met,
        plus ``tgt`` (targets with NaN replaced by 0, cast to int),
        ``msk`` (1 where the target was present, 0 where it was NaN) and
        ``tmr`` (list of sample names).
    ptw_ids: list of int
        pathway id of every target column, from ``get_ptw_ids``.
    """
    tgt = pd.read_csv(os.path.join(input_dir,'gdsc.csv'),index_col=0)
    drug_info = pd.read_csv(os.path.join(input_dir,'drug_info_gdsc.csv'),index_col=0)
    ptw_ids = get_ptw_ids(drug_info,tgt)

    omic_frames = {
        omic: pd.read_csv(os.path.join(input_dir,omic+'_'+'gdsc.csv'), index_col=0)
        for omic in ('mut', 'cnv', 'exp', 'met')
    }

    shared = set(tgt.index).intersection(*(frame.index for frame in omic_frames.values()))
    # Keep the samples in the order of the target table instead of set order:
    # set iteration order for strings changes between interpreter runs, which
    # would silently change the rows an ordered train/test split selects.
    # dict.fromkeys drops repeated labels while preserving that order.
    common_samples = list(dict.fromkeys(s for s in tgt.index if s in shared))

    tgt = tgt.loc[common_samples]

    tmr = list(tgt.index) # barcodes/names of tumors
    msk = tgt.notnull().astype(int).values # mask of target data: 1->data available, 0->nan
    tgt = tgt.fillna(0).astype(int).values # fill nan element of target with 0.

    omics_data = {}
    for omic, frame in omic_frames.items():
        omic_bin = frame.loc[common_samples].values
        omics_data[omic+'_bin'] = omic_bin
        omics_data[omic+'_idx'] = bin2idx(omic_bin)

    omics_data['tgt'] = tgt
    omics_data['msk'] = msk
    omics_data['tmr'] = tmr

    return omics_data, ptw_ids

In [6]:
# Load targets, mask and omic matrices from ../data/input.
# NOTE(review): drug_id is accepted by load_dataset but never read inside it.
dataset, ptw_ids = load_dataset(input_dir='../data/input',  drug_id=-1)

# Positional 80/20 split: split_dataset does not shuffle, so the result
# depends entirely on the sample order produced by load_dataset.
train_set, test_set = split_dataset(dataset, ratio=0.8)

In [14]:
# List the entries produced by load_dataset.
dataset.keys()

dict_keys(['mut_bin', 'mut_idx', 'cnv_bin', 'cnv_idx', 'exp_bin', 'exp_idx', 'met_bin', 'met_idx', 'tgt', 'msk', 'tmr'])

In [21]:
# Inspect the training target matrix (one row per sample, one column per target column of gdsc.csv).
train_set['tgt']

array([[0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [18]:
# Impute missing training labels with the per-column majority label and
# replace the training mask by an all-ones mask (overwrites both entries).
train_set['tgt'],train_set['msk'] = fill_mask(train_set['tgt'],train_set['msk'])

In [20]:
# Sanity check: (number of training samples, number of target columns).
train_set['tgt'].shape

(676, 260)

In [7]:
# Impute missing labels on the full dataset and reset its mask to all ones.
# NOTE(review): this cell carries execution count 7 but sits below cells 18-21,
# so the notebook was not run top to bottom; re-check with Restart & Run All.
# NOTE(review): fill_mask derives its column statistics from every row passed in,
# i.e. here also from samples that split_dataset assigned to test_set. Confirm
# the filled full-dataset labels are not used for evaluation.
dataset['tgt'],dataset['msk'] = fill_mask(dataset['tgt'],dataset['msk'])

In [9]:
# Inspect the full target matrix after imputation.
dataset['tgt']

array([[0, 0, 0, ..., 0, 0, 0],
       [1, 0, 1, ..., 1, 1, 1],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 1, 0, 1],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 1, 0, 1]])