This is a notebook for testing the dataloaders

In [7]:
import cellbox
import os
import numpy as np
import pandas as pd
import tensorflow.compat.v1 as tf
import shutil
import argparse
import json
tf.disable_v2_behavior()

### Tensorflow original code

In [18]:
def set_seed(in_seed):
    int_seed = int(in_seed)
    tf.compat.v1.set_random_seed(int_seed)
    np.random.seed(int_seed)


def prepare_workdir(in_cfg):
    # Read Data
    in_cfg.root_dir = os.getcwd()
    in_cfg.node_index = pd.read_csv(in_cfg.node_index_file, header=None, names=None) \
        if hasattr(in_cfg, 'node_index_file') else pd.DataFrame(np.arange(in_cfg.n_x))

    # Create Output Folder
    experiment_path = 'results/{}_{}'.format(in_cfg.experiment_id, md5)
    try:
        os.makedirs(experiment_path)
    except Exception:
        pass
    out_cfg = vars(in_cfg)
    out_cfg = {key: out_cfg[key] for key in out_cfg if type(out_cfg[key]) is not pd.DataFrame}
    os.chdir(experiment_path)
    json.dump(out_cfg, open('config.json', 'w'), indent=4)

    if "leave one out" in in_cfg.experiment_type:
        try:
            in_cfg.model_prefix = '{}_{}'.format(in_cfg.model_prefix, in_cfg.drug_index)
        except Exception('Drug index not specified') as e:
            raise e

    in_cfg.working_index = in_cfg.model_prefix + "_" + str(working_index).zfill(3)

    try:
        shutil.rmtree(in_cfg.working_index)
    except Exception:
        pass
    os.makedirs(in_cfg.working_index)
    os.chdir(in_cfg.working_index)

    with open("record_eval.csv", 'w') as f:
        f.write("epoch,iter,train_loss,valid_loss,train_mse,valid_mse,test_mse,time_elapsed\n")

    print('Working directory is ready at {}.'.format(experiment_path))
    return 0

experiment_config_path = "/users/ngun7t/Documents/cellbox-jun-6/configs_dev/Example.single_to_combo.json"
working_index = 0
stage = {
    "nT": 100,
    "sub_stages":[
        {"lr_val": 0.1,"l1lambda": 0.01, "n_iter_patience":1000},
        {"lr_val": 0.01,"l1lambda": 0.01},
        {"lr_val": 0.01,"l1lambda": 0.0001},
        {"lr_val": 0.001,"l1lambda": 0.00001}
    ]}

cfg = cellbox.config.Config(experiment_config_path)
cfg.ckpt_path_full = os.path.join('./', cfg.ckpt_name)
md5 = cellbox.utils.md5(cfg)
cfg.drug_index = 5         # Change this for testing purposes
cfg.seed = working_index + cfg.seed if hasattr(cfg, "seed") else working_index + 1000
set_seed(cfg.seed)
print(vars(cfg))

prepare_workdir(cfg)
logger = cellbox.utils.TimeLogger(time_logger_step=1, hierachy=3)
args = cfg
for i, stage in enumerate(cfg.stages):
    set_seed(cfg.seed)
    cfg = cellbox.dataset.factory(cfg)
    #args.sub_stages = stage['sub_stages']
    #args.n_T = stage['nT']
    #model = cellbox.model.factory(args)
    if i == 0: break

{'experiment_id': 'Example_S2C', 'model_prefix': 'seed', 'ckpt_name': 'model11.ckpt', 'export_verbose': 3, 'experiment_type': 'single to combo', 'sparse_data': False, 'batchsize': 16, 'trainset_ratio': 0.7, 'validset_ratio': 0.8, 'n_batches_eval': None, 'add_noise_level': 0, 'dT': 0.1, 'ode_solver': 'heun', 'envelope_form': 'tanh', 'envelope': 0, 'pert_form': 'by u', 'ode_degree': 1, 'ode_last_steps': 2, 'n_iter_buffer': 50, 'n_iter_patience': 100, 'weight_loss': 'None', 'l1lambda': 0.0001, 'l2lambda': 0.0001, 'model': 'CellBox', 'pert_file': '/users/ngun7t/Documents/cellbox-jun-6/data/pert_ind_last_col.csv', 'expr_file': '/users/ngun7t/Documents/cellbox-jun-6/data/expr_ind_last_col.csv', 'node_index_file': '/users/ngun7t/Documents/cellbox-jun-6/data/node_Index.csv', 'n_protein_nodes': 82, 'n_activity_nodes': 87, 'n_x': 100, 'envelop_form': 'tanh', 'envelop': 0, 'n_epoch': 10000, 'n_iter': 10000, 'stages': [{'nT': 100, 'sub_stages': [{'lr_val': 0.1, 'l1lambda': 0.01, 'n_iter_patience':

In [19]:
print(cfg.dataset["pert_train"].shape)
print(cfg.dataset["pert_valid"].shape)
print(cfg.dataset["pert_test"].shape)
print(type(cfg.dataset["pert_train"]))

d = vars(cfg)
iter_train = d["iter_train"]
iter_monitor = d["iter_monitor"]
feed_dict = d["feed_dicts"]

(18, 100)
(5, 100)
(66, 100)
<class 'numpy.ndarray'>


#### A function to identify which row will not show up in loo mode

In [20]:
def populate_drug_indices_map():
    drug_indices_map = []
    for drug_index in range(14):
        double_idx = cfg.loo.all(axis=1)
        testidx = (cfg.loo == drug_index).any(axis=1)

        if False:
            testidx = pd.concat([testidx, double_idx], axis=1)
            testidx = testidx.all(axis=1)

        loo_label = pd.read_csv("/users/ngun7t/Documents/cellbox-jun-6/data/loo_label.csv", header=None)[testidx]
        for i in range(14):
            if (loo_label == i).any(axis=1).all():
                drug_indices_map.append(i)
                break
    return drug_indices_map

def loo_rows(drug_index, drug_indices_map, singles):
    true_drug_index = drug_indices_map[drug_index]
    loo_label = pd.read_csv("/users/ngun7t/Documents/cellbox-jun-6/data/loo_label.csv", header=None)
    ind_rows = loo_label.index[(loo_label[[0, 1]] == true_drug_index).any(axis=1)].tolist()
    return np.array(ind_rows) - 1

drug_indices_map = populate_drug_indices_map()
if cfg.experiment_type == "leave one out (w/o single)":
    inds = loo_rows(cfg.drug_index, drug_indices_map, False)
else:
    inds = loo_rows(cfg.drug_index, drug_indices_map, True)

In [5]:
drug_indices_map

[0, 1, 2, 3, 4, 5, 6, 7, 10, 11, 12, 8, 9]

#### A function to identify which row will not show up in single-to-combo mode

In [21]:
def s2c_row_inds():
    loo_label = pd.read_csv("/users/ngun7t/Documents/cellbox-jun-6/data/loo_label.csv", header=None)
    rows_with_single_drugs = loo_label.index[(loo_label[[0, 1]] == 0).any(axis=1)].tolist()
    rows_with_multiple_drugs = list(set(list(range(loo_label.shape[0]))) - set(rows_with_single_drugs))
    return rows_with_single_drugs, rows_with_multiple_drugs
    
rows_with_single_drugs, rows_with_multiple_drugs = s2c_row_inds()

In [22]:
from tensorflow.core.protobuf import rewriter_config_pb2
from tensorflow.compat.v1.errors import OutOfRangeError

config = tf.compat.v1.ConfigProto()
off = rewriter_config_pb2.RewriterConfig.OFF
config.graph_options.rewrite_options.memory_optimization = off

# Launching session
sess = tf.compat.v1.Session(config=config)
sess.run(tf.compat.v1.global_variables_initializer())

items_pert, items_expr = [], []
next_element = iter_train.get_next()

sess.run(iter_train.initializer, feed_dict=feed_dict['train_set'])

try:
    while True:
        result1, result2 = sess.run(next_element, feed_dict=feed_dict['train_set'])
        items_pert.append(result1)
        items_expr.append(result2)
except OutOfRangeError:
    print("Finished")

Finished


In [23]:
len(items_pert)

2

In [25]:
for pert in items_pert:
    ind = pert[:, -1]
    print(f"Ind: {ind} shares the ind that contains the drug index: {np.intersect1d(ind, rows_with_single_drugs)}")
    
#cfg.drug_index

Ind: [41. 76. 11. 34. 24. 66. 23. 29. 42. 70. 79. 43. 51. 69. 54. 53.] shares the ind that contains the drug index: [11. 23. 24. 29. 34. 41. 42. 43. 51. 53. 54. 66. 69. 70. 76. 79.]
Ind: [88. 12.] shares the ind that contains the drug index: [12. 88.]


### Pytorch code

In [2]:
def set_seed(in_seed):
    int_seed = int(in_seed)
    tf.compat.v1.set_random_seed(int_seed)
    np.random.seed(int_seed)


def prepare_workdir(in_cfg):
    # Read Data
    in_cfg.root_dir = os.getcwd()
    in_cfg.node_index = pd.read_csv(in_cfg.node_index_file, header=None, names=None) \
        if hasattr(in_cfg, 'node_index_file') else pd.DataFrame(np.arange(in_cfg.n_x))

    # Create Output Folder
    experiment_path = 'results/{}_{}'.format(in_cfg.experiment_id, md5)
    try:
        os.makedirs(experiment_path)
    except Exception:
        pass
    out_cfg = vars(in_cfg)
    out_cfg = {key: out_cfg[key] for key in out_cfg if type(out_cfg[key]) is not pd.DataFrame}
    os.chdir(experiment_path)
    json.dump(out_cfg, open('config.json', 'w'), indent=4)

    if "leave one out" in in_cfg.experiment_type:
        try:
            in_cfg.model_prefix = '{}_{}'.format(in_cfg.model_prefix, in_cfg.drug_index)
        except Exception('Drug index not specified') as e:
            raise e

    in_cfg.working_index = in_cfg.model_prefix + "_" + str(working_index).zfill(3)

    try:
        shutil.rmtree(in_cfg.working_index)
    except Exception:
        pass
    os.makedirs(in_cfg.working_index)
    os.chdir(in_cfg.working_index)

    with open("record_eval.csv", 'w') as f:
        f.write("epoch,iter,train_loss,valid_loss,train_mse,valid_mse,test_mse,time_elapsed\n")

    print('Working directory is ready at {}.'.format(experiment_path))
    return 0

experiment_config_path = "/users/ngun7t/Documents/cellbox-jun-6/configs_dev/Example.leave_one_drug_out.json"
working_index = 0
stage = {
    "nT": 100,
    "sub_stages":[
        {"lr_val": 0.1,"l1lambda": 0.01, "n_iter_patience":1000},
        {"lr_val": 0.01,"l1lambda": 0.01},
        {"lr_val": 0.01,"l1lambda": 0.0001},
        {"lr_val": 0.001,"l1lambda": 0.00001}
    ]}

cfg = cellbox.config.Config(experiment_config_path)
cfg.ckpt_path_full = os.path.join('./', cfg.ckpt_name)
md5 = cellbox.utils.md5(cfg)
cfg.drug_index = 5         # Change this for testing purposes
cfg.seed = working_index + cfg.seed if hasattr(cfg, "seed") else working_index + 1000
set_seed(cfg.seed)
print(vars(cfg))

prepare_workdir(cfg)
logger = cellbox.utils.TimeLogger(time_logger_step=1, hierachy=3)
args = cfg
for i, stage in enumerate(cfg.stages):
    set_seed(cfg.seed)
    cfg = cellbox.dataset_torch.factory(cfg)
    #args.sub_stages = stage['sub_stages']
    #args.n_T = stage['nT']
    #model = cellbox.model.factory(args)
    if i == 0: break

{'experiment_id': 'Example_LOO', 'model_prefix': 'drug', 'ckpt_name': 'model11.ckpt', 'export_verbose': 3, 'experiment_type': 'leave one out (w/o single)', 'sparse_data': False, 'batchsize': 16, 'trainset_ratio': 0.7, 'validset_ratio': 0.8, 'n_batches_eval': None, 'add_noise_level': 0, 'dT': 0.1, 'ode_solver': 'heun', 'envelope_form': 'tanh', 'envelope': 0, 'pert_form': 'by u', 'ode_degree': 1, 'ode_last_steps': 2, 'n_iter_buffer': 50, 'n_iter_patience': 100, 'weight_loss': 'None', 'l1lambda': 0.0001, 'l2lambda': 0.0001, 'model': 'CellBox', 'pert_file': '/users/ngun7t/Documents/cellbox-jun-6/data/pert_ind_last_col.csv', 'expr_file': '/users/ngun7t/Documents/cellbox-jun-6/data/expr_ind_last_col.csv', 'node_index_file': '/users/ngun7t/Documents/cellbox-jun-6/data/node_Index.csv', 'n_protein_nodes': 82, 'n_activity_nodes': 87, 'n_x': 100, 'envelop_form': 'tanh', 'envelop': 0, 'n_epoch': 10000, 'n_iter': 10000, 'stages': [{'nT': 100, 'sub_stages': [{'lr_val': 0.1, 'l1lambda': 0.01, 'n_iter

In [6]:
def populate_drug_indices_map():
    drug_indices_map = []
    for drug_index in range(14):
        double_idx = cfg.loo.all(axis=1)
        testidx = (cfg.loo == drug_index).any(axis=1)

        if False:
            testidx = pd.concat([testidx, double_idx], axis=1)
            testidx = testidx.all(axis=1)

        loo_label = pd.read_csv("/users/ngun7t/Documents/cellbox-jun-6/data/loo_label.csv", header=None)[testidx]
        for i in range(14):
            if (loo_label == i).any(axis=1).all():
                drug_indices_map.append(i)
                break
    return drug_indices_map

def loo_rows(drug_index, drug_indices_map, singles):
    true_drug_index = drug_indices_map[drug_index]
    loo_label = pd.read_csv("/users/ngun7t/Documents/cellbox-jun-6/data/loo_label.csv", header=None)
    ind_rows = loo_label.index[(loo_label[[0, 1]] == true_drug_index).any(axis=1)].tolist()
    return np.array(ind_rows) - 1

drug_indices_map = populate_drug_indices_map()
if cfg.experiment_type == "leave one out (w/o single)":
    inds = loo_rows(cfg.drug_index, drug_indices_map, False)
else:
    inds = loo_rows(cfg.drug_index, drug_indices_map, True)

In [3]:
items_pert, items_expr = [], []
for pert, expr in cfg.iter_train:
    items_pert.append(pert)
    items_expr.append(expr)

print(len(items_pert))

4


In [6]:
np.array(items_pert[0]).shape

(16, 100)

In [8]:
for pert in items_pert:
    ind = pert[:, -1]
    print(f"Ind: {ind} shares the ind that contains the drug index: {np.intersect1d(ind, inds+1)}")

Ind: tensor([21., 42., 28.,  8., 11., 66., 86., 26., 45.,  7., 19., 64., 49.,  4.,
        75., 44.]) shares the ind that contains the drug index: []
Ind: tensor([55., 48., 85., 31., 10., 50., 24., 71., 87., 68., 33., 27., 23., 83.,
        30., 67.]) shares the ind that contains the drug index: []
Ind: tensor([74.,  5.,  9., 18., 32., 60., 65., 15., 61., 25., 88., 57., 13.,  6.,
        81.,  2.]) shares the ind that contains the drug index: []
Ind: tensor([77., 20., 58., 17., 80., 78., 54., 69., 43., 12., 29., 14.]) shares the ind that contains the drug index: []


### Some random code

In [5]:
loo_label = pd.read_csv("/users/ngun7t/Documents/cellbox-jun-6/data/loo_label.csv", header=None)
ind_rows = loo_label.index[(loo_label[[0, 1]] == 0).any(axis=1)].tolist()

In [8]:
loo_label.iloc[ind_rows]

Unnamed: 0,0,1
0,1,0
10,1,0
11,2,0
12,2,0
23,3,0
24,3,0
29,4,0
33,4,0
34,5,0
41,5,0


In [47]:
drug_index = int(cfg.drug_index)
double_idx = cfg.loo.all(axis=1)

testidx = (cfg.loo == drug_index).any(axis=1)

if True:
    testidx = pd.concat([testidx, double_idx], axis=1)
    testidx = testidx.all(axis=1)

nexp, _ = cfg.pert.shape
nvalid = nexp - sum(testidx)
ntrain = int(nvalid * cfg.validset_ratio)

valid_pos = np.random.choice(range(nvalid), nvalid, replace=False)

In [52]:
cfg.loo

Unnamed: 0,0,1
0,1,0
1,1,3
2,1,4
3,1,5
4,1,6
...,...,...
84,7,10
85,10,12
86,8,10
87,9,10


In [51]:
testidx

0     False
1     False
2     False
3     False
4     False
      ...  
84     True
85     True
86     True
87     True
88    False
Length: 89, dtype: bool

In [45]:
(cfg.loo == 10).any(axis=1)

0     False
1     False
2     False
3     False
4     False
      ...  
84     True
85     True
86     True
87     True
88     True
Length: 89, dtype: bool

In [73]:
def pad_and_realign(x, length, idx_shift=0):
    x -= idx_shift
    padded = np.pad(x, (0, length - len(x)), 'constant')
    return padded

group_df = pd.DataFrame(np.where(cfg.pert != 0), index=['row_id', 'pert_idx']).T.groupby('row_id')
max_combo_degree = group_df.pert_idx.count().max()
cfg.loo = pd.DataFrame(group_df.pert_idx.apply(
    lambda x: pad_and_realign(x, max_combo_degree, cfg.n_activity_nodes - 1)
).tolist())

### Config file and csv data

#### Read and check shape

In [14]:
import pandas as pd

pert_df = pd.read_csv("/users/ngun7t/Documents/cellbox/data/pert.csv")
expr_df = pd.read_csv("/users/ngun7t/Documents/cellbox-jun-6/data/expr.csv")
print(pert_df.shape)
print(expr_df.shape)

(88, 99)
(88, 99)


In [16]:
pert_df.head()

Unnamed: 0,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,...,0.88,0.89,0.90,0.91,0.92,0.93,0.94,0.95,0.96,0.97
0,0,0,0,0,0,0,0,0,0,0,...,-0.551,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,0,0,0,0,0,0,0,0,0,...,0.0,-0.415,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,-0.515,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,-0.622,0.0,0.0,0.0,0.0,0.0,0.0
4,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,-0.943,0.0,0.0,0.0,0.0,0.0


In [17]:
expr_df.head()

Unnamed: 0,-0.435,-0.608,-0.855,-0.712,-0.113,-0.089,0.096,0.291,0.428,0.375,...,0.4,0.5,0.6,0.7,0.8,0.9,0.10,0.11,0.12,0.13
0,-0.308,-0.322,-0.999,-0.594,0.022,-0.061,0.093,0.222,0.255,0.645,...,-0.89,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.402,-0.549,-0.837,-0.675,-0.011,-0.103,0.089,0.418,0.276,0.439,...,0.0,-0.393,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.475,-0.652,-0.935,-0.751,-0.085,-0.019,0.064,0.207,0.147,0.416,...,0.0,0.0,-0.474,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.611,-1.097,-1.234,-0.928,-0.114,-0.113,-0.002,-0.042,0.277,0.732,...,0.0,0.0,0.0,-0.552,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.649,-0.49,-0.294,-0.491,-0.251,-0.361,-0.018,-0.144,1.191,0.098,...,0.0,0.0,0.0,0.0,-0.737,0.0,0.0,0.0,0.0,0.0


#### Add index to Cellbox csv files

In [2]:
import pandas as pd
import numpy as np

In [23]:
pert_df = pd.read_csv("/users/ngun7t/Documents/cellbox-jun-6/data/pert.csv", header=None)
expr_df = pd.read_csv("/users/ngun7t/Documents/cellbox-jun-6/data/expr.csv", header=None)

nrows = pert_df.shape[0]
ind = list(np.arange(0, nrows))

pert_df[99] = ind
pert_df.to_csv("/users/ngun7t/Documents/cellbox-jun-6/data/pert_ind_last_col.csv", index=False, header=False)

expr_df[99] = ind
expr_df.to_csv("/users/ngun7t/Documents/cellbox-jun-6/data/expr_ind_last_col.csv", index=False, header=False)

In [24]:
pert_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,0,0,0,0,0,0,0,0,0,0,...,-0.415,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
3,0,0,0,0,0,0,0,0,0,0,...,0.0,-0.515,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
4,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,-0.622,0.0,0.0,0.0,0.0,0.0,0.0,4


In [25]:
pert_df = pd.read_csv("/users/ngun7t/Documents/cellbox-jun-6/data/pert_ind_last_col.csv", header=None)
pert_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,0,0,0,0,0,0,0,0,0,0,...,-0.415,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
3,0,0,0,0,0,0,0,0,0,0,...,0.0,-0.515,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
4,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,-0.622,0.0,0.0,0.0,0.0,0.0,0.0,4


### Try out new CellBox Pytorch DataLoader

In [13]:
iter_train = cfg.iter_train

perts = []
exprs = []
for pert, expr in iter_train:
    perts.append(pert)
    exprs.append(expr)
    #print(f"Shape of pert: {pert.size()} and shape of expr: {expr.size()}")

<class 'torch.utils.data.dataloader.DataLoader'>
Shape of pert: torch.Size([4, 99]) and shape of expr: torch.Size([4, 99])
Shape of pert: torch.Size([4, 99]) and shape of expr: torch.Size([4, 99])
Shape of pert: torch.Size([4, 99]) and shape of expr: torch.Size([4, 99])
Shape of pert: torch.Size([4, 99]) and shape of expr: torch.Size([4, 99])
Shape of pert: torch.Size([4, 99]) and shape of expr: torch.Size([4, 99])
Shape of pert: torch.Size([4, 99]) and shape of expr: torch.Size([4, 99])
Shape of pert: torch.Size([4, 99]) and shape of expr: torch.Size([4, 99])
Shape of pert: torch.Size([4, 99]) and shape of expr: torch.Size([4, 99])
Shape of pert: torch.Size([4, 99]) and shape of expr: torch.Size([4, 99])
Shape of pert: torch.Size([4, 99]) and shape of expr: torch.Size([4, 99])
Shape of pert: torch.Size([4, 99]) and shape of expr: torch.Size([4, 99])
Shape of pert: torch.Size([4, 99]) and shape of expr: torch.Size([4, 99])
Shape of pert: torch.Size([1, 99]) and shape of expr: torch.Siz

In [6]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, TensorDataset
from torch.utils.data import DataLoader

In [7]:
full_data = torch.from_numpy(cfg.dataset["pert_full"].to_numpy())
temp_y = torch.from_numpy(np.zeros((full_data.shape[0], 1)))

print(full_data.size())
print(temp_y.size())

dataset = TensorDataset(full_data, temp_y)
dataloader = DataLoader(
    dataset, batch_size=3, shuffle=True
)

torch.Size([89, 99])
torch.Size([89, 1])


### Scratch

In [30]:
cfg.loo

Unnamed: 0,0,1,2
0,1,0,0
1,1,3,13
2,1,4,13
3,1,5,13
4,1,6,13
...,...,...,...
84,7,10,13
85,10,12,13
86,8,10,13
87,9,10,13


In [94]:
a = np.pad([1,2,3,4,5], (0, 2), "constant")

In [95]:
a

array([1, 2, 3, 4, 5, 0, 0])

In [97]:
def pad_and_realign(x, length, idx_shift=0):
    print(list(x))
    x -= idx_shift
    padded = np.pad(x, (0, length - len(x)), 'constant')
    return padded

group_df = pd.DataFrame(np.where(cfg.pert != 0), index=['row_id', 'pert_idx']).T.groupby('row_id')
max_combo_degree = group_df.pert_idx.count().max()
cfg.loo = pd.DataFrame(group_df.pert_idx.apply(
    lambda x: pad_and_realign(x, max_combo_degree, cfg.n_activity_nodes - 1)
).tolist())

[87]
[87, 89, 99]
[87, 90, 99]
[87, 91, 99]
[87, 92, 99]
[87, 93, 99]
[87, 98, 99]
[87, 94, 99]
[87, 95, 99]
[87, 96, 99]
[87, 99]
[88, 99]
[88, 99]
[87, 88, 99]
[88, 89, 99]
[88, 90, 99]
[88, 91, 99]
[88, 92, 99]
[88, 93, 99]
[88, 98, 99]
[88, 94, 99]
[88, 95, 99]
[88, 96, 99]
[89, 99]
[89, 99]
[89, 90, 99]
[89, 93, 99]
[89, 98, 99]
[89, 95, 99]
[90, 99]
[90, 93, 99]
[90, 98, 99]
[90, 95, 99]
[90, 99]
[91, 99]
[89, 91, 99]
[90, 91, 99]
[91, 93, 99]
[91, 98, 99]
[91, 94, 99]
[91, 95, 99]
[91, 99]
[92, 99]
[92, 99]
[89, 92, 99]
[90, 92, 99]
[91, 92, 99]
[92, 93, 99]
[92, 98, 99]
[92, 94, 99]
[92, 95, 99]
[93, 99]
[93, 95, 99]
[93, 99]
[97, 99]
[87, 97, 99]
[88, 97, 99]
[89, 97, 99]
[90, 97, 99]
[91, 97, 99]
[92, 97, 99]
[93, 97, 99]
[97, 98, 99]
[94, 97, 99]
[95, 97, 99]
[96, 97, 99]
[98, 99]
[93, 98, 99]
[95, 98, 99]
[98, 99]
[94, 99]
[89, 94, 99]
[90, 94, 99]
[93, 94, 99]
[94, 98, 99]
[94, 95, 99]
[94, 99]
[95, 99]
[95, 99]
[96, 99]
[89, 96, 99]
[90, 96, 99]
[91, 96, 99]
[92, 96, 99]


In [33]:
cfg.pert.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.415,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-0.515,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,-0.622,0.0,0.0,0.0,0.0,0.0,0.0,4.0


In [99]:
group_df.head(10)

Unnamed: 0,row_id,pert_idx
0,0,87
1,1,87
2,1,89
3,1,99
4,2,87
...,...,...
238,87,95
239,87,96
240,87,99
241,88,96


In [88]:
group_df.pert_idx

<pandas.core.groupby.generic.SeriesGroupBy object at 0x7fed1cb54f40>

In [38]:
cfg.loo

Unnamed: 0,0,1,2
0,1,0,0
1,1,3,13
2,1,4,13
3,1,5,13
4,1,6,13
...,...,...,...
84,7,10,13
85,10,12,13
86,8,10,13
87,9,10,13


In [51]:
cfg.loo.sort_values(0)

Unnamed: 0,0,1,2
0,1,0,0
55,1,11,13
13,1,2,13
10,1,13,0
9,1,10,13
...,...,...,...
88,10,13,0
62,11,12,13
54,11,13,0
69,12,13,0


In [39]:
pd.DataFrame(np.where(cfg.pert != 0), index=['row_id', 'pert_idx'])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,233,234,235,236,237,238,239,240,241,242
row_id,0,1,1,1,2,2,2,3,3,3,...,85,85,86,86,86,87,87,87,88,88
pert_idx,87,87,89,99,87,90,99,87,91,99,...,98,99,94,96,99,95,96,99,96,99


In [45]:
group_df.pert_idx.apply(
    lambda x: pad_and_realign(x, max_combo_degree, cfg.n_activity_nodes - 1)
).tolist()

[array([1, 0, 0]),
 array([ 1,  3, 13]),
 array([ 1,  4, 13]),
 array([ 1,  5, 13]),
 array([ 1,  6, 13]),
 array([ 1,  7, 13]),
 array([ 1, 12, 13]),
 array([ 1,  8, 13]),
 array([ 1,  9, 13]),
 array([ 1, 10, 13]),
 array([ 1, 13,  0]),
 array([ 2, 13,  0]),
 array([ 2, 13,  0]),
 array([ 1,  2, 13]),
 array([ 2,  3, 13]),
 array([ 2,  4, 13]),
 array([ 2,  5, 13]),
 array([ 2,  6, 13]),
 array([ 2,  7, 13]),
 array([ 2, 12, 13]),
 array([ 2,  8, 13]),
 array([ 2,  9, 13]),
 array([ 2, 10, 13]),
 array([ 3, 13,  0]),
 array([ 3, 13,  0]),
 array([ 3,  4, 13]),
 array([ 3,  7, 13]),
 array([ 3, 12, 13]),
 array([ 3,  9, 13]),
 array([ 4, 13,  0]),
 array([ 4,  7, 13]),
 array([ 4, 12, 13]),
 array([ 4,  9, 13]),
 array([ 4, 13,  0]),
 array([ 5, 13,  0]),
 array([ 3,  5, 13]),
 array([ 4,  5, 13]),
 array([ 5,  7, 13]),
 array([ 5, 12, 13]),
 array([ 5,  8, 13]),
 array([ 5,  9, 13]),
 array([ 5, 13,  0]),
 array([ 6, 13,  0]),
 array([ 6, 13,  0]),
 array([ 3,  6, 13]),
 array([ 4,  

In [44]:
np.where(np.array([[0,0,0,4,4,5], [0,0,0,4,4,5]]) > 0)

(array([0, 0, 0, 1, 1, 1]), array([3, 4, 5, 3, 4, 5]))

In [100]:
drug_indices_map = []
for drug_index in range(14):
    double_idx = cfg.loo.all(axis=1)
    testidx = (cfg.loo == drug_index).any(axis=1)

    if False:
        testidx = pd.concat([testidx, double_idx], axis=1)
        testidx = testidx.all(axis=1)

    loo_label = pd.read_csv("/users/ngun7t/Documents/cellbox-jun-6/data/loo_label.csv", header=None)[testidx]
    for i in range(14):
        if (loo_label == i).any(axis=1).all():
            drug_indices_map.append(i)
            break

In [115]:
ind

[9, 22, 65, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88]

In [117]:
ind = loo_label.index[(loo_label[[0, 1]] == 12).any(axis=1)].tolist()
loo_label.iloc[np.array(ind) - 1]

Unnamed: 0,0,1
9,1,12
22,2,12
65,8,12
79,12,0
80,12,3
81,12,4
82,12,5
83,12,6
84,12,7
85,12,9


In [114]:
loo_label

Unnamed: 0,0,1
1,1,3
2,1,4
3,1,5
4,1,6
5,1,7
...,...,...
84,12,7
85,12,9
86,12,10
87,12,11
