In [3]:
import sys
import os
import collections
import h5py
import csv

from chemicalchecker.util import logged, get_parser, save_output, features_file
from chemicalchecker.database import Dataset, Molrepo

from cmapPy.pandasGEXpress import parse

In [4]:
def pertid_to_meta(map_files):
    """Collect perturbagen metadata keyed by perturbagen id.

    Reads the tab-separated file registered in ``map_files`` under the key
    ``"GSE92742_Broad_LINCS_pert_info"``. The first line is treated as the
    header and must contain the columns ``pert_id``, ``pert_iname`` and
    ``pert_type``; their positions are resolved by name, so column order in
    the file does not matter.

    Args:
        map_files: mapping from datasource name to file path.

    Returns:
        A ``collections.defaultdict(set)`` mapping each ``pert_id`` to the
        set of ``(pert_iname, pert_type)`` tuples found for it.

    Raises:
        KeyError: if the datasource key is absent from ``map_files``.
        ValueError: if one of the required columns is not in the header.
    """
    source_paths = [map_files["GSE92742_Broad_LINCS_pert_info"]]
    pertid_meta = collections.defaultdict(set)
    for source_path in source_paths:
        with open(source_path, "r") as handle:
            rows = csv.reader(handle, delimiter="\t")
            header = next(rows)
            # Resolve the column positions once per file.
            id_col = header.index("pert_id")
            name_col = header.index("pert_iname")
            type_col = header.index("pert_type")
            for row in rows:
                # A set de-duplicates repeated (name, type) records.
                pertid_meta[row[id_col]].add((row[name_col], row[type_col]))
    return pertid_meta

def parse_molrepo():
    """Build a lookup from "lincs" molrepo source ids to InChIKeys.

    Fetches the entries returned by ``Molrepo.get_by_molrepo_name("lincs")``
    and keeps only those with a truthy ``inchikey``. If a ``src_id`` occurs
    more than once, the last entry wins.

    Returns:
        dict mapping each entry's ``src_id`` to its ``inchikey``.
    """
    entries = Molrepo.get_by_molrepo_name("lincs")
    return {entry.src_id: entry.inchikey for entry in entries if entry.inchikey}

# Names of the datasources to parse; each one is used as a key of `map_files`.
which_datasources = ["touchstone_conn_SUMMLY"]

# Code of the dataset processed by this notebook.
dataset_code = 'D1.003'
dataset = Dataset.get(dataset_code)

# `map_files` holds the datasources attached to this dataset:
# datasource name -> "<data_path>/<filename>".
# It stays empty when the dataset has no datasources.
map_files = dict(
    (source.datasource_name, "/".join((source.data_path, source.filename)))
    for source in dataset.datasources
)


2019-05-07 17:51:49,677 chemicalchecker.util.config.config.Config [DEBUG   ] Loading config from: /aloy/home/mduran/cc_config.json
2019-05-07 17:51:49,758 chemicalchecker.util.config.config.Config [DEBUG   ] Loading config from: /aloy/home/mduran/cc_config.json
2019-05-07 17:51:49,760 chemicalchecker.util.config.config.Config [DEBUG   ] Loading config from: /aloy/home/mduran/cc_config.json
2019-05-07 17:51:49,762 chemicalchecker.util.config.config.Config [DEBUG   ] Loading config from: /aloy/home/mduran/cc_config.json
2019-05-07 17:51:49,764 chemicalchecker.util.config.config.Config [DEBUG   ] Loading config from: /aloy/home/mduran/cc_config.json
2019-05-07 17:51:49,766 chemicalchecker.util.config.config.Config [DEBUG   ] Loading config from: /aloy/home/mduran/cc_config.json
2019-05-07 17:51:49,770 chemicalchecker.util.config.config.Config [DEBUG   ] Loading config from: /aloy/home/mduran/cc_config.json
2019-05-07 17:51:49,772 chemicalchecker.util.config.config.Config [DEBUG   ] Loadin

In [5]:
# Display the datasource names selected for parsing.
which_datasources

['touchstone_conn_SUMMLY']

In [6]:
# Parse each selected datasource file with cmapPy's `parse`.
# NOTE(review): `S` is reassigned on every iteration, so only the object
# parsed from the last datasource survives the loop. That is harmless while
# `which_datasources` lists a single name, but will silently drop data if
# more are added.
for ds in which_datasources:
    fn = map_files[ds]
    S  = parse.parse(fn)


In [18]:
import numpy as np
# Imported in this cell (and not only in the `In [17]` cell further down the
# file) so that a top-to-bottom run does not fail with a NameError.
from tqdm import tqdm

# Score cutoffs used to discretise the matrix: scores in [90, 95) become
# level 1, scores >= 95 become level 2, everything else is discarded.
MODERATE_CUTOFF = 90
STRONG_CUTOFF = 95

# Column id -> InChIKey lookup. It is built here because the cell that
# originally created it sits *after* this one in the file, which breaks
# "Restart & Run All".
lincs_inchikey = parse_molrepo()

V = np.array(S.data_df)
row_ids = np.array(S.row_metadata_df.index)
col_ids = np.array(S.col_metadata_df.index)

# Resolve once which columns have an InChIKey; columns without one can never
# contribute a pair, so they are skipped before the per-cell loop.
mapped_columns = [
    (j, lincs_inchikey[col]) for j, col in enumerate(col_ids) if col in lincs_inchikey
]

# (inchikey, row id) -> list of levels; several columns may map to the same
# InChIKey, hence a list that is reduced to its maximum afterwards.
pairs = collections.defaultdict(list)
for i, row in enumerate(tqdm(row_ids)):
    for j, inchikey in mapped_columns:
        score = V[i, j]
        # `not (score >= cutoff)` also rejects NaN. The previous
        # `score < 90: continue` test let NaN fall through and be recorded
        # as the strongest level.
        if not (score >= MODERATE_CUTOFF):
            continue
        level = 1 if score < STRONG_CUTOFF else 2
        pairs[(inchikey, row)].append(level)

# Keep the strongest level per pair. `.items()` works on Python 2 and 3,
# whereas `.iteritems()` exists on Python 2 only.
pairs = dict((key, np.max(levels)) for key, levels in pairs.items())

8798it [02:41, 54.51it/s]


In [13]:
# Build the lookup from "lincs" molrepo source ids to InChIKeys.
# NOTE(review): `lincs_inchikey` is read by the `In [18]` cell that appears
# *before* this one in the file; this cell was executed earlier (In [13]).
lincs_inchikey = parse_molrepo()

2019-05-07 17:59:41,595 chemicalchecker.util.config.config.Config [DEBUG   ] Loading config from: /aloy/home/mduran/cc_config.json


In [17]:
from tqdm import tqdm

In [21]:
# Regroup the (inchikey, feature) -> value pairs by molecule:
# `key_raw` maps each InChIKey to a list of (feature, value) tuples.
key_pairs = pairs
key_raw = collections.defaultdict(list)
# `.items()` is used instead of the Python-2-only `.iteritems()` so the cell
# runs unchanged on Python 2 and Python 3.
for (inchikey, feature), value in key_pairs.items():
    key_raw[str(inchikey)].append((str(feature), value))

# Sorted, de-duplicated feature names seen across all molecules.
features = sorted({feat for entries in key_raw.values() for feat, _value in entries})


In [23]:
# Print how many (feature, value) entries were recorded for each molecule.
# Written with `print(...)` and `.values()` so the cell is valid on both
# Python 2 and Python 3; the key was unused, so only the values are iterated.
for entries in key_raw.values():
    print(len(entries))

286
197
290
362
461
320
383
200
437
275
570
176
446
215
230
134
261
471
386
227
176
273
338
191
364
265
327
330
254
255
150
222
301
181
145
289
235
215
262
249
318
288
289
417
242
214
285
218
188
228
235
267
240
203
147
170
214
150
187
208
180
353
246
172
364
248
196
258
474
391
213
288
239
182
241
226
263
207
285
376
679
261
148
432
208
331
406
280
475
409
211
182
279
277
200
231
231
488
226
316
360
451
317
324
301
265
259
335
389
481
534
452
277
182
192
251
231
219
524
315
434
412
310
240
216
338
339
580
229
300
269
218
315
320
234
290
356
178
195
220
489
226
196
199
127
423
646
250
219
429
474
130
109
170
200
545
220
148
272
354
161
326
227
295
233
274
255
253
305
353
314
284
539
206
165
428
355
160
399
178
189
352
261
248
173
286
305
336
242
358
293
217
154
192
212
259
284
223
233
246
301
325
517
220
402
193
330
435
181
224
343
234
485
322
459
183
332
235
239
288
228
218
284
366
284
227
308
363
244
188
310
225
204
199
324
207
463
255
284
270
589
197
255
819
159
231
309
262
199
245


In [6]:
# NOTE(review): these imports sit mid-notebook; consider moving them to the
# import cell at the top so dependencies are visible up front.
from chemicalchecker.core.sign0 import sign0
# Signature-0 handle for dataset "D1.003".
# NOTE(review): the two "." arguments are presumably the signature and
# validation paths (the log prints "signature path is: .") - confirm against
# the sign0 constructor.
s0 = sign0(".", ".", "D1.003")

from chemicalchecker.core.sign1 import sign1
# Signature-1 handle created with the same arguments as `s0`.
s1 = sign1(".", ".", "D1.003")

# Fit signature 1 from signature 0. According to the captured log this run
# took about 50 minutes, most of it in the cosine and euclidean empirical
# P-value steps, and wrote its result to D1.003/sign1.h5.
s1.fit(s0)


2019-05-08 16:55:19,636 chemicalchecker.core.signature_base.BaseSignature [INFO    ] Creating model_path in: /aloy/home/mduran/myscripts/chemical_checker/package/scripts/preprocess/D1.003/models
2019-05-08 16:55:19,639 chemicalchecker.core.signature_base.BaseSignature [INFO    ] Creating stats_path in: /aloy/home/mduran/myscripts/chemical_checker/package/scripts/preprocess/D1.003/stats
2019-05-08 16:55:19,641 chemicalchecker.core.sign0.sign0 [DEBUG   ] signature path is: .
2019-05-08 16:55:19,643 chemicalchecker.core.sign0.sign0 [DEBUG   ] data_path: /aloy/home/mduran/myscripts/chemical_checker/package/scripts/preprocess/D1.003/sign0.h5
2019-05-08 16:55:19,644 chemicalchecker.util.config.config.Config [DEBUG   ] Loading config from: /aloy/home/mduran/cc_config.json
2019-05-08 16:55:19,656 chemicalchecker.core.sign1.sign1 [DEBUG   ] signature path is: .
2019-05-08 16:55:19,659 chemicalchecker.core.signature_base.BaseSignature [DEBUG   ] fit
2019-05-08 16:55:19,662 chemicalchecker.util.p





























































































































































































































































































2019-05-08 16:57:01,991 chemicalchecker.core.sign1.sign1 [INFO    ] 0.9 topics: 1769
2019-05-08 16:57:01,993 chemicalchecker.core.sign1.sign1 [INFO    ] Elbow topics: 453
2019-05-08 16:57:07,482 chemicalchecker.core.sign1.sign1 [INFO    ] Normalizing
2019-05-08 16:57:07,532 chemicalchecker.core.sign1.sign1 [INFO    ] Saving to /aloy/home/mduran/myscripts/chemical_checker/package/scripts/preprocess/D1.003/sign1.h5
2019-05-08 16:57:23,532 chemicalchecker.core.sign1.sign1 [INFO    ] ... but sorting before!
2019-05-08 16:57:23,785 chemicalchecker.core.sign1.sign1 [INFO    ] Computing cosine distance empirical P-values
2019-05-08 17:22:07,863 chemicalchecker.core.sign1.sign1 [INFO    ] Computing euclidean distance empirical P-values
2019-05-08 17:44:27,970 chemicalchecker.core.sign1.sign1 [INFO    ] Cleaning
2019-05-08 17:44:28,054 chemicalchecker.core.sign1.sign1 [INFO    ] MOA and ATC Validations
2019-05-08 17:44:41,536 chemicalchecker.core.sign1.sign1 [INFO    ] Matrix plot


In [2]:
# Show the validation path configured on `s1`.
s1.validation_path

'/aloy/home/mduran/myscripts/chemical_checker/package/scripts/preprocess/D1.003'

In [3]:
# Show the model path configured on `s1`.
s1.model_path

'/aloy/home/mduran/myscripts/chemical_checker/package/scripts/preprocess/D1.003/models'

In [4]:
# Show the stats path configured on `s1`.
s1.stats_path

'/aloy/home/mduran/myscripts/chemical_checker/package/scripts/preprocess/D1.003/stats'