In [44]:
# This script runs the pipeline on Sanger data

import sys
sys.path.append('../src')

from ceres_infer.session_Sanger import workflow
from ceres_infer.models_Sanger import model_infer_iter_ens
import logging
logging.basicConfig(level=logging.INFO)

In [51]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [46]:
# Read mitochonrial genes
list_mito_genes = []
with open('../data/mito_gene.csv', 'r') as f:
    next(f)
    for lines in f:
        genelist = lines.split(',')
        for genes in genelist:
            if genes not in list_mito_genes:
                list_mito_genes.append(genes)

In [47]:
# Parameters
params = {
    # directories
    'outdir_run': '../out/20.0911 feat/reg_rf_boruta/', # output dir for the run
    'outdir_modtmp': '../out/20.0911 feat/reg_rf_boruta/model_perf/', # intermediate files for each model
    'indir_dmdata_Q3': '../out/20.0817 proc_data/gene_effect/dm_data.pkl', # pickled preprocessed DepMap Q3 data
    'indir_dmdata_Sanger': '../out/20.0817 proc_data/gene_effect/dm_data_Sanger.pkl', # pickled preprocessed DepMap Q3 data
    'indir_genesets': '../data/gene_sets/',
    'indir_landmarks': None, # csv file of landmarks [default: None]

    # notes
    'session_notes': 'regression model, with random forest (iterative) and boruta feature selection; \
    run on selective dependent genes (CERES std > 0.25 and CERES range > 0.6)',

    # data
    'opt_scale_data': True, # scale input data True/False
    'opt_scale_data_types': '\[(?:RNA-seq|CN)\]', # data source types to scale; in regexp
    'model_data_source': ['CERES'],
    'anlyz_set_topN': 10, # for analysis set how many of the top features to look at
    'perm_null': 1000, # number of samples to get build the null distribution, for corr
    'useGene_dependency': False, # whether to use CERES gene dependency (true) or gene effect (false)
    'scope': list_mito_genes, # scope for which target genes to run on; list of gene names, or 'all', 'differential'

    # model
    'model_name': 'rf',
    'model_params': {'n_estimators':1000,'max_depth':15,'min_samples_leaf':5,'max_features':'log2'},
    'model_paramsgrid': {},
    'model_pipeline': model_infer_iter_ens,
    
    # analysis
    'metric_eval': 'score_test',  # metric in model_results to evaluate, e.g. score_test, score_oob
    'thresholds': {'score_rd10': 0.1,  # score of reduced model - threshold for filtering
                   'recall_rd10': 0.95},  # recall of reduced model - threshold for filtering
    'min_gs_size': 4 # minimum gene set size, to be derived
}

In [None]:
# Run just the inference
wf = workflow(params)
pipeline = ['load_processed_data', 'infer']
wf.create_pipe(pipeline)
wf.run_pipe()

INFO:root:Loading preprocessed data...
INFO:root:Running model building and inference...
  0%|          | 0/104 [00:00<?, ?it/s]

Feature name/order across the datasets do not match. There are 17724 common feats, drop other columns


INFO:root:Trying to scale data, but the given data type is not found and cannot be scaled for gene ACO2
  1%|          | 1/104 [01:27<2:30:47, 87.84s/it]

Feature name/order across the datasets do not match. There are 17724 common feats, drop other columns


INFO:root:Trying to scale data, but the given data type is not found and cannot be scaled for gene ACSL3
  2%|▏         | 2/104 [02:50<2:26:41, 86.29s/it]

Feature name/order across the datasets do not match. There are 17724 common feats, drop other columns


INFO:root:Trying to scale data, but the given data type is not found and cannot be scaled for gene ADSS
  3%|▎         | 3/104 [04:33<2:33:26, 91.15s/it]

Feature name/order across the datasets do not match. There are 17724 common feats, drop other columns


INFO:root:Trying to scale data, but the given data type is not found and cannot be scaled for gene AIFM1
  4%|▍         | 4/104 [06:06<2:33:15, 91.95s/it]

Feature name/order across the datasets do not match. There are 17724 common feats, drop other columns


INFO:root:Trying to scale data, but the given data type is not found and cannot be scaled for gene AMBRA1
  5%|▍         | 5/104 [07:33<2:28:52, 90.23s/it]

Feature name/order across the datasets do not match. There are 17724 common feats, drop other columns


INFO:root:Trying to scale data, but the given data type is not found and cannot be scaled for gene ATP5ME
  6%|▌         | 6/104 [09:13<2:32:23, 93.30s/it]

Feature name/order across the datasets do not match. There are 17724 common feats, drop other columns


INFO:root:Trying to scale data, but the given data type is not found and cannot be scaled for gene BRAF
  7%|▋         | 7/104 [10:40<2:27:59, 91.54s/it]

Feature name/order across the datasets do not match. There are 17724 common feats, drop other columns


INFO:root:Trying to scale data, but the given data type is not found and cannot be scaled for gene CDS2
  8%|▊         | 8/104 [12:16<2:28:35, 92.87s/it]

Feature name/order across the datasets do not match. There are 17724 common feats, drop other columns


INFO:root:Trying to scale data, but the given data type is not found and cannot be scaled for gene COA7
  9%|▊         | 9/104 [13:57<2:30:52, 95.29s/it]

Feature name/order across the datasets do not match. There are 17724 common feats, drop other columns


INFO:root:Trying to scale data, but the given data type is not found and cannot be scaled for gene CYB5B
 10%|▉         | 10/104 [15:27<2:26:27, 93.49s/it]

Feature name/order across the datasets do not match. There are 17724 common feats, drop other columns


INFO:root:Trying to scale data, but the given data type is not found and cannot be scaled for gene CYC1
 11%|█         | 11/104 [17:10<2:29:42, 96.58s/it]

Feature name/order across the datasets do not match. There are 17724 common feats, drop other columns


INFO:root:Trying to scale data, but the given data type is not found and cannot be scaled for gene DHFR
 12%|█▏        | 12/104 [18:47<2:28:09, 96.62s/it]

Feature name/order across the datasets do not match. There are 17724 common feats, drop other columns


INFO:root:Trying to scale data, but the given data type is not found and cannot be scaled for gene DHODH


In [40]:
# Run analysis, based on pre-existing inference
wf = workflow(params)
pipeline = ['load_processed_data', 'load_model_results', 
            'analyze', 'analyze_filtered', 'derive_genesets', 'run_Rscripts']
wf.create_pipe(pipeline)
wf.run_pipe()

INFO:root:Loading preprocessed data...
INFO:root:Loading model results...


FileNotFoundError: [Errno 2] No such file or directory: '../out/20.0911 feat/reg_rf_boruta//model_results.csv'