In [8]:
import sys
# Make the project's source directory importable. The path is relative, so it
# is resolved against the kernel's working directory when the imports below run
# (presumably the notebook's own folder — confirm if the notebook is moved).
sys.path.append('../src/')

from ceres_infer.session import workflow
from ceres_infer.models import model_infer_iter_ens

In [9]:
# Development convenience: with autoreload enabled, edits to imported modules
# are picked up live without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
import logging
# Configure the root logger at INFO level so the workflow's progress messages
# (logged as "INFO:root:...") are displayed in the cell output.
logging.basicConfig(level=logging.INFO)

In [11]:
# get mitochondrial gene list
# Reads '../data/gs16.txt' and stores the word tokens of its (non-blank) line in
# `lmito`, dropping the first token (presumably the gene-set name — TODO confirm).
import re

# Bind the name up front so `lmito` exists even if the file has no usable line.
lmito = []
with open('../data/gs16.txt','r') as f:
    for line in f:
        tokens = re.findall(r'\w+', line)
        if not tokens:
            # Blank line (e.g. a trailing newline at the end of the file):
            # skip it instead of overwriting already-parsed genes with [].
            continue
        # If the file holds several non-blank lines, the last one wins,
        # as before.
        lmito = tokens[1:]

In [12]:
# Parameters
# Configuration dictionary passed to `workflow(params)` in the cells below.
params = {
    # directories
    'outdir_run': '../out/20.0929 feat/reg_rf_boruta/', # output dir for the run
    'outdir_modtmp': '../out/20.0929 feat/reg_rf_boruta/model_perf/', # intermediate files for each model
    'indir_dmdata_Q3': '../out/20.0925 proc_data/gene_effect/dm_data.pkl', # pickled preprocessed DepMap Q3 data
    'indir_dmdata_Q4': '../out/20.0925 proc_data/gene_effect/dm_data_Q4.pkl', # pickled preprocessed DepMap Q4 data
    'indir_dmdata_sanger': '../out/20.0925 proc_data/gene_effect/dm_data_sanger.pkl', # pickled preprocessed Sanger data
    'indir_genesets': '../data/gene_sets/',
    'indir_landmarks': None, # csv file of landmarks [default: None]

    # notes
    # (the backslash continues the string literal onto the next line, so that
    # line's leading spaces are part of the stored note)
    'session_notes': 'regression model, with random forest (iterative) and boruta feature selection; \
     run on selective dependent genes (CERES std > 0.25 and CERES range > 0.6)',

    # data
    'ext_data_name': 'sanger', # 'sanger' or 'Q4'
    'opt_scale_data': True, # scale input data True/False
    # raw string: `\[` and `\]` are regex escapes, not Python string escapes
    # (a non-raw literal triggers an invalid-escape-sequence warning)
    'opt_scale_data_types': r'\[(?:RNA-seq|CN)\]', # data source types to scale; in regexp
    'model_data_source': ['CERES','RNA-seq','CN','Mut','Lineage'],
    'anlyz_set_topN': 10, # for analysis set how many of the top features to look at
    'perm_null': 1000, # number of samples used to build the null distribution, for corr
    'useGene_dependency': False, # whether to use CERES gene dependency (true) or gene effect (false)
    'scope':'differential',# scope for which target genes to run on; list of gene names, or 'all', 'differential'

    # model
    'model_name': 'rf',
    'model_params': {'n_estimators':1000,'max_depth':15,'min_samples_leaf':5,'max_features':'log2'},
    'model_paramsgrid': {},
    'model_pipeline': model_infer_iter_ens,
    'pipeline_params': {},

    # pipeline
    'parallelize': True, # parallelize workflow
    'processes': 24, # number of cpu processes to use

    # analysis
    'metric_eval': 'score_test',  # metric in model_results to evaluate, e.g. score_test, score_oob
    'thresholds': {'score_rd10': 0.1,  # score of reduced model - threshold for filtering
                   'recall_rd10': 0.95},  # recall of reduced model - threshold for filtering
    'min_gs_size': 4 # minimum gene set size, to be derived
}

In [13]:
# Run just the inference
# Builds a workflow from `params`, registers the two named stages
# ('load_processed_data', then 'infer') and runs the pipeline.
# NOTE(review): the output recorded for this cell stalls at 0/583 and ends with
# "AttributeError: 'list' object has no attribute 'columns'". The failing code
# is inside the workflow/model pipeline and is not visible from this notebook;
# resolve it before relying on the results of this run.
wf = workflow(params)
pipeline = ['load_processed_data', 'infer']
wf.create_pipe(pipeline)
wf.run_pipe()

INFO:root:Loading preprocessed data...
INFO:root:Running model building and inference...
INFO:root:Total number of processors available: 40
INFO:root:Total number of processors to use: 24
  0%|          | 0/583 [00:00<?, ?it/s]

Feature name/order across the datasets do not match. There are 91205 common feats, drop other feats
Feature name/order across the datasets do not match. There are 91205 common feats, drop other feats
Feature name/order across the datasets do not match. There are 91205 common feats, drop other feats
Feature name/order across the datasets do not match. There are 91205 common feats, drop other feats
Feature name/order across the datasets do not match. There are 91205 common feats, drop other feats
Feature name/order across the datasets do not match. There are 91205 common feats, drop other feats
Feature name/order across the datasets do not match. There are 91205 common feats, drop other feats
Feature name/order across the datasets do not match. There are 91205 common feats, drop other feats
Feature name/order across the datasets do not match. There are 91205 common feats, drop other feats
Feature name/order across the datasets do not match. There are 91205 common feats, drop other feats


  0%|          | 0/583 [10:35<?, ?it/s]


AttributeError: 'list' object has no attribute 'columns'

In [None]:
# Run analysis, based on pre-existing inference
# Creates a fresh workflow from `params` (rebinding `wf` and `pipeline` from the
# inference cell) and runs the analysis stages listed below instead of 'infer'.
# NOTE(review): presumably 'load_model_results' reads results written by an
# earlier inference run — confirm where they are loaded from.
# NOTE(review): this cell has no execution count (In [None]), i.e. it was not
# run in the saved notebook.
wf = workflow(params)
pipeline = ['load_processed_data', 'load_model_results', 
            'analyze', 'analyze_filtered', 'derive_genesets', 'run_Rscripts']
wf.create_pipe(pipeline)
wf.run_pipe()