In [18]:
from ceres_infer.session import workflow
from ceres_infer.models import model_infer_iter_ens

In [3]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell executes, so edits to local packages
# (e.g. ceres_infer) take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import logging
# Configure the root logger at INFO level; the "INFO:root:..." progress lines
# in the recorded cell outputs are emitted through it.
logging.basicConfig(level=logging.INFO)

In [4]:
# Run configuration handed to `workflow(params)` in the cells below.
# All paths are relative to the notebook's working directory.
params = {
    # directories
    'outdir_run': '../out/20.0817 feat/reg_rf_boruta/', # output dir for the run
    'outdir_modtmp': '../out/20.0817 feat/reg_rf_boruta/model_perf/', # intermediate files for each model
    'indir_dmdata_Q3': '../out/20.0817 proc_data/gene_effect/dm_data.pkl', # pickled preprocessed DepMap Q3 data
    'indir_dmdata_Q4': '../out/20.0817 proc_data/gene_effect/dm_data_Q4.pkl', # pickled preprocessed DepMap Q4 data
    'indir_genesets': '../data/enrichr/',
    'indir_landmarks': None, # csv file of landmarks [default: None]

    # notes
    # NOTE(review): the backslash continuation keeps the next line's leading
    # spaces inside the string; left as-is so the stored note is unchanged.
    'session_notes': 'regression model, with random forest (iterative) and boruta feature selection; \
    run on selective dependent genes (CERES std > 0.25 and CERES range > 0.6)',

    # data
    'opt_scale_data': True, # scale input data True/False
    # Raw string: '\[' and '\]' are not valid string escapes, so the non-raw
    # form triggers an invalid-escape warning. The pattern value is unchanged.
    'opt_scale_data_types': r'\[(?:RNA-seq|CN)\]', # data source types to scale; in regexp
    'model_data_source': ['CERES','RNA-seq','CN','Mut','Lineage'],
    'anlyz_set_topN': 10, # for analysis set how many of the top features to look at
    'perm_null': 1000, # number of samples used to build the null distribution, for corr
    'useGene_dependency': False, # whether to use CERES gene dependency (true) or gene effect (false)
    'scope': ['KRAS','TP53'], # scope for which target genes to run on; list of gene names, or 'all', 'differential'

    # model
    'model_name': 'rf',
    'model_params': {'n_estimators':1000,'max_depth':15,'min_samples_leaf':5,'max_features':'log2'},
    'model_paramsgrid': {},
    'model_pipeline': model_infer_iter_ens,

    # analysis
    'metric_eval': 'score_test',  # metric in model_results to evaluate, e.g. score_test, score_oob
    'thresholds': {'score_rd10': 0.1,  # score of reduced model - threshold for filtering
                   'recall_rd10': 0.95},  # recall of reduced model - threshold for filtering
    'min_gs_size': 4 # minimum gene set size, to be derived
}

In [33]:
# Inference stage: load the preprocessed data, then fit the models.
# The recorded output below shows this two-step run taking roughly 9 minutes.
pipeline = [
    'load_processed_data',
    'infer',
]

wf = workflow(params)
wf.create_pipe(pipeline)
wf.run_pipe()

100%|██████████| 2/2 [09:19<00:00, 279.62s/it]


In [4]:
# Analysis stage: starts from a fresh workflow object and loads model results
# instead of re-running 'infer' (presumably the ones the inference stage wrote
# under params['outdir_modtmp'] -- confirm in ceres_infer.session).
pipeline = [
    'load_processed_data',
    'load_model_results',
    'analyze',
    'analyze_filtered',
    'derive_genesets',
    'run_Rscripts',
]

wf = workflow(params)
wf.create_pipe(pipeline)
wf.run_pipe()

INFO:root:Loading preprocessed data...
INFO:root:Loading model results...
INFO:root:Analyzing model results...
  feat_summary = varExp_noNeg.groupby('target')['target', 'score_rd', 'score_full'].first()
INFO:root:Analyzing filtered results...
  feat_summary = varExp_noNeg.groupby('target')['target', 'score_rd', 'score_full'].first()
INFO:root:Deriving gene sets...


In [35]:
# `os` is not used by any live code in the visible cells; only the
# commented-out gene-specific snippet below references it.
import os
# Fresh workflow object built from the same params (no pipeline is run here).
wf = workflow(params)
# NOTE(review): private helper; presumably populates the analysis output paths
# on `wf` -- confirm against ceres_infer.session before relying on it.
wf._get_analysis_paths()

In [None]:
######################################################################
# Gene specific
###################################################################### 
# outdir_sub2 = '%s/gene_specific/' % outdir
# if(not os.path.exists(outdir_sub2)): os.mkdir(outdir_sub2)
#
# #generate plot of specific gene
# genBarPlotGene(model_results, 'CDK4', 'score_oob', 0.5, outdir_sub=outdir_sub2)
# genBarPlotGene(model_results, 'KRAS', 'score_oob', 0.5, outdir_sub=outdir_sub2)
# genBarPlotGene(model_results, 'SOX10', 'score_oob', 0.5,  outdir_sub=outdir_sub2)