In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell execution and edits to the project source are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [6]:
from ceres_infer.session import workflow
from ceres_infer.models import model_infer

In [7]:
import logging
# Configure the root logger at INFO level -- presumably so that progress
# messages logged by the workflow steps below are shown in the notebook.
logging.basicConfig(level=logging.INFO)

In [8]:
# Run configuration passed to `workflow(params)` below. All paths are relative
# to the directory the notebook is executed from.
params = {
    # directories
    'outdir_run': '../out/20.0518 Lx/L0only_reg_dummy/', # output dir for the run
    'outdir_modtmp': '../out/20.0518 Lx/L0only_reg_dummy/model_perf/', # intermediate files for each model
    'indir_dmdata_Q3': '../out/20.0817 proc_data/gene_effect/dm_data.pkl', # pickled preprocessed DepMap Q3 data
    'indir_dmdata_Q4': '../out/20.0817 proc_data/gene_effect/dm_data_Q4.pkl', # pickled preprocessed DepMap Q4 data
    'indir_genesets': '../data/gene_sets/',
    'indir_landmarks': '../out/19.1013 tight cluster/landmarks_n200_k200.csv', # csv file of landmarks [default: None]

    # notes
    'session_notes': 'dummy regressor (median); predicting whole-genome',

    # data
    'opt_scale_data': False, # scale input data True/False
    # Raw string: '\[' is not a valid Python string escape, so a plain literal
    # triggers an invalid-escape warning on newer Python versions. The regex
    # text itself is unchanged.
    'opt_scale_data_types': r'\[(?:RNA-seq|CN)\]', # data source types to scale; in regexp
    'model_data_source': ['CERES_Lx'],
    'anlyz_set_topN': 10, # for analysis set how many of the top features to look at
    'perm_null': 1000, # number of samples used to build the null distribution, for corr
    'useGene_dependency': False, # whether to use CERES gene dependency (true) or gene effect (false)
    'scope': 'all', # scope for which target genes to run on; list of gene names, or 'all', 'differential'

    # model
    'model_name': 'dummy_reg',
    'model_params': {},
    'model_paramsgrid': {},
    'model_pipeline': model_infer,

    # analysis
    'metric_eval': 'score_test',  # metric in model_results to evaluate, e.g. score_test, score_oob
    'thresholds': {'score_rd10': 0.1,  # score of reduced model - threshold for filtering
                   'recall_rd10': 0.95},  # recall of reduced model - threshold for filtering
    'min_gs_size': 4 # minimum gene set size, to be derived
}

In [None]:
# Build a workflow from the run configuration and execute two stages:
# 'load_processed_data' followed by 'infer'.
# NOTE(review): presumably `run_pipe` executes the stages in list order and
# writes its outputs under params['outdir_run'] / params['outdir_modtmp'] --
# confirm against ceres_infer.session.workflow.
wf = workflow(params)
pipeline = ['load_processed_data', 'infer']
wf.create_pipe(pipeline)
wf.run_pipe()

In [None]:
# The model results contain only the 'all' model (no feature-level results), so there is not much to analyze; the pipeline below cannot be run.
# wf = workflow(params)
# pipeline = ['load_processed_data', 'load_model_results', 'analyze', 'analyze_filtered', 'derive_genesets']
# wf.create_pipe(pipeline)
# wf.run_pipe()

In [31]:
# --- Concordance between actual and predicted values, per target gene ---
# Reads every `y_compr_<gene>.pkl` file found in params['outdir_modtmp']
# (presumably written by the 'infer' stage -- confirm), computes a concordance
# value for the 'tr' and 'te' entries of each file, and writes one CSV and one
# distribution plot (PDF) per split to <outdir_run>/anlyz/concordance/.
#
# NOTE(review): `plt`, `sns` and `getConcordance` are used below but are not
# imported or defined in any visible cell of this notebook; this cell only ran
# because the kernel already had them. They must be provided by an earlier
# cell to survive Restart & Run All -- presumably
# `import matplotlib.pyplot as plt`, `import seaborn as sns`, and
# `getConcordance` from the ceres_infer package. TODO: confirm and add.
import glob
import logging
import os
import pickle

import pandas as pd

outdir_anlyz = os.path.join(params['outdir_run'], 'anlyz')
# The trailing '' keeps the trailing path separator the original value had.
outdir_concord = os.path.join(outdir_anlyz, 'concordance', '')
os.makedirs(outdir_concord, exist_ok=True)


def _gene_from_fname(fname):
    """Return the gene label encoded in a ``y_compr_<gene>.pkl`` file path."""
    base = os.path.basename(fname)
    # The glob pattern below guarantees this prefix and suffix are present.
    return base[len('y_compr_'):-len('.pkl')]


def _plot_concordance(values, out_path):
    """Save a distribution plot of the concordance values to ``out_path``."""
    fig, ax = plt.subplots()
    # NOTE(review): sns.distplot is deprecated in newer seaborn releases
    # (histplot/displot replace it); kept so the figure matches what the
    # seaborn version used for this run produces.
    sns.distplot(values, ax=ax)
    ax.set(xlim=[0, 1.05], xlabel='Concordance', title='Concordance between actual and predicted')
    fig.savefig(out_path)
    plt.close(fig)


# Sorted so the row order of the output CSVs is reproducible (glob order is
# unspecified).
y_compr_fnames = sorted(glob.glob(os.path.join(params['outdir_modtmp'], 'y_compr_*.pkl')))
if y_compr_fnames:
    records_tr, records_te = [], []
    for fname in y_compr_fnames:
        gene = _gene_from_fname(fname)
        # SECURITY: unpickling can execute arbitrary code; only load files
        # produced by this project's own pipeline.
        with open(fname, 'rb') as fh:
            y_compr = pickle.load(fh)

        records_tr.append({'gene': gene, 'concordance': getConcordance(y_compr['tr'])})
        records_te.append({'gene': gene, 'concordance': getConcordance(y_compr['te'])})

    # Build each frame once instead of concatenating inside the loop.
    df_conc_tr = pd.DataFrame(records_tr, columns=['gene', 'concordance'])
    df_conc_te = pd.DataFrame(records_te, columns=['gene', 'concordance'])

    df_conc_tr.to_csv(os.path.join(outdir_concord, 'concordance_tr.csv'), index=False)
    df_conc_te.to_csv(os.path.join(outdir_concord, 'concordance_te.csv'), index=False)

    _plot_concordance(df_conc_tr.concordance, os.path.join(outdir_concord, 'concordance_tr.pdf'))
    _plot_concordance(df_conc_te.concordance, os.path.join(outdir_concord, 'concordance_te.pdf'))
else:
    logging.warning('No y_compr_*.pkl files found in %s; skipping concordance analysis',
                    params['outdir_modtmp'])