In [1]:
import pandas as pd

from proxbias.depmap.process import bootstrap_stats, split_models
from proxbias.depmap.load import get_depmap_data
from proxbias.utils.data_utils import get_cancer_gene_lists
from proxbias.metrics import genome_proximity_bias_score

In [2]:
# Load the DepMap tables used throughout this notebook (CRISPR gene effect, copy-number
# and mutation data); the second value returned by get_depmap_data is discarded.
# NOTE(review): rnai_release="" presumably skips loading an RNAi release - confirm
# against get_depmap_data.
crispr_effect, _, cnv_data, mutation_data = get_depmap_data(rnai_release="")
# Cancer gene lists derived from crispr_effect.index, unpacked as (oncogenes, tsgs).
oncogenes, tsgs = get_cancer_gene_lists(crispr_effect.index)

CRISPRGeneEffect.csv from DepMap Public 22Q4 is found. Reading dataframe from cache.
Done!
OmicsCNGene.csv from DepMap Public 22Q4 is found. Reading dataframe from cache.
Done!
OmicsSomaticMutations.csv from DepMap Public 22Q4 is found. Reading dataframe from cache.
Done!


In [3]:
# Partition the screened models by TP53 status. Both split_models calls share
# the same inputs; the second one only adds filter_amp=True.
tp53_split_kwargs = dict(
    gene_symbol="TP53",
    candidate_models=crispr_effect.columns,
    cnv_data=cnv_data,
    mutation_data=mutation_data,
)

# Unfiltered split: keep the first three returned groups, drop the fourth.
tp53_lof_models, tp53_wt_models, tp53_amp_models, _ = split_models(**tp53_split_kwargs)

# Filtered split: only the third returned group (amplified models) is kept.
_, _, tp53_amp_filtered_models, _ = split_models(**tp53_split_kwargs, filter_amp=True)

In [3]:
# Run bootstrap_stats for TP53 alone, in "lof" search mode, over every model
# (column) of crispr_effect, then save the result table.
# Note - n_workers should likely be around half the number of CPUs
tp53_bootstrap_kwargs = dict(
    genes_of_interest=["TP53"],
    candidate_models=list(crispr_effect.columns),
    search_mode="lof",
    filter_amp=True,
    dependency_data=crispr_effect,
    cnv_data=cnv_data,
    mutation_data=mutation_data,
    eval_function=genome_proximity_bias_score,
    eval_kwargs={"n_samples": 500, "n_trials": 200, "return_samples": False},
    n_bootstrap=32,
    n_workers=1,
)
res = bootstrap_stats(**tp53_bootstrap_kwargs)
res.to_csv("outputs/TP53_lof_bootstrap_results.csv")

Stats for TP53 computed in 71.97563099861145 - diff is 0.029988173125000017, 278 wt and 277 lof


In [6]:
# Subset down to the genes that seem to have a strong effect based off of 4 bootstraps
# This cell runs the quick screen on the TP53 wild-type models (tp53_wt_models).

# "lof" search over tumour suppressors and oncogenes.
all_lof_tp53_wt = bootstrap_stats(
    genes_of_interest=tsgs + oncogenes,
    dependency_data=crispr_effect,
    cnv_data=cnv_data,
    mutation_data=mutation_data,
    candidate_models=tp53_wt_models,
    search_mode="lof",
    n_bootstrap=4,
    eval_function=genome_proximity_bias_score,
    eval_kwargs={"n_samples": 500, "n_trials": 200, "return_samples": False},
    n_workers=16,
)

# "amp" search over the tsgs list only.
all_amp_tp53_wt = bootstrap_stats(
    genes_of_interest=tsgs,
    dependency_data=crispr_effect,
    cnv_data=cnv_data,
    mutation_data=mutation_data,
    candidate_models=tp53_wt_models,
    search_mode="amp",
    n_bootstrap=4,
    eval_function=genome_proximity_bias_score,
    eval_kwargs={"n_samples": 500, "n_trials": 200, "return_samples": False},
    n_workers=16,
)

# Stack the two result tables row-wise (axis=0), matching the TP53-LOF screen.
# Concatenating column-wise (axis=1) would duplicate every column name, including
# "diff", which the cell that reloads this CSV reads as a single column.
four_bootstrap_tp53_wt = pd.concat([all_lof_tp53_wt, all_amp_tp53_wt], axis=0)
four_bootstrap_tp53_wt.to_csv("outputs/four_bootstrap_tp53_wt_results.csv")

Index(['IGF2', 'IRF1', 'TNFSF13'], dtype='object') not found in data.
Stats for ABRAXAS1 computed in 15.624876022338867 - diff is 0.045973785000000045, 218 wt and 40 lof
Stats for AKT1 computed in 20.81327199935913 - diff is 0.0070602450000000205, 192 wt and 42 lof
Stats for AJUBA computed in 21.713552474975586 - diff is 0.0038755550000000305, 200 wt and 38 lof
Stats for AMER1 computed in 23.433346271514893 - diff is 0.011521204999999979, 93 wt and 161 lof
Stats for AR computed in 26.743326663970947 - diff is -0.0020547450000000245, 88 wt and 163 lof
Stats for ARAF computed in 29.518576860427856 - diff is 0.010171694999999925, 88 wt and 164 lof
Stats for ARID1A computed in 29.281298875808716 - diff is -0.002491880000000002, 179 wt and 35 lof
Stats for ARID1B computed in 30.807867527008057 - diff is 0.010847880000000032, 179 wt and 47 lof
Stats for ARID4A computed in 33.39646005630493 - diff is -0.0018389150000001075, 196 wt and 36 lof
Stats for ATRIP computed in 19.78100061416626 - dif

In [7]:
# Quick 4-bootstrap screen restricted to tp53_lof_models.
def _screen_on_tp53_lof(genes, search_mode):
    """Run bootstrap_stats on tp53_lof_models with this cell's fixed settings."""
    return bootstrap_stats(
        genes_of_interest=genes,
        search_mode=search_mode,
        candidate_models=tp53_lof_models,
        dependency_data=crispr_effect,
        cnv_data=cnv_data,
        mutation_data=mutation_data,
        eval_function=genome_proximity_bias_score,
        eval_kwargs={"n_samples": 500, "n_trials": 200, "return_samples": False},
        n_bootstrap=4,
        n_workers=8,
    )


# "lof" search covers tumour suppressors and oncogenes; "amp" covers tsgs only.
all_lof_tp53_lof = _screen_on_tp53_lof(tsgs + oncogenes, "lof")
all_amp_tp53_lof = _screen_on_tp53_lof(tsgs, "amp")

# Stack both result tables row-wise and save them.
four_bootstrap_tp53_lof = pd.concat(
    [all_lof_tp53_lof, all_amp_tp53_lof],
    axis=0,
)
four_bootstrap_tp53_lof.to_csv("outputs/four_bootstrap_tp53_lof_results.csv")

Index(['IGF2', 'IRF1', 'TNFSF13'], dtype='object') not found in data.
Stats for ABL1 computed in 25.450073719024658 - diff is -0.012761935000000113, 183 wt and 42 lof
Stats for ACKR3 computed in 28.194087505340576 - diff is -0.026047390000000004, 219 wt and 42 lof
Stats for ABRAXAS1 computed in 29.65141272544861 - diff is 0.00945825499999986, 206 wt and 58 lof
Stats for AGO1 computed in 28.532398462295532 - diff is 0.002358014999999991, 198 wt and 43 lof
Stats for AKT1 computed in 31.470420360565186 - diff is 0.004288884999999909, 181 wt and 46 lof
Stats for AFF4 computed in 32.56582045555115 - diff is 0.013957020000000098, 202 wt and 29 lof
Stats for AJUBA computed in 33.36678743362427 - diff is 0.016314799999999963, 184 wt and 49 lof
Stats for AKT2 computed in 31.61945128440857 - diff is 0.007999275000000083, 189 wt and 36 lof
Stats for AMER1 computed in 19.087071418762207 - diff is 0.011445439999999918, 79 wt and 181 lof
Stats for ANKRD11 computed in 17.75159239768982 - diff is -0.0

In [4]:
# Reload both 4-bootstrap screens from disk, tagging each row with the absolute
# effect size and the TP53 background it was computed on.
four_bootstrap_tp53_lof = pd.read_csv(
    "outputs/four_bootstrap_tp53_lof_results.csv", index_col=0
).assign(abs_diff=lambda frame: frame["diff"].abs(), tp53="lof")

four_bootstrap_tp53_wt = pd.read_csv(
    "outputs/four_bootstrap_tp53_wt_results.csv", index_col=0
).assign(abs_diff=lambda frame: frame["diff"].abs(), tp53="wt")

# One table for both backgrounds, largest absolute difference first.
combined = pd.concat(
    [four_bootstrap_tp53_lof, four_bootstrap_tp53_wt], axis=0
).sort_values("abs_diff", ascending=False)

# Gene labels of the first 100 rows per background, de-duplicated afterwards,
# so each list may hold fewer than 100 genes.
rows_tp53_lof = combined.loc[combined["tp53"] == "lof"]
rows_tp53_wt = combined.loc[combined["tp53"] == "wt"]
top_lof_genes = list(rows_tp53_lof.index[:100].unique())
top_wt_genes = list(rows_tp53_wt.index[:100].unique())

In [5]:
# Display every row of `combined` whose index label is MDM4.
combined.loc['MDM4']

Unnamed: 0,test_stats,test_mean,wt_stats,wt_mean,diff,search_mode,n_sample_bootstrap,n_test,n_wt,abs_diff,tp53
MDM4,"[0.61311076, 0.60298466, 0.6091877, 0.61463799...",0.60998,"[0.61818872, 0.64448218, 0.6278965999999999, 0...",0.629407,-0.019427,gof,63,79,187,0.019427,lof
MDM4,"[0.60226064, 0.61556978, 0.6003597800000001, 0...",0.606009,"[0.58556076, 0.59663688, 0.58595814, 0.5912840...",0.58986,0.016149,gof,69,87,174,0.016149,wt


In [6]:
# Display the first 50 rows of `combined`.
combined.head(50)

Unnamed: 0,test_stats,test_mean,wt_stats,wt_mean,diff,search_mode,n_sample_bootstrap,n_test,n_wt,abs_diff,tp53
STAT5B,"[0.62091858, 0.60725458, 0.61068534, 0.60034394]",0.609801,"[0.57164958, 0.55070746, 0.5411863, 0.55573332]",0.554819,0.054981,lof,21,27,180,0.054981,lof
PDGFRB,"[0.6149859200000001, 0.6223293000000001, 0.607...",0.61212,"[0.55251984, 0.558787, 0.5562982599999999, 0.5...",0.55799,0.05413,gof,37,47,203,0.05413,wt
SQSTM1,"[0.6093229800000001, 0.60441276, 0.62813656000...",0.611023,"[0.5574067199999999, 0.5751299400000001, 0.551...",0.560122,0.050901,gof,38,48,207,0.050901,wt
EIF4E,"[0.60474806, 0.61052928, 0.61162006, 0.62031448]",0.611803,"[0.56270074, 0.5537719600000001, 0.56142886, 0...",0.56131,0.050493,lof,31,39,218,0.050493,wt
NCOA3,"[0.63429278, 0.6277913199999999, 0.63004662, 0...",0.628915,"[0.58624634, 0.5741716800000001, 0.5782351, 0....",0.578937,0.049978,gof,76,96,158,0.049978,wt
CDKN2A,"[0.6263628000000001, 0.6584372, 0.64522464, 0....",0.642079,"[0.58637964, 0.5922116399999999, 0.59791374, 0...",0.592519,0.049559,lof,65,156,82,0.049559,lof
KRAS,"[0.6363911600000001, 0.6471104799999999, 0.646...",0.647522,"[0.6053329799999999, 0.6093973800000001, 0.590...",0.598919,0.048603,gof,55,69,163,0.048603,lof
KIT,"[0.60010862, 0.6026683799999999, 0.6020027, 0....",0.603946,"[0.56143316, 0.5361224, 0.56791242, 0.55701084]",0.55562,0.048326,lof,26,33,211,0.048326,wt
ID1,"[0.62373718, 0.63197644, 0.6373059400000001, 0...",0.628474,"[0.5755983600000001, 0.58628376, 0.57876868, 0...",0.580981,0.047493,gof,68,86,174,0.047493,wt
ABRAXAS1,"[0.60704062, 0.61300674, 0.61624818, 0.61018072]",0.611619,"[0.56925828, 0.5722387, 0.5553312, 0.56575294]",0.565645,0.045974,lof,32,40,218,0.045974,wt


In [11]:
# Per-gene spread of `diff` (largest minus smallest value across that gene's rows),
# ordered from widest to narrowest.
diff_by_gene = combined.reset_index().groupby(["index"])["diff"]
max_diff = diff_by_gene.apply(
    lambda diffs: diffs.max() - diffs.min()
).sort_values(ascending=False)

# Show the rows for the genes ranked 21st-40th by that spread.
combined.loc[max_diff.index[20:40]]

Unnamed: 0_level_0,test_stats,test_mean,wt_stats,wt_mean,diff,search_mode,n_sample_bootstrap,n_test,n_wt,abs_diff,tp53
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
MLLT3,"[0.64240092, 0.6418101799999999, 0.64497746000...",0.646305,"[0.62175182, 0.6245550799999999, 0.61889428, 0...",0.61991,0.026394,lof,92,115,132,0.026394,lof
MLLT3,"[0.58772836, 0.5893475599999999, 0.5881193, 0....",0.587409,"[0.57861544, 0.5595070600000001, 0.57431484, 0...",0.569279,0.01813,gof,24,30,132,0.01813,lof
MLLT3,"[0.58184904, 0.5867420400000001, 0.57904778, 0...",0.583724,"[0.61873388, 0.5885548199999999, 0.58749318000...",0.597408,-0.013684,lof,50,63,182,0.013684,wt
PMAIP1,"[0.6122350999999999, 0.6068483, 0.6112745, 0.6...",0.609358,"[0.58403516, 0.57590588, 0.57809676, 0.57402352]",0.578015,0.031343,lof,47,59,200,0.031343,wt
PMAIP1,"[0.63161926, 0.63501882, 0.62728862, 0.62520488]",0.629783,"[0.6330250599999999, 0.64014102, 0.63845296, 0...",0.638021,-0.008238,lof,93,131,117,0.008238,lof
PGR,"[0.59107244, 0.58729894, 0.58642592, 0.5780429...",0.58571,"[0.5633635, 0.54151102, 0.55199094, 0.56979276]",0.556665,0.029046,gof,23,29,195,0.029046,wt
PGR,"[0.57870358, 0.56755086, 0.57165594, 0.55337718]",0.567822,"[0.5633635, 0.54151102, 0.55199094, 0.56979276]",0.556665,0.011157,lof,23,29,195,0.011157,wt
PGR,"[0.58293486, 0.58416432, 0.58541984, 0.58048396]",0.583251,"[0.58489462, 0.60911414, 0.58914782, 0.59093264]",0.593522,-0.010272,lof,39,49,146,0.010272,lof
PGR,"[0.62449188, 0.6155520400000001, 0.62276990000...",0.62211,"[0.6141131400000001, 0.6152809, 0.61656356, 0....",0.613671,0.008438,gof,55,69,146,0.008438,lof
RUNX1T1,"[0.6256090200000001, 0.6251518, 0.6174759, 0.6...",0.622925,"[0.58996188, 0.57960374, 0.5762273, 0.578447]",0.58106,0.041865,gof,62,78,166,0.041865,wt


In [12]:
# Hand-picked gene symbols.
interesting_genes = [
    "SETBP1",
    "PTPRD",
    "PTPN2",
    "JAK2",
    "LYN",
    "BRCA2",
    "PDGFB",
    "SMARCA2",
    "BMPR1A",
    "MDM4",
    "MDM2",
    "BTG2",
    "CDKN2A",
    "CDKN2B",
    "CDKN2C",
    "KRAS",
    "CCND2",
]

In [6]:
# 32-bootstrap runs for the top genes from the 4-bootstrap screen. Each TP53
# background gets three runs: "lof", "amp", and "amp" with filter_amp=True.
def _bootstrap_top_genes(genes, models, search_mode, **extra):
    """Call bootstrap_stats with this cell's fixed settings (32 bootstraps, 32 workers).

    `extra` is forwarded unchanged; it is only used to pass filter_amp=True.
    """
    return bootstrap_stats(
        genes_of_interest=genes,
        candidate_models=models,
        search_mode=search_mode,
        dependency_data=crispr_effect,
        cnv_data=cnv_data,
        mutation_data=mutation_data,
        eval_function=genome_proximity_bias_score,
        eval_kwargs={"n_samples": 500, "n_trials": 200, "return_samples": False},
        n_bootstrap=32,
        n_workers=32,
        **extra,
    )


# top on TP53 LOF

res_top_lof_tp53_lof = _bootstrap_top_genes(top_lof_genes, tp53_lof_models, "lof")
res_top_lof_tp53_lof.to_csv("outputs/top_lof_bootstrap_results_tp53_lof.csv")

res_top_amp_tp53_lof = _bootstrap_top_genes(top_lof_genes, tp53_lof_models, "amp")
res_top_amp_tp53_lof.to_csv("outputs/top_amp_bootstrap_results_tp53_lof.csv")

res_top_amp_filtered_tp53_lof = _bootstrap_top_genes(
    top_lof_genes, tp53_lof_models, "amp", filter_amp=True
)
res_top_amp_filtered_tp53_lof.to_csv("outputs/top_amp_filtered_bootstrap_results_tp53_lof.csv")

# top on TP53 WT

res_top_lof_tp53_wt = _bootstrap_top_genes(top_wt_genes, tp53_wt_models, "lof")
res_top_lof_tp53_wt.to_csv("outputs/top_lof_bootstrap_results_tp53_wt.csv")

res_top_amp_tp53_wt = _bootstrap_top_genes(top_wt_genes, tp53_wt_models, "amp")
res_top_amp_tp53_wt.to_csv("outputs/top_amp_bootstrap_results_tp53_wt.csv")

res_top_amp_filtered_tp53_wt = _bootstrap_top_genes(
    top_wt_genes, tp53_wt_models, "amp", filter_amp=True
)
res_top_amp_filtered_tp53_wt.to_csv("outputs/top_amp_filtered_bootstrap_results_tp53_wt.csv")

Stats for EZH1 computed in 137.63942313194275 - diff is 0.034011618124999976, 182 wt and 26 lof
Stats for FAS computed in 169.51563358306885 - diff is -0.0310170556249999, 203 wt and 55 lof
Stats for FGFR2 computed in 264.0790464878082 - diff is -0.029663935624999915, 193 wt and 58 lof
Stats for KLF5 computed in 188.35507822036743 - diff is 0.021276477500000057, 151 wt and 83 lof
Stats for LMO1 computed in 144.508061170578 - diff is -0.0053428343749999385, 189 wt and 60 lof
Stats for LMO2 computed in 169.36320424079895 - diff is -0.00688530312500002, 196 wt and 41 lof
Stats for LRP6 computed in 144.90322923660278 - diff is 0.002992931875000049, 185 wt and 38 lof
Stats for IL6ST computed in 432.93234515190125 - diff is -0.011289401249999997, 202 wt and 36 lof
Stats for ACKR3 computed in 557.3886315822601 - diff is -0.010992769374999933, 219 wt and 42 lof
Stats for AXIN1 computed in 556.4122018814087 - diff is 0.02310005062500009, 188 wt and 55 lof
Stats for ATXN7 computed in 559.4159209

In [7]:
# TSG baseline
# 32-bootstrap runs for every gene in `tsgs` across all models (columns) of
# crispr_effect: "lof", "amp", and "amp" with filter_amp=True.

res_tsg_lof = bootstrap_stats(
    genes_of_interest=tsgs,
    dependency_data=crispr_effect,
    cnv_data=cnv_data,
    mutation_data=mutation_data,
    candidate_models=list(crispr_effect.columns),
    search_mode="lof",
    n_bootstrap=32,
    eval_function=genome_proximity_bias_score,
    eval_kwargs={"n_samples": 500, "n_trials": 200, "return_samples": False},
    n_workers=16,
)
# Extension fixed from ".cv" to ".csv" so it matches every other result file.
res_tsg_lof.to_csv("outputs/TSG_lof_bootstrap_results.csv")

res_tsg_amp = bootstrap_stats(
    genes_of_interest=tsgs,
    dependency_data=crispr_effect,
    cnv_data=cnv_data,
    mutation_data=mutation_data,
    candidate_models=list(crispr_effect.columns),
    search_mode="amp",
    n_bootstrap=32,
    eval_function=genome_proximity_bias_score,
    eval_kwargs={"n_samples": 500, "n_trials": 200, "return_samples": False},
    n_workers=16,
)
res_tsg_amp.to_csv("outputs/TSG_amp_bootstrap_results.csv")

res_tsg_amp_filtered = bootstrap_stats(
    genes_of_interest=tsgs,
    dependency_data=crispr_effect,
    cnv_data=cnv_data,
    mutation_data=mutation_data,
    candidate_models=list(crispr_effect.columns),
    search_mode="amp",
    n_bootstrap=32,
    eval_function=genome_proximity_bias_score,
    eval_kwargs={"n_samples": 500, "n_trials": 200, "return_samples": False},
    n_workers=16,
    filter_amp=True,
)
res_tsg_amp_filtered.to_csv("outputs/TSG_amp_filtered_bootstrap_results.csv")

Index(['IGF2', 'TNFSF13'], dtype='object') not found in data.
Stats for AKT1 computed in 225.13407015800476 - diff is -0.01634699124999983, 726 wt and 141 lof
Stats for ABL1 computed in 284.51749205589294 - diff is 0.0004068506249998993, 749 wt and 83 lof
Stats for ACKR3 computed in 315.4099049568176 - diff is 0.004788121249999944, 883 wt and 115 lof
Stats for ABL2 computed in 317.1523172855377 - diff is 0.012153783124999928, 697 wt and 32 lof
Stats for ACVR1 computed in 324.73189520835876 - diff is 0.009110334999999914, 925 wt and 52 lof
Stats for AFF4 computed in 335.74411034584045 - diff is 0.009332527500000243, 839 wt and 69 lof
Stats for ADHFE1 computed in 345.0074973106384 - diff is 0.005396359374999937, 684 wt and 70 lof
Stats for AKT3 computed in 338.197740316391 - diff is 0.03684494250000003, 739 wt and 43 lof
Stats for AGO1 computed in 342.92612075805664 - diff is 0.0023352168750000457, 838 wt and 105 lof
Stats for ALK computed in 339.35913944244385 - diff is 0.03151209750000

In [13]:
# Reload the 32-bootstrap top-gene results and tag each table with the absolute
# effect size and the TP53 background.
# NOTE(review): the *_amp_filtered_* variables are read from the
# "top_amp_bootstrap_results_*" files, not the "top_amp_filtered_bootstrap_results_*"
# files - confirm which one is intended.
res_top_lof_tp53_wt = pd.read_csv(
    "outputs/top_lof_bootstrap_results_tp53_wt.csv", index_col=0
).assign(abs_diff=lambda frame: frame["diff"].abs(), tp53="wt")
res_top_amp_filtered_tp53_wt = pd.read_csv(
    "outputs/top_amp_bootstrap_results_tp53_wt.csv", index_col=0
).assign(abs_diff=lambda frame: frame["diff"].abs(), tp53="wt")

res_top_lof_tp53_lof = pd.read_csv(
    "outputs/top_lof_bootstrap_results_tp53_lof.csv", index_col=0
).assign(abs_diff=lambda frame: frame["diff"].abs(), tp53="lof")
res_top_amp_filtered_tp53_lof = pd.read_csv(
    "outputs/top_amp_bootstrap_results_tp53_lof.csv", index_col=0
).assign(abs_diff=lambda frame: frame["diff"].abs(), tp53="lof")

In [14]:
# Stack the four reloaded top-gene tables (WT first, then LOF) into one frame.
top_result_tables = [
    res_top_lof_tp53_wt,
    res_top_amp_filtered_tp53_wt,
    res_top_lof_tp53_lof,
    res_top_amp_filtered_tp53_lof,
]
overall = pd.concat(top_result_tables, axis=0)

In [15]:
# Display the 50 rows of `overall` with the largest abs_diff.
overall.sort_values("abs_diff", ascending=False).head(50)

Unnamed: 0,test_stats,test_mean,wt_stats,wt_mean,diff,search_mode,n_sample_bootstrap,n_test,n_wt,abs_diff,tp53
FGFR1,"[0.59848552, 0.6037449599999999, 0.60150987999...",0.606195,"[0.5552127400000001, 0.55296118, 0.55747576, 0...",0.558806,0.047389,lof,29,37,184,0.047389,wt
TLX3,"[0.606572, 0.6132278000000001, 0.61197886, 0.5...",0.610859,"[0.56095458, 0.57078382, 0.5584163799999999, 0...",0.566011,0.044848,gof,39,49,200,0.044848,wt
PRPF8,"[0.5859033000000001, 0.58013646, 0.58151644, 0...",0.575322,"[0.6202291, 0.62118092, 0.6188679, 0.6209395, ...",0.620159,-0.044836,lof,26,226,33,0.044836,lof
KRAS,"[0.6363911600000001, 0.6471104799999999, 0.646...",0.648995,"[0.6053329799999999, 0.6093973800000001, 0.590...",0.604169,0.044826,gof,55,69,163,0.044826,lof
SQSTM1,"[0.6093229800000001, 0.60441276, 0.62813656000...",0.612726,"[0.5574067199999999, 0.5751299400000001, 0.551...",0.568958,0.043768,gof,38,48,207,0.043768,wt
NCOA3,"[0.63429278, 0.6277913199999999, 0.63004662, 0...",0.629136,"[0.58624634, 0.5741716800000001, 0.5782351, 0....",0.586142,0.042994,gof,76,96,158,0.042994,wt
ABRAXAS1,"[0.60704062, 0.61300674, 0.61624818, 0.6101807...",0.61033,"[0.56925828, 0.5722387, 0.5553312, 0.56575294,...",0.567359,0.042971,lof,32,40,218,0.042971,wt
PDGFRB,"[0.6149859200000001, 0.6223293000000001, 0.607...",0.614778,"[0.55251984, 0.558787, 0.5562982599999999, 0.5...",0.572362,0.042416,gof,37,47,203,0.042416,wt
CDKN2A,"[0.6263628000000001, 0.6584372, 0.64522464, 0....",0.632635,"[0.58637964, 0.5922116399999999, 0.59791374, 0...",0.590725,0.041909,lof,65,156,82,0.041909,lof
ROBO1,"[0.60539634, 0.59452946, 0.6027562399999999, 0...",0.601087,"[0.56429692, 0.54867576, 0.54145862, 0.5773176...",0.559267,0.04182,lof,26,33,198,0.04182,wt


In [19]:
# Save `overall`, ordered by descending abs_diff. The file is written to the
# working directory rather than to outputs/.
overall.sort_values("abs_diff", ascending=False).to_csv("top_drivers.csv")

In [16]:
# curated genes
# 128-bootstrap runs for a hand-picked gene list. Each TP53 background gets
# three runs: "lof", "amp", and "amp" with filter_amp=True.

curated_genes = [
    "SETBP1", "PTPRD", "PTPN2", "JAK2", "BRCA2", "PDGFB", "SMARCA2", "BMPR1A",
    "MDM4", "MDM2", "BTG2", "CDKN2A", "CDKN2B", "CDKN2C", "KRAS", "CCND2",
]


def _bootstrap_curated(models, search_mode, **extra):
    """Call bootstrap_stats on curated_genes with this cell's fixed settings.

    Uses 128 bootstraps and 16 workers. `extra` is forwarded unchanged; it is
    only used to pass filter_amp=True.
    """
    return bootstrap_stats(
        genes_of_interest=curated_genes,
        candidate_models=models,
        search_mode=search_mode,
        dependency_data=crispr_effect,
        cnv_data=cnv_data,
        mutation_data=mutation_data,
        eval_function=genome_proximity_bias_score,
        eval_kwargs={"n_samples": 500, "n_trials": 200, "return_samples": False},
        n_bootstrap=128,
        n_workers=16,
        **extra,
    )


# curated on TP53 LOF

curated_lof_tp53_lof = _bootstrap_curated(tp53_lof_models, "lof")
curated_lof_tp53_lof.to_csv("outputs/curated_lof_bootstrap_results_tp53_lof.csv")

curated_amp_tp53_lof = _bootstrap_curated(tp53_lof_models, "amp")
curated_amp_tp53_lof.to_csv("outputs/curated_amp_bootstrap_results_tp53_lof.csv")

curated_amp_filtered_tp53_lof = _bootstrap_curated(tp53_lof_models, "amp", filter_amp=True)
curated_amp_filtered_tp53_lof.to_csv("outputs/curated_amp_filtered_bootstrap_results_tp53_lof.csv")

# curated on TP53 WT

curated_lof_tp53_wt = _bootstrap_curated(tp53_wt_models, "lof")
curated_lof_tp53_wt.to_csv("outputs/curated_lof_bootstrap_results_tp53_wt.csv")

curated_amp_tp53_wt = _bootstrap_curated(tp53_wt_models, "amp")
curated_amp_tp53_wt.to_csv("outputs/curated_amp_bootstrap_results_tp53_wt.csv")

curated_amp_filtered_tp53_wt = _bootstrap_curated(tp53_wt_models, "amp", filter_amp=True)
curated_amp_filtered_tp53_wt.to_csv("outputs/curated_amp_filtered_bootstrap_results_tp53_wt.csv")

Stats for CCND2 computed in 610.4305167198181 - diff is 0.019075886093749883, 183 wt and 39 lof
Stats for BMPR1A computed in 760.1028108596802 - diff is -0.025522798906249977, 202 wt and 51 lof
Stats for SETBP1 computed in 789.087973356247 - diff is -0.011357551093750029, 113 wt and 127 lof
Stats for BRCA2 computed in 817.9555413722992 - diff is -0.006058755625000023, 135 wt and 86 lof
Stats for CDKN2C computed in 832.3464379310608 - diff is -0.0008223846874999552, 195 wt and 44 lof
Stats for CDKN2B computed in 839.9164910316467 - diff is 0.035303087968749924, 103 wt and 154 lof
Stats for CDKN2A computed in 848.3205919265747 - diff is 0.04040170265624998, 82 wt and 156 lof
Stats for JAK2 computed in 845.2928907871246 - diff is 0.028510475468749963, 128 wt and 106 lof
Stats for PTPN2 computed in 840.7123999595642 - diff is -0.0187942993750001, 159 wt and 65 lof
Stats for SMARCA2 computed in 837.7150719165802 - diff is 0.019742756874999934, 120 wt and 107 lof
Stats for PTPRD computed in 

In [17]:
# Reload the curated-gene results and tag each table with the absolute effect
# size and the TP53 background.
# NOTE(review): the *_amp_filtered_* variables are read from the
# "curated_amp_bootstrap_results_*" files, not the
# "curated_amp_filtered_bootstrap_results_*" files - confirm which one is intended.
curated_lof_tp53_wt = pd.read_csv(
    "outputs/curated_lof_bootstrap_results_tp53_wt.csv", index_col=0
).assign(abs_diff=lambda frame: frame["diff"].abs(), tp53="wt")
curated_amp_filtered_tp53_wt = pd.read_csv(
    "outputs/curated_amp_bootstrap_results_tp53_wt.csv", index_col=0
).assign(abs_diff=lambda frame: frame["diff"].abs(), tp53="wt")

curated_lof_tp53_lof = pd.read_csv(
    "outputs/curated_lof_bootstrap_results_tp53_lof.csv", index_col=0
).assign(abs_diff=lambda frame: frame["diff"].abs(), tp53="lof")
curated_amp_filtered_tp53_lof = pd.read_csv(
    "outputs/curated_amp_bootstrap_results_tp53_lof.csv", index_col=0
).assign(abs_diff=lambda frame: frame["diff"].abs(), tp53="lof")

In [18]:
# Stack the four curated-gene tables (WT first, then LOF) into one frame.
curated_tables = [
    curated_lof_tp53_wt,
    curated_amp_filtered_tp53_wt,
    curated_lof_tp53_lof,
    curated_amp_filtered_tp53_lof,
]
curated = pd.concat(curated_tables, axis=0)

# Display a compact, gene-sorted view restricted to the summary columns.
summary_columns = ["tp53", "search_mode", "diff", "wt_mean", "test_mean", "n_sample_bootstrap"]
curated.sort_index()[summary_columns]

Unnamed: 0,tp53,search_mode,diff,wt_mean,test_mean,n_sample_bootstrap
BMPR1A,lof,lof,-0.025523,0.605784,0.580261,40
BMPR1A,wt,lof,0.006521,0.558478,0.564998,23
BRCA2,wt,lof,0.026354,0.566966,0.59332,32
BRCA2,lof,gof,-0.004732,0.585152,0.58042,28
BRCA2,lof,lof,-0.006059,0.624225,0.618167,68
BTG2,wt,gof,0.014281,0.592646,0.606927,69
BTG2,lof,gof,-0.015771,0.622446,0.606675,64
CCND2,lof,gof,0.038912,0.589478,0.62839,43
CCND2,wt,gof,0.038394,0.568075,0.60647,35
CCND2,lof,lof,0.019076,0.578461,0.597537,31


In [20]:
# Save `curated`, sorted by index label. The file is written to the working
# directory rather than to outputs/.
curated.sort_index().to_csv("select_drivers.csv")