In [1]:
import pandas as pd

from proxbias.depmap.process import bootstrap_stats, split_models
from proxbias.depmap.load import get_depmap_data
from proxbias.utils.data_utils import get_cancer_gene_lists
from proxbias.metrics import genome_proximity_bias_score

In [2]:
# Load the DepMap tables once; the later cells all reuse these objects.
# The second returned element is discarded. NOTE(review): rnai_release="" presumably
# skips the RNAi release (the logged output of this cell only lists the CRISPR,
# CNV and somatic-mutation files) - confirm against get_depmap_data.
depmap_tables = get_depmap_data(rnai_release="")
crispr_effect, _, cnv_data, mutation_data = depmap_tables

# Two gene lists built from the row index of the CRISPR effect table
# (tsgs: presumably tumor suppressor genes).
cancer_gene_lists = get_cancer_gene_lists(crispr_effect.index)
oncogenes, tsgs = cancer_gene_lists

CRISPRGeneEffect.csv from DepMap Public 22Q4 is found. Reading dataframe from cache.
Done!
OmicsCNGene.csv from DepMap Public 22Q4 is found. Reading dataframe from cache.
Done!
OmicsSomaticMutations.csv from DepMap Public 22Q4 is found. Reading dataframe from cache.
Done!


In [3]:
# Bootstrap the proximity-bias statistic for TP53 in "lof" mode, using every column
# of the CRISPR effect table as a candidate model, then write the result to disk.
# Note - n_workers should likely be around half the number of CPUs
tp53_bootstrap_args = dict(
    genes_of_interest=["TP53"],
    dependency_data=crispr_effect,
    cnv_data=cnv_data,
    mutation_data=mutation_data,
    candidate_models=list(crispr_effect.columns),
    search_mode="lof",
    n_bootstrap=32,
    eval_function=genome_proximity_bias_score,
    # Presumably forwarded to eval_function as keyword arguments - confirm in bootstrap_stats.
    eval_kwargs={"n_samples": 500, "n_trials": 200, "return_samples": False},
    n_workers=1,
    filter_gof=True,
)
res = bootstrap_stats(**tp53_bootstrap_args)
res.to_csv("outputs/TP53_lof_bootstrap_results.csv")

Stats for TP53 computed in 71.97563099861145 - diff is 0.029988173125000017, 278 wt and 277 lof


In [4]:
# Partition the candidate models by TP53 status. split_models returns four values;
# they are unpacked here as (lof, wt, gof, <ignored>).
tp53_split_args = dict(
    gene_symbol="TP53",
    candidate_models=crispr_effect.columns,
    cnv_data=cnv_data,
    mutation_data=mutation_data,
)
tp53_lof_models, tp53_wt_models, tp53_gof_models, _ = split_models(**tp53_split_args)

# Same split with filter_gof=True; only the third (gof) group is kept.
# NOTE(review): tp53_gof_filtered_models is not referenced again in the visible part
# of this notebook - confirm whether the TP53 GOF cells were meant to use it.
_, _, tp53_gof_filtered_models, _ = split_models(filter_gof=True, **tp53_split_args)

In [5]:
# Subset down to the genes that seem to have a strong effect based off of 4 bootstraps


def _screen_in_tp53_wt(genes, search_mode):
    """Run a 4-bootstrap screen of `genes` restricted to `tp53_wt_models`.

    Only the gene list and `search_mode` vary between the calls in this cell;
    every other argument to `bootstrap_stats` is fixed here.
    """
    return bootstrap_stats(
        genes_of_interest=genes,
        dependency_data=crispr_effect,
        cnv_data=cnv_data,
        mutation_data=mutation_data,
        candidate_models=tp53_wt_models,
        search_mode=search_mode,
        n_bootstrap=4,
        eval_function=genome_proximity_bias_score,
        eval_kwargs={"n_samples": 500, "n_trials": 200, "return_samples": False},
        n_workers=16,
    )


all_lof_tp53_wt = _screen_in_tp53_wt(tsgs + oncogenes, "lof")

# NOTE(review): the "gof" screen covers only `tsgs`, whereas the "lof" screen covers
# tsgs + oncogenes - confirm this asymmetry is intended.
all_gof_tp53_wt = _screen_in_tp53_wt(tsgs, "gof")

# Join the two result tables column-wise and persist the combined table.
tp53_wt_screens = [all_lof_tp53_wt, all_gof_tp53_wt]
four_bootstrap_tp53_wt = pd.concat(tp53_wt_screens, axis=1)
four_bootstrap_tp53_wt.to_csv("outputs/four_bootstrap_tp53_wt_results.csv")

Index(['IGF2', 'IRF1', 'TNFSF13'], dtype='object') not found in data.
Stats for ABRAXAS1 computed in 14.315263986587524 - diff is 0.045973784999999934, 218 wt and 40 lof
Stats for AJUBA computed in 14.778756856918335 - diff is 0.0038755550000000305, 200 wt and 38 lof
Stats for AKT1 computed in 14.958284378051758 - diff is 0.0070602450000000205, 192 wt and 42 lof
Stats for AMER1 computed in 15.32546591758728 - diff is 0.01152120500000009, 93 wt and 161 lof
Stats for AR computed in 15.1503324508667 - diff is -0.0020547450000000245, 88 wt and 163 lof
Stats for ARAF computed in 14.471314191818237 - diff is 0.010171694999999925, 88 wt and 164 lof
Stats for ARID1A computed in 14.583870887756348 - diff is -0.002491880000000002, 179 wt and 35 lof
Stats for ARID1B computed in 14.582617282867432 - diff is 0.010847879999999921, 179 wt and 47 lof
Stats for ARID4A computed in 14.334502935409546 - diff is -0.0018389150000001075, 196 wt and 36 lof
Stats for ARID5B computed in 14.635968685150146 - dif

In [None]:
def _screen_in_tp53_lof(genes, search_mode):
    """Run a 4-bootstrap screen of `genes` restricted to `tp53_lof_models`.

    Only the gene list and `search_mode` vary between the calls in this cell;
    every other argument to `bootstrap_stats` is fixed here.
    """
    return bootstrap_stats(
        genes_of_interest=genes,
        dependency_data=crispr_effect,
        cnv_data=cnv_data,
        mutation_data=mutation_data,
        candidate_models=tp53_lof_models,
        search_mode=search_mode,
        n_bootstrap=4,
        eval_function=genome_proximity_bias_score,
        eval_kwargs={"n_samples": 500, "n_trials": 200, "return_samples": False},
        n_workers=8,
    )


all_lof_tp53_lof = _screen_in_tp53_lof(tsgs + oncogenes, "lof")

# NOTE(review): the "gof" screen covers only `tsgs`, whereas the "lof" screen covers
# tsgs + oncogenes - confirm this asymmetry is intended.
all_gof_tp53_lof = _screen_in_tp53_lof(tsgs, "gof")

# Join the two result tables column-wise and persist the combined table.
tp53_lof_screens = [all_lof_tp53_lof, all_gof_tp53_lof]
four_bootstrap_tp53_lof = pd.concat(tp53_lof_screens, axis=1)
four_bootstrap_tp53_lof.to_csv("outputs/four_bootstrap_tp53_lof_results.csv")

In [None]:
# Rank each 4-bootstrap table by the magnitude of its "diff" column and collect the
# union of the top-ranked genes from the TP53 LOF and TP53 WT screens.
N_TOP_GENES_PER_SCREEN = 40  # number of top-ranked rows taken from each table


def _rank_by_abs_diff(stats):
    """Return `stats` with an added `abs_diff` column, sorted by it in descending order.

    NOTE(review): the tables passed in come from pd.concat(..., axis=1) of two
    bootstrap results; if both inputs carry a "diff" column, stats["diff"] is a
    two-column frame rather than a Series - confirm the column names are unique.
    """
    return stats.assign(abs_diff=stats["diff"].abs()).sort_values("abs_diff", ascending=False)


four_bootstrap_tp53_lof = _rank_by_abs_diff(four_bootstrap_tp53_lof)
four_bootstrap_tp53_wt = _rank_by_abs_diff(four_bootstrap_tp53_wt)

# sorted() instead of list(): iterating a set of strings yields an order that can
# change between interpreter runs (hash randomization), so the previous list(...)
# made `top_genes` differ from run to run. Sorting makes the cell reproducible.
top_genes = sorted(
    set(four_bootstrap_tp53_lof.index[:N_TOP_GENES_PER_SCREEN].tolist())
    | set(four_bootstrap_tp53_wt.index[:N_TOP_GENES_PER_SCREEN].tolist())
)

In [None]:
# Show the union of top-ranked genes selected from the two 4-bootstrap screens.
print(top_genes)

In [None]:
# TSG baseline


def _bootstrap_tsg_baseline(search_mode, **extra_kwargs):
    """Bootstrap the proximity-bias statistic for `tsgs` over all models.

    All columns of `crispr_effect` are passed as candidate models. `search_mode`
    and any `extra_kwargs` (e.g. `filter_gof=True`) are forwarded to
    `bootstrap_stats`; every other argument is fixed for this cell.
    """
    return bootstrap_stats(
        genes_of_interest=tsgs,
        dependency_data=crispr_effect,
        cnv_data=cnv_data,
        mutation_data=mutation_data,
        candidate_models=list(crispr_effect.columns),
        search_mode=search_mode,
        n_bootstrap=32,
        eval_function=genome_proximity_bias_score,
        eval_kwargs={"n_samples": 500, "n_trials": 200, "return_samples": False},
        n_workers=16,
        **extra_kwargs,
    )


res_tsg_lof = _bootstrap_tsg_baseline("lof")
# Fixed: the extension was previously misspelled ".cv", unlike every other output file.
res_tsg_lof.to_csv("outputs/TSG_lof_bootstrap_results.csv")

res_tsg_gof = _bootstrap_tsg_baseline("gof")
res_tsg_gof.to_csv("outputs/TSG_gof_bootstrap_results.csv")

res_tsg_gof_filtered = _bootstrap_tsg_baseline("gof", filter_gof=True)
res_tsg_gof_filtered.to_csv("outputs/TSG_gof_filtered_bootstrap_results.csv")

Index(['IGF2', 'TNFSF13'], dtype='object') not found in data.
Stats for ABL1 computed in 41.8803985118866 - diff is 0.00040685062500001035, 749 wt and 83 test
Stats for ABL2 computed in 39.43450140953064 - diff is 0.01215378312500004, 697 wt and 32 test
Stats for ACKR3 computed in 41.62993407249451 - diff is 0.004788121250000055, 883 wt and 115 test
Stats for ACVR1 computed in 43.97074508666992 - diff is 0.009110334999999914, 925 wt and 52 test
Stats for ADHFE1 computed in 41.76025438308716 - diff is 0.005396359374999937, 684 wt and 70 test
Stats for AFF4 computed in 42.89986729621887 - diff is 0.00933252750000002, 839 wt and 69 test
Stats for AGO1 computed in 41.853593587875366 - diff is 0.0023352168750000457, 838 wt and 105 test
Stats for AKT1 computed in 41.58080530166626 - diff is -0.016346991249999943, 726 wt and 141 test
Stats for AKT2 computed in 42.96501183509827 - diff is 0.006857814374999993, 805 wt and 93 test
Stats for AKT3 computed in 42.005114793777466 - diff is 0.0368449

KeyboardInterrupt: 

In [None]:
# TSG on TP53 LOF


def _bootstrap_tsg_in_tp53_lof(search_mode, **extra_kwargs):
    """Bootstrap the proximity-bias statistic for `tsgs` within `tp53_lof_models`.

    `search_mode` and any `extra_kwargs` (e.g. `filter_gof=True`) are forwarded to
    `bootstrap_stats`; every other argument is fixed for this cell.
    """
    return bootstrap_stats(
        genes_of_interest=tsgs,
        dependency_data=crispr_effect,
        cnv_data=cnv_data,
        mutation_data=mutation_data,
        candidate_models=tp53_lof_models,
        search_mode=search_mode,
        n_bootstrap=32,
        eval_function=genome_proximity_bias_score,
        eval_kwargs={"n_samples": 500, "n_trials": 200, "return_samples": False},
        n_workers=16,
        **extra_kwargs,
    )


res_tsg_lof = _bootstrap_tsg_in_tp53_lof("lof")
res_tsg_lof.to_csv("outputs/TSG_lof_bootstrap_results_tp53_lof.csv")

res_tsg_gof = _bootstrap_tsg_in_tp53_lof("gof")
res_tsg_gof.to_csv("outputs/TSG_gof_bootstrap_results_tp53_lof.csv")

res_tsg_gof_filtered = _bootstrap_tsg_in_tp53_lof("gof", filter_gof=True)
res_tsg_gof_filtered.to_csv("outputs/TSG_gof_filtered_bootstrap_results_tp53_lof.csv")

In [None]:
# TSG on TP53 WT


def _bootstrap_tsg_in_tp53_wt(search_mode, **extra_kwargs):
    """Bootstrap the proximity-bias statistic for `tsgs` within `tp53_wt_models`.

    `search_mode` and any `extra_kwargs` (e.g. `filter_gof=True`) are forwarded to
    `bootstrap_stats`; every other argument is fixed for this cell.
    """
    return bootstrap_stats(
        genes_of_interest=tsgs,
        dependency_data=crispr_effect,
        cnv_data=cnv_data,
        mutation_data=mutation_data,
        candidate_models=tp53_wt_models,
        search_mode=search_mode,
        n_bootstrap=32,
        eval_function=genome_proximity_bias_score,
        eval_kwargs={"n_samples": 500, "n_trials": 200, "return_samples": False},
        n_workers=16,
        **extra_kwargs,
    )


res_tsg_lof = _bootstrap_tsg_in_tp53_wt("lof")
res_tsg_lof.to_csv("outputs/TSG_lof_bootstrap_results_tp53_wt.csv")

res_tsg_gof = _bootstrap_tsg_in_tp53_wt("gof")
res_tsg_gof.to_csv("outputs/TSG_gof_bootstrap_results_tp53_wt.csv")

res_tsg_gof_filtered = _bootstrap_tsg_in_tp53_wt("gof", filter_gof=True)
res_tsg_gof_filtered.to_csv("outputs/TSG_gof_filtered_bootstrap_results_tp53_wt.csv")

In [None]:
# TSG on TP53 GOF


def _bootstrap_tsg_in_tp53_gof(search_mode, **extra_kwargs):
    """Bootstrap the proximity-bias statistic for `tsgs` within `tp53_gof_models`.

    `search_mode` and any `extra_kwargs` (e.g. `filter_gof=True`) are forwarded to
    `bootstrap_stats`; every other argument is fixed for this cell.
    """
    return bootstrap_stats(
        genes_of_interest=tsgs,
        dependency_data=crispr_effect,
        cnv_data=cnv_data,
        mutation_data=mutation_data,
        candidate_models=tp53_gof_models,
        search_mode=search_mode,
        n_bootstrap=32,
        eval_function=genome_proximity_bias_score,
        eval_kwargs={"n_samples": 500, "n_trials": 200, "return_samples": False},
        n_workers=16,
        **extra_kwargs,
    )


res_tsg_lof = _bootstrap_tsg_in_tp53_gof("lof")
res_tsg_lof.to_csv("outputs/TSG_lof_bootstrap_results_tp53_gof.csv")

res_tsg_gof = _bootstrap_tsg_in_tp53_gof("gof")
res_tsg_gof.to_csv("outputs/TSG_gof_bootstrap_results_tp53_gof.csv")

res_tsg_gof_filtered = _bootstrap_tsg_in_tp53_gof("gof", filter_gof=True)
res_tsg_gof_filtered.to_csv("outputs/TSG_gof_filtered_bootstrap_results_tp53_gof.csv")

In [None]:
# Oncogenes baseline


def _bootstrap_oncogene_baseline(search_mode, **extra_kwargs):
    """Bootstrap the proximity-bias statistic for `oncogenes` over all models.

    All columns of `crispr_effect` are passed as candidate models. `search_mode`
    and any `extra_kwargs` (e.g. `filter_gof=True`) are forwarded to
    `bootstrap_stats`; every other argument is fixed for this cell.
    """
    return bootstrap_stats(
        genes_of_interest=oncogenes,
        dependency_data=crispr_effect,
        cnv_data=cnv_data,
        mutation_data=mutation_data,
        candidate_models=list(crispr_effect.columns),
        search_mode=search_mode,
        n_bootstrap=32,
        eval_function=genome_proximity_bias_score,
        eval_kwargs={"n_samples": 500, "n_trials": 200, "return_samples": False},
        n_workers=16,
        **extra_kwargs,
    )


res_oncogene_lof = _bootstrap_oncogene_baseline("lof")
res_oncogene_lof.to_csv("outputs/oncogene_lof_bootstrap_results.csv")

res_oncogene_gof = _bootstrap_oncogene_baseline("gof")
res_oncogene_gof.to_csv("outputs/oncogene_gof_bootstrap_results.csv")

res_oncogene_gof_filtered = _bootstrap_oncogene_baseline("gof", filter_gof=True)
res_oncogene_gof_filtered.to_csv("outputs/oncogene_gof_filtered_bootstrap_results.csv")

In [None]:
# Oncogenes on TP53 LOF


def _bootstrap_oncogene_in_tp53_lof(search_mode, **extra_kwargs):
    """Bootstrap the proximity-bias statistic for `oncogenes` within `tp53_lof_models`.

    `search_mode` and any `extra_kwargs` (e.g. `filter_gof=True`) are forwarded to
    `bootstrap_stats`; every other argument is fixed for this cell.
    """
    return bootstrap_stats(
        genes_of_interest=oncogenes,
        dependency_data=crispr_effect,
        cnv_data=cnv_data,
        mutation_data=mutation_data,
        candidate_models=tp53_lof_models,
        search_mode=search_mode,
        n_bootstrap=32,
        eval_function=genome_proximity_bias_score,
        eval_kwargs={"n_samples": 500, "n_trials": 200, "return_samples": False},
        n_workers=16,
        **extra_kwargs,
    )


res_oncogene_lof = _bootstrap_oncogene_in_tp53_lof("lof")
res_oncogene_lof.to_csv("outputs/oncogene_lof_bootstrap_results_tp53_lof.csv")

res_oncogene_gof = _bootstrap_oncogene_in_tp53_lof("gof")
res_oncogene_gof.to_csv("outputs/oncogene_gof_bootstrap_results_tp53_lof.csv")

res_oncogene_gof_filtered = _bootstrap_oncogene_in_tp53_lof("gof", filter_gof=True)
res_oncogene_gof_filtered.to_csv("outputs/oncogene_gof_filtered_bootstrap_results_tp53_lof.csv")

In [None]:
# Oncogenes on TP53 WT


def _bootstrap_oncogene_in_tp53_wt(search_mode, **extra_kwargs):
    """Bootstrap the proximity-bias statistic for `oncogenes` within `tp53_wt_models`.

    `search_mode` and any `extra_kwargs` (e.g. `filter_gof=True`) are forwarded to
    `bootstrap_stats`; every other argument is fixed for this cell.
    """
    return bootstrap_stats(
        genes_of_interest=oncogenes,
        dependency_data=crispr_effect,
        cnv_data=cnv_data,
        mutation_data=mutation_data,
        candidate_models=tp53_wt_models,
        search_mode=search_mode,
        n_bootstrap=32,
        eval_function=genome_proximity_bias_score,
        eval_kwargs={"n_samples": 500, "n_trials": 200, "return_samples": False},
        n_workers=16,
        **extra_kwargs,
    )


res_oncogene_lof = _bootstrap_oncogene_in_tp53_wt("lof")
res_oncogene_lof.to_csv("outputs/oncogene_lof_bootstrap_results_tp53_wt.csv")

res_oncogene_gof = _bootstrap_oncogene_in_tp53_wt("gof")
res_oncogene_gof.to_csv("outputs/oncogene_gof_bootstrap_results_tp53_wt.csv")

res_oncogene_gof_filtered = _bootstrap_oncogene_in_tp53_wt("gof", filter_gof=True)
res_oncogene_gof_filtered.to_csv("outputs/oncogene_gof_filtered_bootstrap_results_tp53_wt.csv")

In [None]:
# Oncogenes on TP53 GOF


def _bootstrap_oncogene_in_tp53_gof(search_mode, **extra_kwargs):
    """Bootstrap the proximity-bias statistic for `oncogenes` within `tp53_gof_models`.

    `search_mode` and any `extra_kwargs` (e.g. `filter_gof=True`) are forwarded to
    `bootstrap_stats`; every other argument is fixed for this cell.
    """
    return bootstrap_stats(
        genes_of_interest=oncogenes,
        dependency_data=crispr_effect,
        cnv_data=cnv_data,
        mutation_data=mutation_data,
        candidate_models=tp53_gof_models,
        search_mode=search_mode,
        n_bootstrap=32,
        eval_function=genome_proximity_bias_score,
        eval_kwargs={"n_samples": 500, "n_trials": 200, "return_samples": False},
        n_workers=16,
        **extra_kwargs,
    )


res_oncogene_lof = _bootstrap_oncogene_in_tp53_gof("lof")
res_oncogene_lof.to_csv("outputs/oncogene_lof_bootstrap_results_tp53_gof.csv")

res_oncogene_gof = _bootstrap_oncogene_in_tp53_gof("gof")
res_oncogene_gof.to_csv("outputs/oncogene_gof_bootstrap_results_tp53_gof.csv")

res_oncogene_gof_filtered = _bootstrap_oncogene_in_tp53_gof("gof", filter_gof=True)
res_oncogene_gof_filtered.to_csv("outputs/oncogene_gof_filtered_bootstrap_results_tp53_gof.csv")