In [None]:
import ast
from pathlib import Path

import pandas as pd

from proxbias.depmap.load import get_depmap_data
from proxbias.depmap.process import compute_monte_carlo_stats, split_models
from proxbias.metrics import genome_proximity_bias_score
from proxbias.utils.data_utils import get_cancer_gene_lists

In [None]:
# Load the DepMap tables used by every later cell. The second returned item is
# not needed in this notebook and is discarded.
# NOTE(review): rnai_release="" presumably skips the RNAi release - confirm in get_depmap_data.
depmap_tables = get_depmap_data(rnai_release="")
crispr_effect, _, cnv_data, mutation_data = depmap_tables

# Oncogene / tumour-suppressor lists, derived from the row index of the CRISPR effect matrix.
cancer_gene_lists = get_cancer_gene_lists(crispr_effect.index)
oncogenes, tsgs = cancer_gene_lists

In [None]:
# Arguments common to both TP53 splits: every model (column) of the CRISPR
# effect matrix is a candidate.
tp53_split_kwargs = dict(
    gene_symbol="TP53",
    candidate_models=crispr_effect.columns,
    cnv_data=cnv_data,
    mutation_data=mutation_data,
)

# Default split: keep the first three returned groups (LOF, WT, amplified).
tp53_lof_models, tp53_wt_models, tp53_amp_models, _ = split_models(**tp53_split_kwargs)

# Same split with filter_amp=True; only the amplified group is kept.
_, _, tp53_amp_filtered_models, _ = split_models(**tp53_split_kwargs, filter_amp=True)

In [None]:
# Monte Carlo statistics for TP53 itself (search_mode="lof"), with every model
# in the CRISPR effect matrix as a candidate.
# Note - n_workers should likely be around half the number of CPUs

# DataFrame.to_csv does not create missing directories, so make sure the
# results folder exists before starting the (long) computation.
Path("outputs").mkdir(parents=True, exist_ok=True)

res = compute_monte_carlo_stats(
    genes_of_interest=["TP53"],
    dependency_data=crispr_effect,
    cnv_data=cnv_data,
    mutation_data=mutation_data,
    candidate_models=list(crispr_effect.columns),
    search_mode="lof",
    n_iterations=128,
    eval_function=genome_proximity_bias_score,
    eval_kwargs={"n_samples": 500, "n_trials": 200, "return_samples": False},
    n_workers=1,
    filter_amp=True,
)
res.to_csv("outputs/TP53_lof_results.csv")

In [None]:
def _run_fixed_sampling(candidate_models, search_mode, **extra):
    """Run compute_monte_carlo_stats over all TSGs + oncogenes with fixed cell-line sampling.

    Every run in this cell uses 32 iterations, 64 workers and
    fixed_cell_line_sampling=True; only the candidate models, the search mode
    and any extra keyword (e.g. filter_amp) differ between calls.
    """
    return compute_monte_carlo_stats(
        genes_of_interest=tsgs + oncogenes,
        dependency_data=crispr_effect,
        cnv_data=cnv_data,
        mutation_data=mutation_data,
        candidate_models=candidate_models,
        search_mode=search_mode,
        n_iterations=32,
        eval_function=genome_proximity_bias_score,
        eval_kwargs={"n_samples": 500, "n_trials": 200, "return_samples": False},
        n_workers=64,
        fixed_cell_line_sampling=True,
        **extra,
    )


# --- TP53 LOF models ---
fixed_sampling_lof_tp53_lof = _run_fixed_sampling(tp53_lof_models, "lof")
fixed_sampling_lof_tp53_lof.to_csv("outputs/fixed_sampling_lof_results_tp53_lof.csv")

fixed_sampling_amp_filtered_tp53_lof = _run_fixed_sampling(tp53_lof_models, "amp", filter_amp=True)
fixed_sampling_amp_filtered_tp53_lof.to_csv("outputs/fixed_sampling_amp_filtered_results_tp53_lof.csv")

# --- TP53 WT models ---
fixed_sampling_lof_tp53_wt = _run_fixed_sampling(tp53_wt_models, "lof")
fixed_sampling_lof_tp53_wt.to_csv("outputs/fixed_sampling_lof_results_tp53_wt.csv")

fixed_sampling_amp_filtered_tp53_wt = _run_fixed_sampling(tp53_wt_models, "amp", filter_amp=True)
fixed_sampling_amp_filtered_tp53_wt.to_csv("outputs/fixed_sampling_amp_filtered_results_tp53_wt.csv")

In [None]:
def _load_fixed_sampling(csv_path, tp53_status):
    """Read one saved fixed-sampling table and tag it with its TP53 background."""
    frame = pd.read_csv(csv_path, index_col=0)
    frame["tp53"] = tp53_status
    return frame


# Reload the four result tables from disk so this cell can run without
# repeating the Monte Carlo computation.
fixed_sampling_lof_tp53_lof = _load_fixed_sampling("outputs/fixed_sampling_lof_results_tp53_lof.csv", "lof")
fixed_sampling_amp_filtered_tp53_lof = _load_fixed_sampling(
    "outputs/fixed_sampling_amp_filtered_results_tp53_lof.csv", "lof"
)
fixed_sampling_lof_tp53_wt = _load_fixed_sampling("outputs/fixed_sampling_lof_results_tp53_wt.csv", "wt")
fixed_sampling_amp_filtered_tp53_wt = _load_fixed_sampling(
    "outputs/fixed_sampling_amp_filtered_results_tp53_wt.csv", "wt"
)

# Stack everything into one table, add |diff| and persist it.
fixed_sampling_frames = [
    fixed_sampling_lof_tp53_lof,
    fixed_sampling_amp_filtered_tp53_lof,
    fixed_sampling_lof_tp53_wt,
    fixed_sampling_amp_filtered_tp53_wt,
]
combined = pd.concat(fixed_sampling_frames)
combined["abs_diff"] = combined["diff"].abs()
combined.to_csv("outputs/fixed_sampling.csv")

# For each (TP53 background, search mode) pair print two gene lists:
#   "bottom": diff <= -0.01, ordered by increasing |diff|
#   "top":    diff >=  0.02, ordered by decreasing |diff|
for tp53 in ["lof", "wt"]:
    for search_mode in ["lof", "amp"]:
        for ascending in [True, False]:
            print(f"tp53: {tp53}, search_mode: {search_mode}, {'bottom' if ascending else 'top'} \n \n")
            group_mask = (combined["tp53"] == tp53) & (combined["search_mode"] == search_mode)
            cutoff_mask = (combined["diff"] <= -0.01) if ascending else (combined["diff"] >= 0.02)
            candidates = combined.loc[group_mask & cutoff_mask]
            for gene in candidates.sort_values("abs_diff", ascending=ascending).index:
                print(f"{gene},")
            print("\n\n\n")

In [None]:
# Index labels of amplification-mode rows with diff <= -0.01, ordered from
# largest to smallest |diff|.
amp_negative_mask = (combined["search_mode"] == "amp") & (combined["diff"] <= -0.01)
interesting = combined.loc[amp_negative_mask].sort_values("abs_diff", ascending=False).index

In [None]:
# For every label in `interesting`, measure the spread (max - min) of |diff|
# across all of its rows in `combined`, largest spread first.
abs_diff_spread = (
    combined.loc[interesting]
    .reset_index()
    .sort_values("abs_diff", ascending=False)
    .groupby("index")["abs_diff"]
    .apply(lambda values: values.max() - values.min())
    .sort_values(ascending=False)
)

# Keep the five labels with the largest spread and display their rows.
interesting_subset = abs_diff_spread.index[:5]

combined.loc[interesting_subset]

In [None]:
# Subset down to the genes that seem to have a strong effect based off of 4 iterations


def _run_four_iter_wt(search_mode, **extra):
    """Run a 4-iteration compute_monte_carlo_stats pass over all TSGs + oncogenes on TP53 WT models."""
    return compute_monte_carlo_stats(
        genes_of_interest=tsgs + oncogenes,
        dependency_data=crispr_effect,
        cnv_data=cnv_data,
        mutation_data=mutation_data,
        candidate_models=tp53_wt_models,
        search_mode=search_mode,
        n_iterations=4,
        eval_function=genome_proximity_bias_score,
        eval_kwargs={"n_samples": 500, "n_trials": 200, "return_samples": False},
        n_workers=32,
        **extra,
    )


all_lof_tp53_wt = _run_four_iter_wt("lof")
all_amp_tp53_wt = _run_four_iter_wt("amp", filter_amp=True)

# Stack the LOF and amplification results row-wise and save them.
four_iteration_tp53_wt = pd.concat([all_lof_tp53_wt, all_amp_tp53_wt], axis=0)
four_iteration_tp53_wt.to_csv("outputs/four_iteration_tp53_wt_results.csv")

In [None]:
def _run_four_iter_lof(search_mode, **extra):
    """Run a 4-iteration compute_monte_carlo_stats pass over all TSGs + oncogenes on TP53 LOF models."""
    return compute_monte_carlo_stats(
        genes_of_interest=tsgs + oncogenes,
        dependency_data=crispr_effect,
        cnv_data=cnv_data,
        mutation_data=mutation_data,
        candidate_models=tp53_lof_models,
        search_mode=search_mode,
        n_iterations=4,
        eval_function=genome_proximity_bias_score,
        eval_kwargs={"n_samples": 500, "n_trials": 200, "return_samples": False},
        n_workers=32,
        **extra,
    )


all_lof_tp53_lof = _run_four_iter_lof("lof")
all_amp_tp53_lof = _run_four_iter_lof("amp", filter_amp=True)

# Stack the LOF and amplification results row-wise and save them.
four_iteration_tp53_lof = pd.concat([all_lof_tp53_lof, all_amp_tp53_lof], axis=0)
four_iteration_tp53_lof.to_csv("outputs/four_iteration_tp53_lof_results.csv")

In [None]:
def _load_four_iter(csv_path, tp53_status):
    """Read a saved 4-iteration table and append |diff| plus the TP53 background label."""
    return pd.read_csv(csv_path, index_col=0).assign(
        abs_diff=lambda frame: frame["diff"].abs(),
        tp53=tp53_status,
    )


four_iteration_tp53_lof = _load_four_iter("outputs/four_iteration_tp53_lof_results.csv", "lof")
four_iteration_tp53_wt = _load_four_iter("outputs/four_iteration_tp53_wt_results.csv", "wt")

# One table for both backgrounds, strongest |diff| first.
combined = pd.concat([four_iteration_tp53_lof, four_iteration_tp53_wt], axis=0).sort_values(
    "abs_diff", ascending=False
)

# First 100 distinct index labels per TP53 background, in |diff| order.
lof_rows = combined.loc[combined["tp53"] == "lof"]
wt_rows = combined.loc[combined["tp53"] == "wt"]
top_lof_genes = list(lof_rows.index.unique()[:100])
top_wt_genes = list(wt_rows.index.unique()[:100])

In [None]:
# Persist the current `combined` table to the outputs folder.
combined.to_csv("outputs/combined_four_results.csv")

In [None]:
# Display every row of `combined` whose index label is MDM4.
combined.loc["MDM4"]

In [None]:
# Display the first 50 rows of `combined`.
combined.head(50)

In [None]:
# Spread of `diff` (max - min) across all rows sharing an index label,
# largest spread first.
diff_by_label = combined.reset_index().groupby(["index"])["diff"]
max_diff = diff_by_label.apply(lambda values: values.max() - values.min()).sort_values(ascending=False)

# Show the rows belonging to the 20 labels with the widest spread.
combined.loc[max_diff.index[:20]]

In [None]:
def _run_top_genes(genes, candidate_models, search_mode, **extra):
    """Run a 32-iteration, 32-worker compute_monte_carlo_stats pass for the given genes and models."""
    return compute_monte_carlo_stats(
        genes_of_interest=genes,
        dependency_data=crispr_effect,
        cnv_data=cnv_data,
        mutation_data=mutation_data,
        candidate_models=candidate_models,
        search_mode=search_mode,
        n_iterations=32,
        eval_function=genome_proximity_bias_score,
        eval_kwargs={"n_samples": 500, "n_trials": 200, "return_samples": False},
        n_workers=32,
        **extra,
    )


# top on TP53 LOF: LOF search, amplification search, amplification search with filter_amp=True

res_top_lof_tp53_lof = _run_top_genes(top_lof_genes, tp53_lof_models, "lof")
res_top_lof_tp53_lof.to_csv("outputs/top_lof_results_tp53_lof.csv")

res_top_amp_tp53_lof = _run_top_genes(top_lof_genes, tp53_lof_models, "amp")
res_top_amp_tp53_lof.to_csv("outputs/top_amp_results_tp53_lof.csv")

res_top_amp_filtered_tp53_lof = _run_top_genes(top_lof_genes, tp53_lof_models, "amp", filter_amp=True)
res_top_amp_filtered_tp53_lof.to_csv("outputs/top_amp_filtered_results_tp53_lof.csv")

# top on TP53 WT: the same three runs on the WT models

res_top_lof_tp53_wt = _run_top_genes(top_wt_genes, tp53_wt_models, "lof")
res_top_lof_tp53_wt.to_csv("outputs/top_lof_results_tp53_wt.csv")

res_top_amp_tp53_wt = _run_top_genes(top_wt_genes, tp53_wt_models, "amp")
res_top_amp_tp53_wt.to_csv("outputs/top_amp_results_tp53_wt.csv")

res_top_amp_filtered_tp53_wt = _run_top_genes(top_wt_genes, tp53_wt_models, "amp", filter_amp=True)
res_top_amp_filtered_tp53_wt.to_csv("outputs/top_amp_filtered_results_tp53_wt.csv")

In [None]:
def _load_top_results(csv_path, tp53_status):
    """Read one saved top-gene table, then add |diff| and the TP53 background label."""
    frame = pd.read_csv(csv_path, index_col=0)
    frame["abs_diff"] = frame["diff"].abs()
    frame["tp53"] = tp53_status
    return frame


# Reload the saved top-gene results so the analysis below can run without
# repeating the Monte Carlo computation.
# The amplification frames are named "*_amp_filtered_*", so they are read from
# the filter_amp=True outputs ("top_amp_filtered_results_*") rather than from
# the unfiltered "top_amp_results_*" files.
res_top_lof_tp53_wt = _load_top_results("outputs/top_lof_results_tp53_wt.csv", "wt")
res_top_amp_filtered_tp53_wt = _load_top_results("outputs/top_amp_filtered_results_tp53_wt.csv", "wt")

res_top_lof_tp53_lof = _load_top_results("outputs/top_lof_results_tp53_lof.csv", "lof")
res_top_amp_filtered_tp53_lof = _load_top_results("outputs/top_amp_filtered_results_tp53_lof.csv", "lof")

In [None]:
# Stack the four reloaded top-gene tables (WT first, then LOF) row-wise.
overall_frames = [
    res_top_lof_tp53_wt,
    res_top_amp_filtered_tp53_wt,
    res_top_lof_tp53_lof,
    res_top_amp_filtered_tp53_lof,
]
overall = pd.concat(overall_frames, axis=0)

In [None]:
# Display the 50 rows of `overall` with the largest |diff|.
overall.sort_values("abs_diff", ascending=False).head(50)

In [None]:
# Save `overall`, ordered by decreasing |diff|.
# NOTE(review): this file is written to the working directory, while most other
# results in this notebook go under "outputs/" - confirm the location is intended.
overall.sort_values("abs_diff", ascending=False).to_csv("top_drivers.csv")

In [None]:
def _ranked_gene_list(tp53_status, search_mode, ascending, n=20):
    """Return the first `n` index labels of one (tp53, search_mode) slice of `overall`, sorted by diff."""
    subset = overall.loc[(overall["tp53"] == tp53_status) & (overall["search_mode"] == search_mode)]
    return list(subset.sort_values("diff", ascending=ascending).index[:n])


# "bottom": 20 smallest diff values per slice.
bottom_tp53_wt_lof = _ranked_gene_list("wt", "lof", ascending=True)
bottom_tp53_wt_amp = _ranked_gene_list("wt", "amp", ascending=True)
bottom_tp53_lof_lof = _ranked_gene_list("lof", "lof", ascending=True)
bottom_tp53_lof_amp = _ranked_gene_list("lof", "amp", ascending=True)

# "top": 20 largest diff values per slice.
top_tp53_wt_lof = _ranked_gene_list("wt", "lof", ascending=False)
top_tp53_wt_amp = _ranked_gene_list("wt", "amp", ascending=False)
top_tp53_lof_lof = _ranked_gene_list("lof", "lof", ascending=False)
top_tp53_lof_amp = _ranked_gene_list("lof", "amp", ascending=False)

In [None]:
# Print each entry of top_tp53_lof_amp on its own line, followed by a comma.
for gene in top_tp53_lof_amp:
    print(gene, end=",\n")

In [None]:
# curated genes

curated_genes = [
    "SETBP1",
    "PTPRD",
    "PTPN2",
    "JAK2",
    "BRCA2",
    "PDGFB",
    "SMARCA2",
    "BMPR1A",
    "MDM4",
    "MDM2",
    "BTG2",
    "CDKN2A",
    "CDKN2B",
    "CDKN2C",
    "KRAS",
    "CCND2",
]


def _run_curated(candidate_models, search_mode, **extra):
    """Run a 128-iteration, 16-worker compute_monte_carlo_stats pass over the curated genes."""
    return compute_monte_carlo_stats(
        genes_of_interest=curated_genes,
        dependency_data=crispr_effect,
        cnv_data=cnv_data,
        mutation_data=mutation_data,
        candidate_models=candidate_models,
        search_mode=search_mode,
        n_iterations=128,
        eval_function=genome_proximity_bias_score,
        eval_kwargs={"n_samples": 500, "n_trials": 200, "return_samples": False},
        n_workers=16,
        **extra,
    )


# TP53 LOF models: LOF search, amplification search, amplification search with filter_amp=True

curated_lof_tp53_lof = _run_curated(tp53_lof_models, "lof")
curated_lof_tp53_lof.to_csv("outputs/curated_lof_results_tp53_lof.csv")

curated_amp_tp53_lof = _run_curated(tp53_lof_models, "amp")
curated_amp_tp53_lof.to_csv("outputs/curated_amp_results_tp53_lof.csv")

curated_amp_filtered_tp53_lof = _run_curated(tp53_lof_models, "amp", filter_amp=True)
curated_amp_filtered_tp53_lof.to_csv("outputs/curated_amp_filtered_results_tp53_lof.csv")

# TP53 WT models: the same three runs

curated_lof_tp53_wt = _run_curated(tp53_wt_models, "lof")
curated_lof_tp53_wt.to_csv("outputs/curated_lof_results_tp53_wt.csv")

curated_amp_tp53_wt = _run_curated(tp53_wt_models, "amp")
curated_amp_tp53_wt.to_csv("outputs/curated_amp_results_tp53_wt.csv")

curated_amp_filtered_tp53_wt = _run_curated(tp53_wt_models, "amp", filter_amp=True)
curated_amp_filtered_tp53_wt.to_csv("outputs/curated_amp_filtered_results_tp53_wt.csv")

In [None]:
def _load_curated(csv_path, tp53_status):
    """Read one saved curated-gene table, then add |diff| and the TP53 background label."""
    frame = pd.read_csv(csv_path, index_col=0)
    frame["abs_diff"] = frame["diff"].abs()
    frame["tp53"] = tp53_status
    return frame


# Reload the saved curated-gene results so the summary below can run without
# repeating the Monte Carlo computation.
# The amplification frames are named "*_amp_filtered_*", so they are read from
# the filter_amp=True outputs ("curated_amp_filtered_results_*") rather than
# from the unfiltered "curated_amp_results_*" files.
curated_lof_tp53_wt = _load_curated("outputs/curated_lof_results_tp53_wt.csv", "wt")
curated_amp_filtered_tp53_wt = _load_curated("outputs/curated_amp_filtered_results_tp53_wt.csv", "wt")

curated_lof_tp53_lof = _load_curated("outputs/curated_lof_results_tp53_lof.csv", "lof")
curated_amp_filtered_tp53_lof = _load_curated("outputs/curated_amp_filtered_results_tp53_lof.csv", "lof")

In [None]:
# Stack the four reloaded curated-gene tables (WT first, then LOF) row-wise.
curated_frames = [
    curated_lof_tp53_wt,
    curated_amp_filtered_tp53_wt,
    curated_lof_tp53_lof,
    curated_amp_filtered_tp53_lof,
]
curated = pd.concat(curated_frames, axis=0)

# Show the key columns, grouped by gene via the sorted index.
# NOTE(review): this cell selects "n_samples" whereas the other summary cells in
# this notebook select "n_sample_bootstrap" - confirm which column the results carry.
curated_summary_columns = ["tp53", "search_mode", "diff", "wt_mean", "test_mean", "n_samples"]
curated.sort_index()[curated_summary_columns]

In [None]:
all_genes = oncogenes + tsgs

# For every gene, count how many models fall into each gene-status group
# (the first three groups returned by split_models) within each TP53
# background. The nine counts are stored in this order:
#   TP53 LOF: gene lof, wt, amp | TP53 WT: gene lof, wt, amp | TP53 amp: gene lof, wt, amp
gene_dict = {}
for gene in all_genes:
    counts = []
    for background_models in (tp53_lof_models, tp53_wt_models, tp53_amp_models):
        gene_lof, gene_wt, gene_amp, _ = split_models(
            gene, candidate_models=background_models, cnv_data=cnv_data, mutation_data=mutation_data
        )
        counts.extend([len(gene_lof), len(gene_wt), len(gene_amp)])
    gene_dict[gene] = counts

In [None]:
# Column labels follow the order in which the counts were appended to each
# gene_dict entry: TP53 background outermost, gene status innermost.
status_labels = ("lof", "wt", "amp")
count_columns = [
    f"tp53_{tp53_status}_gene_{gene_status}" for tp53_status in status_labels for gene_status in status_labels
]

# One row per gene, one column per (TP53 background, gene status) pair.
cell_line_counts = pd.DataFrame.from_dict(gene_dict, orient="index", columns=count_columns)

In [None]:
# Save the per-gene model counts to the working directory.
cell_line_counts.to_csv("cell_line_counts.csv")

In [None]:
def _ranked_frame(tp53_status, search_mode, ascending):
    """Return one (tp53, search_mode) slice of `overall`, sorted by diff."""
    in_slice = (overall["tp53"] == tp53_status) & (overall["search_mode"] == search_mode)
    return overall.loc[in_slice].sort_values("diff", ascending=ascending)


# Full sorted tables per slice; these names are rebound here from the
# 20-item gene lists built earlier in the notebook to whole DataFrames.
# "bottom": smallest diff first.
bottom_tp53_wt_lof = _ranked_frame("wt", "lof", ascending=True)
bottom_tp53_wt_amp = _ranked_frame("wt", "amp", ascending=True)
bottom_tp53_lof_lof = _ranked_frame("lof", "lof", ascending=True)
bottom_tp53_lof_amp = _ranked_frame("lof", "amp", ascending=True)

# "top": largest diff first.
top_tp53_wt_lof = _ranked_frame("wt", "lof", ascending=False)
top_tp53_wt_amp = _ranked_frame("wt", "amp", ascending=False)
top_tp53_lof_lof = _ranked_frame("lof", "lof", ascending=False)
top_tp53_lof_amp = _ranked_frame("lof", "amp", ascending=False)

In [None]:
# select genes

select_genes = [
    "SETBP1",
    "PTPRD",
    "PTPN2",
    "JAK2",
    "BRCA2",
    "PDGFB",
    "SMARCA2",
    "BMPR1A",
    "MDM4",
    "MDM2",
    "BTG2",
    "CDKN2A",
    "CDKN2B",
    "CDKN2C",
    "KRAS",
    "CCND2",
]


def _run_select(candidate_models, search_mode, **extra):
    """Run a 128-iteration, 16-worker, fixed-sampling compute_monte_carlo_stats pass over select_genes."""
    return compute_monte_carlo_stats(
        genes_of_interest=select_genes,
        dependency_data=crispr_effect,
        cnv_data=cnv_data,
        mutation_data=mutation_data,
        candidate_models=candidate_models,
        search_mode=search_mode,
        n_iterations=128,
        eval_function=genome_proximity_bias_score,
        eval_kwargs={"n_samples": 500, "n_trials": 200, "return_samples": False},
        n_workers=16,
        fixed_cell_line_sampling=True,
        **extra,
    )


def _load_select(csv_path, tp53_status):
    """Read one saved select-gene table, then add |diff| and the TP53 background label."""
    frame = pd.read_csv(csv_path, index_col=0)
    frame["abs_diff"] = frame["diff"].abs()
    frame["tp53"] = tp53_status
    return frame


# TP53 LOF models: LOF search, amplification search, amplification search with filter_amp=True

select_lof_tp53_lof = _run_select(tp53_lof_models, "lof")
select_lof_tp53_lof.to_csv("outputs/select_lof_results_tp53_lof.csv")

select_amp_tp53_lof = _run_select(tp53_lof_models, "amp")
select_amp_tp53_lof.to_csv("outputs/select_amp_results_tp53_lof.csv")

select_amp_filtered_tp53_lof = _run_select(tp53_lof_models, "amp", filter_amp=True)
select_amp_filtered_tp53_lof.to_csv("outputs/select_amp_filtered_results_tp53_lof.csv")

# TP53 WT models: the same three runs

select_lof_tp53_wt = _run_select(tp53_wt_models, "lof")
select_lof_tp53_wt.to_csv("outputs/select_lof_results_tp53_wt.csv")

select_amp_tp53_wt = _run_select(tp53_wt_models, "amp")
select_amp_tp53_wt.to_csv("outputs/select_amp_results_tp53_wt.csv")

select_amp_filtered_tp53_wt = _run_select(tp53_wt_models, "amp", filter_amp=True)
select_amp_filtered_tp53_wt.to_csv("outputs/select_amp_filtered_results_tp53_wt.csv")

# Reload from disk. The amplification frames are named "*_amp_filtered_*", so
# they are read from the filter_amp=True outputs ("select_amp_filtered_results_*")
# rather than from the unfiltered "select_amp_results_*" files.
select_lof_tp53_wt = _load_select("outputs/select_lof_results_tp53_wt.csv", "wt")
select_amp_filtered_tp53_wt = _load_select("outputs/select_amp_filtered_results_tp53_wt.csv", "wt")

select_lof_tp53_lof = _load_select("outputs/select_lof_results_tp53_lof.csv", "lof")
select_amp_filtered_tp53_lof = _load_select("outputs/select_amp_filtered_results_tp53_lof.csv", "lof")

# Stack the four tables and show the key columns, grouped by gene via the sorted index.
select = pd.concat(
    [select_lof_tp53_wt, select_amp_filtered_tp53_wt, select_lof_tp53_lof, select_amp_filtered_tp53_lof], axis=0
)
select.sort_index()[["tp53", "search_mode", "diff", "wt_mean", "test_mean", "n_sample_bootstrap"]]

In [None]:
# Rebuild the stacked select-gene table from the four in-memory frames.
select_frames = (
    select_lof_tp53_wt,
    select_amp_filtered_tp53_wt,
    select_lof_tp53_lof,
    select_amp_filtered_tp53_lof,
)
select = pd.concat(select_frames, axis=0)

select_sorted = select.sort_index()
# This column subset is not the cell's last expression, so it is evaluated but not displayed.
select_sorted[["tp53", "search_mode", "diff", "wt_mean", "test_mean", "n_sample_bootstrap"]]
# Save the index-sorted table.
select_sorted.to_csv("outputs/select_genes.csv")

In [None]:
# Load the saved select-gene table, using the first CSV column as the index.
select = pd.read_csv("outputs/select_genes.csv", index_col=0)

In [None]:
# Rebuild the stacked select-gene table from the four in-memory frames.
select = pd.concat(
    [select_lof_tp53_wt, select_amp_filtered_tp53_wt, select_lof_tp53_lof, select_amp_filtered_tp53_lof], axis=0
)

# "wt_stats" / "test_stats" are parsed from their string form into Python
# objects. ast.literal_eval only accepts Python literals, so unlike eval() it
# cannot execute arbitrary code embedded in a CSV cell.
select["wt_stats"] = select["wt_stats"].apply(ast.literal_eval)
select["test_stats"] = select["test_stats"].apply(ast.literal_eval)

# Expand the two parsed columns together so each element gets its own row.
select_2 = select.explode(column=["wt_stats", "test_stats"])

In [None]:
# Name the index "gene" and move it into a regular column.
select_2 = select_2.rename_axis("gene").reset_index()

In [None]:
# Cast the exploded statistic columns to float.
select_2 = select_2.astype({"wt_stats": float, "test_stats": float})

In [None]:
import seaborn as sns

# Wide figure with a white-grid theme.
sns.set(rc={"figure.figsize": (40, 10)})
sns.set_style("whitegrid")

# Options shared by both violin plots: one violin per gene, split by TP53 background.
violin_kwargs = dict(data=select_2, x="gene", hue="tp53", split=True, inner="quartile")

# NOTE(review): both calls run in the same cell without creating a new figure,
# so the two plots appear to be drawn on the same axes - confirm this overlay is intended.
sns.violinplot(y="test_stats", **violin_kwargs)
sns.violinplot(y="wt_stats", **violin_kwargs)

In [None]:
# weird genes

weird_genes = ["HLA-C"]


def _run_weird(candidate_models, search_mode, **extra):
    """Run a 128-iteration, single-worker, fixed-sampling compute_monte_carlo_stats pass over weird_genes."""
    return compute_monte_carlo_stats(
        genes_of_interest=weird_genes,
        dependency_data=crispr_effect,
        cnv_data=cnv_data,
        mutation_data=mutation_data,
        candidate_models=candidate_models,
        search_mode=search_mode,
        n_iterations=128,
        eval_function=genome_proximity_bias_score,
        eval_kwargs={"n_samples": 500, "n_trials": 200, "return_samples": False},
        n_workers=1,
        fixed_cell_line_sampling=True,
        **extra,
    )


def _load_weird(csv_path, tp53_status):
    """Read one saved weird-gene table, then add |diff| and the TP53 background label."""
    frame = pd.read_csv(csv_path, index_col=0)
    frame["abs_diff"] = frame["diff"].abs()
    frame["tp53"] = tp53_status
    return frame


# TP53 LOF models: LOF search and amplification search with filter_amp=True

weird_lof_tp53_lof = _run_weird(tp53_lof_models, "lof")
weird_lof_tp53_lof.to_csv("outputs/weird_lof_results_tp53_lof.csv")

weird_amp_filtered_tp53_lof = _run_weird(tp53_lof_models, "amp", filter_amp=True)
weird_amp_filtered_tp53_lof.to_csv("outputs/weird_amp_filtered_results_tp53_lof.csv")

# TP53 WT models: the same two runs

weird_lof_tp53_wt = _run_weird(tp53_wt_models, "lof")
weird_lof_tp53_wt.to_csv("outputs/weird_lof_results_tp53_wt.csv")

weird_amp_filtered_tp53_wt = _run_weird(tp53_wt_models, "amp", filter_amp=True)
weird_amp_filtered_tp53_wt.to_csv("outputs/weird_amp_filtered_results_tp53_wt.csv")

# Reload from disk. Only the filter_amp=True amplification results are written
# above, so the "*_amp_filtered_*" files are the ones that must be read back;
# no "weird_amp_results_*" file is produced by this cell.
weird_lof_tp53_wt = _load_weird("outputs/weird_lof_results_tp53_wt.csv", "wt")
weird_amp_filtered_tp53_wt = _load_weird("outputs/weird_amp_filtered_results_tp53_wt.csv", "wt")

weird_lof_tp53_lof = _load_weird("outputs/weird_lof_results_tp53_lof.csv", "lof")
weird_amp_filtered_tp53_lof = _load_weird("outputs/weird_amp_filtered_results_tp53_lof.csv", "lof")

# Stack the four tables and show the key columns, grouped by gene via the sorted index.
weird = pd.concat(
    [weird_lof_tp53_wt, weird_amp_filtered_tp53_wt, weird_lof_tp53_lof, weird_amp_filtered_tp53_lof], axis=0
)
weird.sort_index()[["tp53", "search_mode", "diff", "wt_mean", "test_mean", "n_sample_bootstrap"]]