Pipeline to create personalized Boolean networks based on gene expression and copy number variation (CNV) data

In [1]:
import pandas as pd
import os
import shutil

from functions.generate_utils.identification_patients.get_patients_sens_res import get_patients
from functions.generate_models import pre_process_re, generate_models_re
from functions.analysis_utils.MaBoSS_simulation.maboss_phenotype_patient import compute_phenotype_table, collect_group_data

from functions.analysis_utils.stats.stats_proba import compute_mannwhitneyu_test_means
from functions.analysis_utils.results_MaBoSS_visualization.boxplot_phenotype import create_boxplot

In [None]:
# ---------------------------------------------------------------------------
# User-tunable settings
# ---------------------------------------------------------------------------
number_patients = 20  # size of each group
drug_interest = 'Refametinib'
tissue_interest = 'PAN_CANCER'
tissue_remove = 'Haematopoietic and Lymphoid'

# ---------------------------------------------------------------------------
# Model parameters
# ---------------------------------------------------------------------------
node_to_remove = [
    'FUSED_EVENT',
    'AR_ERG',
]
type_models = 'genes_models'

# ---------------------------------------------------------------------------
# Generic Boolean network
# ---------------------------------------------------------------------------
# Input nodes
inputs_list = [
    'EGF',
    'FGF',
    'TGFB',
    'Androgen',
    'Hypoxia',
    'Nutrients',
    'Carcinogen',
    'Acidosis',
    'TNF',
    'SPOP',
]

# Phenotype nodes of interest
phenotype_interest = [
    "Proliferation",
    "Invasion",
    "DNA_Repair",
    "Migration",
    "Apoptosis",
]



In [None]:
# Load the input tables (TCGA, Cell Model Passports) from the local data folder.

# --- Model annotations, mutations and drug response ---
annotations_models = pd.read_csv('data/model_list_20250407.csv')
mutations_data = pd.read_csv('data/mutations_all_20250318.csv')
drug_data = pd.read_csv('data/drug_sensitivity.csv')

# --- Montagud network interactions ---
# The column names are on the second row of the file (header=1); only the
# three columns selected below are kept.
montagud_data = pd.read_csv('data/Montagud_inter_nodes_data.csv', header=1)
montagud_data = montagud_data.loc[:, ['Target node', 'Interaction type', 'Source']]

# --- Omics tables: RNA-seq and CNV ---
rna_seq_data = pd.read_csv('data/rnaseq_merged/rnaseq_merged_20250117.csv')
cnv_data = pd.read_csv('data/cellmodel_data/cnv_summary_20250207.csv')

# --- DepMap annotation and somatic mutations ---
models_depmap_annotation = pd.read_csv('data/cellmodel_data/model_list_20250423.csv')
mutations_data_depseq = pd.read_csv('data/depmap_data/OmicsSomaticMutations.csv')


In [None]:
# Create the folder structure for the analysis of `drug_interest` and copy the
# generic Boolean network files into it.
#
# os.makedirs(..., exist_ok=True) is used instead of an os.path.exists() check
# followed by os.makedirs(): it is a single call, is safe to re-run, and avoids
# the race between checking for the folder and creating it.

patients_categ = ['resistant', 'sensitive']

# Folder that receives the generic (non-personalized) model files.
dest_dir = f"analysis/{drug_interest}/models/generic"
os.makedirs(dest_dir, exist_ok=True)

# One results folder and one models folder per patient category.
for patient_categ in patients_categ:
    os.makedirs(f"analysis/{drug_interest}/results/{patient_categ}", exist_ok=True)
    os.makedirs(f"analysis/{drug_interest}/models/{patient_categ}", exist_ok=True)

# Copy the generic model definition (.bnd) and its configuration (.cfg).
shutil.copy('data/montagud_models/Montagud2022_Prostate_Cancer.bnd', dest_dir)
shutil.copy('data/montagud_models/Montagud2022_Prostate_Cancer.cfg', dest_dir)

# Paths handed to the model-generation step (note the trailing slash on the
# generic folder, kept as in the original).
folder_generic_models = f"analysis/{drug_interest}/models/generic/"
folder_models = f"analysis/{drug_interest}/models"

In [None]:
# Pre-process the data and save the filtered tables in the data_filtered folder.
#
# When the configured tissue is 'PAN_CANCER', all tissues are wanted, which is
# requested from pre_process_re by passing tissue_interest=None. Any other
# configured value is forwarded unchanged, so the user's `tissue_interest`
# setting is honoured instead of being silently overridden by a hard-coded None.
tissue_filter = None if tissue_interest == 'PAN_CANCER' else tissue_interest

(
    top_resistant_ids,
    top_sensitive_ids,
    montagud_nodes,
    rna_seq_data_filtered,
    cnv_data_filtered,
    table_rna_seq_patients,
) = pre_process_re(
    montagud_data,
    rna_seq_data,
    cnv_data,
    number_patients,
    drug_data,
    annotations_models,
    drug_interest,
    tissue_interest=tissue_filter,
    tissue_remove=tissue_remove,
    node_to_remove=node_to_remove,
)

# Persist the filtered tables next to the other outputs of this drug's analysis.
data_filtered_dir = f"analysis/{drug_interest}/data_filtered"
os.makedirs(data_filtered_dir, exist_ok=True)
rna_seq_data_filtered.to_csv(f"{data_filtered_dir}/rna_seq_data_filtered.csv")
cnv_data_filtered.to_csv(f"{data_filtered_dir}/cnv_data_filtered.csv")
table_rna_seq_patients.to_csv(f"{data_filtered_dir}/table_rna_seq_patients.csv")




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  annotations_models_filtered.rename(


top_resistant_ids: ['SIDM01120', 'SIDM00719', 'SIDM00711', 'SIDM00292', 'SIDM00631', 'SIDM00506', 'SIDM00892', 'SIDM00255', 'SIDM00183', 'SIDM00881', 'SIDM00767', 'SIDM00524', 'SIDM00770', 'SIDM00920', 'SIDM00716', 'SIDM01056', 'SIDM00870', 'SIDM00444', 'SIDM00971', 'SIDM00508']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rna_seq_data.rename(columns={"gene_symbol": "gene_name"}, inplace=True)


In [None]:
# Create the personalized models (gene expression and CNV data).
# All arguments are passed positionally, so their order must match the
# signature of generate_models_re (imported from functions.generate_models).
# NOTE(review): `rna_seq_data` below is the table as loaded from CSV, whereas the
# `rna_seq_data_filtered` returned by pre_process_re is only written to disk in
# the visible cells — confirm that passing the unfiltered table is intended.

generate_models_re(
    folder_generic_models,
    folder_models,
    top_resistant_ids,
    top_sensitive_ids,
    drug_interest,
    phenotype_interest,
    rna_seq_data,
    montagud_nodes,
    table_rna_seq_patients,
    cnv_data_filtered,
)