In [1]:
import maboss
import ginsim
import pandas as pd 
import numpy as np


from identification_patients.get_patients_sens_res import get_patients

from create_generic_models.create_generic_patients_cfgs import create_generic_patients_cfgs_bnds
from pre_process_data.pre_process_genes import process_genes

from create_person_models.tailor_cfgs_patients_gene import personalized_patients_genes_cfgs
from create_person_models.tailor_bnd_mutations import personalized_patients_mutations_bnds

from MaBoSS_simulation.MaBoSS_phenotype_distribution import compute_phenotypes_distribution, compute_mean_patients
from pre_process_data.identify_mutations_patients import identif_mutations_kras_egfr

# # from pre_process_profiles_table_data_lung import create_genes_patients

from stats.stats_proba import compute_mannwhitneyu_test_means
from results_MaBoSS_visualization.boxplot_phenotype import create_boxplot
from results_MaBoSS_visualization.create_phenotypes_patients_table import vizualise_table_phenotype_condition
from results_MaBoSS_visualization.patients_ids_phenotype_table import create_table_patients_phenotypes
from gene_enrichment.genes_signature import compute_genes_mean_signature


In [2]:
# Show the current working directory: the relative 'data/', 'models/' and
# 'results' paths used in the cells below are resolved against it.
import os
print(os.getcwd())


/Users/romane/repos/icr/Attractor-Resistance


Step 0: Import data 

In [3]:
# Load the raw input tables (paths are relative to the working directory).
annotations_models = pd.read_csv('data/model_list_20250407.csv')
mutations_data = pd.read_csv('data/mutations_all_20250318.csv')
drug_data = pd.read_csv('data/drug_sensitivity.csv')

# Interaction table of the model: header=1 takes the column names from the
# second line of the file; only the three columns used downstream are kept.
montagud_columns = ['Target node', 'Interaction type', 'Source']
montagud_raw = pd.read_csv('data/Montagud_inter_nodes_data.csv', header=1)
montagud_data = montagud_raw.loc[:, montagud_columns]

# Unique node names appearing as either target or source of an interaction.
node_names = montagud_data['Target node'].tolist()
node_names.extend(montagud_data['Source'].tolist())
montagud_nodes = list(set(node_names))

rna_seq_data = pd.read_csv('data/rnaseq_merged/rnaseq_merged_20250117.csv')
#genes_data_filtered = pd.read_csv('filtered_data/rna_seq_lung_clean.csv')


In [18]:
# Root folder for all personalized Boolean models, and root folder for results.
folder_pers_models = 'models/personalized_boolean_large_groups'
folder_result = 'results'

# 'generic_models' directories, one per patient group (passed as the source
# directory to the gene-expression personalization step).
output_dir_resistant = folder_pers_models + '/resistant_patient/generic_models'
output_dir_sensitive = folder_pers_models + '/sensitive_patient/generic_models'

# 'personalized_boolean_modified' directories, one per patient group (passed
# as the destination directory to the gene-expression personalization step).
output_dir_sec_resistant = folder_pers_models + '/resistant_patient/personalized_boolean_modified'
output_dir_sec_sensitive = folder_pers_models + '/sensitive_patient/personalized_boolean_modified'

# 'models_gene_expression' sub-directories, passed to the mutation
# personalization step.
bnd_dir_res = output_dir_sec_resistant + '/models_gene_expression'
bnd_dir_sens = output_dir_sec_sensitive + '/models_gene_expression'



In [5]:
# Tissue labels: by their names, one tissue to exclude and one tissue of
# interest. NOTE(review): neither is passed to any call visible in this
# notebook - confirm where they are consumed.
tissue_remove, tissue_interest = 'Haematopoietic and Lymphoid', 'Lung'

Step 1: Check which drug is the best to keep (the one with the most resistant and sensitive patients)

In [26]:
# results = {}
# drug_interests = drug_data['DRUG_NAME'].unique().tolist()
# #print(drug_interests)
# for drug_interest in drug_interests:
#     results[drug_interest] = get_patients(100, drug_data, annotations_models, tissue_interest, drug_interest)

# drug_interest: {
#     "name": drug_interest,
#     "<-1.5": float((df['Z_SCORE'] < -1.5).sum()),
#     ">1.5": float((df['Z_SCORE'] > 1.5).sum()),
#     "mean": float(df['Z_SCORE'].mean()),
#     "std": float(df['Z_SCORE'].std()),
#     "abs_zscore": float(df['Z_SCORE'].abs().mean()),
# }

# list_results = results.values()
# pd_results = pd.DataFrame(list_results)
# pd_results.to_csv(f'{folder}/drug_analysis.csv')

Step 2: Select cancer and drug of interest (tissue_interest, drug_interest)
Get the 100 most sensitive patients
Pre-process data


In [6]:
# Drug under study (alternative previously tried: 'Avagacestat').
drug_interest = 'AZD8931'
tissue_remove = 'Haematopoietic and Lymphoid'

# First argument of get_patients; per the Step 2 description this is the
# number of top patients to keep - TODO confirm against get_patients.
n_top_patients = 100

# Select the resistant / sensitive patient IDs for the drug, together with the
# drug-tissue table returned alongside them.
top_resistant_ids, top_sensitive_ids, drug_tissue_data = get_patients(
    n_top_patients,
    drug_data,
    annotations_models,
    drug_interest,
)
patients_ids = top_sensitive_ids + top_resistant_ids

# Pre-process the genes data for the selected patients.
# TODO: check if KRAS is also in the montagud_data
rna_seq_data_filtered = process_genes(patients_ids, montagud_data, rna_seq_data)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  annotations_models_filtered.rename(columns={"model_id": "SANGER_MODEL_ID"}, inplace=True)


data for sensitive
    SANGER_MODEL_ID   Z_SCORE
10        SIDM00049 -3.706856
512       SIDM00690 -3.297692
233       SIDM00350 -3.283305
758       SIDM01027 -3.257150
323       SIDM00465 -3.249644
data for resistant
    SANGER_MODEL_ID   Z_SCORE
531       SIDM00719  2.803536
179       SIDM00291  2.551463
898       SIDM01210  2.524426
373       SIDM00524  2.519573
180       SIDM00292  2.432931


In [8]:
# Preview the first rows of the pre-processed expression table. Ending the cell
# with the bare expression lets the notebook render the DataFrame with its rich
# (HTML) display instead of the wrapped plain-text output that print() produces.
rna_seq_data_filtered.head()

gene_expression_level                               High Gene Expression  \
model_id                                                                   
SIDM00043              BAD, BMP2, EGFR, FOS, FRS2, JUN, LDHA, NF1, RU...   
SIDM00046              CFLAR, EGF, EGFR, EP300, ETV1, FGFR3, FOXA1, I...   
SIDM00048              BAD, FOS, FOXA1, FRS2, IDH1, JUN, LDHA, NCOR2,...   
SIDM00049              AXIN1, BRCA2, EGFR, FGFR3, FOS, IDH1, MYC, NCO...   
SIDM00087                                               EEF2K, MYC, TERT   

gene_expression_level                                Low Gene Expression  
model_id                                                                  
SIDM00043                                                      BAX, EZH2  
SIDM00046                          ATM, BCL2, CDH2, MXI1, RB1, TERT, VHL  
SIDM00048                                ATR, BAX, BCL2, EEF2, SMO, TERT  
SIDM00049                       ATM, BCL2, CDH2, ETS1, FOXA1, LDHA, MDM2  
SIDM00087        

Step 3: Create generic boolean networks with the sensitive and resistant ID names


In [7]:
# Paths of the generic lung-adenocarcinoma model files (.cfg and .bnd).
# Note: despite the 'folder_' prefix these are file paths, not directories.
folder_generic_models_cfg = 'models/generic_models/Adeno_lung_Cancer.cfg'
folder_generic_models_bnd = 'models/generic_models/Adeno_lung_Cancer.bnd'

# Create the per-patient .cfg / .bnd files for the resistant and sensitive groups.
create_generic_patients_cfgs_bnds(
    folder_generic_models_cfg,
    folder_generic_models_bnd,
    folder_pers_models,
    top_resistant_ids,
    top_sensitive_ids,
    drug_interest,
)


All .cfg and .bnd files created for sensitive and resistant patients.


Step 4: Personalize the cfg files with genes/ proteins


In [15]:
# Resistant group: personalize from the generic models directory into the
# personalized directory. The full patient list is passed in both calls.
personalized_patients_genes_cfgs(
    rna_seq_data,
    montagud_data,
    output_dir_resistant,
    output_dir_sec_resistant,
    patients_ids,
    rna_seq_data_filtered,
    drug_interest,
)
# Sensitive group: same call, only the source / destination directories differ.
personalized_patients_genes_cfgs(
    rna_seq_data,
    montagud_data,
    output_dir_sensitive,
    output_dir_sec_sensitive,
    patients_ids,
    rna_seq_data_filtered,
    drug_interest,
)

SIDM00948_AZD8931.cfg
SIDM00446_AZD8931.bnd
Modified and saved: models/personalized_boolean_large_groups/resistant_patient/personalized_boolean_modified/models_gene_expression/SIDM00446_AZD8931.bnd
SIDM01057_AZD8931.cfg
SIDM00524_AZD8931.bnd
Modified and saved: models/personalized_boolean_large_groups/resistant_patient/personalized_boolean_modified/models_gene_expression/SIDM00524_AZD8931.bnd
SIDM00422_AZD8931.cfg
SIDM01079_AZD8931.cfg
Modified and saved: models/personalized_boolean_large_groups/resistant_patient/personalized_boolean_modified/models_gene_expression/SIDM01079_AZD8931.cfg
SIDM00495_AZD8931.bnd
SIDM00621_AZD8931.bnd
Modified and saved: models/personalized_boolean_large_groups/resistant_patient/personalized_boolean_modified/models_gene_expression/SIDM00621_AZD8931.bnd
SIDM01121_AZD8931.bnd
Modified and saved: models/personalized_boolean_large_groups/resistant_patient/personalized_boolean_modified/models_gene_expression/SIDM01121_AZD8931.bnd
SIDM01177_AZD8931.cfg
Modified a

TO DO: change the path for the results 

Step 5: Identify which patient IDs carry a KRAS or EGFR mutation and
personalize the bnd files with the mutations common in lung cancer (KRAS/EGFR):

In [16]:
# Apply the mutation-based personalization to the gene-expression models of the
# resistant group, then to those of the sensitive group.
personalized_patients_mutations_bnds(
    mutations_data,
    patients_ids,
    bnd_dir_res,
    drug_interest,
)
personalized_patients_mutations_bnds(
    mutations_data,
    patients_ids,
    bnd_dir_sens,
    drug_interest,
)

SIDM00723
kras
<re.Match object; span=(13717, 13841), match='Node KRAS {\n  logic = (EGFR | TGFBR | FRS2 | FGF>

KRAS node before replacement: Node KRAS {
  logic = (EGFR | TGFBR | FRS2 | FGFR3);
  rate_up = @logic ? $u_KRAS : 0;
  rate_down = @logic ? 0 : $d_KRAS;
}
SIDM00723: KRAS mutation — node modified
Modified content for SIDM00723_AZD8931.bnd:
Node Acidosis {
  logic = (Acidosis);
  rate_up = @logic ? $u_Acidosis : 0;
  rate_down = @logic ? 0 : $d_Acidosis;
}

Node AKT {
  logic = ((HSPs | (PDK1 & PIP3) | PIP3 | (SHH & PIP3)) & !PTCH1);
  rate_up = @logic ? $u_AKT : 0;
  rate_down = @logic ? 0 : $d_AKT;
}

Node AMPK {
  logic = (ATR | HIF1 | AMP_ATP | ATM) & !FGFR3;
  rate_up = @logic ? $u_AMPK : 0;
  rate_down = @logic ? 0 : $d_AMPK;
}

Node AMP_ATP {
  logic = (!Nutrients);
  rate_up = @logic ? $u_AMP_ATP : 0;
  rate_down = @logic ? 0 : $d_AMP_ATP;
}

Node Androgen {
  logic = (Androgen);
  rate_up = @logic ? $u_Androgen : 0;
  rate_down = @logic ? 0 : $d_Androgen;
}

Node APA

Step 6: compute the phenotype distribution 

In [23]:
# Directories holding the personalized models of each group.
dic_patient_resistant = f'{folder_pers_models}/resistant_patient/personalized_boolean_modified/models_gene_expression'
dic_patient_sensitive = f'{folder_pers_models}/sensitive_patient/personalized_boolean_modified/models_gene_expression'

# Model inputs; the result tables are indexed '<input>_ON', one row per input.
inputs_list = [
    'EGF', 'FGF', 'TGFb', 'Nutrients', 'Hypoxia',
    'Acidosis', 'Androgen', 'TNFalpha', 'Carcinogen',
]

# Phenotype distributions per patient, for each group.
patient_res_data_dict = compute_phenotypes_distribution(
    folder_result, dic_patient_resistant, inputs_list, 'resistant', drug_interest
)
patient_sens_data_dict = compute_phenotypes_distribution(
    folder_result, dic_patient_sensitive, inputs_list, 'sensitive', drug_interest
)

# Per-group mean, standard deviation and per-patient values used for the stats.
patients_res_df_mean, patients_res_df_std, stats_results_data_res_df = compute_mean_patients(patient_res_data_dict)
patients_sens_df_mean, patients_sens_df_std, stats_results_data_sens_df = compute_mean_patients(patient_sens_data_dict)

# Save every table as CSV (index kept) under the group's result directory.
resistant_out_dir = f'{folder_result}/resistant_results/only_gene_expression/single_input_on'
sensitive_out_dir = f'{folder_result}/sensitive_results/only_gene_expression/single_input_on'
csv_exports = [
    (patients_res_df_mean, f'{resistant_out_dir}/patients_resistant_df_mean_{drug_interest}.csv'),
    (patients_res_df_std, f'{resistant_out_dir}/patients_resistant_df_std_{drug_interest}.csv'),
    (stats_results_data_res_df, f'{resistant_out_dir}/patients_resistant_values_stats_{drug_interest}.csv'),
    (stats_results_data_sens_df, f'{sensitive_out_dir}/patients_sensitive_values_stats_{drug_interest}.csv'),
    (patients_sens_df_mean, f'{sensitive_out_dir}/patients_sensitive_df_mean_{drug_interest}.csv'),
    (patients_sens_df_std, f'{sensitive_out_dir}/patients_sensitive_df_std_{drug_interest}.csv'),
]
for table, csv_path in csv_exports:
    table.to_csv(csv_path, index=True)




--- Results for patient: SIDM01079_AZD8931 ---
                  <nil>  Apoptosis  Proliferation  Metastasis
EGF_ON         0.905736   0.000000       0.000833    0.075630
FGF_ON         0.957386   0.033914       0.006000    0.000800
TGFb_ON        0.000200   0.999800       0.000000    0.000000
Nutrients_ON   1.000000   0.000000       0.000000    0.000000
Hypoxia_ON     0.996894   0.000000       0.000000    0.003106
Acidosis_ON    1.000000   0.000000       0.000000    0.000000
Androgen_ON    0.905536   0.000000       0.000833    0.075430
TNFalpha_ON    1.000000   0.000000       0.000000    0.000000
Carcinogen_ON  0.974407   0.018086       0.005307    0.000600

--- Results for patient: SIDM01177_AZD8931 ---
                  <nil>  Apoptosis  Proliferation  Metastasis
EGF_ON         0.921200   0.000000       0.064000      0.0000
FGF_ON         0.993800   0.000000       0.006200      0.0000
TGFb_ON        0.719000   0.113400       0.000200      0.1674
Nutrients_ON   1.000000   0.000000  

KeyboardInterrupt: 

Step 7: Compute stats test between two mean datasets

In [None]:
# Reload the per-patient value tables saved by the phenotype-distribution step.
res_stats_csv = f'{folder_result}/resistant_results/only_gene_expression/single_input_on/patients_resistant_values_stats_{drug_interest}.csv'
sens_stats_csv = f'{folder_result}/sensitive_results/only_gene_expression/single_input_on/patients_sensitive_values_stats_{drug_interest}.csv'
patient_res_stats_values = pd.read_csv(res_stats_csv)
patient_sens_stats_values = pd.read_csv(sens_stats_csv)

# Compare the resistant and sensitive groups.
compute_mannwhitneyu_test_means(
    folder_result,
    patient_res_stats_values,
    patient_sens_stats_values,
    drug_interest,
)

Step 8: Visualise the boxplot of the phenotype distribution output

In [None]:
# Per-patient value tables of each group, plus the table of one-sided
# ('greater') Mann-Whitney p-values read from the combined results folder.
res_values_csv = f'{folder_result}/resistant_results/only_gene_expression/single_input_on/patients_resistant_values_stats_{drug_interest}.csv'
sens_values_csv = f'{folder_result}/sensitive_results/only_gene_expression/single_input_on/patients_sensitive_values_stats_{drug_interest}.csv'
greater_side_csv = f'{folder_result}/sensitive_resistant_results/p_values_df_mannwhitneyu_greater_sign_{drug_interest}.csv'

patient_res_values = pd.read_csv(res_values_csv)
patient_sens_values = pd.read_csv(sens_values_csv)
data_greater_side = pd.read_csv(greater_side_csv)

create_boxplot(
    folder_result,
    patient_res_values,
    patient_sens_values,
    data_greater_side,
)

Step 9: Create the table of patients with their condition-phenotype values

In [None]:
# Directories of per-patient phenotype distributions; only the group part of
# the path differs between the two.
phenotype_subdir = 'only_gene_expression/single_input_on/phenotype_distribution_patients'
dir_res_data = f'{folder_result}/resistant_results/{phenotype_subdir}'
dir_sens_data = f'{folder_result}/sensitive_results/{phenotype_subdir}'

patients_phenot_table = create_table_patients_phenotypes(
    folder_result,
    dir_res_data,
    dir_sens_data,
)

Step 10: Create heatmap figure 

In [None]:
# Reload the per-group mean tables saved by the phenotype-distribution step.
resistant_mean_csv = f'{folder_result}/resistant_results/only_gene_expression/single_input_on/patients_resistant_df_mean_{drug_interest}.csv'
sensitive_mean_csv = f'{folder_result}/sensitive_results/only_gene_expression/single_input_on/patients_sensitive_df_mean_{drug_interest}.csv'
patient_resistant_mean = pd.read_csv(resistant_mean_csv)
patient_sensitive_mean = pd.read_csv(sensitive_mean_csv)

vizualise_table_phenotype_condition(
    folder_result,
    patient_resistant_mean,
    patient_sensitive_mean,
)

Step 11: Identify genes differentially expressed in the patients with a high phenotype value (Metastasis under TGFb, Proliferation under EGF)

In [None]:
# Reload the patients x condition-phenotype table from the combined results folder.
phenot_table_csv = f'{folder_result}/sensitive_resistant_results/patients_phenot_table.csv'
patients_phenot_table = pd.read_csv(phenot_table_csv)

# Gene signature for the Metastasis phenotype under the TGFb condition.
genes_stats_results_metast_TGFb = compute_genes_mean_signature(
    folder_result,
    montagud_nodes,
    'Metastasis',
    'TGFb',
    patients_phenot_table,
    top_resistant_ids,
    top_sensitive_ids,
)
# Gene signature for the Proliferation phenotype under the EGF condition.
genes_stats_results_prolif_egf = compute_genes_mean_signature(
    folder_result,
    montagud_nodes,
    'Proliferation',
    'EGF',
    patients_phenot_table,
    top_resistant_ids,
    top_sensitive_ids,
)

Step 12: Check that there is no correlation between phenotype distribution and cancer type


In [None]:
# The row label (CSV index column, read back as 'Unnamed: 0') is split on '_'
# and its first part is used as the model ID.
patients_phenot_table['SANGER_MODEL_ID'] = patients_phenot_table['Unnamed: 0'].str.split('_').str[0]

# Label every row by the group its model ID belongs to ('' when in neither).
conditions = [
    patients_phenot_table['SANGER_MODEL_ID'].isin(top_resistant_ids),
    patients_phenot_table['SANGER_MODEL_ID'].isin(top_sensitive_ids),
]
choices = ['Resistant', 'Sensitive']
patients_phenot_table.loc[:, 'Drug status'] = np.select(conditions, choices, default='')

# One tissue per model ID.
ids_tissue_data = (
    drug_tissue_data[['SANGER_MODEL_ID', 'tissue']]
    .drop_duplicates(subset='SANGER_MODEL_ID')
)

# Merge tissues and model id.
# This cell rebinds patients_phenot_table to the merged frame, so on a re-run
# the left frame already has a 'tissue' column; merging again would yield
# 'tissue_x' / 'tissue_y' and break the ['tissue'] lookups below. Dropping any
# existing 'tissue' column first keeps the cell idempotent (no-op on first run).
patients_phenot_table = pd.merge(
    patients_phenot_table.drop(columns='tissue', errors='ignore'),
    ids_tissue_data,
    on='SANGER_MODEL_ID',
)
print(patients_phenot_table)

# Look at the number of each cancer type for the condition-phenotype of interest.
condition = 'TGFb'
phenotype = 'Metastasis'

# Minimum value of the '<condition>_ON_<phenotype>' column for a patient to be
# counted in the high-phenotype group.
phenotype_threshold = 0.1

# The resistant group changes according to the chosen condition and phenotype.
# group_phenotype_resistant: resistant patients with a high phenotype value.
group_phenotype_resistant = patients_phenot_table[
    (patients_phenot_table['Drug status'] == 'Resistant')
    & (patients_phenot_table[f'{condition}_ON_{phenotype}'] >= phenotype_threshold)
]

# print(group_phenotype_resistant['tissue'].value_counts()) # EGF- proliferation: 4 lung, 1 breast, 1 haematopoetic
#                                                           # TGFb- Metastasis: 21 haemato, 2 skin, 2 breast, 1 lung, 1 large intestine, 1 endom, 1 liver
