In [1]:
import maboss
import ginsim
import pandas as pd 
import numpy as np
import mygene
import os
import shutil

from create_generic_models.create_generic_patients_cfgs import create_generic_patients_cfg_bnd_validation
from create_generic_models.update_phenotypes_generic_models import generic_models_update_phenotypes


# from create_person_models.tailor_cfgs_patients_gene import personalized_patients_genes_cfgs_validation
from pre_process_data.pre_process_genes import create_table_rna_seq_patients

from identification_patients.validation_get_patients_ids import get_patients_valid

from pre_process_data.tcga_preprocess_data import pre_process_tcga_data

from create_person_models.tailor_cfgs_patients_gene import personalized_patients_genes_cfgs
from create_person_models.tailor_bnd_cnv import tailor_bnd_cnv_validation

from MaBoSS_simulation.MaBoSS_phenotype_distribution import compute_phenotypes_distribution, compute_mean_patients

from create_person_models.tailor_bnd_tsg_onco_mutations import tailor_bnd_mutat_validation

In [2]:
# Import data
phenotype_data = pd.read_csv('data/TCGA_data/TCGA_GDC-PANCAN_tumor_stage_phenotype.csv')
genes_data = pd.read_csv('data/TCGA_data/TCGA_GDC-PANCAN_genes_FPKM_UQ.csv')

cnv_data = pd.read_csv('data/TCGA_data/TCGA-PANCAN_cnv.csv')
montagud_data = (
    pd.read_csv('data/Montagud_inter_nodes_data.csv', header=1)
    .loc[:, ['Target node', 'Interaction type', 'Source']])

# Create list of genes of interest (in Montagud data)
montagud_nodes = list(set(montagud_data['Target node'].tolist() + montagud_data['Source'].tolist()))
montagud_nodes = [node for node in montagud_nodes if node != '0/1']
montagud_nodes = [node.upper() for node in montagud_nodes if isinstance(node, str)]
montagud_nodes.append('KRAS')
to_remove = ['RAS', 'FUSED_EVENT', 'NKX3_1', 'SPOP', 'AR_ERG']

montagud_nodes = [node for node in montagud_nodes if node not in to_remove]
montagud_nodes = list(set(montagud_nodes))

In [3]:
patients_id, phenotype_data_filtered = get_patients_valid(phenotype_data, genes_data, cnv_data)
print(patients_id)

['TCGA-GC-A3RD-01A', 'TCGA-A2-A25E-01A', 'TCGA-CF-A1HS-01A', 'TCGA-HQ-A2OF-01A', 'TCGA-FD-A5C1-01A', 'TCGA-BT-A20U-01A', 'TCGA-CF-A9FL-01A', 'TCGA-A8-A06Q-01A', 'TCGA-GC-A3WC-01A', 'TCGA-SY-A9G5-01A', 'TCGA-A2-A04Q-01A', 'TCGA-AR-A24N-01A', 'TCGA-BH-A1FU-01A', 'TCGA-OR-A5L5-01A', 'TCGA-A2-A0ER-01A', 'TCGA-E2-A1IN-01A', 'TCGA-AR-A24S-01A', 'TCGA-AO-A0J4-01A', 'TCGA-OR-A5JI-01A', 'TCGA-B6-A0IP-01A', 'TCGA-OR-A5J6-01A', 'TCGA-E7-A3X6-01A', 'TCGA-BH-A6R8-01A', 'TCGA-FD-A43X-01A', 'TCGA-B6-A0I1-01A', 'TCGA-DK-A6B0-01A', 'TCGA-BT-A20W-01A', 'TCGA-GD-A76B-01A', 'TCGA-AC-A5XU-01A', 'TCGA-BT-A42C-01A', 'TCGA-XF-A8HC-01A', 'TCGA-YC-A9TC-01A', 'TCGA-GC-A3YS-01A', 'TCGA-BT-A42F-01A', 'TCGA-LC-A66R-01A', 'TCGA-CU-A0YO-01A', 'TCGA-XF-AAMZ-01A', 'TCGA-XF-AAN7-01A', 'TCGA-OR-A5KO-01A', 'TCGA-P6-A5OG-01A']


In [4]:
# pre-process tcga data based on montagud nodes and patients ids

# df_melted_cnv, df_melted_genes = pre_process_tcga_data(cnv_data, genes_data, patients_id, montagud_nodes)

df_melted_cnv= pd.read_csv('data/TCGA_data/filtered_data/cnv_samples_table.csv')
df_melted_gene= pd.read_csv('data/TCGA_data/filtered_data/genes_samples_table.csv')

In [5]:
print(df_melted_gene.head())

   Unnamed: 0          model_id gene_symbol   rsem_tpm
0           0  TCGA-GC-A3RD-01A       AXIN1  17.570207
1           1  TCGA-A2-A25E-01A       AXIN1  16.120079
2           2  TCGA-CF-A1HS-01A       AXIN1  17.055333
3           3  TCGA-HQ-A2OF-01A       AXIN1  17.203970
4           4  TCGA-FD-A5C1-01A       AXIN1  16.951919


In [6]:
 # Create generic models 

folder_generic_models_cfg = 'validation/generic_models/Adeno_lung_Cancer.cfg'
folder_generic_models_bnd = 'validation/generic_models/Adeno_lung_Cancer.bnd'

folder_pers_models = 'validation/personalized_models/models_gene_expression'

tissue = 'PAN_CANCER'

drug_name = 'PAN_CANCER'

# create_generic_patients_cfg_bnd_validation(folder_generic_models_cfg, folder_generic_models_bnd, folder_pers_models, patients_id, tissue)


In [7]:
# update phenotypes in generic models 

phenotype_interest = ["Proliferation","Invasion","DNA_Repair","Migration","Apoptosis"]
original_data_dir = "validation/personalized_models/models_gene_expression"
results_dir = "validation/personalized_models/models_gene_expression"



# generic_models_update_phenotypes(phenotype_interest, original_data_dir, results_dir)


In [8]:
# personalize the boolean networks with genes 
table_rna_seq_patients = create_table_rna_seq_patients(df_melted_gene)
print(table_rna_seq_patients.head())

gene_expression_level                               High Gene Expression  \
model_id                                                                   
TCGA-A2-A04Q-01A                                      CFLAR, ETS1, YWHAZ   
TCGA-A2-A0ER-01A       APAF1, AR, BCL2, EEF2K, EP300, FADD, FRS2, LDH...   
TCGA-A2-A25E-01A       APAF1, BCL2, BRCA1, EGF, FRS2, KRAS, MDM2, NCO...   
TCGA-A8-A06Q-01A       APAF1, AR, ATR, BRCA2, EEF2K, EGF, FADD, FRS2,...   
TCGA-AC-A5XU-01A       APAF1, AXIN1, BCL2, COX4I2, EEF2, EEF2K, MED12...   

gene_expression_level                                Low Gene Expression  
model_id                                                                  
TCGA-A2-A04Q-01A                             BAD, EEF2, FGFR3, NF1, PTEN  
TCGA-A2-A0ER-01A       AXIN1, BAD, BAX, BMP2, EGFR, IDH1, JUN, PTCH1,...  
TCGA-A2-A25E-01A         AXIN1, BAD, BAX, EGFR, FADD, PTCH1, SHH, ZBTB17  
TCGA-A8-A06Q-01A       AXIN1, BAD, BAX, BMP2, COX4I2, EGFR, ETS1, ETV...  
TCGA-AC-A5XU-01A 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rna_seq_data.rename(columns={"gene_symbol": "gene_name"}, inplace=True)


In [9]:
# personalized_patients_genes_cfgs(df_melted_gene,montagud_nodes,folder_pers_models,folder_pers_models,patients_id,table_rna_seq_patients,drug_name)

In [10]:
# personalize with CNV
# tailor_bnd_cnv_validation(df_melted_cnv, folder_pers_models, tissue)

In [11]:
# add mutations info 

# Add somatic mutations data 
# keep only the one we had for the main pipeline

mutations_data = pd.read_csv('data/TCGA_data/TCGA_mutations_mutect2_GDC-PANCAN.csv')
mutations_data_filtered = mutations_data[mutations_data['Sample_ID'].isin(patients_id)]
mutations_data_filtered = mutations_data_filtered[mutations_data_filtered['gene'].isin(montagud_nodes)]

# check if genes are TSG/ Oncogenes
onco_tsg_gene = pd.read_csv('data/unknown_origin/oncogenes_tsg.csv')
onco_tsg_gene = onco_tsg_gene[['Hugo Symbol', 'Is Oncogene', 'Is Tumor Suppressor Gene']]
onco_tsg_gene_filtered = onco_tsg_gene[onco_tsg_gene['Hugo Symbol'].isin(montagud_nodes)]
onco_tsg_gene_filtered = onco_tsg_gene_filtered.rename(columns={'Hugo Symbol': 'gene'})
# oncogenes = onco_tsg_gene_filtered[onco_tsg_gene_filtered['Is Oncogene'] == 'Yes']
# tsg_genes = onco_tsg_gene_filtered[onco_tsg_gene_filtered['Is Tumor Suppressor Gene'] == 'Yes']


mutations_annotated = mutations_data_filtered.merge(
    onco_tsg_gene_filtered[['gene', 'Is Oncogene', 'Is Tumor Suppressor Gene']],
    on='gene',
    how='left'
)
mutations_annotated = mutations_annotated.rename(columns={'Is Oncogene': 'oncogene', 'Is Tumor Suppressor Gene': 'tsg'})

mutations_annotated = mutations_annotated[
    mutations_annotated['oncogene'].notna() | mutations_annotated['tsg'].notna()
]

# loss function mutation assumption -> TSG and 
lof_effects = ["frameshift_variant", "stop_gained", "start_lost", "splice_region_variant"]
lof_mutations = mutations_annotated[mutations_annotated['effect'].isin(lof_effects)]
lof_mutations_tsg = lof_mutations[(lof_mutations['tsg'] == 'Yes') & (lof_mutations['oncogene'] == 'No')]
lof_mutations_tsg_filtered = lof_mutations_tsg[['Sample_ID', 'gene']]


mutations_onco = mutations_annotated[(mutations_annotated['tsg'] == 'No') & (mutations_annotated['oncogene'] == 'Yes')]

# dna_vaf > 0.5 -> clonal mutation (mutation probably in the early tumor cells)
gof_effects = ['p.G12D','p.S249C', 'p.Y373C']
gof_mutations = mutations_onco[mutations_onco['Amino_Acid_Change'].isin(gof_effects)]
gof_mutations_filtered = gof_mutations[['Sample_ID', 'gene']]




Let's try to add more info to reflect metastasis change, add mutations data

In [12]:
# tailor_bnd_mutat_validation(lof_mutations_tsg_filtered,gof_mutations_filtered,folder_pers_models, tissue)

In [13]:
# compute phenotype distribution


# why nothing is computed ??

folder = 'validation/results'
inputs_list = ['EGF', 'FGF', 'TGFb', 'Nutrients', 'Hypoxia', 'Acidosis', 'Androgen', 'TNFalpha', 'Carcinogen']


patients_data_dict = compute_phenotypes_distribution(phenotype_interest, folder,folder_pers_models, inputs_list, tissue)


--- Results for patient: TCGA-OR-A5JI-01A_PAN_CANCER ---
patients columns are: Index(['<nil>', 'Invasion -- Migration -- Proliferation', 'Apoptosis',
       'Apoptosis -- Proliferation', 'Proliferation', 'Apoptosis -- Invasion',
       'Invasion -- Proliferation', 'Invasion', 'Invasion -- Migration',
       'Apoptosis -- DNA_Repair', 'DNA_Repair'],
      dtype='object')
               Proliferation  Invasion  DNA_Repair  Migration  Apoptosis
EGF_ON                   NaN       NaN         NaN        0.0        NaN
FGF_ON              0.101963       NaN         NaN        0.0   0.395571
TGFb_ON                  NaN       NaN         NaN        0.0        NaN
Nutrients_ON        0.410210       NaN         NaN        0.0        NaN
Hypoxia_ON               NaN       NaN         NaN        0.0        NaN
Acidosis_ON         0.000200       NaN         NaN        0.0        NaN
Androgen_ON              NaN       NaN         NaN        0.0        NaN
TNFalpha_ON              NaN  0.935992    

KeyboardInterrupt: 

In [None]:
phenotype_data_filtered = phenotype_data_filtered[phenotype_data_filtered['sample'].isin(patients_id)]
# print(phenotype_data_filtered)


stage_1_group_ids= list(phenotype_data_filtered[phenotype_data_filtered['Tumor Group'] == 'Early Stage']['sample'])
stage_2_group_ids= list(phenotype_data_filtered[phenotype_data_filtered['Tumor Group'] == 'Larger Tumor']['sample'])
stage_3_group_ids= list(phenotype_data_filtered[phenotype_data_filtered['Tumor Group'] == 'Advanced Local Speed']['sample'])
stage_4_group_ids= list(phenotype_data_filtered[phenotype_data_filtered['Tumor Group'] == 'Metastatic']['sample'])

print(stage_2_group_ids)


In [None]:
# save all the files to their associated directory

# Base results folder where original phenotype files are stored
folder_phenotype_results = 'validation/results/only_gene_expression/single_input_on/phenotype_distribution_patients/'

# Define group names and sample IDs
groups = {
    'stage_1_group': stage_1_group_ids,
    'stage_2_group': stage_2_group_ids,
    'stage_3_group': stage_3_group_ids,
    'stage_4_group': stage_4_group_ids,
}

# Move or copy files to each group folder
for group_name, sample_ids in groups.items():
    group_folder = os.path.join(folder_phenotype_results, group_name)
    os.makedirs(group_folder, exist_ok=True)

    for sample_id in sample_ids:
        filename = f'{sample_id}_PAN_CANCER_phenotypes.csv'
        src_path = os.path.join(folder_phenotype_results, filename)
        dst_path = os.path.join(group_folder, filename)

        if os.path.exists(src_path):
            shutil.copy(src_path, dst_path)  # Use shutil.move if you want to move instead of copy
            print(f"Copied {filename} to {group_folder}")
        else:
            print(f"File not found: {src_path}")


In [None]:
number = 1
dic_patients_data = f'validation/results/only_gene_expression/single_input_on/phenotype_distribution_patients/stage_{number}_group'
phenotype_interest = ["Proliferation","Invasion","DNA_Repair","Migration","Apoptosis"]

def load_patients_data(folder_path):
    patient_dict = {}
    for file in os.listdir(folder_path):
        if file.endswith("_phenotypes.csv"):
            sample_id = file.replace("_PAN_CANCER_phenotypes.csv", "").replace("_phenotypes.csv", "")
            df = pd.read_csv(os.path.join(folder_path, file), index_col=0)
            df = df.fillna(0)
            patient_dict[sample_id] = df
    return patient_dict


# Load the data
dic_patients_data_group = load_patients_data(dic_patients_data)
df_results_mean, df_results_std, stats_results_data_df = compute_mean_patients(phenotype_interest, dic_patients_data_group)


file_path = os.path.join(dic_patients_data, "mean_phenotype_distribution.csv")
df_results_mean.to_csv(file_path)



In [None]:
print(df_results_mean)

df_results_mean.to_csv(f'validation/results/only_gene_expression/single_input_on/phenotype_distribution_patients/stage_{number}_group/mean_stage{number}_group/df_results_mean_stage_{number}.csv')
df_results_std.to_csv(f'validation/results/only_gene_expression/single_input_on/phenotype_distribution_patients/stage_{number}_group/mean_stage{number}_group/df_results_std_stage_{number}.csv')
stats_results_data_df.to_csv(f'validation/results/only_gene_expression/single_input_on/phenotype_distribution_patients/stage_{number}_group/mean_stage{number}_group/stats_results_data_df_stage_{number}.csv')