In [1]:
import maboss
import ginsim
import pandas as pd 
import numpy as np
import mygene
import os
import shutil

from create_generic_models.create_generic_patients_cfgs import create_generic_patients_cfg_bnd_validation
from create_generic_models.update_phenotypes_generic_models import generic_models_update_phenotypes


# from create_person_models.tailor_cfgs_patients_gene import personalized_patients_genes_cfgs_validation
from pre_process_data.pre_process_genes import create_table_rna_seq_patients

from identification_patients.validation_get_patients_ids import get_patients_valid

from pre_process_data.tcga_preprocess_data import pre_process_tcga_data

from create_person_models.tailor_cfgs_patients_gene import personalized_patients_genes_cfgs
from create_person_models.tailor_bnd_cnv import tailor_bnd_cnv_validation

from MaBoSS_simulation.MaBoSS_phenotype_distribution import compute_mean_patients

from create_person_models.tailor_bnd_tsg_onco_mutations import tailor_bnd_mutat_validation

from MaBoSS_simulation.maboss_phenotype_patient import compute_phenotype_table, compute_phenotype_mean_group_validation, combine_groups_values
from stats.stats_proba import compute_kruskal_test_means_validation

In [2]:
# Load the raw TCGA pan-cancer tables: clinical phenotype, gene expression and copy-number.
phenotype_data = pd.read_csv('data/TCGA_data/TCGA_GDC-PANCAN_tumor_stage_phenotype.csv')
genes_data = pd.read_csv('data/TCGA_data/TCGA_GDC-PANCAN_genes_FPKM_UQ.csv')
cnv_data = pd.read_csv('data/TCGA_data/TCGA-PANCAN_cnv.csv')

# Montagud interaction table; header=1 takes the column names from the second row of the file.
montagud_data = (
    pd.read_csv('data/Montagud_inter_nodes_data.csv', header=1)
    .loc[:, ['Target node', 'Interaction type', 'Source']])

# Genes of interest = every node that appears as a target or a source in the Montagud table.
# Non-string entries (e.g. NaN) and the '0/1' placeholder are skipped, names are upper-cased,
# 'KRAS' is added, and the nodes listed in `to_remove` are excluded.
to_remove = ['RAS', 'FUSED_EVENT', 'NKX3_1', 'SPOP', 'AR_ERG']
raw_nodes = montagud_data['Target node'].tolist() + montagud_data['Source'].tolist()
node_names = {
    node.upper()
    for node in raw_nodes
    if isinstance(node, str) and node != '0/1'
}
node_names.add('KRAS')

# sorted() instead of list(set(...)): the iteration order of a set of strings changes between
# kernel sessions, so the unsorted list was not reproducible from run to run.
montagud_nodes = sorted(node_names.difference(to_remove))

In [3]:
# List the available clinical columns, then show the morphology codes as a one-column frame.
print(phenotype_data.columns)
phenotype_data.loc[:, ['diagnoses.morphology']]

Index(['sample', 'demographic.age_at_index', 'demographic.created_datetime',
       'demographic.days_to_birth', 'demographic.days_to_death',
       'demographic.demographic_id', 'demographic.ethnicity',
       'demographic.gender', 'demographic.race', 'demographic.state',
       'demographic.submitter_id', 'demographic.updated_datetime',
       'demographic.vital_status', 'demographic.year_of_birth',
       'demographic.year_of_death', 'diagnoses.age_at_diagnosis',
       'diagnoses.classification_of_tumor', 'diagnoses.created_datetime',
       'diagnoses.days_to_diagnosis', 'diagnoses.days_to_last_follow_up',
       'diagnoses.diagnosis_id', 'diagnoses.icd_10_code',
       'diagnoses.last_known_disease_status', 'diagnoses.morphology',
       'diagnoses.primary_diagnosis', 'diagnoses.prior_malignancy',
       'diagnoses.prior_treatment', 'diagnoses.progression_or_recurrence',
       'diagnoses.site_of_resection_or_biopsy', 'diagnoses.state',
       'diagnoses.submitter_id', 'diagnoses

Unnamed: 0,diagnoses.morphology
0,8380/3
1,8441/3
2,8380/3
3,
4,8441/3
...,...
14313,8140/3
14314,8260/3
14315,8140/3
14316,8140/3


In [4]:
# Show the `diagnoses.state` column of the phenotype table.
phenotype_data.loc[:, 'diagnoses.state']

0        released
1        released
2        released
3             NaN
4        released
           ...   
14313    released
14314    released
14315    released
14316    released
14317    released
Name: diagnoses.state, Length: 14318, dtype: object

In [5]:

# Columns considered for staging: diagnoses.tumor_grade, diagnoses.morphology.
# Column for stratifying by cancer type: diagnoses.primary_diagnosis.
clinical_columns = ['diagnoses.tumor_grade', 'diagnoses.tumor_stage', 'diagnoses.primary_diagnosis']
phenotype_data_filtered = phenotype_data.loc[:, clinical_columns]


In [6]:
# Plain-text preview of the three clinical columns selected in the previous cell.
print(phenotype_data_filtered)

      diagnoses.tumor_grade diagnoses.tumor_stage  \
0              not reported          not reported   
1              not reported          not reported   
2              not reported          not reported   
3                       NaN                   NaN   
4              not reported          not reported   
...                     ...                   ...   
14313          not reported             stage iia   
14314          not reported              stage ii   
14315          not reported              stage iv   
14316          not reported            stage iiia   
14317          not reported            stage iiia   

            diagnoses.primary_diagnosis  
0      Endometrioid adenocarcinoma, NOS  
1        Serous cystadenocarcinoma, NOS  
2      Endometrioid adenocarcinoma, NOS  
3                                   NaN  
4        Serous cystadenocarcinoma, NOS  
...                                 ...  
14313               Adenocarcinoma, NOS  
14314     Papillary adenoca

In [7]:
# Keep only rows whose tumor grade is present and is not the 'not reported' placeholder.
# In the recorded run this leaves an empty table, and the next cell overwrites
# `phenotype_data_filtered` with the result of get_patients_valid.
tumor_grade = phenotype_data_filtered['diagnoses.tumor_grade']
has_reported_grade = tumor_grade.notna() & (tumor_grade != 'not reported')
phenotype_data_filtered = phenotype_data_filtered[has_reported_grade]
phenotype_data_filtered

Unnamed: 0,diagnoses.tumor_grade,diagnoses.tumor_stage,diagnoses.primary_diagnosis


In [8]:
# get_patients_valid is a project helper (identification_patients.validation_get_patients_ids).
# It returns the list of selected sample IDs and a phenotype table; the latter replaces the
# `phenotype_data_filtered` built in the cells above.
# NOTE(review): later cells read 'sample' and 'Tumor Group' from this table — presumably the
# helper adds 'Tumor Group'; verify against its implementation.
patients_id, phenotype_data_filtered = get_patients_valid(phenotype_data, genes_data, cnv_data)
print(patients_id)

['TCGA-GC-A3RD-01A', 'TCGA-A2-A25E-01A', 'TCGA-CF-A1HS-01A', 'TCGA-HQ-A2OF-01A', 'TCGA-FD-A5C1-01A', 'TCGA-BT-A20U-01A', 'TCGA-CF-A9FL-01A', 'TCGA-A8-A06Q-01A', 'TCGA-GC-A3WC-01A', 'TCGA-SY-A9G5-01A', 'TCGA-A2-A04Q-01A', 'TCGA-AR-A24N-01A', 'TCGA-BH-A1FU-01A', 'TCGA-OR-A5L5-01A', 'TCGA-A2-A0ER-01A', 'TCGA-E2-A1IN-01A', 'TCGA-AR-A24S-01A', 'TCGA-AO-A0J4-01A', 'TCGA-OR-A5JI-01A', 'TCGA-B6-A0IP-01A', 'TCGA-OR-A5J6-01A', 'TCGA-E7-A3X6-01A', 'TCGA-BH-A6R8-01A', 'TCGA-FD-A43X-01A', 'TCGA-B6-A0I1-01A', 'TCGA-DK-A6B0-01A', 'TCGA-BT-A20W-01A', 'TCGA-GD-A76B-01A', 'TCGA-AC-A5XU-01A', 'TCGA-BT-A42C-01A', 'TCGA-XF-A8HC-01A', 'TCGA-YC-A9TC-01A', 'TCGA-GC-A3YS-01A', 'TCGA-BT-A42F-01A', 'TCGA-LC-A66R-01A', 'TCGA-CU-A0YO-01A', 'TCGA-XF-AAMZ-01A', 'TCGA-XF-AAN7-01A', 'TCGA-OR-A5KO-01A', 'TCGA-P6-A5OG-01A']


In [9]:
# Pre-process the TCGA data for the Montagud nodes and the selected patient IDs.
# The live computation is disabled; the tables are read from CSV files instead
# (presumably saved outputs of the disabled call — confirm they match the current patients_id).
# NOTE(review): the disabled call assigns `df_melted_genes` (plural) while the rest of the
# notebook uses `df_melted_gene`; rename the target if this line is re-enabled.

# df_melted_cnv, df_melted_genes = pre_process_tcga_data(cnv_data, genes_data, patients_id, montagud_nodes)

# NOTE(review): the preview of the gene table in the next cell shows an 'Unnamed: 0' column,
# which looks like a saved index read back as data — confirm whether downstream helpers expect it.
df_melted_cnv= pd.read_csv('data/TCGA_data/filtered_data/cnv_samples_table.csv')
df_melted_gene= pd.read_csv('data/TCGA_data/filtered_data/genes_samples_table.csv')

In [10]:
# Preview the first rows of the long-format gene expression table loaded above.
print(df_melted_gene.head())

   Unnamed: 0          model_id gene_symbol   rsem_tpm
0           0  TCGA-GC-A3RD-01A       AXIN1  17.570207
1           1  TCGA-A2-A25E-01A       AXIN1  16.120079
2           2  TCGA-CF-A1HS-01A       AXIN1  17.055333
3           3  TCGA-HQ-A2OF-01A       AXIN1  17.203970
4           4  TCGA-FD-A5C1-01A       AXIN1  16.951919


In [11]:
 # Create generic models 

# Template model files. Despite the `folder_` prefix these two values are file paths
# (.cfg and .bnd), not directories.
# NOTE(review): the template is named 'Adeno_lung_Cancer' while `tissue` is 'PAN_CANCER' —
# confirm this is the intended generic model for the pan-cancer validation.
folder_generic_models_cfg = 'validation/generic_models/Adeno_lung_Cancer.cfg'
folder_generic_models_bnd = 'validation/generic_models/Adeno_lung_Cancer.bnd'

# Directory holding the per-patient model files; reused by the personalisation cells below.
folder_pers_models = 'validation/personalized_models/models_gene_expression'

# The recorded logs show per-patient file names of the form <sample>_PAN_CANCER.cfg,
# so this label appears to be used as the file-name suffix.
tissue = 'PAN_CANCER'

# Passed later to personalized_patients_genes_cfgs; set to the same label as `tissue`.
drug_name = 'PAN_CANCER'

# Project helper; the recorded output reports that all .cfg and .bnd files were created.
create_generic_patients_cfg_bnd_validation(folder_generic_models_cfg, folder_generic_models_bnd, folder_pers_models, patients_id, tissue)


All .cfg and .bnd files created for the validation.


In [12]:
# Update the phenotype settings of the per-patient models.
# Source and destination directory are the same path.
phenotype_interest = [
    "Proliferation",
    "Invasion",
    "DNA_Repair",
    "Migration",
    "Apoptosis",
]
original_data_dir = "validation/personalized_models/models_gene_expression"
results_dir = original_data_dir

generic_models_update_phenotypes(phenotype_interest, original_data_dir, results_dir)


Updated TWIST1.is_internal=1 in TCGA-OR-A5JI-01A_PAN_CANCER.cfg
Updated eEF2.is_internal=1 in TCGA-OR-A5JI-01A_PAN_CANCER.cfg
Updated Slug.is_internal=1 in TCGA-OR-A5JI-01A_PAN_CANCER.cfg
Updated CDH2.is_internal=1 in TCGA-OR-A5JI-01A_PAN_CANCER.cfg
Updated EGF.is_internal=1 in TCGA-OR-A5JI-01A_PAN_CANCER.cfg
Updated ERG.is_internal=1 in TCGA-OR-A5JI-01A_PAN_CANCER.cfg
Updated GSH.is_internal=1 in TCGA-OR-A5JI-01A_PAN_CANCER.cfg
Updated beta_catenin.is_internal=1 in TCGA-OR-A5JI-01A_PAN_CANCER.cfg
Updated CyclinB.is_internal=1 in TCGA-OR-A5JI-01A_PAN_CANCER.cfg
Updated SMO.is_internal=1 in TCGA-OR-A5JI-01A_PAN_CANCER.cfg
Updated BRCA1.is_internal=1 in TCGA-OR-A5JI-01A_PAN_CANCER.cfg
Updated p90RSK.is_internal=1 in TCGA-OR-A5JI-01A_PAN_CANCER.cfg
Updated EP300.is_internal=1 in TCGA-OR-A5JI-01A_PAN_CANCER.cfg
Updated PIP3.is_internal=1 in TCGA-OR-A5JI-01A_PAN_CANCER.cfg
Updated PTEN.is_internal=1 in TCGA-OR-A5JI-01A_PAN_CANCER.cfg
Updated Caspase9.is_internal=1 in TCGA-OR-A5JI-01A_PAN_CA

In [13]:
# Personalise the Boolean networks with gene expression: build the per-patient table whose
# printed preview lists 'High Gene Expression' and 'Low Gene Expression' genes per model_id.
# NOTE(review): the recorded run emits a pandas SettingWithCopyWarning for an in-place
# rename on `rna_seq_data`, presumably inside the helper — the fix belongs in that module.
table_rna_seq_patients = create_table_rna_seq_patients(df_melted_gene)
print(table_rna_seq_patients.head())

gene_expression_level                               High Gene Expression  \
model_id                                                                   
TCGA-A2-A04Q-01A                                      CFLAR, ETS1, YWHAZ   
TCGA-A2-A0ER-01A       APAF1, AR, BCL2, EEF2K, EP300, FADD, FRS2, LDH...   
TCGA-A2-A25E-01A       APAF1, BCL2, BRCA1, EGF, FRS2, KRAS, MDM2, NCO...   
TCGA-A8-A06Q-01A       APAF1, AR, ATR, BRCA2, EEF2K, EGF, FADD, FRS2,...   
TCGA-AC-A5XU-01A       APAF1, AXIN1, BCL2, COX4I2, EEF2, EEF2K, MED12...   

gene_expression_level                                Low Gene Expression  
model_id                                                                  
TCGA-A2-A04Q-01A                             BAD, EEF2, FGFR3, NF1, PTEN  
TCGA-A2-A0ER-01A       AXIN1, BAD, BAX, BMP2, EGFR, IDH1, JUN, PTCH1,...  
TCGA-A2-A25E-01A         AXIN1, BAD, BAX, EGFR, FADD, PTCH1, SHH, ZBTB17  
TCGA-A8-A06Q-01A       AXIN1, BAD, BAX, BMP2, COX4I2, EGFR, ETS1, ETV...  
TCGA-AC-A5XU-01A 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rna_seq_data.rename(columns={"gene_symbol": "gene_name"}, inplace=True)


In [14]:
# Tailor every patient's model files from the gene expression data.
# The same directory is passed as both the third and the fourth positional argument;
# the recorded log lists the .cfg and .bnd files that were modified and saved there.
personalized_patients_genes_cfgs(
    df_melted_gene,
    montagud_nodes,
    folder_pers_models,
    folder_pers_models,
    patients_id,
    table_rna_seq_patients,
    drug_name,
)

Modified and saved: validation/personalized_models/models_gene_expression/TCGA-OR-A5JI-01A_PAN_CANCER.cfg
Modified and saved: validation/personalized_models/models_gene_expression/TCGA-LC-A66R-01A_PAN_CANCER.cfg
Modified and saved: validation/personalized_models/models_gene_expression/TCGA-B6-A0IP-01A_PAN_CANCER.bnd
Modified and saved: validation/personalized_models/models_gene_expression/TCGA-GC-A3RD-01A_PAN_CANCER.bnd
Modified and saved: validation/personalized_models/models_gene_expression/TCGA-BT-A20W-01A_PAN_CANCER.cfg
Modified and saved: validation/personalized_models/models_gene_expression/TCGA-GC-A3WC-01A_PAN_CANCER.cfg
Modified and saved: validation/personalized_models/models_gene_expression/TCGA-AC-A5XU-01A_PAN_CANCER.bnd
Modified and saved: validation/personalized_models/models_gene_expression/TCGA-BH-A1FU-01A_PAN_CANCER.cfg
Modified and saved: validation/personalized_models/models_gene_expression/TCGA-FD-A5C1-01A_PAN_CANCER.bnd
Modified and saved: validation/personalized_mo

In [15]:
# Personalise the models with copy-number data. The recorded log shows the helper going
# through (patient, gene) pairs and either replacing the gene's node or reporting that
# no such node exists in the patient's file.
tailor_bnd_cnv_validation(df_melted_cnv, folder_pers_models, tissue)

🔍 Processing patient TCGA-AR-A24N-01A, gene: AXIN1
AXIN1 node found. Replacing...
🔍 Processing patient TCGA-AR-A24N-01A, gene: SMO
SMO node found. Replacing...
🔍 Processing patient TCGA-AR-A24N-01A, gene: EEF2K
No EEF2K node found in file for patient TCGA-AR-A24N-01A
🔍 Processing patient TCGA-AR-A24N-01A, gene: AR
AR node found. Replacing...
🔍 Processing patient TCGA-AR-A24N-01A, gene: E2F1
E2F1 node found. Replacing...
🔍 Processing patient TCGA-AR-A24N-01A, gene: ZBTB17
ZBTB17 node found. Replacing...
🔍 Processing patient TCGA-AR-A24N-01A, gene: MED12
MED12 node found. Replacing...
🔍 Processing patient TCGA-AR-A24N-01A, gene: BAX
BAX node found. Replacing...
🔍 Processing patient TCGA-AR-A24N-01A, gene: ATM
ATM node found. Replacing...
🔍 Processing patient TCGA-AR-A24N-01A, gene: TERT
TERT node found. Replacing...
🔍 Processing patient TCGA-AR-A24N-01A, gene: FRS2
FRS2 node found. Replacing...
🔍 Processing patient TCGA-AR-A24N-01A, gene: SHH
SHH node found. Replacing...
🔍 Processing pat

In [None]:
# Somatic mutation data: annotate mutated network genes as oncogene / tumour suppressor
# and derive the loss-of-function and gain-of-function tables.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt; the mutation
# file may be too large to load in one go — consider `usecols` / `chunksize` if so.

# Keep only mutations of the selected patients that fall on a Montagud network gene.
mutations_data = pd.read_csv('data/TCGA_data/TCGA_mutations_mutect2_GDC-PANCAN.csv')
sample_in_cohort = mutations_data['Sample_ID'].isin(patients_id)
gene_in_network = mutations_data['gene'].isin(montagud_nodes)
mutations_data_filtered = mutations_data[sample_in_cohort & gene_in_network]

# Oncogene / tumour-suppressor annotation table, restricted to the network genes and
# keyed by 'gene' so that it can be merged with the mutation table.
onco_tsg_gene = pd.read_csv('data/unknown_origin/oncogenes_tsg.csv')
onco_tsg_gene = onco_tsg_gene.loc[:, ['Hugo Symbol', 'Is Oncogene', 'Is Tumor Suppressor Gene']]
onco_tsg_gene_filtered = (
    onco_tsg_gene
    .loc[onco_tsg_gene['Hugo Symbol'].isin(montagud_nodes)]
    .rename(columns={'Hugo Symbol': 'gene'})
)

# Attach the annotation to every mutation, then drop rows that carry no annotation at all.
mutations_annotated = (
    mutations_data_filtered
    .merge(onco_tsg_gene_filtered, on='gene', how='left')
    .rename(columns={'Is Oncogene': 'oncogene', 'Is Tumor Suppressor Gene': 'tsg'})
)
has_annotation = mutations_annotated['oncogene'].notna() | mutations_annotated['tsg'].notna()
mutations_annotated = mutations_annotated[has_annotation]

# Loss-of-function assumption: a disruptive variant effect in a gene annotated as
# tumour suppressor only (tsg == 'Yes' and oncogene == 'No').
lof_effects = ["frameshift_variant", "stop_gained", "start_lost", "splice_region_variant"]
lof_mutations = mutations_annotated[mutations_annotated['effect'].isin(lof_effects)]
tsg_only = lof_mutations['tsg'].eq('Yes') & lof_mutations['oncogene'].eq('No')
lof_mutations_tsg = lof_mutations[tsg_only]
lof_mutations_tsg_filtered = lof_mutations_tsg[['Sample_ID', 'gene']]

# Gain-of-function assumption: a gene annotated as oncogene only, carrying one of the
# listed amino-acid changes.
# NOTE(review): an earlier comment here mentioned dna_vaf > 0.5 as a marker of clonal
# mutations; no VAF filter is applied in this cell.
oncogene_only = mutations_annotated['tsg'].eq('No') & mutations_annotated['oncogene'].eq('Yes')
mutations_onco = mutations_annotated[oncogene_only]
gof_effects = ['p.G12D','p.S249C', 'p.Y373C']
gof_mutations = mutations_onco[mutations_onco['Amino_Acid_Change'].isin(gof_effects)]
gof_mutations_filtered = gof_mutations[['Sample_ID', 'gene']]




KeyboardInterrupt: 

Next, let's try to add more information to reflect metastasis-related changes by adding the mutation data.

In [None]:
# tailor_bnd_mutat_validation(lof_mutations_tsg_filtered,gof_mutations_filtered,folder_pers_models, tissue)

In [None]:
# Compute the phenotype distribution table of every patient, one call per patient.
folder_models = "validation/personalized_models/models_gene_expression"
folder_save_results = "validation/results/phenotype_distribution"
tissue = 'PAN_CANCER'
phenotypes_interest = ["Proliferation", "Invasion", "DNA_Repair", "Migration", "Apoptosis"]
inputs_list = [
    'EGF',
    'FGF',
    'Androgen',
    'TGFb',
    'Hypoxia',
    'Nutrients',
    'Carcinogen',
    'Acidosis',
    'TNFalpha',
]

for patient in patients_id:
    compute_phenotype_table(
        folder_save_results,
        folder_models,
        patient,
        inputs_list,
        phenotypes_interest,
        tissue,
    )


In [14]:
# Restrict the phenotype table to the selected patients.
phenotype_data_filtered = phenotype_data_filtered[phenotype_data_filtered['sample'].isin(patients_id)]

# Sample IDs of each tumor group, matched on the 'Tumor Group' label.
# NOTE(review): 'Advanced Local Speed' looks like a typo for 'Advanced Local Spread'; it is
# kept as written because it must match the labels stored in the 'Tumor Group' column —
# verify against the code that builds that column, otherwise stage 3 is silently empty.
# NOTE(review): these four lists are not referenced by the later cells visible in this notebook.
tumor_group = phenotype_data_filtered['Tumor Group']
sample_ids = phenotype_data_filtered['sample']
stage_1_group_ids = sample_ids[tumor_group == 'Early Stage'].tolist()
stage_2_group_ids = sample_ids[tumor_group == 'Larger Tumor'].tolist()
stage_3_group_ids = sample_ids[tumor_group == 'Advanced Local Speed'].tolist()
stage_4_group_ids = sample_ids[tumor_group == 'Metastatic'].tolist()



In [15]:
# Group labels handed to the group-mean helper; the recorded output prints one
# input-by-phenotype table of means (with an Overall_Mean row) per group.
# NOTE(review): only the label strings and a folder path are passed — the stage_*_group_ids
# lists built in the previous cell are not arguments here; confirm how the helper finds
# the patients of each group.
stages_groups = ["stage_1_group", "stage_2_group", "stage_3_group", "stage_4_group"]

folder_groups_means = "validation/results/phenotype_group_means"
compute_phenotype_mean_group_validation(stages_groups, folder_groups_means)

              Proliferation  Invasion  DNA_Repair  Migration  Apoptosis
Acidosis           0.771178  0.091511    0.701395   0.033085   0.152028
Androgen           0.771870  0.090220    0.700500   0.077837   0.054281
Carcinogen         0.781296  0.083356    0.916561   0.010400   0.278770
EGF                0.771669  0.092319    0.700499   0.078226   0.065108
FGF                0.808667  0.087568    0.702669   0.021828   0.124075
Hypoxia            0.735644  0.088537    0.699362   0.034040   0.115543
Nutrients          0.805048  0.092880    0.700600   0.033181   0.079830
TGFb               0.762335  0.159608    0.701658   0.041244   0.131951
TNFalpha           0.708399  0.295983    0.700200   0.103930   0.075465
Overall_Mean       0.768456  0.120220    0.724827   0.048197   0.119672
              Proliferation  Invasion  DNA_Repair  Migration  Apoptosis
Acidosis           0.755000  0.056364    0.699001   0.017819   0.135570
Androgen           0.772750  0.066829    0.700600   0.039802   0

Unnamed: 0,Proliferation,Invasion,DNA_Repair,Migration,Apoptosis
Acidosis,0.902985,0.045993,0.899116,0.015143,0.142882
Androgen,0.91596,0.049285,0.9006,0.032691,0.044682
Carcinogen,0.8972,0.041482,0.954894,0.001,0.214682
EGF,0.917193,0.052114,0.90007,0.036638,0.054528
FGF,0.915419,0.046478,0.9,0.013766,0.057718
Hypoxia,0.899315,0.039662,0.8982,0.015953,0.104561
Nutrients,0.916235,0.054916,0.901389,0.015092,0.057389
TGFb,0.90908,0.067892,0.901084,0.015635,0.074335
TNFalpha,0.904,0.099264,0.9,0.031365,0.056923
Overall_Mean,0.908599,0.055232,0.90615,0.019698,0.089744


In [18]:
# Combine the values of the two groups so that the statistical test is easier to run later.
# The printed result is a dict keyed by (input, phenotype) pairs, each holding a
# 'GroupA' and a 'GroupB' list of values.

base_path = 'validation/results/phenotype_group_means'
data_combined = combine_groups_values(base_path)
print(data_combined)

{('EGF', 'Proliferation'): {'GroupA': [0.999999, 0.2135929999999999, 0.999999, 0.999999, 0.348155, 0.999999, 0.154949, 0.999999, 0.999999, 0.999999, 0.999999, 0.999999, 0.999999, 0.999999, 0.067979, 0.999999, 0.232288, 0.999999, 0.999999, 0.479216], 'GroupB': [0.999999, 0.999999, 0.999999, 0.999999, 0.999999, 0.999999, 0.999999, 0.067979, 0.067979, 0.999999, 0.1719379999999999, 0.999999, 0.999999, 0.999999, 0.999999, 0.999999, 0.999999, 0.999999, 0.999999, 0.999999]}, ('EGF', 'Invasion'): {'GroupA': [0.002713, 0.078676, 0.002713, 0.002713, 0.206308, 0.002713, 0.619218, 0.002713, 0.002713, 0.002713, 0.002713, 0.002713, 0.002713, 0.002713, 0.0272479999999999, 0.002713, 0.345025, 0.002713, 0.002713, 0.313446], 'GroupB': [0.002713, 0.002713, 0.002713, 0.002713, 0.002713, 0.002713, 0.002713, 0.0272479999999999, 0.0272479999999999, 0.002713, 0.4967210000000001, 0.002713, 0.002713, 0.002713, 0.002713, 0.002713, 0.002713, 0.002713, 0.002713, 0.002713]}, ('EGF', 'DNA_Repair'): {'GroupA': [0.999

In [19]:

# Statistical test between the groups (the helper's name indicates a Kruskal test).
# In the recorded run the result is an empty DataFrame with the columns
# Input, Phenotype, Kruskal_Stat, p_value and Significant.
significant_results = compute_kruskal_test_means_validation(data_combined)
print(significant_results)

Empty DataFrame
Columns: [Input, Phenotype, Kruskal_Stat, p_value, Significant]
Index: []
