In [18]:
import maboss
import ginsim
import pandas as pd 
import numpy as np
import mygene
import os
import shutil

from create_generic_models.create_generic_patients_cfgs import create_generic_patients_cfg_bnd_validation
from create_generic_models.update_phenotypes_generic_models import generic_models_update_phenotypes


# from create_person_models.tailor_cfgs_patients_gene import personalized_patients_genes_cfgs_validation
from pre_process_data.pre_process_genes import create_table_rna_seq_patients

from identification_patients.validation_get_patients_ids import get_patients_valid

from pre_process_data.tcga_preprocess_data import pre_process_tcga_data

from create_person_models.tailor_cfgs_patients_gene import personalized_patients_genes_cfgs
from create_person_models.tailor_bnd_cnv import tailor_bnd_cnv_validation

from MaBoSS_simulation.MaBoSS_phenotype_distribution import compute_mean_patients

from create_person_models.tailor_bnd_tsg_onco_mutations import tailor_bnd_mutat_validation

from MaBoSS_simulation.maboss_phenotype_patient import compute_phenotype_table, compute_phenotype_mean_group_validation, combine_groups_values
from stats.stats_proba import compute_kruskal_test_means_validation

In [19]:
# Import data
phenotype_data = pd.read_csv('data/TCGA_data/prostate/TCGA_PRAD_phenotypes.csv')
genes_data = pd.read_csv('data/TCGA_data/prostate/TCGA_PRAD_genes_illumina.csv', sep='\t')
cnv_data = pd.read_csv('data/TCGA_data/prostate/TCGA_PRAD_cnv_gistic2.csv',sep='\t')



# keep all montagud nodes
montagud_data = (
    pd.read_csv('data/Montagud_inter_nodes_data.csv', header=1)
    .loc[:, ['Target node', 'Interaction type', 'Source']])

# Create list of genes of interest (in Montagud data)
montagud_nodes = list(set(montagud_data['Target node'].tolist() + montagud_data['Source'].tolist()))
montagud_nodes = [node for node in montagud_nodes if node != '0/1']
montagud_nodes = [node.upper() for node in montagud_nodes if isinstance(node, str)]
# montagud_nodes.append('KRAS')
# to_remove = ['RAS', 'FUSED_EVENT', 'NKX3_1', 'SPOP', 'AR_ERG']

# montagud_nodes = [node for node in montagud_nodes if node not in to_remove]
# montagud_nodes = list(set(montagud_nodes))

In [20]:

# diagnoses.tumor_grade, diagnoses.morphology
# stratify by cancer type: diagnoses.primary_diagnosis

phenotype_data_filtered = phenotype_data[['sampleID','gleason_score']]


In [21]:
print(phenotype_data_filtered)
print(phenotype_data_filtered['gleason_score'].value_counts())

            sampleID  gleason_score
0    TCGA-2A-A8VL-01              6
1    TCGA-2A-A8VO-01              6
2    TCGA-2A-A8VT-01              9
3    TCGA-2A-A8VV-01              6
4    TCGA-2A-A8VX-01              8
..               ...            ...
561  TCGA-ZG-A9M4-01              9
562  TCGA-ZG-A9MC-01              9
563  TCGA-ZG-A9N3-01              9
564  TCGA-ZG-A9ND-01              9
565  TCGA-ZG-A9NI-01              9

[566 rows x 2 columns]
gleason_score
7     301
9     144
8      67
6      50
10      4
Name: count, dtype: int64


In [22]:
# create 3 groups: gleason score of 6, gleason score of 7, and of gleason score of > 8

group_0 = [6]
group_1 = [7]
group_2 = [8, 9, 10]

conditions = [
phenotype_data_filtered["gleason_score"].isin(group_0),
phenotype_data_filtered["gleason_score"].isin(group_1),
phenotype_data_filtered["gleason_score"].isin(group_2),

]
choices = ['low_aggressive', 'middle_aggressive', 'high_aggressive']

phenotype_data_filtered.loc[:, "Gleason_group"] = np.select(
conditions, choices, default=""
)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  phenotype_data_filtered.loc[:, "Gleason_group"] = np.select(


In [23]:
print(phenotype_data_filtered)
sampled_df = phenotype_data_filtered.groupby("Gleason_group", group_keys=False).apply(
    lambda x: x.sample(n=min(len(x), 30), random_state=42)
)

            sampleID  gleason_score    Gleason_group
0    TCGA-2A-A8VL-01              6   low_aggressive
1    TCGA-2A-A8VO-01              6   low_aggressive
2    TCGA-2A-A8VT-01              9  high_aggressive
3    TCGA-2A-A8VV-01              6   low_aggressive
4    TCGA-2A-A8VX-01              8  high_aggressive
..               ...            ...              ...
561  TCGA-ZG-A9M4-01              9  high_aggressive
562  TCGA-ZG-A9MC-01              9  high_aggressive
563  TCGA-ZG-A9N3-01              9  high_aggressive
564  TCGA-ZG-A9ND-01              9  high_aggressive
565  TCGA-ZG-A9NI-01              9  high_aggressive

[566 rows x 3 columns]


  sampled_df = phenotype_data_filtered.groupby("Gleason_group", group_keys=False).apply(


In [24]:
print(sampled_df)
patients_id = list(sampled_df['sampleID'])
print(patients_id)

            sampleID  gleason_score      Gleason_group
551  TCGA-ZG-A9L5-01              9    high_aggressive
563  TCGA-ZG-A9N3-01              9    high_aggressive
464  TCGA-V1-A9ZK-01              8    high_aggressive
525  TCGA-YL-A8SF-01              8    high_aggressive
52   TCGA-CH-5792-01              9    high_aggressive
..               ...            ...                ...
382  TCGA-KK-A6E5-01              7  middle_aggressive
392  TCGA-KK-A7AZ-01              7  middle_aggressive
110  TCGA-EJ-7330-01              7  middle_aggressive
166  TCGA-EJ-A8FN-01              7  middle_aggressive
165  TCGA-EJ-A7NN-01              7  middle_aggressive

[90 rows x 3 columns]
['TCGA-ZG-A9L5-01', 'TCGA-ZG-A9N3-01', 'TCGA-V1-A9ZK-01', 'TCGA-YL-A8SF-01', 'TCGA-CH-5792-01', 'TCGA-KK-A8I9-01', 'TCGA-YL-A8HK-01', 'TCGA-HC-A48F-01', 'TCGA-ZG-A9LM-01', 'TCGA-V1-A9ZR-01', 'TCGA-XJ-A9DX-01', 'TCGA-EJ-8472-01', 'TCGA-YL-A9WI-01', 'TCGA-CH-5761-01', 'TCGA-HC-7821-01', 'TCGA-YL-A8SC-01', 'TCGA-YL-A8S

In [25]:
print(cnv_data.head())

  Gene Symbol  TCGA-2A-A8VL-01  TCGA-2A-A8VO-01  TCGA-2A-A8VT-01  \
0       ACAP3                0                0                0   
1      ACTRT2                0                0                0   
2        AGRN                0                0                0   
3     ANKRD65                0                0                0   
4      ATAD3A                0                0                0   

   TCGA-2A-A8VV-01  TCGA-2A-A8VX-01  TCGA-2A-A8W1-01  TCGA-2A-A8W3-01  \
0                0                0                0                0   
1                0                0                0                0   
2                0                0                0                0   
3                0                0                0                0   
4                0                0                0                0   

   TCGA-2A-AAYF-01  TCGA-2A-AAYO-01  TCGA-2A-AAYU-01  TCGA-4L-AA1F-01  \
0                0                0                0                0   
1     

In [26]:
cnv_data_col = list(cnv_data.columns)
common_col = list(set(cnv_data_col) & set(patients_id))
col_keep = ['Gene Symbol'] + common_col
cnv_data_filtered = cnv_data[col_keep]
print(cnv_data_filtered.head())

  Gene Symbol  TCGA-Y6-A8TL-01  TCGA-G9-6342-01  TCGA-HC-7821-01  \
0       ACAP3                0                0               -1   
1      ACTRT2                0                0               -1   
2        AGRN                0                0               -1   
3     ANKRD65                0                0               -1   
4      ATAD3A                0                0               -1   

   TCGA-J4-A83N-01  TCGA-V1-A9OT-01  TCGA-YL-A9WH-01  TCGA-G9-6329-01  \
0                0                0               -1                0   
1                0                0               -1                0   
2                0                0               -1                0   
3                0                0               -1                0   
4                0                0               -1                0   

   TCGA-J4-A67Q-01  TCGA-HC-7736-01  TCGA-HC-8260-01  TCGA-ZG-A9LM-01  \
0                0                0                0               -1   
1     

In [27]:
df_melted_cnv = cnv_data_filtered.melt(
    id_vars=["Gene Symbol"],       # columns to keep fixed
    var_name="samples_id",         # name for the variable column (sample IDs)
    value_name="expression_value"  # name for the values
)


df_melted_cnv['Gene Symbol'] = df_melted_cnv['Gene Symbol'].str.split('|').str[0] 


df_melted_cnv = df_melted_cnv.rename(
    columns={
        "samples_id": "model_id",
        "Gene Symbol": "gene_symbol",
        "expression_value": "rsem_tpm",
    }
)

print(df_melted_cnv.head())


  gene_symbol         model_id  rsem_tpm
0       ACAP3  TCGA-Y6-A8TL-01         0
1      ACTRT2  TCGA-Y6-A8TL-01         0
2        AGRN  TCGA-Y6-A8TL-01         0
3     ANKRD65  TCGA-Y6-A8TL-01         0
4      ATAD3A  TCGA-Y6-A8TL-01         0


In [28]:
group_loss = [-1, -2]
group_normal = [0]
group_gain = [1, 2]

conditions = [
    df_melted_cnv["rsem_tpm"].isin(group_loss),
    df_melted_cnv["rsem_tpm"].isin(group_normal),
    df_melted_cnv["rsem_tpm"].isin(group_gain),
]
choices = ["Loss", "Normal", "Gain"]
df_melted_cnv.loc[:, "effect"] = np.select(conditions, choices, default="")

df_melted_cnv = df_melted_cnv[df_melted_cnv['gene_symbol'].isin(montagud_nodes)]

In [29]:
genes_data_col = list(genes_data.columns)
common_col = list(set(genes_data_col) & set(patients_id))
col_keep = ['sample'] + common_col
genes_data_filtered = genes_data[col_keep]


df_melted_gene = genes_data_filtered.melt(
    id_vars=["sample"],       # columns to keep fixed
    var_name="samples_id",         # name for the variable column (sample IDs)
    value_name="expression_value"  # name for the values
)


df_melted_gene['sample'] = df_melted_gene['sample'].str.split('|').str[0] 


df_melted_gene = df_melted_gene.rename(
    columns={
        "samples_id": "model_id",
        "sample": "gene_symbol",
        "expression_value": "rsem_tpm",
    }
)
df_melted_gene['gene_symbol'] = df_melted_gene['gene_symbol'].str.upper()
df_melted_gene = df_melted_gene[df_melted_gene['gene_symbol'].isin(montagud_nodes)]

In [30]:
# pre-process tcga data based on montagud nodes and patients ids


# df_melted_cnv= pd.read_csv('data/TCGA_data/filtered_data/cnv_samples_table.csv')
# df_melted_gene= pd.read_csv('data/TCGA_data/filtered_data/genes_samples_table.csv')

In [31]:
 # Create generic models 

folder_generic_models_cfg = 'validation/prostate/generic_models/Montagud2022_Prostate_Cancer.cfg'
folder_generic_models_bnd = 'validation/prostate/generic_models/Montagud2022_Prostate_Cancer.bnd'

folder_pers_models = 'validation/prostate/personalized_models'

tissue = 'Prostate'

create_generic_patients_cfg_bnd_validation(folder_generic_models_cfg, folder_generic_models_bnd, folder_pers_models, patients_id, tissue)


All .cfg and .bnd files created for the validation.


In [15]:
# update phenotypes in generic models 

phenotype_interest = ["Proliferation","Invasion","DNA_Repair","Migration","Apoptosis"]
original_data_dir = "validation/prostate/personalized_models"
results_dir = "validation/prostate/personalized_models"


# generic_models_update_phenotypes(phenotype_interest, original_data_dir, results_dir)


In [16]:
# personalize the boolean networks with genes 
table_rna_seq_patients = create_table_rna_seq_patients(df_melted_gene)
print(table_rna_seq_patients.head())

gene_expression_level                               High Gene Expression  \
model_id                                                                   
TCGA-2A-A8VL-01                                          EEF2K, FOS, JUN   
TCGA-2A-AAYO-01                                                BAX, TERT   
TCGA-2A-AAYU-01                                           BAX, ERG, SPOP   
TCGA-CH-5738-01               APAF1, ATM, ATR, CFLAR, FOS, JUN, RB1, SHH   
TCGA-CH-5743-01        APAF1, ATM, BCL2, BMP2, BRCA2, CFLAR, EEF2K, E...   

gene_expression_level                       Low Gene Expression  
model_id                                                         
TCGA-2A-A8VL-01                        BRCA1, BRCA2, FADD, FRS2  
TCGA-2A-AAYO-01                                     EGFR, NCOR2  
TCGA-2A-AAYU-01                   BCL2, BMP2, DAXX, EEF2K, PTEN  
TCGA-CH-5738-01              AXIN1, BAD, CDH2, DAXX, E2F1, FADD  
TCGA-CH-5743-01        AXIN1, BAD, E2F1, EEF2, FADD, IDH1, TERT  


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rna_seq_data.rename(columns={"gene_symbol": "gene_name"}, inplace=True)


In [None]:
# personalized_patients_genes_cfgs(df_melted_gene,montagud_nodes,folder_pers_models,folder_pers_models,patients_id,table_rna_seq_patients,tissue)

In [None]:
# personalize with CNV
# tailor_bnd_cnv_validation(df_melted_cnv, folder_pers_models, tissue)

🔍 Processing patient TCGA-HC-A48F-01, gene: ZBTB17
Patient TCGA-HC-A48F-01 is in both gain and loss groups. Please review.
ZBTB17 node found. Replacing...
🔍 Processing patient TCGA-HC-A48F-01, gene: JUN
Patient TCGA-HC-A48F-01 is in both gain and loss groups. Please review.
JUN node found. Replacing...
🔍 Processing patient TCGA-HC-A48F-01, gene: PDK1
Patient TCGA-HC-A48F-01 is in both gain and loss groups. Please review.
PDK1 node found. Replacing...
🔍 Processing patient TCGA-HC-A48F-01, gene: CFLAR
Patient TCGA-HC-A48F-01 is in both gain and loss groups. Please review.
No CFLAR node found in file for patient TCGA-HC-A48F-01
🔍 Processing patient TCGA-HC-A48F-01, gene: IDH1
Patient TCGA-HC-A48F-01 is in both gain and loss groups. Please review.
IDH1 node found. Replacing...
🔍 Processing patient TCGA-HC-A48F-01, gene: VHL
Patient TCGA-HC-A48F-01 is in both gain and loss groups. Please review.
VHL node found. Replacing...
🔍 Processing patient TCGA-HC-A48F-01, gene: ATR
Patient TCGA-HC-A48

In [19]:
# # add mutations info 

# # Add somatic mutations data 
# # keep only the one we had for the main pipeline

# mutations_data = pd.read_csv('data/TCGA_data/TCGA_mutations_mutect2_GDC-PANCAN.csv')
# mutations_data_filtered = mutations_data[mutations_data['Sample_ID'].isin(patients_id)]
# mutations_data_filtered = mutations_data_filtered[mutations_data_filtered['gene'].isin(montagud_nodes)]

# # check if genes are TSG/ Oncogenes
# onco_tsg_gene = pd.read_csv('data/unknown_origin/oncogenes_tsg.csv')
# onco_tsg_gene = onco_tsg_gene[['Hugo Symbol', 'Is Oncogene', 'Is Tumor Suppressor Gene']]
# onco_tsg_gene_filtered = onco_tsg_gene[onco_tsg_gene['Hugo Symbol'].isin(montagud_nodes)]
# onco_tsg_gene_filtered = onco_tsg_gene_filtered.rename(columns={'Hugo Symbol': 'gene'})
# # oncogenes = onco_tsg_gene_filtered[onco_tsg_gene_filtered['Is Oncogene'] == 'Yes']
# # tsg_genes = onco_tsg_gene_filtered[onco_tsg_gene_filtered['Is Tumor Suppressor Gene'] == 'Yes']


# mutations_annotated = mutations_data_filtered.merge(
#     onco_tsg_gene_filtered[['gene', 'Is Oncogene', 'Is Tumor Suppressor Gene']],
#     on='gene',
#     how='left'
# )
# mutations_annotated = mutations_annotated.rename(columns={'Is Oncogene': 'oncogene', 'Is Tumor Suppressor Gene': 'tsg'})

# mutations_annotated = mutations_annotated[
#     mutations_annotated['oncogene'].notna() | mutations_annotated['tsg'].notna()
# ]

# # loss function mutation assumption -> TSG and 
# lof_effects = ["frameshift_variant", "stop_gained", "start_lost", "splice_region_variant"]
# lof_mutations = mutations_annotated[mutations_annotated['effect'].isin(lof_effects)]
# lof_mutations_tsg = lof_mutations[(lof_mutations['tsg'] == 'Yes') & (lof_mutations['oncogene'] == 'No')]
# lof_mutations_tsg_filtered = lof_mutations_tsg[['Sample_ID', 'gene']]


# mutations_onco = mutations_annotated[(mutations_annotated['tsg'] == 'No') & (mutations_annotated['oncogene'] == 'Yes')]
# print(mutations_onco.head())
# # dna_vaf > 0.5 -> clonal mutation (mutation probably in the early tumor cells)
# # gof_effects = ['p.G12D','p.S249C', 'p.Y373C']
# # gof_mutations = mutations_onco[mutations_onco['Amino_Acid_Change'].isin(gof_effects)]
# # gof_mutations_filtered = gof_mutations[['Sample_ID', 'gene']]




Let's try to add more info to reflect metastasis change, add mutations data

In [20]:
# tailor_bnd_mutat_validation(lof_mutations_tsg_filtered,gof_mutations_filtered,folder_pers_models, tissue)

In [17]:
# Compute phenotype distribution for each patient
folder_models = 'validation/prostate/personalized_models'
folder_save_results = 'validation/prostate/results/phenotype_distribution/phenotype_table'
phenotypes_interest = [
        "Proliferation",
        "Invasion",
        "DNA_Repair",
        "Migration",
        "Apoptosis",
    ]

inputs_list = ['EGF', 'FGF', 'Androgen', 'TGFb', 'Hypoxia', 'Nutrients', 'Carcinogen', 'Acidosis', 'TNFalpha', 'fused_event', 'SPOP']
for patient  in patients_id:
    compute_phenotype_table(folder_save_results, folder_models, patient, inputs_list, phenotypes_interest, tissue= 'Prostate')
# TO DO- change compute_phenotype_table -> with nice path to access and save the data!!

KeyboardInterrupt: 

In [None]:
phenotype_data_filtered = phenotype_data_filtered[phenotype_data_filtered['sample'].isin(patients_id)]
# print(phenotype_data_filtered)


stage_1_group_ids= list(phenotype_data_filtered[phenotype_data_filtered['Tumor Group'] == 'Early Stage']['sample'])
stage_2_group_ids= list(phenotype_data_filtered[phenotype_data_filtered['Tumor Group'] == 'Larger Tumor']['sample'])
stage_3_group_ids= list(phenotype_data_filtered[phenotype_data_filtered['Tumor Group'] == 'Advanced Local Speed']['sample'])
stage_4_group_ids= list(phenotype_data_filtered[phenotype_data_filtered['Tumor Group'] == 'Metastatic']['sample'])



In [None]:
stages_groups = ["stage_1_group", "stage_2_group", "stage_3_group", "stage_4_group"]

folder_groups_means = "validation/results/phenotype_group_means"
compute_phenotype_mean_group_validation(stages_groups, folder_groups_means)

In [None]:
# combine values of the two groups -> so the stats test is easier later

base_path = 'validation/results/phenotype_group_means'
data_combined = combine_groups_values(base_path)
print(data_combined)

In [None]:

# stats test
significant_results = compute_kruskal_test_means_validation(data_combined)
print(significant_results)