In [13]:
import maboss
import ginsim
import pandas as pd 
import numpy as np
import mygene
import os
import shutil
import ast
from scipy.stats import kruskal

from create_generic_models.create_generic_patients_cfgs import create_generic_patients_cfg_bnd_validation
from create_generic_models.update_phenotypes_generic_models import generic_models_update_phenotypes


# from create_person_models.tailor_cfgs_patients_gene import personalized_patients_genes_cfgs_validation
from pre_process_data.pre_process_genes import create_table_rna_seq_patients

from identification_patients.validation_get_patients_ids import get_patients_valid

from pre_process_data.tcga_preprocess_data import pre_process_tcga_data

from create_person_models.tailor_cfgs_patients_gene import personalized_patients_genes_cfgs
from create_person_models.tailor_bnd_cnv import tailor_bnd_cnv_validation

from create_person_models.tailor_bnd_tsg_onco_mutations import tailor_bnd_mutat_validation

from MaBoSS_simulation.maboss_phenotype_patient import compute_phenotype_table, compute_phenotype_mean_group_validation, combine_groups_values
from stats.stats_proba import compute_kruskal_test_means_validation

In [None]:
# Import data
phenotype_data = pd.read_csv('data/TCGA_data/prostate/TCGA_PRAD_phenotypes.csv')
genes_data = pd.read_csv('data/TCGA_data/prostate/TCGA_PRAD_genes_illumina.csv', sep='\t')
cnv_data = pd.read_csv('data/TCGA_data/prostate/TCGA_PRAD_cnv_gistic2.csv',sep='\t')

# keep all montagud nodes
montagud_data = (
    pd.read_csv('data/Montagud_inter_nodes_data.csv', header=1)
    .loc[:, ['Target node', 'Interaction type', 'Source']])

# Create list of genes of interest (in Montagud data)
montagud_nodes = list(set(montagud_data['Target node'].tolist() + montagud_data['Source'].tolist()))
montagud_nodes = [node for node in montagud_nodes if node != '0/1']
montagud_nodes = [node.upper() for node in montagud_nodes if isinstance(node, str)]
# montagud_nodes.append('KRAS')
# to_remove = ['RAS', 'FUSED_EVENT', 'NKX3_1', 'SPOP', 'AR_ERG']

# montagud_nodes = [node for node in montagud_nodes if node not in to_remove]
# montagud_nodes = list(set(montagud_nodes))

In [3]:
# changes montagud nodes according to modif done in the cfg/ bnd files

montagud_nodes = ['CASP8' if x == 'CASPASE8' else x for x in montagud_nodes]
montagud_nodes = ['CASP3' if x == 'CASPASE3' else x for x in montagud_nodes]
montagud_nodes = ['CASP9' if x == 'CASPASE9' else x for x in montagud_nodes]


montagud_nodes = ['CCNB1' if x == 'CYCLINB' else x for x in montagud_nodes]
montagud_nodes = ['CCND1' if x == 'CYCLIND' else x for x in montagud_nodes]


montagud_nodes = ['DVL1' if x == 'DSH' else x for x in montagud_nodes]

montagud_nodes = ['CTNNB1' if x == 'BETA_CATENIN' else x for x in montagud_nodes]

montagud_nodes = ['CDH1' if x == 'E_CADHERIN' else x for x in montagud_nodes]
montagud_nodes = ['CYCS' if x == 'CYCC' else x for x in montagud_nodes]

montagud_nodes = ['MEK1' if x == 'MEK1_2' else x for x in montagud_nodes]
montagud_nodes = ['NFKB' if x == 'NFK_B' else x for x in montagud_nodes]
montagud_nodes = ['SNAI1' if x == 'SNAIL' else x for x in montagud_nodes]
montagud_nodes = ['TNF' if x == 'TNFALPHA' else x for x in montagud_nodes]
montagud_nodes = ['TSC1' if x == 'TSC1_2' else x for x in montagud_nodes]
montagud_nodes = ['BCL2L1' if x == 'BCL_XL' else x for x in montagud_nodes]
montagud_nodes = ['MAP3K1' if x == 'MAP3K1_3' else x for x in montagud_nodes]
montagud_nodes = ['CHK1' if x == 'CHK1_2' else x for x in montagud_nodes]




montagud_nodes.append('MEK2')
montagud_nodes.append('TSC2')
montagud_nodes.append('MAP3K3')
montagud_nodes.append('CHK2')



In [4]:
print(montagud_nodes)

['GLI', 'BAX', 'CARCINOGEN', 'ETS1', 'ZBTB17', 'P70S6KAB', 'CASP9', 'CDH2', 'PKC', 'AMPK', 'AR', 'CCNB1', 'CASP3', 'HYPOXIA', 'MAP3K1', 'EGF', 'P38', 'FRS2', 'FGFR3', 'BAD', 'MYC_MAX', 'AMP_ATP', 'MED12', 'CTNNB1', 'LDHA', 'TNF', 'BRCA2', 'APAF1', 'NF_KB', 'VHL', 'MDM2', 'ERK', 'NUTRIENTS', 'DNA_DAMAGE', 'P15', 'FGF', 'FOXA1', 'FOXO', 'ANDROGEN', 'P90RSK', 'EMT', 'DVL1', 'SPOP', 'GLUT1', 'DAXX', 'SLUG', 'JUN', 'JNK', 'EP300', 'MEK1', 'BAK', 'DNA_REPAIR', 'AR_ERG', 'TAK1', 'WNT', 'SMAD', 'ATM', 'CFLAR', 'FADD', 'MTORC1', 'CHK1', 'NF1', 'TGFB', 'BRCA1', 'ACIDOSIS', 'BIRC5', 'EEF2K', 'NCOR2', 'EEF2', 'LACTIC_ACID', 'PTEN', 'TCF', 'P53', 'CYTOC', 'ETV1', 'MTORC2', 'P14ARF', 'GADD45', 'APOPTOSIS', 'RAF', 'IDH1', 'SMO', 'EZH2', 'PIP3', 'BMP2', 'RHEB', 'FOS', 'GSH', 'PI3K', 'E2F1', 'NCOR1', 'MXI1', 'VEGF', 'METASTASIS', 'MYC', 'CCND1', 'INVASION', 'IKK', 'SNAI1', 'PHDS', 'HIF1', 'PDK1', 'BCL2', 'P21', 'PTCH1', 'RUNX2', 'RAS', 'PROLIFERATION', 'YWHAZ', 'RTK', 'TSC1', 'COX4I2', 'FUSED_EVENT', '

In [5]:

# diagnoses.tumor_grade, diagnoses.morphology
# stratify by cancer type: diagnoses.primary_diagnosis

phenotype_data_filtered = phenotype_data[['sampleID','gleason_score']]


In [6]:
print(phenotype_data_filtered)
print(phenotype_data_filtered['gleason_score'].value_counts())

            sampleID  gleason_score
0    TCGA-2A-A8VL-01              6
1    TCGA-2A-A8VO-01              6
2    TCGA-2A-A8VT-01              9
3    TCGA-2A-A8VV-01              6
4    TCGA-2A-A8VX-01              8
..               ...            ...
561  TCGA-ZG-A9M4-01              9
562  TCGA-ZG-A9MC-01              9
563  TCGA-ZG-A9N3-01              9
564  TCGA-ZG-A9ND-01              9
565  TCGA-ZG-A9NI-01              9

[566 rows x 2 columns]
gleason_score
7     301
9     144
8      67
6      50
10      4
Name: count, dtype: int64


In [7]:
# create 3 groups: gleason score of 6, gleason score of 7, and of gleason score of > 8

group_0 = [6]
group_1 = [7]
group_2 = [8, 9, 10]

conditions = [
phenotype_data_filtered["gleason_score"].isin(group_0),
phenotype_data_filtered["gleason_score"].isin(group_1),
phenotype_data_filtered["gleason_score"].isin(group_2),

]
choices = ['low_aggressive', 'middle_aggressive', 'high_aggressive']

phenotype_data_filtered.loc[:, "Gleason_group"] = np.select(
conditions, choices, default=""
)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  phenotype_data_filtered.loc[:, "Gleason_group"] = np.select(


In [8]:
print(phenotype_data_filtered)
sampled_df = phenotype_data_filtered.groupby("Gleason_group", group_keys=False).apply(
    lambda x: x.sample(n=min(len(x), 30), random_state=42)
)

            sampleID  gleason_score    Gleason_group
0    TCGA-2A-A8VL-01              6   low_aggressive
1    TCGA-2A-A8VO-01              6   low_aggressive
2    TCGA-2A-A8VT-01              9  high_aggressive
3    TCGA-2A-A8VV-01              6   low_aggressive
4    TCGA-2A-A8VX-01              8  high_aggressive
..               ...            ...              ...
561  TCGA-ZG-A9M4-01              9  high_aggressive
562  TCGA-ZG-A9MC-01              9  high_aggressive
563  TCGA-ZG-A9N3-01              9  high_aggressive
564  TCGA-ZG-A9ND-01              9  high_aggressive
565  TCGA-ZG-A9NI-01              9  high_aggressive

[566 rows x 3 columns]


  sampled_df = phenotype_data_filtered.groupby("Gleason_group", group_keys=False).apply(


In [9]:
print(sampled_df)
patients_id = list(sampled_df['sampleID'])
print(patients_id)

            sampleID  gleason_score      Gleason_group
551  TCGA-ZG-A9L5-01              9    high_aggressive
563  TCGA-ZG-A9N3-01              9    high_aggressive
464  TCGA-V1-A9ZK-01              8    high_aggressive
525  TCGA-YL-A8SF-01              8    high_aggressive
52   TCGA-CH-5792-01              9    high_aggressive
..               ...            ...                ...
382  TCGA-KK-A6E5-01              7  middle_aggressive
392  TCGA-KK-A7AZ-01              7  middle_aggressive
110  TCGA-EJ-7330-01              7  middle_aggressive
166  TCGA-EJ-A8FN-01              7  middle_aggressive
165  TCGA-EJ-A7NN-01              7  middle_aggressive

[90 rows x 3 columns]
['TCGA-ZG-A9L5-01', 'TCGA-ZG-A9N3-01', 'TCGA-V1-A9ZK-01', 'TCGA-YL-A8SF-01', 'TCGA-CH-5792-01', 'TCGA-KK-A8I9-01', 'TCGA-YL-A8HK-01', 'TCGA-HC-A48F-01', 'TCGA-ZG-A9LM-01', 'TCGA-V1-A9ZR-01', 'TCGA-XJ-A9DX-01', 'TCGA-EJ-8472-01', 'TCGA-YL-A9WI-01', 'TCGA-CH-5761-01', 'TCGA-HC-7821-01', 'TCGA-YL-A8SC-01', 'TCGA-YL-A8S

In [10]:
print(cnv_data.head())

  Gene Symbol  TCGA-2A-A8VL-01  TCGA-2A-A8VO-01  TCGA-2A-A8VT-01  \
0       ACAP3                0                0                0   
1      ACTRT2                0                0                0   
2        AGRN                0                0                0   
3     ANKRD65                0                0                0   
4      ATAD3A                0                0                0   

   TCGA-2A-A8VV-01  TCGA-2A-A8VX-01  TCGA-2A-A8W1-01  TCGA-2A-A8W3-01  \
0                0                0                0                0   
1                0                0                0                0   
2                0                0                0                0   
3                0                0                0                0   
4                0                0                0                0   

   TCGA-2A-AAYF-01  TCGA-2A-AAYO-01  TCGA-2A-AAYU-01  TCGA-4L-AA1F-01  \
0                0                0                0                0   
1     

In [11]:
cnv_data_col = list(cnv_data.columns)
common_col = list(set(cnv_data_col) & set(patients_id))
col_keep = ['Gene Symbol'] + common_col
cnv_data_filtered = cnv_data[col_keep]
print(cnv_data_filtered.head())

  Gene Symbol  TCGA-HC-A48F-01  TCGA-HI-7170-01  TCGA-G9-6347-01  \
0       ACAP3                0                0                0   
1      ACTRT2                0                0                0   
2        AGRN                0                0                0   
3     ANKRD65                0                0                0   
4      ATAD3A                0                0                0   

   TCGA-2A-AAYU-01  TCGA-EJ-5517-01  TCGA-V1-A8WN-01  TCGA-VN-A88Q-01  \
0                0                0                0                0   
1                0                0                0                0   
2                0                0                0                0   
3                0                0                0                0   
4                0                0                0                0   

   TCGA-2A-AAYO-01  TCGA-EJ-7330-01  TCGA-G9-6329-01  TCGA-CH-5743-01  \
0                0                0                0                0   
1     

In [12]:
df_melted_cnv = cnv_data_filtered.melt(
    id_vars=["Gene Symbol"],       # columns to keep fixed
    var_name="samples_id",         # name for the variable column (sample IDs)
    value_name="expression_value"  # name for the values
)


df_melted_cnv['Gene Symbol'] = df_melted_cnv['Gene Symbol'].str.split('|').str[0] 


df_melted_cnv = df_melted_cnv.rename(
    columns={
        "samples_id": "model_id",
        "Gene Symbol": "gene_symbol",
        "expression_value": "rsem_tpm",
    }
)

print(df_melted_cnv.head())


  gene_symbol         model_id  rsem_tpm
0       ACAP3  TCGA-HC-A48F-01         0
1      ACTRT2  TCGA-HC-A48F-01         0
2        AGRN  TCGA-HC-A48F-01         0
3     ANKRD65  TCGA-HC-A48F-01         0
4      ATAD3A  TCGA-HC-A48F-01         0


In [13]:
group_loss = [-1, -2]
group_normal = [0]
group_gain = [1, 2]

conditions = [
    df_melted_cnv["rsem_tpm"].isin(group_loss),
    df_melted_cnv["rsem_tpm"].isin(group_normal),
    df_melted_cnv["rsem_tpm"].isin(group_gain),
]
choices = ["Loss", "Normal", "Gain"]
df_melted_cnv.loc[:, "effect"] = np.select(conditions, choices, default="")

df_melted_cnv = df_melted_cnv[df_melted_cnv['gene_symbol'].isin(montagud_nodes)]
df_melted_cnv.to_csv('data/TCGA_data/prostate/filtered_data/cnv_samples_table.csv')

In [14]:
# list(df_melted_cnv['gene_symbol'].unique())
# df_melted_cnv[df_melted_cnv['gene_symbol'] == 'BCL2L1']

In [15]:
genes_data_col = list(genes_data.columns)
common_col = list(set(genes_data_col) & set(patients_id))
col_keep = ['sample'] + common_col
genes_data_filtered = genes_data[col_keep]


df_melted_gene = genes_data_filtered.melt(
    id_vars=["sample"],       # columns to keep fixed
    var_name="samples_id",         # name for the variable column (sample IDs)
    value_name="expression_value"  # name for the values
)

df_melted_gene['sample'] = df_melted_gene['sample'].str.split('|').str[0] 

df_melted_gene = df_melted_gene.rename(
    columns={
        "samples_id": "model_id",
        "sample": "gene_symbol",
        "expression_value": "rsem_tpm",
    }
)
df_melted_gene['gene_symbol'] = df_melted_gene['gene_symbol'].str.upper()
df_melted_gene = df_melted_gene[df_melted_gene['gene_symbol'].isin(montagud_nodes)]
df_melted_gene['gene_symbol'] = df_melted_gene['gene_symbol'].str.replace('_', '', regex=False)

df_melted_gene.to_csv('data/TCGA_data/prostate/filtered_data/genes_samples_table.csv')

In [16]:
# remove '_' 
print(df_melted_gene.head())

     gene_symbol         model_id  rsem_tpm
61          RHEB  TCGA-HC-A48F-01   10.2003
204       COX4I2  TCGA-HC-A48F-01    3.6574
408        RUNX2  TCGA-HC-A48F-01    6.5601
960         PDK1  TCGA-HC-A48F-01    9.2461
1041        BMP2  TCGA-HC-A48F-01    5.2294


In [17]:
# list(df_melted_gene['gene_symbol'].unique())
# df_melted_gene[df_melted_gene['gene_symbol'] == 'CCNB1']

In [18]:
# pre-process tcga data based on montagud nodes and patients ids

df_melted_cnv= pd.read_csv('data/TCGA_data/prostate/filtered_data/cnv_samples_table.csv')
df_melted_gene= pd.read_csv('data/TCGA_data/prostate/filtered_data/genes_samples_table.csv')

In [19]:
 # Create generic models 

folder_generic_models_cfg = 'validation/prostate/generic_models/Montagud2022_Prostate_Cancer.cfg'
folder_generic_models_bnd = 'validation/prostate/generic_models/Montagud2022_Prostate_Cancer.bnd'

folder_pers_models = 'validation/prostate/personalized_models'

tissue = 'Prostate'

# create_generic_patients_cfg_bnd_validation(folder_generic_models_cfg, folder_generic_models_bnd, folder_pers_models, patients_id, tissue)


In [20]:
# update phenotypes in generic models 

phenotype_interest = ["Proliferation","Invasion","DNA_Repair","Migration","Apoptosis"]
original_data_dir = "validation/prostate/personalized_models"
results_dir = "validation/prostate/personalized_models"


# generic_models_update_phenotypes(phenotype_interest, original_data_dir, results_dir)


In [21]:
# personalize the boolean networks with genes 
table_rna_seq_patients = create_table_rna_seq_patients(df_melted_gene)
print(table_rna_seq_patients.head())

gene_expression_level                               High Gene Expression  \
model_id                                                                   
TCGA-2A-A8VL-01                                          EEF2K, FOS, JUN   
TCGA-2A-AAYO-01                                         BAX, CASP9, TERT   
TCGA-2A-AAYU-01                                     BAX, CDH1, ERG, SPOP   
TCGA-CH-5738-01        APAF1, ATM, ATR, CFLAR, CTNNB1, FOS, JUN, RB1,...   
TCGA-CH-5743-01        APAF1, ATM, BCL2, BMP2, BRCA2, CASP3, CCND1, C...   

gene_expression_level                                Low Gene Expression  
model_id                                                                  
TCGA-2A-A8VL-01                                 BRCA1, BRCA2, FADD, FRS2  
TCGA-2A-AAYO-01                                              EGFR, NCOR2  
TCGA-2A-AAYU-01        BCL2, BMP2, CCND1, DAXX, EEF2K, MAP3K1, PTEN, TNF  
TCGA-CH-5738-01                AXIN1, BAD, CCND1, CDH2, DAXX, E2F1, FADD  
TCGA-CH-5743-01  

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rna_seq_data.rename(columns={"gene_symbol": "gene_name"}, inplace=True)


In [22]:
# personalized_patients_genes_cfgs(df_melted_gene,montagud_nodes,folder_pers_models,folder_pers_models,patients_id,table_rna_seq_patients,tissue)

In [23]:
# personalize with CNV
# tailor_bnd_cnv_validation(df_melted_cnv, folder_pers_models, tissue)

In [24]:
# # add mutations info 

# # Add somatic mutations data 
# # keep only the one we had for the main pipeline

# mutations_data = pd.read_csv('data/TCGA_data/TCGA_mutations_mutect2_GDC-PANCAN.csv')
# mutations_data_filtered = mutations_data[mutations_data['Sample_ID'].isin(patients_id)]
# mutations_data_filtered = mutations_data_filtered[mutations_data_filtered['gene'].isin(montagud_nodes)]

# # check if genes are TSG/ Oncogenes
# onco_tsg_gene = pd.read_csv('data/unknown_origin/oncogenes_tsg.csv')
# onco_tsg_gene = onco_tsg_gene[['Hugo Symbol', 'Is Oncogene', 'Is Tumor Suppressor Gene']]
# onco_tsg_gene_filtered = onco_tsg_gene[onco_tsg_gene['Hugo Symbol'].isin(montagud_nodes)]
# onco_tsg_gene_filtered = onco_tsg_gene_filtered.rename(columns={'Hugo Symbol': 'gene'})
# # oncogenes = onco_tsg_gene_filtered[onco_tsg_gene_filtered['Is Oncogene'] == 'Yes']
# # tsg_genes = onco_tsg_gene_filtered[onco_tsg_gene_filtered['Is Tumor Suppressor Gene'] == 'Yes']


# mutations_annotated = mutations_data_filtered.merge(
#     onco_tsg_gene_filtered[['gene', 'Is Oncogene', 'Is Tumor Suppressor Gene']],
#     on='gene',
#     how='left'
# )
# mutations_annotated = mutations_annotated.rename(columns={'Is Oncogene': 'oncogene', 'Is Tumor Suppressor Gene': 'tsg'})

# mutations_annotated = mutations_annotated[
#     mutations_annotated['oncogene'].notna() | mutations_annotated['tsg'].notna()
# ]

# # loss function mutation assumption -> TSG and 
# lof_effects = ["frameshift_variant", "stop_gained", "start_lost", "splice_region_variant"]
# lof_mutations = mutations_annotated[mutations_annotated['effect'].isin(lof_effects)]
# lof_mutations_tsg = lof_mutations[(lof_mutations['tsg'] == 'Yes') & (lof_mutations['oncogene'] == 'No')]
# lof_mutations_tsg_filtered = lof_mutations_tsg[['Sample_ID', 'gene']]


# mutations_onco = mutations_annotated[(mutations_annotated['tsg'] == 'No') & (mutations_annotated['oncogene'] == 'Yes')]
# print(mutations_onco.head())
# # dna_vaf > 0.5 -> clonal mutation (mutation probably in the early tumor cells)
# # gof_effects = ['p.G12D','p.S249C', 'p.Y373C']
# # gof_mutations = mutations_onco[mutations_onco['Amino_Acid_Change'].isin(gof_effects)]
# # gof_mutations_filtered = gof_mutations[['Sample_ID', 'gene']]




Let's try to add more info to reflect metastasis change, add mutations data

In [25]:
# tailor_bnd_mutat_validation(lof_mutations_tsg_filtered,gof_mutations_filtered,folder_pers_models, tissue)

In [26]:
# Compute phenotype distribution for each patient
folder_models = 'validation/prostate/personalized_models'
folder_save_results = 'validation/prostate/results/phenotype_distribution/phenotype_table'
phenotypes_interest = [
        "Proliferation",
        "Invasion",
        "DNA_Repair",
        "Migration",
        "Apoptosis",
    ]

inputs_list = ['EGF', 'FGF', 'TGFB', 'Androgen', 'Hypoxia', 'Nutrients', 'Carcinogen', 'Acidosis', 'TNF', 'fused_event', 'SPOP']
# for patient  in patients_id:
#     compute_phenotype_table(folder_save_results, folder_models, patient, inputs_list, phenotypes_interest, tissue= 'Prostate')


In [27]:
phenotype_data_filtered.head()
# print(phenotype_data_filtered['Gleason_group'].unique())

Unnamed: 0,sampleID,gleason_score,Gleason_group
0,TCGA-2A-A8VL-01,6,low_aggressive
1,TCGA-2A-A8VO-01,6,low_aggressive
2,TCGA-2A-A8VT-01,9,high_aggressive
3,TCGA-2A-A8VV-01,6,low_aggressive
4,TCGA-2A-A8VX-01,8,high_aggressive


In [28]:
low_group_ids= list(phenotype_data_filtered[phenotype_data_filtered['Gleason_group'] == 'low_aggressive']['sampleID'])
medium_group_ids= list(phenotype_data_filtered[phenotype_data_filtered['Gleason_group'] == 'middle_aggressive']['sampleID'])
high_group_ids= list(phenotype_data_filtered[phenotype_data_filtered['Gleason_group'] == 'high_aggressive']['sampleID'])



In [29]:
# #move each files to directory corresponding
# # to do

# import os
# import shutil

# # Map group names to sample ID lists
# group_mapping = {
#     "low_group": low_group_ids,
#     "medium_group": medium_group_ids,
#     "high_group": high_group_ids,
# }

# # Folder where all the files currently are
# source_dir = "validation/prostate/results/phenotype_distribution"

# # Destination base directory (can be same as source if you want)
# dest_base_dir = "validation/prostate/results/phenotype_group_means"

# # Loop over all files in the source directory
# for filename in os.listdir(source_dir):
#     if not filename.startswith("phenotype_table_"):
#         continue
#     sample_id = filename.replace("phenotype_table_", "").replace(".csv", "")
#     # Determine the group of this sample
#     group_found = False
#     for group_name, id_list in group_mapping.items():
#         if sample_id in id_list:
#             group_folder = os.path.join(dest_base_dir, group_name)
#             os.makedirs(group_folder, exist_ok=True)

#             src_path = os.path.join(source_dir, filename)
#             dst_path = os.path.join(group_folder, filename)
#             shutil.move(src_path, dst_path)

#             group_found = True
#             break

#     if not group_found:
#         print(f"⚠️ Sample ID {sample_id} not found in any group list.")


In [30]:
# combine all the values
groups = ["low_group", "medium_group", "high_group"]

folder_groups_means = "validation/prostate/results/phenotype_group_means"
# to do-> change name (only compute mean not validation)
mean_df =compute_phenotype_mean_group_validation(groups, folder_groups_means)

              Proliferation  Invasion  DNA_Repair  Migration  Apoptosis
Acidosis           0.366059  0.331998    0.235884   0.024329   0.363642
Androgen           0.378743  0.352069    0.239454   0.051197   0.364997
Carcinogen         0.377359  0.345996    0.400864   0.013867   0.411317
EGF                0.374162  0.347758    0.233172   0.051207   0.367908
FGF                0.389455  0.347433    0.236985   0.023721   0.381571
Hypoxia            0.324061  0.339164    0.230691   0.028574   0.365637
Nutrients          0.399506  0.344355    0.237794   0.030017   0.364394
Overall_Mean       0.365869  0.378429    0.248794   0.037309   0.374090
SPOP               0.391617  0.371726    0.230629   0.051310   0.341411
TGFB               0.334074  0.503393    0.227178   0.041223   0.418340
TNF                0.317056  0.535548    0.226876   0.069388   0.368572
fused_event        0.372464  0.343281    0.237205   0.025562   0.367195
Overall_Mean       0.365869  0.378429    0.248794   0.037309   0

In [None]:
# combine values of a directory together

from collections import defaultdict
def collect_group_data(group_folder_path):
    combined_data = defaultdict(lambda: defaultdict(list))

    for file in os.listdir(group_folder_path):
        if file.startswith("phenotype") and file.endswith(".csv"):
            file_path = os.path.join(group_folder_path, file)
            df = pd.read_csv(file_path, index_col=0)

            for input_name in df.index:
                for phenotype in df.columns:
                    value = df.at[input_name, phenotype]
                    combined_data[input_name][phenotype].append(float(value))

    result_df = pd.DataFrame.from_dict(combined_data, orient='index')
    result_df.to_csv(os.path.join(group_folder_path, "combined_results.csv"))

    return result_df


base_path = "validation/prostate/results/phenotype_group_means"

group_names = ["low_group", "medium_group", "high_group"]
group_dataframes = {}

for group in group_names:
    folder_path = os.path.join(base_path, group)
    group_df = collect_group_data(folder_path)
    group_dataframes[group] = group_df

In [None]:


# Paths to your combined data CSVs
base_path = "validation/prostate/results/phenotype_group_means"
group_files = {
    "low": os.path.join(base_path, "low_group", "combined_results.csv"),
    "medium": os.path.join(base_path, "medium_group", "combined_results.csv"),
    "high": os.path.join(base_path, "high_group", "combined_results.csv"),
}

# Load all groups into dict of DataFrames
group_dfs = {}
for group, path in group_files.items():
    # Because each cell is a list saved as a string, parse it back to list
    df = pd.read_csv(path, index_col=0)
    # Convert strings like '[1.2, 3.4]' back to Python lists using ast.literal_eval
    df = df.applymap(ast.literal_eval)
    group_dfs[group] = df

# Get all inputs and phenotypes from one dataframe (assuming all share the same shape)
inputs = group_dfs["low"].index
phenotypes = group_dfs["low"].columns

# Prepare result storage
kruskal_results = pd.DataFrame(index=inputs, columns=phenotypes)

# Run Kruskal-Wallis test for each (input, phenotype)
for input_name in inputs:
    for phenotype in phenotypes:
        data_low = group_dfs["low"].at[input_name, phenotype]
        data_medium = group_dfs["medium"].at[input_name, phenotype]
        data_high = group_dfs["high"].at[input_name, phenotype]

        # Run the Kruskal-Wallis test only if all groups have data
        if data_low and data_medium and data_high:
            stat, pvalue = kruskal(data_low, data_medium, data_high)
            kruskal_results.at[input_name, phenotype] = pvalue
        else:
            kruskal_results.at[input_name, phenotype] = None

# Optionally, save the p-values table to CSV
kruskal_results.to_csv(os.path.join(base_path, "kruskal_pvalues.csv"))
print(kruskal_results)


            Proliferation  Invasion DNA_Repair Migration Apoptosis
EGF              0.001063  0.000571   0.003473  0.000655  0.001269
FGF              0.000953  0.000488   0.000663  0.847542  0.000973
TGFB             0.000558  0.001172   0.001881    0.9022  0.001157
Androgen         0.001063  0.000585   0.003988   0.89582  0.001246
Hypoxia           0.00217   0.00061   0.000888  0.000905  0.001296
Nutrients        0.000853  0.000562   0.001149  0.000825  0.001097
Carcinogen       0.001197   0.00067    0.00151  0.176619  0.000587
Acidosis         0.001081  0.000558   0.001752  0.143918  0.001457
TNF              0.000992  0.001417   0.000463  0.869862  0.001274
fused_event      0.000985  0.000619   0.004247  0.000962  0.001311
SPOP             0.001116  0.000642   0.001532  0.914569  0.001974


  df = df.applymap(ast.literal_eval)
  df = df.applymap(ast.literal_eval)
  df = df.applymap(ast.literal_eval)


In [36]:
from statsmodels.stats.multitest import multipletests

# Flatten p-values to a 1D array, ignoring None or NaNs
pvals = kruskal_results.values.flatten()
pvals = [p for p in pvals if p is not None]

# Adjust using BH method
_, pvals_adj, _, _ = multipletests(pvals, alpha=0.05, method='fdr_bh')

# Now, you need to put adjusted p-values back into the DataFrame shape
# Create a copy to fill
adjusted_df = kruskal_results.copy()

# Fill adjusted p-values sequentially where there was a non-None p-value
idx = 0
for i in adjusted_df.index:
    for j in adjusted_df.columns:
        if adjusted_df.at[i, j] is not None:
            adjusted_df.at[i, j] = pvals_adj[idx]
            idx += 1
print(adjusted_df)


            Proliferation  Invasion DNA_Repair Migration Apoptosis
EGF              0.001948  0.001948   0.004152  0.001948  0.001948
FGF              0.001948  0.001948   0.001948  0.914015  0.001948
TGFB             0.001948  0.001948   0.002406  0.914569  0.001948
Androgen         0.001948  0.001948   0.004667  0.914569  0.001948
Hypoxia          0.002653  0.001948   0.001948  0.001948  0.001948
Nutrients        0.001948  0.001948   0.001948  0.001948  0.001948
Carcinogen       0.001948  0.001948   0.002056   0.19428  0.001948
Acidosis         0.001948  0.001948   0.002294  0.161541  0.002054
TNF              0.001948   0.00205   0.001948  0.914569  0.001948
fused_event      0.001948  0.001948   0.004867  0.001948  0.001948
SPOP             0.001948  0.001948   0.002056  0.914569  0.002467


In [39]:
# keep only the significant results
significant_df = adjusted_df.copy()
significant_df[significant_df >= 0.05] = np.nan
significant_df

Unnamed: 0,Proliferation,Invasion,DNA_Repair,Migration,Apoptosis
EGF,0.001948,0.001948,0.004152,0.001948,0.001948
FGF,0.001948,0.001948,0.001948,,0.001948
TGFB,0.001948,0.001948,0.002406,,0.001948
Androgen,0.001948,0.001948,0.004667,,0.001948
Hypoxia,0.002653,0.001948,0.001948,0.001948,0.001948
Nutrients,0.001948,0.001948,0.001948,0.001948,0.001948
Carcinogen,0.001948,0.001948,0.002056,,0.001948
Acidosis,0.001948,0.001948,0.002294,,0.002054
TNF,0.001948,0.00205,0.001948,,0.001948
fused_event,0.001948,0.001948,0.004867,0.001948,0.001948
