In [1]:
import maboss
import ginsim
import pandas as pd 
import numpy as np
import mygene
import os
import shutil
import ast
from scipy.stats import kruskal
from statsmodels.stats.multitest import multipletests
from collections import defaultdict
from pathlib import Path


from create_generic_models.create_generic_patients_cfgs import create_generic_patients_cfg_bnd_validation
from create_generic_models.update_phenotypes_generic_models import generic_models_update_phenotypes
from collections import defaultdict
from statsmodels.stats.multitest import multipletests


# from create_person_models.tailor_cfgs_patients import personalized_patients_genes_cfgs_validation
from pre_process_data.pre_process_genes import create_table_rna_seq_patients, identify_genes_synonyms

from identification_patients.validation_get_patients_ids import get_patients_valid

from pre_process_data.tcga_preprocess_data import pre_process_tcga_data

from create_person_models.tailor_cfgs_patients import personalized_patients_genes_cfgs
from create_person_models.tailor_bnd_cnv import tailor_bnd_cnv_validation

from create_person_models.tailor_bnd_tsg_onco_mutations import tailor_bnd_mutat_validation

from MaBoSS_simulation.maboss_phenotype_patient import compute_phenotype_table, compute_phenotype_mean_group_validation
from stats.stats_proba import compute_kruskal_test_means_validation

In [2]:
# create directory where the models and results will be saved

type_models = 'genes_models'

src_dir_generic_models = f'models/prostate/generic/{type_models}'
folder_generic_models = f'validation/prostate/{type_models}/generic_models'
# folder_generic_models_bnd = f'validation/prostate/{type_models}/generic_models'
folder_pers_models = f'validation/prostate/{type_models}/personalized_models'

folder_save_results = f"validation/prostate/{type_models}/results/phenotype_distribution/phenotype_table"
dest_base_dir = f"validation/prostate/{type_models}/results/phenotype_group_means"

directories = [
    Path(src_dir_generic_models).parent,
    Path(folder_generic_models),
    Path(folder_pers_models),
    Path(folder_save_results),
    Path(dest_base_dir),
]

# Create them
for directory in directories:
    directory.mkdir(parents=True, exist_ok=True)



# copy the generic model from the models to validation generic models
for filename in os.listdir(src_dir_generic_models):
    src_file = os.path.join(src_dir_generic_models, filename)
    dst_file = os.path.join(folder_generic_models, filename)
    if os.path.isfile(src_file):
        shutil.copy(src_file, dst_file)


In [3]:
tissue = 'Prostate'

# for the model
phenotype_interest = ["Proliferation","Invasion","DNA_Repair","Migration","Apoptosis"]


In [4]:
# Import data
phenotype_data = pd.read_csv('data/TCGA_data/prostate/TCGA_PRAD_phenotypes.csv')
genes_data = pd.read_csv('data/TCGA_data/prostate/TCGA_PRAD_genes_illumina.csv', sep='\t')
cnv_data = pd.read_csv('data/TCGA_data/prostate/TCGA_PRAD_cnv_gistic2.csv',sep='\t')
genes_synonyms = pd.read_csv("data/e_ensembl/genes_synonyms.csv", delimiter="\t")
uniprot_data = pd.read_csv("data/uniprot/uniprot_map.csv")

# keep all montagud nodes
montagud_data = (
    pd.read_csv('data/Montagud_inter_nodes_data.csv', header=1)
    .loc[:, ['Target node', 'Interaction type', 'Source']])

# Create list of genes of interest (in Montagud data)
montagud_nodes = list(set(montagud_data['Target node'].tolist() + montagud_data['Source'].tolist()))
montagud_nodes = [node for node in montagud_nodes if node != '0/1']
montagud_nodes = [node.upper() for node in montagud_nodes if isinstance(node, str)]
# montagud_nodes.append('KRAS')
# to_remove = ['RAS', 'FUSED_EVENT', 'NKX3_1', 'SPOP', 'AR_ERG']

# montagud_nodes = [node for node in montagud_nodes if node not in to_remove]
# montagud_nodes = list(set(montagud_nodes))

In [5]:
# changes montagud nodes according to modif done in the cfg/ bnd files

montagud_nodes = ['CASP8' if x == 'CASPASE8' else x for x in montagud_nodes]
montagud_nodes = ['CASP3' if x == 'CASPASE3' else x for x in montagud_nodes]
montagud_nodes = ['CASP9' if x == 'CASPASE9' else x for x in montagud_nodes]


montagud_nodes = ['CCNB1' if x == 'CYCLINB' else x for x in montagud_nodes]
montagud_nodes = ['CCND1' if x == 'CYCLIND' else x for x in montagud_nodes]


montagud_nodes = ['DVL1' if x == 'DSH' else x for x in montagud_nodes]

montagud_nodes = ['CTNNB1' if x == 'BETA_CATENIN' else x for x in montagud_nodes]

montagud_nodes = ['CDH1' if x == 'E_CADHERIN' else x for x in montagud_nodes]
montagud_nodes = ['CYCS' if x == 'CYCC' else x for x in montagud_nodes]

montagud_nodes = ['MEK1' if x == 'MEK1_2' else x for x in montagud_nodes]
montagud_nodes = ['NFKB' if x == 'NFK_B' else x for x in montagud_nodes]
montagud_nodes = ['SNAI1' if x == 'SNAIL' else x for x in montagud_nodes]
montagud_nodes = ['TNF' if x == 'TNFALPHA' else x for x in montagud_nodes]
montagud_nodes = ['TSC1' if x == 'TSC1_2' else x for x in montagud_nodes]
montagud_nodes = ['BCL2L1' if x == 'BCL_XL' else x for x in montagud_nodes]
montagud_nodes = ['MAP3K1' if x == 'MAP3K1_3' else x for x in montagud_nodes]
montagud_nodes = ['CHK1' if x == 'CHK1_2' else x for x in montagud_nodes]




montagud_nodes.append('MEK2')
montagud_nodes.append('TSC2')
montagud_nodes.append('MAP3K3')
montagud_nodes.append('CHK2')



In [6]:
# stratify by gleason score

phenotype_data_filtered = phenotype_data[['sampleID','gleason_score']]


In [7]:
# create 3 groups: gleason score of 6, gleason score of 7, and of gleason score of > 8

group_0 = [6]
group_1 = [7]
group_2 = [8, 9, 10]

conditions = [
phenotype_data_filtered["gleason_score"].isin(group_0),
phenotype_data_filtered["gleason_score"].isin(group_1),
phenotype_data_filtered["gleason_score"].isin(group_2),

]
choices = ['low_aggressive', 'middle_aggressive', 'high_aggressive']

phenotype_data_filtered.loc[:, "Gleason_group"] = np.select(
conditions, choices, default=""
)

sampled_df = phenotype_data_filtered.groupby("Gleason_group", group_keys=False).apply(
    lambda x: x.sample(n=min(len(x), 30), random_state=42)
)
patients_id = list(sampled_df['sampleID'])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  phenotype_data_filtered.loc[:, "Gleason_group"] = np.select(
  sampled_df = phenotype_data_filtered.groupby("Gleason_group", group_keys=False).apply(


In [8]:
# pre-process CNV data

cnv_data_col = list(cnv_data.columns)
common_col = list(set(cnv_data_col) & set(patients_id))
col_keep = ['Gene Symbol'] + common_col
cnv_data_filtered = cnv_data[col_keep]


df_melted_cnv = cnv_data_filtered.melt(
    id_vars=["Gene Symbol"],       # columns to keep fixed
    var_name="samples_id",         # name for the variable column (sample IDs)
    value_name="expression_value"  # name for the values
)


df_melted_cnv['Gene Symbol'] = df_melted_cnv['Gene Symbol'].str.split('|').str[0] 

df_melted_cnv = df_melted_cnv.rename(
    columns={
        "samples_id": "model_id",
        "Gene Symbol": "gene_symbol",
        "expression_value": "rsem_tpm",
    }
)
group_loss = [-1, -2]
group_normal = [0]
group_gain = [1, 2]

conditions = [
    df_melted_cnv["rsem_tpm"].isin(group_loss),
    df_melted_cnv["rsem_tpm"].isin(group_normal),
    df_melted_cnv["rsem_tpm"].isin(group_gain),
]
choices = ["Loss", "Normal", "Gain"]
df_melted_cnv.loc[:, "effect"] = np.select(conditions, choices, default="")

df_melted_cnv = df_melted_cnv[df_melted_cnv['gene_symbol'].isin(montagud_nodes)]
df_melted_cnv.to_csv('data/TCGA_data/prostate/filtered_data/cnv_samples_table.csv')

In [9]:
df_genes_copy= genes_data.copy()
genes_synonyms_df = identify_genes_synonyms(genes_synonyms, uniprot_data, montagud_nodes)
print(genes_synonyms_df.head())

# then replace genes names by montagud nodes

  Gene name_x Gene Synonym montagud_node
0        BMP2        BMP2A          BMP2
1       NCOA3         ACTR         NCOA3
2       NCOA3         ACTR         NCOA3
3       NCOA3         AIB1         NCOA3
4       NCOA3         AIB1         NCOA3


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_synonyms["Gene name"] = data_synonyms["Gene name"].str.upper()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_synonyms["Gene Synonym"] = data_synonyms["Gene Synonym"].str.upper()


In [10]:
# pre-process genes data

genes_data_col = list(genes_data.columns)
common_col = list(set(genes_data_col) & set(patients_id))
col_keep = ['sample'] + common_col
genes_data_filtered = genes_data[col_keep]


df_melted_gene = genes_data_filtered.melt(
    id_vars=["sample"],       # columns to keep fixed
    var_name="samples_id",         # name for the variable column (sample IDs)
    value_name="expression_value"  # name for the values
)

df_melted_gene['sample'] = df_melted_gene['sample'].str.split('|').str[0] 

df_melted_gene = df_melted_gene.rename(
    columns={
        "samples_id": "model_id",
        "sample": "gene_symbol",
        "expression_value": "rsem_tpm",
    }
)
df_melted_gene['gene_symbol'] = df_melted_gene['gene_symbol'].str.upper()
df_melted_gene['gene_symbol'] = df_melted_gene['gene_symbol'].str.replace('_', '', regex=False)




In [11]:
df_melted_gene_copy = df_melted_gene.copy()
print(df_melted_gene_copy.head())

  gene_symbol         model_id  rsem_tpm
0   ARHGEF10L  TCGA-YL-A9WH-01    8.7340
1       HIF3A  TCGA-YL-A9WH-01    4.0102
2       RNF17  TCGA-YL-A9WH-01    0.0000
3       RNF10  TCGA-YL-A9WH-01   12.3159
4       RNF11  TCGA-YL-A9WH-01   10.4304


In [12]:
melted = genes_synonyms_df.melt(
    id_vars='montagud_node',
    value_vars=['Gene name_x', 'Gene Synonym'],
    value_name='gene'
)[['gene', 'montagud_node']].drop_duplicates()


print(melted)
mapping_dict = dict(zip(melted['gene'], melted['montagud_node']))

df_melted_gene['gene_symbol'] = df_melted_gene['gene_symbol'].map(mapping_dict).fillna(df_melted_gene['gene_symbol'])



          gene montagud_node
0         BMP2          BMP2
1        NCOA3         NCOA3
111      MAPK1           ERK
115       CDH2          CDH2
181      AXIN1         AXIN1
...        ...           ...
19969     CJUN           JUN
19974    APAF3         CASP9
19976  ICELAP6         CASP9
19978     MCH6         CASP9
19980  PPP1R56         CASP9

[376 rows x 2 columns]


In [13]:
print(df_melted_gene_copy[df_melted_gene_copy['gene_symbol'] != df_melted_gene['gene_symbol']])

        gene_symbol         model_id  rsem_tpm
60             OPA1  TCGA-YL-A9WH-01   10.7861
106          MRPL28  TCGA-YL-A9WH-01   10.2733
573            RAC3  TCGA-YL-A9WH-01    9.0444
1892         PRKAA2  TCGA-YL-A9WH-01    8.3485
2326        SLC22A3  TCGA-YL-A9WH-01    6.3459
...             ...              ...       ...
1804658        EZH1  TCGA-YL-A8S8-01    9.6834
1804885       EPHB2  TCGA-YL-A8S8-01    8.6746
1806228      AKR1B1  TCGA-YL-A8S8-01    8.8926
1806392        FZR1  TCGA-YL-A8S8-01    9.9405
1806529      TCEAL1  TCGA-YL-A8S8-01    8.7856

[4400 rows x 3 columns]


In [14]:
df_melted_gene = df_melted_gene[df_melted_gene['gene_symbol'].isin(montagud_nodes)]

df_melted_gene.to_csv('data/TCGA_data/prostate/filtered_data/genes_samples_table.csv')

In [15]:
# read the pre-processed data
df_melted_cnv= pd.read_csv('data/TCGA_data/prostate/filtered_data/cnv_samples_table.csv')
df_melted_gene= pd.read_csv('data/TCGA_data/prostate/filtered_data/genes_samples_table.csv')

In [16]:
 # Create generic models 
folder_generic_models_cfg = f'{folder_generic_models}/Montagud2022_Prostate_Cancer.cfg'
folder_generic_models_bnd = f'{folder_generic_models}/Montagud2022_Prostate_Cancer.bnd'

create_generic_patients_cfg_bnd_validation(folder_generic_models_cfg, folder_generic_models_bnd, folder_pers_models, patients_id, tissue)


All .cfg and .bnd files created for the validation.


In [17]:
# update phenotypes in generic models 
generic_models_update_phenotypes(phenotype_interest, folder_pers_models, folder_pers_models)

Updated EEF2K.is_internal=1 in TCGA-ZG-A9L5-01_Prostate.cfg
Updated LDHA.is_internal=1 in TCGA-ZG-A9L5-01_Prostate.cfg
Updated VHL.is_internal=1 in TCGA-ZG-A9L5-01_Prostate.cfg
Updated GADD45.is_internal=1 in TCGA-ZG-A9L5-01_Prostate.cfg
Updated CDH2.is_internal=1 in TCGA-ZG-A9L5-01_Prostate.cfg
Updated PI3K.is_internal=1 in TCGA-ZG-A9L5-01_Prostate.cfg
Updated FGF.is_internal=1 in TCGA-ZG-A9L5-01_Prostate.cfg
Updated RB1.is_internal=1 in TCGA-ZG-A9L5-01_Prostate.cfg
Updated MYC.is_internal=1 in TCGA-ZG-A9L5-01_Prostate.cfg
Updated RAF.is_internal=1 in TCGA-ZG-A9L5-01_Prostate.cfg
Updated TCF.is_internal=1 in TCGA-ZG-A9L5-01_Prostate.cfg
Updated Apoptosis.is_internal=0 in TCGA-ZG-A9L5-01_Prostate.cfg
Updated CCNB1.is_internal=1 in TCGA-ZG-A9L5-01_Prostate.cfg
Updated fused_event.is_internal=1 in TCGA-ZG-A9L5-01_Prostate.cfg
Updated MAP3K3.is_internal=1 in TCGA-ZG-A9L5-01_Prostate.cfg
Updated CTNNB1.is_internal=1 in TCGA-ZG-A9L5-01_Prostate.cfg
Updated JUN.is_internal=1 in TCGA-ZG-A9L5-

In [18]:
# personalize the boolean networks with genes 
table_rna_seq_patients = create_table_rna_seq_patients(df_melted_gene)
print(table_rna_seq_patients.head())

gene_expression_level                               High Gene Expression  \
model_id                                                                   
TCGA-2A-A8VL-01        CDH1, EEF2K, EMT, FOS, JUN, NCOA3, P15, P21, P...   
TCGA-2A-AAYO-01              BAX, CASP9, E2F1, EMT, MEK1, P15, P38, TERT   
TCGA-2A-AAYU-01                BAX, CDH1, EMT, ERG, ERK, P15, PDK1, SPOP   
TCGA-CH-5738-01        APAF1, ATM, CFLAR, CTNNB1, EMT, ERK, FOS, GADD...   
TCGA-CH-5743-01        APAF1, ATM, ATR, BCL2, BMP2, BRCA2, CASP3, CCN...   

gene_expression_level                                Low Gene Expression  
model_id                                                                  
TCGA-2A-A8VL-01        BIRC5, BRCA1, BRCA2, CHK2, E2F1, EMT, ERK, EZH...  
TCGA-2A-AAYO-01         AR, ATR, CDH1, E2F1, EGFR, NCOR2, P15, P38, VEGF  
TCGA-2A-AAYU-01        AR, BCL2, BMP2, CCND1, DAXX, E2F1, EEF2K, EMT,...  
TCGA-CH-5738-01        AXIN1, BAD, CCND1, CDH1, CDH2, DAXX, FADD, MEK...  
TCGA-CH-5743-01  

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rna_seq_data.rename(columns={"gene_symbol": "gene_name"}, inplace=True)


In [19]:
personalized_patients_genes_cfgs(df_melted_gene,montagud_nodes,folder_pers_models,folder_pers_models,patients_id,table_rna_seq_patients,tissue)

Modified and saved: validation/prostate/genes_models/personalized_models/TCGA-ZG-A9L5-01_Prostate.cfg
Modified and saved: validation/prostate/genes_models/personalized_models/TCGA-QU-A6IO-01_Prostate.bnd
Modified and saved: validation/prostate/genes_models/personalized_models/TCGA-J4-AATV-01_Prostate.cfg
Modified and saved: validation/prostate/genes_models/personalized_models/TCGA-EJ-A7NN-01_Prostate.bnd
Modified and saved: validation/prostate/genes_models/personalized_models/TCGA-G9-7519-01_Prostate.bnd
Modified and saved: validation/prostate/genes_models/personalized_models/TCGA-VP-A87D-01_Prostate.bnd
Modified and saved: validation/prostate/genes_models/personalized_models/TCGA-G9-6343-01_Prostate.bnd
Modified and saved: validation/prostate/genes_models/personalized_models/TCGA-EJ-8472-01_Prostate.bnd
Modified and saved: validation/prostate/genes_models/personalized_models/TCGA-CH-5743-01_Prostate.bnd
Modified and saved: validation/prostate/genes_models/personalized_models/TCGA-KK-A

In [20]:
# personalize with CNV
tailor_bnd_cnv_validation(df_melted_cnv, folder_pers_models, tissue)

🔍 Processing patient TCGA-YL-A9WH-01, gene: DVL1
Patient TCGA-YL-A9WH-01 is in both gain and loss groups. Please review.
DVL1 node found. Replacing...
🔍 Processing patient TCGA-YL-A9WH-01, gene: CASP9
Patient TCGA-YL-A9WH-01 is in both gain and loss groups. Please review.
CASP9 node found. Replacing...
🔍 Processing patient TCGA-YL-A9WH-01, gene: ZBTB17
Patient TCGA-YL-A9WH-01 is in both gain and loss groups. Please review.
ZBTB17 node found. Replacing...
🔍 Processing patient TCGA-YL-A9WH-01, gene: JUN
Patient TCGA-YL-A9WH-01 is in both gain and loss groups. Please review.
JUN node found. Replacing...
🔍 Processing patient TCGA-YL-A9WH-01, gene: PDK1
Patient TCGA-YL-A9WH-01 is in both gain and loss groups. Please review.
PDK1 node found. Replacing...
🔍 Processing patient TCGA-YL-A9WH-01, gene: CFLAR
Patient TCGA-YL-A9WH-01 is in both gain and loss groups. Please review.
CFLAR node found. Replacing...
🔍 Processing patient TCGA-YL-A9WH-01, gene: CASP8
Patient TCGA-YL-A9WH-01 is in both gai

In [21]:
# Compute phenotype distribution for each patient
# folder_models = 'validation/prostate/personalized_models'
# folder_save_results = 'validation/prostate/results/phenotype_distribution/phenotype_table'

phenotypes_interest = [
        "Proliferation",
        "Invasion",
        "DNA_Repair",
        "Migration",
        "Apoptosis",
    ]

inputs_list = ['EGF', 'FGF', 'TGFB', 'Androgen', 'Hypoxia', 'Nutrients', 'Carcinogen', 'Acidosis', 'TNF', 'fused_event', 'SPOP']
for patient  in patients_id:
    compute_phenotype_table(folder_save_results, folder_pers_models, patient, inputs_list, phenotypes_interest, tissue= 'Prostate')


KeyboardInterrupt: 

In [None]:
low_group_ids= list(phenotype_data_filtered[phenotype_data_filtered['Gleason_group'] == 'low_aggressive']['sampleID'])
medium_group_ids= list(phenotype_data_filtered[phenotype_data_filtered['Gleason_group'] == 'middle_aggressive']['sampleID'])
high_group_ids= list(phenotype_data_filtered[phenotype_data_filtered['Gleason_group'] == 'high_aggressive']['sampleID'])



In [None]:
#move each files to directory corresponding

# Map group names to sample ID lists
group_mapping = {
    "low_group": low_group_ids,
    "medium_group": medium_group_ids,
    "high_group": high_group_ids,
}

# Loop over all files in the source directory
for filename in os.listdir(folder_save_results):
    if not filename.startswith("_TCGA"):
        continue
    sample_id = filename.replace("_", "").replace(".csv", "")
    # Determine the group of this sample
    group_found = False
    for group_name, id_list in group_mapping.items():
        if sample_id in id_list:
            group_folder = os.path.join(dest_base_dir, group_name)
            os.makedirs(group_folder, exist_ok=True)

            src_path = os.path.join(folder_save_results, filename)
            dst_path = os.path.join(group_folder, filename)
            shutil.move(src_path, dst_path)

            group_found = True
            break

    if not group_found:
        print(f" Sample ID {sample_id} not found in any group list.")


In [None]:
# combine all the values
groups = ["low_group", "medium_group", "high_group"]

mean_df = compute_phenotype_mean_group_validation(groups, dest_base_dir)

              Proliferation  Invasion  DNA_Repair  Migration  Apoptosis
Acidosis           0.366059  0.331998    0.235884   0.024329   0.363642
Androgen           0.378743  0.352069    0.239454   0.051197   0.364997
Carcinogen         0.377359  0.345996    0.400864   0.013867   0.411317
EGF                0.374162  0.347758    0.233172   0.051207   0.367908
FGF                0.389455  0.347433    0.236985   0.023721   0.381571
Hypoxia            0.324061  0.339164    0.230691   0.028574   0.365637
Nutrients          0.399506  0.344355    0.237794   0.030017   0.364394
SPOP               0.391617  0.371726    0.230629   0.051310   0.341411
TGFB               0.334074  0.503393    0.227178   0.041223   0.418340
TNF                0.317056  0.535548    0.226876   0.069388   0.368572
fused_event        0.372464  0.343281    0.237205   0.025562   0.367195
Overall_Mean       0.365869  0.378429    0.248794   0.037309   0.374090
              Proliferation  Invasion  DNA_Repair  Migration  Ap

In [None]:
# combine values of a directory together

def collect_group_data(group_folder_path):
    combined_data = defaultdict(lambda: defaultdict(list))

    for file in os.listdir(group_folder_path):
        if file.startswith("_TCGA") and file.endswith(".csv"):
            file_path = os.path.join(group_folder_path, file)
            df = pd.read_csv(file_path, index_col=0)

            for input_name in df.index:
                for phenotype in df.columns:
                    value = df.at[input_name, phenotype]
                    combined_data[input_name][phenotype].append(float(value))

    result_df = pd.DataFrame.from_dict(combined_data, orient='index')
    result_df.to_csv(os.path.join(group_folder_path, "combined_results.csv"))

    return result_df

group_names = ["low_group", "medium_group", "high_group"]
group_dataframes = {}

for group in group_names:
    folder_path = os.path.join(dest_base_dir, group)
    group_df = collect_group_data(folder_path)
    group_dataframes[group] = group_df

In [None]:
# Paths to your combined data CSVs

group_files = {
    "low": os.path.join(dest_base_dir, "low_group", "combined_results.csv"),
    "medium": os.path.join(dest_base_dir, "medium_group", "combined_results.csv"),
    "high": os.path.join(dest_base_dir, "high_group", "combined_results.csv"),
}


# Load all groups into dict of DataFrames
group_dfs = {}
for group, path in group_files.items():
    df = pd.read_csv(path, index_col=0)
    df = df.applymap(ast.literal_eval)
    group_dfs[group] = df

# Get all inputs and phenotypes from one dataframe (assuming all share the same shape)
inputs = group_dfs["low"].index
phenotypes = group_dfs["low"].columns
# Prepare result storage
kruskal_results = pd.DataFrame(index=inputs, columns=phenotypes)

# Run Kruskal-Wallis test for each (input, phenotype)
for input_name in inputs:
    for phenotype in phenotypes:
        data_low = group_dfs["low"].at[input_name, phenotype]
        data_medium = group_dfs["medium"].at[input_name, phenotype]
        data_high = group_dfs["high"].at[input_name, phenotype]

        # Run the Kruskal-Wallis test only if all groups have data
        if data_low and data_medium and data_high:
            stat, pvalue = kruskal(data_low, data_medium, data_high)
            kruskal_results.at[input_name, phenotype] = pvalue
        else:
            kruskal_results.at[input_name, phenotype] = None

# Optionally, save the p-values table to CSV
kruskal_results.to_csv(os.path.join(dest_base_dir, "kruskal_pvalues.csv"))
print(kruskal_results)


            Proliferation  Invasion DNA_Repair Migration Apoptosis
EGF              0.001063  0.000571   0.003473  0.000655  0.001269
FGF              0.000953  0.000488   0.000663  0.847542  0.000973
TGFB             0.000558  0.001172   0.001881    0.9022  0.001157
Androgen         0.001063  0.000585   0.003988   0.89582  0.001246
Hypoxia           0.00217   0.00061   0.000888  0.000905  0.001296
Nutrients        0.000853  0.000562   0.001149  0.000825  0.001097
Carcinogen       0.001197   0.00067    0.00151  0.176619  0.000587
Acidosis         0.001081  0.000558   0.001752  0.143918  0.001457
TNF              0.000992  0.001417   0.000463  0.869862  0.001274
fused_event      0.000985  0.000619   0.004247  0.000962  0.001311
SPOP             0.001116  0.000642   0.001532  0.914569  0.001974


  df = df.applymap(ast.literal_eval)
  df = df.applymap(ast.literal_eval)
  df = df.applymap(ast.literal_eval)


In [None]:

# Flatten p-values to a 1D array, ignoring None or NaNs
pvals = kruskal_results.values.flatten()
pvals = [p for p in pvals if p is not None]

# Adjust using BH method
_, pvals_adj, _, _ = multipletests(pvals, alpha=0.05, method='fdr_bh')

# Now, you need to put adjusted p-values back into the DataFrame shape
# Create a copy to fill
adjusted_df = kruskal_results.copy()

# Fill adjusted p-values sequentially where there was a non-None p-value
idx = 0
for i in adjusted_df.index:
    for j in adjusted_df.columns:
        if adjusted_df.at[i, j] is not None:
            adjusted_df.at[i, j] = pvals_adj[idx]
            idx += 1
print(adjusted_df)


            Proliferation  Invasion DNA_Repair Migration Apoptosis
EGF              0.001948  0.001948   0.004152  0.001948  0.001948
FGF              0.001948  0.001948   0.001948  0.914015  0.001948
TGFB             0.001948  0.001948   0.002406  0.914569  0.001948
Androgen         0.001948  0.001948   0.004667  0.914569  0.001948
Hypoxia          0.002653  0.001948   0.001948  0.001948  0.001948
Nutrients        0.001948  0.001948   0.001948  0.001948  0.001948
Carcinogen       0.001948  0.001948   0.002056   0.19428  0.001948
Acidosis         0.001948  0.001948   0.002294  0.161541  0.002054
TNF              0.001948   0.00205   0.001948  0.914569  0.001948
fused_event      0.001948  0.001948   0.004867  0.001948  0.001948
SPOP             0.001948  0.001948   0.002056  0.914569  0.002467


In [None]:
# keep only the significant results
significant_df = adjusted_df.copy()
significant_df[significant_df >= 0.05] = np.nan
significant_df

Unnamed: 0,Proliferation,Invasion,DNA_Repair,Migration,Apoptosis
EGF,0.001948,0.001948,0.004152,0.001948,0.001948
FGF,0.001948,0.001948,0.001948,,0.001948
TGFB,0.001948,0.001948,0.002406,,0.001948
Androgen,0.001948,0.001948,0.004667,,0.001948
Hypoxia,0.002653,0.001948,0.001948,0.001948,0.001948
Nutrients,0.001948,0.001948,0.001948,0.001948,0.001948
Carcinogen,0.001948,0.001948,0.002056,,0.001948
Acidosis,0.001948,0.001948,0.002294,,0.002054
TNF,0.001948,0.00205,0.001948,,0.001948
fused_event,0.001948,0.001948,0.004867,0.001948,0.001948
