In [31]:
import pandas as pd

In [None]:
genes_data = pd.read_csv('data/TCGA_data/prostate/TCGA_PRAD_genes_illumina.csv', sep='\t')


montagud_original_data_df = (
    pd.read_csv('data/montagud_models/Montagud_inter_nodes_data.csv', header=1)
    .loc[:, ['Target node', 'Interaction type', 'Source']])

nodes_montagud_synonyms = pd.read_csv('data/montagud_models/nodes_processed.csv')


  nodes_montagud_synonyms = pd.read_csv('data/montagud_models/nodes_processed.csv')
  nodes_montagud_synonyms = pd.read_csv('data/montagud_models/nodes_processed.csv')


In [34]:
patients_ids = ['TCGA-CH-5766-01', 'TCGA-CH-5744-01', 'TCGA-EJ-8469-01']

In [45]:

def process_montagud_nodes(
    montagud_original_data_df, nodes_montagud_synonyms,
):

    # Create list of genes of interest (in Montagud data)
    # montagud_node_model are the nodes of the model 
    montagud_node_model = list(
        set(montagud_original_data_df["Target node"].tolist() + montagud_original_data_df["Source"].tolist())
    )
    montagud_node_model = [node for node in montagud_node_model if node != "0/1"]

  
    all_montagud_nodes = list(set(nodes_montagud_synonyms['Node_synonyms'])) + list(set(nodes_montagud_synonyms['Node'])) + list(set(montagud_node_model))
    

    return montagud_node_model, all_montagud_nodes




In [47]:
def process_montagud_nodes_synonyms(nodes_montagud_synonyms):

    montagud_node_synonyms = nodes_montagud_synonyms.dropna(subset=['Node'])

     # Create multiple rows at once
    new_rows = pd.DataFrame({
        'Node': ['cFLAR', 'eEF2', 'eEF2K', 'Rheb'],  
        'HGNC symbols': ['CFLAR', 'EEF2', 'EEF2K', 'RHEB'], 
        'unique': ['CFLAR', 'EEF2', 'EEF2K', 'RHEB'] 
    })

    montagud_node_synonyms = pd.concat([montagud_node_synonyms, new_rows], ignore_index=True)


    montagud_node_synonyms = montagud_node_synonyms.rename(columns={'HGNC symbols': 'Node_synonyms'})
    
    montagud_node_synonyms['Node_synonyms'] = montagud_node_synonyms['Node_synonyms'].str.split(',')
    montagud_node_synonyms = montagud_node_synonyms.explode('Node_synonyms')

     # Strip whitespace from Node column
    montagud_node_synonyms['Node_synonyms'] = montagud_node_synonyms['Node_synonyms'].str.strip()
    montagud_node_synonyms['Node'] = montagud_node_synonyms['Node'].str.strip()


    montagud_node_synonyms = montagud_node_synonyms[montagud_node_synonyms['Node_synonyms'] != '']
    montagud_node_synonyms = montagud_node_synonyms[montagud_node_synonyms['Node'] != '']

    

    synonyms_to_nodes_dict = montagud_node_synonyms.set_index('Node_synonyms')['Node'].to_dict()

    return montagud_node_synonyms, synonyms_to_nodes_dict



In [52]:

def pre_process_genes(genes_data, patients_id, all_montagud_nodes, synonyms_to_nodes_dict):
    # pre-process genes data

    genes_data_col = list(genes_data.columns)
    common_col = list(set(genes_data_col) & set(patients_id))
    col_keep = ["sample"] + common_col
    genes_data_filtered = genes_data[col_keep]
    genes_data_filtered = genes_data_filtered[genes_data_filtered["sample"].isin(all_montagud_nodes)]

    # Special cases handling: Create both mTORC1 and mTORC2 from MTOR data
    syn_dict = {'MTOR': ['mTORC1', 'mTORC2'], 'MYC': ['MYC', 'MYC_MAX'], 'PIK3CA': ['PI3K', 'PIP3'], 'LDHA': ['LDHA', 'Lactic_acid'], 'ERG': ['AR_ERG', 'ERG']}
    # Get all keys
    list_genes_duplicates = syn_dict.keys()


    # remplace the gene symbol column names by its synonyms in the synonyms_maps dictionary
    for gene in genes_data_filtered['sample'].unique():
        if gene in synonyms_to_nodes_dict and gene not in list_genes_duplicates:
            genes_data_filtered.loc[genes_data_filtered['sample'] == gene, 'sample'] = synonyms_to_nodes_dict[gene]

    for duplicate_gene in list_genes_duplicates:
        gene_duplicate_data = genes_data_filtered[genes_data_filtered['sample'] == duplicate_gene].copy()
        if not gene_duplicate_data.empty:
            # Create mTORC1 data
            gene_duplicate_1_data = gene_duplicate_data.copy()
            gene_duplicate_1_data['sample'] = syn_dict[duplicate_gene][0]
            
            # Create mTORC2 data  
            gene_duplicate_2_data = gene_duplicate_data.copy()
            gene_duplicate_2_data['sample'] = syn_dict[duplicate_gene][1]
            
            # Remove original MTOR and add both complexes
            genes_data_filtered = genes_data_filtered[genes_data_filtered['sample'] != duplicate_gene]
            genes_data_filtered = pd.concat([genes_data_filtered, gene_duplicate_1_data, gene_duplicate_2_data], ignore_index=True)

            print(f" Duplicated {duplicate_gene}: {syn_dict[duplicate_gene][0]} ({len(gene_duplicate_1_data)} rows) + {syn_dict[duplicate_gene][1]} ({len(gene_duplicate_2_data)} rows)")


        
    df_melted_gene = genes_data_filtered.melt(
        id_vars=["sample"],  # columns to keep fixed
        var_name="samples_id",  # name for the variable column (sample IDs)
        value_name="expression_value",  # name for the values
    )

    df_melted_gene["sample"] = df_melted_gene["sample"].str.split("|").str[0]

    df_melted_gene = df_melted_gene.rename(
        columns={
            "samples_id": "model_id",
            "sample": "gene_symbol",
            "expression_value": "rsem_tpm",
        }
    )

    df_melted_gene.to_csv(
        "data/TCGA_data/prostate/filtered_data/genes_samples_table.csv"
    )

    return df_melted_gene



In [53]:
montagud_node_synonyms, synonyms_to_nodes_dict = process_montagud_nodes_synonyms(nodes_montagud_synonyms)


montagud_node_model, all_montagud_nodes = process_montagud_nodes(
    montagud_original_data_df, montagud_node_synonyms
)



In [54]:
processed_genes = pre_process_genes(genes_data, patients_ids, all_montagud_nodes, synonyms_to_nodes_dict)

 Duplicated MTOR: mTORC1 (1 rows) + mTORC2 (1 rows)
 Duplicated MYC: MYC (1 rows) + MYC_MAX (1 rows)
 Duplicated PIK3CA: PI3K (1 rows) + PIP3 (1 rows)
 Duplicated LDHA: LDHA (1 rows) + Lactic_acid (1 rows)
 Duplicated ERG: AR_ERG (1 rows) + ERG (1 rows)


In [55]:
processed_genes

Unnamed: 0,gene_symbol,model_id,rsem_tpm
0,Rheb,TCGA-EJ-8469-01,9.7289
1,TGFBR,TCGA-EJ-8469-01,9.8022
2,SMAD,TCGA-EJ-8469-01,9.8165
3,COX4I2,TCGA-EJ-8469-01,3.2995
4,RUNX2,TCGA-EJ-8469-01,6.8070
...,...,...,...
919,PIP3,TCGA-CH-5744-01,7.9894
920,LDHA,TCGA-CH-5744-01,12.9245
921,Lactic_acid,TCGA-CH-5744-01,12.9245
922,AR_ERG,TCGA-CH-5744-01,13.0318
