In [1]:
import pandas as pd
import numpy as np
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [18]:
clusternames = ["adj_ERPositive", "adj_PRPositive", "adj_HER2Positive", "adj_Triple Neg"]

gsea_frames = {}
for cluster in clusternames:
    df = pd.read_csv(f"GSEA_/{cluster}/gseapy.gsea.gene_set.report.csv", index_col=0).iloc[:,:-3]
    indexlist = []
    for i in df.index:
        new_i = i.rstrip(" ")
        indexlist.append(new_i)
    df.index = indexlist
    print(df.index)
    df['fdr'] = df.apply(lambda row: 5.0 if (row['fdr'] == 0.0) else -np.log10(row['fdr']), axis=1)
    df = df.loc[~df.index.duplicated(keep='first')]
    gsea_frames[cluster] = df
    
gsea_df = pd.DataFrame(index= gsea_frames['adj_ERPositive'].index)
for i in gsea_frames:
    gsea_df[i] = gsea_frames[i][f'fdr']

Index(['Activation of the pre-replicative complex',
       'Cell surface interactions at the vascular wall',
       'DNA Replication Pre-Initiation',
       'Immunoregulatory interactions between a Lymphoid and a non-Lymphoid cell',
       'Interleukin-10 signaling', 'Nuclear Envelope Breakdown',
       'Assembly of the pre-replicative complex',
       'tRNA processing in the nucleus', 'Signaling by Interleukins',
       'Amplification  of signal from unattached  kinetochores via a MAD2  inhibitory signal',
       ...
       'Negative regulation of activity of TFAP2 (AP-2) family transcription factors',
       'MyD88 deficiency (TLR5)',
       'Entry of Influenza Virion into Host Cell via Endocytosis',
       'Long-term potentiation', 'Glucuronidation',
       'Synthesis of PIPs at the early endosome membrane',
       'Signaling by cytosolic FGFR1 fusion mutants',
       'Neurexins and neuroligins',
       'RUNX2 regulates genes involved in differentiation of myeloid cells',
       'Re

In [19]:
gsea_df.columns = ["ER", "PR", "HER2", "Triple Neg"]
gsea_df

Unnamed: 0,ER,PR,HER2,Triple Neg
Activation of the pre-replicative complex,5.0,5.000000,1.622301,3.282650
Cell surface interactions at the vascular wall,5.0,1.972859,2.899743,2.239015
DNA Replication Pre-Initiation,5.0,5.000000,5.000000,3.384107
Immunoregulatory interactions between a Lymphoid and a non-Lymphoid cell,5.0,5.000000,2.843885,5.000000
Interleukin-10 signaling,5.0,3.604798,0.807129,5.000000
...,...,...,...,...
Synthesis of PIPs at the early endosome membrane,-0.0,0.026300,0.170997,0.030985
Signaling by cytosolic FGFR1 fusion mutants,-0.0,0.043131,0.126391,0.022732
Neurexins and neuroligins,-0.0,0.060085,0.295081,0.037112
RUNX2 regulates genes involved in differentiation of myeloid cells,-0.0,0.112589,0.063539,-0.000000


In [20]:
wall_hormones = pd.read_csv('../exp/adjusted_distributions_receptors.csv', index_col = 0)
wall_hormones.columns = ["ER", "PR", "HER2", "Triple Neg"]
wall_hormones

Unnamed: 0,ER,PR,HER2,Triple Neg
SLC-mediated transmembrane transport,37.405786,13.252255,0.663079,26.968505
"Transport of bile salts and organic acids, metal ions and amine compounds",32.452070,9.880556,0.113069,23.280225
Sphingolipid metabolism,29.140881,8.853328,0.010495,22.599006
Cyclin D associated events in G1,35.525142,14.523865,0.815315,22.485267
G1 Phase,35.525142,14.523865,0.815315,22.485267
...,...,...,...,...
Protein ubiquitination,0.075795,0.051607,0.029685,0.000851
Transcriptional regulation of granulopoiesis,0.033070,0.503143,0.031908,0.000787
Late endosomal microautophagy,0.141577,0.716284,0.011522,0.000651
"Unblocking of NMDA receptors, glutamate binding and activation",0.484781,0.694761,1.059343,0.000431


In [7]:
def read_reactome(file_name, gene_name_start = "ENSG0"):
    df = pd.read_csv(file_name, sep='\t', header=None)

    if gene_name_start == None:
        sub_df = df
    else:
        subset_vec = df[0].str.startswith(gene_name_start)
        sub_df = df.loc[subset_vec]

    genes_df = sub_df.groupby(1)[0].apply(list)
    names_df = sub_df.groupby(1)[3].max()

    out_df = pd.concat([genes_df,names_df], axis=1)
    out_df.columns = ['genes', 'pathway_name']
    out_df.index = out_df.pathway_name

    return out_df
reactome_ngenes = read_reactome("../data/Ensembl2Reactome_All_Levels.txt.gz")
length_dict = {}
for i in wall_hormones.index:
        if i in reactome_ngenes.index:
            nr_genes = len(reactome_ngenes.loc[i, "genes"])
        else:
            print(f'{i} not found')

        length_dict[i] = nr_genes

In [23]:
wall_hormones["ngenes"] = wall_hormones.index.map(length_dict)
gsea_df["ngenes"] = gsea_df.index.map(length_dict)

In [24]:
wall_hormones

Unnamed: 0,ER,PR,HER2,Triple Neg,ngenes
SLC-mediated transmembrane transport,37.405786,13.252255,0.663079,26.968505,269
"Transport of bile salts and organic acids, metal ions and amine compounds",32.452070,9.880556,0.113069,23.280225,100
Sphingolipid metabolism,29.140881,8.853328,0.010495,22.599006,98
Cyclin D associated events in G1,35.525142,14.523865,0.815315,22.485267,46
G1 Phase,35.525142,14.523865,0.815315,22.485267,46
...,...,...,...,...,...
Protein ubiquitination,0.075795,0.051607,0.029685,0.000851,85
Transcriptional regulation of granulopoiesis,0.033070,0.503143,0.031908,0.000787,106
Late endosomal microautophagy,0.141577,0.716284,0.011522,0.000651,36
"Unblocking of NMDA receptors, glutamate binding and activation",0.484781,0.694761,1.059343,0.000431,41


In [25]:
gsea_df

Unnamed: 0,ER,PR,HER2,Triple Neg,ngenes
Activation of the pre-replicative complex,5.0,5.000000,1.622301,3.282650,33.0
Cell surface interactions at the vascular wall,5.0,1.972859,2.899743,2.239015,240.0
DNA Replication Pre-Initiation,5.0,5.000000,5.000000,3.384107,104.0
Immunoregulatory interactions between a Lymphoid and a non-Lymphoid cell,5.0,5.000000,2.843885,5.000000,394.0
Interleukin-10 signaling,5.0,3.604798,0.807129,5.000000,59.0
...,...,...,...,...,...
Synthesis of PIPs at the early endosome membrane,-0.0,0.026300,0.170997,0.030985,21.0
Signaling by cytosolic FGFR1 fusion mutants,-0.0,0.043131,0.126391,0.022732,18.0
Neurexins and neuroligins,-0.0,0.060085,0.295081,0.037112,84.0
RUNX2 regulates genes involved in differentiation of myeloid cells,-0.0,0.112589,0.063539,-0.000000,4.0


In [26]:
comparison_df_index = [x for x in wall_hormones.columns if x != "ngenes"] 
print(comparison_df_index)
comparison_df = pd.DataFrame(index=comparison_df_index)
anova_mean = []
gsea_mean = []
anova_median = []
gsea_median = []
no_anova_genes = []
no_gsea_genes = []
for i in wall_hormones.columns:
    if i != "ngenes":
        anova_mean.append(wall_hormones[wall_hormones[i] > 3]['ngenes'].mean(axis=0))
        no_anova_genes.append(wall_hormones[wall_hormones[i] > 3]["ngenes"].shape[0])
        gsea_mean.append(gsea_df[gsea_df[i] > 3]["ngenes"].mean(axis=0))
        no_gsea_genes.append(gsea_df[gsea_df[i] > 3]["ngenes"].shape[0])
        anova_median.append(wall_hormones[wall_hormones[i] > 3]['ngenes'].median(axis=0))
        gsea_median.append(gsea_df[gsea_df[i] > 3]["ngenes"].median(axis=0))

comparison_df["ANOVA pathways"] = no_anova_genes

comparison_df["GSEA pathways"] = no_gsea_genes
comparison_df["ANOVA median"] = anova_median
comparison_df["GSEA median"] = gsea_median
comparison_df["Fold change"] = comparison_df["GSEA median"] / comparison_df["ANOVA median"]

        
comparison_df


['ER', 'PR', 'HER2', 'Triple Neg']


Unnamed: 0,ANOVA pathways,GSEA pathways,ANOVA median,GSEA median,Fold change
ER,706,41,30.0,148.0,4.933333
PR,455,23,30.0,86.0,2.866667
HER2,44,41,10.0,200.0,20.0
Triple Neg,508,39,30.0,92.0,3.066667
