In [1]:
import pandas as pd
import numpy as np
# Import from the public IPython.display module; pulling `display` from the
# internal IPython.core.display path is deprecated.
from IPython.display import display, HTML

# Inject a CSS rule that stretches elements with class "container" to the
# full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))

In [29]:
# Sub-folders of GSEA_/ that each hold one gseapy gene-set report.
clusternames = ["adj_LumA", "adj_LumB", "adj_Basal", "adj_HER2Normal", "adj_PAM50Her2"]

# Read one report per cluster and convert its 'fdr' column to -log10(fdr).
gsea_frames = {}
for cluster in clusternames:
    report = pd.read_csv(f"GSEA_/{cluster}/gseapy.gsea.gene_set.report.csv", index_col=0)
    # Drop the last three columns of the report.
    report = report.iloc[:, :-3]
    # Strip trailing spaces from the row labels.
    report.index = [term.rstrip(" ") for term in report.index]
    # An FDR of exactly 0 has no finite -log10, so it is capped at 5.0.
    report['fdr'] = report['fdr'].map(lambda q: 5.0 if q == 0.0 else -np.log10(q))
    # Keep only the first row for any label that occurs more than once.
    is_repeat = report.index.duplicated(keep='first')
    gsea_frames[cluster] = report.loc[~is_repeat]

# One column of -log10(FDR) per cluster, restricted to (and ordered by) the
# row labels present in the adj_LumA report.
gsea_df = pd.DataFrame(index=gsea_frames['adj_LumA'].index)
for name, frame in gsea_frames.items():
    gsea_df[name] = frame['fdr']

In [30]:
# Relabel the cluster columns by name rather than by position, so the result
# does not depend on the order of the cluster list and the cell can be re-run
# safely even after extra columns have been appended to gsea_df.
# NOTE(review): the adj_* -> label pairs reproduce the previous positional
# assignment; confirm adj_HER2Normal -> "Normal" and adj_PAM50Her2 -> "Her2".
gsea_df = gsea_df.rename(columns={
    "adj_LumA": "LumA",
    "adj_LumB": "LumB",
    "adj_Basal": "Basal",
    "adj_HER2Normal": "Normal",
    "adj_PAM50Her2": "Her2",
})
gsea_df

Unnamed: 0,LumA,LumB,Basal,Normal,Her2
Activation of APC/C and APC/C:Cdc20 mediated degradation of mitotic proteins,5.000000,3.717286,2.980772,5.000000,1.862473
Activation of ATR in response to replication stress,5.000000,3.720666,5.000000,5.000000,1.826592
Activation of the pre-replicative complex,5.000000,5.000000,5.000000,5.000000,2.800993
Amplification of signal from unattached kinetochores via a MAD2 inhibitory signal,5.000000,5.000000,5.000000,5.000000,2.318062
Amplification of signal from the kinetochores,5.000000,5.000000,5.000000,5.000000,2.233860
...,...,...,...,...,...
FGFR2c ligand binding and activation,0.001755,-0.000000,0.001019,0.055738,0.099992
Dopamine receptors,0.001112,0.509501,0.156339,0.283271,0.137842
G alpha (s) signalling events,0.001000,0.157043,0.100279,1.063548,0.036668
Activated point mutants of FGFR2,0.000802,0.032872,0.052504,0.035618,0.341935


In [33]:
# Labels applied, in order, to the first five columns of the CSV.
# NOTE(review): this ordering differs from gsea_df's column order; the labels
# are assigned purely by position, so verify them against the CSV header.
pam50_labels = ["LumA", "LumB", "Her2", "Normal", "Basal" ]

# Load the table and keep only its first five columns.
wall_df = (
    pd.read_csv("../exp/adjusted_distributions_PAM50.csv", index_col = 0)
    .iloc[:, list(range(5))]
)
wall_df.columns = pam50_labels
wall_df

Unnamed: 0,LumA,LumB,Her2,Normal,Basal
RHO GTPases Activate Formins,3.963590,0.995465,1.163591,3.021927,5.422721
Factors involved in megakaryocyte development and platelet production,3.963590,1.023093,1.955168,2.881715,4.784309
RHO GTPase Effectors,3.963590,1.133937,0.803294,3.095910,4.750884
EML4 and NUDC in mitotic spindle formation,3.963590,1.287602,1.161360,3.165939,4.931946
Kinesins,3.963590,1.424705,1.461955,2.855323,4.386029
...,...,...,...,...,...
Role of ABL in ROBO-SLIT signaling,0.301409,2.976070,0.996696,1.528871,0.357934
Transcriptional regulation of granulopoiesis,0.301409,1.641689,0.365430,0.350232,0.410021
Clathrin-mediated endocytosis,0.301409,0.555611,1.115075,0.337099,0.485192
STAT5 Activation,0.301293,1.025219,0.680057,0.654833,0.357934


In [34]:
def read_reactome(file_name, gene_name_start = "ENSG0"):
    """Read a header-less, tab-separated mapping file and group it per pathway.

    Rows are grouped by column 1. For every group the values of column 0 are
    collected into a list and the maximum of column 3 is used as the group's
    name.
    NOTE(review): presumably column 0 = gene identifier, column 1 = pathway
    identifier and column 3 = pathway name (Ensembl2Reactome layout) - confirm.

    Parameters
    ----------
    file_name : str or path-like
        File passed to ``pandas.read_csv`` with ``sep='\\t'`` and no header.
    gene_name_start : str or None, default "ENSG0"
        Keep only rows whose column-0 value starts with this prefix. ``None``
        disables the filter.

    Returns
    -------
    pandas.DataFrame
        Columns ``genes`` (list of column-0 values) and ``pathway_name``,
        indexed by ``pathway_name``. The index is not guaranteed to be unique:
        two groups sharing the same name yield two rows with the same label.
    """
    df = pd.read_csv(file_name, sep='\t', header=None)

    if gene_name_start is None:
        sub_df = df
    else:
        # na=False: a missing identifier counts as "no match". Without it the
        # mask contains NaN and cannot be used for boolean indexing.
        subset_vec = df[0].str.startswith(gene_name_start, na=False)
        sub_df = df.loc[subset_vec]

    per_pathway = sub_df.groupby(1)
    genes_df = per_pathway[0].apply(list)
    names_df = per_pathway[3].max()

    out_df = pd.concat([genes_df, names_df], axis=1)
    out_df.columns = ['genes', 'pathway_name']
    out_df.index = out_df.pathway_name

    return out_df
# Gene lists per pathway, read with read_reactome's default "ENSG0" prefix filter.
reactome_ngenes = read_reactome("../data/Ensembl2Reactome_All_Levels.txt.gz")

# Map every pathway name in wall_df to the length of its gene list.
# Pathways missing from the Reactome table are reported and left out of the
# dict (so a later .map() yields NaN for them) instead of silently receiving
# the gene count of the previously processed pathway.
# NOTE(review): only wall_df's pathways are looked up here, so any frame that
# maps through length_dict gets NaN for pathways absent from wall_df.
# NOTE(review): if reactome_ngenes has repeated pathway names, .loc returns a
# Series and len() would count the repeats, not genes - confirm uniqueness.
length_dict = {}
for pathway in wall_df.index:
    if pathway not in reactome_ngenes.index:
        print(f'{pathway} not found')
        continue
    length_dict[pathway] = len(reactome_ngenes.loc[pathway, "genes"])

In [35]:
# Attach the per-pathway gene count to both result tables. Index labels that
# are not keys of length_dict end up as NaN.
for frame in (wall_df, gsea_df):
    frame["ngenes"] = frame.index.map(length_dict)

In [36]:
# Show wall_df now that the ngenes column has been attached.
wall_df

Unnamed: 0,LumA,LumB,Her2,Normal,Basal,ngenes
RHO GTPases Activate Formins,3.963590,0.995465,1.163591,3.021927,5.422721,152
Factors involved in megakaryocyte development and platelet production,3.963590,1.023093,1.955168,2.881715,4.784309,194
RHO GTPase Effectors,3.963590,1.133937,0.803294,3.095910,4.750884,392
EML4 and NUDC in mitotic spindle formation,3.963590,1.287602,1.161360,3.165939,4.931946,122
Kinesins,3.963590,1.424705,1.461955,2.855323,4.386029,75
...,...,...,...,...,...,...
Role of ABL in ROBO-SLIT signaling,0.301409,2.976070,0.996696,1.528871,0.357934,12
Transcriptional regulation of granulopoiesis,0.301409,1.641689,0.365430,0.350232,0.410021,106
Clathrin-mediated endocytosis,0.301409,0.555611,1.115075,0.337099,0.485192,146
STAT5 Activation,0.301293,1.025219,0.680057,0.654833,0.357934,7


In [37]:
# Show gsea_df now that the ngenes column has been attached (NaN where the
# pathway has no entry in length_dict).
gsea_df

Unnamed: 0,LumA,LumB,Basal,Normal,Her2,ngenes
Activation of APC/C and APC/C:Cdc20 mediated degradation of mitotic proteins,5.000000,3.717286,2.980772,5.000000,1.862473,94.0
Activation of ATR in response to replication stress,5.000000,3.720666,5.000000,5.000000,1.826592,38.0
Activation of the pre-replicative complex,5.000000,5.000000,5.000000,5.000000,2.800993,33.0
Amplification of signal from unattached kinetochores via a MAD2 inhibitory signal,5.000000,5.000000,5.000000,5.000000,2.318062,99.0
Amplification of signal from the kinetochores,5.000000,5.000000,5.000000,5.000000,2.233860,99.0
...,...,...,...,...,...,...
FGFR2c ligand binding and activation,0.001755,-0.000000,0.001019,0.055738,0.099992,13.0
Dopamine receptors,0.001112,0.509501,0.156339,0.283271,0.137842,6.0
G alpha (s) signalling events,0.001000,0.157043,0.100279,1.063548,0.036668,667.0
Activated point mutants of FGFR2,0.000802,0.032872,0.052504,0.035618,0.341935,17.0


In [38]:
# Compare, per subtype column, the gene-set sizes of pathways scoring above 3
# in wall_df (reported as "ANOVA") with those scoring above 3 in gsea_df.
comparison_df_index = [column for column in wall_df.columns if column != "ngenes"]
print(comparison_df_index)

anova_mean, anova_median, no_anova_genes = [], [], []
gsea_mean, gsea_median, no_gsea_genes = [], [], []

for subtype in comparison_df_index:
    # Gene counts of the pathways whose score exceeds 3 for this subtype.
    anova_sizes = wall_df.loc[wall_df[subtype] > 3, "ngenes"]
    gsea_sizes = gsea_df.loc[gsea_df[subtype] > 3, "ngenes"]

    # Number of selected pathways (rows with a NaN gene count are included).
    no_anova_genes.append(len(anova_sizes))
    no_gsea_genes.append(len(gsea_sizes))

    # Means are collected but not shown in the summary table below.
    anova_mean.append(anova_sizes.mean())
    gsea_mean.append(gsea_sizes.mean())

    anova_median.append(anova_sizes.median())
    gsea_median.append(gsea_sizes.median())

comparison_df = pd.DataFrame(
    {
        "ANOVA pathways": no_anova_genes,
        "GSEA pathways": no_gsea_genes,
        "ANOVA median": anova_median,
        "GSEA median": gsea_median,
    },
    index=comparison_df_index,
)
# Ratio of the median gene-set sizes, GSEA over ANOVA.
comparison_df["Fold change"] = comparison_df["GSEA median"] / comparison_df["ANOVA median"]

comparison_df

['LumA', 'LumB', 'Her2', 'Normal', 'Basal']


Unnamed: 0,ANOVA pathways,GSEA pathways,ANOVA median,GSEA median,Fold change
LumA,68,60,46.0,86.0,1.869565
LumB,40,234,18.5,95.0,5.135135
Her2,59,25,15.0,123.0,8.2
Normal,79,223,31.0,77.5,2.5
Basal,216,67,31.0,76.0,2.451613
