In [1]:
import pandas as pd
from ast import literal_eval
import numpy as np
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
from statistics import mean, median
import scipy
from sklearn.decomposition import PCA
from sklearn import preprocessing
from gprofiler import GProfiler
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
figure(num=None, figsize=(8, 6), dpi=80, facecolor='w', edgecolor='k')
import operator
import qvalue as qv

In [5]:
#Reactome file containing information on pathways, the genes they contain and pathway name, also including the illumina identifier for the genes.

reactome = pd.read_csv('../data/reactome.csv', sep=',', index_col = 0)

def read_reactome(file_name, gene_name_start = "ENSG0"):
    df = pd.read_csv(file_name, sep='\t', header=None)
    
    if gene_name_start == None:
        sub_df = df
    else:
        subset_vec = df[0].str.startswith(gene_name_start)
        sub_df = df.loc[subset_vec]
    
    genes_df = sub_df.groupby(1)[0].apply(list)
    names_df = sub_df.groupby(1)[3].max()
    
    out_df = pd.concat([genes_df,names_df], axis=1)
    out_df.columns = ['genes', 'pathway_name']
    
    return out_df

low_level = read_reactome('../data/Ensembl2Reactome_All_Levels.txt.gz')

def my_pca(df, n_pc=1, normalize=True):
    df = df.dropna(axis = 0, how = 'all')#redundant, but keeping it just in case
    X = df.values.T
    if normalize:
        X2 = preprocessing.scale(X)
    else:
        X2 = X
    pca = PCA(n_components = n_pc)
    pca.fit(X2)
    my_pca.pca = pca  #needed for components
    Xnew = pca.fit_transform(X2)
    out_df = pd.DataFrame(Xnew.transpose(), index=list(range(1,n_pc+1)), columns=df.columns)
    out_df = out_df.transpose()
    
    return out_df, my_pca.pca.components_, my_pca.pca.explained_variance_ratio_

In [8]:
#Importing metabric dataset, dividing up what is clinical/expression data and changing the type of the expression columns to float
metabric_data = pd.read_csv('../data/metabric.csv.gz', index_col = 0)
#clinical_data = metabric_data.iloc[:27, :]
expression_data = metabric_data.iloc[27:,:]

#print(expression_data.columns)
dtypedict = {}
for i in expression_data.columns[1:]:
    dtypedict[i] = 'float32'
expression_data = expression_data.astype(dtypedict)




new_clinical_patient = pd.read_csv('../data/data_clinical_patient.txt', sep='\t', index_col=0).iloc[4:]
new_clinical_sample = pd.read_csv('../data/data_clinical_sample.txt', sep='\t', index_col=0).iloc[4:]
new_clinical = pd.concat([new_clinical_patient, new_clinical_sample.reindex(new_clinical_patient.index)], axis=1)
new_clinical['Triple Neg'] = new_clinical.apply(lambda row: True if ((row['ER Status'] == 'Negative') 
                                                                     and (row['PR Status'] == 'Negative') 
                                                                     and (row['HER2 Status'] == 'Negative')) else False, axis = 1)

new_clinical['ER-/PR-/HER2+'] = new_clinical.apply(lambda row: True if ((row['ER Status'] == 'Negative') 
                                                                     and (row['PR Status'] == 'Negative') 
                                                                     and (row['HER2 Status'] == 'Positive')) else False, axis = 1)

In [9]:
benign_data = pd.read_csv('../data/normals_ExpressionMatrix.txt.gz', sep='\t')
full_expression_data = pd.concat([expression_data, benign_data.reindex(expression_data.index)], axis=1)
full_expression_data

Unnamed: 0,MB-0002,MB-0008,MB-0010,MB-0035,MB-0036,MB-0050,MB-0059,MB-0060,MB-0066,MB-0101,...,MB-1036,MB-1116,MB-1044,MB-1049,MB-1068,MB-0812,MB-1076,MB-0814,MB-1087,MB-0822
ILMN_1802380,9.01387632374539,8.050127,8.385715,9.613113,9.394619,9.441010,8.726861,8.326064,8.275620,8.631910,...,9.652738,9.613235,9.357920,9.956872,9.849354,10.032023,9.781805,10.303621,9.695399,9.637358
ILMN_1893287,5.41685458556584,5.434174,5.395648,5.206674,5.468673,5.341939,5.402681,5.544091,5.602178,5.363058,...,5.482627,5.396885,5.245873,5.399439,5.297890,5.967743,5.507933,5.712740,5.441619,5.856045
ILMN_1736104,5.30362674026384,5.436292,5.670387,5.364256,5.285834,5.244118,5.357460,5.275894,5.279491,5.377604,...,5.552097,5.245200,5.361725,5.281616,5.445025,5.543827,5.284649,5.591478,5.349918,5.392241
ILMN_1792389,5.748716893322349,5.530582,5.453305,5.339252,7.558167,7.150190,5.253296,7.498247,7.162908,5.873051,...,7.261514,7.341772,6.726245,6.416305,6.230070,6.821720,6.822758,7.703177,6.735589,6.982701
ILMN_1854015,5.66622056832095,5.864693,5.782562,5.917886,5.641804,5.614012,5.893135,5.718247,5.839617,5.680772,...,5.873535,5.824425,6.075862,6.021415,5.944849,5.728423,6.020245,5.775820,5.847800,5.893000
ILMN_1904757,5.69973518838699,5.303667,5.509027,5.264163,5.463810,5.499392,5.269590,5.389881,5.277771,5.355287,...,5.671184,5.354585,5.591863,5.709113,5.476502,5.574290,5.557302,5.394057,5.323044,5.650580
ILMN_1740305,5.47078107285433,5.745757,5.120902,5.327254,5.360535,5.493600,5.478526,5.469750,5.438557,5.538726,...,5.367487,5.318978,5.370866,5.514696,5.286187,5.156295,5.658704,5.545506,6.128307,5.306813
ILMN_1665168,5.41911900204013,5.149618,5.225599,5.458613,5.217312,5.340619,5.367339,5.397219,5.374707,5.392478,...,5.396318,5.184372,5.222495,4.988902,5.240032,5.446805,5.179444,5.436025,5.431014,4.988803
ILMN_2375156,5.6112118449427495,5.934570,6.061952,6.443704,6.627111,5.489711,5.739219,5.477843,5.764232,5.445784,...,6.028725,5.770268,6.006790,6.076963,6.075991,5.839616,6.175167,6.178403,5.952542,5.994981
ILMN_1705423,5.31267123807526,5.416290,5.307981,5.433989,5.422247,5.537166,5.430320,5.266150,5.640436,5.323372,...,5.563306,5.440046,5.631286,5.461206,5.378861,5.439201,5.361383,5.169024,5.847271,5.682983


In [10]:
genes = full_expression_data.index.tolist()

gp = GProfiler(return_dataframe = True)
gp = gp.convert(organism='hsapiens',
          query=genes)

gp = gp.loc[gp['n_converted'] == 1]
gp = gp.loc[gp['name'] != 'None']
gp = gp.set_index('incoming')
gprofiler_names = gp
gprofiler_names

Unnamed: 0_level_0,converted,n_incoming,n_converted,name,description,namespaces,query
incoming,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ILMN_1802380,ENSG00000142599,1,1,RERE,arginine-glutamic acid dipeptide repeats [Sour...,"ILLUMINA_HUMANHT_12_V3,ILLUMINA_HUMANHT_12_V4,...",query_1
ILMN_1792389,ENSG00000141622,4,1,RNF165,ring finger protein 165 [Source:HGNC Symbol;Ac...,"ILLUMINA_HUMANHT_12_V3,ILLUMINA_HUMANHT_12_V4,...",query_1
ILMN_1740305,ENSG00000008226,7,1,DLEC1,DLEC1 cilia and flagella associated protein [S...,"ILLUMINA_HUMANHT_12_V3,ILLUMINA_HUMANHT_12_V4,...",query_1
ILMN_1665168,ENSG00000224368,8,1,ARAFP2,ARAF pseudogene 2 [Source:HGNC Symbol;Acc:HGNC...,"ILLUMINA_HUMANHT_12_V3,ILLUMINA_HUMANHT_12_V4,...",query_1
ILMN_2375156,ENSG00000010318,9,1,PHF7,PHD finger protein 7 [Source:HGNC Symbol;Acc:H...,"ILLUMINA_HUMANHT_12_V3,ILLUMINA_HUMANHT_12_V4,...",query_1
ILMN_1705423,ENSG00000266733,10,1,TBC1D29P,"TBC1 domain family member 29, pseudogene [Sour...","ILLUMINA_HUMANHT_12_V3,ILLUMINA_HUMANHT_12_V4,...",query_1
ILMN_1716072,ENSG00000230368,11,1,FAM41C,family with sequence similarity 41 member C [S...,"ILLUMINA_HUMANHT_12_V3,ILLUMINA_HUMANHT_12_V4,...",query_1
ILMN_1697642,ENSG00000075790,12,1,BCAP29,B cell receptor associated protein 29 [Source:...,"ILLUMINA_HUMANHT_12_V3,ILLUMINA_HUMANHT_12_V4,...",query_1
ILMN_1788184,ENSG00000176194,13,1,CIDEA,cell death inducing DFFA like effector a [Sour...,"ILLUMINA_HUMANHT_12_V3,ILLUMINA_HUMANHT_12_V4,...",query_1
ILMN_1681845,ENSG00000164329,14,1,TENT2,terminal nucleotidyltransferase 2 [Source:HGNC...,"ILLUMINA_HUMANHT_12_V3,ILLUMINA_HUMANHT_12_V4,...",query_1


In [11]:
#dataset = expression_data.set_index('Unnamed: 0') #gene_patient
pca_per_pathway = pd.DataFrame(index=full_expression_data.columns)

real_gene_names = pd.read_csv('../data/illumina2symbol.txt', sep="\t", index_col = 0)



genes_components_per_pathway = {} #nested dictionary where the 'outer dictionary' is the pathway names as keys and values are 
                                  #another dictionary with genes as keys and components as values

for pathway in reactome.index:
    genes = reactome.loc[pathway, "illumina"]
    genes = literal_eval(genes)
    genes = list(filter(lambda a: a != 'NaN', genes))
    pathwaydata = full_expression_data.loc[genes]
    if pathwaydata.index.empty == True:
        pass
    else:
        pathwaydata = pathwaydata.dropna(axis = 0, how = 'any') #has to be done so the lists match, this makes the dropna in my_pca function obsolete
        presentgenes = pathwaydata.index.values.tolist()
        if len(presentgenes) <= 1:
            pass
        else:
            res, components, explained_variance = my_pca(pathwaydata)
            pathwayname = reactome.loc[pathway, 'pathway_name']
            pca_per_pathway[pathwayname] = res

            components = components.tolist()[0]
            innerdict = {}
            for i in range(0, len(presentgenes)):
                component = components[i]
                gene = genes[i]
                if gene in real_gene_names.index:
                    real_name = real_gene_names.loc[gene, "symbol"]
                    innerdict[real_name] = component
                elif gene in gprofiler_names.index:
                    real_name = gprofiler_names.loc[gene, 'name']
                    innerdict[real_name] = component
                else:
                    innerdict[gene] = component
            sorted_innerdict = sorted(innerdict.items(), key = operator.itemgetter(1), reverse = True)
            genes_components_per_pathway[pathwayname] = [sorted_innerdict, explained_variance.flat[0]]

pca_per_pathway = pca_per_pathway.iloc[1:]

In [12]:
full_df = pd.concat([pca_per_pathway, new_clinical.reindex(pca_per_pathway.index)], axis=1)
full_df

Unnamed: 0,Interleukin-6 signaling,Apoptosis,Hemostasis,Intrinsic Pathway for Apoptosis,PKB-mediated events,PI3K Cascade,MAPK3 (ERK1) activation,Translesion synthesis by REV1,Translesion synthesis by Y family DNA polymerases bypasses lesions on DNA template,Recognition of DNA damage by PCNA-containing replication complex,...,ER Status,HER2 Status,Neoplasm Histologic Grade,Oncotree Code,PR Status,Sample Type,Tumor Size,Tumor Stage,Triple Neg,ER-/PR-/HER2+
MB-0008,0.714634,-5.925668,-1.503017,-3.268677,2.139144,0.145683,-0.065017,-0.194310,0.948032,1.107915,...,Positive,Negative,3,MDLC,Positive,Primary,40,2,False,False
MB-0010,-0.607902,-2.703663,-3.985560,-1.946897,-1.031664,-2.515123,-0.106590,-1.416732,1.831071,2.752198,...,Positive,Negative,3,IDC,Positive,Primary,31,4,False,False
MB-0035,0.030608,2.745044,-12.806355,-1.210434,1.153760,-1.670136,-0.026609,1.142965,-0.249931,-0.038749,...,Positive,Negative,2,ILC,Negative,Primary,28,2,False,False
MB-0036,-0.790183,1.231608,-4.430817,1.285830,0.066241,2.062133,-0.463225,0.387269,-1.128038,-0.915454,...,Positive,Negative,2,IDC,Positive,Primary,22,4,False,False
MB-0050,0.632033,2.351741,4.930531,0.032335,-0.411730,1.844175,1.179471,0.709233,-1.667267,-1.518911,...,Positive,Negative,2,MDLC,Positive,Primary,33,2,False,False
MB-0059,-0.382658,0.023489,-6.737842,-1.290889,-0.241835,-0.345010,-0.422671,-0.141870,0.261318,-0.391523,...,Positive,Negative,3,IDC,Positive,Primary,17,1,False,False
MB-0060,0.052539,-13.796125,-5.467725,-3.010164,-1.024481,-2.335429,0.731909,-3.917570,5.901842,6.402576,...,Positive,Negative,3,IDC,Positive,Primary,23,2,False,False
MB-0066,-0.532461,-6.928989,-1.518903,-1.588128,-0.141626,-0.882887,0.340864,-0.146421,0.799507,1.549202,...,Positive,Negative,2,IDC,Positive,Primary,16,2,False,False
MB-0101,0.463791,-2.566320,3.008907,0.754153,0.411984,-0.077724,0.516885,0.749916,-1.567490,-1.662106,...,Positive,Negative,2,ILC,Positive,Primary,34,2,False,False
MB-0102,0.213588,-5.449268,-0.370783,-1.993286,0.240742,-0.892886,1.234175,-0.470323,2.435477,2.233675,...,Positive,Negative,2,ILC,Positive,Primary,40,2,False,False


In [13]:
full_df['Integrative Cluster'] = full_df['Integrative Cluster'].fillna('11')

In [25]:
from scipy.stats import ttest_ind, mannwhitneyu
import qvalue as qv

clusterframes = {}

grouped_by_cluster = full_df.groupby('Integrative Cluster')

for group in grouped_by_cluster:
    
    groupname = group[0]
    df_cluster = pd.DataFrame(index=full_df.iloc[:,:-33].columns)
    #print(df_cluster.shape)
    group2_df = full_df[full_df['Integrative Cluster'] == '11'].iloc[:,:-33]
    df = group[1].iloc[:,:-33]
    print(df.shape)
    pvaluelist = []
    group1_mean_list = []
    group2_mean_list = []
    for pathway in df:
        group = df[pathway]
        group2 = group2_df[pathway]
        test = mannwhitneyu(group, group2)
        pvaluelist.append(test[1])
        group_mean = group.mean()
        group1_mean_list.append(group_mean)
        group2_mean = group2.mean()
        group2_mean_list.append(group2_mean) 
        
    
    df_cluster[f'Cluster {groupname}'] = group1_mean_list
    df_cluster['Other clusters'] = group2_mean_list
    df_cluster['Fold Change'] = np.log2(abs(df_cluster[f'Cluster {groupname}'])) - np.log2(abs(df_cluster['Other clusters']))
    df_cluster['p-vals'] = pvaluelist
    qv.qvalues(df_cluster, 'p-vals', f'cluster {groupname} qvalues')
    df_cluster['p-vals'] = -np.log10(df_cluster['p-vals'])
    df_cluster[f'cluster {groupname} qvalues'] = -np.log10(df_cluster[f'cluster {groupname} qvalues'])
    print(groupname)
    clusterframes[groupname] = df_cluster

(139, 2074)
1
(226, 2074)
10
(156, 2074)
11
(72, 2074)
2
(290, 2074)
3
(259, 2074)
4ER+
(83, 2074)
4ER-
(190, 2074)
5
(85, 2074)
6
(190, 2074)
7
(299, 2074)
8
(146, 2074)
9


In [18]:
clusterframes['5']

Unnamed: 0,Cluster 5,Other clusters,Fold Change,p-vals,cluster 5 qvalues
Cytosolic tRNA aminoacylation,1.918395,-4.125824,-1.104782,54.741739,51.903068
Golgi-to-ER retrograde transport,-2.698640,8.060241,-1.578590,54.601826,51.903068
Caspase activation via Dependence Receptors in the absence of ligand,-1.318337,2.486778,-0.915559,54.557681,51.903068
Regulation of PLK1 Activity at G2/M Transition,2.583399,-6.335938,-1.294288,54.535615,51.903068
Recruitment of NuMA to mitotic centrosomes,2.420407,-5.963882,-1.301002,54.520907,51.903068
Metabolism of folate and pterines,-1.300699,3.065660,-1.236911,54.337227,51.798569
AURKA Activation by TPX2,2.468980,-5.982817,-1.276910,54.183179,51.711469
COPI-dependent Golgi-to-ER retrograde traffic,2.571705,-7.322687,-1.509648,53.978129,51.564410
Loss of proteins required for interphase microtubule organization from the centrosome,2.251683,-5.318755,-1.240085,53.846520,51.529711
Loss of Nlp from mitotic centrosomes,2.251683,-5.318755,-1.240085,53.846520,51.529711


In [23]:
full_df[full_df['Integrative Cluster'] == '11']

Unnamed: 0,Interleukin-6 signaling,Apoptosis,Hemostasis,Intrinsic Pathway for Apoptosis,PKB-mediated events,PI3K Cascade,MAPK3 (ERK1) activation,Translesion synthesis by REV1,Translesion synthesis by Y family DNA polymerases bypasses lesions on DNA template,Recognition of DNA damage by PCNA-containing replication complex,...,ER Status,HER2 Status,Neoplasm Histologic Grade,Oncotree Code,PR Status,Sample Type,Tumor Size,Tumor Stage,Triple Neg,ER-/PR-/HER2+
MB-0025,0.708607,-14.107620,2.757920,-7.571628,0.246042,1.256408,1.452613,-3.518814,3.942836,5.485936,...,Positive,,3,IDC,,Primary,34,2,False,False
MB-0196,-0.716803,-1.887594,2.216235,-0.592780,-0.236070,1.582486,0.154729,-0.189662,-0.401112,-0.371287,...,,,,,,,,,,
MB-0326,1.278919,-2.040867,4.571909,-0.290515,-0.330894,-1.137399,1.621470,0.540122,-0.517956,-0.406349,...,Positive,,2,,,Primary,12,2,False,False
MB-0329,-0.929492,-1.840400,-2.517941,-2.134142,1.439821,-1.376312,-0.114939,0.969602,0.246600,-0.967212,...,Positive,,3,IDC,,Primary,30,2,False,False
MB-0330,0.174259,-12.047208,8.036728,-2.787038,-0.749373,-0.904634,0.692966,-2.609978,3.881413,1.903682,...,,,,,,,,,,
MB-0335,-1.514548,-2.164197,-9.624805,0.573039,-1.193133,-2.024887,-2.393001,-1.266100,1.438712,0.890464,...,,,,,,,,,,
MB-0355,-0.823401,1.815399,-7.621415,-0.470540,1.527815,-0.104104,-0.172485,0.261938,-1.393628,-1.459191,...,,,,,,,,,,
MB-0407,-0.523120,3.648936,-2.536624,-0.484189,0.718510,-1.939768,-0.856862,0.796828,-0.933508,-1.047953,...,,,,,,,,,,
MB-0433,0.994508,4.392758,2.665575,2.825533,-0.275470,0.277858,0.517937,1.495849,-1.893269,-2.046202,...,,,,,,,,,,
MB-0547,-0.886823,-6.393473,2.087781,-0.048183,-0.792299,-0.322035,-0.314982,-0.770209,0.605042,0.806793,...,Positive,,2,,,Primary,25,2,False,False


In [20]:
full_clusterframe = pd.DataFrame(index=clusterframes['1'].index)
for i in clusterframes:
    series = clusterframes[i][f'cluster {i} qvalues']
    full_clusterframe = pd.concat([full_clusterframe, series.reindex(full_clusterframe.index)], axis=1)

In [21]:
full_clusterframe.to_csv("../exp/normal_vs_cancer_qvalues.csv")