# BIOINF 575 - Group Project - Kegg Pathway Overlap
#### Authors:
* Ryan Rebernick
* Elysia Chou
* Mahnoor Gondal
* Tusharika Rastogi


## README
This is the space where I (Elysia) clean and merge the code.

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Base directory for every intermediate file this notebook writes.
# Adjust it once here to match the machine the code is running on; the file
# names used below are appended to it directly, so keep the trailing slash.
path = '../../int/'

## 1. Map/merge the information by PATHWAY_ID and GENE_ID. 

#### First we read in the data and store in dataframes for easy access.

In [3]:

# Pathway ids and their descriptive names
pathway_pinfo = pd.read_table('http://rest.kegg.jp/list/pathway/hsa', header=None)
pathway_pinfo.columns = ['PATHWAY_ID', 'PATHWAY_INFO']

# Gene-to-pathway links (one row per gene/pathway pair)
geneid_pathway = pd.read_table('http://rest.kegg.jp/link/pathway/hsa', header=None)
geneid_pathway.columns = ['GENE_ID', 'PATHWAY_ID']

# Gene ids and their descriptive info
geneid_ginfo = pd.read_table('http://rest.kegg.jp/list/hsa', header=None)
geneid_ginfo.columns = ['GENE_ID', 'GENE_INFO']

# Preview each table: first rows, then (rows, columns)
for kegg_table in (pathway_pinfo, geneid_pathway, geneid_ginfo):
    print(kegg_table.head())
    print(kegg_table.shape)


      PATHWAY_ID                                       PATHWAY_INFO
0  path:hsa00010  Glycolysis / Gluconeogenesis - Homo sapiens (h...
1  path:hsa00020   Citrate cycle (TCA cycle) - Homo sapiens (human)
2  path:hsa00030   Pentose phosphate pathway - Homo sapiens (human)
3  path:hsa00040  Pentose and glucuronate interconversions - Hom...
4  path:hsa00051  Fructose and mannose metabolism - Homo sapiens...
(345, 2)
     GENE_ID     PATHWAY_ID
0  hsa:10327  path:hsa00010
1    hsa:124  path:hsa00010
2    hsa:125  path:hsa00010
3    hsa:126  path:hsa00010
4    hsa:127  path:hsa00010
(35370, 2)
         GENE_ID                                          GENE_INFO
0  hsa:100423038                  MIR466, hsa-mir-466; microRNA 466
1  hsa:100616498                   MIR378E, mir-378e; microRNA 378e
2     hsa:442913  MIR376C, MIR368, MIRN368, MIRN376C, hsa-mir-36...
3     hsa:574461                   MIR520E, MIRN520E; microRNA 520e
4  hsa:100616159                             MIR4779; microRNA 4

#### Next we need to merge the information by pathway ids and gene ids

Note that in merging, any gene without an entry in the geneid_ginfo table is left with NaN as its GENE_INFO. Such genes are printed below for reference; with the data retrieved here there are none, so the printed dataframes are empty.

In [8]:
# Pathway info joined to its genes (outer join on the pathway id), then the
# gene info attached to every gene (left join on the gene id), sorted by pathway.
merged = (
    pathway_pinfo
    .merge(geneid_pathway, how='outer', on='PATHWAY_ID')
    .merge(geneid_ginfo, how='left', on='GENE_ID')
    .sort_values('PATHWAY_ID')
)
merged

Unnamed: 0,PATHWAY_ID,PATHWAY_INFO,GENE_ID,GENE_INFO
0,path:hsa00010,Glycolysis / Gluconeogenesis - Homo sapiens (h...,hsa:10327,"AKR1A1, ALDR1, ALR, ARM, DD3, HEL-S-6; aldo-ke..."
36,path:hsa00010,Glycolysis / Gluconeogenesis - Homo sapiens (h...,hsa:3945,"LDHB, HEL-S-281, LDH-B, LDH-H, LDHBD, TRG-5; l..."
37,path:hsa00010,Glycolysis / Gluconeogenesis - Homo sapiens (h...,hsa:3948,"LDHC, CT32, LDH3, LDHX; lactate dehydrogenase C"
38,path:hsa00010,Glycolysis / Gluconeogenesis - Homo sapiens (h...,hsa:441531,"PGAM4, PGAM-B, PGAM1, PGAM3, dJ1000K24.1; phos..."
39,path:hsa00010,Glycolysis / Gluconeogenesis - Homo sapiens (h...,hsa:501,"ALDH7A1, ATQ1, EPD, PDE; aldehyde dehydrogenas..."
...,...,...,...,...
35277,path:hsa05418,Fluid shear stress and atherosclerosis - Homo ...,hsa:387,"RHOA, ARH12, ARHA, EDFAOB, RHO12, RHOH12; ras ..."
35278,path:hsa05418,Fluid shear stress and atherosclerosis - Homo ...,hsa:387082,"SUMO4, IDDM5, SMT3H4, SUMO-4, dJ281H8.4; small..."
35279,path:hsa05418,Fluid shear stress and atherosclerosis - Homo ...,hsa:406902,"MIR10A, MIRN10A, hsa-mir-10a, miRNA10A, mir-10..."
35273,path:hsa05418,Fluid shear stress and atherosclerosis - Homo ...,hsa:3685,"ITGAV, CD51, MSK8, VNRA, VTNR; integrin subuni..."


In [11]:
# Number of genes whose GENE_INFO entry is missing
int(geneid_ginfo['GENE_INFO'].isnull().sum())

0

In [10]:
# Show the rows of the gene table whose GENE_INFO entry is missing
missing_info_mask = geneid_ginfo['GENE_INFO'].isnull()
print(geneid_ginfo[missing_info_mask])

Empty DataFrame
Columns: [GENE_ID, GENE_INFO]
Index: []


In [12]:

# Build the merged table in one chain:
#   1. pathway info + gene ids  (outer join on PATHWAY_ID)
#   2. + gene info              (left join on GENE_ID)
#   3. ordered by pathway
merged = (
    pathway_pinfo
    .merge(geneid_pathway, how='outer', on='PATHWAY_ID')
    .merge(geneid_ginfo, how='left', on='GENE_ID')
    .sort_values('PATHWAY_ID')
)

# Show the rows whose gene has no GENE_INFO after the merge
gene_info_missing = merged['GENE_INFO'].isnull()
print(merged[gene_info_missing])



Empty DataFrame
Columns: [PATHWAY_ID, PATHWAY_INFO, GENE_ID, GENE_INFO]
Index: []


#### Save this file for future use

In [None]:

# Write the merged table as a comma-separated file without the row index
# (the comma separator and overwrite mode are the to_csv defaults).
merged.to_csv(path + '2.1_merged_genes_pathways.csv', index=False)
    

## 2. Compute the number of overlapping genes between every 2 pathways.

Our goal is to create a dataframe with 4 columns: pathway 1, pathway 2, the number of overlapping genes, and which genes overlap.
To do this we will:

* create a dictionary of each pathway to a set of its genes.
* Then we will loop through each and compare it to the remaining pathways.
* The output of this will be exported as a .csv file

#### First we read in our data that we previously generated
If running separate scripts or picking up where you left off.

In [None]:
# Run this if you are starting from this step
# merged = pd.read_table(path + '2.1_merged_genes_pathways.csv', sep=',')
# merged.head()

#### Next we will create a dictionary that stores each pathway together with the set of that pathway's genes.


In [13]:

# all the unique pathways (np.unique returns them sorted)
unique_pathways = np.unique(merged['PATHWAY_ID'])
print(unique_pathways[:2])

# dictionary mapping each pathway to the set of its genes; every pathway
# starts with an empty set so that a pathway without any linked gene still
# has an entry
pathway_genes = {pw: set() for pw in unique_pathways}

# fill the sets in a single pass over the rows, instead of re-filtering the
# whole dataframe once per pathway
for pw, gene in zip(merged['PATHWAY_ID'], merged['GENE_ID']):
    # the outer merge leaves GENE_ID as NaN for a pathway with no genes;
    # NaN is not a gene, so it is kept out of the set
    if pd.notna(gene):
        pathway_genes[pw].add(gene)

# look at the first dictionary entry
print('\n')
print(list(pathway_genes.items())[:1])
print()

['path:hsa00010' 'path:hsa00020']


[('path:hsa00010', {'hsa:441531', 'hsa:80201', 'hsa:84532', 'hsa:57818', 'hsa:221', 'hsa:5223', 'hsa:55902', 'hsa:2203', 'hsa:2538', 'hsa:9562', 'hsa:83440', 'hsa:160287', 'hsa:5230', 'hsa:1738', 'hsa:5161', 'hsa:2821', 'hsa:3098', 'hsa:130589', 'hsa:2023', 'hsa:92483', 'hsa:5214', 'hsa:229', 'hsa:5162', 'hsa:3939', 'hsa:3945', 'hsa:8789', 'hsa:219', 'hsa:5106', 'hsa:128', 'hsa:26330', 'hsa:387712', 'hsa:5236', 'hsa:222', 'hsa:226', 'hsa:130', 'hsa:1737', 'hsa:131', 'hsa:5213', 'hsa:217', 'hsa:5105', 'hsa:2027', 'hsa:5224', 'hsa:5211', 'hsa:126', 'hsa:125', 'hsa:501', 'hsa:92579', 'hsa:7167', 'hsa:127', 'hsa:218', 'hsa:3099', 'hsa:5313', 'hsa:669', 'hsa:230', 'hsa:55276', 'hsa:223', 'hsa:5315', 'hsa:10327', 'hsa:3948', 'hsa:5232', 'hsa:2645', 'hsa:224', 'hsa:2597', 'hsa:124', 'hsa:5160', 'hsa:2026', 'hsa:3101'})]



#### Next we will loop through each pathway and crossref the overlapping genes with all the other pathways. The number of crossovers will be stored in a dataframe.

In [14]:

# One record per unordered pair of distinct pathways:
# (first pathway, second pathway, number of shared genes, the shared genes)
overlap_records = []

for position, first_pathway in enumerate(unique_pathways):

    # genes of the first pathway of the pair
    first_genes = pathway_genes.get(first_pathway)

    # Only pathways located after the current one are paired with it, so no
    # pair shows up twice and no pathway is paired with itself.
    for second_pathway in unique_pathways[position + 1:]:

        # genes of the second pathway of the pair
        second_genes = pathway_genes.get(second_pathway)

        # genes present in both pathways
        shared_genes = first_genes.intersection(second_genes)

        overlap_records.append(
            (first_pathway, second_pathway, len(shared_genes), shared_genes)
        )

# turn the records into a dataframe
overlap_columns = ['PATHWAY_ID1', 'PATHWAY_ID2', 'NUM_OVERLAPPING_GENES', 'OVERLAPPING_GENES']
pathway_gene_overlaps = pd.DataFrame(overlap_records, columns=overlap_columns)

# report the size and look at the first entries
print(pathway_gene_overlaps.shape)
pathway_gene_overlaps[:10]

(59340, 4)


Unnamed: 0,PATHWAY_ID1,PATHWAY_ID2,NUM_OVERLAPPING_GENES,OVERLAPPING_GENES
0,path:hsa00010,path:hsa00020,7,"{hsa:1737, hsa:5162, hsa:5105, hsa:5160, hsa:1..."
1,path:hsa00010,path:hsa00030,11,"{hsa:55276, hsa:5236, hsa:226, hsa:5214, hsa:2..."
2,path:hsa00010,path:hsa00040,1,{hsa:10327}
3,path:hsa00010,path:hsa00051,13,"{hsa:80201, hsa:226, hsa:5214, hsa:229, hsa:52..."
4,path:hsa00010,path:hsa00052,14,"{hsa:55276, hsa:5236, hsa:80201, hsa:57818, hs..."
5,path:hsa00010,path:hsa00053,6,"{hsa:223, hsa:217, hsa:10327, hsa:501, hsa:224..."
6,path:hsa00010,path:hsa00061,0,{}
7,path:hsa00010,path:hsa00062,0,{}
8,path:hsa00010,path:hsa00071,12,"{hsa:223, hsa:130, hsa:131, hsa:126, hsa:217, ..."
9,path:hsa00010,path:hsa00100,0,{}


#### Save this file for future use

In [None]:
# Write the pairwise overlaps as a comma-separated file without the row index
# (the comma separator and overwrite mode are the to_csv defaults).
pathway_gene_overlaps.to_csv(path + '2.2_overlapping_genes.csv', index=False)

## 3. Save the result to a file KEGG_crosstalk.csv 
This file should be saved with the following columns: PATHWAY_ID1, PATHWAY_NAME1, PATHWAY_ID2, PATHWAY_NAME2. Order the results descending by the number of overlapping genes where PATHWAY_ID1 is different than PATHWAY_ID2.

In [None]:
# The following cell is redundant in the merged file.

# Extract the unique pathway ids and their info from the merged dataframe.
# The merged dataframe names these columns in upper case, so they are selected
# by those names directly (the lower-case names raise a KeyError) and no
# renaming is needed afterwards.
Data = merged[['PATHWAY_ID', 'PATHWAY_INFO']]
Data = Data.drop_duplicates(subset='PATHWAY_ID', keep='first')

# look at the first entries
print(np.shape(Data))
Data[:10]

In [15]:
# Attach the pathway info of the first pathway of each pair, looked up in the
# original pathway_pinfo table obtained from KEGG (see step 1).
Combined_pathways1 = pathway_gene_overlaps.merge(
    pathway_pinfo,
    left_on='PATHWAY_ID1',
    right_on='PATHWAY_ID',
)

# report the size and look at a slice of rows further down the table
print(Combined_pathways1.shape)
Combined_pathways1[346:700]

(59340, 6)


Unnamed: 0,PATHWAY_ID1,PATHWAY_ID2,NUM_OVERLAPPING_GENES,OVERLAPPING_GENES,PATHWAY_ID,PATHWAY_INFO
346,path:hsa00020,path:hsa00051,0,{},path:hsa00020,Citrate cycle (TCA cycle) - Homo sapiens (human)
347,path:hsa00020,path:hsa00052,0,{},path:hsa00020,Citrate cycle (TCA cycle) - Homo sapiens (human)
348,path:hsa00020,path:hsa00053,0,{},path:hsa00020,Citrate cycle (TCA cycle) - Homo sapiens (human)
349,path:hsa00020,path:hsa00061,0,{},path:hsa00020,Citrate cycle (TCA cycle) - Homo sapiens (human)
350,path:hsa00020,path:hsa00062,0,{},path:hsa00020,Citrate cycle (TCA cycle) - Homo sapiens (human)
...,...,...,...,...,...,...
695,path:hsa00030,path:hsa00120,0,{},path:hsa00030,Pentose phosphate pathway - Homo sapiens (human)
696,path:hsa00030,path:hsa00130,0,{},path:hsa00030,Pentose phosphate pathway - Homo sapiens (human)
697,path:hsa00030,path:hsa00140,0,{},path:hsa00030,Pentose phosphate pathway - Homo sapiens (human)
698,path:hsa00030,path:hsa00190,0,{},path:hsa00030,Pentose phosphate pathway - Homo sapiens (human)


In [16]:
# Attach the pathway info of the second pathway of each pair. Both inputs
# carry PATHWAY_ID and PATHWAY_INFO columns, so the result holds each of them
# twice, told apart by the _x (first pathway) and _y (second pathway) suffixes.
Combined_pathways2 = Combined_pathways1.merge(
    pathway_pinfo,
    left_on="PATHWAY_ID2",
    right_on="PATHWAY_ID",
)

In [19]:
# DELETE LATER
# print(Combined_pathways2.head())

In [18]:
# Tidy the doubly-merged table: drop the duplicated id columns, give the two
# info columns their final names, and put the columns in the output order.
final_names = {'PATHWAY_INFO_x': 'PATHWAY_NAME1', 'PATHWAY_INFO_y': 'PATHWAY_NAME2'}
final_order = ["PATHWAY_ID1", "PATHWAY_NAME1", "PATHWAY_ID2", "PATHWAY_NAME2", "NUM_OVERLAPPING_GENES", "OVERLAPPING_GENES"]
Combined_pathways2 = (
    Combined_pathways2
    .drop(columns=["PATHWAY_ID_x", "PATHWAY_ID_y"])
    .rename(columns=final_names)
    [final_order]
)

# Pairs sharing the most genes come first
Updated_data = Combined_pathways2.sort_values('NUM_OVERLAPPING_GENES', ascending=False)

# report the size and look at the first entries
print(Updated_data.shape)
Updated_data[:10]

(59340, 6)


Unnamed: 0,PATHWAY_ID1,PATHWAY_NAME1,PATHWAY_ID2,PATHWAY_NAME2,NUM_OVERLAPPING_GENES,OVERLAPPING_GENES
35772,path:hsa05010,Alzheimer disease - Homo sapiens (human),path:hsa05022,Pathways of neurodegeneration - multiple disea...,333,"{hsa:7381, hsa:4709, hsa:7478, hsa:113457, hsa..."
35774,path:hsa05014,Amyotrophic lateral sclerosis - Homo sapiens (...,path:hsa05022,Pathways of neurodegeneration - multiple disea...,294,"{hsa:2878, hsa:55860, hsa:6389, hsa:196385, hs..."
35775,path:hsa05016,Huntington disease - Homo sapiens (human),path:hsa05022,Pathways of neurodegeneration - multiple disea...,251,"{hsa:2878, hsa:6389, hsa:196385, hsa:55860, hs..."
35773,path:hsa05012,Parkinson disease - Homo sapiens (human),path:hsa05022,Pathways of neurodegeneration - multiple disea...,228,"{hsa:6389, hsa:4514, hsa:810, hsa:7311, hsa:73..."
34979,path:hsa05014,Amyotrophic lateral sclerosis - Homo sapiens (...,path:hsa05016,Huntington disease - Homo sapiens (human),224,"{hsa:2878, hsa:6389, hsa:196385, hsa:55860, hs..."
35777,path:hsa05020,Prion disease - Homo sapiens (human),path:hsa05022,Pathways of neurodegeneration - multiple disea...,220,"{hsa:6389, hsa:4514, hsa:7381, hsa:4709, hsa:1..."
35506,path:hsa05010,Alzheimer disease - Homo sapiens (human),path:hsa05020,Prion disease - Homo sapiens (human),214,"{hsa:6389, hsa:4514, hsa:7381, hsa:4709, hsa:5..."
34452,path:hsa05010,Alzheimer disease - Homo sapiens (human),path:hsa05012,Parkinson disease - Homo sapiens (human),208,"{hsa:6389, hsa:4514, hsa:810, hsa:7381, hsa:47..."
34714,path:hsa05010,Alzheimer disease - Homo sapiens (human),path:hsa05014,Amyotrophic lateral sclerosis - Homo sapiens (...,206,"{hsa:6389, hsa:4514, hsa:7381, hsa:4709, hsa:1..."
34977,path:hsa05010,Alzheimer disease - Homo sapiens (human),path:hsa05016,Huntington disease - Homo sapiens (human),202,"{hsa:6389, hsa:4514, hsa:7381, hsa:4709, hsa:1..."


In [None]:
# Save the result to a file KEGG_crosstalk.csv with the following columns:
# PATHWAY_ID1, PATHWAY_NAME1, PATHWAY_ID2, PATHWAY_NAME2,
# followed by NUM_OVERLAPPING_GENES and OVERLAPPING_GENES.
# index=False keeps the row index out of the file: after sorting it is only
# the old row position and would otherwise appear as an extra unnamed column
# (the other files saved by this notebook omit it as well).
Updated_data.to_csv(path + '2.3_KEGG_crosstalk.csv', index=False)

## 4. Compute a rank of the genes based on how many pathways they appear on and save it to a file.

In [None]:
#Function to extract gene_description
def get_description(info):
    """Return the description part of a gene-info string.

    The description is the text after the last ';' in ``info``, with
    surrounding whitespace removed, e.g.
    ``'MIR466, hsa-mir-466; microRNA 466'`` gives ``'microRNA 466'``.
    When ``info`` contains no ';' the whole (stripped) string is returned.

    Parameters
    ----------
    info : str or missing value
        One gene-info entry, or a missing value such as NaN or None.

    Returns
    -------
    str
        The description, or "" when ``info`` is not a string.
    """
    # Missing entries arrive as NaN (a float) or None; anything that is not
    # text has no description to extract.
    if not isinstance(info, str):
        return ""
    # Only the last ';' matters; strip the blank that follows the separator.
    return info.rsplit(";", 1)[-1].strip()
    

In [None]:
# Apply the function to extract the gene description and add it to the data.
# The gene info column of the merged dataframe is named 'GENE_INFO' (upper
# case); the lower-case name raises a KeyError.
merged['gene_description'] = merged['GENE_INFO'].apply(get_description)

In [None]:
# Use the gene id as the description for genes whose description is empty
# (i.e. genes for which no gene info was available).
# A single .loc assignment is used instead of chained indexing
# (merged.gene_description[mask] = ...), which may write to a temporary copy
# and leave the dataframe unchanged.
empty_description = merged['gene_description'] == ""
merged.loc[empty_description, 'gene_description'] = merged.loc[empty_description, 'GENE_ID']

# look at the first entries
print(np.shape(merged))
merged[:10]

In [None]:
# Count, for every gene description, the rows that carry a pathway id, order
# the counts from largest to smallest, and keep them in a one-column dataframe
# whose column is called gene_rank.
gene_rank = (
    merged
    .groupby('gene_description')['PATHWAY_ID']
    .count()
    .sort_values(ascending=False)
    .to_frame()
    .rename(columns={'PATHWAY_ID': 'gene_rank'})
)

# report the size and look at the first entries
print(gene_rank.shape)
gene_rank[:10]

In [None]:
# Save the per-gene pathway counts to a file. The index is written (the
# to_csv default) because it holds the gene_description labels that identify
# each row of gene_rank.
gene_rank.to_csv(path + '2.4_Gene_Rank.csv')

## 5. Retrieve a set of the pathways the top 3 genes appear on.

## 6. Compute and display a Venn diagram for number of overlapping pathways for the top 3 genes.