# BIOINF 575 - Group Project - Kegg Pathway Overlap
#### Authors:
* Ryan Rebernick
* Elysia Chou
* Mahnoor Gondal
* Tusharika Rastogi


In [1]:
import numpy as np
import pandas as pd

## 2. Compute the number of overlapping genes between every 2 pathways.

Our goal is to create a dataframe with 4 columns: pathway 1, pathway 2, the number of overlapping genes, and which genes overlap.
To do this we will:

* Create a dictionary that maps each pathway to the set of its genes.
* Loop through each pathway and compare it to every remaining pathway, so that each pair is compared exactly once.
* Export the result as a .csv file.

#### First we read in our data that we previously generated

In [2]:

# load the merged gene/pathway table produced in the previous step
# (comma-separated; NOTE: absolute local path -- adjust it for your machine)
merged_path = '/Users/ryanrebernick/Desktop/projects/kegg_bioinf575/int/2.1_merged_genes_pathways.csv'
merged = pd.read_csv(merged_path)
merged.head()


Unnamed: 0,pathway_id,pathway_info,gene_id,gene_info
0,path:hsa00010,Glycolysis / Gluconeogenesis - Homo sapiens (h...,hsa:10327,"AKR1A1, ALDR1, ALR, ARM, DD3, HEL-S-6; aldo-ke..."
1,path:hsa00010,Glycolysis / Gluconeogenesis - Homo sapiens (h...,hsa:3945,"LDHB, HEL-S-281, LDH-B, LDH-H, LDHBD, TRG-5; l..."
2,path:hsa00010,Glycolysis / Gluconeogenesis - Homo sapiens (h...,hsa:3948,"LDHC, CT32, LDH3, LDHX; lactate dehydrogenase C"
3,path:hsa00010,Glycolysis / Gluconeogenesis - Homo sapiens (h...,hsa:441531,"PGAM4, PGAM-B, PGAM1, PGAM3, dJ1000K24.1; phos..."
4,path:hsa00010,Glycolysis / Gluconeogenesis - Homo sapiens (h...,hsa:501,"ALDH7A1, ATQ1, EPD, PDE; aldehyde dehydrogenas..."


#### Next we will create a dictionary that maps each pathway to the set of its genes.


In [3]:

# all the unique pathways (sorted array; reused by the pairwise comparison below)
unique_pathways = np.unique(merged['pathway_id'])
print(unique_pathways[:2])

# dictionary mapping each pathway id to the set of its gene ids.
# A single groupby pass splits the rows by pathway once, instead of
# re-filtering the whole dataframe with a boolean mask for every pathway.
pathway_genes = {
    pw: set(genes)
    for pw, genes in merged.groupby('pathway_id')['gene_id']
}

# look at the first dictionary entry
print('\n')
print(list(pathway_genes.items())[:1])


['path:hsa00010' 'path:hsa00020']


[('path:hsa00010', {'hsa:226', 'hsa:1738', 'hsa:92483', 'hsa:5224', 'hsa:7167', 'hsa:8789', 'hsa:126', 'hsa:229', 'hsa:219', 'hsa:55276', 'hsa:130589', 'hsa:441531', 'hsa:5232', 'hsa:2203', 'hsa:2538', 'hsa:3939', 'hsa:2645', 'hsa:217', 'hsa:5214', 'hsa:3099', 'hsa:127', 'hsa:3101', 'hsa:131', 'hsa:55902', 'hsa:2027', 'hsa:5236', 'hsa:130', 'hsa:5161', 'hsa:124', 'hsa:1737', 'hsa:3948', 'hsa:5162', 'hsa:5230', 'hsa:83440', 'hsa:5213', 'hsa:5223', 'hsa:5105', 'hsa:26330', 'hsa:9562', 'hsa:387712', 'hsa:5160', 'hsa:125', 'hsa:224', 'hsa:501', 'hsa:160287', 'hsa:10327', 'hsa:5106', 'hsa:2597', 'hsa:3945', 'hsa:669', 'hsa:80201', 'hsa:223', 'hsa:222', 'hsa:218', 'hsa:128', 'hsa:2821', 'hsa:2023', 'hsa:84532', 'hsa:5315', 'hsa:230', 'hsa:92579', 'hsa:57818', 'hsa:3098', 'hsa:5211', 'hsa:5313', 'hsa:221', 'hsa:2026'})]


#### Next we will loop through each pathway and cross-reference its genes with those of every other pathway. The number of overlapping genes, and the genes themselves, will be stored in a dataframe.

In [4]:

# one (pathway 1, pathway 2, overlap count, overlapping genes) record per pair;
# the records are turned into a dataframe once the loops finish
overlap_records = []

# pair each pathway only with the pathways that come after it,
# so every unordered pair is visited exactly once
for first_idx, first_pw in enumerate(unique_pathways):

    # genes from the first pathway
    first_genes = pathway_genes.get(first_pw)

    for second_pw in unique_pathways[first_idx + 1:]:

        # genes from the second pathway
        second_genes = pathway_genes.get(second_pw)

        # genes present in both pathways
        shared_genes = first_genes.intersection(second_genes)
        overlap_records.append((first_pw, second_pw, len(shared_genes), shared_genes))


# create dataframe from the collected records
pathway_gene_overlaps = pd.DataFrame(
    overlap_records,
    columns=['PATHWAY_ID1', 'PATHWAY_ID2', 'NUM_OVERLAPPING_GENES', 'OVERLAPPING_GENES'],
)

# dimensions of the result, then its first ten rows
print(pathway_gene_overlaps.shape)
pathway_gene_overlaps.head(10)



(118680, 4)


Unnamed: 0,PATHWAY_ID1,PATHWAY_ID2,NUM_OVERLAPPING_GENES,OVERLAPPING_GENES
0,path:hsa00010,path:hsa00020,7,"{hsa:5162, hsa:5105, hsa:1738, hsa:5160, hsa:5..."
1,path:hsa00010,path:hsa00030,11,"{hsa:226, hsa:5214, hsa:5213, hsa:229, hsa:282..."
2,path:hsa00010,path:hsa00040,1,{hsa:10327}
3,path:hsa00010,path:hsa00051,13,"{hsa:226, hsa:5214, hsa:3099, hsa:5213, hsa:22..."
4,path:hsa00010,path:hsa00052,14,"{hsa:2645, hsa:5214, hsa:3099, hsa:5213, hsa:3..."
5,path:hsa00010,path:hsa00053,6,"{hsa:223, hsa:217, hsa:219, hsa:224, hsa:501, ..."
6,path:hsa00010,path:hsa00061,0,{}
7,path:hsa00010,path:hsa00062,0,{}
8,path:hsa00010,path:hsa00071,12,"{hsa:223, hsa:217, hsa:127, hsa:128, hsa:126, ..."
9,path:hsa00010,path:hsa00100,0,{}


#### Save this file for future use

In [5]:

# write the pairwise overlap table to disk without the row index
# (comma separator and overwrite mode are to_csv's defaults)
# NOTE: absolute local path -- adjust it for your machine
overlaps_csv = "/Users/ryanrebernick/Desktop/projects/kegg_bioinf575/int/2.2_overlapping_genes.csv"
pathway_gene_overlaps.to_csv(overlaps_csv, index=False)
    