# BIOINF 575 - Group Project - KEGG Pathway Overlap
#### Authors:
* Ryan Rebernick
* Elysia Chou
* Mahnoor Gondal
* Tusharika Rastogi


In [6]:
import os
from pathlib import Path

import numpy as np
import pandas as pd

## 1. Map/merge the information by PATHWAY_ID and GENE_ID. 

#### First we read in the data and store in dataframes for easy access.

In [7]:

def _read_kegg_table(url, columns):
    """Download a headerless, tab-separated KEGG REST listing as a DataFrame.

    Parameters
    ----------
    url : str
        KEGG REST endpoint to read.
    columns : list of str
        Labels assigned to the table's columns, in order.

    Returns
    -------
    pandas.DataFrame
        The downloaded listing with ``columns`` as its column labels.

    Raises
    ------
    ValueError
        If ``columns`` does not have one label per downloaded column.
    """
    table = pd.read_table(url, header=None)
    table.columns = columns
    # Preview the first rows and the (rows, columns) shape as a sanity check.
    print(table.head())
    print(np.shape(table))
    return table


# Pathway id -> pathway description for the 'hsa' (Homo sapiens) pathways
pathway_pinfo = _read_kegg_table(
    'http://rest.kegg.jp/list/pathway/hsa',
    ['pathway_id', 'pathway_info'],
)

# Gene id -> pathway id links (one row per gene/pathway pair)
geneid_pathway = _read_kegg_table(
    'http://rest.kegg.jp/link/pathway/hsa',
    ['gene_id', 'pathway_id'],
)

# Gene id -> gene description
geneid_ginfo = _read_kegg_table(
    'http://rest.kegg.jp/list/hsa',
    ['gene_id', 'gene_info'],
)


      pathway_id                                       pathway_info
0  path:hsa00010  Glycolysis / Gluconeogenesis - Homo sapiens (h...
1  path:hsa00020   Citrate cycle (TCA cycle) - Homo sapiens (human)
2  path:hsa00030   Pentose phosphate pathway - Homo sapiens (human)
3  path:hsa00040  Pentose and glucuronate interconversions - Hom...
4  path:hsa00051  Fructose and mannose metabolism - Homo sapiens...
(345, 2)
     gene_id     pathway_id
0  hsa:10327  path:hsa00010
1    hsa:124  path:hsa00010
2    hsa:125  path:hsa00010
3    hsa:126  path:hsa00010
4    hsa:127  path:hsa00010
(35381, 2)
         gene_id                                          gene_info
0  hsa:100423038                  MIR466, hsa-mir-466; microRNA 466
1  hsa:100616498                   MIR378E, mir-378e; microRNA 378e
2     hsa:442913  MIR376C, MIR368, MIRN368, MIRN376C, hsa-mir-36...
3     hsa:574461                   MIR520E, MIRN520E; microRNA 520e
4  hsa:100616159                             MIR4779; microRNA 4

#### Next we need to merge the information by pathway ids and gene ids.

Note that in merging, some genes did not have values in the geneid_ginfo file. These were left as NaN. I have shown them for reference.

In [8]:

# Build the combined table in one chain:
#   1. outer-join pathway descriptions with the gene/pathway links, so a
#      pathway id present in only one of the two tables is still kept;
#   2. left-join the gene descriptions, so links whose gene has no
#      description stay in the table with gene_info = NaN;
#   3. order the rows by pathway id.
merged = (
    pathway_pinfo
    .merge(geneid_pathway, how='outer', on='pathway_id')
    .merge(geneid_ginfo, how='left', on='gene_id')
    .sort_values(by='pathway_id')
)

# Show the rows whose gene has no description (gene_info is NaN)
print(merged.loc[merged['gene_info'].isna()])



          pathway_id                                       pathway_info  \
667    path:hsa00230           Purine metabolism - Homo sapiens (human)   
2421   path:hsa00730         Thiamine metabolism - Homo sapiens (human)   
4882   path:hsa01240   Biosynthesis of cofactors - Homo sapiens (human)   
5415   path:hsa03008  Ribosome biogenesis in eukaryotes - Homo sapie...   
5658   path:hsa03013  Nucleocytoplasmic transport - Homo sapiens (hu...   
5778   path:hsa03015   mRNA surveillance pathway - Homo sapiens (human)   
6218   path:hsa03320      PPAR signaling pathway - Homo sapiens (human)   
20703  path:hsa04923  Regulation of lipolysis in adipocytes - Homo s...   
23923  path:hsa05014  Amyotrophic lateral sclerosis - Homo sapiens (...   
28405  path:hsa05164                 Influenza A - Homo sapiens (human)   
29492  path:hsa05168  Herpes simplex virus 1 infection - Homo sapien...   
29218  path:hsa05168  Herpes simplex virus 1 infection - Homo sapien...   

             gene_id gen

#### Save this file for future use

In [9]:

# Directory that receives the merged table. The default is the location the
# notebook originally hard-coded; set the KEGG_INT_DIR environment variable to
# write somewhere else without editing this cell.
int_dir = Path(os.environ.get(
    "KEGG_INT_DIR",
    "/Users/ryanrebernick/Desktop/projects/kegg_bioinf575/int",
))
# Create the directory if needed so the write below does not fail on a missing folder.
int_dir.mkdir(parents=True, exist_ok=True)

# Write a comma-separated file, replacing any previous copy and leaving out
# the DataFrame's row index.
merged.to_csv(int_dir / "2.1_merged_genes_pathways.csv", sep=',', mode='w', index=False)
    