#Chisholm / Prochlorococcus project
##Use Python and its various libraries to link the KEGG compound information with pathways
###KLongnecker, 4/17/2017

In [5]:
import pandas as pd
import numpy as np
import re
import os
import glob

import palettable as pal
import matplotlib.pyplot as plt
import matplotlib as mpl 
mpl.rcParams['pdf.fonttype'] = 42

from Bio import SeqIO
from Bio.KEGG.REST import *
from Bio.KEGG.KGML import KGML_parser
from Bio.Graphics.KGML_vis import KGMLCanvas
from IPython.display import Image, HTML

#used this to step into the function and debug it, also need line with Tracer()() 
from IPython.core.debugger import Tracer 
#os._exit(1) #this should get me out of the debugging...

%matplotlib inline

In [6]:
mtabFile = 'ChisholmPro_KEGGexport.2017.04.17.csv' #first column is KEGG C number

In [7]:
mtabData=pd.read_csv(mtabFile, index_col='KEGG')

In [8]:
mtabData.head()

Unnamed: 0_level_0,Plimited_extracellular_s9301ax_50,Plimited_filter_s9301ax_50,replete_extracellular_s9301ax_10,replete_extracellular_s9301ax_50,replete_extracellular_s0801ax_10,replete_extracellular_s9313ax_5,replete_extracellular_s9313ax_10,replete_filter_s9301ax_10,replete_filter_s9301ax_50,replete_filter_s0801ax_10,replete_filter_s9313ax_5,replete_filter_s9313ax_10
KEGG,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
C00196,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
C19675,59.673373,0.0,541.630502,0.0,399.057861,0.0,0.0,0.0,0.0,0.0,0.0,0.0
C08276,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
C00141,8169.547758,0.0,16449.808476,24651.926818,223.180717,100228.888379,116396.989909,0.0,0.0,0.0,0.0,0.0
C00671,0.0,0.0,5910.269185,13236.046283,0.0,20150.069568,26468.359973,316.366725,303.627647,272.686725,19.57057,395.492424


In [9]:
#Picking code out of the NB project

In [10]:
allPathways = kegg_list("pathway").read()
len(allPathways.split('\n'))
#number here is the # of pathways at KEGG, up to 486 by 4/13/2016; and 513 one year later on 4/17/2017

513

In [11]:
trimPath = []
current_section = None
for line in allPathways.rstrip().split("\n"):
    tp = line[8:13]
    trimPath.append('ko' + tp)
    
#have some cases where KEGG will send back a pathway, but the pathway itself is not searchable...seems to 
#be a KEGG bug, 'ko00351' was first, then realized there are many of these,
#did this list manually since I thought it would be short...
toDelete = ('ko00351', 'ko01010','ko01060',  'ko01061', 'ko01062', 'ko01063', 
            'ko01064', 'ko01065', 'ko01066', 'ko01070', 'ko07011', 'ko07012', 
            'ko07013', 'ko07014', 'ko07015', 'ko07016', 'ko07017', 'ko07018', 
            'ko07019', 'ko07020', 'ko07021', 'ko07023', 'ko07024', 'ko07025', 
            'ko07026', 'ko07027', 'ko07028', 'ko07029', 'ko07030', 'ko07031', 
            'ko07032', 'ko07033', 'ko07034', 'ko07035', 'ko07036', 'ko07037', 
            'ko07038', 'ko07039', 'ko07040', 'ko07041', 'ko07042', 'ko07043', 
            'ko07044', 'ko07045', 'ko07046', 'ko07047', 'ko07048', 'ko07049', 
            'ko07050', 'ko07051', 'ko07052', 'ko07053', 'ko07054', 'ko07055', 
            'ko07056', 'ko07057', 'ko07110', 'ko07112', 'ko07114', 'ko07117', 
            'ko07211', 'ko07212', 'ko07213', 'ko07214', 'ko07215', 'ko07216', 
            'ko07217', 'ko07218', 'ko07219', 'ko07220', 'ko07221', 'ko07222', 
            'ko07223', 'ko07224', 'ko07225', 'ko07226', 'ko07227', 'ko07228', 
            'ko07229', 'ko07230', 'ko07231', 'ko07232', 'ko07233', 'ko07234', 
            'ko07235', 'ko04933')

#probably a way to do this without the for loop, but this will work
for item in toDelete:
    trimPath.remove(item)

In [12]:
#setup some functions

In [13]:
#set up a function to get the list of compounds for a given pathway (must be defined as ko00140 NOT map00140)
def getCfrom_ko(ko_id):
    pathway_file = kegg_get(ko_id).read()  # query and read the pathway
    compound_list = []

    current_section = None
    for line in pathway_file.rstrip().split("\n"):
        section = line[:12].strip()  # section names are within 12 columns
        if not section == "":
            current_section = section
        if current_section == "COMPOUND":
            compound_identifiers = line[12:].split("; ")
            t = compound_identifiers[0]
            compound_id = t[0:6]

            if not compound_id in compound_list:
                compound_list.append(compound_id)
    return compound_list

In [14]:
#set up a function to get the list of K orthologues for a given pathway (must be defined as ko00140 NOT map00140)
#for this project: can use the three digit code for the different Prochlorococcus strains
def getKfrom_ko(ko_id):
    pathway_file = kegg_get(ko_id).read()  # query and read the pathway
    K_list = []

    current_section = None
    for line in pathway_file.rstrip().split("\n"):
        section = line[:12].strip()  # section names are within 12 columns
        if not section == "":
            current_section = section
        if current_section == "ORTHOLOGY":
            K_identifiers = line[12:].split("; ")
            t = K_identifiers[0]
            K_id = t[0:6]

            if not K_id in K_list:
                K_list.append(K_id)
    return K_list

In [15]:
# A bit of code that will help us display the PDF output
def PDF(filename):
    return HTML('<iframe src=%s width=700 height=350></iframe>' % filename)

In [16]:
## A bit of helper code to shorten long text
#def head(text, lines=10):
#    """ Print the first lines lines of the passed text.
#    """
#    print '\n'.join(text.split('\n')[:lines] + ['[...]'])

In [17]:
out = getKfrom_ko('pmg00230')

In [18]:
item = 'pmg00230'

In [19]:
useColors = pal.colorbrewer.qualitative.Set1_4.hex_colors

In [20]:
pathway = KGML_parser.read(kegg_get(item, "kgml"))

In [21]:
pathway

<Bio.KEGG.KGML.KGML_pathway.Pathway at 0x9a521d0>

In [22]:
canvas = KGMLCanvas(pathway, import_imagemap=True)
pdfName = 'mapWithColors_' + str(item) + '.pdf'
#canvas.draw(directoryPDF + '/' + pdfName)
canvas.draw(pdfName)
PDF(pdfName)

In [25]:
import fxn_plotPathway
##if I make a change, have to reload the function:
# from imp import reload
# reload(fxn_plotPathway)

<module 'fxn_plotPathway' from 'C:\\Users\\KLongnecker\\Documents\\GitHub\\Pro_mtabs\\fxn_plotPathway.py'>

In [26]:
item

'pmg00230'

In [27]:
#need the KEGG numbers from mtabData (the index)

In [28]:
mtabData.index

Index(['C00196', 'C19675', 'C08276', 'C00141', 'C00671', 'C00568', 'C00156',
       'C00233', 'C00170', '0', 'C00352', 'C04022', 'C00334', 'C01279',
       'C00004', 'C00134', '0', 'C00147', 'C00212', 'C00020', 'C00026',
       'C00062', 'C00049', 'C00719', 'C00120', 'C07481', 'C01674', '0',
       'C00114', 'C03557', 'C00158', 'C00327', 'C08230', 'C00380', 'C01909',
       'C00111', 'C06231', 'C00504', 'C06454', 'C00122', 'C00092', 'C00025',
       'C00064', 'C01705', 'C00242', 'C00387', '0', 'C00954', 'C00294',
       'C00130', 'C00407', 'C00097', 'C00051', 'C00328', 'C00123', '0',
       'C00149', 'C00073', 'C06470', 'C00140', 'C00624', 'C02713', 'C00077',
       'C00295', 'C00864', 'C00079', 'C00074', 'C05786', 'C00148', 'C00314',
       'C00255', '0', 'C00019', 'C00019', 'C00213', 'C00065', 'C00093',
       'C05122', '0', 'C00042', 'C10833', 'C00245', 'C00378', 'C01081',
       'C00188', 'C00214', 'C00398', 'C00078', 'C00106', 'C00105', 'C00183',
       'C00385', 'C01762'],
      

In [29]:
#need to delete the zeros for now...

In [30]:
mtabPruned = mtabData.drop(['0'])

In [31]:
mtabPruned.index

Index(['C00196', 'C19675', 'C08276', 'C00141', 'C00671', 'C00568', 'C00156',
       'C00233', 'C00170', 'C00352', 'C04022', 'C00334', 'C01279', 'C00004',
       'C00134', 'C00147', 'C00212', 'C00020', 'C00026', 'C00062', 'C00049',
       'C00719', 'C00120', 'C07481', 'C01674', 'C00114', 'C03557', 'C00158',
       'C00327', 'C08230', 'C00380', 'C01909', 'C00111', 'C06231', 'C00504',
       'C06454', 'C00122', 'C00092', 'C00025', 'C00064', 'C01705', 'C00242',
       'C00387', 'C00954', 'C00294', 'C00130', 'C00407', 'C00097', 'C00051',
       'C00328', 'C00123', 'C00149', 'C00073', 'C06470', 'C00140', 'C00624',
       'C02713', 'C00077', 'C00295', 'C00864', 'C00079', 'C00074', 'C05786',
       'C00148', 'C00314', 'C00255', 'C00019', 'C00019', 'C00213', 'C00065',
       'C00093', 'C05122', 'C00042', 'C10833', 'C00245', 'C00378', 'C01081',
       'C00188', 'C00214', 'C00398', 'C00078', 'C00106', 'C00105', 'C00183',
       'C00385', 'C01762'],
      dtype='object', name='KEGG')

In [32]:
justKEGG = mtabPruned.index

In [64]:
# from imp import reload
# reload(fxn_plotPathway)

<module 'fxn_plotPathway' from 'C:\\Users\\KLongnecker\\Documents\\GitHub\\Pro_mtabs\\fxn_plotPathway.py'>

In [66]:
folder = 'pathway_plots'
usePathway = item
useCO = mtabPruned.index
gc_regular = fxn_plotPathway.gatherDetails(usePathway,folder,useCO,justKEGG)

In [None]:
#...next up...figure out what I want to do about the colors...save that for tomorrow