# 1_xml_analysis.ipynb
Extract the edge and node information from the downloaded Pathway map's xml file.
### input
- 0_KEGG_Pathway_xmlfile/xmlfile/* : XMLfiles with Pathway map's information.
### output
- 1_Paths_from_KEGG_Pathway_code/output/all_target_node_DrugBank.csv : A file with nodes information from KEGG.
- 1_Paths_from_KEGG_Pathway_code/output/all_target_edge_DrugBank.csv : A file with edges information from KEGG.

In [1]:
import pandas as pd
import xml.etree.ElementTree as ET
import glob

def extract_kegg_pathway(kegg_pathwayid):
    tree = ET.parse('../0_KEGG_Pathway_xmlfile/xmlfile/' + kegg_pathwayid + '.xml') # Specify the location where the XMLfile is stored
    root = tree.getroot()

    # get node
    df_entry = pd.DataFrame()
    df_relation = pd.DataFrame()
    for child in root:
        if child.tag == "entry":
            df_entry.loc[child.attrib['id'], 'KEGG_id'] = child.attrib['name']
            df_entry.loc[child.attrib['id'], 'type'] = child.attrib['type']
            try:
                df_entry.loc[child.attrib['id'], 'name'] = child[0].attrib['name']
            except KeyError as e:
                #print(f'arise {type(e)} in {child.attrib["id"]}')
                df_entry.loc[child.attrib['id'], 'name'] = ""
            li = []
            try:
                for k in child:
                    if k.tag == "component":
                        li.append(k.attrib['id'])
                if len(li)==0:
                    df_entry.loc[child.attrib['id'], 'component'] = ""
                else:
                    df_entry.loc[child.attrib['id'], 'component'] = li
            except:
                #print(f'arise error in {child.attrib["id"]}')
                df_entry.loc[child.attrib['id'], 'component'] = ""            

    # get edge
        if child.tag == "relation":
            try:
                df_relation = pd.concat([df_relation, pd.DataFrame([[child.attrib['entry1'], child.attrib['entry2'], child[0].attrib['name']]],columns = ['entry1','entry2','arrow'])])
            except:
                df_relation = pd.concat([df_relation, pd.DataFrame([[child.attrib['entry1'], child.attrib['entry2'], ""]],columns = ['entry1','entry2','arrow'])])
    df_relation = df_relation.reset_index(drop = True)
    return df_entry, df_relation

In [2]:
df_xml = pd.DataFrame(glob.glob('../0_KEGG_Pathway_xmlfile/xmlfile/*')).rename(columns = {0:'Xml'})

In [3]:
df_xml

Unnamed: 0,Xml
0,../0_KEGG_Pathway_xmlfile/xmlfile\hsa00010.xml
1,../0_KEGG_Pathway_xmlfile/xmlfile\hsa00020.xml
2,../0_KEGG_Pathway_xmlfile/xmlfile\hsa00030.xml
3,../0_KEGG_Pathway_xmlfile/xmlfile\hsa00040.xml
4,../0_KEGG_Pathway_xmlfile/xmlfile\hsa00051.xml
...,...
305,../0_KEGG_Pathway_xmlfile/xmlfile\hsa05414.xml
306,../0_KEGG_Pathway_xmlfile/xmlfile\hsa05415.xml
307,../0_KEGG_Pathway_xmlfile/xmlfile\hsa05416.xml
308,../0_KEGG_Pathway_xmlfile/xmlfile\hsa05417.xml


In [4]:
df_xml['Xml'] = df_xml['Xml'].reset_index(drop = True).str.replace('.xml', '').str[26:]
df_xml = df_xml[['Xml']]
df_xml

  df_xml['Xml'] = df_xml['Xml'].reset_index(drop = True).str.replace('.xml', '').str[26:]


Unnamed: 0,Xml
0,hsa00010
1,hsa00020
2,hsa00030
3,hsa00040
4,hsa00051
...,...
305,hsa05414
306,hsa05415
307,hsa05416
308,hsa05417


In [5]:
df_node = pd.DataFrame()
df_edge = pd.DataFrame()
if __name__ == '__main__':
    for i in df_xml['Xml']:
        node, edge = extract_kegg_pathway(i)
        node = node.reset_index()
        node['hsa'] = [i for k in range(len(node))]
        edge['hsa'] = [i for k in range(len(edge))]
        df_node = pd.concat([df_node, node])
        df_edge = pd.concat([df_edge, edge])

In [6]:
df_node = df_node.reset_index(drop = True)
df_node

Unnamed: 0,index,KEGG_id,type,name,component,hsa
0,18,hsa:226 hsa:229 hsa:230,gene,"ALDOA, ALDA, GSD12, HEL-S-87p...",,hsa00010
1,42,hsa:217 hsa:219 hsa:223 hsa:224 hsa:501,gene,"ALDH2, ALDH-E2, ALDHI, ALDM...",,hsa00010
2,43,ko:K01905 ko:K22224 ko:K24012,ortholog,K01905...,,hsa00010
3,44,hsa:218 hsa:221 hsa:222,gene,"ALDH3A1, ALDH3, ALDHIII...",,hsa00010
4,45,cpd:C00033,compound,C00033,,hsa00010
...,...,...,...,...,...,...
33230,228,undefined,group,,"[68, 70, 82]",hsa05418
33231,229,undefined,group,,"[213, 214, 215]",hsa05418
33232,230,undefined,group,,"[54, 55, 56, 57, 58]",hsa05418
33233,231,undefined,group,,"[118, 65]",hsa05418


In [7]:
df_edge = df_edge.reset_index(drop = True)
df_edge

Unnamed: 0,entry1,entry2,arrow,hsa
0,73,75,compound,hsa00010
1,73,74,compound,hsa00010
2,73,76,compound,hsa00010
3,70,73,compound,hsa00010
4,69,73,compound,hsa00010
...,...,...,...,...
19557,110,22,activation,hsa05418
19558,137,22,activation,hsa05418
19559,162,137,activation,hsa05418
19560,191,146,binding/association,hsa05418


In [8]:
df_node.to_csv('output/all_target_node_DrugBank.csv')
df_edge.to_csv('output/all_target_edge_DrugBank.csv')