# 2_paths_extraction.ipynb
Extraction of pathways using networkX
- Each path starts at a Target node and ends at a terminal node, i.e. a node with no outgoing edges (no arrows leaving it).
### input
- 0_KEGG_Pathway_xmlfile/output/tbl_drug_target_TI_SE_Pathway.csv : A file with Target and hsa_map ID
- 1_Paths_from_KEGG_Pathway_code/output/all_target_node_DrugBank.csv : A file with nodes information from KEGG
- 1_Paths_from_KEGG_Pathway_code/output/all_target_edge_DrugBank.csv : A file with edges information from KEGG
### output
- 1_Paths_from_KEGG_Pathway_code/output/2_paths_extraction.csv : A file linking each Target to its paths. The 'order' column holds the node IDs along one path; these IDs are only meaningful within the corresponding hsa_map ID.

In [1]:
import ast

import networkx as nx
import pandas as pd

# Load the KEGG edge and node tables; the first CSV column is used as the row index.
edge_csv = 'output/all_target_edge_DrugBank.csv'
node_csv = 'output/all_target_node_DrugBank.csv'

df_edge = pd.read_csv(edge_csv, header=0, index_col=0)
df_node = pd.read_csv(node_csv, header=0, index_col=0)

In [2]:
# Read the Target/pathway table and keep one row per distinct (HSA, hsa_map) pair.
tbl_pathway = pd.read_csv(
    '../0_KEGG_Pathway_xmlfile/output/tbl_drug_target_TI_SE_Pathway.csv',
    header=0,
    index_col=0,
)
df_map = tbl_pathway[['HSA', 'hsa_map']].drop_duplicates()

In [3]:
# hsa_map is stored in the CSV as the text of a Python list. Parse it with
# ast.literal_eval, which only accepts literals, instead of eval, which would
# execute any expression found in the file.
df_map['hsa_map'] = df_map['hsa_map'].apply(ast.literal_eval)

# One row per (HSA, hsa_map) pair instead of one row per HSA with a list of maps.
df_map = df_map.explode('hsa_map')

In [4]:
# Drop the rows belonging to the two excluded maps, then renumber the rows.
excluded_maps = ['hsa00230', 'hsa00061']
keep_row = ~df_map['hsa_map'].isin(excluded_maps)
df_map = df_map[keep_row].reset_index(drop=True)

In [5]:
# Show the (HSA, hsa_map) pairs left after exploding the lists and dropping hsa00230 / hsa00061.
df_map

Unnamed: 0,HSA,hsa_map
0,3739,hsa04927
1,3739,hsa04934
2,3748,hsa05017
3,3751,hsa04726
4,3752,hsa05017
...,...,...
5866,11238,hsa00910
5867,11238,hsa01100
5868,767,hsa00910
5869,767,hsa01100


In [6]:
# Order the (HSA, hsa_map) pairs by HSA and renumber the rows from 0.
df_map = df_map.sort_values(by=['HSA'])
df_map = df_map.reset_index(drop=True)

For nodes whose type is "group", convert the `component` column from its string representation to a Python list of int node IDs.

In [7]:
# Rows of type "group" store their member node IDs as the text of a Python list.
# Parse that text with ast.literal_eval (literals only) rather than eval, which
# would execute arbitrary expressions read from the CSV.
is_group = df_node['type'] == 'group'
df_n = df_node[is_group].copy()
df_n['component'] = df_n['component'].apply(ast.literal_eval)

# As before, the non-group rows come first and the parsed group rows follow.
df_node = pd.concat([df_node[~is_group], df_n])

In [8]:
# Append a trailing space to every KEGG_id string so that the later lookup of
# 'hsa:<id> ' can only match a complete ID, including the last one in the string.
# Note: running this cell twice appends two spaces.
df_node['KEGG_id'] = df_node['KEGG_id'].add(' ')

In [9]:
# Show the node table: group rows now hold parsed component lists and KEGG_id has a trailing space.
df_node

Unnamed: 0,index,KEGG_id,type,name,component,hsa
0,18,hsa:226 hsa:229 hsa:230,gene,"ALDOA, ALDA, GSD12, HEL-S-87p...",,hsa00010
1,42,hsa:217 hsa:219 hsa:223 hsa:224 hsa:501,gene,"ALDH2, ALDH-E2, ALDHI, ALDM...",,hsa00010
2,43,ko:K01905 ko:K22224 ko:K24012,ortholog,K01905...,,hsa00010
3,44,hsa:218 hsa:221 hsa:222,gene,"ALDH3A1, ALDH3, ALDHIII...",,hsa00010
4,45,cpd:C00033,compound,C00033,,hsa00010
...,...,...,...,...,...,...
33230,228,undefined,group,,"[68, 70, 82]",hsa05418
33231,229,undefined,group,,"[213, 214, 215]",hsa05418
33232,230,undefined,group,,"[54, 55, 56, 57, 58]",hsa05418
33233,231,undefined,group,,"[118, 65]",hsa05418


In [10]:
# Show the edge table as loaded: entry1 -> entry2 node IDs, the arrow type, and the map (hsa) they belong to.
df_edge

Unnamed: 0,entry1,entry2,arrow,hsa
0,73,75,compound,hsa00010
1,73,74,compound,hsa00010
2,73,76,compound,hsa00010
3,70,73,compound,hsa00010
4,69,73,compound,hsa00010
...,...,...,...,...
19557,110,22,activation,hsa05418
19558,137,22,activation,hsa05418
19559,162,137,activation,hsa05418
19560,191,146,binding/association,hsa05418


In [11]:
def _collapse_groups(ids, groups):
    """Replace component node IDs in `ids` by the ID of the group node containing them.

    `ids` is a Series or DataFrame of node IDs; `groups` holds the rows of type
    "group" (columns 'index' and 'component'). Groups are applied one after
    another, in table order, so the result matches a chain of `replace` calls.
    """
    for group_id, members in zip(groups['index'], groups['component']):
        ids = ids.replace(list(map(int, members)), group_id)
    return ids


def _prepare_pathway(hsa_map_id):
    """Build everything that depends only on one KEGG map.

    Returns a tuple (graph, end_nodes, node_ids, kegg_ids):
      graph     -- DiGraph of the map's edges with group members collapsed
      end_nodes -- nodes of `graph` with out-degree 0
      node_ids  -- the map's node 'index' column with group members collapsed
      kegg_ids  -- the map's 'KEGG_id' column, row-aligned with `node_ids`
    """
    nodes = df_node[df_node['hsa'] == hsa_map_id]
    edges = df_edge[df_edge['hsa'] == hsa_map_id]
    groups = nodes[nodes['type'] == 'group']

    # Only the two endpoint columns can contain node IDs, so only they are rewritten.
    endpoints = _collapse_groups(edges[['entry1', 'entry2']], groups)
    sources = endpoints['entry1'].tolist()  # plain Python ints
    targets = endpoints['entry2'].tolist()

    graph = nx.DiGraph()
    # Nodes are inserted from a set before the edges: this insertion order fixes
    # the order of `end_nodes` and therefore the order of the output rows.
    graph.add_nodes_from(list(set(sources + targets)))
    graph.add_edges_from(zip(sources, targets))
    end_nodes = [node for node in graph.nodes if graph.out_degree(node) == 0]

    node_ids = _collapse_groups(nodes['index'], groups)
    return graph, end_nodes, node_ids, nodes['KEGG_id']


# The graph of a map is the same for every Target on that map, so build it once.
pathway_cache = {}
# Rows are collected in a list and turned into a DataFrame once at the end.
path_records = []

for target_hsa, hsa_map_id in zip(df_map['HSA'], df_map['hsa_map']):
    if hsa_map_id not in pathway_cache:
        pathway_cache[hsa_map_id] = _prepare_pathway(hsa_map_id)
    graph, end_nodes, node_ids, kegg_ids = pathway_cache[hsa_map_id]

    # Literal (non-regex) match of 'hsa:<id> '; rows without a KEGG_id never match.
    is_target = kegg_ids.str.contains('hsa:' + str(target_hsa) + ' ', regex=False, na=False)

    for start in node_ids[is_target].tolist():
        # A Target node that takes part in no edge is absent from the graph and
        # has no paths; skip it explicitly instead of swallowing the exception.
        if start not in graph:
            continue
        for end in end_nodes:
            paths = list(nx.all_simple_paths(graph, source=start, target=end))
            if paths:
                path_records.append([target_hsa, hsa_map_id, paths])

# One row per (Target, map, start node, end node) that has at least one path;
# 'order' holds the list of all simple paths between that start and end node.
df_all = pd.DataFrame(path_records, columns=['HSA', 'hsa_map', 'order'])

In [12]:
# Renumber the rows 0..n-1: the following cells slice df_all by position and
# join back on this index, so it has to be a clean running number.
df_all = df_all.reset_index(drop = True)
df_all

Unnamed: 0,HSA,hsa_map,order
0,2,hsa04610,"[[104, 90, 138]]"
1,2,hsa04610,"[[104, 90, 137, 100, 21, 16], [104, 90, 137, 1..."
2,2,hsa04610,"[[104, 90, 137, 100, 148]]"
3,2,hsa04610,"[[104, 90, 34], [104, 90, 137, 100, 34]]"
4,2,hsa04610,"[[104, 90, 43, 52, 46]]"
...,...,...,...
16803,150094,hsa04922,"[[107, 80, 201, 90, 202, 119], [107, 80, 201, ..."
16804,162466,hsa00564,"[[112, 248, 73, 132], [112, 248, 132]]"
16805,162466,hsa00564,"[[112, 139]]"
16806,203068,hsa05130,"[[72, 67, 69, 53]]"


Expand the list of paths in each row so that every path gets its own row. The rows are processed in batches.

In [13]:
# Rows from this position onward are expanded by the following cell.
FIRST_PART_END = 16500

# Give every path its own row. explode() repeats the other columns of df_all for
# each path; it also works when df_all has fewer than FIRST_PART_END rows,
# where the fixed-size chunks used previously would hit an empty chunk.
head = df_all.iloc[:FIRST_PART_END]
expanded = head.explode('order').dropna(subset=['order'])

path_cols = pd.DataFrame({
    # position of the path within its original list
    'level_0': expanded.groupby(level=0).cumcount().to_numpy(),
    # row label of df_all the path came from
    'level_1': expanded.index.to_numpy(),
    # the path itself; the column is still named 0 here and is renamed to 'order' later
    0: expanded['order'].to_numpy(),
})
df_c1 = pd.concat(
    [path_cols, expanded.drop(columns='order').reset_index(drop=True)],
    axis=1,
)

In [14]:
# Expand the remaining rows (position 16500 onward) the same way: one row per
# path. explode() copes with an empty remainder, so this cell also runs when
# df_all has 16500 rows or fewer.
tail = df_all.iloc[16500:]
expanded = tail.explode('order').dropna(subset=['order'])

path_cols = pd.DataFrame({
    # position of the path within its original list
    'level_0': expanded.groupby(level=0).cumcount().to_numpy(),
    # row label of df_all the path came from
    'level_1': expanded.index.to_numpy(),
    # the path itself; named 0 to line up with the rows already in df_c1
    0: expanded['order'].to_numpy(),
})
df_c = pd.concat(
    [path_cols, expanded.drop(columns='order').reset_index(drop=True)],
    axis=1,
)

# Append to the rows expanded so far, give the path column its final name and
# renumber all rows.
df_c1 = pd.concat([df_c1, df_c])
df_c1 = df_c1.rename(columns = {0:'order'}).reset_index(drop = True)

In [15]:
# Show the expanded table: one path per row. level_1 is the df_all row the path
# came from and level_0 its position within that row's list.
df_c1

Unnamed: 0,level_0,level_1,order,HSA,hsa_map
0,0,0,"[104, 90, 138]",2,hsa04610
1,0,1,"[104, 90, 137, 100, 21, 16]",2,hsa04610
2,1,1,"[104, 90, 137, 100, 17, 16]",2,hsa04610
3,0,2,"[104, 90, 137, 100, 148]",2,hsa04610
4,0,3,"[104, 90, 34]",2,hsa04610
...,...,...,...,...,...
1216257,0,16806,"[72, 67, 69, 53]",203068,hsa05130
1216258,0,16807,"[270, 268, 330, 241, 239]",341208,hsa00860
1216259,1,16807,"[270, 418, 268, 330, 241, 239]",341208,hsa00860
1216260,2,16807,"[270, 418, 330, 241, 239]",341208,hsa00860


In [16]:
# Write the per-path table to the output CSV; the row index is written as the first column.
df_c1.to_csv('output/2_paths_extraction.csv',encoding = 'utf-8')