# 0_1_scraping_pathway_link
Get the link of the Pathway ID associated with each Target by scraping.

### input
- 7_Target_Drug_SE_TI_from_SIDER_DrugBank/tbl_drug_target_TI_SE.pkl : A file with Target and Drug and SE/TI and KEGG ID

### output
- 0_KEGG_Pathway_xmlfile/output/KEGG_ID_Pathway_link.csv : A file linking KEGG ID and Pathway link
- 0_KEGG_Pathway_xmlfile/output/tbl_drug_target_TI_SE_Pathway.csv : A file with Target and Drug and SE/TI and KEGG ID and Pathway ID

In [1]:
import pandas as pd
from pandas import DataFrame
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import scipy.stats as st
import re
import time

In [2]:
import pickle
with open('../7_Target_Drug_SE_TI_from_SIDER_DrugBank/tbl_drug_target_TI_SE.pkl', 'rb') as web:
    df = pickle.load(web)

In [3]:
df['id'] = df['entry_id'].str.replace('hsa:', '').apply(int)

In [4]:
df

Unnamed: 0,Drug,target_name,target_type,TI,SE,gene_name,gene_symbols,entry_id,name,id
0,4-AP,Potassium voltage-gated channel subfamily A me...,Protein,"['Multiple sclerosis', 'Walking disability']","['Anaphylactic shock', 'Angioedema', 'Anxiety'...",KCNA4,kcna4,hsa:3739,KCNA4,3739
1,4-AP,Potassium voltage-gated channel subfamily C me...,Protein,"['Multiple sclerosis', 'Walking disability']","['Anaphylactic shock', 'Angioedema', 'Anxiety'...",KCNC3,kcnc3,hsa:3748,KCNC3,3748
2,4-AP,Potassium voltage-gated channel subfamily D me...,Protein,"['Multiple sclerosis', 'Walking disability']","['Anaphylactic shock', 'Angioedema', 'Anxiety'...",KCND2,kcnd2,hsa:3751,KCND2,3751
3,disopyramide,Potassium voltage-gated channel subfamily D me...,Protein,"['Arrhythmia', 'Bundle branch block', 'Sudden ...","['Agranulocytosis', 'Angioedema', 'Anorexia', ...",KCND2,kcnd2,hsa:3751,KCND2,3751
4,imipramine,Potassium voltage-gated channel subfamily D me...,Protein,"['Alcoholism', 'Depression', 'Endogenous depre...","['Abdominal cramps', 'Abdominal pain', 'Adenit...",KCND2,kcnd2,hsa:3751,KCND2,3751
...,...,...,...,...,...,...,...,...,...,...
2743,zinc,Tumor protein p73,Protein,"['Hepato-lenticular degeneration', 'Liver diso...","['Pancreatitis', 'Gastric irritation']",TP73,tp73,hsa:7161,TP73,7161
2744,zonisamide,Carbonic anhydrase 13,Protein,"['Epilepsy', 'Partial seizures', 'Sudden unexp...","['Abdominal pain', 'Congenital anomaly', 'Rena...",CA13,ca13,hsa:377677,CA13,377677
2745,zonisamide,"Carbonic anhydrase 5B, mitochondrial",Protein,"['Epilepsy', 'Partial seizures', 'Sudden unexp...","['Abdominal pain', 'Congenital anomaly', 'Rena...",CA5B,ca5b,hsa:11238,CA5B,11238
2746,zonisamide,Carbonic anhydrase-related protein,Protein,"['Epilepsy', 'Partial seizures', 'Sudden unexp...","['Abdominal pain', 'Congenital anomaly', 'Rena...",CA8,ca8,hsa:767,CA8,767


In [5]:
df_id = df[['id']].drop_duplicates().reset_index(drop = True)

In [6]:
df_id

Unnamed: 0,id
0,3739
1,3748
2,3751
3,3752
4,240
...,...
741,7161
742,377677
743,11238
744,767


In [7]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager

driver = webdriver.Chrome(ChromeDriverManager().install())



Current google-chrome version is 97.0.4692
Get LATEST chromedriver version for 97.0.4692 google-chrome
Driver [C:\Users\nakamura\.wdm\drivers\chromedriver\win32\97.0.4692.71\chromedriver.exe] found in cache
  driver = webdriver.Chrome(ChromeDriverManager().install())


In [8]:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions
import time
from selenium.common.exceptions import TimeoutException

In [9]:
import requests
from bs4 import BeautifulSoup

In [10]:
df_link = pd.DataFrame()
for g in df_id['id']:
    driver.get('https://www.genome.jp/dbget-bin/www_bget?hsa:'+str(g))
    time.sleep(1)
    source = driver.page_source
    soup = BeautifulSoup(source,'html.parser')
    p = []
    hsa_link = []
    for k in soup('a'):
        p.append(k.get('href'))
    for i in [x for x in p if x is not None]:
        if '/pathway/' in i:
            hsa_link.append('https://www.genome.jp'+i)
            #print(i)
        else:
            pass
    for s in hsa_link:
        df_link = pd.concat([df_link, pd.DataFrame([[g,s.split('+')[0]]],columns = ['HSA','hsa_link'])])
df_link = df_link.reset_index(drop = True)

In [11]:
df_link

Unnamed: 0,HSA,hsa_link
0,3739,https://www.genome.jp/pathway/hsa04927
1,3739,https://www.genome.jp/pathway/hsa04934
2,3748,https://www.genome.jp/pathway/hsa05017
3,3751,https://www.genome.jp/pathway/hsa04726
4,3752,https://www.genome.jp/pathway/hsa05017
...,...,...
5899,11238,https://www.genome.jp/pathway/hsa00910
5900,11238,https://www.genome.jp/pathway/hsa01100
5901,767,https://www.genome.jp/pathway/hsa00910
5902,767,https://www.genome.jp/pathway/hsa01100


In [12]:
df_link.to_csv('output/KEGG_ID_Pathway_link.csv',encoding = 'utf-8')

In [13]:
driver.close()

In [14]:
df_link['hsa_map'] = df_link['hsa_link'].str.replace('https://www.genome.jp/pathway/', '')

  df_link['hsa_map'] = df_link['hsa_link'].str.replace('https://www.genome.jp/pathway/', '')


In [15]:
df_link = df_link.drop(columns=['hsa_link']).drop_duplicates()

In [16]:
df_all = pd.merge(df, df_link.groupby(['HSA'])['hsa_map'].apply(list).reset_index(), left_on = 'id', right_on = 'HSA').drop(columns=['id'])

In [17]:
df_all

Unnamed: 0,Drug,target_name,target_type,TI,SE,gene_name,gene_symbols,entry_id,name,HSA,hsa_map
0,4-AP,Potassium voltage-gated channel subfamily A me...,Protein,"['Multiple sclerosis', 'Walking disability']","['Anaphylactic shock', 'Angioedema', 'Anxiety'...",KCNA4,kcna4,hsa:3739,KCNA4,3739,"[hsa04927, hsa04934]"
1,4-AP,Potassium voltage-gated channel subfamily C me...,Protein,"['Multiple sclerosis', 'Walking disability']","['Anaphylactic shock', 'Angioedema', 'Anxiety'...",KCNC3,kcnc3,hsa:3748,KCNC3,3748,[hsa05017]
2,4-AP,Potassium voltage-gated channel subfamily D me...,Protein,"['Multiple sclerosis', 'Walking disability']","['Anaphylactic shock', 'Angioedema', 'Anxiety'...",KCND2,kcnd2,hsa:3751,KCND2,3751,[hsa04726]
3,disopyramide,Potassium voltage-gated channel subfamily D me...,Protein,"['Arrhythmia', 'Bundle branch block', 'Sudden ...","['Agranulocytosis', 'Angioedema', 'Anorexia', ...",KCND2,kcnd2,hsa:3751,KCND2,3751,[hsa04726]
4,imipramine,Potassium voltage-gated channel subfamily D me...,Protein,"['Alcoholism', 'Depression', 'Endogenous depre...","['Abdominal cramps', 'Abdominal pain', 'Adenit...",KCND2,kcnd2,hsa:3751,KCND2,3751,[hsa04726]
...,...,...,...,...,...,...,...,...,...,...,...
2743,zinc,Tumor protein p73,Protein,"['Hepato-lenticular degeneration', 'Liver diso...","['Pancreatitis', 'Gastric irritation']",TP73,tp73,hsa:7161,TP73,7161,"[hsa04115, hsa04390, hsa04722, hsa05162]"
2744,zonisamide,Carbonic anhydrase 13,Protein,"['Epilepsy', 'Partial seizures', 'Sudden unexp...","['Abdominal pain', 'Congenital anomaly', 'Rena...",CA13,ca13,hsa:377677,CA13,377677,"[hsa00910, hsa01100]"
2745,zonisamide,"Carbonic anhydrase 5B, mitochondrial",Protein,"['Epilepsy', 'Partial seizures', 'Sudden unexp...","['Abdominal pain', 'Congenital anomaly', 'Rena...",CA5B,ca5b,hsa:11238,CA5B,11238,"[hsa00910, hsa01100]"
2746,zonisamide,Carbonic anhydrase-related protein,Protein,"['Epilepsy', 'Partial seizures', 'Sudden unexp...","['Abdominal pain', 'Congenital anomaly', 'Rena...",CA8,ca8,hsa:767,CA8,767,"[hsa00910, hsa01100]"


In [18]:
df_all.to_csv('output/tbl_drug_target_TI_SE_Pathway.csv',encoding = 'utf-8')