<a href="https://colab.research.google.com/github/raynardj/python4ml/blob/master/experiments/kegg_download.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# KEGG Drug Download

In [1]:
import logging

In [2]:
# getLogger() with no name is the root logger; DEBUG lets every record through,
# including the debug lines emitted by imported libraries.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

In [37]:
# Entry tag used for the one-off test request made with get_detail_page.
TEST_TAG = "D00001"

In [4]:
def get_detail_page(tag):
    """
    build the detail-page url for one drug entry tag
    """
    url_template = "https://www.kegg.jp/dbget-bin/www_bget?dr:{}"
    return url_template.format(tag)

In [5]:
import requests
from bs4 import BeautifulSoup

In [33]:
# One-off request for the TEST_TAG detail page.
r = requests.get(get_detail_page(TEST_TAG))

In [34]:
# Decoded body of the test response.
html = r.text

In [8]:
def parse_row(row):
    """
    turn one table row into a (key, value) pair

    key: text of the <th> found in the row, non-breaking spaces removed,
        or "no_key" when the row has no <th>
    value: text of the <td> found in the row, non-breaking spaces turned
        into plain spaces, or "" when the row has no <td>
    """
    header = row.select_one("th")
    cell = row.select_one("td")
    key = header.text.replace("\xa0", "") if header else "no_key"
    value = cell.text.replace("\xa0", " ") if cell else ""
    return key, value

def parse_link(link):
    """
    turn one <a> element into a (text, href) pair

    link: element exposing `.text` and item access to its attributes
    returns: the link text ("no_key" when the text is empty) and the href
        attribute ("" when the element has no href)
    """
    try:
        href = link["href"]
    except KeyError:
        # anchors without an href are kept, with an empty target;
        # anything other than a missing attribute is a real error and propagates
        href = ""
    return link.text if link.text else "no_key", href

def read_html(html: str) -> dict:
    """
    parse a drug detail page into a plain dict

    Every <tr> contributes one key/value pair through parse_row; rows that
    yield the same key overwrite each other, the last one winning.
    Every <a> contributes one text/href pair through parse_link; those pairs
    are stored as a nested dict under the "links" key, replacing any row
    that happened to produce the key "links".
    """
    # name the parser explicitly: without it bs4 picks whichever parser is
    # installed (and warns), so the same page could be parsed differently
    # from one machine to the next
    doc = BeautifulSoup(html, "html.parser")
    drug_meta = dict(parse_row(row) for row in doc.select("tr"))
    drug_meta["links"] = dict(parse_link(link) for link in doc.select("a"))
    return drug_meta

In [9]:
# Module-level accumulators: error_catch appends to `errors`,
# error_res_catch appends to `res_errors`.
errors = []
res_errors = []

In [23]:
import traceback
from pathlib import Path
import json

In [11]:
def error_catch(e):
    """
    log an exception and keep it in `errors` together with the text of the
    traceback that is current at call time
    """
    logging.error(e)
    record = {"exception": e, "tb": traceback.format_exc()}
    errors.append(record)

def error_res_catch(r):
    """
    log the status code and url of a response and keep the response object
    in `res_errors`
    """
    message = f"[{r.status_code}]{r.url}"
    logging.error(message)
    res_errors.append(dict(response=r))

In [20]:
def make_dir(path):
    """
    make sure the directory `path` exists, creating missing parents too,
    and return it as a Path
    """
    directory = Path(path)
    directory.mkdir(parents=True, exist_ok=True)
    return directory

In [63]:
# Output folders, created on the spot when missing: META receives the
# <tag>.json files, MOL is listed for ".mol" files.
# NOTE(review): the relative "drive/MyDrive" prefix looks like a mounted
# Google Drive in Colab — confirm before running elsewhere.
META = make_dir("drive/MyDrive/kegg_drug/meta")
MOL = make_dir("drive/MyDrive/kegg_drug/mol")

In [64]:
import os
def meta_downloaded():
    """
    list the entries of the META folder with every ".json" removed from
    their names
    """
    return [filename.replace(".json", "") for filename in os.listdir(META)]

def mol_downloaded():
    """
    list the entries of the MOL folder with every ".mol" removed from
    their names
    """
    return [filename.replace(".mol", "") for filename in os.listdir(MOL)]

# Snapshots of what is on disk, taken once when this cell runs; they are not
# refreshed as new files get written. kegg_drug_page checks META_CACHE to
# skip tags. Both are lists, so each membership test is a linear scan.
META_CACHE = meta_downloaded()
MOL_CACHE = mol_downloaded()

In [65]:
def kegg_drug_page(tag, resolve, sleep_factor, timeout=30):
    """
    download one drug detail page, parse it and hand the result to `resolve`

    tag: drug entry tag, e.g. "D00001"
    resolve: callback invoked as resolve(meta, tag) with the parsed page
    sleep_factor: upper bound, in seconds, of the random pause taken
        before the request
    timeout: seconds to wait on the server before giving up on the request

    Returns None in every case. Tags already listed in META_CACHE are
    skipped without pausing or requesting. Failures do not stop the caller:
    request errors and parsing/resolve errors are recorded through
    error_catch, non-200 responses through error_res_catch.
    """
    if tag in META_CACHE:
        logging.info(f"[🍻 SKIP]{tag}.json")
        return

    # random pause so consecutive requests are not evenly spaced
    sleep(random()*sleep_factor)

    # the request is guarded as well: a dropped connection or a stalled
    # server must be recorded like any other failure, not abort a long crawl
    try:
        r = requests.get(get_detail_page(tag), timeout=timeout)
    except requests.RequestException as e:
        error_catch(e)
        return

    if r.status_code != 200:
        error_res_catch(r)
        return

    try:
        html = r.text
        if "No such data was found." in html:
            # NOTE(review): the page is still parsed and passed to `resolve`
            # after this warning — confirm that storing a placeholder for
            # missing entries is intended.
            logging.warning(f"[🌴 {tag}]no such data found")
        meta = read_html(html)
        resolve(meta, tag)
    except Exception as e:
        error_catch(e)

In [66]:
def save_meta(data,tag):
    """
    write `data` as json text to <tag>.json inside the META folder,
    overwriting any file already there
    """
    target = META / f"{tag}.json"
    target.write_text(json.dumps(data))

In [67]:
from tqdm.notebook import tqdm
from time import sleep
from random import random

In [68]:
def start_download_meta(id_range, sleep_factor):
    """
    run kegg_drug_page, with save_meta as the callback, for every id in
    range(id_range), behind a progress bar

    each id becomes a tag made of "D" plus the number zero-padded to
    five digits, so the first tag is D00000
    """
    for number in tqdm(range(id_range)):
        kegg_drug_page(f"D{number:05d}", save_meta, sleep_factor)
        

In [None]:
# Walk ids 0..11999 (tags D00000-D11999); sleep_factor 1 means a random
# pause of under one second before each request.
start_download_meta(12000,1)

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.kegg.jp:443
DEBUG:urllib3.connectionpool:https://www.kegg.jp:443 "GET /dbget-bin/www_bget?dr:D01462 HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.kegg.jp:443
DEBUG:urllib3.connectionpool:https://www.kegg.jp:443 "GET /dbget-bin/www_bget?dr:D01463 HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.kegg.jp:443
DEBUG:urllib3.connectionpool:https://www.kegg.jp:443 "GET /dbget-bin/www_bget?dr:D01464 HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.kegg.jp:443
DEBUG:urllib3.connectionpool:https://www.kegg.jp:443 "GET /dbget-bin/www_bget?dr:D01465 HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.kegg.jp:443
DEBUG:urllib3.connectionpool:https://www.kegg.jp:443 "GET /dbget-bin/www_bget?dr:D01466 HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): w