<a href="https://colab.research.google.com/github/raynardj/python4ml/blob/master/experiments/kegg_download_sqlite.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Kegg Drug Download
> This script downloads KEGG drug pages. It is meant to be run in a Google Colab notebook.

## Download Cache
> This script skips KEGG drug pages that have already been downloaded.

* If an existing `kegg_drug.db` is placed at ```drive/MyDrive/kegg_drug.db```, the script automatically skips the pages already stored in it.

In [30]:
import logging
import pandas as pd
from tqdm.notebook import tqdm
from time import sleep
from random import random
import requests
from bs4 import BeautifulSoup
from sqlalchemy import create_engine as ce

In [2]:
# getLogger() without a name returns the root logger;
# DEBUG is the lowest standard level, so every standard-level message passes through
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

In [3]:
# sample KEGG drug id, used for the trial request of a single detail page
TEST_TAG = "D00001"

In [4]:
def get_detail_page(tag):
    """
    build the url of the KEGG detail page for one drug entry
    tag: KEGG drug id, e.g. "D00001"
    """
    base_url = "https://www.kegg.jp/dbget-bin/www_bget"
    return "{}?dr:{}".format(base_url, tag)

In [6]:
# trial request: fetch the detail page of TEST_TAG (network call, no timeout set)
r = requests.get(get_detail_page(TEST_TAG))

In [7]:
# body of the trial response, decoded to str by requests
html = r.text

In [8]:
def parse_row(row):
    """
    turn one table row into a (key, value) pair

    key: text of the first th, with non-breaking spaces removed,
         or "no_key" when the row has no th
    value: text of the first td, with non-breaking spaces turned into
           plain spaces, or "" when the row has no td
    """
    header_cell = row.select_one("th")
    value_cell = row.select_one("td")

    value = ""
    if value_cell:
        value = value_cell.text.replace("\xa0", " ")

    if not header_cell:
        return "no_key", value
    return header_cell.text.replace("\xa0", ""), value

def parse_link(link):
    """
    get the (text, href) pair from an a dom

    text: the link text, or "no_key" when the text is empty
    href: the href attribute, or "" when the tag has none
    """
    # .get with a default covers the "no href" case without a bare except,
    # which would also have swallowed KeyboardInterrupt and real bugs
    href = link.get("href", "")
    return link.text or "no_key", href

def read_html(html: str, parser: str = "html.parser") -> dict:
    """
    parse meta data page into json data

    html: raw html of a drug detail page
    parser: BeautifulSoup parser name; stated explicitly so the result does
            not depend on which optional parsers happen to be installed
            (pass "lxml" to use lxml when it is available)

    returns a dict with one entry per tr (see parse_row) plus a "links"
    entry mapping link text to href (see parse_link).
    rows or links that share a key overwrite each other, the last one wins
    """
    doc = BeautifulSoup(html, parser)
    drug_meta = dict(parse_row(row) for row in doc.select("tr"))
    drug_meta["links"] = dict(parse_link(link) for link in doc.select("a"))
    return drug_meta

In [9]:
# exceptions collected by error_catch (exception + traceback text)
errors = []
# non-200 responses collected by error_res_catch
res_errors = []

In [10]:
import traceback
from pathlib import Path
import json

In [11]:
def error_catch(e):
    """
    log an exception and keep it in `errors`
    together with the traceback text of the exception being handled
    """
    logging.error(e)
    record = {"exception": e, "tb": traceback.format_exc()}
    errors.append(record)

def error_res_catch(r):
    """
    log the status code and url of a failed response
    and keep the response object in `res_errors`
    """
    message = f"[{r.status_code}]{r.url}"
    logging.error(message)
    res_errors.append({"response": r})

In [12]:
def make_dir(path) -> Path:
    """
    make sure a directory exists and return it as a Path
    missing parent directories are created, an existing directory is left as is
    """
    directory = Path(path)
    directory.mkdir(parents=True, exist_ok=True)
    return directory

In [13]:
# output folders (relative paths), created right here if they do not exist yet
META = make_dir("drive/MyDrive/kegg_drug/meta")
MOL = make_dir("drive/MyDrive/kegg_drug/mol")

In [14]:
import os

def mol_downloaded():
    """
    list the entries of the MOL directory,
    with every ".mol" occurrence removed from each name
    """
    return [file_name.replace(".mol", "") for file_name in os.listdir(MOL)]

# snapshot of the names found in MOL at the moment this line runs
MOL_CACHE = mol_downloaded()

In [55]:
# sqlalchemy engine for the sqlite file /content/drive/MyDrive/kegg_drug.db
# (four slashes after "sqlite:" = absolute path)
con = ce("sqlite:////content/drive/MyDrive/kegg_drug.db")

def create_kegg_page(con):
    """
    create the drug_page table (kegg_id, doc) if it does not exist yet
    con: sqlalchemy engine
    """
    # imported here because the file only imports create_engine at the top
    from sqlalchemy import text

    # SQLAlchemy 2.0 no longer accepts a plain str in execute(), wrap it in text()
    ddl = text("""
        CREATE TABLE IF NOT EXISTS drug_page (
            kegg_id text primary key,
            doc text
        )
        """)
    # begin() commits when the block ends without error;
    # connect() alone does not autocommit in SQLAlchemy 2.0
    with con.begin() as conn:
        conn.execute(ddl)

def save_doc(kegg_id, doc):
    """
    append one (kegg_id, doc) row to the drug_page table
    uses the module level engine `con`
    """
    record = {"kegg_id": kegg_id, "doc": doc}
    df_row = pd.DataFrame([record])
    with con.connect() as conn:
        df_row.to_sql("drug_page", con=conn, if_exists="append", index=False)

def get_cache(con):
    """
    read every kegg_id stored in the drug_page table
    con: sqlalchemy engine
    returns the ids as a list
    """
    query = "SELECT kegg_id FROM drug_page"
    with con.connect() as conn:
        id_frame = pd.read_sql(query, con=conn)
    return list(id_frame["kegg_id"])

In [47]:
# make sure the drug_page table exists before it is read or written
create_kegg_page(con)

In [51]:
# ids already stored in the db; kegg_drug_page skips every tag found in this list
downloaded = get_cache(con)
# log the first five ids as a quick check
logging.info(downloaded[:5])

INFO:root:['D00001']


In [53]:
def kegg_drug_page(tag, resolve, sleep_factor, timeout=30):
    """
    download the detail page of one drug and hand it to `resolve`

    tag: KEGG drug id, skipped when it is already in `downloaded`
    resolve: callable taking (tag, html), e.g. save_doc
    sleep_factor: upper bound in seconds of the random pause before the request
    timeout: seconds to wait for the server before giving up on this tag

    nothing is raised for a failed page: request and resolve exceptions go to
    error_catch, non-200 responses go to error_res_catch.
    a "No such data was found." page is logged as a warning and is still
    passed to resolve
    """
    if tag in downloaded:
        logging.warning(f"[🍻 SKIP]{tag}.json")
        return

    sleep(random()*sleep_factor)

    # without a timeout one stalled connection blocks the whole loop, and a
    # connection error raised here used to abort the run instead of being recorded
    try:
        r = requests.get(get_detail_page(tag), timeout=timeout)
    except requests.RequestException as e:
        error_catch(e)
        return

    if r.status_code != 200:
        error_res_catch(r)
        return

    try:
        html = r.text
        if "No such data was found." in html:
            logging.warning(f"[🌴 {tag}]no such data found")
        resolve(tag, html)
    except Exception as e:
        # top-level boundary of one download: record the error and move on to the next tag
        error_catch(e)

In [56]:
def start_download_meta(id_range, sleep_factor):
    """
    download and save the pages for the ids D00000 up to (not including) id_range
    id_range: number of ids to try, counted from 0
    sleep_factor: passed on to kegg_drug_page
    """
    for drug_number in tqdm(range(id_range)):
        # "D" + the number zero padded to 5 digits
        kegg_drug_page(f"D{drug_number:05d}", save_doc, sleep_factor)

In [None]:
# try the ids D00000 .. D11999, with a random pause below 0.8s before each request
start_download_meta(12000,0.8)