<a href="https://colab.research.google.com/github/raynardj/python4ml/blob/master/experiments/kegg_drug_mol.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# KEGG Drug Mol Data

In [48]:
import json
from pathlib import Path
import pandas as pd
import os
from tqdm.notebook import tqdm
from glob import glob
import logging
import requests
from random import random
from time import sleep

In [44]:
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

In [17]:
latest = sorted(glob("drive/MyDrive/kegg_drug-*.zip"))[-1]

In [6]:
!cp {latest} .

In [10]:
!unzip kegg_drug-*.zip > /dev/null

## Extract Meta Data

In [54]:
DATA=Path("./kegg_drug")
META = DATA/"meta"
MOL = Path("drive/MyDrive/kegg_drug/mol")

In [36]:
def open_json(path):
    with open(path, "r") as f:
        return json.loads(f.read())

def open_tag(fname):
    data = open_json(META/fname)
    data['drug_id'] = fname.replace(".json","")
    return data

In [20]:
ALL_JSON = os.listdir(META)

In [22]:
def none_empety(x):
    return list(filter(lambda i:len(i)>1,x))

In [37]:
data_list = none_empety(list(open_tag(tag) for tag in tqdm(ALL_JSON)))

HBox(children=(FloatProgress(value=0.0, max=12000.0), HTML(value='')))




## A list of Drug id that definitely contains MOL file

In [39]:
meta_df = pd.DataFrame(data_list)
with_mol_df = meta_df[meta_df.Structure.str.contains("Mol file").fillna(False)]

In [42]:
drug_ids = list(set(with_mol_df["drug_id"]))
drug_ids[:20]

['D02621',
 'D10562',
 'D08543',
 'D00096',
 'D00138',
 'D03891',
 'D00847',
 'D07998',
 'D04076',
 'D02467',
 'D11891',
 'D03187',
 'D11834',
 'D04640',
 'D07116',
 'D03736',
 'D03510',
 'D11438',
 'D00994',
 'D01887']

In [46]:
SLEEP_FACTOR = 1

In [59]:
def mol_file_link(drug_id):
    return f"https://www.kegg.jp/dbget-bin/www_bget?-f+m+drug+{drug_id}"

def download_drug_mol(drug_id, callback):
    sleep(SLEEP_FACTOR*random())
    url = mol_file_link(drug_id)
    r = requests.get(url)
    if r.status_code == 200:
        callback(r.text, drug_id)
    else:
        logging.error(f"[{r.status_code}]{url}")

def save_mol_file(text, drug_id):
    with open(MOL/f"{drug_id}.mol", "w") as f:
        f.write(text)

def downloaded():
    return list(map(lambda x:x.replace(".mol",""), os.listdir(MOL)))

In [60]:
download_drug_mol("D01887", save_mol_file)

In [62]:
# !cat {MOL/"D01887.mol"}

In [63]:
downloaded_mols = downloaded()
for drug_id in tqdm(drug_ids):
    if drug_id in downloaded_mols:
        continue
    download_drug_mol(drug_id, save_mol_file)

HBox(children=(FloatProgress(value=0.0, max=8596.0), HTML(value='')))


