In [2]:
import numpy as np 
import pandas as pd 
import json
from tqdm.notebook import tqdm
import xmltodict

import os
# Walk the DrugBank 5.1.10 release folder and print the path of every file found in it.
release_dir = '../data/drugs/drugbank_5.1.10/'
for folder, _subdirs, names in os.walk(release_dir):
    paths = [os.path.join(folder, name) for name in names]
    for path in paths:
        print(path)

../data/drugs/drugbank_5.1.10/3D structures.sdf
../data/drugs/drugbank_5.1.10/drug links.csv
../data/drugs/drugbank_5.1.10/drug sequences.fasta
../data/drugs/drugbank_5.1.10/drugbank vocabulary.csv
../data/drugs/drugbank_5.1.10/full database.xml
../data/drugs/drugbank_5.1.10/protein_identifiers.csv
../data/drugs/drugbank_5.1.10/structure links.csv
../data/drugs/drugbank_5.1.10/structures.sdf
../data/drugs/drugbank_5.1.10/target_sequences_gene.fasta
../data/drugs/drugbank_5.1.10/target_sequences_protein.fasta
../data/drugs/drugbank_5.1.10/uniprot links.csv


In [4]:
# %%

# Parse the full DrugBank XML dump into nested dicts/lists.
# The file is opened in binary mode and the stream itself is handed to
# xmltodict, so the multi-hundred-megabyte document is never held in memory as
# an extra Python string next to the parsed structure. The parser decodes the
# bytes as UTF-8 (xmltodict's default encoding), matching the previous
# text-mode read.
with open('../data/drugs/drugbank_5.1.10/full database.xml', 'rb') as xml_file:
    print("Deserializing XML...")
    data_dict = xmltodict.parse(xml_file)
    print("Done.")

# `dat` is the list of <drug> records used by every later cell.
dat = data_dict['drugbank']["drug"]
if isinstance(dat, dict):
    # xmltodict returns a bare dict when an element occurs only once; keep the
    # "list of drug records" contract so len() and the loops below stay correct.
    dat = [dat]
print(len(dat),"objects found.")

# Optional: dump the parsed records to JSON (disabled).
# print("Serializing to JSON...")
# with open("drugbank_full.json", 'w') as fp:
#         json.dump(dat, fp, sort_keys=True, indent=4)
# print("Done.")

Deserializing XML...
Done.
15235 objects found.


In [6]:
# Node table: one record per DrugBank drug with its name, DrugBank id, type,
# first ATC code, CAS number, RxCUI and UNII.
#
# Every field is computed from the current record only. Previously `atc` and
# `rxcui` could silently keep the value of the preceding drug (or be unbound on
# the first iteration) because they were only assigned on some code paths.


def _as_list(value):
    """Return an xmltodict node as a list.

    xmltodict yields a single item (dict/str) when an element occurs once, a
    list when it repeats, and None when it is empty.
    """
    if value is None:
        return []
    if isinstance(value, list):
        return value
    return [value]


def _first_atc_code(obj):
    """Return the '@code' of the drug's first ATC entry, or "None" if it has none."""
    atc_codes = obj.get("atc-codes")
    if not isinstance(atc_codes, dict):
        return "None"
    for entry in _as_list(atc_codes.get("atc-code")):
        if isinstance(entry, dict) and entry.get("@code"):
            return entry["@code"]
    return "None"


def _rxcui(obj):
    """Return the drug's RxCUI external identifier, or "None" if it has none.

    When several RxCUI entries exist the last one wins, as in the original loop.
    """
    ext = obj.get("external-identifiers")
    if not isinstance(ext, dict):
        return "None"
    found = "None"
    # _as_list also covers the single-identifier case, where xmltodict returns
    # one dict instead of a list of dicts.
    for item in _as_list(ext.get("external-identifier")):
        if isinstance(item, dict) and item.get("resource") == "RxCUI":
            found = item.get("identifier") or "None"
    return found


def _primary_drugbank_id(obj):
    """Return the drug's DrugBank accession (e.g. 'DB00001').

    Prefers the entry flagged '@primary' == 'true'; falls back to the first
    entry. Raises KeyError if the record has no 'drugbank-id' at all.
    """
    entries = _as_list(obj["drugbank-id"])
    for entry in entries:
        if isinstance(entry, dict) and entry.get("@primary") == "true":
            return entry["#text"]
    first = entries[0]
    return first["#text"] if isinstance(first, dict) else first


drug_dicts = []

for obj in tqdm(dat):
    drug_dicts.append({
        'name': obj['name'],
        'drugbank-id': _primary_drugbank_id(obj),
        'type': obj['@type'],
        'atc': _first_atc_code(obj),
        # cas / unii: "None" only when the key is absent, otherwise the parsed
        # value as-is (unchanged from the previous behaviour).
        'cas': obj.get("cas-number", "None"),
        'rxcui': _rxcui(obj),
        'unii': obj.get("unii", "None"),
    })

print("Done.")

  0%|          | 0/15235 [00:00<?, ?it/s]

Done.


In [7]:
# Build the drug node table: one row per record in drug_dicts, one column per record key.
drugs_df = pd.DataFrame(drug_dicts)
# Bare last expression so the notebook renders the frame as the cell output.
drugs_df

Unnamed: 0,name,drugbank-id,type,atc,cas,rxcui,unii
0,Lepirudin,DB00001,biotech,B01AE02,138068-37-8,237057,Y43GF64R34
1,Cetuximab,DB00002,biotech,L01FE01,205923-56-4,318341,PQX0D8J21J
2,Dornase alfa,DB00003,biotech,R05CB13,143831-71-4,337623,953A26OA1Y
3,Denileukin diftitox,DB00004,biotech,L01XX29,173146-27-5,214470,25E79B5CTM
4,Etanercept,DB00005,biotech,L04AB01,185243-69-0,214555,OP401G7OJC
...,...,...,...,...,...,...,...
15230,AUM-601,DB17382,small molecule,,,,
15231,FN-1501,DB17383,small molecule,,1429515-59-2,,6MC966B505
15232,Tinengotinib,DB17384,small molecule,,2230490-29-4,,WZ9TJ0L9Y8
15233,Lipotecan,DB17385,small molecule,,1432468-79-5,,D47234N30N


In [8]:
# Persist the node table as tab-separated text, without the DataFrame index column.
drugs_df.to_csv("drug_nodes.tsv", sep="\t", index=False)

In [None]:
# Drug -> target-gene edge list: one "<drugbank id>\t<gene name>" line per
# polypeptide of every target of every drug, written to drug_gene.txt.
small_dat = dat

target_dicts = []


def _one_or_many(node):
    """Normalise an xmltodict node (None, a single item, or a list) to a list."""
    if node is None:
        return []
    return node if isinstance(node, list) else [node]


def _drug_accession(drug):
    """Return the drug's DrugBank accession, preferring the '@primary' entry."""
    entries = _one_or_many(drug['drugbank-id'])
    for entry in entries:
        if isinstance(entry, dict) and entry.get('@primary') == 'true':
            return entry['#text']
    first = entries[0]
    return first['#text'] if isinstance(first, dict) else first


def _iter_target_gene_names(drug):
    """Yield the gene name of every polypeptide of every target of `drug`.

    Targets and polypeptides may each be a single dict or a list; both shapes
    are handled the same way, so a single target with several polypeptides is
    no longer dropped. Targets without a polypeptide and polypeptides without
    a gene name are skipped.
    """
    targets = drug.get('targets')
    if not isinstance(targets, dict):
        return
    for target in _one_or_many(targets.get('target')):
        if not isinstance(target, dict):
            continue
        for polypeptide in _one_or_many(target.get('polypeptide')):
            if not isinstance(polypeptide, dict):
                continue
            gene = polypeptide.get('gene-name')
            if isinstance(gene, str) and gene:
                yield gene


# `with` guarantees the file is closed even if a record is malformed.
with open("drug_gene.txt", "w", encoding="utf-8") as fpd:
    # NOTE(review): the header has spaces around the tab while data rows do
    # not; kept byte-identical so existing readers of this file keep working.
    fpd.write("drug \t gene\n")
    for obj in small_dat:
        fileid = _drug_accession(obj)
        for gene_name in _iter_target_gene_names(obj):
            fpd.write(fileid + "\t" + gene_name + "\n")

In [None]:
# Shell out to print the drug-gene edge file (drug_gene.txt).
# NOTE(review): this echoes the whole file into the cell output; consider viewing only the first lines.
!cat drug_gene.txt

In [None]:
# Shell out to recursively archive the JSON directory into JSON.zip.
# NOTE(review): no cell visible here creates ./JSON - presumably it is produced elsewhere; confirm.
!zip -r JSON.zip JSON

In [None]:
# Collect, in sorted order, the name of every file under /kaggle/working whose
# file name contains the substring "json" (names only, without their folder).
node_descriptors = sorted(
    fname
    for _root, _subdirs, fnames in os.walk('/kaggle/working')
    for fname in fnames
    if "json" in fname
)

In [None]:
class Interaction:
    """A directed drug-drug interaction edge.

    Attributes:
        source:  identifier of the drug the interaction starts from.
        dest:    identifier of the drug it points to.
        desc:    free-text description of the interaction.
        adverse: flag marking the interaction as adverse (defaults to False).
    """

    def __init__(self, source, dest, desc, adverse=False):
        self.source, self.dest = source, dest
        self.desc, self.adverse = desc, adverse
        
class Drug:
    """A drug node of the interaction graph.

    Attributes:
        index:    position of the node, as given by the caller.
        name:     human-readable name (the `friendly_name` argument).
        id:       drug identifier (the `drug_id` argument).
        int_list: the node's interactions, stored exactly as passed in.
        stub:     stub flag (defaults to False).
    """

    def __init__(self, index, drug_id, friendly_name, int_list, stub=False):
        self.index, self.name, self.id = index, friendly_name, drug_id
        self.int_list, self.stub = int_list, stub

In [None]:
# Build one Drug node per descriptor file, with its list of Interaction edges.
Nodelist = []

# enumerate(tqdm(...)) rather than tqdm(enumerate(...)) so the bar knows the total.
for idx, desc in enumerate(tqdm(node_descriptors)):
    # Context manager: the descriptor file is closed as soon as it is parsed
    # (previously every handle stayed open for the life of the kernel).
    with open("./JSON/" + desc) as f:
        data = json.load(f)

    # File names are split on '_': first piece is the drug id, second the name.
    # NOTE(review): the second piece keeps any file extension and drops anything
    # after a further '_' - confirm this matches the descriptor naming scheme.
    name_parts = desc.split('_')
    drug_id = name_parts[0]
    friendly_name = name_parts[1]

    try:
        raw_interactions = data['drug-interactions']['drug-interaction']
        if isinstance(raw_interactions, dict):
            # A drug with exactly one interaction is stored as a bare dict.
            raw_interactions = [raw_interactions]
        int_list = [
            Interaction(
                source=drug_id,
                dest=interaction['drugbank-id'],
                desc=interaction['description'],
                # Evaluated per interaction; the flag used to stay True for every
                # interaction following the first one that mentioned "adverse".
                adverse='adverse' in interaction['description'],
            )
            for interaction in raw_interactions
        ]
    except (KeyError, TypeError):
        # No (or malformed) interaction section: keep the original convention
        # of storing None for such drugs.
        int_list = None

    node = Drug(idx, drug_id, friendly_name, int_list)

    Nodelist.append(node)
len(Nodelist)

In [None]:
# Lookup table from drug id to node index, plus the flat list of ids of all
# nodes in Nodelist (for a repeated id, the last node's index is kept).
id2idx = {drug.id: drug.index for drug in Nodelist}
non_stub_ids = [drug.id for drug in Nodelist]

In [None]:
# (Re)create both edge files so that each contains only its header line.
with open("edges.txt", "w") as fp:
    fp.write("src \t dest \t desc \t adv \n")

with open("edges_minified.txt", "w") as fpm:
    fpm.write("src \t dest \n")