In [1]:
import numpy as np 
import pandas as pd 
import json
from tqdm.notebook import tqdm
import xmltodict

import os
for dirname, _, filenames in os.walk('../data/drugs/drugbank_5.1.10/'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

../data/drugs/drugbank_5.1.10/3D structures.sdf
../data/drugs/drugbank_5.1.10/drug links.csv
../data/drugs/drugbank_5.1.10/drug sequences.fasta
../data/drugs/drugbank_5.1.10/drugbank vocabulary.csv
../data/drugs/drugbank_5.1.10/fasta_parse.py
../data/drugs/drugbank_5.1.10/full database.7z
../data/drugs/drugbank_5.1.10/full database.xml
../data/drugs/drugbank_5.1.10/protein_identifiers.csv
../data/drugs/drugbank_5.1.10/structure links.csv
../data/drugs/drugbank_5.1.10/structures.sdf
../data/drugs/drugbank_5.1.10/target_sequences_gene.fasta
../data/drugs/drugbank_5.1.10/target_sequences_protein.fasta
../data/drugs/drugbank_5.1.10/uniprot links.csv
../data/drugs/drugbank_5.1.10/uniprot-db-map.csv
../data/drugs/drugbank_5.1.10/targets\carrier\all.csv
../data/drugs/drugbank_5.1.10/targets\carrier\gene.fasta
../data/drugs/drugbank_5.1.10/targets\carrier\protein.fasta
../data/drugs/drugbank_5.1.10/targets\enzyme\all.csv
../data/drugs/drugbank_5.1.10/targets\enzyme\gene.fasta
../data/drugs/dru

In [2]:
with open('../data/drugs/drugbank_5.1.10/full database.xml', 'r',encoding='utf-8') as xml_file:
    print("Deserializing XML...")
    dat = xml_file.read()
    data_dict = xmltodict.parse(dat)
    print("Done.")
    
dat = data_dict['drugbank']["drug"]
print(len(dat),"objects found.")

# print("Serializing to JSON...")
# with open("drugbank_full.json", 'w') as fp:
#         json.dump(dat, fp, sort_keys=True, indent=4)
# print("Done.")

Deserializing XML...
Done.
15235 objects found.


# Process Drug Nodes

In [3]:
#node table
# drug ids

drug_dicts = []
drug_atc_rel = []

for obj in tqdm(dat):
    
    # name
    objname = obj['name']
    filetype = obj['@type']

    # drugbank-id
    try:
        k,v = obj['drugbank-id'][0].items()
        fileid = v[1]
    except:
        fileid = obj['drugbank-id']['#text']
    
    # cas
    try:
        cas = obj["cas-number"]
    except:
        cas = ""
    
    # atc
    try:
        atc = obj["atc-codes"]["atc-code"]
        if type(atc) == dict:
            atc = atc["@code"]
            drug_atc_rel.append((fileid, atc))
        if type(atc) == list:
            for atc_i in atc:
                atc_k = atc_i["@code"]
                drug_atc_rel.append((fileid, atc_k))
            atc = atc[0]["@code"]
    except:
        if atc is not None:
            atc = obj["atc-codes"]
            drug_atc_rel.append((fileid, atc_k))
        else:
            atc = ""
            drug_atc_rel.append((fileid, atc_k))
    
    #rxcui        
    try:        
        ext_ids = obj['external-identifiers']['external-identifier']
        for item in ext_ids:
            if(item['resource'] == 'RxCUI'):
                rxcui = item['identifier']
    except:
        rxcui=""

    
        
    #Unii
    try:
        unii = obj['unii']
    except:
        unii = ""
    

    #uniprotkb
    try:        
        ext_ids = obj['external-identifiers']['external-identifier']
        for item in ext_ids:
            if(item['resource'] == 'UniProtKB'):
                uniprotkb = item['identifier']
    except:
        uniprotkb=""


    drug_row = {
        'name': objname,
        'drugbank-id': fileid,
        'type':filetype,
        'atc':atc,
        'cas':cas,
        'rxcui':rxcui,
        'unii':unii,
        'uniprot_kb':uniprotkb
    }
    drug_dicts.append(drug_row)
    
            
print("Done.")

  0%|          | 0/15235 [00:00<?, ?it/s]

Done.


In [4]:
atc_df = pd.DataFrame(drug_atc_rel, columns=['drugbank_id','atc'])
atc_df

Unnamed: 0,drugbank_id,atc
0,DB00001,B01AE02
1,DB00002,L01FE01
2,DB00003,R05CB13
3,DB00004,L01XX29
4,DB00005,L04AB01
...,...,...
17210,DB17382,S01AA18
17211,DB17383,S01AA18
17212,DB17384,S01AA18
17213,DB17385,S01AA18


In [5]:
atc_df.to_csv("./temp/drug_atcs.csv", sep="\t", index=False)

In [6]:
drugs_df = pd.DataFrame(drug_dicts)
drugs_df

Unnamed: 0,name,drugbank-id,type,atc,cas,rxcui,unii,uniprot_kb
0,Lepirudin,DB00001,biotech,B01AE02,138068-37-8,237057,Y43GF64R34,P01050
1,Cetuximab,DB00002,biotech,L01FE01,205923-56-4,318341,PQX0D8J21J,P01050
2,Dornase alfa,DB00003,biotech,R05CB13,143831-71-4,337623,953A26OA1Y,P24855
3,Denileukin diftitox,DB00004,biotech,L01XX29,173146-27-5,214470,25E79B5CTM,P00587
4,Etanercept,DB00005,biotech,L04AB01,185243-69-0,214555,OP401G7OJC,P20333
...,...,...,...,...,...,...,...,...
15230,AUM-601,DB17382,small molecule,,,,,
15231,FN-1501,DB17383,small molecule,,1429515-59-2,,6MC966B505,
15232,Tinengotinib,DB17384,small molecule,,2230490-29-4,,WZ9TJ0L9Y8,
15233,Lipotecan,DB17385,small molecule,,1432468-79-5,,D47234N30N,


In [7]:
drugs_syn_df = pd.read_csv(r"../data/drugs/drugbank_5.1.10/drugbank vocabulary.csv")
drugs_syn_df.fillna("",inplace=True)
drugs_syn_df = drugs_syn_df[['DrugBank ID', 'Accession Numbers', 'Synonyms', 'Standard InChI Key']]
drugs_syn_df.columns = ['drugbank-id', 'drugbank-id-2', 'synonyms', 'InChI']
drugs_syn_df

Unnamed: 0,drugbank-id,drugbank-id-2,synonyms,InChI
0,DB00001,BIOD00024 | BTD00024,"[Leu1, Thr2]-63-desulfohirudin | Desulfatohiru...",
1,DB00002,BIOD00071 | BTD00071,Cetuximab | Cétuximab | Cetuximabum,
2,DB00003,BIOD00001 | BTD00001,Deoxyribonuclease (human clone 18-1 protein mo...,
3,DB00004,BIOD00084 | BTD00084,Denileukin | Denileukin diftitox | Interleukin...,
4,DB00005,BIOD00052 | BTD00052,Etanercept | etanercept-szzs | etanercept-ykro...,
...,...,...,...,...
15230,DB17382,,"(R,E)-3-(5-(2-(2,5-difluorophenyl)pyrrolidin-1...",
15231,DB17383,,"4-((7h-pyrrolo (2,3-d)pyrimidin-4-yl)amino)-n-...",VXLAKHWYGRKCGI-UHFFFAOYSA-N
15232,DB17384,,"4-(5-(2-Chlorophenyl)-3-Methyl-2,10-Dihydropyr...",DQFCVOOFMXEPOC-UHFFFAOYSA-N
15233,DB17385,,"Lipothecan free base | Propanoic acid, 2-(((2,...",JCCCLGDYMMTBPM-HXDHBHDHSA-N


In [8]:
drugs_df = pd.merge(left=drugs_df, right=drugs_syn_df, on='drugbank-id')
drugs_df.fillna("", inplace=True)
drugs_df

Unnamed: 0,name,drugbank-id,type,atc,cas,rxcui,unii,uniprot_kb,drugbank-id-2,synonyms,InChI
0,Lepirudin,DB00001,biotech,B01AE02,138068-37-8,237057,Y43GF64R34,P01050,BIOD00024 | BTD00024,"[Leu1, Thr2]-63-desulfohirudin | Desulfatohiru...",
1,Cetuximab,DB00002,biotech,L01FE01,205923-56-4,318341,PQX0D8J21J,P01050,BIOD00071 | BTD00071,Cetuximab | Cétuximab | Cetuximabum,
2,Dornase alfa,DB00003,biotech,R05CB13,143831-71-4,337623,953A26OA1Y,P24855,BIOD00001 | BTD00001,Deoxyribonuclease (human clone 18-1 protein mo...,
3,Denileukin diftitox,DB00004,biotech,L01XX29,173146-27-5,214470,25E79B5CTM,P00587,BIOD00084 | BTD00084,Denileukin | Denileukin diftitox | Interleukin...,
4,Etanercept,DB00005,biotech,L04AB01,185243-69-0,214555,OP401G7OJC,P20333,BIOD00052 | BTD00052,Etanercept | etanercept-szzs | etanercept-ykro...,
...,...,...,...,...,...,...,...,...,...,...,...
15230,AUM-601,DB17382,small molecule,,,,,,,"(R,E)-3-(5-(2-(2,5-difluorophenyl)pyrrolidin-1...",
15231,FN-1501,DB17383,small molecule,,1429515-59-2,,6MC966B505,,,"4-((7h-pyrrolo (2,3-d)pyrimidin-4-yl)amino)-n-...",VXLAKHWYGRKCGI-UHFFFAOYSA-N
15232,Tinengotinib,DB17384,small molecule,,2230490-29-4,,WZ9TJ0L9Y8,,,"4-(5-(2-Chlorophenyl)-3-Methyl-2,10-Dihydropyr...",DQFCVOOFMXEPOC-UHFFFAOYSA-N
15233,Lipotecan,DB17385,small molecule,,1432468-79-5,,D47234N30N,,,"Lipothecan free base | Propanoic acid, 2-(((2,...",JCCCLGDYMMTBPM-HXDHBHDHSA-N


In [9]:
drugs_df.to_csv(r"./temp/drug_nodes_identifiers.tsv", sep="\t", index=False)
drugs_df.to_excel(r"./temp/drug_nodes_identifiers.xlsx", index=False)

### TODO: Process Drug structures

In [10]:
from rdkit.Chem import PandasTools
SDFFile = "../data/drugs/drugbank_5.1.10/structures.sdf"
struct_df = PandasTools.LoadSDF(SDFFile)
struct_df

[16:04:25] Explicit valence for atom # 13 Cl, 5, is greater than permitted
[16:04:25] ERROR: Could not sanitize molecule ending on line 298551
[16:04:25] ERROR: Explicit valence for atom # 13 Cl, 5, is greater than permitted
[16:04:25] Explicit valence for atom # 19 O, 3, is greater than permitted
[16:04:25] ERROR: Could not sanitize molecule ending on line 412786
[16:04:25] ERROR: Explicit valence for atom # 19 O, 3, is greater than permitted
[16:04:26] Explicit valence for atom # 1 N, 4, is greater than permitted
[16:04:26] ERROR: Could not sanitize molecule ending on line 540739
[16:04:26] ERROR: Explicit valence for atom # 1 N, 4, is greater than permitted
[16:04:26] Explicit valence for atom # 1 N, 4, is greater than permitted
[16:04:26] ERROR: Could not sanitize molecule ending on line 598037
[16:04:26] ERROR: Explicit valence for atom # 1 N, 4, is greater than permitted
[16:04:26] Explicit valence for atom # 12 N, 4, is greater than permitted
[16:04:26] ERROR: Could not sanitize

Unnamed: 0,DATABASE_ID,DATABASE_NAME,SMILES,INCHI_IDENTIFIER,INCHI_KEY,FORMULA,MOLECULAR_WEIGHT,EXACT_MASS,JCHEM_ACCEPTOR_COUNT,JCHEM_ATOM_COUNT,...,DRUGBANK_ID,SECONDARY_ACCESSION_NUMBERS,DRUG_GROUPS,GENERIC_NAME,SYNONYMS,PRODUCTS,INTERNATIONAL_BRANDS,ID,ROMol,SALTS
0,DB00006,drugbank,CC[C@H](C)[C@H](NC(=O)[C@H](CCC(O)=O)NC(=O)[C@...,InChI=1S/C98H138N24O33/c1-5-52(4)82(96(153)122...,OIRCOABEOLEUMC-GEJPAHFPSA-N,C98H138N24O33,2180.2853,2178.985813062,37,293,...,DB00006,BTD00076; EXPT03302; BIOD00076; DB02351,approved; investigational,Bivalirudin,Bivalirudin; Bivalirudina; Bivalirudinum,Angiomax; Angiomax RTU; Angiox; Bivalirudin; B...,Angiox; Hirulog,,<rdkit.Chem.rdchem.Mol object at 0x00000227AFA...,
1,DB00007,drugbank,CCNC(=O)[C@@H]1CCCN1C(=O)[C@H](CCCNC(N)=N)NC(=...,InChI=1S/C59H84N16O12/c1-6-63-57(86)48-14-10-2...,GFIJNRVAKGFPGQ-LIJARHBVSA-N,C59H84N16O12,1209.3983,1208.645462232,16,171,...,DB00007,BTD00009; BIOD00009,approved; investigational,Leuprolide,Leuprorelin; Leuprorelina; Leuproreline; Leupr...,Camcevi; Eligard; Fensolvi; Leuprolide Acetate...,Camcevi; Leuplin; LeuProMaxx; Memryte; Prostap...,,<rdkit.Chem.rdchem.Mol object at 0x00000227AFA...,Leuprolide acetate; Leuprolide mesylate
2,DB00014,drugbank,CC(C)C[C@H](NC(=O)[C@@H](COC(C)(C)C)NC(=O)[C@H...,InChI=1S/C59H84N18O14/c1-31(2)22-40(49(82)68-3...,BLCLNMBMMGCOAS-URPVMXJPSA-N,C59H84N18O14,1269.4105,1268.641439486,18,175,...,DB00014,BTD00113; BIOD00113,approved,Goserelin,Goserelin; Goserelina,Zoladex; Zoladex LA,,[NO NAME],<rdkit.Chem.rdchem.Mol object at 0x00000227AFA...,Goserelin acetate
3,DB00027,drugbank,CC(C)C[C@@H](NC(=O)CNC(=O)[C@@H](NC=O)C(C)C)C(...,InChI=1S/C96H135N19O16/c1-50(2)36-71(105-79(11...,NDAYQJDHGXTBJL-MWWSRJDJSA-N,C96H135N19O16,1811.253,1810.033419343,16,266,...,DB00027,BTD00036; BIOD00036,approved,Gramicidin D,Bacillus brevis gramicidin D; Gramicidin; Gram...,Antibiotic Cream; Antibiotic Cream Plus Pain R...,Sofradex,,<rdkit.Chem.rdchem.Mol object at 0x00000227AFA...,
4,DB00035,drugbank,NC(=O)CC[C@@H]1NC(=O)[C@H](CC2=CC=CC=C2)NC(=O)...,InChI=1S/C46H64N14O12S2/c47-35(62)15-14-29-40(...,NFLWUMRGJYTJIN-PNIOQBSNSA-N,C46H64N14O12S2,1069.22,1068.426955905,15,138,...,DB00035,BTD00112; BTD00061; BIOD00112; BIOD00061,approved,Desmopressin,1-(3-mercaptopropionic acid)-8-D-arginine-vaso...,Apo-desmopressin; Bipazen; DDAVP Rhinal Tube; ...,Adiuretin; DesmoMelt,,<rdkit.Chem.rdchem.Mol object at 0x00000227AFA...,Desmopressin acetate; Desmopressin acetate anh...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11581,DB17379,drugbank,CC(C)C1=C(O)C(O)=C(C=O)C2=C(O)C(=C(C)C=C12)C1=...,InChI=1S/C30H30O8/c1-11(2)19-15-7-13(5)21(27(3...,QBKSWRVVCFFDOT-UHFFFAOYSA-N,C30H30O8,518.5544,518.194067936,8,68,...,DB17379,,investigational,(-)-Gossypol,"(r)-2,2'-bis(8-formyl-1,6,7-trihydroxy-5-isopr...",,,,<rdkit.Chem.rdchem.Mol object at 0x00000227C57...,
11582,DB17383,drugbank,CN1CCN(CC2=CC=C(NC(=O)C3=NNC=C3NC3=C4C=CNC4=NC...,InChI=1S/C22H25N9O/c1-30-8-10-31(11-9-30)13-15...,VXLAKHWYGRKCGI-UHFFFAOYSA-N,C22H25N9O,431.504,431.218206461,,,...,DB17383,,investigational,FN-1501,"4-((7h-pyrrolo (2,3-d)pyrimidin-4-yl)amino)-n-...",,,,<rdkit.Chem.rdchem.Mol object at 0x00000227C57...,
11583,DB17384,drugbank,CC1=C2N=C(C3=CC=CC=C3Cl)C3=C(NC2=NN1)C=C(N=C3)...,InChI=1S/C20H19ClN6O/c1-12-18-20(26-25-12)23-1...,DQFCVOOFMXEPOC-UHFFFAOYSA-N,C20H19ClN6O,394.86,394.130887,,,...,DB17384,,investigational,Tinengotinib,"4-(5-(2-Chlorophenyl)-3-Methyl-2,10-Dihydropyr...",,,,<rdkit.Chem.rdchem.Mol object at 0x00000227C57...,
11584,DB17385,drugbank,CC[C@@]1(OC(=O)C(C)ON=C2C3=C(C4=C2C=C(C=C4[N+]...,InChI=1S/C39H30N8O15/c1-5-39(26-13-30-34-18(14...,JCCCLGDYMMTBPM-HXDHBHDHSA-N,C39H30N8O15,850.71,850.183062298,,,...,DB17385,,investigational,Lipotecan,"Lipothecan free base; Propanoic acid, 2-(((2,4...",,,,<rdkit.Chem.rdchem.Mol object at 0x00000227C57...,Lipotecan hydrochloride


In [11]:
struct_df = struct_df[['DATABASE_ID', 'SMILES']]
struct_df.dropna(inplace=True)
struct_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  struct_df.dropna(inplace=True)


Unnamed: 0,DATABASE_ID,SMILES
0,DB00006,CC[C@H](C)[C@H](NC(=O)[C@H](CCC(O)=O)NC(=O)[C@...
1,DB00007,CCNC(=O)[C@@H]1CCCN1C(=O)[C@H](CCCNC(N)=N)NC(=...
2,DB00014,CC(C)C[C@H](NC(=O)[C@@H](COC(C)(C)C)NC(=O)[C@H...
3,DB00027,CC(C)C[C@@H](NC(=O)CNC(=O)[C@@H](NC=O)C(C)C)C(...
4,DB00035,NC(=O)CC[C@@H]1NC(=O)[C@H](CC2=CC=CC=C2)NC(=O)...
...,...,...
11581,DB17379,CC(C)C1=C(O)C(O)=C(C=O)C2=C(O)C(=C(C)C=C12)C1=...
11582,DB17383,CN1CCN(CC2=CC=C(NC(=O)C3=NNC=C3NC3=C4C=CNC4=NC...
11583,DB17384,CC1=C2N=C(C3=CC=CC=C3Cl)C3=C(NC2=NN1)C=C(N=C3)...
11584,DB17385,CC[C@@]1(OC(=O)C(C)ON=C2C3=C(C4=C2C=C(C=C4[N+]...


# Targets & MOA edges

In [12]:
def extract_targets(term1='enzymes', term2='enzyme'):
    target_dicts = []
    ex = []
    
    if term1 == 'targets': #use legacy code for targets   
        print("using legacy code.")     
        for i, obj in enumerate(dat):
            # get drug id
            drug_id = None
            action = None
            drug_type = obj['@type']
            
            try:
                k,v = obj['drugbank-id'][0].items()
                drug_id = v[1]
            except:
                drug_id = obj['drugbank-id']['#text']
                
            # get target uniprotkb id
            uniprotkb_id = None
            uniprotac_id = None
            gene_name = None

            if obj['targets'] != None:
                # if a single target is stored as a dict
                if type(obj['targets']['target'])==dict:
                    try:
                        uniprotkb_id = obj['targets']['target']['polypeptide']['external-identifiers']['external-identifier'][-2]['identifier']
                        uniprotac_id = obj['targets']['target']['polypeptide']['external-identifiers']['external-identifier'][-1]['identifier']
                        gene_name = obj['targets']['target']['polypeptide']['gene-name']
                        action = obj['targets']['target']['actions']['action']
                        target_dicts.append([drug_id, drug_type, uniprotkb_id, uniprotac_id, gene_name, action])
                    except:
                        pass

                else:
                    tgts = len(obj['targets']['target'])
                    for idx in range(tgts):
                        if 'polypeptide' in obj['targets']['target'][idx].keys():
                            if(type(obj['targets']['target'][idx]['polypeptide'])==dict):
                                try:
                                    uniprotkb_id = obj['targets']['target'][idx]['polypeptide']['external-identifiers']['external-identifier'][-2]['identifier']
                                    uniprotac_id = obj['targets']['target'][idx]['polypeptide']['external-identifiers']['external-identifier'][-1]['identifier']
                                    gene_name = obj['targets']['target'][idx]['polypeptide']['gene-name']
                                    action = obj['targets']['target'][idx]['actions']['action']
                                    target_dicts.append([drug_id, drug_type, uniprotkb_id, uniprotac_id, gene_name, action])
                                except:
                                    pass
                            else:
                                ppds = len(obj['targets']['target'][idx]['polypeptide'])
                                for idx_ in range(ppds):
                                    try:
                                        #print(obj['targets']['target'][idx]['polypeptide'][idx_]['gene-name'])
                                        gene_name = obj['targets']['target'][idx]['polypeptide'][idx_]['gene-name']
                                        uniprotkb_id = obj['targets']['target'][idx]['polypeptide'][idx_]['external-identifiers']['external-identifier'][-2]['identifier']
                                        uniprotac_id = obj['targets']['target'][idx]['polypeptide'][idx_]['external-identifiers']['external-identifier'][-1]['identifier']
                                        action = obj['targets']['target'][idx]['actions']['action']
                                        target_dicts.append([drug_id, drug_type, uniprotkb_id, uniprotac_id, gene_name, action])
                                    except:
                                        continue
            else:
                target_dicts.append([drug_id, drug_type, uniprotkb_id, uniprotac_id, gene_name, action])
        return pd.DataFrame(data=target_dicts, columns=["drugbank-id", "drug-type" ,"uniprotkb-id", "uniprotac-id","gene-name","action"])
            
    else:   
        print("using modern code.")
        for i, obj in enumerate(dat):
            # get drug id
            drug_id = None
            action = None
            drug_type = obj['@type']

            try:
                k,v = obj['drugbank-id'][0].items()
                drug_id = v[1]
            except:
                drug_id = obj['drugbank-id']['#text']
                
            # get target uniprotkb id
            uniprotkb_id = None
            uniprotac_id = None
            gene_name = None

            if obj[term1] != None:
                if type(obj[term1][term2])==dict:
                    # print(obj[term1][term2]['actions']['action'])
                    # break

                    try:
                        try:
                            uniprotkb_id = obj[term1][term2]['polypeptide']['external-identifiers']['external-identifier'][-2]['identifier']
                        except:
                            uniprotkb_id = obj[term1][term2]['polypeptide']['@id']
                            
                        uniprotac_id = obj[term1][term2]['polypeptide']['external-identifiers']['external-identifier'][-1]['identifier']
                        gene_name = obj[term1][term2]['polypeptide']['gene-name']
                        try:
                            action = obj[term1][term2]['actions']['action']
                        except:
                            action = "unknown"
                        target_dicts.append([drug_id, drug_type, uniprotkb_id, uniprotac_id, gene_name, action])
                    except:
                        try:
                            for idx in range(len(obj[term1][term2]['polypeptide'])):
                                try:
                                    uniprotkb_id = obj[term1][term2]['polypeptide'][idx]['@id']
                                except:
                                    uniprotkb_id = obj[term1][term2]['polypeptide'][idx]['external-identifiers']['external-identifier'][-2]['identifier']
                                    
                                uniprotac_id = obj[term1][term2]['polypeptide'][idx]['external-identifiers']['external-identifier'][-1]['identifier']
                                gene_name = obj[term1][term2]['polypeptide'][idx]['gene-name']
                                try:
                                    action = obj[term1][term2]['actions']['action']
                                except:
                                    action = "unknown"
                                target_dicts.append([drug_id, drug_type, uniprotkb_id, uniprotac_id, gene_name, action])
                        except:
                            ex.append(obj)
                    
                else: # it is a list
                    if type(obj[term1][term2])==list:
                        for idx_ in range(len(obj[term1][term2])):
                            try:
                                try:
                                    uniprotkb_id = obj[term1][term2][idx_]['polypeptide']['external-identifiers']['external-identifier'][-2]['identifier']
                                    
                                    
                                except:
                                    uniprotkb_id = obj[term1][term2][idx_]['polypeptide']['@id']
                                    
                                uniprotac_id = obj[term1][term2][idx_]['polypeptide']['external-identifiers']['external-identifier'][-1]['identifier']
                                gene_name = obj[term1][term2][idx_]['polypeptide']['gene-name']
                                try:
                                    action = obj[term1][term2][idx_]['actions']['action']
                                except:
                                    action = "unknown"
                                target_dicts.append([drug_id, drug_type, uniprotkb_id, uniprotac_id, gene_name, action])
                            except:
                                try:
                                    for idx in range(len(obj[term1][term2][idx_]['polypeptide'])):
                                        try:
                                            uniprotkb_id = obj[term1][term2][idx_]['polypeptide'][idx]['@id']
                                        except:
                                            uniprotkb_id = obj[term1][term2][idx_]['polypeptide'][idx]['external-identifiers']['external-identifier'][-2]['identifier']
                                            
                                        uniprotac_id = obj[term1][term2][idx_]['polypeptide'][idx]['external-identifiers']['external-identifier'][-1]['identifier']
                                        gene_name = obj[term1][term2][idx_]['polypeptide'][idx]['gene-name']
                                        try:
                                            action = obj[term1][term2][idx_]['actions']['action']
                                        except:
                                            action = "unknown"
                                        target_dicts.append([drug_id, drug_type, uniprotkb_id, uniprotac_id, gene_name, action])
                                except:
                                    ex.append(obj)                                
        return pd.DataFrame(data=target_dicts, columns=["drugbank-id", "drug-type" ,"uniprotkb-id", "uniprotac-id","gene-name","action"])
    

In [13]:
tgt_df_enz = extract_targets(term1 = 'enzymes', term2 = 'enzyme')
tgt_df_enz

using modern code.


Unnamed: 0,drugbank-id,drug-type,uniprotkb-id,uniprotac-id,gene-name,action
0,DB00006,small molecule,P05164,PERM_HUMAN,MPO,inhibitor
1,DB00008,biotech,P05177,CP1A2_HUMAN,CYP1A2,inhibitor
2,DB00011,biotech,P05177,CP1A2_HUMAN,CYP1A2,inhibitor
3,DB00013,biotech,P39900,MMP12_HUMAN,MMP12,substrate
4,DB00018,biotech,P05177,CP1A2_HUMAN,CYP1A2,inhibitor
...,...,...,...,...,...,...
6063,DB16703,small molecule,P22309,UD11_HUMAN,UGT1A1,inhibitor
6064,DB16732,biotech,P08684,CP3A4_HUMAN,CYP3A4,substrate
6065,DB17083,small molecule,P11712,CP2C9_HUMAN,CYP2C9,substrate
6066,DB17083,small molecule,P08684,CP3A4_HUMAN,CYP3A4,substrate


In [14]:
tgt_df_tns = extract_targets(term1 = 'transporters', term2 = 'transporter')
tgt_df_tns

using modern code.


Unnamed: 0,drugbank-id,drug-type,uniprotkb-id,uniprotac-id,gene-name,action
0,DB00027,small molecule,P08183,MDR1_HUMAN,ABCB1,"[substrate, inhibitor]"
1,DB00052,biotech,P08183,MDR1_HUMAN,ABCB1,substrate
2,DB00067,small molecule,Q92887,MRP2_HUMAN,ABCC2,substrate
3,DB00080,small molecule,P08183,MDR1_HUMAN,ABCB1,substrate
4,DB00091,small molecule,P08183,MDR1_HUMAN,ABCB1,"[substrate, inhibitor]"
...,...,...,...,...,...,...
3368,DB17083,small molecule,Q9Y6L6,SO1B1_HUMAN,SLCO1B1,"[substrate, inhibitor]"
3369,DB17083,small molecule,Q9NPD5,SO1B3_HUMAN,SLCO1B3,"[substrate, inhibitor]"
3370,DB17083,small molecule,Q8TCC7,S22A8_HUMAN,SLC22A8,"[substrate, inhibitor]"
3371,DB17083,small molecule,Q9UNQ0,ABCG2_HUMAN,ABCG2,substrate


In [15]:
tgt_df_car = extract_targets(term1 = 'carriers', term2 = 'carrier')
tgt_df_car

using modern code.


Unnamed: 0,drugbank-id,drug-type,uniprotkb-id,uniprotac-id,gene-name,action
0,DB00023,biotech,P05543,THBG_HUMAN,SERPINA7,inhibitor
1,DB00059,biotech,P05543,THBG_HUMAN,SERPINA7,inhibitor
2,DB00070,biotech,P02768,ALBU_HUMAN,ALB,inhibitor
3,DB00080,small molecule,P02768,ALBU_HUMAN,ALB,binder
4,DB00080,small molecule,P02763,A1AG1_HUMAN,ORM1,binder
...,...,...,...,...,...,...
991,DB16165,small molecule,P02768,ALBU_HUMAN,ALB,binder
992,DB16699,biotech,P02768,ALBU_HUMAN,ALB,ligand
993,DB16703,small molecule,P02768,ALBU_HUMAN,ALB,binder
994,DB16703,small molecule,P02763,A1AG1_HUMAN,ORM1,binder


In [16]:
tgt_df = extract_targets(term1 = 'targets', term2 = 'target')
tgt_df

using legacy code.


Unnamed: 0,drugbank-id,drug-type,uniprotkb-id,uniprotac-id,gene-name,action
0,DB00001,biotech,P00734,THRB_HUMAN,F2,inhibitor
1,DB00002,biotech,P00533,EGFR_HUMAN,EGFR,binder
2,DB00002,biotech,O75015,FCG3B_HUMAN,FCGR3B,binder
3,DB00002,biotech,P02745,C1QA_HUMAN,C1QA,binder
4,DB00002,biotech,P02746,C1QB_HUMAN,C1QB,binder
...,...,...,...,...,...,...
18612,DB17382,small molecule,,,,
18613,DB17383,small molecule,,,,
18614,DB17384,small molecule,,,,
18615,DB17385,small molecule,,,,


In [17]:
targets_df = pd.concat([tgt_df, tgt_df_enz, tgt_df_car, tgt_df_tns])
targets_df

Unnamed: 0,drugbank-id,drug-type,uniprotkb-id,uniprotac-id,gene-name,action
0,DB00001,biotech,P00734,THRB_HUMAN,F2,inhibitor
1,DB00002,biotech,P00533,EGFR_HUMAN,EGFR,binder
2,DB00002,biotech,O75015,FCG3B_HUMAN,FCGR3B,binder
3,DB00002,biotech,P02745,C1QA_HUMAN,C1QA,binder
4,DB00002,biotech,P02746,C1QB_HUMAN,C1QB,binder
...,...,...,...,...,...,...
3368,DB17083,small molecule,Q9Y6L6,SO1B1_HUMAN,SLCO1B1,"[substrate, inhibitor]"
3369,DB17083,small molecule,Q9NPD5,SO1B3_HUMAN,SLCO1B3,"[substrate, inhibitor]"
3370,DB17083,small molecule,Q8TCC7,S22A8_HUMAN,SLC22A8,"[substrate, inhibitor]"
3371,DB17083,small molecule,Q9UNQ0,ABCG2_HUMAN,ABCG2,substrate


In [18]:
targets_df_expl = []

for idx, row in targets_df.iterrows():
    moa = row.action
    row_l = row.to_list()
    row_l = row_l[:-1]
    if type(moa) == list:
        for a in moa:
            targets_df_expl.append(row_l + [a])
    else:
        targets_df_expl.append(row.to_list())

targets_df_expl = pd.DataFrame(data=targets_df_expl, columns=["drugbank-id", "drug-type" ,"uniprotkb-id", "uniprotac-id","gene-name","action"])    
targets_df_expl

Unnamed: 0,drugbank-id,drug-type,uniprotkb-id,uniprotac-id,gene-name,action
0,DB00001,biotech,P00734,THRB_HUMAN,F2,inhibitor
1,DB00002,biotech,P00533,EGFR_HUMAN,EGFR,binder
2,DB00002,biotech,O75015,FCG3B_HUMAN,FCGR3B,binder
3,DB00002,biotech,P02745,C1QA_HUMAN,C1QA,binder
4,DB00002,biotech,P02746,C1QB_HUMAN,C1QB,binder
...,...,...,...,...,...,...
31027,DB17083,small molecule,Q9NPD5,SO1B3_HUMAN,SLCO1B3,inhibitor
31028,DB17083,small molecule,Q8TCC7,S22A8_HUMAN,SLC22A8,substrate
31029,DB17083,small molecule,Q8TCC7,S22A8_HUMAN,SLC22A8,inhibitor
31030,DB17083,small molecule,Q9UNQ0,ABCG2_HUMAN,ABCG2,substrate


In [19]:
targets_df_expl.drop_duplicates(inplace=True)
targets_df_expl

Unnamed: 0,drugbank-id,drug-type,uniprotkb-id,uniprotac-id,gene-name,action
0,DB00001,biotech,P00734,THRB_HUMAN,F2,inhibitor
1,DB00002,biotech,P00533,EGFR_HUMAN,EGFR,binder
2,DB00002,biotech,O75015,FCG3B_HUMAN,FCGR3B,binder
3,DB00002,biotech,P02745,C1QA_HUMAN,C1QA,binder
4,DB00002,biotech,P02746,C1QB_HUMAN,C1QB,binder
...,...,...,...,...,...,...
31027,DB17083,small molecule,Q9NPD5,SO1B3_HUMAN,SLCO1B3,inhibitor
31028,DB17083,small molecule,Q8TCC7,S22A8_HUMAN,SLC22A8,substrate
31029,DB17083,small molecule,Q8TCC7,S22A8_HUMAN,SLC22A8,inhibitor
31030,DB17083,small molecule,Q9UNQ0,ABCG2_HUMAN,ABCG2,substrate


In [25]:
drugs_df[['drugbank-id','name','atc']]

Unnamed: 0,drugbank-id,name,atc
0,DB00001,Lepirudin,B01AE02
1,DB00002,Cetuximab,L01FE01
2,DB00003,Dornase alfa,R05CB13
3,DB00004,Denileukin diftitox,L01XX29
4,DB00005,Etanercept,L04AB01
...,...,...,...
15230,DB17382,AUM-601,
15231,DB17383,FN-1501,
15232,DB17384,Tinengotinib,
15233,DB17385,Lipotecan,


In [27]:
pd.merge(left=targets_df_expl, right=drugs_df[['drugbank-id','name','atc']], on='drugbank-id', how='left').to_excel('dg-tgt-moa.xlsx')

In [28]:
temp = pd.merge(left=targets_df_expl, right=drugs_df[['drugbank-id','name','atc']], on='drugbank-id', how='left')

In [29]:
temp.loc[(temp['drugbank-id'] == 'DB00091') & (temp['gene-name'] == 'CYP3A4')]

Unnamed: 0,drugbank-id,drug-type,uniprotkb-id,uniprotac-id,gene-name,action,name,atc
19073,DB00091,small molecule,P08684,CP3A4_HUMAN,CYP3A4,substrate,Cyclosporine,L04AD01
19074,DB00091,small molecule,P08684,CP3A4_HUMAN,CYP3A4,inhibitor,Cyclosporine,L04AD01


In [200]:
# execute above cells

In [201]:
t = list(set(targets_df_expl["uniprotkb-id"].to_list()))
with open("./temp/tgt_uniprot_kb.txt","w+") as fp:
    for item in t:
        fp.write(str(item) + "\n")


In [15]:
prot_struct_raw_df = pd.read_excel("./temp/uniprot-download_true_fields_accession_2Cid_2Csequence_2Cft_strand_2-2023.06.21-09.46.39.25.xlsx")
prot_struct_raw_df

  warn("Workbook contains no default style, apply openpyxl's default")


Unnamed: 0,From,Entry,Entry Name,Sequence,Beta strand,Turn,Helix,Active site
0,P08506,P08506,DACC_ECOLI,MTQYSSLLRGLAAGSAFLFLFAPTAFAAEQTVEAPSVDARAWILMD...,"STRAND 39..46; /evidence=""ECO:0007829|PDB:3ITA...","TURN 47..49; /evidence=""ECO:0007829|PDB:3ITA"";...","HELIX 65..67; /evidence=""ECO:0007829|PDB:3ITA""...","ACT_SITE 66; /note=""Acyl-ester intermediate""; ..."
1,P36578,P36578,RL4_HUMAN,MACARPLISVYSEKGESSGKNVTLPAVFKAPIRPDIVNFVHTNLRK...,,,,
2,P45381,P45381,ACY2_HUMAN,MTSCHIAEEHIQKVAIFGGTHGNELTGVFLVKHWLENGAEIQRTGL...,"STRAND 14..18; /evidence=""ECO:0007829|PDB:4MXU...","TURN 245..249; /evidence=""ECO:0007829|PDB:4MXU...","HELIX 25..34; /evidence=""ECO:0007829|PDB:4MXU""...","ACT_SITE 178; /evidence=""ECO:0000269|PubMed:17..."
3,Q96LB4,Q96LB4,VATG3_HUMAN,MTSQSQGIHQLLQAEKRAKDKLEEAKKRKGKRLKQAKEEAMVEIDQ...,,,,
4,Q8NI60,Q8NI60,COQ8A_HUMAN,MAAILGDTIMVAKGLVKLTQAAVETHLQHLGIGGELIMAARALQST...,"STRAND 284..289; /evidence=""ECO:0007829|PDB:4P...","TURN 362..364; /evidence=""ECO:0007829|PDB:4PED...","HELIX 260..269; /evidence=""ECO:0007829|PDB:4PE...","ACT_SITE 488; /note=""Proton acceptor""; /eviden..."
...,...,...,...,...,...,...,...,...
2348,P47944,P47944,MT4_HUMAN,MDPRECVCMSGGICMCGDNCKCTTCNCKTYWKSCCPCCPPGCAKCA...,,,,
2349,O95718,O95718,ERR2_HUMAN,MSSDDRHLGSSCGSFIKTEPSSPSSGIDALSHHSPSGSSDASGGFG...,"STRAND 111..119; /evidence=""ECO:0007829|PDB:1L...","TURN 104..107; /evidence=""ECO:0007829|PDB:1LO1...","HELIX 121..132; /evidence=""ECO:0007829|PDB:1LO...",
2350,P20035,P20035,HGXR_PLAFG,MPIPNNPGAGENAFDPVFVNDDDGYDLDSFMIPAHYKKYLTKVLVP...,"STRAND 40..45; /evidence=""ECO:0007829|PDB:3OZF...","TURN 77..79; /evidence=""ECO:0007829|PDB:3OZF""","HELIX 27..29; /evidence=""ECO:0007829|PDB:3OZF""...","ACT_SITE 148; /note=""Proton acceptor""; /eviden..."
2351,Q9UF02,Q9UF02,CCG5_HUMAN,MSACGRKALTLLSSVFAVCGLGLLGIAVSTDYWLYLEEGVIVPQNQ...,,,,


In [16]:
prot_struct_raw_df['Active site'].loc[prot_struct_raw_df['Active site'].isna()]

1       NaN
3       NaN
5       NaN
6       NaN
8       NaN
       ... 
2347    NaN
2348    NaN
2349    NaN
2351    NaN
2352    NaN
Name: Active site, Length: 1553, dtype: object

In [17]:
targets_df_expl.to_excel(r"./temp/drug_target_edges.xlsx", index=False)
targets_df_expl.dropna().to_excel(r"./temp/drug_target_edges_nonulls.xlsx", index=False)

## Build drug-target-moa dataset

In [133]:
smiles_df = struct_df
smiles_df.columns = ["drugbank-id","structure"]
smiles_df["type"] = ['SMILES']*len(smiles_df)
smiles_df

# drug_target_moa_df = pd.merge()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  smiles_df["type"] = ['SMILES']*len(smiles_df)


Unnamed: 0,drugbank-id,structure,type
0,DB00006,CC[C@H](C)[C@H](NC(=O)[C@H](CCC(O)=O)NC(=O)[C@...,SMILES
1,DB00007,CCNC(=O)[C@@H]1CCCN1C(=O)[C@H](CCCNC(N)=N)NC(=...,SMILES
2,DB00014,CC(C)C[C@H](NC(=O)[C@@H](COC(C)(C)C)NC(=O)[C@H...,SMILES
3,DB00027,CC(C)C[C@@H](NC(=O)CNC(=O)[C@@H](NC=O)C(C)C)C(...,SMILES
4,DB00035,NC(=O)CC[C@@H]1NC(=O)[C@H](CC2=CC=CC=C2)NC(=O)...,SMILES
...,...,...,...
11581,DB17379,CC(C)C1=C(O)C(O)=C(C=O)C2=C(O)C(=C(C)C=C12)C1=...,SMILES
11582,DB17383,CN1CCN(CC2=CC=C(NC(=O)C3=NNC=C3NC3=C4C=CNC4=NC...,SMILES
11583,DB17384,CC1=C2N=C(C3=CC=CC=C3Cl)C3=C(NC2=NN1)C=C(N=C3)...,SMILES
11584,DB17385,CC[C@@]1(OC(=O)C(C)ON=C2C3=C(C4=C2C=C(C=C4[N+]...,SMILES


In [134]:
targets_df_expl

Unnamed: 0,drugbank-id,drug-type,uniprotkb-id,uniprotac-id,gene-name,action
0,DB00006,small molecule,P05164,PERM_HUMAN,MPO,inhibitor
1,DB00008,biotech,P05177,CP1A2_HUMAN,CYP1A2,inhibitor
2,DB00011,biotech,P05177,CP1A2_HUMAN,CYP1A2,inhibitor
3,DB00013,biotech,P39900,MMP12_HUMAN,MMP12,substrate
4,DB00018,biotech,P05177,CP1A2_HUMAN,CYP1A2,inhibitor
...,...,...,...,...,...,...
41748,DB17087,small molecule,Q9Y5N1,HRH3_HUMAN,HRH3,antagonist
41749,DB17096,small molecule,Q8IAS0,PLM10_PLAF7,PMX,inhibitor
41750,DB17096,small molecule,Q8IAS0,PLM10_PLAF7,PMX,binder
41751,DB17096,small molecule,W7JWW5,PLM10_PLAFO,PMX,inhibitor


In [202]:
len(set(targets_df_expl['drugbank-id'].to_list()))

10318

In [207]:
from Bio import SeqIO

drug_seq = []
for seq_record in SeqIO.parse("../data/drugs/drugbank_5.1.10/drug sequences.fasta", "fasta"):
    drug_seq.append(
        {
        	'drugbank-id':seq_record.id.split('|')[1],	
            'structure': str(seq_record.seq),
            'type':'polypeptide'
        }
    )


drug_seq_df = pd.DataFrame(drug_seq)
drug_seq_df

Unnamed: 0,drugbank-id,structure,type
0,DB00001,LTYTDCTESGQNLCLCEGSNVCGQGNKCILGSDGEKNQCVTGEGTP...,polypeptide
1,DB00002,QVQLKQSGPGLVQPSQSLSITCTVSGFSLTNYGVHWVRQSPGKGLE...,polypeptide
2,DB00002,DILLTQSPVILSVSPGERVSFSCRASQSIGTNIHWYQQRTNGSPRL...,polypeptide
3,DB00003,LKIAAFNIQTFGETKMSNATLVSYIVQILSRYDIALVQEVRDSHLT...,polypeptide
4,DB00004,MGADDVVDSSKSFVMENFSSYHGTKPGYVDSIQKGIQKPKSGTQGN...,polypeptide
...,...,...,...
433,DB16701,MEHKEVVLLLLLFLKSGQGEPLDDYVNTQGASLFSVTKKQLGAGSI...,polypeptide
434,DB16755,QITLKESGPTLVKPTQTLTLTCTFSGFSLSISGVGVGWLRQPPGKA...,polypeptide
435,DB16755,QSALTQPASVSGSPGQSITISCTATSSDVGDYNYVSWYQQHPGKAP...,polypeptide
436,DB17012,MGKVKVGVNGFGRIGRLVTRAAFNSGKVDIVAINDPFIDLNYMVYM...,polypeptide


In [208]:
all_struct_df = pd.concat([smiles_df, drug_seq_df])
all_struct_df.to_excel('temp/all_struct.xlsx', index=False)

In [210]:
prot_seq = []
for seq_record in SeqIO.parse("../data/drugs/drugbank_5.1.10/target_sequences_protein.fasta", "fasta"):
    prot_seq.append(
        {
        	'uniprotkb-id':seq_record.id.split('|')[1],	
            'aa-seq': str(seq_record.seq),
        }
    )

for seq_record in SeqIO.parse("../data/drugs/drugbank_5.1.10/targets/carrier/protein.fasta", "fasta"):
    prot_seq.append(
        {
        	'uniprotkb-id':seq_record.id.split('|')[1],	
            'aa-seq': str(seq_record.seq),
        }
    )

for seq_record in SeqIO.parse("../data/drugs/drugbank_5.1.10/targets/enzyme/protein.fasta", "fasta"):
    prot_seq.append(
        {
        	'uniprotkb-id':seq_record.id.split('|')[1],	
            'aa-seq': str(seq_record.seq),
        }
    )

for seq_record in SeqIO.parse("../data/drugs/drugbank_5.1.10/targets/transporter/protein.fasta", "fasta"):
    prot_seq.append(
        {
        	'uniprotkb-id':seq_record.id.split('|')[1],	
            'aa-seq': str(seq_record.seq),
        }
    )

prot_seq_df = pd.DataFrame(prot_seq)
prot_seq_df

Unnamed: 0,uniprotkb-id,aa-seq
0,P45059,MVKFNSSRKSGKSKKTIRKLTAPETVKQNKPQKVFEKCFMRGRYML...
1,P19113,MMEPEEYRERGREMVDYICQYLSTVRERRVTPDVQPGYLRAQLPES...
2,Q9UI32,MRSMKALQKALSRAGSHCGRGGWGHPSRSPLLGGGVRHHLSEAAAQ...
3,P00488,MSETSRTAFGGRRAVPPNNSNAAEDDLPTVELQGVVPRGVNLQEFL...
4,P35228,MACPWKFLFKTKFHQYAMNGEKDINNNVEKAPCATSSPVTQDDLQY...
...,...,...
6282,O14520,MVQASGHRRSTRGSKMVSWSVIAKIQEILQRKMVREFLAEFMSTYV...
6283,O43315,MQPEGAEKGKSFKQRLVLKSSLAKETLSEFLGTFILIVLGCGCVAQ...
6284,Q15109,MAAGTAVGAWVLVLSLWGAVVGAQNITARIGEPLVLKCKGAPKKPP...
6285,Q9BZD2,MAVVSEDDFQHSSNSTYRTTSSSLRADQEALLEKLLDRPPPGLQRP...


In [211]:
df1 = pd.merge(left=targets_df_expl, right=all_struct_df, on='drugbank-id')

In [212]:
df2 = pd.merge(left=df1, right=prot_seq_df, on='uniprotkb-id')
df2

Unnamed: 0,drugbank-id,drug-type,uniprotkb-id,uniprotac-id,gene-name,action,structure,type,aa-seq
0,DB00001,biotech,P00734,THRB_HUMAN,F2,inhibitor,LTYTDCTESGQNLCLCEGSNVCGQGNKCILGSDGEKNQCVTGEGTP...,polypeptide,MAHVRGLQLPGCLALAALCSLVHSQHVFLAPQQARSLLQRVRRANT...
1,DB00001,biotech,P00734,THRB_HUMAN,F2,inhibitor,LTYTDCTESGQNLCLCEGSNVCGQGNKCILGSDGEKNQCVTGEGTP...,polypeptide,MAHVRGLQLPGCLALAALCSLVHSQHVFLAPQQARSLLQRVRRANT...
2,DB00006,small molecule,P00734,THRB_HUMAN,F2,inhibitor,CC[C@H](C)[C@H](NC(=O)[C@H](CCC(O)=O)NC(=O)[C@...,SMILES,MAHVRGLQLPGCLALAALCSLVHSQHVFLAPQQARSLLQRVRRANT...
3,DB00006,small molecule,P00734,THRB_HUMAN,F2,inhibitor,CC[C@H](C)[C@H](NC(=O)[C@H](CCC(O)=O)NC(=O)[C@...,SMILES,MAHVRGLQLPGCLALAALCSLVHSQHVFLAPQQARSLLQRVRRANT...
4,DB00025,biotech,P00734,THRB_HUMAN,F2,activator,ATRRYYLGAVELSWDYMQSDLGELPVDARFPPRVPKSFPFNTSVVY...,polypeptide,MAHVRGLQLPGCLALAALCSLVHSQHVFLAPQQARSLLQRVRRANT...
...,...,...,...,...,...,...,...,...,...
53282,DB01390,small molecule,Q9Y6M7,S4A7_HUMAN,SLC4A7,substrate,[Na+].OC([O-])=O,SMILES,MERFRLEKKLPGPDEEAVVDLGKTSSTVNTKFEKEELESHRAVYIG...
53283,DB01390,small molecule,Q2Y0W8,S4A8_HUMAN,SLC4A8,substrate,[Na+].OC([O-])=O,SMILES,MPAAGSNEPDGVLSYQRPDEEAVVDQGGTSTILNIHYEKEELEGHR...
53284,DB01390,small molecule,Q9BY07,S4A5_HUMAN,SLC4A5,substrate,[Na+].OC([O-])=O,SMILES,MKVKEEKAGVGKLDHTNHRRRFPDQKECPPIHIGLPVPTYPQRKTD...
53285,DB01390,small molecule,Q6U841,S4A10_HUMAN,SLC4A10,substrate,[Na+].OC([O-])=O,SMILES,MEIKDQGAQMEPLLPTRNDEEAVVDRGGTRSILKTHFEKEDLEGHR...


In [213]:
df2.drop_duplicates(inplace=True)
df2

Unnamed: 0,drugbank-id,drug-type,uniprotkb-id,uniprotac-id,gene-name,action,structure,type,aa-seq
0,DB00001,biotech,P00734,THRB_HUMAN,F2,inhibitor,LTYTDCTESGQNLCLCEGSNVCGQGNKCILGSDGEKNQCVTGEGTP...,polypeptide,MAHVRGLQLPGCLALAALCSLVHSQHVFLAPQQARSLLQRVRRANT...
2,DB00006,small molecule,P00734,THRB_HUMAN,F2,inhibitor,CC[C@H](C)[C@H](NC(=O)[C@H](CCC(O)=O)NC(=O)[C@...,SMILES,MAHVRGLQLPGCLALAALCSLVHSQHVFLAPQQARSLLQRVRRANT...
4,DB00025,biotech,P00734,THRB_HUMAN,F2,activator,ATRRYYLGAVELSWDYMQSDLGELPVDARFPPRVPKSFPFNTSVVY...,polypeptide,MAHVRGLQLPGCLALAALCSLVHSQHVFLAPQQARSLLQRVRRANT...
6,DB00170,small molecule,P00734,THRB_HUMAN,F2,activator,CC1=CC(=O)C2=CC=CC=C2C1=O,SMILES,MAHVRGLQLPGCLALAALCSLVHSQHVFLAPQQARSLLQRVRRANT...
8,DB00278,small molecule,P00734,THRB_HUMAN,F2,inhibitor,C[C@@H]1CCN([C@H](C1)C(O)=O)C(=O)[C@H](CCCNC(N...,SMILES,MAHVRGLQLPGCLALAALCSLVHSQHVFLAPQQARSLLQRVRRANT...
...,...,...,...,...,...,...,...,...,...
53282,DB01390,small molecule,Q9Y6M7,S4A7_HUMAN,SLC4A7,substrate,[Na+].OC([O-])=O,SMILES,MERFRLEKKLPGPDEEAVVDLGKTSSTVNTKFEKEELESHRAVYIG...
53283,DB01390,small molecule,Q2Y0W8,S4A8_HUMAN,SLC4A8,substrate,[Na+].OC([O-])=O,SMILES,MPAAGSNEPDGVLSYQRPDEEAVVDQGGTSTILNIHYEKEELEGHR...
53284,DB01390,small molecule,Q9BY07,S4A5_HUMAN,SLC4A5,substrate,[Na+].OC([O-])=O,SMILES,MKVKEEKAGVGKLDHTNHRRRFPDQKECPPIHIGLPVPTYPQRKTD...
53285,DB01390,small molecule,Q6U841,S4A10_HUMAN,SLC4A10,substrate,[Na+].OC([O-])=O,SMILES,MEIKDQGAQMEPLLPTRNDEEAVVDRGGTRSILKTHFEKEDLEGHR...


In [214]:
df2.to_excel("temp/drug-target-moa.xlsx", index=False)

# Drug Drug interactions

In [29]:
intr_dicts = []

for obj in tqdm(dat):
    try:
        k,v = obj['drugbank-id'][0].items()
        drug_id = v[1]
    except:
        drug_id = obj['drugbank-id']['#text']
    
    try:
        for interaction in obj['drug-interactions']['drug-interaction']:
            dst_id = interaction['drugbank-id']
            intr_dicts.append(
                  {
                    'source':drug_id,
                    'dst':dst_id,
                    'desc':interaction['description']
                  }
            )
    except: 
        pass
    

  0%|          | 0/15235 [00:00<?, ?it/s]

In [30]:
pd.DataFrame(intr_dicts).to_csv("temp/drug-interactions.tsv", sep="\t")