In [2]:
import os
import sys

In [3]:
from rdkit import Chem
from rdkit.Chem import AllChem

import numpy as np
import pandas as pd
import torch

In [4]:
# Project locations, all expressed relative to the current working directory.
BASE_DIR = ".."
MODEL_BASE_DIR = f"{BASE_DIR}/best_models"
DATA_DIR = f"{BASE_DIR}/nbdata"

# Create the model and data directories up front; exist_ok keeps this safe
# to run again when they already exist.
os.makedirs(MODEL_BASE_DIR, exist_ok=True)
os.makedirs(DATA_DIR, exist_ok=True)

# Add BASE_DIR to the module search path. The membership guard keeps the cell
# idempotent: re-running it no longer appends a duplicate entry each time.
if BASE_DIR not in sys.path:
    sys.path.append(BASE_DIR)

## Load DrugBank

In [5]:
def morgan_fingerprint(mol,radius=2,nbits=2048):
    """Return the Morgan bit-vector fingerprint of ``mol`` as a torch tensor.

    Args:
        mol: Molecule handed straight to RDKit (presumably an
            ``rdkit.Chem.Mol`` -- confirm against callers).
        radius: Forwarded as the second positional argument of
            ``AllChem.GetMorganFingerprintAsBitVect``.
        nbits: Forwarded as the third positional argument of
            ``AllChem.GetMorganFingerprintAsBitVect`` (presumably the
            length of the bit vector -- confirm).

    Returns:
        The tensor produced by ``torch.from_numpy`` from the NumPy array
        built out of the RDKit bit vector.
    """
    bit_vector = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nbits)
    bit_array = np.array(bit_vector)
    return torch.from_numpy(bit_array)

In [6]:
# Open the DrugBank open-structures SDF file with RDKit's SDMolSupplier.
drugbank_sdf_path = '/afs/csail.mit.edu/u/s/samsl/Work/databases/DrugBank/open_structures.sdf'
molecules = Chem.SDMolSupplier(drugbank_sdf_path)

In [7]:
# Read the DrugBank vocabulary CSV into a DataFrame.
drugbank_vocabulary_path = '/afs/csail.mit.edu/u/s/samsl/Work/databases/DrugBank/drugbank_vocabulary.csv'
vocabulary = pd.read_csv(drugbank_vocabulary_path)

In [70]:
# Index the molecules by their 'DRUGBANK_ID' property, skipping any entry
# the supplier yields as None.
moleculeDict = dict(
    (mol.GetProp('DRUGBANK_ID'), mol)
    for mol in molecules
    if mol is not None
)

[12:23:23] Explicit valence for atom # 13 Cl, 5, is greater than permitted
[12:23:23] ERROR: Could not sanitize molecule ending on line 129414
[12:23:23] ERROR: Explicit valence for atom # 13 Cl, 5, is greater than permitted
[12:23:24] Explicit valence for atom # 19 O, 3, is greater than permitted
[12:23:24] ERROR: Could not sanitize molecule ending on line 174651
[12:23:24] ERROR: Explicit valence for atom # 19 O, 3, is greater than permitted
[12:23:24] Explicit valence for atom # 1 N, 4, is greater than permitted
[12:23:24] ERROR: Could not sanitize molecule ending on line 225596
[12:23:24] ERROR: Explicit valence for atom # 1 N, 4, is greater than permitted
[12:23:24] Explicit valence for atom # 1 N, 4, is greater than permitted
[12:23:24] ERROR: Could not sanitize molecule ending on line 247936
[12:23:24] ERROR: Explicit valence for atom # 1 N, 4, is greater than permitted
[12:23:24] Explicit valence for atom # 12 N, 4, is greater than permitted
[12:23:24] ERROR: Could not sanitize

In [48]:
# Display the DrugBank vocabulary table.
vocabulary

Unnamed: 0,DrugBank ID,Accession Numbers,Common name,CAS,UNII,Synonyms,Standard InChI Key
0,DB00001,BIOD00024 | BTD00024,Lepirudin,138068-37-8,Y43GF64R34,Hirudin variant-1 | Lepirudin | Lepirudin reco...,
1,DB00002,BIOD00071 | BTD00071,Cetuximab,205923-56-4,PQX0D8J21J,Cetuximab | Cétuximab | Cetuximabum,
2,DB00003,BIOD00001 | BTD00001,Dornase alfa,143831-71-4,953A26OA1Y,Deoxyribonuclease (human clone 18-1 protein mo...,
3,DB00004,BIOD00084 | BTD00084,Denileukin diftitox,173146-27-5,25E79B5CTM,Denileukin | Denileukin diftitox | Interleukin...,
4,DB00005,BIOD00052 | BTD00052,Etanercept,185243-69-0,OP401G7OJC,Etanercept | etanercept-szzs | etanercept-ykro...,
...,...,...,...,...,...,...,...
14589,DB16742,,RP-67580,135911-02-3,49U9M41BGY,,
14590,DB16743,,Nolpitantium chloride,153050-21-6,22O6XI63E0,,
14591,DB16744,,CP-96345,132746-60-2,W22ILA2I52,,
14592,DB16745,,PXT 3003,1467047-91-1,,,WRUIDZKNUAHKTR-UHFFFAOYSA-N


## Load Surfaceome

In [8]:
# Read the tab-separated surfaceome/DrugBank target table into a DataFrame.
surfaceome_tsv_path = '/afs/csail.mit.edu/u/s/samsl/Work/databases/surfaceome/surfaceome_drugbank_surface.tsv'
target_db = pd.read_csv(surfaceome_tsv_path, sep='\t')

In [9]:
from Bio import SeqIO

# Parse every record of the STRING FASTA file into a list, then index the
# records by their name (a later record with the same name replaces an earlier one).
string_fasta_path = '/afs/csail.mit.edu/u/s/samsl/Work/databases/STRING/homo.sapiens/ALL_SEQUENCES.fasta'
records = list(SeqIO.parse(string_fasta_path, 'fasta'))
recordDict = dict((record.name, record) for record in records)

In [41]:
# Build one protein identifier per row of target_db: the '9606.' prefix
# followed by the first ';'-separated entry of the 'Ensembl protein' column.
first_ensembl_protein = target_db['Ensembl protein'].str.split(';',expand=True).iloc[:,0]
names = ('9606.' + first_ensembl_protein).values

# `x in names` on a NumPy array compares x against every element on each
# lookup. A set gives constant-time membership and selects the same records.
target_names = set(names)

# Keep only the FASTA records whose name is one of the target identifiers,
# then map each kept name to its sequence as a plain string.
seqRecords = [r for r in records if r.name in target_names]
seqDict = {r.name: str(r.seq) for r in seqRecords}

In [44]:
# Store the prefixed protein identifiers in `names` (built from the
# 'Ensembl protein' column) as the 'ENSP' column of target_db.
target_db['ENSP'] = names

In [45]:
# List the column names of the target table.
target_db.columns

Index(['UniProt accession', 'UniProt name', 'UniProt description',
       'UniProt gene', 'Surfaceome Label', 'Surfaceome Label Source',
       'Comment', 'length', 'TM domains', 'signalpeptide', 'topology',
       'topology source', 'MachineLearning trainingset',
       'MachineLearning score',
       'MachineLearning FPR class (1=1%, 2=5%, 3=15%)', 'Ensembl gene',
       'Ensembl protein', 'CD number', 'Membranome Almen main-class',
       'Membranome Almen sub-class', 'nxst motifs', 'noncyt. nxst count',
       'peps with accessible noncyt. nxst', 'noncyt. Trp count',
       'peps with accessible noncyt. Trp', 'noncyt. Tyr count',
       'peps with accessible noncyt. Tyr', 'glycomineN sites',
       'glycomineO sites', 'glycomineC sites', 'CSPA category',
       'CSPA peptide count', 'CSPA peptides', 'CSPA N115 sites', 'CSPA id',
       'UniProt subcellular', 'UniProt keywords', 'UniProt uniref',
       'COMPARTMENTS link', 'COMPARTMENTS benchmark pos',
       'COMPARTMENTS benchmar

In [47]:
# Preview each target's gene, prefixed protein ID, description, and
# approved DrugBank drug IDs.
target_db[['UniProt gene','ENSP','UniProt description','DrugBank approved drug IDs']]

Unnamed: 0,UniProt gene,ENSP,UniProt description,DrugBank approved drug IDs
0,GABRR3,9606.ENSP00000420790,Gamma-aminobutyric acid receptor subunit rho-3,DB00231;DB00546;DB00683;DB00690;DB00801;DB0082...
1,TLR4,9606.ENSP00000363089,Toll-like receptor 4,DB01183
2,GABRP,9606.ENSP00000265294,Gamma-aminobutyric acid receptor subunit pi,DB00231;DB00546;DB00683;DB00690;DB00801;DB0082...
3,GABRD,9606.ENSP00000367848,Gamma-aminobutyric acid receptor subunit delta,DB00231;DB00546;DB00683;DB00690;DB00801;DB0082...
4,NRP1,9606.ENSP00000265371,Neuropilin-1,DB00039;DB04895
...,...,...,...,...
403,HRH3,9606.ENSP00000342560,Histamine H3 receptor,DB00370;DB00667;DB06698
404,PTGDR2,9606.ENSP00000332812,Prostaglandin D2 receptor 2,DB00328;DB00605
405,SCN10A,9606.ENSP00000390600,Sodium channel protein type 10 subunit alpha,DB00281;DB00296;DB00297;DB00473;DB00527;DB0064...
406,SLC12A7,9606.ENSP00000264930,Solute carrier family 12 member 7,DB00761
