In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [2]:
import pandas as pd

In [59]:
# Input configuration: location of the CSV file and the names of the
# columns that the following cells read from it.
file_path = "../dataset/BindingDB/test.csv"  # path of the CSV to load
smiles_col = "SMILES"                        # name of the SMILES column
seq_col = "Target Sequence"                  # name of the sequence column
label_col = "Label"                          # name of the label column

In [76]:
import numpy as np
from numpy.random import choice

In [68]:
# Load the comma-separated table at `file_path` (',' is read_csv's default separator).
df = pd.read_csv(file_path)

In [93]:
# Split the table by label value: rows whose label equals 1 and rows whose
# label equals 0. Rows with any other label value end up in neither frame.
pos_df = df.loc[df[label_col].eq(1)]
neg_df = df.loc[df[label_col].eq(0)]

In [101]:
# Build (Anchor, Positive, Negative) triplets: every label-1 row contributes
# its sequence as the anchor and its SMILES as the positive, paired with
# `n_neg_per` SMILES drawn uniformly (with replacement) from the label-0 rows.
# NOTE(review): negatives are drawn from *all* label-0 rows, not only those
# that share the anchor's sequence — confirm this is intended.
n_neg_per = 10
contrastive_seed = 0  # fixed seed so Restart & Run All yields the same triplets

# Extract the negative pool once; it does not change between draws.
neg_pool = neg_df[smiles_col].to_numpy()
if len(pos_df) > 0 and len(neg_pool) == 0:
    raise ValueError(
        f"no rows with {label_col} == 0 in {file_path}; cannot sample negatives"
    )

rng = np.random.default_rng(contrastive_seed)
contrastive = pd.DataFrame(
    [
        (seq, pos_smiles, neg_smiles)
        for seq, pos_smiles in zip(pos_df[seq_col], pos_df[smiles_col])
        for neg_smiles in rng.choice(neg_pool, size=n_neg_per)
    ],
    columns=['Anchor', 'Positive', 'Negative'],
)

In [103]:
# Preview the first rows of the triplet table built in the previous cell.
contrastive.head()

Unnamed: 0,Anchor,Positive,Negative
0,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...,NS(=O)(=O)c1ccc(cc1)C(=O)NCCOCCOCCN(CC([O-])=O...,Cc1[nH]c(\C=C2/C(=O)Nc3ccc(F)cc23)c(C)c1C(=O)N...
1,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...,NS(=O)(=O)c1ccc(cc1)C(=O)NCCOCCOCCN(CC([O-])=O...,CCN(CCO)CCCOc1ccc2c(Nc3cc(CC(=O)Nc4cccc(F)c4)n...
2,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...,NS(=O)(=O)c1ccc(cc1)C(=O)NCCOCCOCCN(CC([O-])=O...,CCn1cc(cn1)-c1cnc2ccc(cc2n1)N1CCN(C)Cc2c(OC)cc...
3,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...,NS(=O)(=O)c1ccc(cc1)C(=O)NCCOCCOCCN(CC([O-])=O...,CC(C)(C)c1cnc(CSc2cnc(NC(=O)C3CCNCC3)s2)o1
4,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...,NS(=O)(=O)c1ccc(cc1)C(=O)NCCOCCOCCN(CC([O-])=O...,NC(=O)c1ccc(OCCCc2cnc[nH]2)cc1


In [34]:
class ContrastiveDataset(Dataset):
    """Map-style dataset yielding featurized (anchor, positive, negative) triplets.

    Parameters
    ----------
    contrastive_df : pandas.DataFrame
        Table with ``Anchor``, ``Positive`` and ``Negative`` columns. Its index
        may be arbitrary (e.g. left over from filtering); rows are always
        addressed by position.
    mfeats : callable
        Applied to the ``Positive`` and ``Negative`` values of a row.
    pfeats : callable
        Applied to the ``Anchor`` value of a row.
    """

    def __init__(self, contrastive_df, mfeats, pfeats):
        self.mfeats = mfeats
        self.pfeats = pfeats
        self.contrastive_df = contrastive_df

        self.anchors = contrastive_df["Anchor"]
        self.positives = contrastive_df["Positive"]
        self.negatives = contrastive_df["Negative"]

    def __len__(self):
        """Return the number of triplets (rows of ``contrastive_df``)."""
        return len(self.contrastive_df)

    @property
    def shape(self):
        """Return ``(mfeats._size, pfeats._size)``.

        NOTE(review): relies on both featurizers exposing a ``_size``
        attribute — presumably their output dimensionality; confirm.
        """
        return self.mfeats._size, self.pfeats._size

    def __getitem__(self, i):
        """Return ``(pfeats(anchor), mfeats(positive), mfeats(negative))`` for row ``i``.

        ``i`` is a position in ``[-len(self), len(self))``; anything outside
        that range raises ``IndexError``.
        """
        # Positional lookup: plain ``Series[i]`` is label-based and breaks (or
        # silently picks another row) when the frame's index is not 0..n-1.
        anchorEmb = self.pfeats(self.anchors.iloc[i])
        positiveEmb = self.mfeats(self.positives.iloc[i])
        negativeEmb = self.mfeats(self.negatives.iloc[i])

        return anchorEmb, positiveEmb, negativeEmb

In [35]:
# Construct a MoleculeDecoyDataset for 'casp3' from the index labels of the
# Label == 1 rows and the Label == 0 rows of `casp3`, plus two featurizers.
# NOTE(review): MoleculeDecoyDataset, casp3, mfeat and pfeat are not defined in
# the cells visible here — confirm they are defined elsewhere, otherwise this
# cell fails on a fresh kernel.
mdd = MoleculeDecoyDataset('casp3', casp3[casp3.Label == 1].index, casp3[casp3.Label == 0].index, mfeat, pfeat)

In [45]:
# Display three samples of `mdd` by integer index.
# NOTE(review): 213/214 look chosen to straddle a boundary between consecutive
# positives — the magic numbers are unexplained; confirm.
mdd[0], mdd[213], mdd[214]

(('casp3', 'CHEMBL193674', 'ZINC10009916'),
 ('casp3', 'CHEMBL193674', 'ZINC14732299'),
 ('casp3', 'CHEMBL592983', 'ZINC10009916'))

In [38]:
# Display the `casp3` frame.
# NOTE(review): `casp3` is not defined in the cells visible here — confirm it is loaded elsewhere.
casp3

Unnamed: 0_level_0,Target_Seq,Molecule_SMILES,Label
Molecule_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
CHEMBL193674,CFREENANFNKIFLPTIYSIIFLTGIVGNGLVILVMGYQKKLRSMT...,CC(=O)OCCN1C(=O)c2c(-c3ccccc3)nc3ccc(S(=O)(=O)...,1
CHEMBL592983,CFREENANFNKIFLPTIYSIIFLTGIVGNGLVILVMGYQKKLRSMT...,CC1(C)C[NH+]=C2C(=O)c3cc(S(=O)(=O)N4CCCC4COc4c...,1
CHEMBL147642,CFREENANFNKIFLPTIYSIIFLTGIVGNGLVILVMGYQKKLRSMT...,O=CC(CC(=O)[O-])NC(=O)c1ccc(CNS(=O)(=O)c2ccc(O...,1
CHEMBL183437,CFREENANFNKIFLPTIYSIIFLTGIVGNGLVILVMGYQKKLRSMT...,Cc1ccccc1N1C(=O)c2c(C)nc3ccc(S(=O)(=O)N4CCOCC4...,1
CHEMBL100927,CFREENANFNKIFLPTIYSIIFLTGIVGNGLVILVMGYQKKLRSMT...,CC(C)CC(NC(=O)COc1ccnc2ccccc12)C(=O)NC1CC(=O)OC1O,1
...,...,...,...
ZINC66829646,CFREENANFNKIFLPTIYSIIFLTGIVGNGLVILVMGYQKKLRSMT...,CC1CCC(N2CC(C(=O)N3CCC(N(C)S(C)(=O)=O)CC3)CC2=...,0
ZINC66896939,CFREENANFNKIFLPTIYSIIFLTGIVGNGLVILVMGYQKKLRSMT...,CC1CN(C(C(=O)N(C)CC(=O)N2CCOCC2)C(C)C)CC(C)O1,0
ZINC66935942,CFREENANFNKIFLPTIYSIIFLTGIVGNGLVILVMGYQKKLRSMT...,O=C(C1CCCCN1S(=O)(=O)CC1CCCCO1)N1CCOCC1,0
ZINC67109823,CFREENANFNKIFLPTIYSIIFLTGIVGNGLVILVMGYQKKLRSMT...,CCCOC1CCCN(C(=O)C2=CC=CN3CCS(=O)(=O)N=C23)C1,0
