In [95]:
#delfos copy for pka prediction
import torch
from torch import nn
from mol2vec.features import mol2alt_sentence, mol2sentence, MolSentence, DfVec, sentences2vec
from gensim.models import word2vec
import pandas as pd
from rdkit.Chem import PandasTools
from rdkit import Chem
from numpy import genfromtxt
import numpy as np

In [2]:
#step 1: define solvent X and solute Y using mol2vec but don't add the substructures!!
#step 2: run RNN in both directions on each molecule, then concatenate forward;reverse to get H and G
#step 3: feed H and G into attention layer, generate attention alignment matrix, create contexts P and Q
#step 4: maxpool H;P and G;Q into 2D feature vectors
#step 5: create flattened input u;v and feed into linear layer

In [4]:
#step 1: mol2vec embedding

data = pd.read_csv('dwar_pka1.csv')
#data = data.to_numpy()
print(data['SMILES'])

0       CC[C@H](C)[C@H](NC(=O)[C@H](CCC(O)=O)NC(=O)[C@...
1       CC(C)C[C@H](NC(=O)[C@@H](COC(C)(C)C)NC(=O)[C@H...
2       CC(C)C[C@@H](NC(=O)CNC(=O)[C@@H](NC=O)C(C)C)C(...
3       NC(=O)CC[C@@H]1NC(=O)[C@H](CC2=CC=CC=C2)NC(=O)...
4       CC(C)C[C@H](NC(=O)[C@@H](CCCNC(N)=O)NC(=O)[C@H...
                              ...                        
4510               ClC1=CN(CCCCN2CCN(CC2)C2=NC=CC=N2)N=C1
4511    [H][C@](N)(CCC(O)=N[C@@]([H])(CS(=O)(=O)CCOP(=...
4512    C[N+](C)([O-])CCNC1=CC=C(NCC[N+](C)(C)[O-])C2=...
4513    [H][C@@]12CCCN1C(=O)[C@H](CC(C)C)NC(=O)[C@@H](...
4514    C[C@@H]1CC2=CC3=C(OCO3)C=C2C(=NN1C(C)=O)C1=CC=...
Name: SMILES, Length: 4515, dtype: object


In [64]:
#mol2vec model
mol2vec_model = word2vec.Word2Vec.load('models/model_300dim.pkl')

#create mol type
data['mol'] = data.apply(lambda x: Chem.MolFromSmiles(x['SMILES']), axis=1)

#remove invalid smiles
data.replace("", float("NaN"), inplace=True)
data.dropna(subset = ['mol'], inplace=True)
print(data)

#create sentences
data['sentence'] = data.apply(lambda x: mol2alt_sentence(x['mol'],1), axis=1)

In [60]:
#modified sentence2vec function to return lists of word vectors
def sentences2vecs(sentences, model, unseen=None):
    """Generate vectors for each word in a sentence sentence (list) in a list of sentences.
    
    Parameters
    ----------
    sentences : list, array
        List with sentences
    model : word2vec.Word2Vec
        Gensim word2vec model
    unseen : None, str
        Keyword for unseen words. If None, those words are skipped.
        https://stats.stackexchange.com/questions/163005/how-to-set-the-dictionary-for-text-analysis-using-neural-networks/163032#163032
    Returns
    -------
    list of arrays, each sentence -> array of word vectors
    """
    keys = set(model.wv.key_to_index)
    bigveclist = []
    if unseen:
        unseen_vec = model.wv.get_vector(unseen)

    for sentence in sentences:
        veclist = []
        if unseen:
            veclist.append([model.wv.get_vector(y) if y in set(sentence) & keys
                       else unseen_vec for y in sentence])
        else:
            veclist.append([model.wv.get_vector(y) for y in sentence 
                            if y in set(sentence) & keys])
        vecarray = np.concatenate(veclist, axis=1)
        vectensor = torch.Tensor(vecarray)
        bigveclist.append(vectensor)
    return bigveclist

In [84]:
sentences = [x for x in data['sentence']]
targets = torch.Tensor(data['JCHEM_PKA'])
X = sentences2vecs(sentences, model, unseen='UNK')

In [89]:
solvent = {'SMILES': ['CC#N','C1CCOC1','CS(C)=O','C1=CC=CC=C1']}
solvent = pd.DataFrame(solvent)
solvent['mol'] = solvent.apply(lambda x: Chem.MolFromSmiles(x['SMILES']), axis=1)
solvent['sentence'] = solvent.apply(lambda x: mol2alt_sentence(x['mol'],1), axis=1)
solvent = [x for x in solvent['sentence']]
Y = sentences2vecs(solvent, model, unseen='UNK')

In [92]:
print(Y[2].size())

torch.Size([8, 300])


In [12]:
#from torch.nn.utils.rnn import pack_sequence
#X = pack_sequence(X,enforce_sorted = False)

In [133]:
def att(g,h):
    a = torch.exp(torch.dot(g,h))
    return a

def alpha(G,H):
    n = H.shape[0]
    m = G.shape[0]
    alpha = [[att(H[i],G[j])/sum([att(H[i],G[k]) for k in range(m)]) for j in range(m)] for i in range(n)]
    #change this
    return torch.Tensor(alpha)

def batch_att(G,H):
    P = alpha(G,H)@G
    inG = torch.cat((G,Q),1)
    return inG

In [134]:
G = torch.rand(5,2)
H = torch.rand(3,2)
alphaG = alpha(G,H)
alphaH = alpha(H,G)
print(alphaG)

tensor([[0.1881, 0.2322, 0.2151, 0.0970, 0.2676],
        [0.1669, 0.2218, 0.2314, 0.1666, 0.2133],
        [0.2078, 0.2056, 0.1989, 0.1712, 0.2165]])


In [181]:
# model definition
epochs = 10
n_features = 300
n_hidden = 100

class maxpool(nn.Module):
    def __init__(self, L):
        super(maxpool, self).__init__()
        self.maxpool = nn.MaxPool2d((L,2), stride=2)
    def forward(self, X):
        return self.maxpool(X)

class dnet(nn.Module):
    def __init__(self, n_features, D, FF):
        super(dnet, self).__init__()
    
        self.biLSTM_X = nn.LSTM(n_features, D, bidirectional=True)
        self.biLSTM_Y = nn.LSTM(n_features, D, bidirectional=True)
        
        self.FF = nn.Linear(4*D, FF)
        self.out = nn.Linear(FF, 1)
    
    def forward(self,X,Y):
        N = X.data.shape[0]
        M = Y.data.shape[0]
        
        #turn input list of vec into correct shape
        X = X.view(X.data.shape[0],1,X.data.shape[1]) #N rows
        Y = Y.view(Y.data.shape[0],1,Y.data.shape[1]) #M rows
        
        #biLSTM to get hidden states
        H, hcX = self.biLSTM_X(X, None) #Nx1x2D matrix
        G, hcY = self.biLSTM_Y(Y, None) #Mx1x2D matrix
        
        G = G.view(G.data.shape[0],G.data.shape[2]) #N rows
        H = H.view(H.data.shape[0],H.data.shape[2]) #M rows
        #solvent (P) and solute (Q) context
        alphaG = alpha(G, H)   #NxM tensor
        P = alphaG@G           #Nx2D tensor
        alphaH = alpha(H, G)   #MxN tensor
        Q = alphaH@H           #Mx2D tensor
        inG = torch.cat((G,Q),1) #Nx4D
        inH = torch.cat((H,P),1) #Mx4D
        
        #maxpool concatenated tensors
        maxpool_X = maxpool(N)
        maxpool_Y = maxpool(M)
        u = maxpool_X(inH.view(1,inH.data.shape[0],inH.data.shape[1]))  #1x1x2D
        v = maxpool_Y(inG.view(1,inG.data.shape[0],inG.data.shape[1]))  #1x1x2D
        
        #feed forward neural network
        NN = torch.cat((u,v),2)
        NN = self.FF(NN)
        NN = nn.functional.relu(NN)
        output = self.out(NN)
        return output

dmodel = dnet(300,150,2000)
criterion = nn.MSELoss()
optimizer = torch.optim.SGD(dmodel.parameters(), lr=0.0002, momentum=0.9, nesterov=True)

In [173]:
dmodel(X[2],Y[0])

In [178]:
epochs = 10
n_features = 300
n_hidden = 100

for t in range(epochs):
    for b in range(len(X)):
        solute = X[b]
        solvent = Y[0]
        target = targets[b]  

        output = dmodel(solute,solvent) 
        print(output)
        loss = criterion(output, target)  
        
        loss.backward()
        optimizer.step()        
        optimizer.zero_grad() 
    print('step : ' , t , 'loss : ' , loss.item())

tensor([[[-0.0335]]], grad_fn=<AddBackward0>)


  return F.mse_loss(input, target, reduction=self.reduction)


tensor([[[-0.0174]]], grad_fn=<AddBackward0>)
tensor([[[0.0331]]], grad_fn=<AddBackward0>)
tensor([[[0.1142]]], grad_fn=<AddBackward0>)
tensor([[[0.2067]]], grad_fn=<AddBackward0>)
tensor([[[0.3410]]], grad_fn=<AddBackward0>)
tensor([[[0.4313]]], grad_fn=<AddBackward0>)
tensor([[[0.5649]]], grad_fn=<AddBackward0>)
tensor([[[0.7295]]], grad_fn=<AddBackward0>)
tensor([[[0.8763]]], grad_fn=<AddBackward0>)
tensor([[[1.0472]]], grad_fn=<AddBackward0>)
tensor([[[1.3815]]], grad_fn=<AddBackward0>)
tensor([[[1.4849]]], grad_fn=<AddBackward0>)
tensor([[[1.5701]]], grad_fn=<AddBackward0>)
tensor([[[1.9725]]], grad_fn=<AddBackward0>)
tensor([[[1.9300]]], grad_fn=<AddBackward0>)
tensor([[[2.3562]]], grad_fn=<AddBackward0>)
tensor([[[2.9146]]], grad_fn=<AddBackward0>)
tensor([[[3.0545]]], grad_fn=<AddBackward0>)
tensor([[[3.6253]]], grad_fn=<AddBackward0>)
tensor([[[4.1609]]], grad_fn=<AddBackward0>)
tensor([[[4.7093]]], grad_fn=<AddBackward0>)
tensor([[[5.1099]]], grad_fn=<AddBackward0>)
tensor([[

In [188]:
cProfile.run("dmodel(X[2],Y[0])", sort = "cumtime")

         1277250 function calls (1277242 primitive calls) in 8.788 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000   11.659   11.659 {built-in method builtins.exec}
        1    0.009    0.009   11.659   11.659 <string>:1(<module>)
      9/1    0.000    0.000   11.650   11.650 module.py:715(_call_impl)
        1    1.826    1.826   11.650   11.650 <ipython-input-181-258c63f1d29d>:23(forward)
        2    0.002    0.001    9.758    4.879 <ipython-input-133-02d5852e0211>:5(alpha)
      268    0.212    0.001    9.755    0.036 <ipython-input-133-02d5852e0211>:8(<listcomp>)
   424440    0.525    0.000    5.221    0.000 <ipython-input-133-02d5852e0211>:1(att)
   424440    2.643    0.000    2.643    0.000 {built-in method dot}
   424440    2.053    0.000    2.053    0.000 {built-in method exp}
     3144    1.453    0.000    1.453    0.000 {built-in method builtins.sum}
        2    0.000    0.000    0

In [184]:
import cProfile

In [179]:
# batch model definition
epochs = 10
n_features = 300
n_hidden = 100
batch_size = 32

class maxpool(nn.Module):
    def __init__(self, L):
        super(maxpool, self).__init__()
        self.maxpool = nn.MaxPool2d((L,2), stride=2)
    def forward(self, X):
        return self.maxpool(X)
    
def batch_att(G,H):
    P = alpha(G,H)@G
    inG = torch.cat((G,Q),1)
    return inG

class dnet(nn.Module):
    def __init__(self, n_features, D, FF):
        super(dnet, self).__init__()
    
        self.biLSTM_X = nn.LSTM(n_features, D, bidirectional=True)
        self.biLSTM_Y = nn.LSTM(n_features, D, bidirectional=True)
        
        self.FF = nn.Linear(4*D, FF)
        self.out = nn.Linear(FF, 1)
    
    def forward(self,X,Y):
        N = X.data.shape[0]
        M = Y.data.shape[0]
        
        #turn input list of vec into correct shape
        X = X.view(X.data.shape[0],1,X.data.shape[1]) #N rows
        Y = Y.view(Y.data.shape[0],1,Y.data.shape[1]) #M rows
        
        #biLSTM to get hidden states
        H, hcX = self.biLSTM_X(X, None) #NxBx2D matrix
        G, hcY = self.biLSTM_Y(Y, None) #MxBx2D matrix
        
        inG = torch.Tensor([batch_att(G[:,b,:],H[:,b,:]) for b in range(batch_size)])
        inH = torch.Tensor([batch_att(H[:,b,:],G[:,b,:]) for b in range(batch_size)])
        
        #maxpool concatenated tensors
        maxpool_X = maxpool(N)
        maxpool_Y = maxpool(M)
        u = maxpool_X(inH)  #1x1x2D
        v = maxpool_Y(inG)  #1x1x2D
        
        #feed forward neural network
        NN = torch.cat((u,v),2)
        NN = self.FF(NN)
        NN = nn.functional.relu(NN)
        output = self.out(NN)
        return output

dmodel = dnet(300,150,2000)
criterion = nn.MSELoss()
optimizer = torch.optim.SGD(dmodel.parameters(), lr=0.0002, momentum=0.9, nesterov=True)

In [None]:
for t in range(epochs):
    for b in range(len(X)):
        solute = X[b]
        solvent = Y[0]
        target = targets[b]  

        output = dmodel(solute,solvent) 
        print(output)
        loss = criterion(output, target)  
        
        loss.backward()
        optimizer.step()        
        optimizer.zero_grad() 
    print('step : ' , t , 'loss : ' , loss.item())