## Install

In [None]:
! pip install rdkit-pypi
! pip install deepchem
! pip install dgl 
! pip install ogb

In [None]:
## To resolve the torch import error (if it arises), uncomment the line below.
# ! pip install -U numpy

## Import

In [None]:
import numpy as np
import pandas
import time
import networkx as nx
import itertools
import scipy.sparse as sp
import random 

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [None]:
from rdkit.Chem import MACCSkeys
from rdkit import Chem

In [None]:
import dgl
from dgl.nn import SAGEConv,GraphConv
import dgl.function as fn

In [None]:
from sklearn.metrics import roc_auc_score
from ogb.linkproppred import Evaluator

In [None]:
import copy

In [None]:
from ogb.utils.features import (allowable_features, atom_to_feature_vector,
 bond_to_feature_vector, atom_feature_vector_to_dict, bond_feature_vector_to_dict)


## Read file

In [None]:
!wget https://raw.githubusercontent.com/r-b-1-5/Public-files/main/drugIDandSMILES.csv


In [None]:
# Load the drug table downloaded in the previous cell.
csvFile = pandas.read_csv('./drugIDandSMILES.csv')

# Fail early, with a readable message, if the columns read further down are missing.
missing_columns = {'Drug ID', 'SMILES'} - set(csvFile.columns)
assert not missing_columns, 'missing expected columns: {}'.format(sorted(missing_columns))

# Show the row count and a small preview instead of dumping the whole frame.
print(len(csvFile))
csvFile.head()

In [None]:
# Pull out the two columns of the table: the drug identifiers and their SMILES strings.
drug_id, smiles = csvFile['Drug ID'], csvFile['SMILES']

## Globals


In [None]:
def set_seed(seed=0):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch;
    when CUDA is available the current CUDA device is seeded as well.

    Args:
        seed: integer seed shared by all generators (default 0).
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed(seed)

## Form graph from molecule

In [None]:
def smiles2graph(smiles_string):
    """Convert a SMILES string into a bidirected DGL graph with atom features.

    Atoms become nodes and bonds become edges (read from RDKit's adjacency
    matrix). Every node carries its OGB atom feature vector in
    ``ndata['node_feat']``. Bond (edge) features are not attached for now.

    Args:
        smiles_string: SMILES representation of a molecule.

    Returns:
        A ``dgl.DGLGraph`` with one node per atom and a float tensor
        ``ndata['node_feat']`` holding one feature row per atom.

    Raises:
        ValueError: if RDKit cannot parse ``smiles_string``. (The previous
            fallback, ``dgl.graph()`` without arguments, raised as well because
            ``data`` is a required argument, so callers that catch exceptions
            keep working.)
    """
    mol = Chem.MolFromSmiles(smiles_string)
    if mol is None:
        raise ValueError('could not parse SMILES string: {!r}'.format(smiles_string))

    num_atoms = mol.GetNumAtoms()

    # The adjacency matrix is symmetric, so its non-zero entries already list
    # every bond in both directions.
    src, dst = np.nonzero(Chem.GetAdjacencyMatrix(mol))

    # Pass num_nodes explicitly: DGL otherwise infers the node count from the
    # largest node id in the edge list, which drops atoms without bonds
    # (single-atom molecules, counter-ions) and then no longer matches the
    # number of feature rows assigned below.
    g = dgl.graph(
        (torch.as_tensor(src, dtype=torch.int64),
         torch.as_tensor(dst, dtype=torch.int64)),
        num_nodes=num_atoms,
    )
    bg = dgl.to_bidirected(g)

    # One OGB feature vector per atom, in RDKit atom-index order (= node id).
    atom_features_list = [atom_to_feature_vector(atom) for atom in mol.GetAtoms()]
    x = np.array(atom_features_list, dtype=np.int64)
    bg.ndata['node_feat'] = torch.from_numpy(x).float()

    return bg

In [None]:
# Sanity check: convert the first two molecules and show the resulting graphs.
mol1 = smiles2graph(smiles[0])
mol2 = smiles2graph(smiles[1])
for sample_graph in (mol1, mol2):
    print(sample_graph)

In [None]:
# Scan the whole dataset to find the largest molecule (in atoms); the model
# below sizes its dense layers from `maxnodes`. SMILES strings that cannot be
# converted are printed and counted in `exception_count`.
maxnodes = 0
exception_count = 0
for smiles_string in smiles:
    try:
        mol_ = smiles2graph(smiles_string)
    except Exception:
        exception_count += 1
        print(smiles_string)
        # Skip the size update: there is no graph for this entry, and falling
        # through would reuse the previous iteration's graph (or fail with a
        # NameError if the very first entry is the bad one).
        continue
    maxnodes = max(maxnodes, mol_.num_nodes())

In [None]:
# Report the largest node count found and how many SMILES strings failed to convert.
print(maxnodes, exception_count, sep='\n')

## Autoencoder Model and Training loop

In [None]:
# c


In [None]:
class EncoderLayer(nn.Module):
    """Graph encoder: two SAGEConv layers followed by a dense projection.

    The node embeddings produced by the convolutions are zero-padded to a fixed
    number of rows, flattened, and projected to a fixed-size latent vector.

    Args:
        in_feats: width of the input node features.
        out_feats: width of the node embeddings.
        max_nodes: number of rows the embeddings are padded to. Defaults to the
            notebook-level ``maxnodes`` at construction time.
        latent_dim: size of the latent vector (default 100).
    """

    def __init__(self, in_feats, out_feats, max_nodes=None, latent_dim=100):
        super().__init__()
        # Capture the padding size once so that the dense layer and the padding
        # in forward() can never disagree, even if the global changes later.
        self.max_nodes = maxnodes if max_nodes is None else max_nodes
        self.layersList = nn.ModuleList([
            SAGEConv(in_feats, out_feats, 'gcn'),
            SAGEConv(out_feats, out_feats, 'gcn'),
        ])
        self.fc = nn.Linear(self.max_nodes * out_feats, latent_dim)
        self.relu = nn.ReLU()

    def forward(self, g, feats):
        """Encode a graph.

        Args:
            g: DGL graph to run the convolutions on.
            feats: node feature tensor with one row per node of ``g``.

        Returns:
            A tuple ``(node_embeddings, latent)``: the un-padded per-node
            embeddings and the latent vector produced by the dense layer.

        Raises:
            ValueError: if the graph has more nodes than ``max_nodes``.
        """
        hidden = feats
        for layer in self.layersList:
            hidden = self.relu(layer(g, hidden))

        num_nodes, width = hidden.shape
        if num_nodes > self.max_nodes:
            raise ValueError(
                'graph has {} nodes but the encoder was built for at most {}'.format(
                    num_nodes, self.max_nodes))

        padded = hidden
        if num_nodes < self.max_nodes:
            # Pad with the actual embedding width (not a hard-coded 3) and on
            # the same device/dtype as the embeddings.
            padding = hidden.new_zeros(self.max_nodes - num_nodes, width)
            padded = torch.cat([hidden, padding], dim=0)

        latent = self.fc(torch.flatten(padded))
        return hidden, latent
    
class Attention(nn.Module):
    """Single-head scaled dot-product self-attention.

    The input is projected to queries, keys and values by three linear layers
    and the result is ``softmax(Q K^T / sqrt(d_k)) V``, where ``d_k`` is the
    width of a key vector (``out_feat``).

    Args:
        in_feat: width of each input row.
        out_feat: width of the query/key/value projections.
    """

    def __init__(self, in_feat,out_feat):
        super().__init__()
        self.Q = nn.Linear(in_feat,out_feat) # Query
        self.K = nn.Linear(in_feat,out_feat) # Key
        self.V = nn.Linear(in_feat,out_feat) # Value
        # Normalise over the key axis (the last axis of the score matrix).
        self.softmax = nn.Softmax(dim=-1)

    def forward(self,x):
        """Apply self-attention.

        Args:
            x: tensor of shape ``(..., n, in_feat)``; leading batch axes are
                optional.

        Returns:
            Tensor of shape ``(..., n, out_feat)``.
        """
        Q = self.Q(x)
        K = self.K(x)
        V = self.V(x)
        # Scale by the key *width*. The number of rows (shape[0]) is the
        # sequence length, not the dimension of a key vector.
        d_k = K.shape[-1]
        scores = (Q @ K.transpose(-2, -1)) / d_k ** 0.5
        prob = self.softmax(scores)
        return prob @ V

class DecoderLayer(nn.Module):
    """Graph decoder: latent vector -> attention -> two SAGEConv layers.

    The latent vector is expanded by a dense layer into a
    ``(max_nodes, in_feats)`` matrix, passed through self-attention, cropped to
    the number of nodes of the current graph, added to the incoming node
    embeddings and finally run through two SAGEConv layers.

    Args:
        in_feats: width of the incoming node embeddings.
        out_feats: width of the reconstructed node features.
        max_nodes: number of rows the latent vector is expanded to. Defaults to
            the notebook-level ``maxnodes`` at construction time.
        latent_dim: size of the latent vector (default 100).
    """

    def __init__(self, in_feats, out_feats, max_nodes=None, latent_dim=100):
        super().__init__()
        # Capture both sizes once so that forward() reshapes with exactly the
        # dimensions the dense layer was built for.
        self.max_nodes = maxnodes if max_nodes is None else max_nodes
        self.in_feats = in_feats
        self.layersList = nn.ModuleList([
            SAGEConv(in_feats, out_feats, 'gcn'),
            SAGEConv(out_feats, out_feats, 'gcn'),
        ])
        self.fc = nn.Linear(latent_dim, self.max_nodes * in_feats)
        self.attention = Attention(self.max_nodes, self.max_nodes)
        self.relu = nn.ReLU()

    def forward(self, g, feats, encoded):
        """Decode node features.

        Args:
            g: DGL graph to run the convolutions on.
            feats: node embeddings with one row per node of ``g``.
            encoded: latent vector of the graph.

        Returns:
            Tensor with one row of reconstructed features per node.
        """
        # Use the stored embedding width instead of a hard-coded 3.
        expanded = self.fc(encoded).reshape(self.max_nodes, self.in_feats)
        # Attention is applied across the node axis, hence the transposes.
        attended = self.attention(expanded.t()).t()
        # Out-of-place add: avoids mutating a slice view of the attention output.
        hidden = attended[:feats.shape[0]] + feats
        for layer in self.layersList:
            hidden = self.relu(layer(g, hidden))
        return hidden

class AutoEncoder(nn.Module):
    """Graph autoencoder: encoder, batch norm on the node embeddings, decoder.

    Args:
        in_feats: width of the input node features (also the output width).
        out_feats: width of the intermediate node embeddings.
    """

    def __init__(self, in_feats, out_feats):
        super().__init__()
        self.encoderLayer = EncoderLayer(in_feats, out_feats)
        # The decoder mirrors the encoder: embedding width in, feature width out.
        self.decoderLayer = DecoderLayer(out_feats, in_feats)
        self.batchNormLayer = nn.BatchNorm1d(out_feats)

    def forward(self, g, inputs):
        """Encode ``inputs`` on graph ``g`` and return the decoder's output."""
        node_embeddings, encoded = self.encoderLayer(g, inputs)
        normalized = self.batchNormLayer(node_embeddings)
        return self.decoderLayer(g, normalized, encoded)

In [None]:
# Train the autoencoder on a single molecule (the first one in the table).
set_seed(0)
g = smiles2graph(smiles[0])
g = dgl.add_self_loop(g)

features = g.ndata['node_feat']
model = AutoEncoder(features.shape[1], 3)

optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# KLDivLoss expects log-probabilities as input and probabilities as target;
# 'batchmean' is the reduction that matches the mathematical KL definition.
loss_fn1 = nn.KLDivLoss(reduction='batchmean')
loss_fn2 = nn.MSELoss()

# The targets never change, so build them once outside the loop. The raw
# features are turned into a per-node distribution for the KL term.
# NOTE(review): softmax is one way to normalise the targets - confirm it is the
# intended one.
target_dist = F.softmax(features, dim=1)

all_logits = []  # not filled in this cell
model.train()
for e in range(1000):
    pred = model(g, features)
    # KL compares distributions, MSE compares the raw reconstruction. Taking
    # the MSE of log-probabilities (always <= 0) against the non-negative
    # features could never reach zero.
    log_pred = F.log_softmax(pred, dim=1)
    loss = loss_fn1(log_pred, target_dist) + loss_fn2(pred, features)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if e % 50 == 0:
        # .item() prints a plain float instead of a tensor with its grad_fn.
        print('In epoch {}, loss: {}'.format(e, loss.item()))

In [None]:
#c


In [None]:
#cc