## Install

In [None]:
! pip install rdkit-pypi
! pip install deepchem
! pip install dgl 
! pip install ogb

In [None]:
## to resolve the torch import error
# ! pip install -U numpy

## Import

In [None]:
import numpy as np
import pandas
import time
import networkx as nx
import itertools
import scipy.sparse as sp
import random 

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [None]:
from rdkit.Chem import MACCSkeys
from rdkit import Chem

In [None]:
import dgl
from dgl.nn import SAGEConv,GraphConv
import dgl.function as fn

In [None]:
from sklearn.metrics import roc_auc_score
from ogb.linkproppred import Evaluator

In [None]:
import copy

In [None]:
from ogb.utils.features import (allowable_features, atom_to_feature_vector,
 bond_to_feature_vector, atom_feature_vector_to_dict, bond_feature_vector_to_dict)


## Read file

In [None]:
# Load the drug table; the cells below read its 'Drug ID' and 'SMILES' columns.
csv_path = './drugIDandSMILES.csv'
csvFile = pandas.read_csv(csv_path)

# Show the number of rows, then the table itself.
print(len(csvFile), csvFile, sep='\n')

In [None]:
# Pull out the two columns used downstream: drug identifiers and SMILES strings.
drug_id, smiles = csvFile['Drug ID'], csvFile['SMILES']

## Form graph from molecule

In [None]:
def smiles2graph(smiles_string):
    """Convert a SMILES string into a DGL graph carrying per-atom features.

    Nodes are the molecule's atoms (in RDKit atom-index order) and edges are
    the non-zero entries of the RDKit adjacency matrix. The feature vectors
    returned by ``atom_to_feature_vector`` are stored as a float tensor in
    ``ndata['node_feat']``. Bond features are not attached to the graph.

    Args:
        smiles_string: SMILES representation of the molecule.

    Returns:
        A ``dgl.DGLGraph`` with exactly one node per atom.

    Raises:
        ValueError: If RDKit cannot parse ``smiles_string``.
    """
    mol = Chem.MolFromSmiles(smiles_string)
    if mol is None:
        # RDKit signals a parse failure by returning None instead of raising.
        raise ValueError(
            'Could not parse SMILES string: {!r}'.format(smiles_string))

    # Edge list = coordinates of the non-zero adjacency-matrix entries.
    adjacency = Chem.GetAdjacencyMatrix(mol)
    src, dst = np.nonzero(adjacency)
    src = torch.from_numpy(src.astype(np.int64))
    dst = torch.from_numpy(dst.astype(np.int64))

    # Pass num_nodes explicitly: otherwise DGL infers it from the largest
    # endpoint id, so atoms without bonds (single-atom molecules, trailing
    # counter-ions, ...) would be dropped and the feature assignment below
    # would fail with a row-count mismatch.
    g = dgl.graph((src, dst), num_nodes=mol.GetNumAtoms())
    # Kept from the original pipeline: guarantees every edge has its reverse.
    bg = dgl.to_bidirected(g)

    # One feature vector per atom, in atom-index order.
    x = np.array(
        [atom_to_feature_vector(atom) for atom in mol.GetAtoms()],
        dtype=np.int64,
    )
    bg.ndata['node_feat'] = torch.tensor(x, dtype=torch.float32)

    return bg

In [None]:
# Smoke test: build and display the graphs for the first two SMILES entries.
first_smiles = smiles[0]
mol1 = smiles2graph(first_smiles)
print(mol1)

second_smiles = smiles[1]
mol2 = smiles2graph(second_smiles)
print(mol2)

## Autoencoder Model and Training loop

In [None]:
# https://github.com/MarounHaddad/Exploring-the-representational-power-of-graph-autoencoder/blob/f0aef4b793346913d8ea2ccb0de339b88acb992e/embedding_models/gae.py
# https://github.com/rangan2510/Single-Cell---GCN-Autoencoder/blob/main/sc-gcn-dat_1/runner.py
class EncoderLayer(nn.Module):
    """Encoder: two stacked ``SAGEConv`` layers ('gcn' aggregator) with ReLU.

    The first convolution maps ``in_feats`` -> ``out_feats``; the second maps
    ``out_feats`` -> ``out_feats``. ReLU is applied after each convolution.
    """

    def __init__(self, in_feats, out_feats):
        super().__init__()
        self.layersList = nn.ModuleList([
            SAGEConv(in_feats, out_feats, 'gcn'),
            SAGEConv(out_feats, out_feats, 'gcn'),
        ])
        self.relu = nn.ReLU()

    def forward(self, g, feats):
        """Run ``feats`` through every layer on graph ``g`` and return the result."""
        hidden = feats
        for conv in self.layersList:
            hidden = self.relu(conv(g, hidden))
        return hidden

    # def __init__(self, in_feats, out_feats, activation, dropout):
    #     super(EncoderLayer, self).__init__()
    #     self.linear = nn.Linear(in_feats, out_feats, bias=True)
    #     self.activation = activation
    #     self.norm = nn.BatchNorm1d(out_feats)
    #     self.drop = nn.Dropout(dropout)

    # def forward(self, g: dgl.graph, input):
    #     g.ndata['h'] = input

    #     if network_type == "gae_mean":
    #         # broadcast all messages and aggregate them according to the mean rule
    #         g.update_all(gcn_message, gcn_mean_reduce)
    #     else:
    #         # broadcast all messages and aggregate them according to the sum rule
    #         g.update_all(gcn_message, gcn_sum_reduce)

    #     h = g.ndata.pop('h')
    #     h = self.linear(h)
    #     h = self.activation(h)
    #     h = self.norm(h)
    #     h = self.drop(h)
    #     return h

class DecoderLayer(nn.Module):
    """Decoder: two stacked ``SAGEConv`` layers ('gcn' aggregator) with ReLU.

    The first convolution maps ``in_feats`` -> ``out_feats``; the second maps
    ``out_feats`` -> ``out_feats``. ReLU is applied after each convolution.
    """

    def __init__(self, in_feats, out_feats):
        super().__init__()
        self.layersList = nn.ModuleList([
            SAGEConv(in_feats, out_feats, 'gcn'),
            SAGEConv(out_feats, out_feats, 'gcn'),
        ])
        self.relu = nn.ReLU()

    def forward(self, g, feats):
        """Run ``feats`` through every layer on graph ``g`` and return the result."""
        decoded = feats
        for conv in self.layersList:
            decoded = self.relu(conv(g, decoded))
        return decoded

    # def __init__(self, activation, num_features, dropout):
    #     super(DecoderLayer, self).__init__()
    #     self.activation = activation
    #     self.var = torch.var
    #     self.norm = nn.BatchNorm1d(num_features)
    #     self.drop = nn.Dropout(dropout)

    # def forward(self, z):
    #     # the decoder reconstructs the adjacency by multiplying
    #     # the output of the encoder with its transpose
    #     h = torch.mm(z, z.t())
    #     h = self.activation(h)
    #     h = self.norm(h)
    #     h = self.drop(h)
    #     return h

class AutoEncoder(nn.Module):
    """Graph autoencoder: encoder -> batch norm on the code -> decoder.

    ``in_feats`` is the width of the node features being reconstructed and
    ``out_feats`` is the width of the intermediate (encoded) representation.
    """

    def __init__(self, in_feats, out_feats):
        super().__init__()
        self.encoderLayer = EncoderLayer(in_feats, out_feats)
        self.decoderLayer = DecoderLayer(out_feats, in_feats)
        self.batchNormLayer = nn.BatchNorm1d(out_feats)

    def forward(self, g, inputs):
        """Encode ``inputs`` on graph ``g``, normalise the code, and decode it."""
        encoded = self.encoderLayer(g, inputs)
        normalised = self.batchNormLayer(encoded)
        return self.decoderLayer(g, normalised)

In [None]:
# help(nn.KLDivLoss)

In [None]:
# Train the autoencoder to reconstruct the node features of the first molecule.
g = smiles2graph(smiles[0])
g = dgl.add_self_loop(g)
node_feats = g.ndata['node_feat']

# Original author note: 155*9 --> 155*3 -> 100
# (presumably nodes x features shrinking to a 3-wide code; TODO confirm)
model = AutoEncoder(node_feats.shape[1], 3)

optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# 'batchmean' yields the mathematically correct KL divergence; the default
# 'mean' reduction averages over every element instead (see PyTorch docs).
loss_fn = nn.KLDivLoss(reduction='batchmean')

# KLDivLoss expects the target to be a probability distribution, so turn each
# node's feature row into one by dividing by its sum. The clamp guards against
# division by zero for an all-zero row.
target = node_feats / node_feats.sum(dim=1, keepdim=True).clamp_min(1e-12)

all_logits = []  # never filled in this cell; kept in case later cells use it
for e in range(1000):
    pred = model(g, node_feats)
    pred = F.log_softmax(pred, 1)  # KLDivLoss takes log-probabilities as input
    loss = loss_fn(pred, target)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if e % 50 == 0:
        # .item() prints the scalar value rather than the tensor repr.
        print('In epoch {}, loss: {}'.format(e, loss.item()))


In [None]:
# # help(nn.CrossEntropyLoss)
# nn.CrossEntropyLoss?
# nn.KLDivLoss?
# nn.LogSoftmax?