# Install Libraries

[RDKit](https://github.com/rdkit/rdkit)

[DGL](https://github.com/dmlc/dgl/)

[DGL-LifeSci](https://github.com/awslabs/dgl-lifesci)

In [None]:
# Install the chemistry / graph-learning packages this notebook imports.
# NOTE(review): versions are unpinned, so a fresh runtime may resolve to
# releases different from the ones the saved outputs were produced with.
!pip install rdkit-pypi
!pip install dgllife
!pip install dgl



In [None]:
import os
# os.environ['PYTHONHASHSEED'] = '0'
# os.environ['CUDA_VISIBLE_DEVICES']='-1'
# os.environ['TF_CUDNN_USE_AUTOTUNE'] ='0'

import dgl
import sys
import torch
import random
import cv2
import torchvision
import pandas as pd
import numpy as np
import tensorflow as tf
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
import torch.optim as optim
import torchvision.transforms as transforms

from rdkit.Chem import AllChem
from rdkit import Chem
from rdkit import DataStructs

from torch.utils.data import DataLoader, Dataset
from dgllife.data import SIDER
from numpy import array
from numpy import argmax
from tensorflow.keras.utils import to_categorical
from dgllife.model import load_pretrained
from tensorflow.keras.callbacks import  History
from dgllife.utils import smiles_to_bigraph, CanonicalAtomFeaturizer, AttentiveFPAtomFeaturizer
from torch.utils.data import DataLoader, Dataset
from tqdm.notebook import tqdm, trange
from sklearn.model_selection import train_test_split
from dgl.data.utils import get_download_dir, download, _get_dgl_url, extract_archive
from dgllife.data.csv_dataset import MoleculeCSVDataset
from dgllife.utils.mol_to_graph import smiles_to_bigraph

# Seed every RNG the notebook relies on so that the randomly initialised
# layers and the training run are as repeatable as the backend allows
# (GPU kernels may still introduce small nondeterminism).
SEED = 46
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)
torch.manual_seed(SEED)

# Run the graph model on the GPU when one is visible, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Mount Google Drive at /content/drive so the CSV files referenced below can be read.
# The google.colab module is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Locations of the train / test CSV files inside the mounted Drive.
train_file_path = '/content/drive/My Drive/invivo_ability_train.csv'
test_file_path = '/content/drive/My Drive/invivo_ability_test.csv'

In [None]:
# Read only the two columns used downstream: the SMILES string and its label.
df_train =  pd.read_csv(train_file_path, usecols=['smiles', 'label'])
df_test = pd.read_csv(test_file_path , usecols=['smiles', 'label'])

In [None]:
# Show the training frame as the cell output (pandas elides the middle rows).
df_train

Unnamed: 0,smiles,label
0,CC(C)C[C@H](NC(=O)N[C@@H](CCC(O)=O)C(O)=O)C(O)=O,1
1,CS(=O)(=O)N(Cc1ccc(cc1)C(=O)NCCOc1ccc(C)cc1C)c...,0
2,C[C@@H](NC(=O)c1cc(C)c(C)cc1)c1ccc(cc1)S(C)(=O)=O,0
3,CC(=O)/N=C1\NC(=O)/C(=C/c2ccc(o2)-c2ccc(cc2)C(...,0
4,CCC[C@@H](C)[C@@]1(CC=C)C(=O)NC(S)=NC1=O,1
...,...,...
114422,CC[C@H](C)c1[n]c2ccc(Br)cc2c(=O)[n]1N=Cc1ccc(O...,0
114423,Cc1ccccc1NC(=O)COc1c(cc(/C=N/[n]2c([n]c3ccccc3...,0
114424,O=C([C@@H]1[C@H](C(=O)c2ccc[s]2)N2[C@H](C=Cc3c...,0
114425,CN(C)C(=O)[C@H]([C@@H](c1ccccc1)C(=O)N(C)C)c1c...,0


In [None]:
class DATASET(MoleculeCSVDataset):
    """Graph dataset built from a DataFrame that carries a ``smiles`` column.

    Thin wrapper around ``MoleculeCSVDataset`` that fixes the SMILES column
    name to ``'smiles'`` and always requests a label mask (``init_mask=True``).
    Every other argument is forwarded to the parent class unchanged.
    """

    def __init__(self, dataFram=None,
                 smiles_to_graph=smiles_to_bigraph,
                 node_featurizer=None,
                 edge_featurizer=None,
                 load=False,
                 log_every=1000,
                 cache_file_path=None,
                 n_jobs=1):
        """Forward the construction arguments to ``MoleculeCSVDataset``.

        Parameters
        ----------
        dataFram : DataFrame
            Source table; passed to the parent as ``df``.
        smiles_to_graph : callable
            Converter from a SMILES string to a graph.
        node_featurizer, edge_featurizer : callable or None
            Featurizers handed to the parent class.
        load : bool
            Forwarded as ``load``; presumably controls reuse of the cache
            file — confirm against the dgllife documentation.
        log_every : int
            Forwarded as ``log_every``.
        cache_file_path : str or None
            Forwarded as ``cache_file_path``.
        n_jobs : int
            Forwarded as ``n_jobs``.
        """
        parent_kwargs = dict(
            df=dataFram,
            smiles_to_graph=smiles_to_graph,
            node_featurizer=node_featurizer,
            edge_featurizer=edge_featurizer,
            smiles_column='smiles',
            cache_file_path=cache_file_path,
            load=load,
            log_every=log_every,
            init_mask=True,
            n_jobs=n_jobs,
        )
        super().__init__(**parent_kwargs)

    def __getitem__(self, item):
        """Return ``(smiles, graph, label, mask)`` for the entry at ``item``."""
        columns = (self.smiles, self.graphs, self.labels, self.mask)
        return tuple(column[item] for column in columns)

In [None]:
from dgllife.model import load_pretrained
from dgllife.model import MLPPredictor

def get_sider_model(model_name="GCN_attentivefp_SIDER"):
    """Load a pretrained dgllife model and replace its prediction head.

    The checkpoint named ``model_name`` is fetched with ``load_pretrained``;
    its ``predict`` attribute is then overwritten with a newly constructed
    ``MLPPredictor``. The head's dropout and width depend on whether
    ``model_name`` is ``"GCN_attentivefp_SIDER"``; any other name falls back
    to the second set of values.

    NOTE(review): the replacement head is created here and is not restored
    from the checkpoint, so its weights are whatever ``MLPPredictor``
    initialises them to — confirm this is intended for feature extraction.

    Returns
    -------
    The loaded model with the new ``predict`` head attached.
    """
    uses_attentivefp = model_name == "GCN_attentivefp_SIDER"
    predictor_dropout = 0.08333992387843633 if uses_attentivefp else 0.034959769945995006
    predictor_hidden_feats = 1024 if uses_attentivefp else 512

    gcn = load_pretrained(model_name)

    # The head's input is twice the width of the last GNN layer.
    head_in_feats = 2 * gcn.gnn.hidden_feats[-1]
    gcn.predict = MLPPredictor(
        head_in_feats,
        predictor_hidden_feats,
        predictor_hidden_feats,
        predictor_dropout,
    )
    return gcn

In [None]:
def create_dataset_with_gcn(dataset, GCN ):
    """Run every molecule graph through ``GCN`` and collect the outputs.

    Parameters
    ----------
    dataset : iterable
        Yields ``(smiles, graph, label, mask)`` tuples; each graph must carry
        its node features under ``ndata['h']``.
    GCN : callable
        Model invoked as ``GCN(graph, node_feats)``. The caller is responsible
        for putting it in eval mode and on ``device`` beforehand.

    Returns
    -------
    list of tuple
        One ``(embedding, label, smiles)`` triple per input entry, where
        ``embedding`` is the model output converted to a NumPy array on the CPU.
    """
    created_data = []
    # Only forward passes are needed here, so disable autograd: this avoids
    # building a computation graph for each molecule.
    with torch.no_grad():
        for smiles, g, label, _mask in dataset:
            g = g.to(device)
            g = dgl.add_self_loop(g)
            node_feats = g.ndata.pop('h')
            embbed = GCN(g, node_feats)
            embbed = embbed.detach().cpu().numpy()
            created_data.append((embbed, label, smiles))
    print('Data created!!')
    return created_data

In [None]:
def count_lablel(dataset):
    """Count how many entries have a label equal to 1 and how many do not.

    Parameters
    ----------
    dataset : iterable
        Yields ``(embedding, label, smiles)`` triples.

    Returns
    -------
    tuple
        ``(positives, negatives)`` where positives are entries whose label
        compares equal to ``1.`` and negatives are all remaining entries.
    """
    tally = [0, 0]  # slot 0 -> negatives, slot 1 -> positives
    for _embedding, target, _smiles in dataset:
        tally[1 if target == 1. else 0] += 1

    negatives, positives = tally
    return positives, negatives

In [None]:
# Featurise both splits into DGL graphs. Each split gets its own cache file:
# with a shared path the second dataset would overwrite the first one's cache,
# and a later run with load=True would read graphs from the wrong split.
train_dataset = DATASET(df_train, smiles_to_bigraph, AttentiveFPAtomFeaturizer(), cache_file_path="./invivo_train_dglgraph.bin")
test_dataset = DATASET(df_test, smiles_to_bigraph, AttentiveFPAtomFeaturizer(), cache_file_path="./invivo_test_dglgraph.bin")

Processing dgl graphs from scratch...
Processing molecule 1000/114427
Processing molecule 2000/114427
Processing molecule 3000/114427
Processing molecule 4000/114427
Processing molecule 5000/114427
Processing molecule 6000/114427
Processing molecule 7000/114427
Processing molecule 8000/114427
Processing molecule 9000/114427
Processing molecule 10000/114427
Processing molecule 11000/114427
Processing molecule 12000/114427
Processing molecule 13000/114427
Processing molecule 14000/114427
Processing molecule 15000/114427
Processing molecule 16000/114427
Processing molecule 17000/114427
Processing molecule 18000/114427
Processing molecule 19000/114427
Processing molecule 20000/114427
Processing molecule 21000/114427
Processing molecule 22000/114427
Processing molecule 23000/114427
Processing molecule 24000/114427
Processing molecule 25000/114427
Processing molecule 26000/114427
Processing molecule 27000/114427
Processing molecule 28000/114427
Processing molecule 29000/114427
Processing mol

In [None]:
# Fetch the pretrained graph model, move it to the selected device and put it
# in eval mode so it can be used as a fixed feature extractor.
model_name = 'GCN_attentivefp_SIDER'
gcn_model = get_sider_model(model_name).to(device).eval()

Downloading GCN_attentivefp_SIDER_pre_trained.pth from https://data.dgl.ai/dgllife/pre_trained/gcn_attentivefp_sider.pth...


GCN_attentivefp_SIDER_pre_trained.pth:   0%|          | 0.00/3.09M [00:00<?, ?B/s]

Pretrained model loaded


In [None]:
# Turn each split into a list of (embedding, label, smiles) triples.
train_ds = create_dataset_with_gcn(dataset=train_dataset, GCN=gcn_model)
test_ds = create_dataset_with_gcn(dataset=test_dataset, GCN=gcn_model)

Data created!!
Data created!!


In [None]:
# Report the class balance of the training split ...
label_pos , label_neg = count_lablel(train_ds)
print(f'train positive label: {label_pos} - train negative label: {label_neg}')

# ... and of the test split (label_pos / label_neg are reused for it).
label_pos , label_neg = count_lablel(test_ds)
print(f'Test positive label: {label_pos} - Test negative label: {label_neg}')

train positive label: 24508 - train negative label: 89919
Test positive label: 2821 - Test negative label: 9893


In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

def sequential_model(input_dim=1024, dropout_rate=0.2, learning_rate=0.001):
    """Build and compile the dense binary classifier used on the embeddings.

    The network is a funnel of ReLU layers (1024 -> 512 -> 256 -> 128 -> 64
    -> 32), each followed by dropout, then an 8-unit ReLU layer and a single
    sigmoid output unit. It is compiled with binary cross-entropy, Adam, and
    the metrics accuracy, MAE, MSE and AUC.

    Parameters
    ----------
    input_dim : int, default 1024
        Length of one input feature vector. Exposed so embeddings of another
        width can be used without editing the function.
    dropout_rate : float, default 0.2
        Rate applied by every Dropout layer.
    learning_rate : float, default 0.001
        Learning rate passed to Adam.

    Returns
    -------
    The compiled Keras ``Model``.
    """
    inputs = Input(shape=(input_dim,))  # Define the input shape

    # Hidden funnel: Dense + Dropout pairs, created in the order listed.
    hidden = inputs
    for units in (1024, 512, 256, 128, 64, 32):
        hidden = Dense(units, activation='relu')(hidden)
        hidden = Dropout(dropout_rate)(hidden)

    # The last hidden layer feeds the output unit directly, with no dropout.
    hidden = Dense(8, activation='relu')(hidden)
    prediction = Dense(1, activation='sigmoid')(hidden)  # Output layer

    # Create the model
    sequential_net = Model(inputs=inputs, outputs=prediction)

    # Compile the model
    optimizer = Adam(learning_rate=learning_rate)
    sequential_net.compile(loss='binary_crossentropy',
                        optimizer=optimizer,
                        metrics=["accuracy", "mae", "mse", tf.keras.metrics.AUC()])

    return sequential_net

In [29]:
from sklearn.model_selection import KFold

Epoch_S = 10

# Each entry of train_ds / test_ds is (embedding, label, smiles). embbed[0]
# takes the first row of the embedding (assumes one row per molecule).
# Features are shaped (n_samples, 1024) to match the model's declared
# Input(shape=(1024,)), instead of (n_samples, 1024, 1), which relied on
# Keras adjusting the trailing axis implicitly.
train = np.array([embbed[0] for embbed, _lbl, _smiles in train_ds]).reshape(-1, 1024)
lbls_train = np.array([lbl.tolist() for _embbed, lbl, _smiles in train_ds])

# NOTE(review): "valid" is built from the test split and is only used for the
# final evaluate() call, not for monitoring during training.
valid = np.array([embbed[0] for embbed, _lbl, _smiles in test_ds]).reshape(-1, 1024)
lbls_valid = np.array([lbl.tolist() for _embbed, lbl, _smiles in test_ds])

# create neural network model
sequential_net = sequential_model()
history = History()
P = sequential_net.fit(train, lbls_train, epochs = Epoch_S, batch_size = 128, callbacks=[history])
fit_rounds = 1  # number of fit() calls made so far

# Keep training in rounds of Epoch_S epochs until the training accuracy,
# truncated to whole percent, is the same for every epoch of the latest round
# (at most 100 additional rounds).
# NOTE(review): the stopping rule looks at training accuracy only.
for j in range(100):
    recent_pct = [int(acc * 100) for acc in P.history['accuracy'][-Epoch_S:]]
    # Print the earlier epochs of the round, newest first.
    for pct in reversed(recent_pct[:-1]):
        print(pct)
    if len(set(recent_pct)) == 1:
        break
    P = sequential_net.fit(train, lbls_train, epochs = Epoch_S, batch_size = 128, callbacks=[history])
    fit_rounds += 1
# Report how many fit() rounds actually ran, including when the loop runs out.
print(fit_rounds)

score  = sequential_net.evaluate(valid, lbls_valid, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
92
91
91
90
90
89
88
87
85
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
94
94
94
94
93
93
93
93
92
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
96
96
96
96
95
95
95
95
95
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
97
97
97
97
96
96
96
96
96
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
97
97
97
97
97
97
97
97
97
5


In [30]:
# evaluate() output, in compile() order: [loss, accuracy, mae, mse, auc].
score

[0.2176288366317749,
 0.9279534220695496,
 0.09472297132015228,
 0.05622006580233574,
 0.962160587310791]